From 68efb27a1efe7ff2ced6fce16ce316f8ffc4abef Mon Sep 17 00:00:00 2001 From: Sergio Jimenez Jimenez Date: Sun, 18 Jan 2026 18:03:23 +0100 Subject: [PATCH] debug set and locking --- .gitignore | 2 + src/dataset_manager.py | 31 +- src/doctr_raytune_rest.ipynb | 111 ++ src/doctr_service/dataset_manager.py | 31 +- src/doctr_service/docker-compose.yml | 2 + src/doctr_service/doctr_tuning_rest.py | 7 + src/easyocr_raytune_rest.ipynb | 111 ++ src/easyocr_service/dataset_manager.py | 31 +- src/easyocr_service/docker-compose.yml | 2 + src/easyocr_service/easyocr_tuning_rest.py | 7 + src/output_raytune.ipynb | 1037 ----------------- src/paddle_ocr/dataset_manager.py | 31 +- .../docker-compose.cpu-registry.yml | 1 + .../docker-compose.gpu-registry.yml | 1 + src/paddle_ocr/docker-compose.workers.yml | 2 + src/paddle_ocr/docker-compose.yml | 4 +- src/paddle_ocr/paddle_ocr_tuning_rest.py | 7 + src/paddle_ocr_raytune_rest.ipynb | 293 +---- src/raytune_ocr.py | 333 ++++++ 19 files changed, 754 insertions(+), 1290 deletions(-) create mode 100644 src/doctr_raytune_rest.ipynb create mode 100644 src/easyocr_raytune_rest.ipynb delete mode 100644 src/output_raytune.ipynb create mode 100644 src/raytune_ocr.py diff --git a/.gitignore b/.gitignore index 0098713..1eb7d2f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ results node_modules src/paddle_ocr/wheels src/*.log +src/output_*.ipynb +debugset/ diff --git a/src/dataset_manager.py b/src/dataset_manager.py index 2d3ccac..e9ea973 100644 --- a/src/dataset_manager.py +++ b/src/dataset_manager.py @@ -42,4 +42,33 @@ class ImageTextDataset: with open(txt_path, "r", encoding="utf-8") as f: text = f.read() - return image, text \ No newline at end of file + return image, text + + def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"): + """Get output path for saving OCR result to debugset folder. + + Args: + idx: Sample index + output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text') + debugset_root: Root folder for debug output (default: /app/debugset) + + Returns: + Path like /app/debugset/doc1/{output_subdir}/page_001.txt + """ + img_path, _ = self.samples[idx] + # img_path: /app/dataset/doc1/img/page_001.png + # Extract relative path: doc1/img/page_001.png + parts = img_path.split("/dataset/", 1) + if len(parts) == 2: + rel_path = parts[1] # doc1/img/page_001.png + else: + rel_path = os.path.basename(img_path) + + # Replace /img/ with /{output_subdir}/ + rel_parts = rel_path.rsplit("/img/", 1) + doc_folder = rel_parts[0] # doc1 + fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt + + out_dir = os.path.join(debugset_root, doc_folder, output_subdir) + os.makedirs(out_dir, exist_ok=True) + return os.path.join(out_dir, fname) \ No newline at end of file diff --git a/src/doctr_raytune_rest.ipynb b/src/doctr_raytune_rest.ipynb new file mode 100644 index 0000000..aafd28f --- /dev/null +++ b/src/doctr_raytune_rest.ipynb @@ -0,0 +1,111 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "header", + "metadata": {}, + "source": [ + "# DocTR Hyperparameter Optimization via REST API\n", + "\n", + "Uses Ray Tune + Optuna to find optimal DocTR parameters.\n", + "\n", + "## Prerequisites\n", + "\n", + "```bash\n", + "cd src/doctr_service\n", + "docker compose up ocr-cpu # or ocr-gpu\n", + "```\n", + "\n", + "Service runs on port 8003." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "deps", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q -U \"ray[tune]\" optuna requests pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "from raytune_ocr import (\n", + " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n", + " doctr_payload, DOCTR_SEARCH_SPACE, DOCTR_CONFIG_KEYS,\n", + ")\n", + "\n", + "# Worker ports\n", + "PORTS = [8003]\n", + "\n", + "# Check workers are running\n", + "healthy = check_workers(PORTS, \"DocTR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tune", + "metadata": {}, + "outputs": [], + "source": [ + "# Create trainable and run tuning\n", + "trainable = create_trainable(PORTS, doctr_payload)\n", + "\n", + "results = run_tuner(\n", + " trainable=trainable,\n", + " search_space=DOCTR_SEARCH_SPACE,\n", + " num_samples=64,\n", + " num_workers=len(healthy),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "analysis", + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze results\n", + "df = analyze_results(\n", + " results,\n", + " prefix=\"raytune_doctr\",\n", + " config_keys=DOCTR_CONFIG_KEYS,\n", + ")\n", + "\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "correlation", + "metadata": {}, + "outputs": [], + "source": [ + "# Correlation analysis\n", + "correlation_analysis(df, DOCTR_CONFIG_KEYS)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/doctr_service/dataset_manager.py b/src/doctr_service/dataset_manager.py index 2d3ccac..e9ea973 100644 --- a/src/doctr_service/dataset_manager.py +++ b/src/doctr_service/dataset_manager.py @@ -42,4 +42,33 @@ class ImageTextDataset: with open(txt_path, "r", encoding="utf-8") as f: text = f.read() - return image, text \ No newline at end of file + return image, text + + def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"): + """Get output path for saving OCR result to debugset folder. 
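+
+        Example (hypothetical instance ``ds`` whose sample 0 is
+        /app/dataset/doc1/img/page_001.png):
+
+            >>> ds.get_output_path(0, "doctr_text")
+            '/app/debugset/doc1/doctr_text/page_001.txt'
+
+        Note: assumes img_path contains an "/img/" segment; paths
+        without one make rel_parts[1] below raise IndexError.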
+ + Args: + idx: Sample index + output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text') + debugset_root: Root folder for debug output (default: /app/debugset) + + Returns: + Path like /app/debugset/doc1/{output_subdir}/page_001.txt + """ + img_path, _ = self.samples[idx] + # img_path: /app/dataset/doc1/img/page_001.png + # Extract relative path: doc1/img/page_001.png + parts = img_path.split("/dataset/", 1) + if len(parts) == 2: + rel_path = parts[1] # doc1/img/page_001.png + else: + rel_path = os.path.basename(img_path) + + # Replace /img/ with /{output_subdir}/ + rel_parts = rel_path.rsplit("/img/", 1) + doc_folder = rel_parts[0] # doc1 + fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt + + out_dir = os.path.join(debugset_root, doc_folder, output_subdir) + os.makedirs(out_dir, exist_ok=True) + return os.path.join(out_dir, fname) \ No newline at end of file diff --git a/src/doctr_service/docker-compose.yml b/src/doctr_service/docker-compose.yml index 710f72b..f16c931 100644 --- a/src/doctr_service/docker-compose.yml +++ b/src/doctr_service/docker-compose.yml @@ -14,6 +14,7 @@ services: - "8003:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - doctr-cache:/root/.cache/doctr environment: - PYTHONUNBUFFERED=1 @@ -35,6 +36,7 @@ services: - "8003:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - doctr-cache:/root/.cache/doctr environment: - PYTHONUNBUFFERED=1 diff --git a/src/doctr_service/doctr_tuning_rest.py b/src/doctr_service/doctr_tuning_rest.py index 4ef3928..9385f43 100644 --- a/src/doctr_service/doctr_tuning_rest.py +++ b/src/doctr_service/doctr_tuning_rest.py @@ -169,6 +169,7 @@ class EvaluateRequest(BaseModel): # Page range start_page: int = Field(5, ge=0, description="Start page index (inclusive)") end_page: int = Field(10, ge=1, description="End page index (exclusive)") + save_output: bool = Field(False, description="Save OCR predictions to debugset folder") class EvaluateResponse(BaseModel): @@ -302,6 +303,12 @@ def evaluate(request: EvaluateRequest): ) time_per_page_list.append(float(time.time() - tp0)) + # Save prediction to debugset if requested + if request.save_output: + out_path = state.dataset.get_output_path(idx, "doctr_text") + with open(out_path, "w", encoding="utf-8") as f: + f.write(pred) + m = evaluate_text(ref, pred) cer_list.append(m["CER"]) wer_list.append(m["WER"]) diff --git a/src/easyocr_raytune_rest.ipynb b/src/easyocr_raytune_rest.ipynb new file mode 100644 index 0000000..723f97f --- /dev/null +++ b/src/easyocr_raytune_rest.ipynb @@ -0,0 +1,111 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "header", + "metadata": {}, + "source": [ + "# EasyOCR Hyperparameter Optimization via REST API\n", + "\n", + "Uses Ray Tune + Optuna to find optimal EasyOCR parameters.\n", + "\n", + "## Prerequisites\n", + "\n", + "```bash\n", + "cd src/easyocr_service\n", + "docker compose up ocr-cpu # or ocr-gpu\n", + "```\n", + "\n", + "Service runs on port 8002." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "deps", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q -U \"ray[tune]\" optuna requests pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "from raytune_ocr import (\n", + " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n", + " easyocr_payload, EASYOCR_SEARCH_SPACE, EASYOCR_CONFIG_KEYS,\n", + ")\n", + "\n", + "# Worker ports\n", + "PORTS = [8002]\n", + "\n", + "# Check workers are running\n", + "healthy = check_workers(PORTS, \"EasyOCR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tune", + "metadata": {}, + "outputs": [], + "source": [ + "# Create trainable and run tuning\n", + "trainable = create_trainable(PORTS, easyocr_payload)\n", + "\n", + "results = run_tuner(\n", + " trainable=trainable,\n", + " search_space=EASYOCR_SEARCH_SPACE,\n", + " num_samples=64,\n", + " num_workers=len(healthy),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "analysis", + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze results\n", + "df = analyze_results(\n", + " results,\n", + " prefix=\"raytune_easyocr\",\n", + " config_keys=EASYOCR_CONFIG_KEYS,\n", + ")\n", + "\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "correlation", + "metadata": {}, + "outputs": [], + "source": [ + "# Correlation analysis\n", + "correlation_analysis(df, EASYOCR_CONFIG_KEYS)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/easyocr_service/dataset_manager.py b/src/easyocr_service/dataset_manager.py index 2d3ccac..e9ea973 100644 --- a/src/easyocr_service/dataset_manager.py +++ b/src/easyocr_service/dataset_manager.py @@ -42,4 +42,33 @@ class ImageTextDataset: with open(txt_path, "r", encoding="utf-8") as f: text = f.read() - return image, text \ No newline at end of file + return image, text + + def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"): + """Get output path for saving OCR result to debugset folder. 
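+
+        Example (hypothetical instance ``ds`` whose sample 0 is
+        /app/dataset/doc1/img/page_001.png):
+
+            >>> ds.get_output_path(0, "easyocr_text")
+            '/app/debugset/doc1/easyocr_text/page_001.txt'
+
+        Note: assumes img_path contains an "/img/" segment; paths
+        without one make rel_parts[1] below raise IndexError.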
+ + Args: + idx: Sample index + output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text') + debugset_root: Root folder for debug output (default: /app/debugset) + + Returns: + Path like /app/debugset/doc1/{output_subdir}/page_001.txt + """ + img_path, _ = self.samples[idx] + # img_path: /app/dataset/doc1/img/page_001.png + # Extract relative path: doc1/img/page_001.png + parts = img_path.split("/dataset/", 1) + if len(parts) == 2: + rel_path = parts[1] # doc1/img/page_001.png + else: + rel_path = os.path.basename(img_path) + + # Replace /img/ with /{output_subdir}/ + rel_parts = rel_path.rsplit("/img/", 1) + doc_folder = rel_parts[0] # doc1 + fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt + + out_dir = os.path.join(debugset_root, doc_folder, output_subdir) + os.makedirs(out_dir, exist_ok=True) + return os.path.join(out_dir, fname) \ No newline at end of file diff --git a/src/easyocr_service/docker-compose.yml b/src/easyocr_service/docker-compose.yml index 0b1b085..550e865 100644 --- a/src/easyocr_service/docker-compose.yml +++ b/src/easyocr_service/docker-compose.yml @@ -14,6 +14,7 @@ services: - "8002:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - easyocr-cache:/root/.EasyOCR environment: - PYTHONUNBUFFERED=1 @@ -34,6 +35,7 @@ services: - "8002:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - easyocr-cache:/root/.EasyOCR environment: - PYTHONUNBUFFERED=1 diff --git a/src/easyocr_service/easyocr_tuning_rest.py b/src/easyocr_service/easyocr_tuning_rest.py index 5fa6cd5..dd1b565 100644 --- a/src/easyocr_service/easyocr_tuning_rest.py +++ b/src/easyocr_service/easyocr_tuning_rest.py @@ -133,6 +133,7 @@ class EvaluateRequest(BaseModel): # Page range start_page: int = Field(5, ge=0, description="Start page index (inclusive)") end_page: int = Field(10, ge=1, description="End page index (exclusive)") + save_output: bool = Field(False, description="Save OCR predictions to debugset folder") class EvaluateResponse(BaseModel): @@ -301,6 +302,12 @@ def evaluate(request: EvaluateRequest): pred = assemble_easyocr_result(result) time_per_page_list.append(float(time.time() - tp0)) + # Save prediction to debugset if requested + if request.save_output: + out_path = state.dataset.get_output_path(idx, "easyocr_text") + with open(out_path, "w", encoding="utf-8") as f: + f.write(pred) + m = evaluate_text(ref, pred) cer_list.append(m["CER"]) wer_list.append(m["WER"]) diff --git a/src/output_raytune.ipynb b/src/output_raytune.ipynb deleted file mode 100644 index 7230e7e..0000000 --- a/src/output_raytune.ipynb +++ /dev/null @@ -1,1037 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "header", - "metadata": { - "papermill": { - "duration": 0.002022, - "end_time": "2026-01-18T16:25:38.048417", - "exception": false, - "start_time": "2026-01-18T16:25:38.046395", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# PaddleOCR Hyperparameter Optimization via REST API\n", - "\n", - "This notebook runs Ray Tune hyperparameter search calling the PaddleOCR REST API (Docker container).\n", - "\n", - "**Benefits:**\n", - "- No model reload per trial - Model stays loaded in Docker container\n", - "- Faster trials - Skip ~10s model load time per trial\n", - "- Cleaner code - REST API replaces subprocess + CLI arg parsing" - ] - }, - { - "cell_type": "markdown", - "id": "prereq", - "metadata": { - "papermill": { - "duration": 0.000855, - "end_time": "2026-01-18T16:25:38.058911", - "exception": false, - 
"start_time": "2026-01-18T16:25:38.058056", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Prerequisites\n", - "\n", - "Start 2 PaddleOCR workers for parallel hyperparameter tuning:\n", - "\n", - "```bash\n", - "cd src/paddle_ocr\n", - "docker compose -f docker-compose.workers.yml up\n", - "```\n", - "\n", - "This starts 2 GPU workers on ports 8001-8002, allowing 2 concurrent trials.\n", - "\n", - "For CPU-only systems:\n", - "```bash\n", - "docker compose -f docker-compose.workers.yml --profile cpu up\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "3ob9fsoilc4", - "metadata": { - "papermill": { - "duration": 0.000846, - "end_time": "2026-01-18T16:25:38.060620", - "exception": false, - "start_time": "2026-01-18T16:25:38.059774", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 0. Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "wyr2nsoj7", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:38.063421Z", - "iopub.status.busy": "2026-01-18T16:25:38.063287Z", - "iopub.status.idle": "2026-01-18T16:25:39.300678Z", - "shell.execute_reply": "2026-01-18T16:25:39.299298Z" - }, - "papermill": { - "duration": 1.240519, - "end_time": "2026-01-18T16:25:39.301973", - "exception": false, - "start_time": "2026-01-18T16:25:38.061454", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: ray[tune] in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.53.0)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: click>=7.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (8.3.1)\r\n", - "Requirement already satisfied: filelock in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (3.20.3)\r\n", - "Requirement already satisfied: jsonschema in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (4.26.0)\r\n", - "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (1.1.2)\r\n", - "Requirement already satisfied: packaging>=24.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (25.0)\r\n", - "Requirement already satisfied: protobuf>=3.20.3 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (6.33.4)\r\n", - "Requirement already satisfied: pyyaml in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (6.0.3)\r\n", - "Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.32.5)\r\n", - "Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.3.3)\r\n", - "Requirement already satisfied: pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.12.5)\r\n", - "Requirement already satisfied: tensorboardX>=1.9 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.6.4)\r\n", - "Requirement already satisfied: pyarrow>=9.0.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (22.0.0)\r\n", - "Requirement already satisfied: fsspec in 
/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2026.1.0)\r\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.7.0)\r\n", - "Requirement already satisfied: pydantic-core==2.41.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (2.41.5)\r\n", - "Requirement already satisfied: typing-extensions>=4.14.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (4.15.0)\r\n", - "Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n", - "Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from tensorboardX>=1.9->ray[tune]) (2.4.1)\r\n", - "Requirement already satisfied: attrs>=22.2.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (25.4.0)\r\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (2025.9.1)\r\n", - "Requirement already satisfied: referencing>=0.28.4 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (0.37.0)\r\n", - "Requirement already satisfied: rpds-py>=0.25.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (0.30.0)\r\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2.9.0.post0)\r\n", - "Requirement already satisfied: pytz>=2020.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2025.2)\r\n", - "Requirement already satisfied: tzdata>=2022.7 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2025.3)\r\n", - "Requirement already satisfied: six>=1.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->ray[tune]) (1.17.0)\r\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (3.4.4)\r\n", - "Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (3.11)\r\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (2.6.3)\r\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (2026.1.4)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: optuna in 
/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (4.6.0)\r\n", - "Requirement already satisfied: alembic>=1.5.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (1.18.1)\r\n", - "Requirement already satisfied: colorlog in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (6.10.1)\r\n", - "Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (2.4.1)\r\n", - "Requirement already satisfied: packaging>=20.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (25.0)\r\n", - "Requirement already satisfied: sqlalchemy>=1.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (2.0.45)\r\n", - "Requirement already satisfied: tqdm in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (4.67.1)\r\n", - "Requirement already satisfied: PyYAML in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (6.0.3)\r\n", - "Requirement already satisfied: Mako in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from alembic>=1.5.0->optuna) (1.3.10)\r\n", - "Requirement already satisfied: typing-extensions>=4.12 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from alembic>=1.5.0->optuna) (4.15.0)\r\n", - "Requirement already satisfied: greenlet>=1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from sqlalchemy>=1.4.2->optuna) (3.3.0)\r\n", - "Requirement already satisfied: MarkupSafe>=0.9.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from Mako->alembic>=1.5.0->optuna) (3.0.3)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.32.5)\r\n", - "Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.3.3)\r\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.11)\r\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2.6.3)\r\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2026.1.4)\r\n", - "Requirement already satisfied: numpy>=1.26.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2.4.1)\r\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\r\n", - "Requirement already satisfied: pytz>=2020.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2025.2)\r\n", - "Requirement already satisfied: tzdata>=2022.7 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2025.3)\r\n", - "Requirement already satisfied: six>=1.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\r\n" - ] 
- }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# Install dependencies (run once)\n", - "%pip install -U \"ray[tune]\"\n", - "%pip install optuna\n", - "%pip install requests pandas" - ] - }, - { - "cell_type": "markdown", - "id": "imports-header", - "metadata": { - "papermill": { - "duration": 0.009444, - "end_time": "2026-01-18T16:25:39.312980", - "exception": false, - "start_time": "2026-01-18T16:25:39.303536", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 1. Imports & Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "imports", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:39.316439Z", - "iopub.status.busy": "2026-01-18T16:25:39.316230Z", - "iopub.status.idle": "2026-01-18T16:25:40.277894Z", - "shell.execute_reply": "2026-01-18T16:25:40.277012Z" - }, - "papermill": { - "duration": 0.964409, - "end_time": "2026-01-18T16:25:40.278450", - "exception": false, - "start_time": "2026-01-18T16:25:39.314041", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "from datetime import datetime\n", - "\n", - "import requests\n", - "import pandas as pd\n", - "\n", - "import ray\n", - "from ray import tune, train\n", - "from ray.tune.search.optuna import OptunaSearch" - ] - }, - { - "cell_type": "markdown", - "id": "config-header", - "metadata": { - "papermill": { - "duration": 0.009552, - "end_time": "2026-01-18T16:25:40.289551", - "exception": false, - "start_time": "2026-01-18T16:25:40.279999", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 2. API Configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "config", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.292573Z", - "iopub.status.busy": "2026-01-18T16:25:40.292489Z", - "iopub.status.idle": "2026-01-18T16:25:40.294713Z", - "shell.execute_reply": "2026-01-18T16:25:40.294164Z" - }, - "papermill": { - "duration": 0.004591, - "end_time": "2026-01-18T16:25:40.295202", - "exception": false, - "start_time": "2026-01-18T16:25:40.290611", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# PaddleOCR REST API endpoints - 2 workers for parallel trials\n", - "# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\n", - "WORKER_PORTS = [8001, 8002]\n", - "WORKER_URLS = [f\"http://localhost:{port}\" for port in WORKER_PORTS]\n", - "\n", - "# Output folder for results\n", - "OUTPUT_FOLDER = \"results\"\n", - "os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n", - "\n", - "# Number of concurrent trials = number of workers\n", - "NUM_WORKERS = len(WORKER_URLS)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "health-check", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.298281Z", - "iopub.status.busy": "2026-01-18T16:25:40.298161Z", - "iopub.status.idle": "2026-01-18T16:25:40.306720Z", - "shell.execute_reply": "2026-01-18T16:25:40.306262Z" - }, - "papermill": { - "duration": 0.010723, - "end_time": "2026-01-18T16:25:40.307025", - "exception": false, - "start_time": "2026-01-18T16:25:40.296302", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✓ http://localhost:8001: ok (GPU: None)\n", - "✓ http://localhost:8002: ok (GPU: None)\n", - "\n", - "2/2 workers 
ready for parallel tuning\n" - ] - } - ], - "source": [ - "# Verify all workers are running\n", - "healthy_workers = []\n", - "for url in WORKER_URLS:\n", - " try:\n", - " health = requests.get(f\"{url}/health\", timeout=10).json()\n", - " if health['status'] == 'ok' and health['model_loaded']:\n", - " healthy_workers.append(url)\n", - " print(f\"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})\")\n", - " else:\n", - " print(f\"✗ {url}: not ready yet\")\n", - " except requests.exceptions.ConnectionError:\n", - " print(f\"✗ {url}: not reachable\")\n", - "\n", - "if not healthy_workers:\n", - " raise RuntimeError(\n", - " \"No healthy workers found. Start them with:\\n\"\n", - " \" cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\"\n", - " )\n", - "\n", - "print(f\"\\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning\")" - ] - }, - { - "cell_type": "markdown", - "id": "search-space-header", - "metadata": { - "papermill": { - "duration": 0.001073, - "end_time": "2026-01-18T16:25:40.309261", - "exception": false, - "start_time": "2026-01-18T16:25:40.308188", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 3. Search Space" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "search-space", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.312177Z", - "iopub.status.busy": "2026-01-18T16:25:40.312107Z", - "iopub.status.idle": "2026-01-18T16:25:40.314237Z", - "shell.execute_reply": "2026-01-18T16:25:40.313794Z" - }, - "papermill": { - "duration": 0.004476, - "end_time": "2026-01-18T16:25:40.314804", - "exception": false, - "start_time": "2026-01-18T16:25:40.310328", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "search_space = {\n", - " # Whether to use document image orientation classification\n", - " \"use_doc_orientation_classify\": tune.choice([True, False]),\n", - " # Whether to use text image unwarping\n", - " \"use_doc_unwarping\": tune.choice([True, False]),\n", - " # Whether to use text line orientation classification\n", - " \"textline_orientation\": tune.choice([True, False]),\n", - " # Detection pixel threshold (pixels > threshold are considered text)\n", - " \"text_det_thresh\": tune.uniform(0.0, 0.7),\n", - " # Detection box threshold (average score within border)\n", - " \"text_det_box_thresh\": tune.uniform(0.0, 0.7),\n", - " # Text detection expansion coefficient\n", - " \"text_det_unclip_ratio\": tune.choice([0.0]),\n", - " # Text recognition threshold (filter low confidence results)\n", - " \"text_rec_score_thresh\": tune.uniform(0.0, 0.7),\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "trainable-header", - "metadata": { - "papermill": { - "duration": 0.001057, - "end_time": "2026-01-18T16:25:40.316975", - "exception": false, - "start_time": "2026-01-18T16:25:40.315918", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 4. 
Trainable Function" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "trainable", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.319825Z", - "iopub.status.busy": "2026-01-18T16:25:40.319771Z", - "iopub.status.idle": "2026-01-18T16:25:40.322602Z", - "shell.execute_reply": "2026-01-18T16:25:40.322112Z" - }, - "papermill": { - "duration": 0.004907, - "end_time": "2026-01-18T16:25:40.322948", - "exception": false, - "start_time": "2026-01-18T16:25:40.318041", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "def trainable_paddle_ocr(config):\n", - " \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n", - " import random\n", - " import requests\n", - " from ray import train\n", - "\n", - " # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n", - " WORKER_PORTS = [8001, 8002]\n", - " api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n", - "\n", - " payload = {\n", - " \"pdf_folder\": \"/app/dataset\",\n", - " \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n", - " \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n", - " \"textline_orientation\": config.get(\"textline_orientation\", True),\n", - " \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n", - " \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n", - " \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n", - " \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n", - " \"start_page\": 5,\n", - " \"end_page\": 10,\n", - " }\n", - "\n", - " try:\n", - " response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n", - " response.raise_for_status()\n", - " metrics = response.json()\n", - " metrics[\"worker\"] = api_url\n", - " train.report(metrics)\n", - " except Exception as e:\n", - " train.report({\n", - " \"CER\": 1.0,\n", - " \"WER\": 1.0,\n", - " \"TIME\": 0.0,\n", - " \"PAGES\": 0,\n", - " \"TIME_PER_PAGE\": 0,\n", - " \"worker\": api_url,\n", - " \"ERROR\": str(e)[:500]\n", - " })" - ] - }, - { - "cell_type": "markdown", - "id": "tuner-header", - "metadata": { - "papermill": { - "duration": 0.001058, - "end_time": "2026-01-18T16:25:40.325120", - "exception": false, - "start_time": "2026-01-18T16:25:40.324062", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 5. 
Run Tuner" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ray-init", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.328162Z", - "iopub.status.busy": "2026-01-18T16:25:40.328055Z", - "iopub.status.idle": "2026-01-18T16:25:42.985307Z", - "shell.execute_reply": "2026-01-18T16:25:42.984863Z" - }, - "papermill": { - "duration": 2.65986, - "end_time": "2026-01-18T16:25:42.986041", - "exception": false, - "start_time": "2026-01-18T16:25:40.326181", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-18 17:25:41,631\tINFO worker.py:2007 -- Started a local Ray instance.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ray Tune ready (version: 2.53.0)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py:2046: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "ray.init(ignore_reinit_error=True)\n", - "print(f\"Ray Tune ready (version: {ray.__version__})\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "tuner", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:42.998698Z", - "iopub.status.busy": "2026-01-18T16:25:42.998141Z" - }, - "papermill": { - "duration": null, - "end_time": null, - "exception": false, - "start_time": "2026-01-18T16:25:42.987700", - "status": "running" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
Tune Status\n",
- "Current time: 2026-01-18 17:38:46 | Running for: 00:13:03.82 | Memory: 14.3/119.7 GiB\n",
- "\n",
- "System Info\n",
- "Using FIFO scheduling algorithm.\n",
- "Logical resource usage: 2.0/20 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:GB10)\n",
- "\n",
- "Messages\n",
- "Number of errored trials: 3\n",
- "Trial name | # failures | error file\n",
- "trainable_paddle_ocr_36ae4d11 | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_36ae4d11_1_text_det_box_thresh=0.5847,text_det_thresh=0.2571,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-43/error.txt\n",
- "trainable_paddle_ocr_2312d29c | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_2312d29c_2_text_det_box_thresh=0.0312,text_det_thresh=0.0223,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-44/error.txt\n",
- "trainable_paddle_ocr_5b7b8e02 | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_5b7b8e02_3_text_det_box_thresh=0.5954,text_det_thresh=0.0707,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-31-48/error.txt\n",
- "\n",
- "Trial Status\n",
- "Trial name | status | loc | text_det_box_thresh | text_det_thresh | text_det_unclip_ratio | text_rec_score_thresh | textline_orientation | use_doc_orientation_classify | use_doc_unwarping\n",
- "trainable_paddle_ocr_b3243c8a | RUNNING | 192.168.65.140:1288101 | 0.360789 | 0.499551 | 0 | 0.115115 | False | True | False\n",
- "trainable_paddle_ocr_7a4a43b0 | PENDING | | 0.0727848 | 0.237729 | 0 | 0.33623 | True | False | True\n",
- "trainable_paddle_ocr_36ae4d11 | ERROR | 192.168.65.140:1282742 | 0.58473 | 0.257102 | 0 | 0.634955 | False | True | False\n",
- "trainable_paddle_ocr_2312d29c | ERROR | 192.168.65.140:1282844 | 0.0311783 | 0.0222724 | 0 | 0.141805 | False | True | False\n",
- "trainable_paddle_ocr_5b7b8e02 | ERROR | 192.168.65.140:1285648 | 0.595412 | 0.0706522 | 0 | 0.132174 | True | False | True
\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 17:26:10,501 E 1281442 1281442] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[33m(raylet)\u001b[0m [2026-01-18 17:26:11,550 E 1281587 1281587] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36m(bundle_reservation_check_func pid=1281657)\u001b[0m [2026-01-18 17:26:12,349 E 1281657 1281801] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2026-01-18 17:26:12,987 E 1281294 1281656] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-18 17:31:48,050\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_36ae4d11\n", - "Traceback (most recent call last):\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n", - " result = ray.get(future)\n", - " ^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n", - " values, debugger_breakpoint = worker.get_objects(\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n", - " raise value.as_instanceof_cause()\n", - "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282742, ip=192.168.65.140, actor_id=d19d5170bbb9faf9c9fa055f01000000, repr=trainable_paddle_ocr)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n", - " raise skipped from exception_cause(skipped)\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n", - " self._ret = self._target(*self._args, **self._kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " 
File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n", - " training_func=lambda: self._trainable_func(self.config),\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n", - " output = fn()\n", - " ^^^^\n", - " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n", - " raise DeprecationWarning(\n", - "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36m(trainable_paddle_ocr pid=1285648)\u001b[0m [2026-01-18 17:32:19,397 E 1285648 1285683] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\u001b[32m [repeated 20x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-18 17:38:37,341\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_2312d29c\n", - "Traceback (most recent call last):\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n", - " result = ray.get(future)\n", - " ^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n", - " values, debugger_breakpoint = worker.get_objects(\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n", - " raise value.as_instanceof_cause()\n", - "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282844, ip=192.168.65.140, actor_id=845cd8594f8ace3d960b90e501000000, repr=trainable_paddle_ocr)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n", - " raise skipped from exception_cause(skipped)\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n", - " self._ret = self._target(*self._args, **self._kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", 
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n", - " training_func=lambda: self._trainable_func(self.config),\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n", - " output = fn()\n", - " ^^^^\n", - " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n", - " raise DeprecationWarning(\n", - "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-18 17:38:46,519\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_5b7b8e02\n", - "Traceback (most recent call last):\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n", - " result = ray.get(future)\n", - " ^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n", - " values, debugger_breakpoint = worker.get_objects(\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n", - " raise value.as_instanceof_cause()\n", - "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1285648, ip=192.168.65.140, actor_id=b8478e34aea747352febbe0801000000, repr=trainable_paddle_ocr)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n", - " raise skipped from exception_cause(skipped)\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n", - " self._ret = self._target(*self._args, **self._kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n", - " training_func=lambda: self._trainable_func(self.config),\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n", - " output = fn()\n", - " ^^^^\n", - " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n", - " File 
\"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n", - " raise DeprecationWarning(\n", - "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n" - ] - } - ], - "source": [ - "tuner = tune.Tuner(\n", - " trainable_paddle_ocr,\n", - " tune_config=tune.TuneConfig(\n", - " metric=\"CER\",\n", - " mode=\"min\",\n", - " search_alg=OptunaSearch(),\n", - " num_samples=64,\n", - " max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n", - " ),\n", - " param_space=search_space,\n", - ")\n", - "\n", - "results = tuner.fit()" - ] - }, - { - "cell_type": "markdown", - "id": "analysis-header", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "source": [ - "## 6. Results Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "results-df", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "df = results.get_dataframe()\n", - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "save-results", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Save results to CSV\n", - "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", - "filename = f\"raytune_paddle_rest_results_{timestamp}.csv\"\n", - "filepath = os.path.join(OUTPUT_FOLDER, filename)\n", - "\n", - "df.to_csv(filepath, index=False)\n", - "print(f\"Results saved: {filepath}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "best-config", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Best configuration\n", - "best = df.loc[df[\"CER\"].idxmin()]\n", - "\n", - "print(f\"Best CER: {best['CER']:.6f}\")\n", - "print(f\"Best WER: {best['WER']:.6f}\")\n", - "print(f\"\\nOptimal Configuration:\")\n", - "print(f\" textline_orientation: {best['config/textline_orientation']}\")\n", - "print(f\" use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}\")\n", - "print(f\" use_doc_unwarping: {best['config/use_doc_unwarping']}\")\n", - "print(f\" text_det_thresh: {best['config/text_det_thresh']:.4f}\")\n", - "print(f\" text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}\")\n", - "print(f\" text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}\")\n", - "print(f\" text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "correlation", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Correlation analysis\n", - "param_cols = [\n", - " \"config/text_det_thresh\",\n", - " \"config/text_det_box_thresh\",\n", - " \"config/text_det_unclip_ratio\",\n", - " \"config/text_rec_score_thresh\",\n", - "]\n", - "\n", - 
"corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n", - "corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n", - "\n", - "print(\"Correlation with CER:\")\n", - "print(corr_cer)\n", - "print(\"\\nCorrelation with WER:\")\n", - "print(corr_wer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - }, - "papermill": { - "default_parameters": {}, - "duration": null, - "end_time": null, - "environment_variables": {}, - "exception": null, - "input_path": "paddle_ocr_raytune_rest.ipynb", - "output_path": "output_raytune.ipynb", - "parameters": {}, - "start_time": "2026-01-18T16:25:37.429790", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/src/paddle_ocr/dataset_manager.py b/src/paddle_ocr/dataset_manager.py index 2d3ccac..e9ea973 100644 --- a/src/paddle_ocr/dataset_manager.py +++ b/src/paddle_ocr/dataset_manager.py @@ -42,4 +42,33 @@ class ImageTextDataset: with open(txt_path, "r", encoding="utf-8") as f: text = f.read() - return image, text \ No newline at end of file + return image, text + + def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"): + """Get output path for saving OCR result to debugset folder. + + Args: + idx: Sample index + output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text') + debugset_root: Root folder for debug output (default: /app/debugset) + + Returns: + Path like /app/debugset/doc1/{output_subdir}/page_001.txt + """ + img_path, _ = self.samples[idx] + # img_path: /app/dataset/doc1/img/page_001.png + # Extract relative path: doc1/img/page_001.png + parts = img_path.split("/dataset/", 1) + if len(parts) == 2: + rel_path = parts[1] # doc1/img/page_001.png + else: + rel_path = os.path.basename(img_path) + + # Replace /img/ with /{output_subdir}/ + rel_parts = rel_path.rsplit("/img/", 1) + doc_folder = rel_parts[0] # doc1 + fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt + + out_dir = os.path.join(debugset_root, doc_folder, output_subdir) + os.makedirs(out_dir, exist_ok=True) + return os.path.join(out_dir, fname) \ No newline at end of file diff --git a/src/paddle_ocr/docker-compose.cpu-registry.yml b/src/paddle_ocr/docker-compose.cpu-registry.yml index 1d9246f..550ecd3 100644 --- a/src/paddle_ocr/docker-compose.cpu-registry.yml +++ b/src/paddle_ocr/docker-compose.cpu-registry.yml @@ -9,6 +9,7 @@ services: - "8001:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - paddlex-cache:/root/.paddlex environment: - PYTHONUNBUFFERED=1 diff --git a/src/paddle_ocr/docker-compose.gpu-registry.yml b/src/paddle_ocr/docker-compose.gpu-registry.yml index 6e606c2..bd9b991 100644 --- a/src/paddle_ocr/docker-compose.gpu-registry.yml +++ b/src/paddle_ocr/docker-compose.gpu-registry.yml @@ -11,6 +11,7 @@ services: - "8002:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - paddlex-cache:/root/.paddlex - ./scripts:/app/scripts:ro environment: diff --git a/src/paddle_ocr/docker-compose.workers.yml b/src/paddle_ocr/docker-compose.workers.yml index 222ea82..cada286 100644 --- a/src/paddle_ocr/docker-compose.workers.yml +++ 
b/src/paddle_ocr/docker-compose.workers.yml
@@ -16,6 +16,7 @@ x-ocr-gpu-common: &ocr-gpu-common
   image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
   volumes:
     - ../dataset:/app/dataset:ro
+    - ../debugset:/app/debugset:rw
     - paddlex-cache:/root/.paddlex
   environment:
     - PYTHONUNBUFFERED=1
@@ -39,6 +40,7 @@ x-ocr-cpu-common: &ocr-cpu-common
   image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
   volumes:
     - ../dataset:/app/dataset:ro
+    - ../debugset:/app/debugset:rw
     - paddlex-cache:/root/.paddlex
   environment:
     - PYTHONUNBUFFERED=1
diff --git a/src/paddle_ocr/docker-compose.yml b/src/paddle_ocr/docker-compose.yml
index 22c887b..5641717 100644
--- a/src/paddle_ocr/docker-compose.yml
+++ b/src/paddle_ocr/docker-compose.yml
@@ -45,7 +45,8 @@ services:
     ports:
       - "8000:8000"
     volumes:
-      - ../dataset:/app/dataset:ro # Your dataset
+      - ../dataset:/app/dataset:ro # Your dataset
+      - ../debugset:/app/debugset:rw # Debug output (OCR predictions)
       - paddlex-cache:/root/.paddlex # For additional models at runtime
     environment:
       - PYTHONUNBUFFERED=1
@@ -74,6 +75,7 @@ services:
       - "8000:8000"
     volumes:
       - ../dataset:/app/dataset:ro
+      - ../debugset:/app/debugset:rw
       - paddlex-cache:/root/.paddlex
     environment:
       - PYTHONUNBUFFERED=1
diff --git a/src/paddle_ocr/paddle_ocr_tuning_rest.py b/src/paddle_ocr/paddle_ocr_tuning_rest.py
index 6e836c6..b61ff0e 100644
--- a/src/paddle_ocr/paddle_ocr_tuning_rest.py
+++ b/src/paddle_ocr/paddle_ocr_tuning_rest.py
@@ -127,6 +127,7 @@ class EvaluateRequest(BaseModel):
     text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
     start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
     end_page: int = Field(10, ge=1, description="End page index (exclusive)")
+    save_output: bool = Field(False, description="Save OCR predictions to debugset folder")
 
 
 class EvaluateResponse(BaseModel):
@@ -307,6 +308,12 @@ def evaluate(request: EvaluateRequest):
         pred = assemble_from_paddle_result(out)
         time_per_page_list.append(float(time.time() - tp0))
 
+        # Save prediction to debugset if requested
+        if request.save_output:
+            out_path = state.dataset.get_output_path(idx, "paddle_text")
+            with open(out_path, "w", encoding="utf-8") as f:
+                f.write(pred)
+
         m = evaluate_text(ref, pred)
         cer_list.append(m["CER"])
         wer_list.append(m["WER"])
diff --git a/src/paddle_ocr_raytune_rest.ipynb b/src/paddle_ocr_raytune_rest.ipynb
index 44710b9..f2fe22c 100644
--- a/src/paddle_ocr_raytune_rest.ipynb
+++ b/src/paddle_ocr_raytune_rest.ipynb
@@ -7,263 +7,81 @@
    "source": [
     "# PaddleOCR Hyperparameter Optimization via REST API\n",
     "\n",
-    "This notebook runs Ray Tune hyperparameter search calling the PaddleOCR REST API (Docker container).\n",
+    "Uses Ray Tune + Optuna to find optimal PaddleOCR parameters.\n",
     "\n",
-    "**Benefits:**\n",
-    "- No model reload per trial - Model stays loaded in Docker container\n",
-    "- Faster trials - Skip ~10s model load time per trial\n",
-    "- Cleaner code - REST API replaces subprocess + CLI arg parsing"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "prereq",
-   "metadata": {},
-   "source": [
     "## Prerequisites\n",
     "\n",
-    "Start 2 PaddleOCR workers for parallel hyperparameter tuning:\n",
-    "\n",
     "```bash\n",
     "cd src/paddle_ocr\n",
-    "docker compose -f docker-compose.workers.yml up\n",
-    "```\n",
-    "\n",
-    "This starts 2 GPU workers on ports 8001-8002, allowing 2 concurrent trials.\n",
-    "\n",
-    "For CPU-only systems:\n",
-    "```bash\n",
-    "docker compose -f docker-compose.workers.yml --profile cpu up\n",
+    "docker compose -f docker-compose.workers.yml up # GPU workers on 8001-8002\n",
+    "# 
or: docker compose -f docker-compose.workers.yml --profile cpu up\n", "```" ] }, { - "cell_type": "markdown", - "id": "3ob9fsoilc4", + "cell_type": "code", + "execution_count": null, + "id": "deps", "metadata": {}, + "outputs": [], "source": [ - "## 0. Dependencies" + "%pip install -q -U \"ray[tune]\" optuna requests pandas" ] }, { "cell_type": "code", "execution_count": null, - "id": "wyr2nsoj7", + "id": "setup", "metadata": {}, "outputs": [], "source": [ - "# Install dependencies (run once)\n", - "%pip install -U \"ray[tune]\"\n", - "%pip install optuna\n", - "%pip install requests pandas" - ] - }, - { - "cell_type": "markdown", - "id": "imports-header", - "metadata": {}, - "source": [ - "## 1. Imports & Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "imports", - "metadata": {}, - "outputs": [], - "source": "import os\nfrom datetime import datetime\n\nimport requests\nimport pandas as pd\n\nimport ray\nfrom ray import tune, train\nfrom ray.tune.search.optuna import OptunaSearch" - }, - { - "cell_type": "markdown", - "id": "config-header", - "metadata": {}, - "source": [ - "## 2. API Configuration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "config", - "metadata": {}, - "outputs": [], - "source": [ - "# PaddleOCR REST API endpoints - 2 workers for parallel trials\n", - "# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\n", - "WORKER_PORTS = [8001, 8002]\n", - "WORKER_URLS = [f\"http://localhost:{port}\" for port in WORKER_PORTS]\n", + "from raytune_ocr import (\n", + " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n", + " paddle_ocr_payload, PADDLE_OCR_SEARCH_SPACE, PADDLE_OCR_CONFIG_KEYS,\n", + ")\n", "\n", - "# Output folder for results\n", - "OUTPUT_FOLDER = \"results\"\n", - "os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n", + "# Worker ports\n", + "PORTS = [8001, 8002]\n", "\n", - "# Number of concurrent trials = number of workers\n", - "NUM_WORKERS = len(WORKER_URLS)" + "# Check workers are running\n", + "healthy = check_workers(PORTS, \"PaddleOCR\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "health-check", + "id": "tune", "metadata": {}, "outputs": [], "source": [ - "# Verify all workers are running\n", - "healthy_workers = []\n", - "for url in WORKER_URLS:\n", - " try:\n", - " health = requests.get(f\"{url}/health\", timeout=10).json()\n", - " if health['status'] == 'ok' and health['model_loaded']:\n", - " healthy_workers.append(url)\n", - " print(f\"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})\")\n", - " else:\n", - " print(f\"✗ {url}: not ready yet\")\n", - " except requests.exceptions.ConnectionError:\n", - " print(f\"✗ {url}: not reachable\")\n", + "# Create trainable and run tuning\n", + "trainable = create_trainable(PORTS, paddle_ocr_payload)\n", "\n", - "if not healthy_workers:\n", - " raise RuntimeError(\n", - " \"No healthy workers found. 
Start them with:\\n\"\n", - " \" cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\"\n", - " )\n", + "results = run_tuner(\n", + " trainable=trainable,\n", + " search_space=PADDLE_OCR_SEARCH_SPACE,\n", + " num_samples=64,\n", + " num_workers=len(healthy),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "analysis", + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze results\n", + "df = analyze_results(\n", + " results,\n", + " prefix=\"raytune_paddle\",\n", + " config_keys=PADDLE_OCR_CONFIG_KEYS,\n", + ")\n", "\n", - "print(f\"\\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning\")" - ] - }, - { - "cell_type": "markdown", - "id": "search-space-header", - "metadata": {}, - "source": [ - "## 3. Search Space" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "search-space", - "metadata": {}, - "outputs": [], - "source": [ - "search_space = {\n", - " # Whether to use document image orientation classification\n", - " \"use_doc_orientation_classify\": tune.choice([True, False]),\n", - " # Whether to use text image unwarping\n", - " \"use_doc_unwarping\": tune.choice([True, False]),\n", - " # Whether to use text line orientation classification\n", - " \"textline_orientation\": tune.choice([True, False]),\n", - " # Detection pixel threshold (pixels > threshold are considered text)\n", - " \"text_det_thresh\": tune.uniform(0.0, 0.7),\n", - " # Detection box threshold (average score within border)\n", - " \"text_det_box_thresh\": tune.uniform(0.0, 0.7),\n", - " # Text detection expansion coefficient\n", - " \"text_det_unclip_ratio\": tune.choice([0.0]),\n", - " # Text recognition threshold (filter low confidence results)\n", - " \"text_rec_score_thresh\": tune.uniform(0.0, 0.7),\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "trainable-header", - "metadata": {}, - "source": [ - "## 4. Trainable Function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "trainable", - "metadata": {}, - "outputs": [], - "source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import train\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n train.report(metrics)\n except Exception as e:\n train.report({\n \"CER\": 1.0,\n \"WER\": 1.0,\n \"TIME\": 0.0,\n \"PAGES\": 0,\n \"TIME_PER_PAGE\": 0,\n \"worker\": api_url,\n \"ERROR\": str(e)[:500]\n })" - }, - { - "cell_type": "markdown", - "id": "tuner-header", - "metadata": {}, - "source": [ - "## 5. 
Run Tuner" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ray-init", - "metadata": {}, - "outputs": [], - "source": [ - "ray.init(ignore_reinit_error=True)\n", - "print(f\"Ray Tune ready (version: {ray.__version__})\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "tuner", - "metadata": {}, - "outputs": [], - "source": "tuner = tune.Tuner(\n trainable_paddle_ocr,\n tune_config=tune.TuneConfig(\n metric=\"CER\",\n mode=\"min\",\n search_alg=OptunaSearch(),\n num_samples=64,\n max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n ),\n param_space=search_space,\n)\n\nresults = tuner.fit()" - }, - { - "cell_type": "markdown", - "id": "analysis-header", - "metadata": {}, - "source": [ - "## 6. Results Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "results-df", - "metadata": {}, - "outputs": [], - "source": [ - "df = results.get_dataframe()\n", "df.describe()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "save-results", - "metadata": {}, - "outputs": [], - "source": [ - "# Save results to CSV\n", - "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", - "filename = f\"raytune_paddle_rest_results_{timestamp}.csv\"\n", - "filepath = os.path.join(OUTPUT_FOLDER, filename)\n", - "\n", - "df.to_csv(filepath, index=False)\n", - "print(f\"Results saved: {filepath}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "best-config", - "metadata": {}, - "outputs": [], - "source": [ - "# Best configuration\n", - "best = df.loc[df[\"CER\"].idxmin()]\n", - "\n", - "print(f\"Best CER: {best['CER']:.6f}\")\n", - "print(f\"Best WER: {best['WER']:.6f}\")\n", - "print(f\"\\nOptimal Configuration:\")\n", - "print(f\" textline_orientation: {best['config/textline_orientation']}\")\n", - "print(f\" use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}\")\n", - "print(f\" use_doc_unwarping: {best['config/use_doc_unwarping']}\")\n", - "print(f\" text_det_thresh: {best['config/text_det_thresh']:.4f}\")\n", - "print(f\" text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}\")\n", - "print(f\" text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}\")\n", - "print(f\" text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -272,42 +90,21 @@ "outputs": [], "source": [ "# Correlation analysis\n", - "param_cols = [\n", - " \"config/text_det_thresh\",\n", - " \"config/text_det_box_thresh\",\n", - " \"config/text_det_unclip_ratio\",\n", - " \"config/text_rec_score_thresh\",\n", - "]\n", - "\n", - "corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n", - "corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n", - "\n", - "print(\"Correlation with CER:\")\n", - "print(corr_cer)\n", - "print(\"\\nCorrelation with WER:\")\n", - "print(corr_wer)" + "correlation_analysis(df, PADDLE_OCR_CONFIG_KEYS)" ] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/src/raytune_ocr.py 
b/src/raytune_ocr.py
new file mode 100644
index 0000000..c1f53c7
--- /dev/null
+++ b/src/raytune_ocr.py
@@ -0,0 +1,333 @@
+# raytune_ocr.py
+# Shared Ray Tune utilities for OCR hyperparameter optimization
+#
+# Usage:
+#   from raytune_ocr import check_workers, create_trainable, run_tuner, analyze_results
+
+import os
+from datetime import datetime
+from typing import List, Dict, Any, Callable, Optional
+
+import requests
+import pandas as pd
+
+import ray
+from ray import tune, train
+from ray.tune.search.optuna import OptunaSearch
+
+
+def check_workers(ports: List[int], service_name: str = "OCR") -> List[str]:
+    """
+    Verify workers are running and return healthy URLs.
+
+    Args:
+        ports: List of port numbers to check
+        service_name: Name for error messages
+
+    Returns:
+        List of healthy worker URLs
+
+    Raises:
+        RuntimeError if no healthy workers found
+    """
+    worker_urls = [f"http://localhost:{port}" for port in ports]
+    healthy_workers = []
+
+    for url in worker_urls:
+        try:
+            health = requests.get(f"{url}/health", timeout=10).json()
+            if health.get('status') == 'ok' and health.get('model_loaded'):
+                healthy_workers.append(url)
+                gpu = health.get('gpu_name', 'CPU')
+                print(f"✓ {url}: {health['status']} ({gpu})")
+            else:
+                print(f"✗ {url}: not ready yet")
+        except requests.exceptions.RequestException:  # covers refused connections and timeouts
+            print(f"✗ {url}: not reachable")
+
+    if not healthy_workers:
+        raise RuntimeError(
+            f"No healthy {service_name} workers found.\n"
+            f"Checked ports: {ports}"
+        )
+
+    print(f"\n{len(healthy_workers)}/{len(worker_urls)} workers ready")
+    return healthy_workers
+
+
+def create_trainable(ports: List[int], payload_fn: Callable[[Dict], Dict]) -> Callable:
+    """
+    Factory to create a trainable function for Ray Tune.
+
+    Args:
+        ports: List of worker ports for load balancing
+        payload_fn: Function that takes config dict and returns API payload dict
+
+    Returns:
+        Trainable function for Ray Tune
+    """
+    def trainable(config):
+        import random
+        import requests
+        from ray import train
+
+        api_url = f"http://localhost:{random.choice(ports)}"
+        payload = payload_fn(config)
+
+        try:
+            response = requests.post(f"{api_url}/evaluate", json=payload, timeout=None)
+            response.raise_for_status()
+            metrics = response.json()
+            metrics["worker"] = api_url
+            train.report(metrics)
+        except Exception as e:
+            # Report worst-case metrics so a failed trial does not abort the whole run
+            train.report({
+                "CER": 1.0,
+                "WER": 1.0,
+                "TIME": 0.0,
+                "PAGES": 0,
+                "TIME_PER_PAGE": 0,
+                "worker": api_url,
+                "ERROR": str(e)[:500]
+            })
+
+    return trainable
+
+
+def run_tuner(
+    trainable: Callable,
+    search_space: Dict[str, Any],
+    num_samples: int = 64,
+    num_workers: int = 1,
+    metric: str = "CER",
+    mode: str = "min",
+) -> tune.ResultGrid:
+    """
+    Initialize Ray and run hyperparameter tuning.
+
+    Args:
+        trainable: Trainable function from create_trainable()
+        search_space: Dict of parameter names to tune.* search spaces
+        num_samples: Number of trials to run
+        num_workers: Max concurrent trials
+        metric: Metric to optimize
+        mode: "min" or "max"
+
+    Returns:
+        Ray Tune ResultGrid
+    """
+    ray.init(ignore_reinit_error=True, include_dashboard=False)
+    print(f"Ray Tune ready (version: {ray.__version__})")
+
+    tuner = tune.Tuner(
+        trainable,
+        tune_config=tune.TuneConfig(
+            metric=metric,
+            mode=mode,
+            search_alg=OptunaSearch(),
+            num_samples=num_samples,
+            max_concurrent_trials=num_workers,
+        ),
+        param_space=search_space,
+    )
+
+    return tuner.fit()
+
+
+def analyze_results(
+    results: tune.ResultGrid,
+    output_folder: str = "results",
+    prefix: str = "raytune",
+    config_keys: Optional[List[str]] = None,
+) -> pd.DataFrame:
+    """
+    Analyze and save tuning results.
+
+    Args:
+        results: Ray Tune ResultGrid
+        output_folder: Directory to save CSV
+        prefix: Filename prefix
+        config_keys: List of config keys to show in best result (without 'config/' prefix)
+
+    Returns:
+        Results DataFrame
+    """
+    os.makedirs(output_folder, exist_ok=True)
+    df = results.get_dataframe()
+
+    # Save to CSV
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = f"{prefix}_results_{timestamp}.csv"
+    filepath = os.path.join(output_folder, filename)
+    df.to_csv(filepath, index=False)
+    print(f"Results saved: {filepath}")
+
+    # Best configuration
+    best = df.loc[df["CER"].idxmin()]
+    print(f"\nBest CER: {best['CER']:.6f}")
+    print(f"Best WER: {best['WER']:.6f}")
+
+    if config_keys:
+        print("\nOptimal Configuration:")
+        for key in config_keys:
+            col = f"config/{key}"
+            if col in best:
+                val = best[col]
+                if isinstance(val, float):
+                    print(f"  {key}: {val:.4f}")
+                else:
+                    print(f"  {key}: {val}")
+
+    return df
+
+
+def correlation_analysis(df: pd.DataFrame, param_keys: List[str]) -> None:
+    """
+    Print correlation of numeric parameters with CER/WER.
+ + Args: + df: Results DataFrame + param_keys: List of config keys (without 'config/' prefix) + """ + param_cols = [f"config/{k}" for k in param_keys if f"config/{k}" in df.columns] + numeric_cols = [c for c in param_cols if df[c].dtype in ['float64', 'int64']] + + if not numeric_cols: + print("No numeric parameters for correlation analysis") + return + + corr_cer = df[numeric_cols + ["CER"]].corr()["CER"].sort_values(ascending=False) + corr_wer = df[numeric_cols + ["WER"]].corr()["WER"].sort_values(ascending=False) + + print("Correlation with CER:") + print(corr_cer) + print("\nCorrelation with WER:") + print(corr_wer) + + +# ============================================================================= +# OCR-specific payload functions +# ============================================================================= + +def paddle_ocr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict: + """Create payload for PaddleOCR API.""" + return { + "pdf_folder": "/app/dataset", + "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False), + "use_doc_unwarping": config.get("use_doc_unwarping", False), + "textline_orientation": config.get("textline_orientation", True), + "text_det_thresh": config.get("text_det_thresh", 0.0), + "text_det_box_thresh": config.get("text_det_box_thresh", 0.0), + "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5), + "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0), + "start_page": start_page, + "end_page": end_page, + "save_output": save_output, + } + + +def doctr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict: + """Create payload for DocTR API.""" + return { + "pdf_folder": "/app/dataset", + "assume_straight_pages": config.get("assume_straight_pages", True), + "straighten_pages": config.get("straighten_pages", False), + "preserve_aspect_ratio": config.get("preserve_aspect_ratio", True), + "symmetric_pad": config.get("symmetric_pad", True), + "disable_page_orientation": config.get("disable_page_orientation", False), + "disable_crop_orientation": config.get("disable_crop_orientation", False), + "resolve_lines": config.get("resolve_lines", True), + "resolve_blocks": config.get("resolve_blocks", False), + "paragraph_break": config.get("paragraph_break", 0.035), + "start_page": start_page, + "end_page": end_page, + "save_output": save_output, + } + + +def easyocr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict: + """Create payload for EasyOCR API.""" + return { + "pdf_folder": "/app/dataset", + "text_threshold": config.get("text_threshold", 0.7), + "low_text": config.get("low_text", 0.4), + "link_threshold": config.get("link_threshold", 0.4), + "slope_ths": config.get("slope_ths", 0.1), + "ycenter_ths": config.get("ycenter_ths", 0.5), + "height_ths": config.get("height_ths", 0.5), + "width_ths": config.get("width_ths", 0.5), + "add_margin": config.get("add_margin", 0.1), + "contrast_ths": config.get("contrast_ths", 0.1), + "adjust_contrast": config.get("adjust_contrast", 0.5), + "decoder": config.get("decoder", "greedy"), + "beamWidth": config.get("beamWidth", 5), + "min_size": config.get("min_size", 10), + "start_page": start_page, + "end_page": end_page, + "save_output": save_output, + } + + +# ============================================================================= +# Search spaces +# ============================================================================= + 
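+# Each entry maps one REST parameter to a Ray Tune distribution:
+# tune.choice([...]) samples from a discrete set, tune.uniform(a, b) samples a
+# float from [a, b); OptunaSearch uses completed trials to bias future samples.
+#
+# Illustrative sketch (not used by the notebooks): resolving one config by hand,
+# roughly as Ray Tune does per trial. Domain objects expose .sample(), e.g. for
+# the PaddleOCR space defined below:
+#
+#   config = {k: v.sample() for k, v in PADDLE_OCR_SEARCH_SPACE.items()}
+#   payload = paddle_ocr_payload(config)  # dict ready to POST to /evaluate
+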
+PADDLE_OCR_SEARCH_SPACE = { + "use_doc_orientation_classify": tune.choice([True, False]), + "use_doc_unwarping": tune.choice([True, False]), + "textline_orientation": tune.choice([True, False]), + "text_det_thresh": tune.uniform(0.0, 0.7), + "text_det_box_thresh": tune.uniform(0.0, 0.7), + "text_det_unclip_ratio": tune.choice([0.0]), + "text_rec_score_thresh": tune.uniform(0.0, 0.7), +} + +DOCTR_SEARCH_SPACE = { + "assume_straight_pages": tune.choice([True, False]), + "straighten_pages": tune.choice([True, False]), + "preserve_aspect_ratio": tune.choice([True, False]), + "symmetric_pad": tune.choice([True, False]), + "disable_page_orientation": tune.choice([True, False]), + "disable_crop_orientation": tune.choice([True, False]), + "resolve_lines": tune.choice([True, False]), + "resolve_blocks": tune.choice([True, False]), + "paragraph_break": tune.uniform(0.01, 0.1), +} + +EASYOCR_SEARCH_SPACE = { + "text_threshold": tune.uniform(0.3, 0.9), + "low_text": tune.uniform(0.2, 0.6), + "link_threshold": tune.uniform(0.2, 0.6), + "slope_ths": tune.uniform(0.0, 0.3), + "ycenter_ths": tune.uniform(0.3, 1.0), + "height_ths": tune.uniform(0.3, 1.0), + "width_ths": tune.uniform(0.3, 1.0), + "add_margin": tune.uniform(0.0, 0.3), + "contrast_ths": tune.uniform(0.05, 0.3), + "adjust_contrast": tune.uniform(0.3, 0.8), + "decoder": tune.choice(["greedy", "beamsearch"]), + "beamWidth": tune.choice([3, 5, 7, 10]), + "min_size": tune.choice([5, 10, 15, 20]), +} + + +# ============================================================================= +# Config keys for results display +# ============================================================================= + +PADDLE_OCR_CONFIG_KEYS = [ + "use_doc_orientation_classify", "use_doc_unwarping", "textline_orientation", + "text_det_thresh", "text_det_box_thresh", "text_det_unclip_ratio", "text_rec_score_thresh", +] + +DOCTR_CONFIG_KEYS = [ + "assume_straight_pages", "straighten_pages", "preserve_aspect_ratio", "symmetric_pad", + "disable_page_orientation", "disable_crop_orientation", "resolve_lines", "resolve_blocks", + "paragraph_break", +] + +EASYOCR_CONFIG_KEYS = [ + "text_threshold", "low_text", "link_threshold", "slope_ths", "ycenter_ths", + "height_ths", "width_ths", "add_margin", "contrast_ths", "adjust_contrast", + "decoder", "beamWidth", "min_size", +]
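+
+
+# =============================================================================
+# Example: end-to-end usage (illustrative sketch, mirroring the notebooks;
+# assumes PaddleOCR workers are already serving /evaluate on ports 8001-8002)
+# =============================================================================
+#
+# if __name__ == "__main__":
+#     ports = [8001, 8002]
+#     healthy = check_workers(ports, "PaddleOCR")
+#     trainable = create_trainable(ports, paddle_ocr_payload)
+#     results = run_tuner(trainable, PADDLE_OCR_SEARCH_SPACE,
+#                         num_samples=64, num_workers=len(healthy))
+#     df = analyze_results(results, prefix="raytune_paddle",
+#                          config_keys=PADDLE_OCR_CONFIG_KEYS)
+#     correlation_analysis(df, PADDLE_OCR_CONFIG_KEYS)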