diff --git a/.gitignore b/.gitignore
index 0098713..1eb7d2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,5 @@ results
node_modules
src/paddle_ocr/wheels
src/*.log
+src/output_*.ipynb
+debugset/
diff --git a/src/dataset_manager.py b/src/dataset_manager.py
index 2d3ccac..e9ea973 100644
--- a/src/dataset_manager.py
+++ b/src/dataset_manager.py
@@ -42,4 +42,33 @@ class ImageTextDataset:
with open(txt_path, "r", encoding="utf-8") as f:
text = f.read()
- return image, text
\ No newline at end of file
+ return image, text
+
+ def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"):
+ """Get output path for saving OCR result to debugset folder.
+
+ Args:
+ idx: Sample index
+ output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text')
+ debugset_root: Root folder for debug output (default: /app/debugset)
+
+ Returns:
+ Path like /app/debugset/doc1/{output_subdir}/page_001.txt
+ """
+ img_path, _ = self.samples[idx]
+ # img_path: /app/dataset/doc1/img/page_001.png
+ # Extract relative path: doc1/img/page_001.png
+ parts = img_path.split("/dataset/", 1)
+ if len(parts) == 2:
+ rel_path = parts[1] # doc1/img/page_001.png
+ else:
+ rel_path = os.path.basename(img_path)
+
+ # Replace /img/ with /{output_subdir}/; tolerate paths without an /img/ segment
+ rel_parts = rel_path.rsplit("/img/", 1)
+ doc_folder = rel_parts[0] if len(rel_parts) == 2 else "" # doc1
+ fname = os.path.splitext(rel_parts[-1])[0] + ".txt" # page_001.txt
+
+ out_dir = os.path.join(debugset_root, doc_folder, output_subdir)
+ os.makedirs(out_dir, exist_ok=True)
+ return os.path.join(out_dir, fname)
\ No newline at end of file
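The helper encodes a fixed path convention. A minimal sketch of the mapping it performs, with illustrative paths that are not part of the patch:

    # Illustrative walk-through of get_output_path's mapping (example paths only).
    img_path = "/app/dataset/doc1/img/page_001.png"

    rel_path = img_path.split("/dataset/", 1)[1]    # doc1/img/page_001.png
    doc_folder, page = rel_path.rsplit("/img/", 1)  # 'doc1', 'page_001.png'
    fname = page.rsplit(".", 1)[0] + ".txt"         # page_001.txt

    print(f"/app/debugset/{doc_folder}/paddle_text/{fname}")
    # -> /app/debugset/doc1/paddle_text/page_001.txt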
diff --git a/src/doctr_raytune_rest.ipynb b/src/doctr_raytune_rest.ipynb
new file mode 100644
index 0000000..aafd28f
--- /dev/null
+++ b/src/doctr_raytune_rest.ipynb
@@ -0,0 +1,111 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "header",
+ "metadata": {},
+ "source": [
+ "# DocTR Hyperparameter Optimization via REST API\n",
+ "\n",
+ "Uses Ray Tune + Optuna to find optimal DocTR parameters.\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "```bash\n",
+ "cd src/doctr_service\n",
+ "docker compose up ocr-cpu # or ocr-gpu\n",
+ "```\n",
+ "\n",
+ "Service runs on port 8003."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "deps",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install -q -U \"ray[tune]\" optuna requests pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from raytune_ocr import (\n",
+ " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n",
+ " doctr_payload, DOCTR_SEARCH_SPACE, DOCTR_CONFIG_KEYS,\n",
+ ")\n",
+ "\n",
+ "# Worker ports\n",
+ "PORTS = [8003]\n",
+ "\n",
+ "# Check workers are running\n",
+ "healthy = check_workers(PORTS, \"DocTR\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "tune",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create trainable and run tuning\n",
+ "trainable = create_trainable(PORTS, doctr_payload)\n",
+ "\n",
+ "results = run_tuner(\n",
+ " trainable=trainable,\n",
+ " search_space=DOCTR_SEARCH_SPACE,\n",
+ " num_samples=64,\n",
+ " num_workers=len(healthy),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "analysis",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Analyze results\n",
+ "df = analyze_results(\n",
+ " results,\n",
+ " prefix=\"raytune_doctr\",\n",
+ " config_keys=DOCTR_CONFIG_KEYS,\n",
+ ")\n",
+ "\n",
+ "df.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "correlation",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Correlation analysis\n",
+ "correlation_analysis(df, DOCTR_CONFIG_KEYS)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
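raytune_ocr itself is not part of this diff; assuming its check_workers probes the same /health endpoint the deleted output_raytune.ipynb used for the paddle workers, an inline equivalent would look roughly like this:

    # Rough stand-in for raytune_ocr.check_workers (an assumption: that module is
    # not in this patch; the /health probe mirrors the deleted notebook's check).
    import requests

    def worker_ready(port: int) -> bool:
        try:
            health = requests.get(f"http://localhost:{port}/health", timeout=10).json()
            return health.get("status") == "ok" and health.get("model_loaded", False)
        except requests.exceptions.ConnectionError:
            return False

    healthy = [port for port in [8003] if worker_ready(port)]
    print(f"{len(healthy)} worker(s) ready")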
diff --git a/src/doctr_service/dataset_manager.py b/src/doctr_service/dataset_manager.py
index 2d3ccac..e9ea973 100644
--- a/src/doctr_service/dataset_manager.py
+++ b/src/doctr_service/dataset_manager.py
@@ -42,4 +42,33 @@ class ImageTextDataset:
with open(txt_path, "r", encoding="utf-8") as f:
text = f.read()
- return image, text
\ No newline at end of file
+ return image, text
+
+ def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"):
+ """Get output path for saving OCR result to debugset folder.
+
+ Args:
+ idx: Sample index
+ output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text')
+ debugset_root: Root folder for debug output (default: /app/debugset)
+
+ Returns:
+ Path like /app/debugset/doc1/{output_subdir}/page_001.txt
+ """
+ img_path, _ = self.samples[idx]
+ # img_path: /app/dataset/doc1/img/page_001.png
+ # Extract relative path: doc1/img/page_001.png
+ parts = img_path.split("/dataset/", 1)
+ if len(parts) == 2:
+ rel_path = parts[1] # doc1/img/page_001.png
+ else:
+ rel_path = os.path.basename(img_path)
+
+ # Replace /img/ with /{output_subdir}/; tolerate paths without an /img/ segment
+ rel_parts = rel_path.rsplit("/img/", 1)
+ doc_folder = rel_parts[0] if len(rel_parts) == 2 else "" # doc1
+ fname = os.path.splitext(rel_parts[-1])[0] + ".txt" # page_001.txt
+
+ out_dir = os.path.join(debugset_root, doc_folder, output_subdir)
+ os.makedirs(out_dir, exist_ok=True)
+ return os.path.join(out_dir, fname)
\ No newline at end of file
diff --git a/src/doctr_service/docker-compose.yml b/src/doctr_service/docker-compose.yml
index 710f72b..f16c931 100644
--- a/src/doctr_service/docker-compose.yml
+++ b/src/doctr_service/docker-compose.yml
@@ -14,6 +14,7 @@ services:
- "8003:8000"
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- doctr-cache:/root/.cache/doctr
environment:
- PYTHONUNBUFFERED=1
@@ -35,6 +36,7 @@ services:
- "8003:8000"
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- doctr-cache:/root/.cache/doctr
environment:
- PYTHONUNBUFFERED=1
diff --git a/src/doctr_service/doctr_tuning_rest.py b/src/doctr_service/doctr_tuning_rest.py
index 4ef3928..9385f43 100644
--- a/src/doctr_service/doctr_tuning_rest.py
+++ b/src/doctr_service/doctr_tuning_rest.py
@@ -169,6 +169,7 @@ class EvaluateRequest(BaseModel):
# Page range
start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
end_page: int = Field(10, ge=1, description="End page index (exclusive)")
+ save_output: bool = Field(False, description="Save OCR predictions to debugset folder")
class EvaluateResponse(BaseModel):
@@ -302,6 +303,12 @@ def evaluate(request: EvaluateRequest):
)
time_per_page_list.append(float(time.time() - tp0))
+ # Save prediction to debugset if requested
+ if request.save_output:
+ out_path = state.dataset.get_output_path(idx, "doctr_text")
+ with open(out_path, "w", encoding="utf-8") as f:
+ f.write(pred)
+
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
diff --git a/src/easyocr_raytune_rest.ipynb b/src/easyocr_raytune_rest.ipynb
new file mode 100644
index 0000000..723f97f
--- /dev/null
+++ b/src/easyocr_raytune_rest.ipynb
@@ -0,0 +1,111 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "header",
+ "metadata": {},
+ "source": [
+ "# EasyOCR Hyperparameter Optimization via REST API\n",
+ "\n",
+ "Uses Ray Tune + Optuna to find optimal EasyOCR parameters.\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "```bash\n",
+ "cd src/easyocr_service\n",
+ "docker compose up ocr-cpu # or ocr-gpu\n",
+ "```\n",
+ "\n",
+ "Service runs on port 8002."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "deps",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install -q -U \"ray[tune]\" optuna requests pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from raytune_ocr import (\n",
+ " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n",
+ " easyocr_payload, EASYOCR_SEARCH_SPACE, EASYOCR_CONFIG_KEYS,\n",
+ ")\n",
+ "\n",
+ "# Worker ports\n",
+ "PORTS = [8002]\n",
+ "\n",
+ "# Check workers are running\n",
+ "healthy = check_workers(PORTS, \"EasyOCR\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "tune",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create trainable and run tuning\n",
+ "trainable = create_trainable(PORTS, easyocr_payload)\n",
+ "\n",
+ "results = run_tuner(\n",
+ " trainable=trainable,\n",
+ " search_space=EASYOCR_SEARCH_SPACE,\n",
+ " num_samples=64,\n",
+ " num_workers=len(healthy),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "analysis",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Analyze results\n",
+ "df = analyze_results(\n",
+ " results,\n",
+ " prefix=\"raytune_easyocr\",\n",
+ " config_keys=EASYOCR_CONFIG_KEYS,\n",
+ ")\n",
+ "\n",
+ "df.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "correlation",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Correlation analysis\n",
+ "correlation_analysis(df, EASYOCR_CONFIG_KEYS)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/easyocr_service/dataset_manager.py b/src/easyocr_service/dataset_manager.py
index 2d3ccac..e9ea973 100644
--- a/src/easyocr_service/dataset_manager.py
+++ b/src/easyocr_service/dataset_manager.py
@@ -42,4 +42,33 @@ class ImageTextDataset:
with open(txt_path, "r", encoding="utf-8") as f:
text = f.read()
- return image, text
\ No newline at end of file
+ return image, text
+
+ def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"):
+ """Get output path for saving OCR result to debugset folder.
+
+ Args:
+ idx: Sample index
+ output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text')
+ debugset_root: Root folder for debug output (default: /app/debugset)
+
+ Returns:
+ Path like /app/debugset/doc1/{output_subdir}/page_001.txt
+ """
+ img_path, _ = self.samples[idx]
+ # img_path: /app/dataset/doc1/img/page_001.png
+ # Extract relative path: doc1/img/page_001.png
+ parts = img_path.split("/dataset/", 1)
+ if len(parts) == 2:
+ rel_path = parts[1] # doc1/img/page_001.png
+ else:
+ rel_path = os.path.basename(img_path)
+
+ # Replace /img/ with /{output_subdir}/; tolerate paths without an /img/ segment
+ rel_parts = rel_path.rsplit("/img/", 1)
+ doc_folder = rel_parts[0] if len(rel_parts) == 2 else "" # doc1
+ fname = os.path.splitext(rel_parts[-1])[0] + ".txt" # page_001.txt
+
+ out_dir = os.path.join(debugset_root, doc_folder, output_subdir)
+ os.makedirs(out_dir, exist_ok=True)
+ return os.path.join(out_dir, fname)
\ No newline at end of file
diff --git a/src/easyocr_service/docker-compose.yml b/src/easyocr_service/docker-compose.yml
index 0b1b085..550e865 100644
--- a/src/easyocr_service/docker-compose.yml
+++ b/src/easyocr_service/docker-compose.yml
@@ -14,6 +14,7 @@ services:
- "8002:8000"
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- easyocr-cache:/root/.EasyOCR
environment:
- PYTHONUNBUFFERED=1
@@ -34,6 +35,7 @@ services:
- "8002:8000"
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- easyocr-cache:/root/.EasyOCR
environment:
- PYTHONUNBUFFERED=1
diff --git a/src/easyocr_service/easyocr_tuning_rest.py b/src/easyocr_service/easyocr_tuning_rest.py
index 5fa6cd5..dd1b565 100644
--- a/src/easyocr_service/easyocr_tuning_rest.py
+++ b/src/easyocr_service/easyocr_tuning_rest.py
@@ -133,6 +133,7 @@ class EvaluateRequest(BaseModel):
# Page range
start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
end_page: int = Field(10, ge=1, description="End page index (exclusive)")
+ save_output: bool = Field(False, description="Save OCR predictions to debugset folder")
class EvaluateResponse(BaseModel):
@@ -301,6 +302,12 @@ def evaluate(request: EvaluateRequest):
pred = assemble_easyocr_result(result)
time_per_page_list.append(float(time.time() - tp0))
+ # Save prediction to debugset if requested
+ if request.save_output:
+ out_path = state.dataset.get_output_path(idx, "easyocr_text")
+ with open(out_path, "w", encoding="utf-8") as f:
+ f.write(pred)
+
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
diff --git a/src/output_raytune.ipynb b/src/output_raytune.ipynb
deleted file mode 100644
index 7230e7e..0000000
--- a/src/output_raytune.ipynb
+++ /dev/null
@@ -1,1037 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "header",
- "metadata": {
- "papermill": {
- "duration": 0.002022,
- "end_time": "2026-01-18T16:25:38.048417",
- "exception": false,
- "start_time": "2026-01-18T16:25:38.046395",
- "status": "completed"
- },
- "tags": []
- },
- "source": [
- "# PaddleOCR Hyperparameter Optimization via REST API\n",
- "\n",
- "This notebook runs Ray Tune hyperparameter search calling the PaddleOCR REST API (Docker container).\n",
- "\n",
- "**Benefits:**\n",
- "- No model reload per trial - Model stays loaded in Docker container\n",
- "- Faster trials - Skip ~10s model load time per trial\n",
- "- Cleaner code - REST API replaces subprocess + CLI arg parsing"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "prereq",
- "metadata": {
- "papermill": {
- "duration": 0.000855,
- "end_time": "2026-01-18T16:25:38.058911",
- "exception": false,
- "start_time": "2026-01-18T16:25:38.058056",
- "status": "completed"
- },
- "tags": []
- },
- "source": [
- "## Prerequisites\n",
- "\n",
- "Start 2 PaddleOCR workers for parallel hyperparameter tuning:\n",
- "\n",
- "```bash\n",
- "cd src/paddle_ocr\n",
- "docker compose -f docker-compose.workers.yml up\n",
- "```\n",
- "\n",
- "This starts 2 GPU workers on ports 8001-8002, allowing 2 concurrent trials.\n",
- "\n",
- "For CPU-only systems:\n",
- "```bash\n",
- "docker compose -f docker-compose.workers.yml --profile cpu up\n",
- "```"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3ob9fsoilc4",
- "metadata": {
- "papermill": {
- "duration": 0.000846,
- "end_time": "2026-01-18T16:25:38.060620",
- "exception": false,
- "start_time": "2026-01-18T16:25:38.059774",
- "status": "completed"
- },
- "tags": []
- },
- "source": [
- "## 0. Dependencies"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "wyr2nsoj7",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-01-18T16:25:38.063421Z",
- "iopub.status.busy": "2026-01-18T16:25:38.063287Z",
- "iopub.status.idle": "2026-01-18T16:25:39.300678Z",
- "shell.execute_reply": "2026-01-18T16:25:39.299298Z"
- },
- "papermill": {
- "duration": 1.240519,
- "end_time": "2026-01-18T16:25:39.301973",
- "exception": false,
- "start_time": "2026-01-18T16:25:38.061454",
- "status": "completed"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: ray[tune] in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.53.0)\r\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: click>=7.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (8.3.1)\r\n",
- "Requirement already satisfied: filelock in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (3.20.3)\r\n",
- "Requirement already satisfied: jsonschema in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (4.26.0)\r\n",
- "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (1.1.2)\r\n",
- "Requirement already satisfied: packaging>=24.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (25.0)\r\n",
- "Requirement already satisfied: protobuf>=3.20.3 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (6.33.4)\r\n",
- "Requirement already satisfied: pyyaml in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (6.0.3)\r\n",
- "Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.32.5)\r\n",
- "Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.3.3)\r\n",
- "Requirement already satisfied: pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.12.5)\r\n",
- "Requirement already satisfied: tensorboardX>=1.9 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.6.4)\r\n",
- "Requirement already satisfied: pyarrow>=9.0.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (22.0.0)\r\n",
- "Requirement already satisfied: fsspec in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2026.1.0)\r\n",
- "Requirement already satisfied: annotated-types>=0.6.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.7.0)\r\n",
- "Requirement already satisfied: pydantic-core==2.41.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (2.41.5)\r\n",
- "Requirement already satisfied: typing-extensions>=4.14.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (4.15.0)\r\n",
- "Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n",
- "Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from tensorboardX>=1.9->ray[tune]) (2.4.1)\r\n",
- "Requirement already satisfied: attrs>=22.2.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (25.4.0)\r\n",
- "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (2025.9.1)\r\n",
- "Requirement already satisfied: referencing>=0.28.4 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (0.37.0)\r\n",
- "Requirement already satisfied: rpds-py>=0.25.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (0.30.0)\r\n",
- "Requirement already satisfied: python-dateutil>=2.8.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2.9.0.post0)\r\n",
- "Requirement already satisfied: pytz>=2020.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2025.2)\r\n",
- "Requirement already satisfied: tzdata>=2022.7 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2025.3)\r\n",
- "Requirement already satisfied: six>=1.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->ray[tune]) (1.17.0)\r\n",
- "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (3.4.4)\r\n",
- "Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (3.11)\r\n",
- "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (2.6.3)\r\n",
- "Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (2026.1.4)\r\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Note: you may need to restart the kernel to use updated packages.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: optuna in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (4.6.0)\r\n",
- "Requirement already satisfied: alembic>=1.5.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (1.18.1)\r\n",
- "Requirement already satisfied: colorlog in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (6.10.1)\r\n",
- "Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (2.4.1)\r\n",
- "Requirement already satisfied: packaging>=20.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (25.0)\r\n",
- "Requirement already satisfied: sqlalchemy>=1.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (2.0.45)\r\n",
- "Requirement already satisfied: tqdm in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (4.67.1)\r\n",
- "Requirement already satisfied: PyYAML in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (6.0.3)\r\n",
- "Requirement already satisfied: Mako in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from alembic>=1.5.0->optuna) (1.3.10)\r\n",
- "Requirement already satisfied: typing-extensions>=4.12 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from alembic>=1.5.0->optuna) (4.15.0)\r\n",
- "Requirement already satisfied: greenlet>=1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from sqlalchemy>=1.4.2->optuna) (3.3.0)\r\n",
- "Requirement already satisfied: MarkupSafe>=0.9.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from Mako->alembic>=1.5.0->optuna) (3.0.3)\r\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Note: you may need to restart the kernel to use updated packages.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.32.5)\r\n",
- "Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.3.3)\r\n",
- "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.11)\r\n",
- "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2.6.3)\r\n",
- "Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2026.1.4)\r\n",
- "Requirement already satisfied: numpy>=1.26.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2.4.1)\r\n",
- "Requirement already satisfied: python-dateutil>=2.8.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\r\n",
- "Requirement already satisfied: pytz>=2020.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2025.2)\r\n",
- "Requirement already satisfied: tzdata>=2022.7 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2025.3)\r\n",
- "Requirement already satisfied: six>=1.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\r\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Note: you may need to restart the kernel to use updated packages.\n"
- ]
- }
- ],
- "source": [
- "# Install dependencies (run once)\n",
- "%pip install -U \"ray[tune]\"\n",
- "%pip install optuna\n",
- "%pip install requests pandas"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "imports-header",
- "metadata": {
- "papermill": {
- "duration": 0.009444,
- "end_time": "2026-01-18T16:25:39.312980",
- "exception": false,
- "start_time": "2026-01-18T16:25:39.303536",
- "status": "completed"
- },
- "tags": []
- },
- "source": [
- "## 1. Imports & Setup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "imports",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-01-18T16:25:39.316439Z",
- "iopub.status.busy": "2026-01-18T16:25:39.316230Z",
- "iopub.status.idle": "2026-01-18T16:25:40.277894Z",
- "shell.execute_reply": "2026-01-18T16:25:40.277012Z"
- },
- "papermill": {
- "duration": 0.964409,
- "end_time": "2026-01-18T16:25:40.278450",
- "exception": false,
- "start_time": "2026-01-18T16:25:39.314041",
- "status": "completed"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "import os\n",
- "from datetime import datetime\n",
- "\n",
- "import requests\n",
- "import pandas as pd\n",
- "\n",
- "import ray\n",
- "from ray import tune, train\n",
- "from ray.tune.search.optuna import OptunaSearch"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "config-header",
- "metadata": {
- "papermill": {
- "duration": 0.009552,
- "end_time": "2026-01-18T16:25:40.289551",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.279999",
- "status": "completed"
- },
- "tags": []
- },
- "source": [
- "## 2. API Configuration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "config",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-01-18T16:25:40.292573Z",
- "iopub.status.busy": "2026-01-18T16:25:40.292489Z",
- "iopub.status.idle": "2026-01-18T16:25:40.294713Z",
- "shell.execute_reply": "2026-01-18T16:25:40.294164Z"
- },
- "papermill": {
- "duration": 0.004591,
- "end_time": "2026-01-18T16:25:40.295202",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.290611",
- "status": "completed"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "# PaddleOCR REST API endpoints - 2 workers for parallel trials\n",
- "# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\n",
- "WORKER_PORTS = [8001, 8002]\n",
- "WORKER_URLS = [f\"http://localhost:{port}\" for port in WORKER_PORTS]\n",
- "\n",
- "# Output folder for results\n",
- "OUTPUT_FOLDER = \"results\"\n",
- "os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n",
- "\n",
- "# Number of concurrent trials = number of workers\n",
- "NUM_WORKERS = len(WORKER_URLS)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "health-check",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-01-18T16:25:40.298281Z",
- "iopub.status.busy": "2026-01-18T16:25:40.298161Z",
- "iopub.status.idle": "2026-01-18T16:25:40.306720Z",
- "shell.execute_reply": "2026-01-18T16:25:40.306262Z"
- },
- "papermill": {
- "duration": 0.010723,
- "end_time": "2026-01-18T16:25:40.307025",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.296302",
- "status": "completed"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "✓ http://localhost:8001: ok (GPU: None)\n",
- "✓ http://localhost:8002: ok (GPU: None)\n",
- "\n",
- "2/2 workers ready for parallel tuning\n"
- ]
- }
- ],
- "source": [
- "# Verify all workers are running\n",
- "healthy_workers = []\n",
- "for url in WORKER_URLS:\n",
- " try:\n",
- " health = requests.get(f\"{url}/health\", timeout=10).json()\n",
- " if health['status'] == 'ok' and health['model_loaded']:\n",
- " healthy_workers.append(url)\n",
- " print(f\"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})\")\n",
- " else:\n",
- " print(f\"✗ {url}: not ready yet\")\n",
- " except requests.exceptions.ConnectionError:\n",
- " print(f\"✗ {url}: not reachable\")\n",
- "\n",
- "if not healthy_workers:\n",
- " raise RuntimeError(\n",
- " \"No healthy workers found. Start them with:\\n\"\n",
- " \" cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\"\n",
- " )\n",
- "\n",
- "print(f\"\\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "search-space-header",
- "metadata": {
- "papermill": {
- "duration": 0.001073,
- "end_time": "2026-01-18T16:25:40.309261",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.308188",
- "status": "completed"
- },
- "tags": []
- },
- "source": [
- "## 3. Search Space"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "search-space",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-01-18T16:25:40.312177Z",
- "iopub.status.busy": "2026-01-18T16:25:40.312107Z",
- "iopub.status.idle": "2026-01-18T16:25:40.314237Z",
- "shell.execute_reply": "2026-01-18T16:25:40.313794Z"
- },
- "papermill": {
- "duration": 0.004476,
- "end_time": "2026-01-18T16:25:40.314804",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.310328",
- "status": "completed"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "search_space = {\n",
- " # Whether to use document image orientation classification\n",
- " \"use_doc_orientation_classify\": tune.choice([True, False]),\n",
- " # Whether to use text image unwarping\n",
- " \"use_doc_unwarping\": tune.choice([True, False]),\n",
- " # Whether to use text line orientation classification\n",
- " \"textline_orientation\": tune.choice([True, False]),\n",
- " # Detection pixel threshold (pixels > threshold are considered text)\n",
- " \"text_det_thresh\": tune.uniform(0.0, 0.7),\n",
- " # Detection box threshold (average score within border)\n",
- " \"text_det_box_thresh\": tune.uniform(0.0, 0.7),\n",
- " # Text detection expansion coefficient\n",
- " \"text_det_unclip_ratio\": tune.choice([0.0]),\n",
- " # Text recognition threshold (filter low confidence results)\n",
- " \"text_rec_score_thresh\": tune.uniform(0.0, 0.7),\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "trainable-header",
- "metadata": {
- "papermill": {
- "duration": 0.001057,
- "end_time": "2026-01-18T16:25:40.316975",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.315918",
- "status": "completed"
- },
- "tags": []
- },
- "source": [
- "## 4. Trainable Function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "trainable",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-01-18T16:25:40.319825Z",
- "iopub.status.busy": "2026-01-18T16:25:40.319771Z",
- "iopub.status.idle": "2026-01-18T16:25:40.322602Z",
- "shell.execute_reply": "2026-01-18T16:25:40.322112Z"
- },
- "papermill": {
- "duration": 0.004907,
- "end_time": "2026-01-18T16:25:40.322948",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.318041",
- "status": "completed"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "def trainable_paddle_ocr(config):\n",
- " \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n",
- " import random\n",
- " import requests\n",
- " from ray import train\n",
- "\n",
- " # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n",
- " WORKER_PORTS = [8001, 8002]\n",
- " api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n",
- "\n",
- " payload = {\n",
- " \"pdf_folder\": \"/app/dataset\",\n",
- " \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n",
- " \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n",
- " \"textline_orientation\": config.get(\"textline_orientation\", True),\n",
- " \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n",
- " \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n",
- " \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n",
- " \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n",
- " \"start_page\": 5,\n",
- " \"end_page\": 10,\n",
- " }\n",
- "\n",
- " try:\n",
- " response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n",
- " response.raise_for_status()\n",
- " metrics = response.json()\n",
- " metrics[\"worker\"] = api_url\n",
- " train.report(metrics)\n",
- " except Exception as e:\n",
- " train.report({\n",
- " \"CER\": 1.0,\n",
- " \"WER\": 1.0,\n",
- " \"TIME\": 0.0,\n",
- " \"PAGES\": 0,\n",
- " \"TIME_PER_PAGE\": 0,\n",
- " \"worker\": api_url,\n",
- " \"ERROR\": str(e)[:500]\n",
- " })"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "tuner-header",
- "metadata": {
- "papermill": {
- "duration": 0.001058,
- "end_time": "2026-01-18T16:25:40.325120",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.324062",
- "status": "completed"
- },
- "tags": []
- },
- "source": [
- "## 5. Run Tuner"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "ray-init",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-01-18T16:25:40.328162Z",
- "iopub.status.busy": "2026-01-18T16:25:40.328055Z",
- "iopub.status.idle": "2026-01-18T16:25:42.985307Z",
- "shell.execute_reply": "2026-01-18T16:25:42.984863Z"
- },
- "papermill": {
- "duration": 2.65986,
- "end_time": "2026-01-18T16:25:42.986041",
- "exception": false,
- "start_time": "2026-01-18T16:25:40.326181",
- "status": "completed"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-01-18 17:25:41,631\tINFO worker.py:2007 -- Started a local Ray instance.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Ray Tune ready (version: 2.53.0)\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py:2046: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "ray.init(ignore_reinit_error=True)\n",
- "print(f\"Ray Tune ready (version: {ray.__version__})\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "tuner",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2026-01-18T16:25:42.998698Z",
- "iopub.status.busy": "2026-01-18T16:25:42.998141Z"
- },
- "papermill": {
- "duration": null,
- "end_time": null,
- "exception": false,
- "start_time": "2026-01-18T16:25:42.987700",
- "status": "running"
- },
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "
\n",
- "
\n",
- "
Tune Status
\n",
- "
\n",
- "\n",
- "| Current time: | 2026-01-18 17:38:46 |
\n",
- "| Running for: | 00:13:03.82 |
\n",
- "| Memory: | 14.3/119.7 GiB |
\n",
- "\n",
- "
\n",
- "
\n",
- "
\n",
- "
\n",
- "
System Info
\n",
- " Using FIFO scheduling algorithm.
Logical resource usage: 2.0/20 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:GB10)\n",
- " \n",
- "
\n",
- "
\n",
- "
Messages
\n",
- " \n",
- " \n",
- " Number of errored trials: 3
\n",
- "\n",
- "| Trial name | # failures | error file |
\n",
- "\n",
- "\n",
- "| trainable_paddle_ocr_36ae4d11 | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_36ae4d11_1_text_det_box_thresh=0.5847,text_det_thresh=0.2571,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-43/error.txt |
\n",
- "| trainable_paddle_ocr_2312d29c | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_2312d29c_2_text_det_box_thresh=0.0312,text_det_thresh=0.0223,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-44/error.txt |
\n",
- "| trainable_paddle_ocr_5b7b8e02 | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_5b7b8e02_3_text_det_box_thresh=0.5954,text_det_thresh=0.0707,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-31-48/error.txt |
\n",
- "\n",
- "
\n",
- "
\n",
- "\n",
- "\n",
- "
\n",
- "
\n",
- "
\n",
- "
Trial Status
\n",
- "
\n",
- "\n",
- "| Trial name | status | loc | text_det_box_thresh | text_det_thresh | text_det_unclip_rati\n",
- "o | text_rec_score_thres\n",
- "h | textline_orientation | use_doc_orientation_\n",
- "classify | use_doc_unwarping |
\n",
- "\n",
- "\n",
- "| trainable_paddle_ocr_b3243c8a | RUNNING | 192.168.65.140:1288101 | 0.360789 | 0.499551 | 0 | 0.115115 | False | True | False |
\n",
- "| trainable_paddle_ocr_7a4a43b0 | PENDING | | 0.0727848 | 0.237729 | 0 | 0.33623 | True | False | True |
\n",
- "| trainable_paddle_ocr_36ae4d11 | ERROR | 192.168.65.140:1282742 | 0.58473 | 0.257102 | 0 | 0.634955 | False | True | False |
\n",
- "| trainable_paddle_ocr_2312d29c | ERROR | 192.168.65.140:1282844 | 0.0311783 | 0.0222724 | 0 | 0.141805 | False | True | False |
\n",
- "| trainable_paddle_ocr_5b7b8e02 | ERROR | 192.168.65.140:1285648 | 0.595412 | 0.0706522 | 0 | 0.132174 | True | False | True |
\n",
- "\n",
- "
\n",
- "
\n",
- "
\n",
- "\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 17:26:10,501 E 1281442 1281442] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[33m(raylet)\u001b[0m [2026-01-18 17:26:11,550 E 1281587 1281587] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[36m(bundle_reservation_check_func pid=1281657)\u001b[0m [2026-01-18 17:26:12,349 E 1281657 1281801] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[2026-01-18 17:26:12,987 E 1281294 1281656] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-01-18 17:31:48,050\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_36ae4d11\n",
- "Traceback (most recent call last):\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n",
- " result = ray.get(future)\n",
- " ^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n",
- " return fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n",
- " return func(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n",
- " values, debugger_breakpoint = worker.get_objects(\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n",
- " raise value.as_instanceof_cause()\n",
- "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282742, ip=192.168.65.140, actor_id=d19d5170bbb9faf9c9fa055f01000000, repr=trainable_paddle_ocr)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n",
- " raise skipped from exception_cause(skipped)\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n",
- " self._ret = self._target(*self._args, **self._kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n",
- " training_func=lambda: self._trainable_func(self.config),\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n",
- " output = fn()\n",
- " ^^^^\n",
- " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n",
- " raise DeprecationWarning(\n",
- "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[36m(trainable_paddle_ocr pid=1285648)\u001b[0m [2026-01-18 17:32:19,397 E 1285648 1285683] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\u001b[32m [repeated 20x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-01-18 17:38:37,341\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_2312d29c\n",
- "Traceback (most recent call last):\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n",
- " result = ray.get(future)\n",
- " ^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n",
- " return fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n",
- " return func(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n",
- " values, debugger_breakpoint = worker.get_objects(\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n",
- " raise value.as_instanceof_cause()\n",
- "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282844, ip=192.168.65.140, actor_id=845cd8594f8ace3d960b90e501000000, repr=trainable_paddle_ocr)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n",
- " raise skipped from exception_cause(skipped)\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n",
- " self._ret = self._target(*self._args, **self._kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n",
- " training_func=lambda: self._trainable_func(self.config),\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n",
- " output = fn()\n",
- " ^^^^\n",
- " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n",
- " raise DeprecationWarning(\n",
- "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2026-01-18 17:38:46,519\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_5b7b8e02\n",
- "Traceback (most recent call last):\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n",
- " result = ray.get(future)\n",
- " ^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n",
- " return fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n",
- " return func(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n",
- " values, debugger_breakpoint = worker.get_objects(\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n",
- " raise value.as_instanceof_cause()\n",
- "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1285648, ip=192.168.65.140, actor_id=b8478e34aea747352febbe0801000000, repr=trainable_paddle_ocr)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n",
- " raise skipped from exception_cause(skipped)\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n",
- " self._ret = self._target(*self._args, **self._kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n",
- " training_func=lambda: self._trainable_func(self.config),\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n",
- " output = fn()\n",
- " ^^^^\n",
- " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n",
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n",
- " raise DeprecationWarning(\n",
- "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n"
- ]
- }
- ],
- "source": [
- "tuner = tune.Tuner(\n",
- " trainable_paddle_ocr,\n",
- " tune_config=tune.TuneConfig(\n",
- " metric=\"CER\",\n",
- " mode=\"min\",\n",
- " search_alg=OptunaSearch(),\n",
- " num_samples=64,\n",
- " max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n",
- " ),\n",
- " param_space=search_space,\n",
- ")\n",
- "\n",
- "results = tuner.fit()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "analysis-header",
- "metadata": {
- "papermill": {
- "duration": null,
- "end_time": null,
- "exception": null,
- "start_time": null,
- "status": "pending"
- },
- "tags": []
- },
- "source": [
- "## 6. Results Analysis"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "results-df",
- "metadata": {
- "papermill": {
- "duration": null,
- "end_time": null,
- "exception": null,
- "start_time": null,
- "status": "pending"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "df = results.get_dataframe()\n",
- "df.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "save-results",
- "metadata": {
- "papermill": {
- "duration": null,
- "end_time": null,
- "exception": null,
- "start_time": null,
- "status": "pending"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "# Save results to CSV\n",
- "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
- "filename = f\"raytune_paddle_rest_results_{timestamp}.csv\"\n",
- "filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
- "\n",
- "df.to_csv(filepath, index=False)\n",
- "print(f\"Results saved: {filepath}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "best-config",
- "metadata": {
- "papermill": {
- "duration": null,
- "end_time": null,
- "exception": null,
- "start_time": null,
- "status": "pending"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "# Best configuration\n",
- "best = df.loc[df[\"CER\"].idxmin()]\n",
- "\n",
- "print(f\"Best CER: {best['CER']:.6f}\")\n",
- "print(f\"Best WER: {best['WER']:.6f}\")\n",
- "print(f\"\\nOptimal Configuration:\")\n",
- "print(f\" textline_orientation: {best['config/textline_orientation']}\")\n",
- "print(f\" use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}\")\n",
- "print(f\" use_doc_unwarping: {best['config/use_doc_unwarping']}\")\n",
- "print(f\" text_det_thresh: {best['config/text_det_thresh']:.4f}\")\n",
- "print(f\" text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}\")\n",
- "print(f\" text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}\")\n",
- "print(f\" text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "correlation",
- "metadata": {
- "papermill": {
- "duration": null,
- "end_time": null,
- "exception": null,
- "start_time": null,
- "status": "pending"
- },
- "tags": []
- },
- "outputs": [],
- "source": [
- "# Correlation analysis\n",
- "param_cols = [\n",
- " \"config/text_det_thresh\",\n",
- " \"config/text_det_box_thresh\",\n",
- " \"config/text_det_unclip_ratio\",\n",
- " \"config/text_rec_score_thresh\",\n",
- "]\n",
- "\n",
- "corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n",
- "corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n",
- "\n",
- "print(\"Correlation with CER:\")\n",
- "print(corr_cer)\n",
- "print(\"\\nCorrelation with WER:\")\n",
- "print(corr_wer)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.3"
- },
- "papermill": {
- "default_parameters": {},
- "duration": null,
- "end_time": null,
- "environment_variables": {},
- "exception": null,
- "input_path": "paddle_ocr_raytune_rest.ipynb",
- "output_path": "output_raytune.ipynb",
- "parameters": {},
- "start_time": "2026-01-18T16:25:37.429790",
- "version": "2.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
\ No newline at end of file
diff --git a/src/paddle_ocr/dataset_manager.py b/src/paddle_ocr/dataset_manager.py
index 2d3ccac..e9ea973 100644
--- a/src/paddle_ocr/dataset_manager.py
+++ b/src/paddle_ocr/dataset_manager.py
@@ -42,4 +42,33 @@ class ImageTextDataset:
with open(txt_path, "r", encoding="utf-8") as f:
text = f.read()
- return image, text
\ No newline at end of file
+ return image, text
+
+ def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"):
+ """Get output path for saving OCR result to debugset folder.
+
+ Args:
+ idx: Sample index
+ output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text')
+ debugset_root: Root folder for debug output (default: /app/debugset)
+
+ Returns:
+ Path like /app/debugset/doc1/{output_subdir}/page_001.txt
+ """
+ img_path, _ = self.samples[idx]
+ # img_path: /app/dataset/doc1/img/page_001.png
+ # Extract relative path: doc1/img/page_001.png
+ parts = img_path.split("/dataset/", 1)
+ if len(parts) == 2:
+ rel_path = parts[1] # doc1/img/page_001.png
+ else:
+ rel_path = os.path.basename(img_path)
+
+ # Replace /img/ with /{output_subdir}/; tolerate paths without an /img/ segment
+ rel_parts = rel_path.rsplit("/img/", 1)
+ doc_folder = rel_parts[0] if len(rel_parts) == 2 else "" # doc1
+ fname = os.path.splitext(rel_parts[-1])[0] + ".txt" # page_001.txt
+
+ out_dir = os.path.join(debugset_root, doc_folder, output_subdir)
+ os.makedirs(out_dir, exist_ok=True)
+ return os.path.join(out_dir, fname)
\ No newline at end of file
diff --git a/src/paddle_ocr/docker-compose.cpu-registry.yml b/src/paddle_ocr/docker-compose.cpu-registry.yml
index 1d9246f..550ecd3 100644
--- a/src/paddle_ocr/docker-compose.cpu-registry.yml
+++ b/src/paddle_ocr/docker-compose.cpu-registry.yml
@@ -9,6 +9,7 @@ services:
- "8001:8000"
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
diff --git a/src/paddle_ocr/docker-compose.gpu-registry.yml b/src/paddle_ocr/docker-compose.gpu-registry.yml
index 6e606c2..bd9b991 100644
--- a/src/paddle_ocr/docker-compose.gpu-registry.yml
+++ b/src/paddle_ocr/docker-compose.gpu-registry.yml
@@ -11,6 +11,7 @@ services:
- "8002:8000"
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
- ./scripts:/app/scripts:ro
environment:
diff --git a/src/paddle_ocr/docker-compose.workers.yml b/src/paddle_ocr/docker-compose.workers.yml
index 222ea82..cada286 100644
--- a/src/paddle_ocr/docker-compose.workers.yml
+++ b/src/paddle_ocr/docker-compose.workers.yml
@@ -16,6 +16,7 @@ x-ocr-gpu-common: &ocr-gpu-common
image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
@@ -39,6 +40,7 @@ x-ocr-cpu-common: &ocr-cpu-common
image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
diff --git a/src/paddle_ocr/docker-compose.yml b/src/paddle_ocr/docker-compose.yml
index 22c887b..5641717 100644
--- a/src/paddle_ocr/docker-compose.yml
+++ b/src/paddle_ocr/docker-compose.yml
@@ -45,7 +45,8 @@ services:
ports:
- "8000:8000"
volumes:
- - ../dataset:/app/dataset:ro # Your dataset
+ - ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw # Debug output (saved OCR predictions)
- paddlex-cache:/root/.paddlex # For additional models at runtime
environment:
- PYTHONUNBUFFERED=1
@@ -74,6 +75,7 @@ services:
- "8000:8000"
volumes:
- ../dataset:/app/dataset:ro
+ - ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
diff --git a/src/paddle_ocr/paddle_ocr_tuning_rest.py b/src/paddle_ocr/paddle_ocr_tuning_rest.py
index 6e836c6..b61ff0e 100644
--- a/src/paddle_ocr/paddle_ocr_tuning_rest.py
+++ b/src/paddle_ocr/paddle_ocr_tuning_rest.py
@@ -127,6 +127,7 @@ class EvaluateRequest(BaseModel):
text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
end_page: int = Field(10, ge=1, description="End page index (exclusive)")
+ save_output: bool = Field(False, description="Save OCR predictions to debugset folder")
class EvaluateResponse(BaseModel):
@@ -307,6 +308,12 @@ def evaluate(request: EvaluateRequest):
pred = assemble_from_paddle_result(out)
time_per_page_list.append(float(time.time() - tp0))
+ # Save prediction to debugset if requested
+ if request.save_output:
+ out_path = state.dataset.get_output_path(idx, "paddle_text")
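+ # e.g. /app/debugset/doc1/paddle_text/page_001.txt (get_output_path creates the directory)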
+ with open(out_path, "w", encoding="utf-8") as f:
+ f.write(pred)
+
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
diff --git a/src/paddle_ocr_raytune_rest.ipynb b/src/paddle_ocr_raytune_rest.ipynb
index 44710b9..f2fe22c 100644
--- a/src/paddle_ocr_raytune_rest.ipynb
+++ b/src/paddle_ocr_raytune_rest.ipynb
@@ -7,263 +7,81 @@
"source": [
"# PaddleOCR Hyperparameter Optimization via REST API\n",
"\n",
- "This notebook runs Ray Tune hyperparameter search calling the PaddleOCR REST API (Docker container).\n",
+ "Uses Ray Tune + Optuna to find optimal PaddleOCR parameters.\n",
"\n",
- "**Benefits:**\n",
- "- No model reload per trial - Model stays loaded in Docker container\n",
- "- Faster trials - Skip ~10s model load time per trial\n",
- "- Cleaner code - REST API replaces subprocess + CLI arg parsing"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "prereq",
- "metadata": {},
- "source": [
"## Prerequisites\n",
"\n",
- "Start 2 PaddleOCR workers for parallel hyperparameter tuning:\n",
- "\n",
"```bash\n",
"cd src/paddle_ocr\n",
- "docker compose -f docker-compose.workers.yml up\n",
- "```\n",
- "\n",
- "This starts 2 GPU workers on ports 8001-8002, allowing 2 concurrent trials.\n",
- "\n",
- "For CPU-only systems:\n",
- "```bash\n",
- "docker compose -f docker-compose.workers.yml --profile cpu up\n",
+ "docker compose -f docker-compose.workers.yml up # GPU workers on 8001-8002\n",
+ "# or: docker compose -f docker-compose.workers.yml --profile cpu up\n",
"```"
]
},
{
- "cell_type": "markdown",
- "id": "3ob9fsoilc4",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "deps",
"metadata": {},
+ "outputs": [],
"source": [
- "## 0. Dependencies"
+ "%pip install -q -U \"ray[tune]\" optuna requests pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "wyr2nsoj7",
+ "id": "setup",
"metadata": {},
"outputs": [],
"source": [
- "# Install dependencies (run once)\n",
- "%pip install -U \"ray[tune]\"\n",
- "%pip install optuna\n",
- "%pip install requests pandas"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "imports-header",
- "metadata": {},
- "source": [
- "## 1. Imports & Setup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "imports",
- "metadata": {},
- "outputs": [],
- "source": "import os\nfrom datetime import datetime\n\nimport requests\nimport pandas as pd\n\nimport ray\nfrom ray import tune, train\nfrom ray.tune.search.optuna import OptunaSearch"
- },
- {
- "cell_type": "markdown",
- "id": "config-header",
- "metadata": {},
- "source": [
- "## 2. API Configuration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "config",
- "metadata": {},
- "outputs": [],
- "source": [
- "# PaddleOCR REST API endpoints - 2 workers for parallel trials\n",
- "# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\n",
- "WORKER_PORTS = [8001, 8002]\n",
- "WORKER_URLS = [f\"http://localhost:{port}\" for port in WORKER_PORTS]\n",
+ "from raytune_ocr import (\n",
+ " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n",
+ " paddle_ocr_payload, PADDLE_OCR_SEARCH_SPACE, PADDLE_OCR_CONFIG_KEYS,\n",
+ ")\n",
"\n",
- "# Output folder for results\n",
- "OUTPUT_FOLDER = \"results\"\n",
- "os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n",
+ "# Worker ports\n",
+ "PORTS = [8001, 8002]\n",
"\n",
- "# Number of concurrent trials = number of workers\n",
- "NUM_WORKERS = len(WORKER_URLS)"
+ "# Check workers are running\n",
+ "healthy = check_workers(PORTS, \"PaddleOCR\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "health-check",
+ "id": "tune",
"metadata": {},
"outputs": [],
"source": [
- "# Verify all workers are running\n",
- "healthy_workers = []\n",
- "for url in WORKER_URLS:\n",
- " try:\n",
- " health = requests.get(f\"{url}/health\", timeout=10).json()\n",
- " if health['status'] == 'ok' and health['model_loaded']:\n",
- " healthy_workers.append(url)\n",
- " print(f\"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})\")\n",
- " else:\n",
- " print(f\"✗ {url}: not ready yet\")\n",
- " except requests.exceptions.ConnectionError:\n",
- " print(f\"✗ {url}: not reachable\")\n",
+ "# Create trainable and run tuning\n",
+ "trainable = create_trainable(PORTS, paddle_ocr_payload)\n",
"\n",
- "if not healthy_workers:\n",
- " raise RuntimeError(\n",
- " \"No healthy workers found. Start them with:\\n\"\n",
- " \" cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\"\n",
- " )\n",
+ "results = run_tuner(\n",
+ " trainable=trainable,\n",
+ " search_space=PADDLE_OCR_SEARCH_SPACE,\n",
+ " num_samples=64,\n",
+ " num_workers=len(healthy),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "analysis",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Analyze results\n",
+ "df = analyze_results(\n",
+ " results,\n",
+ " prefix=\"raytune_paddle\",\n",
+ " config_keys=PADDLE_OCR_CONFIG_KEYS,\n",
+ ")\n",
"\n",
- "print(f\"\\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "search-space-header",
- "metadata": {},
- "source": [
- "## 3. Search Space"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "search-space",
- "metadata": {},
- "outputs": [],
- "source": [
- "search_space = {\n",
- " # Whether to use document image orientation classification\n",
- " \"use_doc_orientation_classify\": tune.choice([True, False]),\n",
- " # Whether to use text image unwarping\n",
- " \"use_doc_unwarping\": tune.choice([True, False]),\n",
- " # Whether to use text line orientation classification\n",
- " \"textline_orientation\": tune.choice([True, False]),\n",
- " # Detection pixel threshold (pixels > threshold are considered text)\n",
- " \"text_det_thresh\": tune.uniform(0.0, 0.7),\n",
- " # Detection box threshold (average score within border)\n",
- " \"text_det_box_thresh\": tune.uniform(0.0, 0.7),\n",
- " # Text detection expansion coefficient\n",
- " \"text_det_unclip_ratio\": tune.choice([0.0]),\n",
- " # Text recognition threshold (filter low confidence results)\n",
- " \"text_rec_score_thresh\": tune.uniform(0.0, 0.7),\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "trainable-header",
- "metadata": {},
- "source": [
- "## 4. Trainable Function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "trainable",
- "metadata": {},
- "outputs": [],
- "source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import train\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n train.report(metrics)\n except Exception as e:\n train.report({\n \"CER\": 1.0,\n \"WER\": 1.0,\n \"TIME\": 0.0,\n \"PAGES\": 0,\n \"TIME_PER_PAGE\": 0,\n \"worker\": api_url,\n \"ERROR\": str(e)[:500]\n })"
- },
- {
- "cell_type": "markdown",
- "id": "tuner-header",
- "metadata": {},
- "source": [
- "## 5. Run Tuner"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ray-init",
- "metadata": {},
- "outputs": [],
- "source": [
- "ray.init(ignore_reinit_error=True)\n",
- "print(f\"Ray Tune ready (version: {ray.__version__})\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "tuner",
- "metadata": {},
- "outputs": [],
- "source": "tuner = tune.Tuner(\n trainable_paddle_ocr,\n tune_config=tune.TuneConfig(\n metric=\"CER\",\n mode=\"min\",\n search_alg=OptunaSearch(),\n num_samples=64,\n max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n ),\n param_space=search_space,\n)\n\nresults = tuner.fit()"
- },
- {
- "cell_type": "markdown",
- "id": "analysis-header",
- "metadata": {},
- "source": [
- "## 6. Results Analysis"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "results-df",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = results.get_dataframe()\n",
"df.describe()"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "save-results",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Save results to CSV\n",
- "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
- "filename = f\"raytune_paddle_rest_results_{timestamp}.csv\"\n",
- "filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
- "\n",
- "df.to_csv(filepath, index=False)\n",
- "print(f\"Results saved: {filepath}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "best-config",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Best configuration\n",
- "best = df.loc[df[\"CER\"].idxmin()]\n",
- "\n",
- "print(f\"Best CER: {best['CER']:.6f}\")\n",
- "print(f\"Best WER: {best['WER']:.6f}\")\n",
- "print(f\"\\nOptimal Configuration:\")\n",
- "print(f\" textline_orientation: {best['config/textline_orientation']}\")\n",
- "print(f\" use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}\")\n",
- "print(f\" use_doc_unwarping: {best['config/use_doc_unwarping']}\")\n",
- "print(f\" text_det_thresh: {best['config/text_det_thresh']:.4f}\")\n",
- "print(f\" text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}\")\n",
- "print(f\" text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}\")\n",
- "print(f\" text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}\")"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -272,42 +90,21 @@
"outputs": [],
"source": [
"# Correlation analysis\n",
- "param_cols = [\n",
- " \"config/text_det_thresh\",\n",
- " \"config/text_det_box_thresh\",\n",
- " \"config/text_det_unclip_ratio\",\n",
- " \"config/text_rec_score_thresh\",\n",
- "]\n",
- "\n",
- "corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n",
- "corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n",
- "\n",
- "print(\"Correlation with CER:\")\n",
- "print(corr_cer)\n",
- "print(\"\\nCorrelation with WER:\")\n",
- "print(corr_wer)"
+ "correlation_analysis(df, PADDLE_OCR_CONFIG_KEYS)"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": ".venv",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
"name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.3"
+ "version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/src/raytune_ocr.py b/src/raytune_ocr.py
new file mode 100644
index 0000000..c1f53c7
--- /dev/null
+++ b/src/raytune_ocr.py
@@ -0,0 +1,333 @@
+# raytune_ocr.py
+# Shared Ray Tune utilities for OCR hyperparameter optimization
+#
+# Usage:
+# from raytune_ocr import check_workers, create_trainable, run_tuner, analyze_results
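+#
+# End-to-end sketch (assumes PaddleOCR workers on ports 8001-8002, started
+# with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up):
+#
+# healthy = check_workers([8001, 8002], "PaddleOCR")
+# trainable = create_trainable([8001, 8002], paddle_ocr_payload)
+# results = run_tuner(trainable, PADDLE_OCR_SEARCH_SPACE,
+# num_samples=64, num_workers=len(healthy))
+# df = analyze_results(results, prefix="raytune_paddle",
+# config_keys=PADDLE_OCR_CONFIG_KEYS)
+# correlation_analysis(df, PADDLE_OCR_CONFIG_KEYS)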
+
+import os
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional
+
+import requests
+import pandas as pd
+
+import ray
+from ray import tune, train
+from ray.tune.search.optuna import OptunaSearch
+
+
+def check_workers(ports: List[int], service_name: str = "OCR") -> List[str]:
+ """
+ Verify workers are running and return healthy URLs.
+
+ Args:
+ ports: List of port numbers to check
+ service_name: Name for error messages
+
+ Returns:
+ List of healthy worker URLs
+
+ Raises:
+ RuntimeError if no healthy workers found
+ """
+ worker_urls = [f"http://localhost:{port}" for port in ports]
+ healthy_workers = []
+
+ for url in worker_urls:
+ try:
+ health = requests.get(f"{url}/health", timeout=10).json()
+ if health.get('status') == 'ok' and health.get('model_loaded'):
+ healthy_workers.append(url)
+ gpu = health.get('gpu_name', 'CPU')
+ print(f"✓ {url}: {health['status']} ({gpu})")
+ else:
+ print(f"✗ {url}: not ready yet")
+ except requests.exceptions.ConnectionError:
+ print(f"✗ {url}: not reachable")
+
+ if not healthy_workers:
+ raise RuntimeError(
+ f"No healthy {service_name} workers found.\n"
+ f"Checked ports: {ports}"
+ )
+
+ print(f"\n{len(healthy_workers)}/{len(worker_urls)} workers ready")
+ return healthy_workers
+
+
+def create_trainable(ports: List[int], payload_fn: Callable[[Dict], Dict]) -> Callable:
+ """
+ Factory to create a trainable function for Ray Tune.
+
+ Args:
+ ports: List of worker ports for load balancing
+ payload_fn: Function that takes config dict and returns API payload dict
+
+ Returns:
+ Trainable function for Ray Tune
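+
+ Example (sketch; assumes a worker on port 8001):
+ trainable = create_trainable([8001], paddle_ocr_payload)
+ Ray Tune then calls trainable(config) once per trial.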
+ """
+ def trainable(config):
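+ # Self-contained imports: Ray pickles this closure and runs it in
+ # separate worker processes.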
+ import random
+ import requests
+ from ray import train
+
+ api_url = f"http://localhost:{random.choice(ports)}"
+ payload = payload_fn(config)
+
+ try:
+ response = requests.post(f"{api_url}/evaluate", json=payload, timeout=None)
+ response.raise_for_status()
+ metrics = response.json()
+ metrics["worker"] = api_url
+ train.report(metrics)
+ except Exception as e:
+ train.report({
+ "CER": 1.0,
+ "WER": 1.0,
+ "TIME": 0.0,
+ "PAGES": 0,
+ "TIME_PER_PAGE": 0,
+ "worker": api_url,
+ "ERROR": str(e)[:500]
+ })
+
+ return trainable
+
+
+def run_tuner(
+ trainable: Callable,
+ search_space: Dict[str, Any],
+ num_samples: int = 64,
+ num_workers: int = 1,
+ metric: str = "CER",
+ mode: str = "min",
+) -> tune.ResultGrid:
+ """
+ Initialize Ray and run hyperparameter tuning.
+
+ Args:
+ trainable: Trainable function from create_trainable()
+ search_space: Dict of parameter names to tune.* search spaces
+ num_samples: Number of trials to run
+ num_workers: Max concurrent trials
+ metric: Metric to optimize
+ mode: "min" or "max"
+
+ Returns:
+ Ray Tune ResultGrid
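+
+ Example (sketch, continuing from create_trainable above):
+ results = run_tuner(trainable, PADDLE_OCR_SEARCH_SPACE,
+ num_samples=16, num_workers=len(healthy))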
+ """
+ ray.init(ignore_reinit_error=True, include_dashboard=False)
+ print(f"Ray Tune ready (version: {ray.__version__})")
+
+ tuner = tune.Tuner(
+ trainable,
+ tune_config=tune.TuneConfig(
+ metric=metric,
+ mode=mode,
+ search_alg=OptunaSearch(),
+ num_samples=num_samples,
+ max_concurrent_trials=num_workers,
+ ),
+ param_space=search_space,
+ )
+
+ return tuner.fit()
+
+
+def analyze_results(
+ results: tune.ResultGrid,
+ output_folder: str = "results",
+ prefix: str = "raytune",
+ config_keys: Optional[List[str]] = None,
+) -> pd.DataFrame:
+ """
+ Analyze and save tuning results.
+
+ Args:
+ results: Ray Tune ResultGrid
+ output_folder: Directory to save CSV
+ prefix: Filename prefix
+ config_keys: List of config keys to show in best result (without 'config/' prefix)
+
+ Returns:
+ Results DataFrame
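+
+ Example (sketch):
+ df = analyze_results(results, prefix="raytune_paddle",
+ config_keys=PADDLE_OCR_CONFIG_KEYS)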
+ """
+ os.makedirs(output_folder, exist_ok=True)
+ df = results.get_dataframe()
+
+ # Save to CSV
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ filename = f"{prefix}_results_{timestamp}.csv"
+ filepath = os.path.join(output_folder, filename)
+ df.to_csv(filepath, index=False)
+ print(f"Results saved: {filepath}")
+
+ # Best configuration
+ best = df.loc[df["CER"].idxmin()]
+ print(f"\nBest CER: {best['CER']:.6f}")
+ print(f"Best WER: {best['WER']:.6f}")
+
+ if config_keys:
+ print("\nOptimal Configuration:")
+ for key in config_keys:
+ col = f"config/{key}"
+ if col in best:
+ val = best[col]
+ if isinstance(val, float):
+ print(f" {key}: {val:.4f}")
+ else:
+ print(f" {key}: {val}")
+
+ return df
+
+
+def correlation_analysis(df: pd.DataFrame, param_keys: List[str]) -> None:
+ """
+ Print correlation of numeric parameters with CER/WER.
+
+ Args:
+ df: Results DataFrame
+ param_keys: List of config keys (without 'config/' prefix)
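+
+ Example (sketch; only int/float parameters are correlated):
+ correlation_analysis(df, PADDLE_OCR_CONFIG_KEYS)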
+ """
+ param_cols = [f"config/{k}" for k in param_keys if f"config/{k}" in df.columns]
+ numeric_cols = [c for c in param_cols if df[c].dtype.kind in "if"] # int/float params; bools and choices are skipped
+
+ if not numeric_cols:
+ print("No numeric parameters for correlation analysis")
+ return
+
+ corr_cer = df[numeric_cols + ["CER"]].corr()["CER"].sort_values(ascending=False)
+ corr_wer = df[numeric_cols + ["WER"]].corr()["WER"].sort_values(ascending=False)
+
+ print("Correlation with CER:")
+ print(corr_cer)
+ print("\nCorrelation with WER:")
+ print(corr_wer)
+
+
+# =============================================================================
+# OCR-specific payload functions
+# =============================================================================
+
+def paddle_ocr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict:
+ """Create payload for PaddleOCR API."""
+ return {
+ "pdf_folder": "/app/dataset",
+ "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
+ "use_doc_unwarping": config.get("use_doc_unwarping", False),
+ "textline_orientation": config.get("textline_orientation", True),
+ "text_det_thresh": config.get("text_det_thresh", 0.0),
+ "text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
+ "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
+ "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
+ "start_page": start_page,
+ "end_page": end_page,
+ "save_output": save_output,
+ }
+
+
+def doctr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict:
+ """Create payload for DocTR API."""
+ return {
+ "pdf_folder": "/app/dataset",
+ "assume_straight_pages": config.get("assume_straight_pages", True),
+ "straighten_pages": config.get("straighten_pages", False),
+ "preserve_aspect_ratio": config.get("preserve_aspect_ratio", True),
+ "symmetric_pad": config.get("symmetric_pad", True),
+ "disable_page_orientation": config.get("disable_page_orientation", False),
+ "disable_crop_orientation": config.get("disable_crop_orientation", False),
+ "resolve_lines": config.get("resolve_lines", True),
+ "resolve_blocks": config.get("resolve_blocks", False),
+ "paragraph_break": config.get("paragraph_break", 0.035),
+ "start_page": start_page,
+ "end_page": end_page,
+ "save_output": save_output,
+ }
+
+
+def easyocr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict:
+ """Create payload for EasyOCR API."""
+ return {
+ "pdf_folder": "/app/dataset",
+ "text_threshold": config.get("text_threshold", 0.7),
+ "low_text": config.get("low_text", 0.4),
+ "link_threshold": config.get("link_threshold", 0.4),
+ "slope_ths": config.get("slope_ths", 0.1),
+ "ycenter_ths": config.get("ycenter_ths", 0.5),
+ "height_ths": config.get("height_ths", 0.5),
+ "width_ths": config.get("width_ths", 0.5),
+ "add_margin": config.get("add_margin", 0.1),
+ "contrast_ths": config.get("contrast_ths", 0.1),
+ "adjust_contrast": config.get("adjust_contrast", 0.5),
+ "decoder": config.get("decoder", "greedy"),
+ "beamWidth": config.get("beamWidth", 5),
+ "min_size": config.get("min_size", 10),
+ "start_page": start_page,
+ "end_page": end_page,
+ "save_output": save_output,
+ }
+
+
+# =============================================================================
+# Search spaces
+# =============================================================================
+
+PADDLE_OCR_SEARCH_SPACE = {
+ "use_doc_orientation_classify": tune.choice([True, False]),
+ "use_doc_unwarping": tune.choice([True, False]),
+ "textline_orientation": tune.choice([True, False]),
+ "text_det_thresh": tune.uniform(0.0, 0.7),
+ "text_det_box_thresh": tune.uniform(0.0, 0.7),
+ "text_det_unclip_ratio": tune.choice([0.0]), # pinned to 0.0 here; the API default is 1.5
+ "text_rec_score_thresh": tune.uniform(0.0, 0.7),
+}
+
+DOCTR_SEARCH_SPACE = {
+ "assume_straight_pages": tune.choice([True, False]),
+ "straighten_pages": tune.choice([True, False]),
+ "preserve_aspect_ratio": tune.choice([True, False]),
+ "symmetric_pad": tune.choice([True, False]),
+ "disable_page_orientation": tune.choice([True, False]),
+ "disable_crop_orientation": tune.choice([True, False]),
+ "resolve_lines": tune.choice([True, False]),
+ "resolve_blocks": tune.choice([True, False]),
+ "paragraph_break": tune.uniform(0.01, 0.1),
+}
+
+EASYOCR_SEARCH_SPACE = {
+ "text_threshold": tune.uniform(0.3, 0.9),
+ "low_text": tune.uniform(0.2, 0.6),
+ "link_threshold": tune.uniform(0.2, 0.6),
+ "slope_ths": tune.uniform(0.0, 0.3),
+ "ycenter_ths": tune.uniform(0.3, 1.0),
+ "height_ths": tune.uniform(0.3, 1.0),
+ "width_ths": tune.uniform(0.3, 1.0),
+ "add_margin": tune.uniform(0.0, 0.3),
+ "contrast_ths": tune.uniform(0.05, 0.3),
+ "adjust_contrast": tune.uniform(0.3, 0.8),
+ "decoder": tune.choice(["greedy", "beamsearch"]),
+ "beamWidth": tune.choice([3, 5, 7, 10]),
+ "min_size": tune.choice([5, 10, 15, 20]),
+}
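+
+# The search spaces are plain dicts, so a notebook can override an entry before
+# calling run_tuner (illustrative):
+# space = {**PADDLE_OCR_SEARCH_SPACE, "text_det_unclip_ratio": tune.uniform(1.0, 2.0)}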
+
+
+# =============================================================================
+# Config keys for results display
+# =============================================================================
+
+PADDLE_OCR_CONFIG_KEYS = [
+ "use_doc_orientation_classify", "use_doc_unwarping", "textline_orientation",
+ "text_det_thresh", "text_det_box_thresh", "text_det_unclip_ratio", "text_rec_score_thresh",
+]
+
+DOCTR_CONFIG_KEYS = [
+ "assume_straight_pages", "straighten_pages", "preserve_aspect_ratio", "symmetric_pad",
+ "disable_page_orientation", "disable_crop_orientation", "resolve_lines", "resolve_blocks",
+ "paragraph_break",
+]
+
+EASYOCR_CONFIG_KEYS = [
+ "text_threshold", "low_text", "link_threshold", "slope_ths", "ycenter_ths",
+ "height_ths", "width_ths", "add_margin", "contrast_ths", "adjust_contrast",
+ "decoder", "beamWidth", "min_size",
+]