From 68efb27a1efe7ff2ced6fce16ce316f8ffc4abef Mon Sep 17 00:00:00 2001 From: Sergio Jimenez Jimenez Date: Sun, 18 Jan 2026 18:03:23 +0100 Subject: [PATCH] debug set and locking --- .gitignore | 2 + src/dataset_manager.py | 31 +- src/doctr_raytune_rest.ipynb | 111 ++ src/doctr_service/dataset_manager.py | 31 +- src/doctr_service/docker-compose.yml | 2 + src/doctr_service/doctr_tuning_rest.py | 7 + src/easyocr_raytune_rest.ipynb | 111 ++ src/easyocr_service/dataset_manager.py | 31 +- src/easyocr_service/docker-compose.yml | 2 + src/easyocr_service/easyocr_tuning_rest.py | 7 + src/output_raytune.ipynb | 1037 ----------------- src/paddle_ocr/dataset_manager.py | 31 +- .../docker-compose.cpu-registry.yml | 1 + .../docker-compose.gpu-registry.yml | 1 + src/paddle_ocr/docker-compose.workers.yml | 2 + src/paddle_ocr/docker-compose.yml | 4 +- src/paddle_ocr/paddle_ocr_tuning_rest.py | 7 + src/paddle_ocr_raytune_rest.ipynb | 293 +---- src/raytune_ocr.py | 333 ++++++ 19 files changed, 754 insertions(+), 1290 deletions(-) create mode 100644 src/doctr_raytune_rest.ipynb create mode 100644 src/easyocr_raytune_rest.ipynb delete mode 100644 src/output_raytune.ipynb create mode 100644 src/raytune_ocr.py diff --git a/.gitignore b/.gitignore index 0098713..1eb7d2f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ results node_modules src/paddle_ocr/wheels src/*.log +src/output_*.ipynb +debugset/ diff --git a/src/dataset_manager.py b/src/dataset_manager.py index 2d3ccac..e9ea973 100644 --- a/src/dataset_manager.py +++ b/src/dataset_manager.py @@ -42,4 +42,33 @@ class ImageTextDataset: with open(txt_path, "r", encoding="utf-8") as f: text = f.read() - return image, text \ No newline at end of file + return image, text + + def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"): + """Get output path for saving OCR result to debugset folder. + + Args: + idx: Sample index + output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text') + debugset_root: Root folder for debug output (default: /app/debugset) + + Returns: + Path like /app/debugset/doc1/{output_subdir}/page_001.txt + """ + img_path, _ = self.samples[idx] + # img_path: /app/dataset/doc1/img/page_001.png + # Extract relative path: doc1/img/page_001.png + parts = img_path.split("/dataset/", 1) + if len(parts) == 2: + rel_path = parts[1] # doc1/img/page_001.png + else: + rel_path = os.path.basename(img_path) + + # Replace /img/ with /{output_subdir}/ + rel_parts = rel_path.rsplit("/img/", 1) + doc_folder = rel_parts[0] # doc1 + fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt + + out_dir = os.path.join(debugset_root, doc_folder, output_subdir) + os.makedirs(out_dir, exist_ok=True) + return os.path.join(out_dir, fname) \ No newline at end of file diff --git a/src/doctr_raytune_rest.ipynb b/src/doctr_raytune_rest.ipynb new file mode 100644 index 0000000..aafd28f --- /dev/null +++ b/src/doctr_raytune_rest.ipynb @@ -0,0 +1,111 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "header", + "metadata": {}, + "source": [ + "# DocTR Hyperparameter Optimization via REST API\n", + "\n", + "Uses Ray Tune + Optuna to find optimal DocTR parameters.\n", + "\n", + "## Prerequisites\n", + "\n", + "```bash\n", + "cd src/doctr_service\n", + "docker compose up ocr-cpu # or ocr-gpu\n", + "```\n", + "\n", + "Service runs on port 8003." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "deps", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q -U \"ray[tune]\" optuna requests pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "from raytune_ocr import (\n", + " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n", + " doctr_payload, DOCTR_SEARCH_SPACE, DOCTR_CONFIG_KEYS,\n", + ")\n", + "\n", + "# Worker ports\n", + "PORTS = [8003]\n", + "\n", + "# Check workers are running\n", + "healthy = check_workers(PORTS, \"DocTR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tune", + "metadata": {}, + "outputs": [], + "source": [ + "# Create trainable and run tuning\n", + "trainable = create_trainable(PORTS, doctr_payload)\n", + "\n", + "results = run_tuner(\n", + " trainable=trainable,\n", + " search_space=DOCTR_SEARCH_SPACE,\n", + " num_samples=64,\n", + " num_workers=len(healthy),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "analysis", + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze results\n", + "df = analyze_results(\n", + " results,\n", + " prefix=\"raytune_doctr\",\n", + " config_keys=DOCTR_CONFIG_KEYS,\n", + ")\n", + "\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "correlation", + "metadata": {}, + "outputs": [], + "source": [ + "# Correlation analysis\n", + "correlation_analysis(df, DOCTR_CONFIG_KEYS)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/doctr_service/dataset_manager.py b/src/doctr_service/dataset_manager.py index 2d3ccac..e9ea973 100644 --- a/src/doctr_service/dataset_manager.py +++ b/src/doctr_service/dataset_manager.py @@ -42,4 +42,33 @@ class ImageTextDataset: with open(txt_path, "r", encoding="utf-8") as f: text = f.read() - return image, text \ No newline at end of file + return image, text + + def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"): + """Get output path for saving OCR result to debugset folder. 
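+
+        Example (hypothetical instance ``ds`` whose sample 0 is
+        /app/dataset/doc1/img/page_001.png):
+
+            >>> ds.get_output_path(0, "doctr_text")
+            '/app/debugset/doc1/doctr_text/page_001.txt'
+
+        Note: assumes img_path contains an "/img/" segment; paths
+        without one make rel_parts[1] below raise IndexError.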
+ + Args: + idx: Sample index + output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text') + debugset_root: Root folder for debug output (default: /app/debugset) + + Returns: + Path like /app/debugset/doc1/{output_subdir}/page_001.txt + """ + img_path, _ = self.samples[idx] + # img_path: /app/dataset/doc1/img/page_001.png + # Extract relative path: doc1/img/page_001.png + parts = img_path.split("/dataset/", 1) + if len(parts) == 2: + rel_path = parts[1] # doc1/img/page_001.png + else: + rel_path = os.path.basename(img_path) + + # Replace /img/ with /{output_subdir}/ + rel_parts = rel_path.rsplit("/img/", 1) + doc_folder = rel_parts[0] # doc1 + fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt + + out_dir = os.path.join(debugset_root, doc_folder, output_subdir) + os.makedirs(out_dir, exist_ok=True) + return os.path.join(out_dir, fname) \ No newline at end of file diff --git a/src/doctr_service/docker-compose.yml b/src/doctr_service/docker-compose.yml index 710f72b..f16c931 100644 --- a/src/doctr_service/docker-compose.yml +++ b/src/doctr_service/docker-compose.yml @@ -14,6 +14,7 @@ services: - "8003:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - doctr-cache:/root/.cache/doctr environment: - PYTHONUNBUFFERED=1 @@ -35,6 +36,7 @@ services: - "8003:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - doctr-cache:/root/.cache/doctr environment: - PYTHONUNBUFFERED=1 diff --git a/src/doctr_service/doctr_tuning_rest.py b/src/doctr_service/doctr_tuning_rest.py index 4ef3928..9385f43 100644 --- a/src/doctr_service/doctr_tuning_rest.py +++ b/src/doctr_service/doctr_tuning_rest.py @@ -169,6 +169,7 @@ class EvaluateRequest(BaseModel): # Page range start_page: int = Field(5, ge=0, description="Start page index (inclusive)") end_page: int = Field(10, ge=1, description="End page index (exclusive)") + save_output: bool = Field(False, description="Save OCR predictions to debugset folder") class EvaluateResponse(BaseModel): @@ -302,6 +303,12 @@ def evaluate(request: EvaluateRequest): ) time_per_page_list.append(float(time.time() - tp0)) + # Save prediction to debugset if requested + if request.save_output: + out_path = state.dataset.get_output_path(idx, "doctr_text") + with open(out_path, "w", encoding="utf-8") as f: + f.write(pred) + m = evaluate_text(ref, pred) cer_list.append(m["CER"]) wer_list.append(m["WER"]) diff --git a/src/easyocr_raytune_rest.ipynb b/src/easyocr_raytune_rest.ipynb new file mode 100644 index 0000000..723f97f --- /dev/null +++ b/src/easyocr_raytune_rest.ipynb @@ -0,0 +1,111 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "header", + "metadata": {}, + "source": [ + "# EasyOCR Hyperparameter Optimization via REST API\n", + "\n", + "Uses Ray Tune + Optuna to find optimal EasyOCR parameters.\n", + "\n", + "## Prerequisites\n", + "\n", + "```bash\n", + "cd src/easyocr_service\n", + "docker compose up ocr-cpu # or ocr-gpu\n", + "```\n", + "\n", + "Service runs on port 8002." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "deps", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q -U \"ray[tune]\" optuna requests pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup", + "metadata": {}, + "outputs": [], + "source": [ + "from raytune_ocr import (\n", + " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n", + " easyocr_payload, EASYOCR_SEARCH_SPACE, EASYOCR_CONFIG_KEYS,\n", + ")\n", + "\n", + "# Worker ports\n", + "PORTS = [8002]\n", + "\n", + "# Check workers are running\n", + "healthy = check_workers(PORTS, \"EasyOCR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tune", + "metadata": {}, + "outputs": [], + "source": [ + "# Create trainable and run tuning\n", + "trainable = create_trainable(PORTS, easyocr_payload)\n", + "\n", + "results = run_tuner(\n", + " trainable=trainable,\n", + " search_space=EASYOCR_SEARCH_SPACE,\n", + " num_samples=64,\n", + " num_workers=len(healthy),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "analysis", + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze results\n", + "df = analyze_results(\n", + " results,\n", + " prefix=\"raytune_easyocr\",\n", + " config_keys=EASYOCR_CONFIG_KEYS,\n", + ")\n", + "\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "correlation", + "metadata": {}, + "outputs": [], + "source": [ + "# Correlation analysis\n", + "correlation_analysis(df, EASYOCR_CONFIG_KEYS)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/easyocr_service/dataset_manager.py b/src/easyocr_service/dataset_manager.py index 2d3ccac..e9ea973 100644 --- a/src/easyocr_service/dataset_manager.py +++ b/src/easyocr_service/dataset_manager.py @@ -42,4 +42,33 @@ class ImageTextDataset: with open(txt_path, "r", encoding="utf-8") as f: text = f.read() - return image, text \ No newline at end of file + return image, text + + def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"): + """Get output path for saving OCR result to debugset folder. 
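+
+        Example (hypothetical instance ``ds`` whose sample 0 is
+        /app/dataset/doc1/img/page_001.png):
+
+            >>> ds.get_output_path(0, "easyocr_text")
+            '/app/debugset/doc1/easyocr_text/page_001.txt'
+
+        Note: assumes img_path contains an "/img/" segment; paths
+        without one make rel_parts[1] below raise IndexError.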
+ + Args: + idx: Sample index + output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text') + debugset_root: Root folder for debug output (default: /app/debugset) + + Returns: + Path like /app/debugset/doc1/{output_subdir}/page_001.txt + """ + img_path, _ = self.samples[idx] + # img_path: /app/dataset/doc1/img/page_001.png + # Extract relative path: doc1/img/page_001.png + parts = img_path.split("/dataset/", 1) + if len(parts) == 2: + rel_path = parts[1] # doc1/img/page_001.png + else: + rel_path = os.path.basename(img_path) + + # Replace /img/ with /{output_subdir}/ + rel_parts = rel_path.rsplit("/img/", 1) + doc_folder = rel_parts[0] # doc1 + fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt + + out_dir = os.path.join(debugset_root, doc_folder, output_subdir) + os.makedirs(out_dir, exist_ok=True) + return os.path.join(out_dir, fname) \ No newline at end of file diff --git a/src/easyocr_service/docker-compose.yml b/src/easyocr_service/docker-compose.yml index 0b1b085..550e865 100644 --- a/src/easyocr_service/docker-compose.yml +++ b/src/easyocr_service/docker-compose.yml @@ -14,6 +14,7 @@ services: - "8002:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - easyocr-cache:/root/.EasyOCR environment: - PYTHONUNBUFFERED=1 @@ -34,6 +35,7 @@ services: - "8002:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - easyocr-cache:/root/.EasyOCR environment: - PYTHONUNBUFFERED=1 diff --git a/src/easyocr_service/easyocr_tuning_rest.py b/src/easyocr_service/easyocr_tuning_rest.py index 5fa6cd5..dd1b565 100644 --- a/src/easyocr_service/easyocr_tuning_rest.py +++ b/src/easyocr_service/easyocr_tuning_rest.py @@ -133,6 +133,7 @@ class EvaluateRequest(BaseModel): # Page range start_page: int = Field(5, ge=0, description="Start page index (inclusive)") end_page: int = Field(10, ge=1, description="End page index (exclusive)") + save_output: bool = Field(False, description="Save OCR predictions to debugset folder") class EvaluateResponse(BaseModel): @@ -301,6 +302,12 @@ def evaluate(request: EvaluateRequest): pred = assemble_easyocr_result(result) time_per_page_list.append(float(time.time() - tp0)) + # Save prediction to debugset if requested + if request.save_output: + out_path = state.dataset.get_output_path(idx, "easyocr_text") + with open(out_path, "w", encoding="utf-8") as f: + f.write(pred) + m = evaluate_text(ref, pred) cer_list.append(m["CER"]) wer_list.append(m["WER"]) diff --git a/src/output_raytune.ipynb b/src/output_raytune.ipynb deleted file mode 100644 index 7230e7e..0000000 --- a/src/output_raytune.ipynb +++ /dev/null @@ -1,1037 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "header", - "metadata": { - "papermill": { - "duration": 0.002022, - "end_time": "2026-01-18T16:25:38.048417", - "exception": false, - "start_time": "2026-01-18T16:25:38.046395", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# PaddleOCR Hyperparameter Optimization via REST API\n", - "\n", - "This notebook runs Ray Tune hyperparameter search calling the PaddleOCR REST API (Docker container).\n", - "\n", - "**Benefits:**\n", - "- No model reload per trial - Model stays loaded in Docker container\n", - "- Faster trials - Skip ~10s model load time per trial\n", - "- Cleaner code - REST API replaces subprocess + CLI arg parsing" - ] - }, - { - "cell_type": "markdown", - "id": "prereq", - "metadata": { - "papermill": { - "duration": 0.000855, - "end_time": "2026-01-18T16:25:38.058911", - "exception": false, - 
"start_time": "2026-01-18T16:25:38.058056", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Prerequisites\n", - "\n", - "Start 2 PaddleOCR workers for parallel hyperparameter tuning:\n", - "\n", - "```bash\n", - "cd src/paddle_ocr\n", - "docker compose -f docker-compose.workers.yml up\n", - "```\n", - "\n", - "This starts 2 GPU workers on ports 8001-8002, allowing 2 concurrent trials.\n", - "\n", - "For CPU-only systems:\n", - "```bash\n", - "docker compose -f docker-compose.workers.yml --profile cpu up\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "3ob9fsoilc4", - "metadata": { - "papermill": { - "duration": 0.000846, - "end_time": "2026-01-18T16:25:38.060620", - "exception": false, - "start_time": "2026-01-18T16:25:38.059774", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 0. Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "wyr2nsoj7", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:38.063421Z", - "iopub.status.busy": "2026-01-18T16:25:38.063287Z", - "iopub.status.idle": "2026-01-18T16:25:39.300678Z", - "shell.execute_reply": "2026-01-18T16:25:39.299298Z" - }, - "papermill": { - "duration": 1.240519, - "end_time": "2026-01-18T16:25:39.301973", - "exception": false, - "start_time": "2026-01-18T16:25:38.061454", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: ray[tune] in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.53.0)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: click>=7.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (8.3.1)\r\n", - "Requirement already satisfied: filelock in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (3.20.3)\r\n", - "Requirement already satisfied: jsonschema in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (4.26.0)\r\n", - "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (1.1.2)\r\n", - "Requirement already satisfied: packaging>=24.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (25.0)\r\n", - "Requirement already satisfied: protobuf>=3.20.3 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (6.33.4)\r\n", - "Requirement already satisfied: pyyaml in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (6.0.3)\r\n", - "Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.32.5)\r\n", - "Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.3.3)\r\n", - "Requirement already satisfied: pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.12.5)\r\n", - "Requirement already satisfied: tensorboardX>=1.9 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2.6.4)\r\n", - "Requirement already satisfied: pyarrow>=9.0.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (22.0.0)\r\n", - "Requirement already satisfied: fsspec in 
/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from ray[tune]) (2026.1.0)\r\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.7.0)\r\n", - "Requirement already satisfied: pydantic-core==2.41.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (2.41.5)\r\n", - "Requirement already satisfied: typing-extensions>=4.14.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (4.15.0)\r\n", - "Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n", - "Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from tensorboardX>=1.9->ray[tune]) (2.4.1)\r\n", - "Requirement already satisfied: attrs>=22.2.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (25.4.0)\r\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (2025.9.1)\r\n", - "Requirement already satisfied: referencing>=0.28.4 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (0.37.0)\r\n", - "Requirement already satisfied: rpds-py>=0.25.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (0.30.0)\r\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2.9.0.post0)\r\n", - "Requirement already satisfied: pytz>=2020.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2025.2)\r\n", - "Requirement already satisfied: tzdata>=2022.7 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas->ray[tune]) (2025.3)\r\n", - "Requirement already satisfied: six>=1.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->ray[tune]) (1.17.0)\r\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (3.4.4)\r\n", - "Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (3.11)\r\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (2.6.3)\r\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests->ray[tune]) (2026.1.4)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: optuna in 
/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (4.6.0)\r\n", - "Requirement already satisfied: alembic>=1.5.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (1.18.1)\r\n", - "Requirement already satisfied: colorlog in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (6.10.1)\r\n", - "Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (2.4.1)\r\n", - "Requirement already satisfied: packaging>=20.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (25.0)\r\n", - "Requirement already satisfied: sqlalchemy>=1.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (2.0.45)\r\n", - "Requirement already satisfied: tqdm in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (4.67.1)\r\n", - "Requirement already satisfied: PyYAML in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from optuna) (6.0.3)\r\n", - "Requirement already satisfied: Mako in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from alembic>=1.5.0->optuna) (1.3.10)\r\n", - "Requirement already satisfied: typing-extensions>=4.12 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from alembic>=1.5.0->optuna) (4.15.0)\r\n", - "Requirement already satisfied: greenlet>=1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from sqlalchemy>=1.4.2->optuna) (3.3.0)\r\n", - "Requirement already satisfied: MarkupSafe>=0.9.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from Mako->alembic>=1.5.0->optuna) (3.0.3)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.32.5)\r\n", - "Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.3.3)\r\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.11)\r\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2.6.3)\r\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2026.1.4)\r\n", - "Requirement already satisfied: numpy>=1.26.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2.4.1)\r\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\r\n", - "Requirement already satisfied: pytz>=2020.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2025.2)\r\n", - "Requirement already satisfied: tzdata>=2022.7 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pandas) (2025.3)\r\n", - "Requirement already satisfied: six>=1.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\r\n" - ] 
- }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# Install dependencies (run once)\n", - "%pip install -U \"ray[tune]\"\n", - "%pip install optuna\n", - "%pip install requests pandas" - ] - }, - { - "cell_type": "markdown", - "id": "imports-header", - "metadata": { - "papermill": { - "duration": 0.009444, - "end_time": "2026-01-18T16:25:39.312980", - "exception": false, - "start_time": "2026-01-18T16:25:39.303536", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 1. Imports & Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "imports", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:39.316439Z", - "iopub.status.busy": "2026-01-18T16:25:39.316230Z", - "iopub.status.idle": "2026-01-18T16:25:40.277894Z", - "shell.execute_reply": "2026-01-18T16:25:40.277012Z" - }, - "papermill": { - "duration": 0.964409, - "end_time": "2026-01-18T16:25:40.278450", - "exception": false, - "start_time": "2026-01-18T16:25:39.314041", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "from datetime import datetime\n", - "\n", - "import requests\n", - "import pandas as pd\n", - "\n", - "import ray\n", - "from ray import tune, train\n", - "from ray.tune.search.optuna import OptunaSearch" - ] - }, - { - "cell_type": "markdown", - "id": "config-header", - "metadata": { - "papermill": { - "duration": 0.009552, - "end_time": "2026-01-18T16:25:40.289551", - "exception": false, - "start_time": "2026-01-18T16:25:40.279999", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 2. API Configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "config", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.292573Z", - "iopub.status.busy": "2026-01-18T16:25:40.292489Z", - "iopub.status.idle": "2026-01-18T16:25:40.294713Z", - "shell.execute_reply": "2026-01-18T16:25:40.294164Z" - }, - "papermill": { - "duration": 0.004591, - "end_time": "2026-01-18T16:25:40.295202", - "exception": false, - "start_time": "2026-01-18T16:25:40.290611", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# PaddleOCR REST API endpoints - 2 workers for parallel trials\n", - "# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\n", - "WORKER_PORTS = [8001, 8002]\n", - "WORKER_URLS = [f\"http://localhost:{port}\" for port in WORKER_PORTS]\n", - "\n", - "# Output folder for results\n", - "OUTPUT_FOLDER = \"results\"\n", - "os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n", - "\n", - "# Number of concurrent trials = number of workers\n", - "NUM_WORKERS = len(WORKER_URLS)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "health-check", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.298281Z", - "iopub.status.busy": "2026-01-18T16:25:40.298161Z", - "iopub.status.idle": "2026-01-18T16:25:40.306720Z", - "shell.execute_reply": "2026-01-18T16:25:40.306262Z" - }, - "papermill": { - "duration": 0.010723, - "end_time": "2026-01-18T16:25:40.307025", - "exception": false, - "start_time": "2026-01-18T16:25:40.296302", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✓ http://localhost:8001: ok (GPU: None)\n", - "✓ http://localhost:8002: ok (GPU: None)\n", - "\n", - "2/2 workers 
ready for parallel tuning\n" - ] - } - ], - "source": [ - "# Verify all workers are running\n", - "healthy_workers = []\n", - "for url in WORKER_URLS:\n", - " try:\n", - " health = requests.get(f\"{url}/health\", timeout=10).json()\n", - " if health['status'] == 'ok' and health['model_loaded']:\n", - " healthy_workers.append(url)\n", - " print(f\"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})\")\n", - " else:\n", - " print(f\"✗ {url}: not ready yet\")\n", - " except requests.exceptions.ConnectionError:\n", - " print(f\"✗ {url}: not reachable\")\n", - "\n", - "if not healthy_workers:\n", - " raise RuntimeError(\n", - " \"No healthy workers found. Start them with:\\n\"\n", - " \" cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\"\n", - " )\n", - "\n", - "print(f\"\\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning\")" - ] - }, - { - "cell_type": "markdown", - "id": "search-space-header", - "metadata": { - "papermill": { - "duration": 0.001073, - "end_time": "2026-01-18T16:25:40.309261", - "exception": false, - "start_time": "2026-01-18T16:25:40.308188", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 3. Search Space" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "search-space", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.312177Z", - "iopub.status.busy": "2026-01-18T16:25:40.312107Z", - "iopub.status.idle": "2026-01-18T16:25:40.314237Z", - "shell.execute_reply": "2026-01-18T16:25:40.313794Z" - }, - "papermill": { - "duration": 0.004476, - "end_time": "2026-01-18T16:25:40.314804", - "exception": false, - "start_time": "2026-01-18T16:25:40.310328", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "search_space = {\n", - " # Whether to use document image orientation classification\n", - " \"use_doc_orientation_classify\": tune.choice([True, False]),\n", - " # Whether to use text image unwarping\n", - " \"use_doc_unwarping\": tune.choice([True, False]),\n", - " # Whether to use text line orientation classification\n", - " \"textline_orientation\": tune.choice([True, False]),\n", - " # Detection pixel threshold (pixels > threshold are considered text)\n", - " \"text_det_thresh\": tune.uniform(0.0, 0.7),\n", - " # Detection box threshold (average score within border)\n", - " \"text_det_box_thresh\": tune.uniform(0.0, 0.7),\n", - " # Text detection expansion coefficient\n", - " \"text_det_unclip_ratio\": tune.choice([0.0]),\n", - " # Text recognition threshold (filter low confidence results)\n", - " \"text_rec_score_thresh\": tune.uniform(0.0, 0.7),\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "trainable-header", - "metadata": { - "papermill": { - "duration": 0.001057, - "end_time": "2026-01-18T16:25:40.316975", - "exception": false, - "start_time": "2026-01-18T16:25:40.315918", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 4. 
Trainable Function" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "trainable", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.319825Z", - "iopub.status.busy": "2026-01-18T16:25:40.319771Z", - "iopub.status.idle": "2026-01-18T16:25:40.322602Z", - "shell.execute_reply": "2026-01-18T16:25:40.322112Z" - }, - "papermill": { - "duration": 0.004907, - "end_time": "2026-01-18T16:25:40.322948", - "exception": false, - "start_time": "2026-01-18T16:25:40.318041", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "def trainable_paddle_ocr(config):\n", - " \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n", - " import random\n", - " import requests\n", - " from ray import train\n", - "\n", - " # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n", - " WORKER_PORTS = [8001, 8002]\n", - " api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n", - "\n", - " payload = {\n", - " \"pdf_folder\": \"/app/dataset\",\n", - " \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n", - " \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n", - " \"textline_orientation\": config.get(\"textline_orientation\", True),\n", - " \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n", - " \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n", - " \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n", - " \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n", - " \"start_page\": 5,\n", - " \"end_page\": 10,\n", - " }\n", - "\n", - " try:\n", - " response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n", - " response.raise_for_status()\n", - " metrics = response.json()\n", - " metrics[\"worker\"] = api_url\n", - " train.report(metrics)\n", - " except Exception as e:\n", - " train.report({\n", - " \"CER\": 1.0,\n", - " \"WER\": 1.0,\n", - " \"TIME\": 0.0,\n", - " \"PAGES\": 0,\n", - " \"TIME_PER_PAGE\": 0,\n", - " \"worker\": api_url,\n", - " \"ERROR\": str(e)[:500]\n", - " })" - ] - }, - { - "cell_type": "markdown", - "id": "tuner-header", - "metadata": { - "papermill": { - "duration": 0.001058, - "end_time": "2026-01-18T16:25:40.325120", - "exception": false, - "start_time": "2026-01-18T16:25:40.324062", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## 5. 
Run Tuner" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ray-init", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:40.328162Z", - "iopub.status.busy": "2026-01-18T16:25:40.328055Z", - "iopub.status.idle": "2026-01-18T16:25:42.985307Z", - "shell.execute_reply": "2026-01-18T16:25:42.984863Z" - }, - "papermill": { - "duration": 2.65986, - "end_time": "2026-01-18T16:25:42.986041", - "exception": false, - "start_time": "2026-01-18T16:25:40.326181", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-18 17:25:41,631\tINFO worker.py:2007 -- Started a local Ray instance.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ray Tune ready (version: 2.53.0)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py:2046: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "ray.init(ignore_reinit_error=True)\n", - "print(f\"Ray Tune ready (version: {ray.__version__})\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "tuner", - "metadata": { - "execution": { - "iopub.execute_input": "2026-01-18T16:25:42.998698Z", - "iopub.status.busy": "2026-01-18T16:25:42.998141Z" - }, - "papermill": { - "duration": null, - "end_time": null, - "exception": false, - "start_time": "2026-01-18T16:25:42.987700", - "status": "running" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
Tune Status\n",
- "Current time: 2026-01-18 17:38:46 | Running for: 00:13:03.82 | Memory: 14.3/119.7 GiB\n",
- "\n",
- "System Info\n",
- "Using FIFO scheduling algorithm.\n",
- "Logical resource usage: 2.0/20 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:GB10)\n",
- "\n",
- "Messages\n",
- "Number of errored trials: 3\n",
- "Trial name | # failures | error file\n",
- "trainable_paddle_ocr_36ae4d11 | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_36ae4d11_1_text_det_box_thresh=0.5847,text_det_thresh=0.2571,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-43/error.txt\n",
- "trainable_paddle_ocr_2312d29c | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_2312d29c_2_text_det_box_thresh=0.0312,text_det_thresh=0.0223,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-44/error.txt\n",
- "trainable_paddle_ocr_5b7b8e02 | 1 | /tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_5b7b8e02_3_text_det_box_thresh=0.5954,text_det_thresh=0.0707,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-31-48/error.txt\n",
- "\n",
- "Trial Status\n",
- "Trial name | status | loc | text_det_box_thresh | text_det_thresh | text_det_unclip_ratio | text_rec_score_thresh | textline_orientation | use_doc_orientation_classify | use_doc_unwarping\n",
- "trainable_paddle_ocr_b3243c8a | RUNNING | 192.168.65.140:1288101 | 0.360789 | 0.499551 | 0 | 0.115115 | False | True | False\n",
- "trainable_paddle_ocr_7a4a43b0 | PENDING | | 0.0727848 | 0.237729 | 0 | 0.33623 | True | False | True\n",
- "trainable_paddle_ocr_36ae4d11 | ERROR | 192.168.65.140:1282742 | 0.58473 | 0.257102 | 0 | 0.634955 | False | True | False\n",
- "trainable_paddle_ocr_2312d29c | ERROR | 192.168.65.140:1282844 | 0.0311783 | 0.0222724 | 0 | 0.141805 | False | True | False\n",
- "trainable_paddle_ocr_5b7b8e02 | ERROR | 192.168.65.140:1285648 | 0.595412 | 0.0706522 | 0 | 0.132174 | True | False | True
\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 17:26:10,501 E 1281442 1281442] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[33m(raylet)\u001b[0m [2026-01-18 17:26:11,550 E 1281587 1281587] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36m(bundle_reservation_check_func pid=1281657)\u001b[0m [2026-01-18 17:26:12,349 E 1281657 1281801] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2026-01-18 17:26:12,987 E 1281294 1281656] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-18 17:31:48,050\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_36ae4d11\n", - "Traceback (most recent call last):\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n", - " result = ray.get(future)\n", - " ^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n", - " values, debugger_breakpoint = worker.get_objects(\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n", - " raise value.as_instanceof_cause()\n", - "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282742, ip=192.168.65.140, actor_id=d19d5170bbb9faf9c9fa055f01000000, repr=trainable_paddle_ocr)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n", - " raise skipped from exception_cause(skipped)\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n", - " self._ret = self._target(*self._args, **self._kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " 
File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n", - " training_func=lambda: self._trainable_func(self.config),\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n", - " output = fn()\n", - " ^^^^\n", - " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n", - " raise DeprecationWarning(\n", - "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36m(trainable_paddle_ocr pid=1285648)\u001b[0m [2026-01-18 17:32:19,397 E 1285648 1285683] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\u001b[32m [repeated 20x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-18 17:38:37,341\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_2312d29c\n", - "Traceback (most recent call last):\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n", - " result = ray.get(future)\n", - " ^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n", - " values, debugger_breakpoint = worker.get_objects(\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n", - " raise value.as_instanceof_cause()\n", - "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282844, ip=192.168.65.140, actor_id=845cd8594f8ace3d960b90e501000000, repr=trainable_paddle_ocr)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n", - " raise skipped from exception_cause(skipped)\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n", - " self._ret = self._target(*self._args, **self._kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", 
- " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n", - " training_func=lambda: self._trainable_func(self.config),\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n", - " output = fn()\n", - " ^^^^\n", - " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n", - " raise DeprecationWarning(\n", - "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-18 17:38:46,519\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_5b7b8e02\n", - "Traceback (most recent call last):\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n", - " result = ray.get(future)\n", - " ^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n", - " values, debugger_breakpoint = worker.get_objects(\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n", - " raise value.as_instanceof_cause()\n", - "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1285648, ip=192.168.65.140, actor_id=b8478e34aea747352febbe0801000000, repr=trainable_paddle_ocr)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n", - " raise skipped from exception_cause(skipped)\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n", - " self._ret = self._target(*self._args, **self._kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n", - " training_func=lambda: self._trainable_func(self.config),\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n", - " output = fn()\n", - " ^^^^\n", - " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n", - " File 
\"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n", - " raise DeprecationWarning(\n", - "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n" - ] - } - ], - "source": [ - "tuner = tune.Tuner(\n", - " trainable_paddle_ocr,\n", - " tune_config=tune.TuneConfig(\n", - " metric=\"CER\",\n", - " mode=\"min\",\n", - " search_alg=OptunaSearch(),\n", - " num_samples=64,\n", - " max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n", - " ),\n", - " param_space=search_space,\n", - ")\n", - "\n", - "results = tuner.fit()" - ] - }, - { - "cell_type": "markdown", - "id": "analysis-header", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "source": [ - "## 6. Results Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "results-df", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "df = results.get_dataframe()\n", - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "save-results", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Save results to CSV\n", - "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", - "filename = f\"raytune_paddle_rest_results_{timestamp}.csv\"\n", - "filepath = os.path.join(OUTPUT_FOLDER, filename)\n", - "\n", - "df.to_csv(filepath, index=False)\n", - "print(f\"Results saved: {filepath}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "best-config", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Best configuration\n", - "best = df.loc[df[\"CER\"].idxmin()]\n", - "\n", - "print(f\"Best CER: {best['CER']:.6f}\")\n", - "print(f\"Best WER: {best['WER']:.6f}\")\n", - "print(f\"\\nOptimal Configuration:\")\n", - "print(f\" textline_orientation: {best['config/textline_orientation']}\")\n", - "print(f\" use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}\")\n", - "print(f\" use_doc_unwarping: {best['config/use_doc_unwarping']}\")\n", - "print(f\" text_det_thresh: {best['config/text_det_thresh']:.4f}\")\n", - "print(f\" text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}\")\n", - "print(f\" text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}\")\n", - "print(f\" text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "correlation", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Correlation analysis\n", - "param_cols = [\n", - " \"config/text_det_thresh\",\n", - " \"config/text_det_box_thresh\",\n", - " \"config/text_det_unclip_ratio\",\n", - " \"config/text_rec_score_thresh\",\n", - "]\n", - "\n", - 
"corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n", - "corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n", - "\n", - "print(\"Correlation with CER:\")\n", - "print(corr_cer)\n", - "print(\"\\nCorrelation with WER:\")\n", - "print(corr_wer)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - }, - "papermill": { - "default_parameters": {}, - "duration": null, - "end_time": null, - "environment_variables": {}, - "exception": null, - "input_path": "paddle_ocr_raytune_rest.ipynb", - "output_path": "output_raytune.ipynb", - "parameters": {}, - "start_time": "2026-01-18T16:25:37.429790", - "version": "2.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/src/paddle_ocr/dataset_manager.py b/src/paddle_ocr/dataset_manager.py index 2d3ccac..e9ea973 100644 --- a/src/paddle_ocr/dataset_manager.py +++ b/src/paddle_ocr/dataset_manager.py @@ -42,4 +42,33 @@ class ImageTextDataset: with open(txt_path, "r", encoding="utf-8") as f: text = f.read() - return image, text \ No newline at end of file + return image, text + + def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"): + """Get output path for saving OCR result to debugset folder. + + Args: + idx: Sample index + output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text') + debugset_root: Root folder for debug output (default: /app/debugset) + + Returns: + Path like /app/debugset/doc1/{output_subdir}/page_001.txt + """ + img_path, _ = self.samples[idx] + # img_path: /app/dataset/doc1/img/page_001.png + # Extract relative path: doc1/img/page_001.png + parts = img_path.split("/dataset/", 1) + if len(parts) == 2: + rel_path = parts[1] # doc1/img/page_001.png + else: + rel_path = os.path.basename(img_path) + + # Replace /img/ with /{output_subdir}/ + rel_parts = rel_path.rsplit("/img/", 1) + doc_folder = rel_parts[0] # doc1 + fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt + + out_dir = os.path.join(debugset_root, doc_folder, output_subdir) + os.makedirs(out_dir, exist_ok=True) + return os.path.join(out_dir, fname) \ No newline at end of file diff --git a/src/paddle_ocr/docker-compose.cpu-registry.yml b/src/paddle_ocr/docker-compose.cpu-registry.yml index 1d9246f..550ecd3 100644 --- a/src/paddle_ocr/docker-compose.cpu-registry.yml +++ b/src/paddle_ocr/docker-compose.cpu-registry.yml @@ -9,6 +9,7 @@ services: - "8001:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - paddlex-cache:/root/.paddlex environment: - PYTHONUNBUFFERED=1 diff --git a/src/paddle_ocr/docker-compose.gpu-registry.yml b/src/paddle_ocr/docker-compose.gpu-registry.yml index 6e606c2..bd9b991 100644 --- a/src/paddle_ocr/docker-compose.gpu-registry.yml +++ b/src/paddle_ocr/docker-compose.gpu-registry.yml @@ -11,6 +11,7 @@ services: - "8002:8000" volumes: - ../dataset:/app/dataset:ro + - ../debugset:/app/debugset:rw - paddlex-cache:/root/.paddlex - ./scripts:/app/scripts:ro environment: diff --git a/src/paddle_ocr/docker-compose.workers.yml b/src/paddle_ocr/docker-compose.workers.yml index 222ea82..cada286 100644 --- a/src/paddle_ocr/docker-compose.workers.yml +++ 
b/src/paddle_ocr/docker-compose.workers.yml
@@ -16,6 +16,7 @@ x-ocr-gpu-common: &ocr-gpu-common
   image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
   volumes:
     - ../dataset:/app/dataset:ro
+    - ../debugset:/app/debugset:rw
     - paddlex-cache:/root/.paddlex
   environment:
     - PYTHONUNBUFFERED=1
@@ -39,6 +40,7 @@ x-ocr-cpu-common: &ocr-cpu-common
   image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
   volumes:
     - ../dataset:/app/dataset:ro
+    - ../debugset:/app/debugset:rw
     - paddlex-cache:/root/.paddlex
   environment:
     - PYTHONUNBUFFERED=1
diff --git a/src/paddle_ocr/docker-compose.yml b/src/paddle_ocr/docker-compose.yml
index 22c887b..5641717 100644
--- a/src/paddle_ocr/docker-compose.yml
+++ b/src/paddle_ocr/docker-compose.yml
@@ -45,7 +45,8 @@ services:
     ports:
       - "8000:8000"
     volumes:
-      - ../dataset:/app/dataset:ro # Your dataset
+      - ../dataset:/app/dataset:ro # Your dataset
+      - ../debugset:/app/debugset:rw # Debug output (OCR predictions)
       - paddlex-cache:/root/.paddlex # For additional models at runtime
     environment:
       - PYTHONUNBUFFERED=1
@@ -74,6 +75,7 @@ services:
       - "8000:8000"
     volumes:
       - ../dataset:/app/dataset:ro
+      - ../debugset:/app/debugset:rw
       - paddlex-cache:/root/.paddlex
     environment:
       - PYTHONUNBUFFERED=1
diff --git a/src/paddle_ocr/paddle_ocr_tuning_rest.py b/src/paddle_ocr/paddle_ocr_tuning_rest.py
index 6e836c6..b61ff0e 100644
--- a/src/paddle_ocr/paddle_ocr_tuning_rest.py
+++ b/src/paddle_ocr/paddle_ocr_tuning_rest.py
@@ -127,6 +127,7 @@ class EvaluateRequest(BaseModel):
     text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
     start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
     end_page: int = Field(10, ge=1, description="End page index (exclusive)")
+    save_output: bool = Field(False, description="Save OCR predictions to debugset folder")
 
 
 class EvaluateResponse(BaseModel):
@@ -307,6 +308,12 @@ def evaluate(request: EvaluateRequest):
         pred = assemble_from_paddle_result(out)
         time_per_page_list.append(float(time.time() - tp0))
 
+        # Save prediction to debugset if requested
+        if request.save_output:
+            out_path = state.dataset.get_output_path(idx, "paddle_text")
+            with open(out_path, "w", encoding="utf-8") as f:
+                f.write(pred)
+
         m = evaluate_text(ref, pred)
         cer_list.append(m["CER"])
         wer_list.append(m["WER"])
diff --git a/src/paddle_ocr_raytune_rest.ipynb b/src/paddle_ocr_raytune_rest.ipynb
index 44710b9..f2fe22c 100644
--- a/src/paddle_ocr_raytune_rest.ipynb
+++ b/src/paddle_ocr_raytune_rest.ipynb
@@ -7,263 +7,81 @@
    "source": [
     "# PaddleOCR Hyperparameter Optimization via REST API\n",
     "\n",
-    "This notebook runs Ray Tune hyperparameter search calling the PaddleOCR REST API (Docker container).\n",
+    "Uses Ray Tune + Optuna to find optimal PaddleOCR parameters.\n",
     "\n",
-    "**Benefits:**\n",
-    "- No model reload per trial - Model stays loaded in Docker container\n",
-    "- Faster trials - Skip ~10s model load time per trial\n",
-    "- Cleaner code - REST API replaces subprocess + CLI arg parsing"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "prereq",
-   "metadata": {},
-   "source": [
     "## Prerequisites\n",
     "\n",
-    "Start 2 PaddleOCR workers for parallel hyperparameter tuning:\n",
-    "\n",
     "```bash\n",
     "cd src/paddle_ocr\n",
-    "docker compose -f docker-compose.workers.yml up\n",
-    "```\n",
-    "\n",
-    "This starts 2 GPU workers on ports 8001-8002, allowing 2 concurrent trials.\n",
-    "\n",
-    "For CPU-only systems:\n",
-    "```bash\n",
-    "docker compose -f docker-compose.workers.yml --profile cpu up\n",
+    "docker compose -f docker-compose.workers.yml up # GPU workers on 8001-8002\n",
+    "# 
or: docker compose -f docker-compose.workers.yml --profile cpu up\n", "```" ] }, { - "cell_type": "markdown", - "id": "3ob9fsoilc4", + "cell_type": "code", + "execution_count": null, + "id": "deps", "metadata": {}, + "outputs": [], "source": [ - "## 0. Dependencies" + "%pip install -q -U \"ray[tune]\" optuna requests pandas" ] }, { "cell_type": "code", "execution_count": null, - "id": "wyr2nsoj7", + "id": "setup", "metadata": {}, "outputs": [], "source": [ - "# Install dependencies (run once)\n", - "%pip install -U \"ray[tune]\"\n", - "%pip install optuna\n", - "%pip install requests pandas" - ] - }, - { - "cell_type": "markdown", - "id": "imports-header", - "metadata": {}, - "source": [ - "## 1. Imports & Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "imports", - "metadata": {}, - "outputs": [], - "source": "import os\nfrom datetime import datetime\n\nimport requests\nimport pandas as pd\n\nimport ray\nfrom ray import tune, train\nfrom ray.tune.search.optuna import OptunaSearch" - }, - { - "cell_type": "markdown", - "id": "config-header", - "metadata": {}, - "source": [ - "## 2. API Configuration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "config", - "metadata": {}, - "outputs": [], - "source": [ - "# PaddleOCR REST API endpoints - 2 workers for parallel trials\n", - "# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\n", - "WORKER_PORTS = [8001, 8002]\n", - "WORKER_URLS = [f\"http://localhost:{port}\" for port in WORKER_PORTS]\n", + "from raytune_ocr import (\n", + " check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n", + " paddle_ocr_payload, PADDLE_OCR_SEARCH_SPACE, PADDLE_OCR_CONFIG_KEYS,\n", + ")\n", "\n", - "# Output folder for results\n", - "OUTPUT_FOLDER = \"results\"\n", - "os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n", + "# Worker ports\n", + "PORTS = [8001, 8002]\n", "\n", - "# Number of concurrent trials = number of workers\n", - "NUM_WORKERS = len(WORKER_URLS)" + "# Check workers are running\n", + "healthy = check_workers(PORTS, \"PaddleOCR\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "health-check", + "id": "tune", "metadata": {}, "outputs": [], "source": [ - "# Verify all workers are running\n", - "healthy_workers = []\n", - "for url in WORKER_URLS:\n", - " try:\n", - " health = requests.get(f\"{url}/health\", timeout=10).json()\n", - " if health['status'] == 'ok' and health['model_loaded']:\n", - " healthy_workers.append(url)\n", - " print(f\"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})\")\n", - " else:\n", - " print(f\"✗ {url}: not ready yet\")\n", - " except requests.exceptions.ConnectionError:\n", - " print(f\"✗ {url}: not reachable\")\n", + "# Create trainable and run tuning\n", + "trainable = create_trainable(PORTS, paddle_ocr_payload)\n", "\n", - "if not healthy_workers:\n", - " raise RuntimeError(\n", - " \"No healthy workers found. 
Start them with:\\n\"\n", - " \" cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\"\n", - " )\n", + "results = run_tuner(\n", + " trainable=trainable,\n", + " search_space=PADDLE_OCR_SEARCH_SPACE,\n", + " num_samples=64,\n", + " num_workers=len(healthy),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "analysis", + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze results\n", + "df = analyze_results(\n", + " results,\n", + " prefix=\"raytune_paddle\",\n", + " config_keys=PADDLE_OCR_CONFIG_KEYS,\n", + ")\n", "\n", - "print(f\"\\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning\")" - ] - }, - { - "cell_type": "markdown", - "id": "search-space-header", - "metadata": {}, - "source": [ - "## 3. Search Space" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "search-space", - "metadata": {}, - "outputs": [], - "source": [ - "search_space = {\n", - " # Whether to use document image orientation classification\n", - " \"use_doc_orientation_classify\": tune.choice([True, False]),\n", - " # Whether to use text image unwarping\n", - " \"use_doc_unwarping\": tune.choice([True, False]),\n", - " # Whether to use text line orientation classification\n", - " \"textline_orientation\": tune.choice([True, False]),\n", - " # Detection pixel threshold (pixels > threshold are considered text)\n", - " \"text_det_thresh\": tune.uniform(0.0, 0.7),\n", - " # Detection box threshold (average score within border)\n", - " \"text_det_box_thresh\": tune.uniform(0.0, 0.7),\n", - " # Text detection expansion coefficient\n", - " \"text_det_unclip_ratio\": tune.choice([0.0]),\n", - " # Text recognition threshold (filter low confidence results)\n", - " \"text_rec_score_thresh\": tune.uniform(0.0, 0.7),\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "trainable-header", - "metadata": {}, - "source": [ - "## 4. Trainable Function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "trainable", - "metadata": {}, - "outputs": [], - "source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import train\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n train.report(metrics)\n except Exception as e:\n train.report({\n \"CER\": 1.0,\n \"WER\": 1.0,\n \"TIME\": 0.0,\n \"PAGES\": 0,\n \"TIME_PER_PAGE\": 0,\n \"worker\": api_url,\n \"ERROR\": str(e)[:500]\n })" - }, - { - "cell_type": "markdown", - "id": "tuner-header", - "metadata": {}, - "source": [ - "## 5. 
Run Tuner" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ray-init", - "metadata": {}, - "outputs": [], - "source": [ - "ray.init(ignore_reinit_error=True)\n", - "print(f\"Ray Tune ready (version: {ray.__version__})\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "tuner", - "metadata": {}, - "outputs": [], - "source": "tuner = tune.Tuner(\n trainable_paddle_ocr,\n tune_config=tune.TuneConfig(\n metric=\"CER\",\n mode=\"min\",\n search_alg=OptunaSearch(),\n num_samples=64,\n max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n ),\n param_space=search_space,\n)\n\nresults = tuner.fit()" - }, - { - "cell_type": "markdown", - "id": "analysis-header", - "metadata": {}, - "source": [ - "## 6. Results Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "results-df", - "metadata": {}, - "outputs": [], - "source": [ - "df = results.get_dataframe()\n", "df.describe()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "save-results", - "metadata": {}, - "outputs": [], - "source": [ - "# Save results to CSV\n", - "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", - "filename = f\"raytune_paddle_rest_results_{timestamp}.csv\"\n", - "filepath = os.path.join(OUTPUT_FOLDER, filename)\n", - "\n", - "df.to_csv(filepath, index=False)\n", - "print(f\"Results saved: {filepath}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "best-config", - "metadata": {}, - "outputs": [], - "source": [ - "# Best configuration\n", - "best = df.loc[df[\"CER\"].idxmin()]\n", - "\n", - "print(f\"Best CER: {best['CER']:.6f}\")\n", - "print(f\"Best WER: {best['WER']:.6f}\")\n", - "print(f\"\\nOptimal Configuration:\")\n", - "print(f\" textline_orientation: {best['config/textline_orientation']}\")\n", - "print(f\" use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}\")\n", - "print(f\" use_doc_unwarping: {best['config/use_doc_unwarping']}\")\n", - "print(f\" text_det_thresh: {best['config/text_det_thresh']:.4f}\")\n", - "print(f\" text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}\")\n", - "print(f\" text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}\")\n", - "print(f\" text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -272,42 +90,21 @@ "outputs": [], "source": [ "# Correlation analysis\n", - "param_cols = [\n", - " \"config/text_det_thresh\",\n", - " \"config/text_det_box_thresh\",\n", - " \"config/text_det_unclip_ratio\",\n", - " \"config/text_rec_score_thresh\",\n", - "]\n", - "\n", - "corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n", - "corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n", - "\n", - "print(\"Correlation with CER:\")\n", - "print(corr_cer)\n", - "print(\"\\nCorrelation with WER:\")\n", - "print(corr_wer)" + "correlation_analysis(df, PADDLE_OCR_CONFIG_KEYS)" ] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/src/raytune_ocr.py 
b/src/raytune_ocr.py
new file mode 100644
index 0000000..c1f53c7
--- /dev/null
+++ b/src/raytune_ocr.py
@@ -0,0 +1,333 @@
+# raytune_ocr.py
+# Shared Ray Tune utilities for OCR hyperparameter optimization
+#
+# Usage:
+#   from raytune_ocr import check_workers, create_trainable, run_tuner, analyze_results
+
+import os
+from datetime import datetime
+from typing import List, Dict, Any, Callable, Optional
+
+import requests
+import pandas as pd
+
+import ray
+from ray import tune, train
+from ray.tune.search.optuna import OptunaSearch
+
+
+def check_workers(ports: List[int], service_name: str = "OCR") -> List[str]:
+    """
+    Verify workers are running and return healthy URLs.
+
+    Args:
+        ports: List of port numbers to check
+        service_name: Name for error messages
+
+    Returns:
+        List of healthy worker URLs
+
+    Raises:
+        RuntimeError if no healthy workers found
+    """
+    worker_urls = [f"http://localhost:{port}" for port in ports]
+    healthy_workers = []
+
+    for url in worker_urls:
+        try:
+            health = requests.get(f"{url}/health", timeout=10).json()
+            if health.get('status') == 'ok' and health.get('model_loaded'):
+                healthy_workers.append(url)
+                gpu = health.get('gpu_name', 'CPU')
+                print(f"✓ {url}: {health['status']} ({gpu})")
+            else:
+                print(f"✗ {url}: not ready yet")
+        except requests.exceptions.RequestException:  # covers refused connections and timeouts
+            print(f"✗ {url}: not reachable")
+
+    if not healthy_workers:
+        raise RuntimeError(
+            f"No healthy {service_name} workers found.\n"
+            f"Checked ports: {ports}"
+        )
+
+    print(f"\n{len(healthy_workers)}/{len(worker_urls)} workers ready")
+    return healthy_workers
+
+
+def create_trainable(ports: List[int], payload_fn: Callable[[Dict], Dict]) -> Callable:
+    """
+    Factory to create a trainable function for Ray Tune.
+
+    Args:
+        ports: List of worker ports for load balancing
+        payload_fn: Function that takes config dict and returns API payload dict
+
+    Returns:
+        Trainable function for Ray Tune
+    """
+    def trainable(config):
+        import random
+        import requests
+        from ray import train
+
+        api_url = f"http://localhost:{random.choice(ports)}"
+        payload = payload_fn(config)
+
+        try:
+            response = requests.post(f"{api_url}/evaluate", json=payload, timeout=None)
+            response.raise_for_status()
+            metrics = response.json()
+            metrics["worker"] = api_url
+            train.report(metrics)
+        except Exception as e:
+            # Report worst-case metrics so a failed trial does not abort the whole run
+            train.report({
+                "CER": 1.0,
+                "WER": 1.0,
+                "TIME": 0.0,
+                "PAGES": 0,
+                "TIME_PER_PAGE": 0,
+                "worker": api_url,
+                "ERROR": str(e)[:500]
+            })
+
+    return trainable
+
+
+def run_tuner(
+    trainable: Callable,
+    search_space: Dict[str, Any],
+    num_samples: int = 64,
+    num_workers: int = 1,
+    metric: str = "CER",
+    mode: str = "min",
+) -> tune.ResultGrid:
+    """
+    Initialize Ray and run hyperparameter tuning.
+
+    Args:
+        trainable: Trainable function from create_trainable()
+        search_space: Dict of parameter names to tune.* search spaces
+        num_samples: Number of trials to run
+        num_workers: Max concurrent trials
+        metric: Metric to optimize
+        mode: "min" or "max"
+
+    Returns:
+        Ray Tune ResultGrid
+    """
+    ray.init(ignore_reinit_error=True, include_dashboard=False)
+    print(f"Ray Tune ready (version: {ray.__version__})")
+
+    tuner = tune.Tuner(
+        trainable,
+        tune_config=tune.TuneConfig(
+            metric=metric,
+            mode=mode,
+            search_alg=OptunaSearch(),
+            num_samples=num_samples,
+            max_concurrent_trials=num_workers,
+        ),
+        param_space=search_space,
+    )
+
+    return tuner.fit()
+
+
+def analyze_results(
+    results: tune.ResultGrid,
+    output_folder: str = "results",
+    prefix: str = "raytune",
+    config_keys: Optional[List[str]] = None,
+) -> pd.DataFrame:
+    """
+    Analyze and save tuning results.
+
+    Args:
+        results: Ray Tune ResultGrid
+        output_folder: Directory to save CSV
+        prefix: Filename prefix
+        config_keys: List of config keys to show in best result (without 'config/' prefix)
+
+    Returns:
+        Results DataFrame
+    """
+    os.makedirs(output_folder, exist_ok=True)
+    df = results.get_dataframe()
+
+    # Save to CSV
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = f"{prefix}_results_{timestamp}.csv"
+    filepath = os.path.join(output_folder, filename)
+    df.to_csv(filepath, index=False)
+    print(f"Results saved: {filepath}")
+
+    # Best configuration
+    best = df.loc[df["CER"].idxmin()]
+    print(f"\nBest CER: {best['CER']:.6f}")
+    print(f"Best WER: {best['WER']:.6f}")
+
+    if config_keys:
+        print("\nOptimal Configuration:")
+        for key in config_keys:
+            col = f"config/{key}"
+            if col in best:
+                val = best[col]
+                if isinstance(val, float):
+                    print(f"  {key}: {val:.4f}")
+                else:
+                    print(f"  {key}: {val}")
+
+    return df
+
+
+def correlation_analysis(df: pd.DataFrame, param_keys: List[str]) -> None:
+    """
+    Print correlation of numeric parameters with CER/WER.
+ + Args: + df: Results DataFrame + param_keys: List of config keys (without 'config/' prefix) + """ + param_cols = [f"config/{k}" for k in param_keys if f"config/{k}" in df.columns] + numeric_cols = [c for c in param_cols if df[c].dtype in ['float64', 'int64']] + + if not numeric_cols: + print("No numeric parameters for correlation analysis") + return + + corr_cer = df[numeric_cols + ["CER"]].corr()["CER"].sort_values(ascending=False) + corr_wer = df[numeric_cols + ["WER"]].corr()["WER"].sort_values(ascending=False) + + print("Correlation with CER:") + print(corr_cer) + print("\nCorrelation with WER:") + print(corr_wer) + + +# ============================================================================= +# OCR-specific payload functions +# ============================================================================= + +def paddle_ocr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict: + """Create payload for PaddleOCR API.""" + return { + "pdf_folder": "/app/dataset", + "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False), + "use_doc_unwarping": config.get("use_doc_unwarping", False), + "textline_orientation": config.get("textline_orientation", True), + "text_det_thresh": config.get("text_det_thresh", 0.0), + "text_det_box_thresh": config.get("text_det_box_thresh", 0.0), + "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5), + "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0), + "start_page": start_page, + "end_page": end_page, + "save_output": save_output, + } + + +def doctr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict: + """Create payload for DocTR API.""" + return { + "pdf_folder": "/app/dataset", + "assume_straight_pages": config.get("assume_straight_pages", True), + "straighten_pages": config.get("straighten_pages", False), + "preserve_aspect_ratio": config.get("preserve_aspect_ratio", True), + "symmetric_pad": config.get("symmetric_pad", True), + "disable_page_orientation": config.get("disable_page_orientation", False), + "disable_crop_orientation": config.get("disable_crop_orientation", False), + "resolve_lines": config.get("resolve_lines", True), + "resolve_blocks": config.get("resolve_blocks", False), + "paragraph_break": config.get("paragraph_break", 0.035), + "start_page": start_page, + "end_page": end_page, + "save_output": save_output, + } + + +def easyocr_payload(config: Dict, start_page: int = 5, end_page: int = 10, save_output: bool = False) -> Dict: + """Create payload for EasyOCR API.""" + return { + "pdf_folder": "/app/dataset", + "text_threshold": config.get("text_threshold", 0.7), + "low_text": config.get("low_text", 0.4), + "link_threshold": config.get("link_threshold", 0.4), + "slope_ths": config.get("slope_ths", 0.1), + "ycenter_ths": config.get("ycenter_ths", 0.5), + "height_ths": config.get("height_ths", 0.5), + "width_ths": config.get("width_ths", 0.5), + "add_margin": config.get("add_margin", 0.1), + "contrast_ths": config.get("contrast_ths", 0.1), + "adjust_contrast": config.get("adjust_contrast", 0.5), + "decoder": config.get("decoder", "greedy"), + "beamWidth": config.get("beamWidth", 5), + "min_size": config.get("min_size", 10), + "start_page": start_page, + "end_page": end_page, + "save_output": save_output, + } + + +# ============================================================================= +# Search spaces +# ============================================================================= + 
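+# Each entry maps one REST parameter to a Ray Tune distribution:
+# tune.choice([...]) samples from a discrete set, tune.uniform(a, b) samples a
+# float from [a, b); OptunaSearch uses completed trials to bias future samples.
+#
+# Illustrative sketch (not used by the notebooks): resolving one config by hand,
+# roughly as Ray Tune does per trial. Domain objects expose .sample(), e.g. for
+# the PaddleOCR space defined below:
+#
+#   config = {k: v.sample() for k, v in PADDLE_OCR_SEARCH_SPACE.items()}
+#   payload = paddle_ocr_payload(config)  # dict ready to POST to /evaluate
+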
+PADDLE_OCR_SEARCH_SPACE = { + "use_doc_orientation_classify": tune.choice([True, False]), + "use_doc_unwarping": tune.choice([True, False]), + "textline_orientation": tune.choice([True, False]), + "text_det_thresh": tune.uniform(0.0, 0.7), + "text_det_box_thresh": tune.uniform(0.0, 0.7), + "text_det_unclip_ratio": tune.choice([0.0]), + "text_rec_score_thresh": tune.uniform(0.0, 0.7), +} + +DOCTR_SEARCH_SPACE = { + "assume_straight_pages": tune.choice([True, False]), + "straighten_pages": tune.choice([True, False]), + "preserve_aspect_ratio": tune.choice([True, False]), + "symmetric_pad": tune.choice([True, False]), + "disable_page_orientation": tune.choice([True, False]), + "disable_crop_orientation": tune.choice([True, False]), + "resolve_lines": tune.choice([True, False]), + "resolve_blocks": tune.choice([True, False]), + "paragraph_break": tune.uniform(0.01, 0.1), +} + +EASYOCR_SEARCH_SPACE = { + "text_threshold": tune.uniform(0.3, 0.9), + "low_text": tune.uniform(0.2, 0.6), + "link_threshold": tune.uniform(0.2, 0.6), + "slope_ths": tune.uniform(0.0, 0.3), + "ycenter_ths": tune.uniform(0.3, 1.0), + "height_ths": tune.uniform(0.3, 1.0), + "width_ths": tune.uniform(0.3, 1.0), + "add_margin": tune.uniform(0.0, 0.3), + "contrast_ths": tune.uniform(0.05, 0.3), + "adjust_contrast": tune.uniform(0.3, 0.8), + "decoder": tune.choice(["greedy", "beamsearch"]), + "beamWidth": tune.choice([3, 5, 7, 10]), + "min_size": tune.choice([5, 10, 15, 20]), +} + + +# ============================================================================= +# Config keys for results display +# ============================================================================= + +PADDLE_OCR_CONFIG_KEYS = [ + "use_doc_orientation_classify", "use_doc_unwarping", "textline_orientation", + "text_det_thresh", "text_det_box_thresh", "text_det_unclip_ratio", "text_rec_score_thresh", +] + +DOCTR_CONFIG_KEYS = [ + "assume_straight_pages", "straighten_pages", "preserve_aspect_ratio", "symmetric_pad", + "disable_page_orientation", "disable_crop_orientation", "resolve_lines", "resolve_blocks", + "paragraph_break", +] + +EASYOCR_CONFIG_KEYS = [ + "text_threshold", "low_text", "link_threshold", "slope_ths", "ycenter_ths", + "height_ths", "width_ths", "add_margin", "contrast_ths", "adjust_contrast", + "decoder", "beamWidth", "min_size", +]
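+
+
+# =============================================================================
+# Example: end-to-end usage (illustrative sketch, mirroring the notebooks;
+# assumes PaddleOCR workers are already serving /evaluate on ports 8001-8002)
+# =============================================================================
+#
+# if __name__ == "__main__":
+#     ports = [8001, 8002]
+#     healthy = check_workers(ports, "PaddleOCR")
+#     trainable = create_trainable(ports, paddle_ocr_payload)
+#     results = run_tuner(trainable, PADDLE_OCR_SEARCH_SPACE,
+#                         num_samples=64, num_workers=len(healthy))
+#     df = analyze_results(results, prefix="raytune_paddle",
+#                          config_keys=PADDLE_OCR_CONFIG_KEYS)
+#     correlation_analysis(df, PADDLE_OCR_CONFIG_KEYS)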