From 15bfba79a70691a9b1cbca3d6853015954531a7c Mon Sep 17 00:00:00 2001 From: Sergio Jimenez Jimenez Date: Sun, 18 Jan 2026 17:38:42 +0100 Subject: [PATCH] lock model --- src/README.md | 25 ++ src/doctr_service/doctr_tuning_rest.py | 71 ++--- src/easyocr_service/easyocr_tuning_rest.py | 69 +++-- src/output_raytune.ipynb | 314 +++++++++++++-------- src/paddle_ocr/docker-compose.workers.yml | 2 +- src/paddle_ocr_raytune_rest.ipynb | 31 +- 6 files changed, 295 insertions(+), 217 deletions(-) diff --git a/src/README.md b/src/README.md index 1c0b8af..13a68b3 100644 --- a/src/README.md +++ b/src/README.md @@ -1,5 +1,30 @@ # Running Notebooks in Background +## Quick: Check Ray Tune Progress + +**Current run:** PaddleOCR hyperparameter optimization via Ray Tune + Optuna. +- 64 trials searching for optimal detection/recognition thresholds +- 2 CPU workers running in parallel (Docker containers on ports 8001-8002) +- Notebook: `paddle_ocr_raytune_rest.ipynb` → `output_raytune.ipynb` +- Results saved to: `~/ray_results/trainable_paddle_ocr_2026-01-18_17-25-43/` + +```bash +# Is it still running? +ps aux | grep papermill | grep -v grep + +# View live log +tail -f papermill.log + +# Count completed trials (64 total) +find ~/ray_results/trainable_paddle_ocr_2026-01-18_17-25-43/ -name "result.json" ! -empty | wc -l + +# Check workers are healthy +curl -s localhost:8001/health | jq -r '.status' +curl -s localhost:8002/health | jq -r '.status' +``` + +--- + ## Option 1: Papermill (Recommended) Runs notebooks directly without conversion. diff --git a/src/doctr_service/doctr_tuning_rest.py b/src/doctr_service/doctr_tuning_rest.py index 109b94e..4ef3928 100644 --- a/src/doctr_service/doctr_tuning_rest.py +++ b/src/doctr_service/doctr_tuning_rest.py @@ -5,6 +5,7 @@ import os import re import time +import threading from typing import Optional from contextlib import asynccontextmanager @@ -57,6 +58,10 @@ class AppState: # Track current model config for cache invalidation current_config: Optional[dict] = None device: str = "cuda" if torch.cuda.is_available() else "cpu" + lock: threading.Lock = None # Protects OCR model from concurrent access + + def __init__(self): + self.lock = threading.Lock() state = AppState() @@ -253,23 +258,6 @@ def evaluate(request: EvaluateRequest): if len(state.dataset) == 0: raise HTTPException(status_code=400, detail="Dataset is empty") - # Check if model needs to be reinitialized - new_config = { - "assume_straight_pages": request.assume_straight_pages, - "straighten_pages": request.straighten_pages, - "preserve_aspect_ratio": request.preserve_aspect_ratio, - "symmetric_pad": request.symmetric_pad, - "disable_page_orientation": request.disable_page_orientation, - "disable_crop_orientation": request.disable_crop_orientation, - } - - model_reinitialized = False - if state.current_config != new_config: - print(f"Model config changed, reinitializing...") - state.model = create_model(**new_config) - state.current_config = new_config - model_reinitialized = True - # Validate page range start = request.start_page end = min(request.end_page, len(state.dataset)) @@ -280,24 +268,43 @@ def evaluate(request: EvaluateRequest): time_per_page_list = [] t0 = time.time() - for idx in range(start, end): - img, ref = state.dataset[idx] - arr = np.array(img) + # Lock to prevent concurrent OCR access (model is not thread-safe) + with state.lock: + # Check if model needs to be reinitialized + new_config = { + "assume_straight_pages": request.assume_straight_pages, + "straighten_pages": request.straighten_pages, + "preserve_aspect_ratio": request.preserve_aspect_ratio, + "symmetric_pad": request.symmetric_pad, + "disable_page_orientation": request.disable_page_orientation, + "disable_crop_orientation": request.disable_crop_orientation, + } - tp0 = time.time() - # DocTR expects a list of images - result = state.model([arr]) + model_reinitialized = False + if state.current_config != new_config: + print(f"Model config changed, reinitializing...") + state.model = create_model(**new_config) + state.current_config = new_config + model_reinitialized = True - pred = doctr_result_to_text( - result, - resolve_lines=request.resolve_lines, - resolve_blocks=request.resolve_blocks, - ) - time_per_page_list.append(float(time.time() - tp0)) + for idx in range(start, end): + img, ref = state.dataset[idx] + arr = np.array(img) - m = evaluate_text(ref, pred) - cer_list.append(m["CER"]) - wer_list.append(m["WER"]) + tp0 = time.time() + # DocTR expects a list of images + result = state.model([arr]) + + pred = doctr_result_to_text( + result, + resolve_lines=request.resolve_lines, + resolve_blocks=request.resolve_blocks, + ) + time_per_page_list.append(float(time.time() - tp0)) + + m = evaluate_text(ref, pred) + cer_list.append(m["CER"]) + wer_list.append(m["WER"]) return EvaluateResponse( CER=float(np.mean(cer_list)) if cer_list else 1.0, diff --git a/src/easyocr_service/easyocr_tuning_rest.py b/src/easyocr_service/easyocr_tuning_rest.py index c550955..5fa6cd5 100644 --- a/src/easyocr_service/easyocr_tuning_rest.py +++ b/src/easyocr_service/easyocr_tuning_rest.py @@ -5,6 +5,7 @@ import os import re import time +import threading from typing import Optional, List from contextlib import asynccontextmanager @@ -52,6 +53,10 @@ class AppState: dataset: Optional[ImageTextDataset] = None dataset_path: Optional[str] = None languages: List[str] = DEFAULT_LANGUAGES + lock: threading.Lock = None # Protects OCR model from concurrent access + + def __init__(self): + self.lock = threading.Lock() state = AppState() @@ -263,40 +268,42 @@ def evaluate(request: EvaluateRequest): time_per_page_list = [] t0 = time.time() - for idx in range(start, end): - img, ref = state.dataset[idx] - arr = np.array(img) + # Lock to prevent concurrent OCR access (model is not thread-safe) + with state.lock: + for idx in range(start, end): + img, ref = state.dataset[idx] + arr = np.array(img) - tp0 = time.time() - result = state.reader.readtext( - arr, - # Detection thresholds - text_threshold=request.text_threshold, - low_text=request.low_text, - link_threshold=request.link_threshold, - # Bounding box merging - slope_ths=request.slope_ths, - ycenter_ths=request.ycenter_ths, - height_ths=request.height_ths, - width_ths=request.width_ths, - add_margin=request.add_margin, - # Contrast - contrast_ths=request.contrast_ths, - adjust_contrast=request.adjust_contrast, - # Decoder - decoder=request.decoder, - beamWidth=request.beamWidth, - # Other - min_size=request.min_size, - rotation_info=request.rotation_info, - ) + tp0 = time.time() + result = state.reader.readtext( + arr, + # Detection thresholds + text_threshold=request.text_threshold, + low_text=request.low_text, + link_threshold=request.link_threshold, + # Bounding box merging + slope_ths=request.slope_ths, + ycenter_ths=request.ycenter_ths, + height_ths=request.height_ths, + width_ths=request.width_ths, + add_margin=request.add_margin, + # Contrast + contrast_ths=request.contrast_ths, + adjust_contrast=request.adjust_contrast, + # Decoder + decoder=request.decoder, + beamWidth=request.beamWidth, + # Other + min_size=request.min_size, + rotation_info=request.rotation_info, + ) - pred = assemble_easyocr_result(result) - time_per_page_list.append(float(time.time() - tp0)) + pred = assemble_easyocr_result(result) + time_per_page_list.append(float(time.time() - tp0)) - m = evaluate_text(ref, pred) - cer_list.append(m["CER"]) - wer_list.append(m["WER"]) + m = evaluate_text(ref, pred) + cer_list.append(m["CER"]) + wer_list.append(m["WER"]) return EvaluateResponse( CER=float(np.mean(cer_list)) if cer_list else 1.0, diff --git a/src/output_raytune.ipynb b/src/output_raytune.ipynb index 85b308b..18293c4 100644 --- a/src/output_raytune.ipynb +++ b/src/output_raytune.ipynb @@ -5,10 +5,10 @@ "id": "header", "metadata": { "papermill": { - "duration": 0.00208, - "end_time": "2026-01-18T07:22:47.796550", + "duration": 0.002022, + "end_time": "2026-01-18T16:25:38.048417", "exception": false, - "start_time": "2026-01-18T07:22:47.794470", + "start_time": "2026-01-18T16:25:38.046395", "status": "completed" }, "tags": [] @@ -29,10 +29,10 @@ "id": "prereq", "metadata": { "papermill": { - "duration": 0.000961, - "end_time": "2026-01-18T07:22:47.807230", + "duration": 0.000855, + "end_time": "2026-01-18T16:25:38.058911", "exception": false, - "start_time": "2026-01-18T07:22:47.806269", + "start_time": "2026-01-18T16:25:38.058056", "status": "completed" }, "tags": [] @@ -60,10 +60,10 @@ "id": "3ob9fsoilc4", "metadata": { "papermill": { - "duration": 0.000901, - "end_time": "2026-01-18T07:22:47.809075", + "duration": 0.000846, + "end_time": "2026-01-18T16:25:38.060620", "exception": false, - "start_time": "2026-01-18T07:22:47.808174", + "start_time": "2026-01-18T16:25:38.059774", "status": "completed" }, "tags": [] @@ -78,16 +78,16 @@ "id": "wyr2nsoj7", "metadata": { "execution": { - "iopub.execute_input": "2026-01-18T07:22:47.812056Z", - "iopub.status.busy": "2026-01-18T07:22:47.811910Z", - "iopub.status.idle": "2026-01-18T07:22:49.130013Z", - "shell.execute_reply": "2026-01-18T07:22:49.129363Z" + "iopub.execute_input": "2026-01-18T16:25:38.063421Z", + "iopub.status.busy": "2026-01-18T16:25:38.063287Z", + "iopub.status.idle": "2026-01-18T16:25:39.300678Z", + "shell.execute_reply": "2026-01-18T16:25:39.299298Z" }, "papermill": { - "duration": 1.321151, - "end_time": "2026-01-18T07:22:49.131123", + "duration": 1.240519, + "end_time": "2026-01-18T16:25:39.301973", "exception": false, - "start_time": "2026-01-18T07:22:47.809972", + "start_time": "2026-01-18T16:25:38.061454", "status": "completed" }, "tags": [] @@ -120,13 +120,7 @@ "Requirement already satisfied: annotated-types>=0.6.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.7.0)\r\n", "Requirement already satisfied: pydantic-core==2.41.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (2.41.5)\r\n", "Requirement already satisfied: typing-extensions>=4.14.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (4.15.0)\r\n", - "Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n", "Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from tensorboardX>=1.9->ray[tune]) (2.4.1)\r\n", "Requirement already satisfied: attrs>=22.2.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (25.4.0)\r\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (2025.9.1)\r\n", @@ -180,7 +174,13 @@ "text": [ "Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.32.5)\r\n", "Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.3.3)\r\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.11)\r\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2.6.3)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2026.1.4)\r\n", @@ -211,10 +211,10 @@ "id": "imports-header", "metadata": { "papermill": { - "duration": 0.002313, - "end_time": "2026-01-18T07:22:49.136199", + "duration": 0.009444, + "end_time": "2026-01-18T16:25:39.312980", "exception": false, - "start_time": "2026-01-18T07:22:49.133886", + "start_time": "2026-01-18T16:25:39.303536", "status": "completed" }, "tags": [] @@ -229,16 +229,16 @@ "id": "imports", "metadata": { "execution": { - "iopub.execute_input": "2026-01-18T07:22:49.141850Z", - "iopub.status.busy": "2026-01-18T07:22:49.141713Z", - "iopub.status.idle": "2026-01-18T07:22:50.248414Z", - "shell.execute_reply": "2026-01-18T07:22:50.247699Z" + "iopub.execute_input": "2026-01-18T16:25:39.316439Z", + "iopub.status.busy": "2026-01-18T16:25:39.316230Z", + "iopub.status.idle": "2026-01-18T16:25:40.277894Z", + "shell.execute_reply": "2026-01-18T16:25:40.277012Z" }, "papermill": { - "duration": 1.111175, - "end_time": "2026-01-18T07:22:50.249605", + "duration": 0.964409, + "end_time": "2026-01-18T16:25:40.278450", "exception": false, - "start_time": "2026-01-18T07:22:49.138430", + "start_time": "2026-01-18T16:25:39.314041", "status": "completed" }, "tags": [] @@ -252,7 +252,7 @@ "import pandas as pd\n", "\n", "import ray\n", - "from ray import tune, air\n", + "from ray import tune, train\n", "from ray.tune.search.optuna import OptunaSearch" ] }, @@ -261,10 +261,10 @@ "id": "config-header", "metadata": { "papermill": { - "duration": 0.00953, - "end_time": "2026-01-18T07:22:50.261880", + "duration": 0.009552, + "end_time": "2026-01-18T16:25:40.289551", "exception": false, - "start_time": "2026-01-18T07:22:50.252350", + "start_time": "2026-01-18T16:25:40.279999", "status": "completed" }, "tags": [] @@ -279,16 +279,16 @@ "id": "config", "metadata": { "execution": { - "iopub.execute_input": "2026-01-18T07:22:50.267482Z", - "iopub.status.busy": "2026-01-18T07:22:50.267340Z", - "iopub.status.idle": "2026-01-18T07:22:50.269689Z", - "shell.execute_reply": "2026-01-18T07:22:50.269264Z" + "iopub.execute_input": "2026-01-18T16:25:40.292573Z", + "iopub.status.busy": "2026-01-18T16:25:40.292489Z", + "iopub.status.idle": "2026-01-18T16:25:40.294713Z", + "shell.execute_reply": "2026-01-18T16:25:40.294164Z" }, "papermill": { - "duration": 0.006027, - "end_time": "2026-01-18T07:22:50.270230", + "duration": 0.004591, + "end_time": "2026-01-18T16:25:40.295202", "exception": false, - "start_time": "2026-01-18T07:22:50.264203", + "start_time": "2026-01-18T16:25:40.290611", "status": "completed" }, "tags": [] @@ -314,16 +314,16 @@ "id": "health-check", "metadata": { "execution": { - "iopub.execute_input": "2026-01-18T07:22:50.275708Z", - "iopub.status.busy": "2026-01-18T07:22:50.275626Z", - "iopub.status.idle": "2026-01-18T07:22:50.283441Z", - "shell.execute_reply": "2026-01-18T07:22:50.282984Z" + "iopub.execute_input": "2026-01-18T16:25:40.298281Z", + "iopub.status.busy": "2026-01-18T16:25:40.298161Z", + "iopub.status.idle": "2026-01-18T16:25:40.306720Z", + "shell.execute_reply": "2026-01-18T16:25:40.306262Z" }, "papermill": { - "duration": 0.011534, - "end_time": "2026-01-18T07:22:50.284080", + "duration": 0.010723, + "end_time": "2026-01-18T16:25:40.307025", "exception": false, - "start_time": "2026-01-18T07:22:50.272546", + "start_time": "2026-01-18T16:25:40.296302", "status": "completed" }, "tags": [] @@ -368,10 +368,10 @@ "id": "search-space-header", "metadata": { "papermill": { - "duration": 0.002325, - "end_time": "2026-01-18T07:22:50.288969", + "duration": 0.001073, + "end_time": "2026-01-18T16:25:40.309261", "exception": false, - "start_time": "2026-01-18T07:22:50.286644", + "start_time": "2026-01-18T16:25:40.308188", "status": "completed" }, "tags": [] @@ -386,16 +386,16 @@ "id": "search-space", "metadata": { "execution": { - "iopub.execute_input": "2026-01-18T07:22:50.294569Z", - "iopub.status.busy": "2026-01-18T07:22:50.294500Z", - "iopub.status.idle": "2026-01-18T07:22:50.296998Z", - "shell.execute_reply": "2026-01-18T07:22:50.296295Z" + "iopub.execute_input": "2026-01-18T16:25:40.312177Z", + "iopub.status.busy": "2026-01-18T16:25:40.312107Z", + "iopub.status.idle": "2026-01-18T16:25:40.314237Z", + "shell.execute_reply": "2026-01-18T16:25:40.313794Z" }, "papermill": { - "duration": 0.006486, - "end_time": "2026-01-18T07:22:50.297804", + "duration": 0.004476, + "end_time": "2026-01-18T16:25:40.314804", "exception": false, - "start_time": "2026-01-18T07:22:50.291318", + "start_time": "2026-01-18T16:25:40.310328", "status": "completed" }, "tags": [] @@ -425,10 +425,10 @@ "id": "trainable-header", "metadata": { "papermill": { - "duration": 0.002321, - "end_time": "2026-01-18T07:22:50.302532", + "duration": 0.001057, + "end_time": "2026-01-18T16:25:40.316975", "exception": false, - "start_time": "2026-01-18T07:22:50.300211", + "start_time": "2026-01-18T16:25:40.315918", "status": "completed" }, "tags": [] @@ -443,16 +443,16 @@ "id": "trainable", "metadata": { "execution": { - "iopub.execute_input": "2026-01-18T07:22:50.308222Z", - "iopub.status.busy": "2026-01-18T07:22:50.308103Z", - "iopub.status.idle": "2026-01-18T07:22:50.311240Z", - "shell.execute_reply": "2026-01-18T07:22:50.310694Z" + "iopub.execute_input": "2026-01-18T16:25:40.319825Z", + "iopub.status.busy": "2026-01-18T16:25:40.319771Z", + "iopub.status.idle": "2026-01-18T16:25:40.322602Z", + "shell.execute_reply": "2026-01-18T16:25:40.322112Z" }, "papermill": { - "duration": 0.007301, - "end_time": "2026-01-18T07:22:50.312116", + "duration": 0.004907, + "end_time": "2026-01-18T16:25:40.322948", "exception": false, - "start_time": "2026-01-18T07:22:50.304815", + "start_time": "2026-01-18T16:25:40.318041", "status": "completed" }, "tags": [] @@ -463,7 +463,7 @@ " \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n", " import random\n", " import requests\n", - " from ray import tune\n", + " from ray import train\n", "\n", " # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n", " WORKER_PORTS = [8001, 8002]\n", @@ -487,17 +487,17 @@ " response.raise_for_status()\n", " metrics = response.json()\n", " metrics[\"worker\"] = api_url\n", - " tune.report(**metrics)\n", + " train.report(metrics)\n", " except Exception as e:\n", - " tune.report(\n", - " CER=1.0,\n", - " WER=1.0,\n", - " TIME=0.0,\n", - " PAGES=0,\n", - " TIME_PER_PAGE=0,\n", - " worker=api_url,\n", - " ERROR=str(e)[:500]\n", - " )" + " train.report({\n", + " \"CER\": 1.0,\n", + " \"WER\": 1.0,\n", + " \"TIME\": 0.0,\n", + " \"PAGES\": 0,\n", + " \"TIME_PER_PAGE\": 0,\n", + " \"worker\": api_url,\n", + " \"ERROR\": str(e)[:500]\n", + " })" ] }, { @@ -505,10 +505,10 @@ "id": "tuner-header", "metadata": { "papermill": { - "duration": 0.002522, - "end_time": "2026-01-18T07:22:50.317277", + "duration": 0.001058, + "end_time": "2026-01-18T16:25:40.325120", "exception": false, - "start_time": "2026-01-18T07:22:50.314755", + "start_time": "2026-01-18T16:25:40.324062", "status": "completed" }, "tags": [] @@ -523,16 +523,16 @@ "id": "ray-init", "metadata": { "execution": { - "iopub.execute_input": "2026-01-18T07:22:50.323163Z", - "iopub.status.busy": "2026-01-18T07:22:50.323037Z", - "iopub.status.idle": "2026-01-18T07:22:54.197904Z", - "shell.execute_reply": "2026-01-18T07:22:54.196986Z" + "iopub.execute_input": "2026-01-18T16:25:40.328162Z", + "iopub.status.busy": "2026-01-18T16:25:40.328055Z", + "iopub.status.idle": "2026-01-18T16:25:42.985307Z", + "shell.execute_reply": "2026-01-18T16:25:42.984863Z" }, "papermill": { - "duration": 3.878908, - "end_time": "2026-01-18T07:22:54.198593", + "duration": 2.65986, + "end_time": "2026-01-18T16:25:42.986041", "exception": false, - "start_time": "2026-01-18T07:22:50.319685", + "start_time": "2026-01-18T16:25:40.326181", "status": "completed" }, "tags": [] @@ -542,7 +542,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2026-01-18 08:22:51,904\tINFO worker.py:2007 -- Started a local Ray instance.\n" + "2026-01-18 17:25:41,631\tINFO worker.py:2007 -- Started a local Ray instance.\n" ] }, { @@ -572,35 +572,19 @@ "id": "tuner", "metadata": { "execution": { - "iopub.execute_input": "2026-01-18T07:22:54.213071Z", - "iopub.status.busy": "2026-01-18T07:22:54.212310Z" + "iopub.execute_input": "2026-01-18T16:25:42.998698Z", + "iopub.status.busy": "2026-01-18T16:25:42.998141Z" }, "papermill": { "duration": null, "end_time": null, "exception": false, - "start_time": "2026-01-18T07:22:54.201610", + "start_time": "2026-01-18T16:25:42.987700", "status": "running" }, "tags": [] }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/impl/tuner_internal.py:144: RayDeprecationWarning: The `RunConfig` class should be imported from `ray.tune` when passing it to the Tuner. Please update your imports. See this issue for more context and migration options: https://github.com/ray-project/ray/issues/49454. Disable these warnings by setting the environment variable: RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS=0\n", - " _log_deprecation_warning(\n", - "2026-01-18 08:22:54,222\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[I 2026-01-18 08:22:54,226] A new study created in memory with name: optuna\n" - ] - }, { "data": { "text/html": [ @@ -610,9 +594,9 @@ "

Tune Status

\n", " \n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "
Current time:2026-01-18 08:23:19
Running for: 00:00:25.26
Memory: 57.8/119.7 GiB
Current time:2026-01-18 17:37:46
Running for: 00:12:03.55
Memory: 16.5/119.7 GiB
\n", " \n", @@ -621,7 +605,39 @@ "

System Info

\n", " Using FIFO scheduling algorithm.
Logical resource usage: 2.0/20 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:GB10)\n", " \n", - " \n", + "
\n", + "
\n", + "

Messages

\n", + " \n", + " \n", + " Number of errored trials: 1
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name # failureserror file
trainable_paddle_ocr_36ae4d11 1/tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_36ae4d11_1_text_det_box_thresh=0.5847,text_det_thresh=0.2571,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-43/error.txt
\n", + "
\n", + "\n", + "\n", " \n", "
\n", "
\n", @@ -634,8 +650,9 @@ "classify use_doc_unwarping \n", "\n", "\n", - "trainable_paddle_ocr_59252191RUNNING 192.168.65.140:1195312 0.414043 0.33747500.478234True True True \n", - "trainable_paddle_ocr_47499299RUNNING 192.168.65.140:1195374 0.544738 0.26973500.30771 True FalseTrue \n", + "trainable_paddle_ocr_2312d29cRUNNING 192.168.65.140:1282844 0.0311783 0.022272400.141805False True False \n", + "trainable_paddle_ocr_5b7b8e02RUNNING 192.168.65.140:1285648 0.595412 0.070652200.132174True FalseTrue \n", + "trainable_paddle_ocr_36ae4d11ERROR 192.168.65.140:1282742 0.58473 0.257102 00.634955False True False \n", "\n", "\n", "
\n", @@ -682,28 +699,76 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 08:23:20,495 E 1193965 1193965] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" + "\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 17:26:10,501 E 1281442 1281442] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[33m(raylet)\u001b[0m [2026-01-18 08:23:21,833 E 1194136 1194136] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" + "\u001b[33m(raylet)\u001b[0m [2026-01-18 17:26:11,550 E 1281587 1281587] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36m(bundle_reservation_check_func pid=1194212)\u001b[0m [2026-01-18 08:23:23,446 E 1194212 1194301] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" + "\u001b[36m(bundle_reservation_check_func pid=1281657)\u001b[0m [2026-01-18 17:26:12,349 E 1281657 1281801] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "[2026-01-18 08:23:24,197 E 1193837 1194205] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" + "[2026-01-18 17:26:12,987 E 1281294 1281656] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-01-18 17:31:48,050\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_36ae4d11\n", + "Traceback (most recent call last):\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n", + " result = ray.get(future)\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n", + " return func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n", + " values, debugger_breakpoint = worker.get_objects(\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n", + " raise value.as_instanceof_cause()\n", + "ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282742, ip=192.168.65.140, actor_id=d19d5170bbb9faf9c9fa055f01000000, repr=trainable_paddle_ocr)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n", + " raise skipped from exception_cause(skipped)\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n", + " self._ret = self._target(*self._args, **self._kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in \n", + " training_func=lambda: self._trainable_func(self.config),\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n", + " output = fn()\n", + " ^^^^\n", + " File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n", + " File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n", + " raise DeprecationWarning(\n", + "DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(trainable_paddle_ocr pid=1285648)\u001b[0m [2026-01-18 17:32:19,397 E 1285648 1285683] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\u001b[32m [repeated 20x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n" ] } ], @@ -717,7 +782,6 @@ " num_samples=64,\n", " max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n", " ),\n", - " run_config=air.RunConfig(verbose=2, log_to_file=False),\n", " param_space=search_space,\n", ")\n", "\n", @@ -878,7 +942,7 @@ "input_path": "paddle_ocr_raytune_rest.ipynb", "output_path": "output_raytune.ipynb", "parameters": {}, - "start_time": "2026-01-18T07:22:47.169883", + "start_time": "2026-01-18T16:25:37.429790", "version": "2.6.0" } }, diff --git a/src/paddle_ocr/docker-compose.workers.yml b/src/paddle_ocr/docker-compose.workers.yml index cc8edde..222ea82 100644 --- a/src/paddle_ocr/docker-compose.workers.yml +++ b/src/paddle_ocr/docker-compose.workers.yml @@ -36,7 +36,7 @@ x-ocr-gpu-common: &ocr-gpu-common start_period: 120s x-ocr-cpu-common: &ocr-cpu-common - image: paddle-ocr-api:cpu + image: seryus.ddns.net/unir/paddle-ocr-cpu:latest volumes: - ../dataset:/app/dataset:ro - paddlex-cache:/root/.paddlex diff --git a/src/paddle_ocr_raytune_rest.ipynb b/src/paddle_ocr_raytune_rest.ipynb index 5febbb6..44710b9 100644 --- a/src/paddle_ocr_raytune_rest.ipynb +++ b/src/paddle_ocr_raytune_rest.ipynb @@ -72,17 +72,7 @@ "id": "imports", "metadata": {}, "outputs": [], - "source": [ - "import os\n", - "from datetime import datetime\n", - "\n", - "import requests\n", - "import pandas as pd\n", - "\n", - "import ray\n", - "from ray import tune, air\n", - "from ray.tune.search.optuna import OptunaSearch" - ] + "source": "import os\nfrom datetime import datetime\n\nimport requests\nimport pandas as pd\n\nimport ray\nfrom ray import tune, train\nfrom ray.tune.search.optuna import OptunaSearch" }, { "cell_type": "markdown", @@ -188,7 +178,7 @@ "id": "trainable", "metadata": {}, "outputs": [], - "source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import tune\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n tune.report(**metrics)\n except Exception as e:\n tune.report(\n CER=1.0,\n WER=1.0,\n TIME=0.0,\n PAGES=0,\n TIME_PER_PAGE=0,\n worker=api_url,\n ERROR=str(e)[:500]\n )" + "source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import train\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n train.report(metrics)\n except Exception as e:\n train.report({\n \"CER\": 1.0,\n \"WER\": 1.0,\n \"TIME\": 0.0,\n \"PAGES\": 0,\n \"TIME_PER_PAGE\": 0,\n \"worker\": api_url,\n \"ERROR\": str(e)[:500]\n })" }, { "cell_type": "markdown", @@ -215,22 +205,7 @@ "id": "tuner", "metadata": {}, "outputs": [], - "source": [ - "tuner = tune.Tuner(\n", - " trainable_paddle_ocr,\n", - " tune_config=tune.TuneConfig(\n", - " metric=\"CER\",\n", - " mode=\"min\",\n", - " search_alg=OptunaSearch(),\n", - " num_samples=64,\n", - " max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n", - " ),\n", - " run_config=air.RunConfig(verbose=2, log_to_file=False),\n", - " param_space=search_space,\n", - ")\n", - "\n", - "results = tuner.fit()" - ] + "source": "tuner = tune.Tuner(\n trainable_paddle_ocr,\n tune_config=tune.TuneConfig(\n metric=\"CER\",\n mode=\"min\",\n search_alg=OptunaSearch(),\n num_samples=64,\n max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n ),\n param_space=search_space,\n)\n\nresults = tuner.fit()" }, { "cell_type": "markdown",