lock model

This commit is contained in:
2026-01-18 17:38:42 +01:00
parent b29df98602
commit 15bfba79a7
6 changed files with 295 additions and 217 deletions

View File

@@ -1,5 +1,30 @@
# Running Notebooks in Background
## Quick: Check Ray Tune Progress
**Current run:** PaddleOCR hyperparameter optimization via Ray Tune + Optuna.
- 64 trials searching for optimal detection/recognition thresholds
- 2 CPU workers running in parallel (Docker containers on ports 8001-8002)
- Notebook: `paddle_ocr_raytune_rest.ipynb` → `output_raytune.ipynb`
- Results saved to: `~/ray_results/trainable_paddle_ocr_2026-01-18_17-25-43/`
```bash
# Is it still running?
ps aux | grep papermill | grep -v grep
# View live log
tail -f papermill.log
# Count completed trials (64 total)
find ~/ray_results/trainable_paddle_ocr_2026-01-18_17-25-43/ -name "result.json" ! -empty | wc -l
# Check workers are healthy
curl -s localhost:8001/health | jq -r '.status'
curl -s localhost:8002/health | jq -r '.status'
```
---
## Option 1: Papermill (Recommended)
Runs notebooks directly without conversion.

View File

@@ -5,6 +5,7 @@
import os
import re
import time
import threading
from typing import Optional
from contextlib import asynccontextmanager
@@ -57,6 +58,10 @@ class AppState:
# Track current model config for cache invalidation
current_config: Optional[dict] = None
device: str = "cuda" if torch.cuda.is_available() else "cpu"
lock: threading.Lock = None # Protects OCR model from concurrent access
def __init__(self):
self.lock = threading.Lock()
state = AppState()
@@ -253,6 +258,18 @@ def evaluate(request: EvaluateRequest):
if len(state.dataset) == 0:
raise HTTPException(status_code=400, detail="Dataset is empty")
# Validate page range
start = request.start_page
end = min(request.end_page, len(state.dataset))
if start >= end:
raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
# Lock to prevent concurrent OCR access (model is not thread-safe)
with state.lock:
# Check if model needs to be reinitialized
new_config = {
"assume_straight_pages": request.assume_straight_pages,
@@ -270,16 +287,6 @@ def evaluate(request: EvaluateRequest):
state.current_config = new_config
model_reinitialized = True
# Validate page range
start = request.start_page
end = min(request.end_page, len(state.dataset))
if start >= end:
raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
for idx in range(start, end):
img, ref = state.dataset[idx]
arr = np.array(img)

View File

@@ -5,6 +5,7 @@
import os
import re
import time
import threading
from typing import Optional, List
from contextlib import asynccontextmanager
@@ -52,6 +53,10 @@ class AppState:
dataset: Optional[ImageTextDataset] = None
dataset_path: Optional[str] = None
languages: List[str] = DEFAULT_LANGUAGES
lock: threading.Lock = None # Protects OCR model from concurrent access
def __init__(self):
self.lock = threading.Lock()
state = AppState()
@@ -263,6 +268,8 @@ def evaluate(request: EvaluateRequest):
time_per_page_list = []
t0 = time.time()
# Lock to prevent concurrent OCR access (model is not thread-safe)
with state.lock:
for idx in range(start, end):
img, ref = state.dataset[idx]
arr = np.array(img)

View File

@@ -5,10 +5,10 @@
"id": "header",
"metadata": {
"papermill": {
"duration": 0.00208,
"end_time": "2026-01-18T07:22:47.796550",
"duration": 0.002022,
"end_time": "2026-01-18T16:25:38.048417",
"exception": false,
"start_time": "2026-01-18T07:22:47.794470",
"start_time": "2026-01-18T16:25:38.046395",
"status": "completed"
},
"tags": []
@@ -29,10 +29,10 @@
"id": "prereq",
"metadata": {
"papermill": {
"duration": 0.000961,
"end_time": "2026-01-18T07:22:47.807230",
"duration": 0.000855,
"end_time": "2026-01-18T16:25:38.058911",
"exception": false,
"start_time": "2026-01-18T07:22:47.806269",
"start_time": "2026-01-18T16:25:38.058056",
"status": "completed"
},
"tags": []
@@ -60,10 +60,10 @@
"id": "3ob9fsoilc4",
"metadata": {
"papermill": {
"duration": 0.000901,
"end_time": "2026-01-18T07:22:47.809075",
"duration": 0.000846,
"end_time": "2026-01-18T16:25:38.060620",
"exception": false,
"start_time": "2026-01-18T07:22:47.808174",
"start_time": "2026-01-18T16:25:38.059774",
"status": "completed"
},
"tags": []
@@ -78,16 +78,16 @@
"id": "wyr2nsoj7",
"metadata": {
"execution": {
"iopub.execute_input": "2026-01-18T07:22:47.812056Z",
"iopub.status.busy": "2026-01-18T07:22:47.811910Z",
"iopub.status.idle": "2026-01-18T07:22:49.130013Z",
"shell.execute_reply": "2026-01-18T07:22:49.129363Z"
"iopub.execute_input": "2026-01-18T16:25:38.063421Z",
"iopub.status.busy": "2026-01-18T16:25:38.063287Z",
"iopub.status.idle": "2026-01-18T16:25:39.300678Z",
"shell.execute_reply": "2026-01-18T16:25:39.299298Z"
},
"papermill": {
"duration": 1.321151,
"end_time": "2026-01-18T07:22:49.131123",
"duration": 1.240519,
"end_time": "2026-01-18T16:25:39.301973",
"exception": false,
"start_time": "2026-01-18T07:22:47.809972",
"start_time": "2026-01-18T16:25:38.061454",
"status": "completed"
},
"tags": []
@@ -120,13 +120,7 @@
"Requirement already satisfied: annotated-types>=0.6.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.7.0)\r\n",
"Requirement already satisfied: pydantic-core==2.41.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (2.41.5)\r\n",
"Requirement already satisfied: typing-extensions>=4.14.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (4.15.0)\r\n",
"Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n",
"Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from tensorboardX>=1.9->ray[tune]) (2.4.1)\r\n",
"Requirement already satisfied: attrs>=22.2.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (25.4.0)\r\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (2025.9.1)\r\n",
@@ -180,7 +174,13 @@
"text": [
"Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.32.5)\r\n",
"Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.3.3)\r\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.11)\r\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2.6.3)\r\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2026.1.4)\r\n",
@@ -211,10 +211,10 @@
"id": "imports-header",
"metadata": {
"papermill": {
"duration": 0.002313,
"end_time": "2026-01-18T07:22:49.136199",
"duration": 0.009444,
"end_time": "2026-01-18T16:25:39.312980",
"exception": false,
"start_time": "2026-01-18T07:22:49.133886",
"start_time": "2026-01-18T16:25:39.303536",
"status": "completed"
},
"tags": []
@@ -229,16 +229,16 @@
"id": "imports",
"metadata": {
"execution": {
"iopub.execute_input": "2026-01-18T07:22:49.141850Z",
"iopub.status.busy": "2026-01-18T07:22:49.141713Z",
"iopub.status.idle": "2026-01-18T07:22:50.248414Z",
"shell.execute_reply": "2026-01-18T07:22:50.247699Z"
"iopub.execute_input": "2026-01-18T16:25:39.316439Z",
"iopub.status.busy": "2026-01-18T16:25:39.316230Z",
"iopub.status.idle": "2026-01-18T16:25:40.277894Z",
"shell.execute_reply": "2026-01-18T16:25:40.277012Z"
},
"papermill": {
"duration": 1.111175,
"end_time": "2026-01-18T07:22:50.249605",
"duration": 0.964409,
"end_time": "2026-01-18T16:25:40.278450",
"exception": false,
"start_time": "2026-01-18T07:22:49.138430",
"start_time": "2026-01-18T16:25:39.314041",
"status": "completed"
},
"tags": []
@@ -252,7 +252,7 @@
"import pandas as pd\n",
"\n",
"import ray\n",
"from ray import tune, air\n",
"from ray import tune, train\n",
"from ray.tune.search.optuna import OptunaSearch"
]
},
@@ -261,10 +261,10 @@
"id": "config-header",
"metadata": {
"papermill": {
"duration": 0.00953,
"end_time": "2026-01-18T07:22:50.261880",
"duration": 0.009552,
"end_time": "2026-01-18T16:25:40.289551",
"exception": false,
"start_time": "2026-01-18T07:22:50.252350",
"start_time": "2026-01-18T16:25:40.279999",
"status": "completed"
},
"tags": []
@@ -279,16 +279,16 @@
"id": "config",
"metadata": {
"execution": {
"iopub.execute_input": "2026-01-18T07:22:50.267482Z",
"iopub.status.busy": "2026-01-18T07:22:50.267340Z",
"iopub.status.idle": "2026-01-18T07:22:50.269689Z",
"shell.execute_reply": "2026-01-18T07:22:50.269264Z"
"iopub.execute_input": "2026-01-18T16:25:40.292573Z",
"iopub.status.busy": "2026-01-18T16:25:40.292489Z",
"iopub.status.idle": "2026-01-18T16:25:40.294713Z",
"shell.execute_reply": "2026-01-18T16:25:40.294164Z"
},
"papermill": {
"duration": 0.006027,
"end_time": "2026-01-18T07:22:50.270230",
"duration": 0.004591,
"end_time": "2026-01-18T16:25:40.295202",
"exception": false,
"start_time": "2026-01-18T07:22:50.264203",
"start_time": "2026-01-18T16:25:40.290611",
"status": "completed"
},
"tags": []
@@ -314,16 +314,16 @@
"id": "health-check",
"metadata": {
"execution": {
"iopub.execute_input": "2026-01-18T07:22:50.275708Z",
"iopub.status.busy": "2026-01-18T07:22:50.275626Z",
"iopub.status.idle": "2026-01-18T07:22:50.283441Z",
"shell.execute_reply": "2026-01-18T07:22:50.282984Z"
"iopub.execute_input": "2026-01-18T16:25:40.298281Z",
"iopub.status.busy": "2026-01-18T16:25:40.298161Z",
"iopub.status.idle": "2026-01-18T16:25:40.306720Z",
"shell.execute_reply": "2026-01-18T16:25:40.306262Z"
},
"papermill": {
"duration": 0.011534,
"end_time": "2026-01-18T07:22:50.284080",
"duration": 0.010723,
"end_time": "2026-01-18T16:25:40.307025",
"exception": false,
"start_time": "2026-01-18T07:22:50.272546",
"start_time": "2026-01-18T16:25:40.296302",
"status": "completed"
},
"tags": []
@@ -368,10 +368,10 @@
"id": "search-space-header",
"metadata": {
"papermill": {
"duration": 0.002325,
"end_time": "2026-01-18T07:22:50.288969",
"duration": 0.001073,
"end_time": "2026-01-18T16:25:40.309261",
"exception": false,
"start_time": "2026-01-18T07:22:50.286644",
"start_time": "2026-01-18T16:25:40.308188",
"status": "completed"
},
"tags": []
@@ -386,16 +386,16 @@
"id": "search-space",
"metadata": {
"execution": {
"iopub.execute_input": "2026-01-18T07:22:50.294569Z",
"iopub.status.busy": "2026-01-18T07:22:50.294500Z",
"iopub.status.idle": "2026-01-18T07:22:50.296998Z",
"shell.execute_reply": "2026-01-18T07:22:50.296295Z"
"iopub.execute_input": "2026-01-18T16:25:40.312177Z",
"iopub.status.busy": "2026-01-18T16:25:40.312107Z",
"iopub.status.idle": "2026-01-18T16:25:40.314237Z",
"shell.execute_reply": "2026-01-18T16:25:40.313794Z"
},
"papermill": {
"duration": 0.006486,
"end_time": "2026-01-18T07:22:50.297804",
"duration": 0.004476,
"end_time": "2026-01-18T16:25:40.314804",
"exception": false,
"start_time": "2026-01-18T07:22:50.291318",
"start_time": "2026-01-18T16:25:40.310328",
"status": "completed"
},
"tags": []
@@ -425,10 +425,10 @@
"id": "trainable-header",
"metadata": {
"papermill": {
"duration": 0.002321,
"end_time": "2026-01-18T07:22:50.302532",
"duration": 0.001057,
"end_time": "2026-01-18T16:25:40.316975",
"exception": false,
"start_time": "2026-01-18T07:22:50.300211",
"start_time": "2026-01-18T16:25:40.315918",
"status": "completed"
},
"tags": []
@@ -443,16 +443,16 @@
"id": "trainable",
"metadata": {
"execution": {
"iopub.execute_input": "2026-01-18T07:22:50.308222Z",
"iopub.status.busy": "2026-01-18T07:22:50.308103Z",
"iopub.status.idle": "2026-01-18T07:22:50.311240Z",
"shell.execute_reply": "2026-01-18T07:22:50.310694Z"
"iopub.execute_input": "2026-01-18T16:25:40.319825Z",
"iopub.status.busy": "2026-01-18T16:25:40.319771Z",
"iopub.status.idle": "2026-01-18T16:25:40.322602Z",
"shell.execute_reply": "2026-01-18T16:25:40.322112Z"
},
"papermill": {
"duration": 0.007301,
"end_time": "2026-01-18T07:22:50.312116",
"duration": 0.004907,
"end_time": "2026-01-18T16:25:40.322948",
"exception": false,
"start_time": "2026-01-18T07:22:50.304815",
"start_time": "2026-01-18T16:25:40.318041",
"status": "completed"
},
"tags": []
@@ -463,7 +463,7 @@
" \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n",
" import random\n",
" import requests\n",
" from ray import tune\n",
" from ray import train\n",
"\n",
" # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n",
" WORKER_PORTS = [8001, 8002]\n",
@@ -487,17 +487,17 @@
" response.raise_for_status()\n",
" metrics = response.json()\n",
" metrics[\"worker\"] = api_url\n",
" tune.report(**metrics)\n",
" train.report(metrics)\n",
" except Exception as e:\n",
" tune.report(\n",
" CER=1.0,\n",
" WER=1.0,\n",
" TIME=0.0,\n",
" PAGES=0,\n",
" TIME_PER_PAGE=0,\n",
" worker=api_url,\n",
" ERROR=str(e)[:500]\n",
" )"
" train.report({\n",
" \"CER\": 1.0,\n",
" \"WER\": 1.0,\n",
" \"TIME\": 0.0,\n",
" \"PAGES\": 0,\n",
" \"TIME_PER_PAGE\": 0,\n",
" \"worker\": api_url,\n",
" \"ERROR\": str(e)[:500]\n",
" })"
]
},
{
@@ -505,10 +505,10 @@
"id": "tuner-header",
"metadata": {
"papermill": {
"duration": 0.002522,
"end_time": "2026-01-18T07:22:50.317277",
"duration": 0.001058,
"end_time": "2026-01-18T16:25:40.325120",
"exception": false,
"start_time": "2026-01-18T07:22:50.314755",
"start_time": "2026-01-18T16:25:40.324062",
"status": "completed"
},
"tags": []
@@ -523,16 +523,16 @@
"id": "ray-init",
"metadata": {
"execution": {
"iopub.execute_input": "2026-01-18T07:22:50.323163Z",
"iopub.status.busy": "2026-01-18T07:22:50.323037Z",
"iopub.status.idle": "2026-01-18T07:22:54.197904Z",
"shell.execute_reply": "2026-01-18T07:22:54.196986Z"
"iopub.execute_input": "2026-01-18T16:25:40.328162Z",
"iopub.status.busy": "2026-01-18T16:25:40.328055Z",
"iopub.status.idle": "2026-01-18T16:25:42.985307Z",
"shell.execute_reply": "2026-01-18T16:25:42.984863Z"
},
"papermill": {
"duration": 3.878908,
"end_time": "2026-01-18T07:22:54.198593",
"duration": 2.65986,
"end_time": "2026-01-18T16:25:42.986041",
"exception": false,
"start_time": "2026-01-18T07:22:50.319685",
"start_time": "2026-01-18T16:25:40.326181",
"status": "completed"
},
"tags": []
@@ -542,7 +542,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2026-01-18 08:22:51,904\tINFO worker.py:2007 -- Started a local Ray instance.\n"
"2026-01-18 17:25:41,631\tINFO worker.py:2007 -- Started a local Ray instance.\n"
]
},
{
@@ -572,35 +572,19 @@
"id": "tuner",
"metadata": {
"execution": {
"iopub.execute_input": "2026-01-18T07:22:54.213071Z",
"iopub.status.busy": "2026-01-18T07:22:54.212310Z"
"iopub.execute_input": "2026-01-18T16:25:42.998698Z",
"iopub.status.busy": "2026-01-18T16:25:42.998141Z"
},
"papermill": {
"duration": null,
"end_time": null,
"exception": false,
"start_time": "2026-01-18T07:22:54.201610",
"start_time": "2026-01-18T16:25:42.987700",
"status": "running"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/impl/tuner_internal.py:144: RayDeprecationWarning: The `RunConfig` class should be imported from `ray.tune` when passing it to the Tuner. Please update your imports. See this issue for more context and migration options: https://github.com/ray-project/ray/issues/49454. Disable these warnings by setting the environment variable: RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS=0\n",
" _log_deprecation_warning(\n",
"2026-01-18 08:22:54,222\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[I 2026-01-18 08:22:54,226] A new study created in memory with name: optuna\n"
]
},
{
"data": {
"text/html": [
@@ -610,9 +594,9 @@
" <h3>Tune Status</h3>\n",
" <table>\n",
"<tbody>\n",
"<tr><td>Current time:</td><td>2026-01-18 08:23:19</td></tr>\n",
"<tr><td>Running for: </td><td>00:00:25.26 </td></tr>\n",
"<tr><td>Memory: </td><td>57.8/119.7 GiB </td></tr>\n",
"<tr><td>Current time:</td><td>2026-01-18 17:37:46</td></tr>\n",
"<tr><td>Running for: </td><td>00:12:03.55 </td></tr>\n",
"<tr><td>Memory: </td><td>16.5/119.7 GiB </td></tr>\n",
"</tbody>\n",
"</table>\n",
" </div>\n",
@@ -621,6 +605,38 @@
" <h3>System Info</h3>\n",
" Using FIFO scheduling algorithm.<br>Logical resource usage: 2.0/20 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:GB10)\n",
" </div>\n",
" <div class=\"vDivider\"></div>\n",
"<div class=\"messages\">\n",
" <h3>Messages</h3>\n",
" \n",
" \n",
" Number of errored trials: 1<br><table>\n",
"<thead>\n",
"<tr><th>Trial name </th><th style=\"text-align: right;\"> # failures</th><th>error file </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td>trainable_paddle_ocr_36ae4d11</td><td style=\"text-align: right;\"> 1</td><td>/tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_36ae4d11_1_text_det_box_thresh=0.5847,text_det_thresh=0.2571,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-43/error.txt</td></tr>\n",
"</tbody>\n",
"</table>\n",
"</div>\n",
"<style>\n",
".messages {\n",
" color: var(--jp-ui-font-color1);\n",
" display: flex;\n",
" flex-direction: column;\n",
" padding-left: 1em;\n",
" overflow-y: auto;\n",
"}\n",
".messages h3 {\n",
" font-weight: bold;\n",
"}\n",
".vDivider {\n",
" border-left-width: var(--jp-border-width);\n",
" border-left-color: var(--jp-border-color0);\n",
" border-left-style: solid;\n",
" margin: 0.5em 1em 0.5em 1em;\n",
"}\n",
"</style>\n",
"\n",
" </div>\n",
" <div class=\"hDivider\"></div>\n",
@@ -634,8 +650,9 @@
"classify </th><th>use_doc_unwarping </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td>trainable_paddle_ocr_59252191</td><td>RUNNING </td><td>192.168.65.140:1195312</td><td style=\"text-align: right;\"> 0.414043</td><td style=\"text-align: right;\"> 0.337475</td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.478234</td><td>True </td><td>True </td><td>True </td></tr>\n",
"<tr><td>trainable_paddle_ocr_47499299</td><td>RUNNING </td><td>192.168.65.140:1195374</td><td style=\"text-align: right;\"> 0.544738</td><td style=\"text-align: right;\"> 0.269735</td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.30771 </td><td>True </td><td>False</td><td>True </td></tr>\n",
"<tr><td>trainable_paddle_ocr_2312d29c</td><td>RUNNING </td><td>192.168.65.140:1282844</td><td style=\"text-align: right;\"> 0.0311783</td><td style=\"text-align: right;\"> 0.0222724</td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.141805</td><td>False </td><td>True </td><td>False </td></tr>\n",
"<tr><td>trainable_paddle_ocr_5b7b8e02</td><td>RUNNING </td><td>192.168.65.140:1285648</td><td style=\"text-align: right;\"> 0.595412 </td><td style=\"text-align: right;\"> 0.0706522</td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.132174</td><td>True </td><td>False</td><td>True </td></tr>\n",
"<tr><td>trainable_paddle_ocr_36ae4d11</td><td>ERROR </td><td>192.168.65.140:1282742</td><td style=\"text-align: right;\"> 0.58473 </td><td style=\"text-align: right;\"> 0.257102 </td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.634955</td><td>False </td><td>True </td><td>False </td></tr>\n",
"</tbody>\n",
"</table>\n",
" </div>\n",
@@ -682,28 +699,76 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 08:23:20,495 E 1193965 1193965] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
"\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 17:26:10,501 E 1281442 1281442] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[33m(raylet)\u001b[0m [2026-01-18 08:23:21,833 E 1194136 1194136] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
"\u001b[33m(raylet)\u001b[0m [2026-01-18 17:26:11,550 E 1281587 1281587] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[36m(bundle_reservation_check_func pid=1194212)\u001b[0m [2026-01-18 08:23:23,446 E 1194212 1194301] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
"\u001b[36m(bundle_reservation_check_func pid=1281657)\u001b[0m [2026-01-18 17:26:12,349 E 1281657 1281801] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2026-01-18 08:23:24,197 E 1193837 1194205] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
"[2026-01-18 17:26:12,987 E 1281294 1281656] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2026-01-18 17:31:48,050\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_36ae4d11\n",
"Traceback (most recent call last):\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n",
" result = ray.get(future)\n",
" ^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n",
" return fn(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n",
" return func(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n",
" values, debugger_breakpoint = worker.get_objects(\n",
" ^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n",
" raise value.as_instanceof_cause()\n",
"ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282742, ip=192.168.65.140, actor_id=d19d5170bbb9faf9c9fa055f01000000, repr=trainable_paddle_ocr)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n",
" raise skipped from exception_cause(skipped)\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n",
" self._ret = self._target(*self._args, **self._kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in <lambda>\n",
" training_func=lambda: self._trainable_func(self.config),\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n",
" output = fn()\n",
" ^^^^\n",
" File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n",
" raise DeprecationWarning(\n",
"DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[36m(trainable_paddle_ocr pid=1285648)\u001b[0m [2026-01-18 17:32:19,397 E 1285648 1285683] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\u001b[32m [repeated 20x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n"
]
}
],
@@ -717,7 +782,6 @@
" num_samples=64,\n",
" max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n",
" ),\n",
" run_config=air.RunConfig(verbose=2, log_to_file=False),\n",
" param_space=search_space,\n",
")\n",
"\n",
@@ -878,7 +942,7 @@
"input_path": "paddle_ocr_raytune_rest.ipynb",
"output_path": "output_raytune.ipynb",
"parameters": {},
"start_time": "2026-01-18T07:22:47.169883",
"start_time": "2026-01-18T16:25:37.429790",
"version": "2.6.0"
}
},

View File

@@ -36,7 +36,7 @@ x-ocr-gpu-common: &ocr-gpu-common
start_period: 120s
x-ocr-cpu-common: &ocr-cpu-common
image: paddle-ocr-api:cpu
image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
volumes:
- ../dataset:/app/dataset:ro
- paddlex-cache:/root/.paddlex

View File

@@ -72,17 +72,7 @@
"id": "imports",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from datetime import datetime\n",
"\n",
"import requests\n",
"import pandas as pd\n",
"\n",
"import ray\n",
"from ray import tune, air\n",
"from ray.tune.search.optuna import OptunaSearch"
]
"source": "import os\nfrom datetime import datetime\n\nimport requests\nimport pandas as pd\n\nimport ray\nfrom ray import tune, train\nfrom ray.tune.search.optuna import OptunaSearch"
},
{
"cell_type": "markdown",
@@ -188,7 +178,7 @@
"id": "trainable",
"metadata": {},
"outputs": [],
"source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import tune\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n tune.report(**metrics)\n except Exception as e:\n tune.report(\n CER=1.0,\n WER=1.0,\n TIME=0.0,\n PAGES=0,\n TIME_PER_PAGE=0,\n worker=api_url,\n ERROR=str(e)[:500]\n )"
"source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import train\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n train.report(metrics)\n except Exception as e:\n train.report({\n \"CER\": 1.0,\n \"WER\": 1.0,\n \"TIME\": 0.0,\n \"PAGES\": 0,\n \"TIME_PER_PAGE\": 0,\n \"worker\": api_url,\n \"ERROR\": str(e)[:500]\n })"
},
{
"cell_type": "markdown",
@@ -215,22 +205,7 @@
"id": "tuner",
"metadata": {},
"outputs": [],
"source": [
"tuner = tune.Tuner(\n",
" trainable_paddle_ocr,\n",
" tune_config=tune.TuneConfig(\n",
" metric=\"CER\",\n",
" mode=\"min\",\n",
" search_alg=OptunaSearch(),\n",
" num_samples=64,\n",
" max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n",
" ),\n",
" run_config=air.RunConfig(verbose=2, log_to_file=False),\n",
" param_space=search_space,\n",
")\n",
"\n",
"results = tuner.fit()"
]
"source": "tuner = tune.Tuner(\n trainable_paddle_ocr,\n tune_config=tune.TuneConfig(\n metric=\"CER\",\n mode=\"min\",\n search_alg=OptunaSearch(),\n num_samples=64,\n max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n ),\n param_space=search_space,\n)\n\nresults = tuner.fit()"
},
{
"cell_type": "markdown",