lock model

This commit is contained in:
2026-01-18 17:38:42 +01:00
parent b29df98602
commit 15bfba79a7
6 changed files with 295 additions and 217 deletions

View File

@@ -1,5 +1,30 @@
# Running Notebooks in Background # Running Notebooks in Background
## Quick: Check Ray Tune Progress
**Current run:** PaddleOCR hyperparameter optimization via Ray Tune + Optuna.
- 64 trials searching for optimal detection/recognition thresholds
- 2 CPU workers running in parallel (Docker containers on ports 8001-8002)
- Notebook: `paddle_ocr_raytune_rest.ipynb` → `output_raytune.ipynb`
- Results saved to: `~/ray_results/trainable_paddle_ocr_2026-01-18_17-25-43/`
```bash
# Is it still running?
ps aux | grep papermill | grep -v grep
# View live log
tail -f papermill.log
# Count completed trials (64 total)
find ~/ray_results/trainable_paddle_ocr_2026-01-18_17-25-43/ -name "result.json" ! -empty | wc -l
# Check workers are healthy
curl -s localhost:8001/health | jq -r '.status'
curl -s localhost:8002/health | jq -r '.status'
```
---
## Option 1: Papermill (Recommended) ## Option 1: Papermill (Recommended)
Runs notebooks directly without conversion. Runs notebooks directly without conversion.

View File

@@ -5,6 +5,7 @@
import os import os
import re import re
import time import time
import threading
from typing import Optional from typing import Optional
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
@@ -57,6 +58,10 @@ class AppState:
# Track current model config for cache invalidation # Track current model config for cache invalidation
current_config: Optional[dict] = None current_config: Optional[dict] = None
device: str = "cuda" if torch.cuda.is_available() else "cpu" device: str = "cuda" if torch.cuda.is_available() else "cpu"
lock: threading.Lock = None # Protects OCR model from concurrent access
def __init__(self):
self.lock = threading.Lock()
state = AppState() state = AppState()
@@ -253,6 +258,18 @@ def evaluate(request: EvaluateRequest):
if len(state.dataset) == 0: if len(state.dataset) == 0:
raise HTTPException(status_code=400, detail="Dataset is empty") raise HTTPException(status_code=400, detail="Dataset is empty")
# Validate page range
start = request.start_page
end = min(request.end_page, len(state.dataset))
if start >= end:
raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
# Lock to prevent concurrent OCR access (model is not thread-safe)
with state.lock:
# Check if model needs to be reinitialized # Check if model needs to be reinitialized
new_config = { new_config = {
"assume_straight_pages": request.assume_straight_pages, "assume_straight_pages": request.assume_straight_pages,
@@ -270,16 +287,6 @@ def evaluate(request: EvaluateRequest):
state.current_config = new_config state.current_config = new_config
model_reinitialized = True model_reinitialized = True
# Validate page range
start = request.start_page
end = min(request.end_page, len(state.dataset))
if start >= end:
raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
for idx in range(start, end): for idx in range(start, end):
img, ref = state.dataset[idx] img, ref = state.dataset[idx]
arr = np.array(img) arr = np.array(img)

View File

@@ -5,6 +5,7 @@
import os import os
import re import re
import time import time
import threading
from typing import Optional, List from typing import Optional, List
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
@@ -52,6 +53,10 @@ class AppState:
dataset: Optional[ImageTextDataset] = None dataset: Optional[ImageTextDataset] = None
dataset_path: Optional[str] = None dataset_path: Optional[str] = None
languages: List[str] = DEFAULT_LANGUAGES languages: List[str] = DEFAULT_LANGUAGES
lock: threading.Lock = None # Protects OCR model from concurrent access
def __init__(self):
self.lock = threading.Lock()
state = AppState() state = AppState()
@@ -263,6 +268,8 @@ def evaluate(request: EvaluateRequest):
time_per_page_list = [] time_per_page_list = []
t0 = time.time() t0 = time.time()
# Lock to prevent concurrent OCR access (model is not thread-safe)
with state.lock:
for idx in range(start, end): for idx in range(start, end):
img, ref = state.dataset[idx] img, ref = state.dataset[idx]
arr = np.array(img) arr = np.array(img)

View File

@@ -5,10 +5,10 @@
"id": "header", "id": "header",
"metadata": { "metadata": {
"papermill": { "papermill": {
"duration": 0.00208, "duration": 0.002022,
"end_time": "2026-01-18T07:22:47.796550", "end_time": "2026-01-18T16:25:38.048417",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:47.794470", "start_time": "2026-01-18T16:25:38.046395",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -29,10 +29,10 @@
"id": "prereq", "id": "prereq",
"metadata": { "metadata": {
"papermill": { "papermill": {
"duration": 0.000961, "duration": 0.000855,
"end_time": "2026-01-18T07:22:47.807230", "end_time": "2026-01-18T16:25:38.058911",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:47.806269", "start_time": "2026-01-18T16:25:38.058056",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -60,10 +60,10 @@
"id": "3ob9fsoilc4", "id": "3ob9fsoilc4",
"metadata": { "metadata": {
"papermill": { "papermill": {
"duration": 0.000901, "duration": 0.000846,
"end_time": "2026-01-18T07:22:47.809075", "end_time": "2026-01-18T16:25:38.060620",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:47.808174", "start_time": "2026-01-18T16:25:38.059774",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -78,16 +78,16 @@
"id": "wyr2nsoj7", "id": "wyr2nsoj7",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2026-01-18T07:22:47.812056Z", "iopub.execute_input": "2026-01-18T16:25:38.063421Z",
"iopub.status.busy": "2026-01-18T07:22:47.811910Z", "iopub.status.busy": "2026-01-18T16:25:38.063287Z",
"iopub.status.idle": "2026-01-18T07:22:49.130013Z", "iopub.status.idle": "2026-01-18T16:25:39.300678Z",
"shell.execute_reply": "2026-01-18T07:22:49.129363Z" "shell.execute_reply": "2026-01-18T16:25:39.299298Z"
}, },
"papermill": { "papermill": {
"duration": 1.321151, "duration": 1.240519,
"end_time": "2026-01-18T07:22:49.131123", "end_time": "2026-01-18T16:25:39.301973",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:47.809972", "start_time": "2026-01-18T16:25:38.061454",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -120,13 +120,7 @@
"Requirement already satisfied: annotated-types>=0.6.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.7.0)\r\n", "Requirement already satisfied: annotated-types>=0.6.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.7.0)\r\n",
"Requirement already satisfied: pydantic-core==2.41.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (2.41.5)\r\n", "Requirement already satisfied: pydantic-core==2.41.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (2.41.5)\r\n",
"Requirement already satisfied: typing-extensions>=4.14.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (4.15.0)\r\n", "Requirement already satisfied: typing-extensions>=4.14.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (4.15.0)\r\n",
"Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n" "Requirement already satisfied: typing-inspection>=0.4.2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.10.*,!=2.11.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3->ray[tune]) (0.4.2)\r\n",
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from tensorboardX>=1.9->ray[tune]) (2.4.1)\r\n", "Requirement already satisfied: numpy in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from tensorboardX>=1.9->ray[tune]) (2.4.1)\r\n",
"Requirement already satisfied: attrs>=22.2.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (25.4.0)\r\n", "Requirement already satisfied: attrs>=22.2.0 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (25.4.0)\r\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (2025.9.1)\r\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from jsonschema->ray[tune]) (2025.9.1)\r\n",
@@ -180,7 +174,13 @@
"text": [ "text": [
"Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.32.5)\r\n", "Requirement already satisfied: requests in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.32.5)\r\n",
"Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.3.3)\r\n", "Requirement already satisfied: pandas in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (2.3.3)\r\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n", "Requirement already satisfied: charset_normalizer<4,>=2 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.4.4)\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.11)\r\n", "Requirement already satisfied: idna<4,>=2.5 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (3.11)\r\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2.6.3)\r\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2.6.3)\r\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2026.1.4)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/sergio/MastersThesis/.venv/lib/python3.12/site-packages (from requests) (2026.1.4)\r\n",
@@ -211,10 +211,10 @@
"id": "imports-header", "id": "imports-header",
"metadata": { "metadata": {
"papermill": { "papermill": {
"duration": 0.002313, "duration": 0.009444,
"end_time": "2026-01-18T07:22:49.136199", "end_time": "2026-01-18T16:25:39.312980",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:49.133886", "start_time": "2026-01-18T16:25:39.303536",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -229,16 +229,16 @@
"id": "imports", "id": "imports",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2026-01-18T07:22:49.141850Z", "iopub.execute_input": "2026-01-18T16:25:39.316439Z",
"iopub.status.busy": "2026-01-18T07:22:49.141713Z", "iopub.status.busy": "2026-01-18T16:25:39.316230Z",
"iopub.status.idle": "2026-01-18T07:22:50.248414Z", "iopub.status.idle": "2026-01-18T16:25:40.277894Z",
"shell.execute_reply": "2026-01-18T07:22:50.247699Z" "shell.execute_reply": "2026-01-18T16:25:40.277012Z"
}, },
"papermill": { "papermill": {
"duration": 1.111175, "duration": 0.964409,
"end_time": "2026-01-18T07:22:50.249605", "end_time": "2026-01-18T16:25:40.278450",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:49.138430", "start_time": "2026-01-18T16:25:39.314041",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -252,7 +252,7 @@
"import pandas as pd\n", "import pandas as pd\n",
"\n", "\n",
"import ray\n", "import ray\n",
"from ray import tune, air\n", "from ray import tune, train\n",
"from ray.tune.search.optuna import OptunaSearch" "from ray.tune.search.optuna import OptunaSearch"
] ]
}, },
@@ -261,10 +261,10 @@
"id": "config-header", "id": "config-header",
"metadata": { "metadata": {
"papermill": { "papermill": {
"duration": 0.00953, "duration": 0.009552,
"end_time": "2026-01-18T07:22:50.261880", "end_time": "2026-01-18T16:25:40.289551",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.252350", "start_time": "2026-01-18T16:25:40.279999",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -279,16 +279,16 @@
"id": "config", "id": "config",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2026-01-18T07:22:50.267482Z", "iopub.execute_input": "2026-01-18T16:25:40.292573Z",
"iopub.status.busy": "2026-01-18T07:22:50.267340Z", "iopub.status.busy": "2026-01-18T16:25:40.292489Z",
"iopub.status.idle": "2026-01-18T07:22:50.269689Z", "iopub.status.idle": "2026-01-18T16:25:40.294713Z",
"shell.execute_reply": "2026-01-18T07:22:50.269264Z" "shell.execute_reply": "2026-01-18T16:25:40.294164Z"
}, },
"papermill": { "papermill": {
"duration": 0.006027, "duration": 0.004591,
"end_time": "2026-01-18T07:22:50.270230", "end_time": "2026-01-18T16:25:40.295202",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.264203", "start_time": "2026-01-18T16:25:40.290611",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -314,16 +314,16 @@
"id": "health-check", "id": "health-check",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2026-01-18T07:22:50.275708Z", "iopub.execute_input": "2026-01-18T16:25:40.298281Z",
"iopub.status.busy": "2026-01-18T07:22:50.275626Z", "iopub.status.busy": "2026-01-18T16:25:40.298161Z",
"iopub.status.idle": "2026-01-18T07:22:50.283441Z", "iopub.status.idle": "2026-01-18T16:25:40.306720Z",
"shell.execute_reply": "2026-01-18T07:22:50.282984Z" "shell.execute_reply": "2026-01-18T16:25:40.306262Z"
}, },
"papermill": { "papermill": {
"duration": 0.011534, "duration": 0.010723,
"end_time": "2026-01-18T07:22:50.284080", "end_time": "2026-01-18T16:25:40.307025",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.272546", "start_time": "2026-01-18T16:25:40.296302",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -368,10 +368,10 @@
"id": "search-space-header", "id": "search-space-header",
"metadata": { "metadata": {
"papermill": { "papermill": {
"duration": 0.002325, "duration": 0.001073,
"end_time": "2026-01-18T07:22:50.288969", "end_time": "2026-01-18T16:25:40.309261",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.286644", "start_time": "2026-01-18T16:25:40.308188",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -386,16 +386,16 @@
"id": "search-space", "id": "search-space",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2026-01-18T07:22:50.294569Z", "iopub.execute_input": "2026-01-18T16:25:40.312177Z",
"iopub.status.busy": "2026-01-18T07:22:50.294500Z", "iopub.status.busy": "2026-01-18T16:25:40.312107Z",
"iopub.status.idle": "2026-01-18T07:22:50.296998Z", "iopub.status.idle": "2026-01-18T16:25:40.314237Z",
"shell.execute_reply": "2026-01-18T07:22:50.296295Z" "shell.execute_reply": "2026-01-18T16:25:40.313794Z"
}, },
"papermill": { "papermill": {
"duration": 0.006486, "duration": 0.004476,
"end_time": "2026-01-18T07:22:50.297804", "end_time": "2026-01-18T16:25:40.314804",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.291318", "start_time": "2026-01-18T16:25:40.310328",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -425,10 +425,10 @@
"id": "trainable-header", "id": "trainable-header",
"metadata": { "metadata": {
"papermill": { "papermill": {
"duration": 0.002321, "duration": 0.001057,
"end_time": "2026-01-18T07:22:50.302532", "end_time": "2026-01-18T16:25:40.316975",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.300211", "start_time": "2026-01-18T16:25:40.315918",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -443,16 +443,16 @@
"id": "trainable", "id": "trainable",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2026-01-18T07:22:50.308222Z", "iopub.execute_input": "2026-01-18T16:25:40.319825Z",
"iopub.status.busy": "2026-01-18T07:22:50.308103Z", "iopub.status.busy": "2026-01-18T16:25:40.319771Z",
"iopub.status.idle": "2026-01-18T07:22:50.311240Z", "iopub.status.idle": "2026-01-18T16:25:40.322602Z",
"shell.execute_reply": "2026-01-18T07:22:50.310694Z" "shell.execute_reply": "2026-01-18T16:25:40.322112Z"
}, },
"papermill": { "papermill": {
"duration": 0.007301, "duration": 0.004907,
"end_time": "2026-01-18T07:22:50.312116", "end_time": "2026-01-18T16:25:40.322948",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.304815", "start_time": "2026-01-18T16:25:40.318041",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -463,7 +463,7 @@
" \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n", " \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n",
" import random\n", " import random\n",
" import requests\n", " import requests\n",
" from ray import tune\n", " from ray import train\n",
"\n", "\n",
" # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n", " # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n",
" WORKER_PORTS = [8001, 8002]\n", " WORKER_PORTS = [8001, 8002]\n",
@@ -487,17 +487,17 @@
" response.raise_for_status()\n", " response.raise_for_status()\n",
" metrics = response.json()\n", " metrics = response.json()\n",
" metrics[\"worker\"] = api_url\n", " metrics[\"worker\"] = api_url\n",
" tune.report(**metrics)\n", " train.report(metrics)\n",
" except Exception as e:\n", " except Exception as e:\n",
" tune.report(\n", " train.report({\n",
" CER=1.0,\n", " \"CER\": 1.0,\n",
" WER=1.0,\n", " \"WER\": 1.0,\n",
" TIME=0.0,\n", " \"TIME\": 0.0,\n",
" PAGES=0,\n", " \"PAGES\": 0,\n",
" TIME_PER_PAGE=0,\n", " \"TIME_PER_PAGE\": 0,\n",
" worker=api_url,\n", " \"worker\": api_url,\n",
" ERROR=str(e)[:500]\n", " \"ERROR\": str(e)[:500]\n",
" )" " })"
] ]
}, },
{ {
@@ -505,10 +505,10 @@
"id": "tuner-header", "id": "tuner-header",
"metadata": { "metadata": {
"papermill": { "papermill": {
"duration": 0.002522, "duration": 0.001058,
"end_time": "2026-01-18T07:22:50.317277", "end_time": "2026-01-18T16:25:40.325120",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.314755", "start_time": "2026-01-18T16:25:40.324062",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -523,16 +523,16 @@
"id": "ray-init", "id": "ray-init",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2026-01-18T07:22:50.323163Z", "iopub.execute_input": "2026-01-18T16:25:40.328162Z",
"iopub.status.busy": "2026-01-18T07:22:50.323037Z", "iopub.status.busy": "2026-01-18T16:25:40.328055Z",
"iopub.status.idle": "2026-01-18T07:22:54.197904Z", "iopub.status.idle": "2026-01-18T16:25:42.985307Z",
"shell.execute_reply": "2026-01-18T07:22:54.196986Z" "shell.execute_reply": "2026-01-18T16:25:42.984863Z"
}, },
"papermill": { "papermill": {
"duration": 3.878908, "duration": 2.65986,
"end_time": "2026-01-18T07:22:54.198593", "end_time": "2026-01-18T16:25:42.986041",
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:50.319685", "start_time": "2026-01-18T16:25:40.326181",
"status": "completed" "status": "completed"
}, },
"tags": [] "tags": []
@@ -542,7 +542,7 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2026-01-18 08:22:51,904\tINFO worker.py:2007 -- Started a local Ray instance.\n" "2026-01-18 17:25:41,631\tINFO worker.py:2007 -- Started a local Ray instance.\n"
] ]
}, },
{ {
@@ -572,35 +572,19 @@
"id": "tuner", "id": "tuner",
"metadata": { "metadata": {
"execution": { "execution": {
"iopub.execute_input": "2026-01-18T07:22:54.213071Z", "iopub.execute_input": "2026-01-18T16:25:42.998698Z",
"iopub.status.busy": "2026-01-18T07:22:54.212310Z" "iopub.status.busy": "2026-01-18T16:25:42.998141Z"
}, },
"papermill": { "papermill": {
"duration": null, "duration": null,
"end_time": null, "end_time": null,
"exception": false, "exception": false,
"start_time": "2026-01-18T07:22:54.201610", "start_time": "2026-01-18T16:25:42.987700",
"status": "running" "status": "running"
}, },
"tags": [] "tags": []
}, },
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/impl/tuner_internal.py:144: RayDeprecationWarning: The `RunConfig` class should be imported from `ray.tune` when passing it to the Tuner. Please update your imports. See this issue for more context and migration options: https://github.com/ray-project/ray/issues/49454. Disable these warnings by setting the environment variable: RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS=0\n",
" _log_deprecation_warning(\n",
"2026-01-18 08:22:54,222\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[I 2026-01-18 08:22:54,226] A new study created in memory with name: optuna\n"
]
},
{ {
"data": { "data": {
"text/html": [ "text/html": [
@@ -610,9 +594,9 @@
" <h3>Tune Status</h3>\n", " <h3>Tune Status</h3>\n",
" <table>\n", " <table>\n",
"<tbody>\n", "<tbody>\n",
"<tr><td>Current time:</td><td>2026-01-18 08:23:19</td></tr>\n", "<tr><td>Current time:</td><td>2026-01-18 17:37:46</td></tr>\n",
"<tr><td>Running for: </td><td>00:00:25.26 </td></tr>\n", "<tr><td>Running for: </td><td>00:12:03.55 </td></tr>\n",
"<tr><td>Memory: </td><td>57.8/119.7 GiB </td></tr>\n", "<tr><td>Memory: </td><td>16.5/119.7 GiB </td></tr>\n",
"</tbody>\n", "</tbody>\n",
"</table>\n", "</table>\n",
" </div>\n", " </div>\n",
@@ -621,6 +605,38 @@
" <h3>System Info</h3>\n", " <h3>System Info</h3>\n",
" Using FIFO scheduling algorithm.<br>Logical resource usage: 2.0/20 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:GB10)\n", " Using FIFO scheduling algorithm.<br>Logical resource usage: 2.0/20 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:GB10)\n",
" </div>\n", " </div>\n",
" <div class=\"vDivider\"></div>\n",
"<div class=\"messages\">\n",
" <h3>Messages</h3>\n",
" \n",
" \n",
" Number of errored trials: 1<br><table>\n",
"<thead>\n",
"<tr><th>Trial name </th><th style=\"text-align: right;\"> # failures</th><th>error file </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td>trainable_paddle_ocr_36ae4d11</td><td style=\"text-align: right;\"> 1</td><td>/tmp/ray/session_2026-01-18_17-25-40_347373_1281294/artifacts/2026-01-18_17-25-43/trainable_paddle_ocr_2026-01-18_17-25-43/driver_artifacts/trainable_paddle_ocr_36ae4d11_1_text_det_box_thresh=0.5847,text_det_thresh=0.2571,text_det_unclip_ratio=0.0000,text_rec_score_thre_2026-01-18_17-25-43/error.txt</td></tr>\n",
"</tbody>\n",
"</table>\n",
"</div>\n",
"<style>\n",
".messages {\n",
" color: var(--jp-ui-font-color1);\n",
" display: flex;\n",
" flex-direction: column;\n",
" padding-left: 1em;\n",
" overflow-y: auto;\n",
"}\n",
".messages h3 {\n",
" font-weight: bold;\n",
"}\n",
".vDivider {\n",
" border-left-width: var(--jp-border-width);\n",
" border-left-color: var(--jp-border-color0);\n",
" border-left-style: solid;\n",
" margin: 0.5em 1em 0.5em 1em;\n",
"}\n",
"</style>\n",
"\n", "\n",
" </div>\n", " </div>\n",
" <div class=\"hDivider\"></div>\n", " <div class=\"hDivider\"></div>\n",
@@ -634,8 +650,9 @@
"classify </th><th>use_doc_unwarping </th></tr>\n", "classify </th><th>use_doc_unwarping </th></tr>\n",
"</thead>\n", "</thead>\n",
"<tbody>\n", "<tbody>\n",
"<tr><td>trainable_paddle_ocr_59252191</td><td>RUNNING </td><td>192.168.65.140:1195312</td><td style=\"text-align: right;\"> 0.414043</td><td style=\"text-align: right;\"> 0.337475</td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.478234</td><td>True </td><td>True </td><td>True </td></tr>\n", "<tr><td>trainable_paddle_ocr_2312d29c</td><td>RUNNING </td><td>192.168.65.140:1282844</td><td style=\"text-align: right;\"> 0.0311783</td><td style=\"text-align: right;\"> 0.0222724</td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.141805</td><td>False </td><td>True </td><td>False </td></tr>\n",
"<tr><td>trainable_paddle_ocr_47499299</td><td>RUNNING </td><td>192.168.65.140:1195374</td><td style=\"text-align: right;\"> 0.544738</td><td style=\"text-align: right;\"> 0.269735</td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.30771 </td><td>True </td><td>False</td><td>True </td></tr>\n", "<tr><td>trainable_paddle_ocr_5b7b8e02</td><td>RUNNING </td><td>192.168.65.140:1285648</td><td style=\"text-align: right;\"> 0.595412 </td><td style=\"text-align: right;\"> 0.0706522</td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.132174</td><td>True </td><td>False</td><td>True </td></tr>\n",
"<tr><td>trainable_paddle_ocr_36ae4d11</td><td>ERROR </td><td>192.168.65.140:1282742</td><td style=\"text-align: right;\"> 0.58473 </td><td style=\"text-align: right;\"> 0.257102 </td><td style=\"text-align: right;\">0</td><td style=\"text-align: right;\">0.634955</td><td>False </td><td>True </td><td>False </td></tr>\n",
"</tbody>\n", "</tbody>\n",
"</table>\n", "</table>\n",
" </div>\n", " </div>\n",
@@ -682,28 +699,76 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 08:23:20,495 E 1193965 1193965] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" "\u001b[36m(pid=gcs_server)\u001b[0m [2026-01-18 17:26:10,501 E 1281442 1281442] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
] ]
}, },
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001b[33m(raylet)\u001b[0m [2026-01-18 08:23:21,833 E 1194136 1194136] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" "\u001b[33m(raylet)\u001b[0m [2026-01-18 17:26:11,550 E 1281587 1281587] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
] ]
}, },
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001b[36m(bundle_reservation_check_func pid=1194212)\u001b[0m [2026-01-18 08:23:23,446 E 1194212 1194301] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" "\u001b[36m(bundle_reservation_check_func pid=1281657)\u001b[0m [2026-01-18 17:26:12,349 E 1281657 1281801] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
] ]
}, },
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2026-01-18 08:23:24,197 E 1193837 1194205] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n" "[2026-01-18 17:26:12,987 E 1281294 1281656] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2026-01-18 17:31:48,050\tERROR tune_controller.py:1331 -- Trial task failed for trial trainable_paddle_ocr_36ae4d11\n",
"Traceback (most recent call last):\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py\", line 110, in resolve_future\n",
" result = ray.get(future)\n",
" ^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/auto_init_hook.py\", line 22, in auto_init_wrapper\n",
" return fn(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/client_mode_hook.py\", line 104, in wrapper\n",
" return func(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 2967, in get\n",
" values, debugger_breakpoint = worker.get_objects(\n",
" ^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/_private/worker.py\", line 1015, in get_objects\n",
" raise value.as_instanceof_cause()\n",
"ray.exceptions.RayTaskError(DeprecationWarning): \u001b[36mray::ImplicitFunc.train()\u001b[39m (pid=1282742, ip=192.168.65.140, actor_id=d19d5170bbb9faf9c9fa055f01000000, repr=trainable_paddle_ocr)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/trainable.py\", line 331, in train\n",
" raise skipped from exception_cause(skipped)\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/air/_internal/util.py\", line 98, in run\n",
" self._ret = self._target(*self._args, **self._kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 44, in <lambda>\n",
" training_func=lambda: self._trainable_func(self.config),\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/tune/trainable/function_trainable.py\", line 249, in _trainable_func\n",
" output = fn()\n",
" ^^^^\n",
" File \"/tmp/ipykernel_1281294/4208751894.py\", line 31, in trainable_paddle_ocr\n",
" File \"/home/sergio/MastersThesis/.venv/lib/python3.12/site-packages/ray/train/v2/_internal/util.py\", line 273, in _wrapped_fn\n",
" raise DeprecationWarning(\n",
"DeprecationWarning: `ray.train.report` is deprecated when running in a function passed to Ray Tune. Please use the equivalent `ray.tune` API instead. See this issue for more context: https://github.com/ray-project/ray/issues/49454\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[36m(trainable_paddle_ocr pid=1285648)\u001b[0m [2026-01-18 17:32:19,397 E 1285648 1285683] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\u001b[32m [repeated 20x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n"
] ]
} }
], ],
@@ -717,7 +782,6 @@
" num_samples=64,\n", " num_samples=64,\n",
" max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n", " max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n",
" ),\n", " ),\n",
" run_config=air.RunConfig(verbose=2, log_to_file=False),\n",
" param_space=search_space,\n", " param_space=search_space,\n",
")\n", ")\n",
"\n", "\n",
@@ -878,7 +942,7 @@
"input_path": "paddle_ocr_raytune_rest.ipynb", "input_path": "paddle_ocr_raytune_rest.ipynb",
"output_path": "output_raytune.ipynb", "output_path": "output_raytune.ipynb",
"parameters": {}, "parameters": {},
"start_time": "2026-01-18T07:22:47.169883", "start_time": "2026-01-18T16:25:37.429790",
"version": "2.6.0" "version": "2.6.0"
} }
}, },

View File

@@ -36,7 +36,7 @@ x-ocr-gpu-common: &ocr-gpu-common
start_period: 120s start_period: 120s
x-ocr-cpu-common: &ocr-cpu-common x-ocr-cpu-common: &ocr-cpu-common
image: paddle-ocr-api:cpu image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
volumes: volumes:
- ../dataset:/app/dataset:ro - ../dataset:/app/dataset:ro
- paddlex-cache:/root/.paddlex - paddlex-cache:/root/.paddlex

View File

@@ -72,17 +72,7 @@
"id": "imports", "id": "imports",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": "import os\nfrom datetime import datetime\n\nimport requests\nimport pandas as pd\n\nimport ray\nfrom ray import tune, train\nfrom ray.tune.search.optuna import OptunaSearch"
"import os\n",
"from datetime import datetime\n",
"\n",
"import requests\n",
"import pandas as pd\n",
"\n",
"import ray\n",
"from ray import tune, air\n",
"from ray.tune.search.optuna import OptunaSearch"
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -188,7 +178,7 @@
"id": "trainable", "id": "trainable",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import tune\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n tune.report(**metrics)\n except Exception as e:\n tune.report(\n CER=1.0,\n WER=1.0,\n TIME=0.0,\n PAGES=0,\n TIME_PER_PAGE=0,\n worker=api_url,\n ERROR=str(e)[:500]\n )" "source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import train\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": 
config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n train.report(metrics)\n except Exception as e:\n train.report({\n \"CER\": 1.0,\n \"WER\": 1.0,\n \"TIME\": 0.0,\n \"PAGES\": 0,\n \"TIME_PER_PAGE\": 0,\n \"worker\": api_url,\n \"ERROR\": str(e)[:500]\n })"
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -215,22 +205,7 @@
"id": "tuner", "id": "tuner",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": "tuner = tune.Tuner(\n trainable_paddle_ocr,\n tune_config=tune.TuneConfig(\n metric=\"CER\",\n mode=\"min\",\n search_alg=OptunaSearch(),\n num_samples=64,\n max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n ),\n param_space=search_space,\n)\n\nresults = tuner.fit()"
"tuner = tune.Tuner(\n",
" trainable_paddle_ocr,\n",
" tune_config=tune.TuneConfig(\n",
" metric=\"CER\",\n",
" mode=\"min\",\n",
" search_alg=OptunaSearch(),\n",
" num_samples=64,\n",
" max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n",
" ),\n",
" run_config=air.RunConfig(verbose=2, log_to_file=False),\n",
" param_space=search_space,\n",
")\n",
"\n",
"results = tuner.fit()"
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",