{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "2813d34d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "be3c1872",
"metadata": {},
"source": [
"# AI-based OCR Benchmark Notebook\n",
"\n",
"This notebook benchmarks **AI-based OCR models** on scanned PDF documents/images in Spanish.\n",
"It excludes traditional OCR engines like Tesseract that require external installations."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a1e98fe",
"metadata": {},
"outputs": [],
"source": [
"%pip install --upgrade pip\n",
"%pip install --upgrade jupyter\n",
"%pip install --upgrade ipywidgets\n",
"%pip install --upgrade ipykernel\n",
"\n",
"# Install necessary packages\n",
"%pip install transformers torch pdf2image pillow jiwer paddleocr hf_xet paddlepaddle\n",
"# pdf reading\n",
"%pip install PyMuPDF\n",
"\n",
"# Data analysis and visualization\n",
"%pip install pandas\n",
"%pip install matplotlib\n",
"%pip install seaborn"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ae33632a",
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"import os, json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from pdf2image import convert_from_path\n",
"from PIL import Image, ImageOps\n",
"import torch\n",
"from jiwer import wer, cer\n",
"from paddleocr import PaddleOCR\n",
"import fitz # PyMuPDF\n",
"import re\n",
"from datetime import datetime"
]
},
{
"cell_type": "markdown",
"id": "0e00f1b0",
"metadata": {},
"source": [
"## 1 Configuration"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"PDF_FOLDER = './instructions' # Folder containing PDF files\n",
"OUTPUT_FOLDER = 'results'\n",
"os.makedirs(OUTPUT_FOLDER, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8bd4ca23",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
c:\\Users\\sji\\Desktop\\MastersThesis\\instructions\n",
"\n"
],
"text/plain": [
"c:\\Users\\sji\\Desktop\\MastersThesis\\instructions\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"c:\\Users\\sji\\Desktop\\MastersThesis\\paddle_ocr_tuning.py\n",
"\n"
],
"text/plain": [
"c:\\Users\\sji\\Desktop\\MastersThesis\\paddle_ocr_tuning.py\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"c:\\Users\\sji\\Desktop\\MastersThesis\n",
"\n"
],
"text/plain": [
"c:\\Users\\sji\\Desktop\\MastersThesis\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"PDF_FOLDER_ABS = os.path.abspath(PDF_FOLDER) # ./instructions -> C:\\...\\instructions\n",
"SCRIPT_ABS = os.path.abspath(\"paddle_ocr_tuning.py\") # paddle_ocr_tuning.py -> C:\\...\\paddle_ocr_tuning.py\n",
"SCRIPT_DIR = os.path.dirname(SCRIPT_ABS)\n",
"\n",
"print(PDF_FOLDER_ABS)\n",
"print(SCRIPT_ABS)\n",
"print(SCRIPT_DIR)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "243849b9",
"metadata": {},
"outputs": [],
"source": [
"# 3. PaddleOCR \n",
"# https://www.paddleocr.ai/v3.0.0/en/version3.x/pipeline_usage/OCR.html?utm_source=chatgpt.com#21-command-line\n",
"from paddleocr import PaddleOCR\n",
"\n",
"# Initialize with better settings for Spanish/Latin text\n",
"# https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html?utm_source=chatgpt.com#5-models-and-their-supported-languages\n",
"paddleocr_model = PaddleOCR(\n",
" text_detection_model_name=\"PP-OCRv5_server_det\",\n",
" text_recognition_model_name=\"PP-OCRv5_server_rec\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "329da34a",
"metadata": {},
"outputs": [],
"source": [
"import paddleocr\n",
"\n",
"print(paddleocr.__version__)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1541bb6",
"metadata": {},
"outputs": [],
"source": [
"# 1) Locate the installed PaddleOCR package\n",
"pkg_dir = os.path.dirname(paddleocr.__file__)\n",
"print(pkg_dir)"
]
},
{
"cell_type": "markdown",
"id": "84c999e2",
"metadata": {},
"source": [
"## 2 Helper Functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9596c7df",
"metadata": {},
"outputs": [],
"source": [
"from typing import List, Optional\n",
"from paddle_ocr_tuning import pdf_to_images, pdf_extract_text, evaluate_text, assemble_from_paddle_result\n",
"\n",
"def show_page(img: Image.Image, text: str, scale: float = 1):\n",
" \"\"\"\n",
" Displays a smaller version of the image with text as a footer.\n",
" \"\"\"\n",
" # Compute plot size based on image dimensions (but without resizing the image)\n",
" w, h = img.size\n",
" figsize = (w * scale / 100, h * scale / 100) # convert pixels to inches approx\n",
"\n",
" fig, ax = plt.subplots(figsize=figsize)\n",
" ax.imshow(img)\n",
" ax.axis(\"off\")\n",
"\n",
"\n",
" # Add OCR text below the image (footer)\n",
" # plt.figtext(0.5, 0.02, text.strip(), wrap=True, ha='center', va='bottom', fontsize=10)\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"id": "e42cae29",
"metadata": {},
"source": [
"## Run AI OCR Benchmark"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b55c154",
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"\n",
"for pdf_file in os.listdir(PDF_FOLDER):\n",
" if not pdf_file.lower().endswith('.pdf'):\n",
" continue\n",
" pdf_path = os.path.join(PDF_FOLDER, pdf_file)\n",
" page_range = range(5, 10)\n",
" \n",
" images = pdf_to_images(pdf_path, 300, page_range)\n",
" \n",
" for i, img in enumerate(images):\n",
" # img = preprocess_for_ocr(img)\n",
" page_num = page_range[i]\n",
" ref = pdf_extract_text(pdf_path, page_num=page_num)\n",
" show_page(img, f\"page: {page_num}\", 0.15)\n",
" print(f\"ref: \\n{ref}\")\n",
" \n",
" # Convert PIL image to numpy array\n",
" image_array = np.array(img)\n",
" out = paddleocr_model.predict(\n",
" image_array,\n",
" use_doc_orientation_classify=False,\n",
" use_doc_unwarping=False,\n",
" use_textline_orientation=True\n",
" )\n",
" # PaddleOCR\n",
" paddle_text = assemble_from_paddle_result(out)\n",
" print(f\"paddle_text: \\n{paddle_text}\")\n",
" results.append({'PDF': pdf_file, 'Page': page_num, 'Model': 'PaddleOCR', 'Prediction': paddle_text, **evaluate_text(ref, paddle_text)})\n",
" "
]
},
{
"cell_type": "markdown",
"id": "0db6dc74",
"metadata": {},
"source": [
"## 5 Save and Analyze Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da3155e3",
"metadata": {},
"outputs": [],
"source": [
"df_results = pd.DataFrame(results)\n",
"\n",
"# Generate a unique filename with timestamp\n",
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"filename = f\"ai_ocr_benchmark_finetune_results_{timestamp}.csv\"\n",
"filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
"\n",
"df_results.to_csv(filepath, index=False)\n",
"print(f\"Benchmark results saved as {filename}\")\n",
"\n",
"# Summary by model\n",
"summary = df_results.groupby('Model')[['WER', 'CER']].mean()\n",
"print(summary)\n",
"\n",
"# Plot\n",
"summary.plot(kind='bar', figsize=(8,5), title='AI OCR Benchmark (WER & CER)')\n",
"plt.ylabel('Error Rate')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "3e0f00c0",
"metadata": {},
"source": [
"### How to read this chart:\n",
"- CER (Character Error Rate) focus on raw transcription quality\n",
"- WER (Word Error Rate) penalizes incorrect tokenization or missing spaces\n",
"- CER and WER are error metrics, which means:\n",
" - Higher values = worse performance\n",
" - Lower values = better accuracy"
]
},
{
"cell_type": "markdown",
"id": "830b0e25",
"metadata": {},
"source": [
"# Busqueda de hyperparametros\n",
"https://docs.ray.io/en/latest/tune/index.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a4bd700",
"metadata": {},
"outputs": [],
"source": [
"!python --version\n",
"!pip --version"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "b0cf4bcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
"Collecting rich\n",
" Downloading rich-14.2.0-py3-none-any.whl.metadata (18 kB)\n",
"Requirement already satisfied: ray[tune] in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (2.51.1)\n",
"Requirement already satisfied: click!=8.3.0,>=7.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (8.2.1)\n",
"Requirement already satisfied: filelock in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (3.20.0)\n",
"Requirement already satisfied: jsonschema in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (4.25.1)\n",
"Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (1.1.2)\n",
"Requirement already satisfied: packaging in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (25.0)\n",
"Requirement already satisfied: protobuf>=3.20.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (6.33.0)\n",
"Requirement already satisfied: pyyaml in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (6.0.2)\n",
"Requirement already satisfied: requests in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.32.5)\n",
"Requirement already satisfied: pandas in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.3.3)\n",
"Requirement already satisfied: tensorboardX>=1.9 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.6.4)\n",
"Requirement already satisfied: pyarrow>=9.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (22.0.0)\n",
"Requirement already satisfied: fsspec in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2025.10.0)\n",
"Collecting markdown-it-py>=2.2.0 (from rich)\n",
" Downloading markdown_it_py-4.0.0-py3-none-any.whl.metadata (7.3 kB)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from rich) (2.19.2)\n",
"Requirement already satisfied: colorama in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from click!=8.3.0,>=7.0->ray[tune]) (0.4.6)\n",
"Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich)\n",
" Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n",
"Requirement already satisfied: numpy in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from tensorboardX>=1.9->ray[tune]) (2.3.4)\n",
"Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (25.4.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (2025.9.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (0.37.0)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (0.28.0)\n",
"Requirement already satisfied: typing-extensions>=4.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from referencing>=0.28.4->jsonschema->ray[tune]) (4.15.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->ray[tune]) (1.17.0)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (3.4.4)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (3.11)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (2.5.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (2025.10.5)\n",
"Downloading rich-14.2.0-py3-none-any.whl (243 kB)\n",
"Downloading markdown_it_py-4.0.0-py3-none-any.whl (87 kB)\n",
"Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
"Installing collected packages: mdurl, markdown-it-py, rich\n",
"\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ---------------------------------------- 0/3 [mdurl]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" -------------------------- ------------- 2/3 [rich]\n",
" ---------------------------------------- 3/3 [rich]\n",
"\n",
"Successfully installed markdown-it-py-4.0.0 mdurl-0.1.2 rich-14.2.0\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Instalaci贸n de Ray y Ray Tune\n",
"%pip install -U \"ray[tune]\" rich"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f3ca0b9b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-11-12 10:11:36,022\tINFO worker.py:2012 -- Started a local Ray instance.\n",
"c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\ray\\_private\\worker.py:2051: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"Ray Tune listo (versi贸n: 2.51.1 )\n",
"\n"
],
"text/plain": [
"Ray Tune listo \u001b[1m(\u001b[0mversi贸n: \u001b[1;36m2.51\u001b[0m.\u001b[1;36m1\u001b[0m \u001b[1m)\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import ray\n",
"from ray import tune\n",
"from ray.tune.schedulers import ASHAScheduler\n",
"\n",
"ray.init(ignore_reinit_error=True)\n",
"print(\"Ray Tune listo (versi贸n:\", ray.__version__, \")\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ae5a10c4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\ray\\_private\\client_mode_hook.py:104: FutureWarning: `local_mode` is an experimental feature that is no longer maintained and will be removed in the near future. For debugging consider using the Ray distributed debugger.\n",
" return func(*args, **kwargs)\n",
"2025-11-12 10:13:21,828\tINFO worker.py:1850 -- Calling ray.init() again after it has already been called.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "77da938a02c94d9ca0630b2c56da3357",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"\n"
],
"text/plain": [
"RayContext(dashboard_url='', python_version='3.11.9', ray_version='2.51.1', ray_commit='eeb38c79c1af96df29cbacab7b8a823d489237f3')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ===============================================================\n",
"# 馃攳 RAY TUNE: OPTIMIZACI脫N AUTOM脕TICA DE HIPERPAR脕METROS OCR\n",
"# ===============================================================\n",
"\n",
"from ray import tune, air\n",
"from ray.tune.schedulers import ASHAScheduler\n",
"import pandas as pd\n",
"import time\n",
"import colorama\n",
"from rich import print\n",
"import sys, subprocess \n",
"\n",
"colorama.just_fix_windows_console()\n",
"ray.init(ignore_reinit_error=True, local_mode=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "96c320e8",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"# --- Configuraci贸n base del experimento ---\n",
"search_space = {\n",
" \"dpi\": tune.choice([240, 300, 360]),\n",
" \"textline_orientation\": tune.choice([True, False]),\n",
" \"text_det_box_thresh\": tune.uniform(0.4, 0.7),\n",
" \"text_det_unclip_ratio\": tune.uniform(1.2, 2.0),\n",
" \"text_rec_score_thresh\": tune.choice([0.0, 0.2, 0.4]),\n",
" \"line_tolerance\": tune.choice([0.5, 0.6, 0.7]),\n",
" \"min_box_score\": tune.choice([0, 0.5, 0.6])\n",
"}\n",
"KEYMAP = {\n",
" \"dpi\": \"dpi\",\n",
" \"textline_orientation\": \"textline-orientation\",\n",
" \"text_det_box_thresh\": \"text-det-box-thresh\",\n",
" \"text_det_unclip_ratio\": \"text-det-unclip-ratio\",\n",
" \"text_rec_score_thresh\": \"text-rec-score-thresh\",\n",
" \"line_tolerance\": \"line-tolerance\",\n",
" \"pages_per_pdf\": \"pages-per-pdf\",\n",
" \"min_box_score\": \"min-box-score\",\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "accb4e9d",
"metadata": {},
"outputs": [],
"source": [
"import sys, subprocess\n",
"print(\"Notebook Python:\", sys.executable)\n",
"# test paddle ocr run with params\n",
"test_proc = subprocess.run([sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"1\"], capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
"last = test_proc.stdout.strip().splitlines()[-1]\n",
"\n",
"metrics = json.loads(last)\n",
"print(metrics)\n",
"\n",
"print(f\"return code: {test_proc.returncode}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8df28468",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-11-12 10:13:25,930\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\n",
"
\n",
"
Tune Status
\n",
"
\n",
"\n",
"| Current time: | 2025-11-12 10:16:28 |
\n",
"| Running for: | 00:03:02.06 |
\n",
"| Memory: | 21.7/31.8 GiB |
\n",
"\n",
"
\n",
"
\n",
"
\n",
"
\n",
"
System Info
\n",
" Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 8.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)\n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"
Trial Status
\n",
"
\n",
"\n",
"| Trial name | status | loc | dpi | line_tolerance | min_box_score | text_det_box_thresh | text_det_unclip_rati\n",
"o | text_rec_score_thres\n",
"h | textline_orientation |
\n",
"\n",
"\n",
"| trainable_paddle_ocr_2c044_00000 | RUNNING | 127.0.0.1:16412 | 300 | 0.7 | 0 | 0.5844 | 1.68825 | 0 | False |
\n",
"| trainable_paddle_ocr_2c044_00001 | RUNNING | 127.0.0.1:23300 | 300 | 0.6 | 0.5 | 0.56087 | 1.5857 | 0.4 | False |
\n",
"| trainable_paddle_ocr_2c044_00002 | RUNNING | 127.0.0.1:15080 | 300 | 0.7 | 0.6 | 0.534888 | 1.27986 | 0.2 | True |
\n",
"| trainable_paddle_ocr_2c044_00003 | RUNNING | 127.0.0.1:22208 | 300 | 0.6 | 0.6 | 0.570881 | 1.92797 | 0 | True |
\n",
"| trainable_paddle_ocr_2c044_00004 | RUNNING | 127.0.0.1:6244 | 240 | 0.5 | 0 | 0.445475 | 1.70568 | 0 | True |
\n",
"| trainable_paddle_ocr_2c044_00005 | RUNNING | 127.0.0.1:1252 | 300 | 0.7 | 0.5 | 0.402891 | 1.65377 | 0 | False |
\n",
"| trainable_paddle_ocr_2c044_00006 | RUNNING | 127.0.0.1:4104 | 300 | 0.6 | 0.5 | 0.493143 | 1.26816 | 0.4 | False |
\n",
"| trainable_paddle_ocr_2c044_00007 | RUNNING | 127.0.0.1:15552 | 300 | 0.5 | 0.6 | 0.660866 | 1.52281 | 0.4 | True |
\n",
"\n",
"
\n",
"
\n",
"
\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-11-12 10:13:25,974\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n",
"2025-11-12 10:13:25,980\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n",
"2025-11-12 10:13:25,985\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n",
"2025-11-12 10:13:25,989\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n",
"2025-11-12 10:13:25,993\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n",
"2025-11-12 10:13:25,997\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n",
"2025-11-12 10:13:26,002\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,007\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,014\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,017\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,021\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,023\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,027\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,030\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,033\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:26,035\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,013\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n",
"2025-11-12 10:13:33,016\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n",
"2025-11-12 10:13:33,162\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n",
"2025-11-12 10:13:33,164\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n",
"2025-11-12 10:13:33,179\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n",
"2025-11-12 10:13:33,183\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n",
"2025-11-12 10:13:33,296\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,303\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,322\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,325\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,339\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,341\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,349\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,352\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,388\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n",
"2025-11-12 10:13:33,390\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n"
]
}
],
"source": [
"def trainable_paddle_ocr(config):\n",
"    \"\"\"Run the OCR script in a subprocess for one Ray Tune trial and report its metrics.\n",
"\n",
"    Every entry of ``config`` is turned into a ``--<flag> <value>`` pair (the flag\n",
"    name is looked up in ``KEYMAP``) and appended to the base command line.\n",
"    The last line of the script's stdout is parsed as JSON and forwarded to\n",
"    ``tune.report``.\n",
"\n",
"    If the script exits with a non-zero code, prints nothing, or its last line\n",
"    is not valid JSON, worst-case metrics (CER=1.0, WER=1.0, time=0.0) are\n",
"    reported together with a truncated error message instead of raising.\n",
"    \"\"\"\n",
"    args = [sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"1\"]\n",
"    for k, v in config.items():\n",
"        args += [f\"--{KEYMAP[k]}\", str(v)]\n",
"    # Run the command line that actually carries the sampled hyperparameters\n",
"    proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
"\n",
"    if proc.returncode != 0:\n",
"        tune.report(CER=1.0, WER=1.0, time=0.0, error=proc.stderr[:500])\n",
"        return\n",
"    # last line = JSON with metrics\n",
"    stdout_lines = proc.stdout.strip().splitlines()\n",
"    try:\n",
"        metrics = json.loads(stdout_lines[-1])\n",
"    except (IndexError, ValueError) as exc:\n",
"        # Empty stdout (IndexError) or a non-JSON last line (JSONDecodeError is a ValueError)\n",
"        tune.report(CER=1.0, WER=1.0, time=0.0, error=f\"unparseable script output: {exc}\"[:500])\n",
"        return\n",
"    tune.report(**metrics)\n",
"\n",
"# Scheduler handed to Ray Tune for this search\n",
"scheduler = ASHAScheduler(grace_period=1, reduction_factor=2)\n",
"\n",
"tuner = tune.Tuner(\n",
"    trainable_paddle_ocr,\n",
"    tune_config=tune.TuneConfig(\n",
"        metric=\"CER\",\n",
"        mode=\"min\",\n",
"        scheduler=scheduler,\n",
"        num_samples=8,\n",
"        # Short per-trial directory names: the default name embeds every hyperparameter\n",
"        # and exceeds the 260-character Windows path limit (Ray warns about it).\n",
"        trial_dirname_creator=lambda trial: f\"trial_{trial.trial_id}\",\n",
"    ),\n",
"    param_space=search_space,\n",
"    run_config=air.RunConfig(\n",
"        log_to_file=False,  # <- stream stdout/stderr to the notebook instead of files\n",
"        verbose=2           # 0=silent, 1=brief, 2=default, 3=debuggy\n",
"    ),\n",
")\n",
"\n",
"# Run the search and rank trials by character error rate (lower is better)\n",
"results = tuner.fit()\n",
"df = results.get_dataframe().sort_values(\"CER\", ascending=True)\n",
"cols = [\"dpi\",\"textline_orientation\",\"text_det_box_thresh\",\"text_det_unclip_ratio\",\n",
"        \"text_rec_score_thresh\",\"line_tolerance\",\"pages_per_pdf\",\"lang\",\"CER\",\"WER\",\"time\"]\n",
"# Keep only the columns actually present so a run whose trials reported fewer metrics still prints\n",
"cols = [c for c in cols if c in df.columns]\n",
"print(df[cols].head(10))\n",
"df.to_csv(\"raytune_paddle_subproc_results.csv\", index=False)\n",
"print(\" Guardado: raytune_paddle_subproc_results.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv (3.11.9)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}