{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "2813d34d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "be3c1872", "metadata": {}, "source": [ "# AI-based OCR Benchmark Notebook\n", "\n", "This notebook benchmarks **AI-based OCR models** on scanned PDF documents/images in Spanish.\n", "It excludes traditional OCR engines like Tesseract that require external installations." ] }, { "cell_type": "code", "execution_count": null, "id": "6a1e98fe", "metadata": {}, "outputs": [], "source": [ "%pip install --upgrade pip\n", "%pip install --upgrade jupyter\n", "%pip install --upgrade ipywidgets\n", "%pip install --upgrade ipykernel\n", "\n", "# Install necessary packages\n", "%pip install transformers torch pdf2image pillow jiwer paddleocr hf_xet paddlepaddle\n", "# pdf reading\n", "%pip install PyMuPDF\n", "\n", "# Data analysis and visualization\n", "%pip install pandas\n", "%pip install matplotlib\n", "%pip install seaborn" ] }, { "cell_type": "code", "execution_count": 4, "id": "ae33632a", "metadata": {}, "outputs": [], "source": [ "# Imports\n", "import os, json\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from pdf2image import convert_from_path\n", "from PIL import Image, ImageOps\n", "import torch\n", "from jiwer import wer, cer\n", "from paddleocr import PaddleOCR\n", "import fitz # PyMuPDF\n", "import re\n", "from datetime import datetime" ] }, { "cell_type": "markdown", "id": "0e00f1b0", "metadata": {}, "source": [ "## 1 Configuration" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "PDF_FOLDER = './instructions' # Folder containing PDF files\n", "OUTPUT_FOLDER = 'results'\n", "os.makedirs(OUTPUT_FOLDER, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 6, "id": "8bd4ca23", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
c:\\Users\\sji\\Desktop\\MastersThesis\\instructions\n",
       "
\n" ], "text/plain": [ "c:\\Users\\sji\\Desktop\\MastersThesis\\instructions\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
c:\\Users\\sji\\Desktop\\MastersThesis\\paddle_ocr_tuning.py\n",
       "
\n" ], "text/plain": [ "c:\\Users\\sji\\Desktop\\MastersThesis\\paddle_ocr_tuning.py\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
c:\\Users\\sji\\Desktop\\MastersThesis\n",
       "
\n"
      ],
      "text/plain": [
       "c:\\Users\\sji\\Desktop\\MastersThesis\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "PDF_FOLDER_ABS = os.path.abspath(PDF_FOLDER)  # ./instructions -> C:\\...\\instructions\n",
    "SCRIPT_ABS = os.path.abspath(\"paddle_ocr_tuning.py\")  # paddle_ocr_tuning.py -> C:\\...\\paddle_ocr_tuning.py\n",
    "SCRIPT_DIR = os.path.dirname(SCRIPT_ABS)\n",
    "\n",
    "print(PDF_FOLDER_ABS)\n",
    "print(SCRIPT_ABS)\n",
    "print(SCRIPT_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "243849b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PaddleOCR pipeline\n",
    "# https://www.paddleocr.ai/v3.0.0/en/version3.x/pipeline_usage/OCR.html#21-command-line\n",
    "from paddleocr import PaddleOCR\n",
    "\n",
    "# Detection / recognition models used for the Spanish (Latin-script) benchmark\n",
    "# https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html#5-models-and-their-supported-languages\n",
    "# NOTE(review): confirm in the table linked above that PP-OCRv5_server_rec covers Spanish accented characters\n",
    "paddleocr_model = PaddleOCR(\n",
    "    text_detection_model_name=\"PP-OCRv5_server_det\",\n",
    "    text_recognition_model_name=\"PP-OCRv5_server_rec\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "329da34a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import paddleocr\n",
    "\n",
    "print(paddleocr.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1541bb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate the installed PaddleOCR package\n",
    "pkg_dir = os.path.dirname(paddleocr.__file__)\n",
    "print(pkg_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "84c999e2",
   "metadata": {},
   "source": [
    "## 2 Helper Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9596c7df",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List, Optional\n",
    "from paddle_ocr_tuning import pdf_to_images, pdf_extract_text, evaluate_text, assemble_from_paddle_result\n",
    "\n",
    "def show_page(img: Image.Image, text: str, scale: float = 1):\n",
    "    \"\"\"\n",
    "    Display a page image inline, captioned with `text`.\n",
    "\n",
    "    Args:\n",
    "        img: PIL image to show; the pixel data itself is not resized.\n",
    "        text: Caption drawn as the axes title; skipped when empty or blank.\n",
    "        scale: Multiplier applied to the figure size, which is derived from\n",
    "            the image's pixel dimensions divided by 100.\n",
    "    \"\"\"\n",
    "    # Figure size in inches, approximated from the pixel dimensions\n",
    "    w, h = img.size\n",
    "    figsize = (w * scale / 100, h * scale / 100)\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=figsize)\n",
    "    ax.imshow(img)\n",
    "    ax.axis(\"off\")\n",
    "    # The caption was previously accepted but never drawn\n",
    "    if text and text.strip():\n",
    "        ax.set_title(text.strip(), fontsize=10)\n",
    "\n",
    "    fig.tight_layout()\n",
    "    plt.show()\n",
    "    # Close explicitly so repeated calls in a loop do not accumulate open figures\n",
    "    plt.close(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e42cae29",
   "metadata": {},
   "source": [
    "## 3 Run AI OCR Benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b55c154",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = []\n",
    "page_range = range(5, 10)  # page numbers evaluated for every PDF\n",
    "\n",
    "# Sorted so the processing order (and the row order of the results) is reproducible\n",
    "for pdf_file in sorted(os.listdir(PDF_FOLDER)):\n",
    "    if not pdf_file.lower().endswith('.pdf'):\n",
    "        continue\n",
    "    pdf_path = os.path.join(PDF_FOLDER, pdf_file)\n",
    "\n",
    "    images = pdf_to_images(pdf_path, 300, page_range)\n",
    "\n",
    "    # Pair each image with its page number; assumes pdf_to_images returns pages in page_range order\n",
    "    for page_num, img in zip(page_range, images):\n",
    "        ref = pdf_extract_text(pdf_path, page_num=page_num)\n",
    "        show_page(img, f\"page: {page_num}\", 0.15)\n",
    "        print(f\"ref: \\n{ref}\")\n",
    "\n",
    "        # Convert PIL image to numpy array\n",
    "        image_array = np.array(img)\n",
    "        out = paddleocr_model.predict(\n",
    "            image_array,\n",
    "            use_doc_orientation_classify=False,\n",
    "            use_doc_unwarping=False,\n",
    "            use_textline_orientation=True\n",
    "        )\n",
    "        # PaddleOCR\n",
    "        paddle_text = assemble_from_paddle_result(out)\n",
    "        print(f\"paddle_text: \\n{paddle_text}\")\n",
    "        results.append({\n",
    "            'PDF': pdf_file,\n",
    "            'Page': page_num,\n",
    "            'Model': 'PaddleOCR',\n",
    "            'Prediction': paddle_text,\n",
    "            **evaluate_text(ref, paddle_text),\n",
    "        })"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0db6dc74",
   "metadata": {},
   "source": [
    "## 4 Save and Analyze Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
"id": "da3155e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_results = pd.DataFrame(results)\n",
    "if df_results.empty:\n",
    "    # With no rows there is no 'Model' column, so the groupby below would fail with a bare KeyError\n",
    "    raise ValueError(f\"No OCR results to save: no PDF pages were processed from {PDF_FOLDER!r}\")\n",
    "\n",
    "# Generate a unique filename with timestamp\n",
    "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
    "filename = f\"ai_ocr_benchmark_finetune_results_{timestamp}.csv\"\n",
    "filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
    "\n",
    "df_results.to_csv(filepath, index=False)\n",
    "print(f\"Benchmark results saved as {filename}\")\n",
    "\n",
    "# Summary by model: mean error rates over all evaluated pages\n",
    "summary = df_results.groupby('Model')[['WER', 'CER']].mean()\n",
    "print(summary)\n",
    "\n",
    "# Plot (label the returned axes directly instead of relying on pyplot's current-axes state)\n",
    "ax = summary.plot(kind='bar', figsize=(8,5), title='AI OCR Benchmark (WER & CER)')\n",
    "ax.set_ylabel('Error Rate')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e0f00c0",
   "metadata": {},
   "source": [
    "### How to read this chart:\n",
    "- CER (Character Error Rate) focuses on raw transcription quality\n",
    "- WER (Word Error Rate) penalizes incorrect tokenization or missing spaces\n",
    "- CER and WER are error metrics, which means:\n",
    "  - Higher values = worse performance\n",
    "  - Lower values = better accuracy"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "830b0e25",
   "metadata": {},
   "source": [
    "# Búsqueda de hiperparámetros\n",
    "https://docs.ray.io/en/latest/tune/index.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a4bd700",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report the interpreter and pip that belong to this kernel, not whatever is first on PATH\n",
    "import sys\n",
    "print(sys.version)\n",
    "%pip --version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "b0cf4bcf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
      "Collecting rich\n",
      " Downloading rich-14.2.0-py3-none-any.whl.metadata (18 kB)\n",
      "Requirement already satisfied: ray[tune] in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (2.51.1)\n",
      "Requirement already satisfied: click!=8.3.0,>=7.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune])
(8.2.1)\n", "Requirement already satisfied: filelock in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (3.20.0)\n", "Requirement already satisfied: jsonschema in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (4.25.1)\n", "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (1.1.2)\n", "Requirement already satisfied: packaging in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (25.0)\n", "Requirement already satisfied: protobuf>=3.20.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (6.33.0)\n", "Requirement already satisfied: pyyaml in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (6.0.2)\n", "Requirement already satisfied: requests in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.32.5)\n", "Requirement already satisfied: pandas in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.3.3)\n", "Requirement already satisfied: tensorboardX>=1.9 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.6.4)\n", "Requirement already satisfied: pyarrow>=9.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (22.0.0)\n", "Requirement already satisfied: fsspec in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2025.10.0)\n", "Collecting markdown-it-py>=2.2.0 (from rich)\n", " Downloading markdown_it_py-4.0.0-py3-none-any.whl.metadata (7.3 kB)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from rich) (2.19.2)\n", "Requirement already satisfied: colorama in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from 
click!=8.3.0,>=7.0->ray[tune]) (0.4.6)\n", "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich)\n", " Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n", "Requirement already satisfied: numpy in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from tensorboardX>=1.9->ray[tune]) (2.3.4)\n", "Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (25.4.0)\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (2025.9.1)\n", "Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (0.37.0)\n", "Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (0.28.0)\n", "Requirement already satisfied: typing-extensions>=4.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from referencing>=0.28.4->jsonschema->ray[tune]) (4.15.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->ray[tune]) (1.17.0)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (3.4.4)\n", "Requirement 
already satisfied: idna<4,>=2.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (3.11)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (2.5.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (2025.10.5)\n", "Downloading rich-14.2.0-py3-none-any.whl (243 kB)\n", "Downloading markdown_it_py-4.0.0-py3-none-any.whl (87 kB)\n", "Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n", "Installing collected packages: mdurl, markdown-it-py, rich\n", "\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- 
-------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 
1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " ---------------------------------------- 3/3 [rich]\n", "\n", "Successfully installed markdown-it-py-4.0.0 mdurl-0.1.2 rich-14.2.0\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# Instalaci贸n de Ray y Ray Tune\n", "%pip install -U \"ray[tune]\" rich" ] }, { "cell_type": "code", "execution_count": 2, "id": "f3ca0b9b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 
"2025-11-12 10:11:36,022\tINFO worker.py:2012 -- Started a local Ray instance.\n", "c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\ray\\_private\\worker.py:2051: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
Ray Tune listo (versión: 2.51.1 )\n",
       "
\n"
      ],
      "text/plain": [
       "Ray Tune listo \u001b[1m(\u001b[0mversión: \u001b[1;36m2.51\u001b[0m.\u001b[1;36m1\u001b[0m \u001b[1m)\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import ray\n",
    "from ray import tune\n",
    "from ray.tune.schedulers import ASHAScheduler\n",
    "\n",
    "# ignore_reinit_error lets this cell be re-run without restarting the kernel\n",
    "ray.init(ignore_reinit_error=True)\n",
    "print(\"Ray Tune listo (versión:\", ray.__version__, \")\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ae5a10c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\ray\\_private\\client_mode_hook.py:104: FutureWarning: `local_mode` is an experimental feature that is no longer maintained and will be removed in the near future. For debugging consider using the Ray distributed debugger.\n",
      " return func(*args, **kwargs)\n",
      "2025-11-12 10:13:21,828\tINFO worker.py:1850 -- Calling ray.init() again after it has already been called.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "77da938a02c94d9ca0630b2c56da3357",
       "version_major": 2,
       "version_minor": 0
      },
      "text/html": [
       "
\n", "
\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Python version:3.11.9
Ray version:2.51.1
\n", "\n", "
\n", "
\n"
      ],
      "text/plain": [
       "RayContext(dashboard_url='', python_version='3.11.9', ray_version='2.51.1', ray_commit='eeb38c79c1af96df29cbacab7b8a823d489237f3')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ===============================================================\n",
    "# RAY TUNE: AUTOMATIC OCR HYPERPARAMETER OPTIMIZATION\n",
    "# ===============================================================\n",
    "\n",
    "from ray import tune, air\n",
    "from ray.tune.schedulers import ASHAScheduler\n",
    "import pandas as pd\n",
    "import time\n",
    "import colorama\n",
    "from rich import print  # NOTE: shadows the built-in print for every cell executed after this one\n",
    "import sys, subprocess\n",
    "\n",
    "colorama.just_fix_windows_console()\n",
    "# Ray is already started by the previous cell; this call only keeps the cell re-runnable.\n",
    "# The deprecated local_mode=True flag was dropped: on re-init the existing session is reused,\n",
    "# so the flag had no effect beyond raising a FutureWarning.\n",
    "ray.init(ignore_reinit_error=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "96c320e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Base experiment configuration ---\n",
    "search_space = {\n",
    "    \"dpi\": tune.choice([240, 300, 360]),\n",
    "    \"textline_orientation\": tune.choice([True, False]),\n",
    "    \"text_det_box_thresh\": tune.uniform(0.4, 0.7),\n",
    "    \"text_det_unclip_ratio\": tune.uniform(1.2, 2.0),\n",
    "    \"text_rec_score_thresh\": tune.choice([0.0, 0.2, 0.4]),\n",
    "    \"line_tolerance\": tune.choice([0.5, 0.6, 0.7]),\n",
    "    \"min_box_score\": tune.choice([0, 0.5, 0.6])\n",
    "}\n",
    "# Maps config keys to their hyphenated form; presumably the CLI flag names of\n",
    "# paddle_ocr_tuning.py (cf. --pages-per-pdf in the smoke test) - confirm against the script\n",
    "KEYMAP = {\n",
    "    \"dpi\": \"dpi\",\n",
    "    \"textline_orientation\": \"textline-orientation\",\n",
    "    \"text_det_box_thresh\": \"text-det-box-thresh\",\n",
    "    \"text_det_unclip_ratio\": \"text-det-unclip-ratio\",\n",
    "    \"text_rec_score_thresh\": \"text-rec-score-thresh\",\n",
    "    \"line_tolerance\": \"line-tolerance\",\n",
    "    \"pages_per_pdf\": \"pages-per-pdf\",\n",
    "    \"min_box_score\": \"min-box-score\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "accb4e9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys, subprocess\n",
    "print(\"Notebook Python:\", sys.executable)\n",
    "# Smoke-test the tuning script on one page per PDF before launching the search\n",
    "test_proc = subprocess.run(\n",
    "    [sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"1\"],\n",
    "    capture_output=True, text=True, cwd=SCRIPT_DIR,\n",
    ")\n",
    "print(f\"return code: {test_proc.returncode}\")\n",
    "\n",
    "stdout_lines = test_proc.stdout.strip().splitlines()\n",
    "if test_proc.returncode != 0 or not stdout_lines:\n",
    "    # Surface the script's own error output instead of failing on [-1] with an opaque IndexError.\n",
    "    # Written to sys.stderr directly so rich's print does not interpret brackets as markup.\n",
    "    sys.stderr.write(test_proc.stderr)\n",
    "    raise RuntimeError(\"paddle_ocr_tuning.py failed or printed no metrics line\")\n",
    "\n",
    "# The last stdout line is parsed as the JSON metrics record\n",
    "metrics = json.loads(stdout_lines[-1])\n",
    "print(metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8df28468",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-11-12 10:13:25,930\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n", "
\n", "
\n", "

Tune Status

\n", " \n", "\n", "\n", "\n", "\n", "\n", "
Current time:2025-11-12 10:16:28
Running for: 00:03:02.06
Memory: 21.7/31.8 GiB
\n", "
\n", "
\n", "
\n", "

System Info

\n", " Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 8.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)\n", "
\n", " \n", "
\n", "
\n", "
\n", "

Trial Status

\n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Trial name status loc dpi line_tolerance min_box_score text_det_box_thresh text_det_unclip_rati\n", "o text_rec_score_thres\n", "htextline_orientation
trainable_paddle_ocr_2c044_00000RUNNING 127.0.0.1:16412 300 0.7 0 0.5844 1.688250 False
trainable_paddle_ocr_2c044_00001RUNNING 127.0.0.1:23300 300 0.6 0.5 0.56087 1.5857 0.4False
trainable_paddle_ocr_2c044_00002RUNNING 127.0.0.1:15080 300 0.7 0.6 0.5348881.279860.2True
trainable_paddle_ocr_2c044_00003RUNNING 127.0.0.1:22208 300 0.6 0.6 0.5708811.927970 True
trainable_paddle_ocr_2c044_00004RUNNING 127.0.0.1:6244 240 0.5 0 0.4454751.705680 True
trainable_paddle_ocr_2c044_00005RUNNING 127.0.0.1:1252 300 0.7 0.5 0.4028911.653770 False
trainable_paddle_ocr_2c044_00006RUNNING 127.0.0.1:4104 300 0.6 0.5 0.4931431.268160.4False
trainable_paddle_ocr_2c044_00007RUNNING 127.0.0.1:15552 300 0.5 0.6 0.6608661.522810.4True
\n", "
\n", "
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "2025-11-12 10:13:25,974\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n", "2025-11-12 10:13:25,980\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n", "2025-11-12 10:13:25,985\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n", "2025-11-12 10:13:25,989\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. 
Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n", "2025-11-12 10:13:25,993\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n", "2025-11-12 10:13:25,997\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n", "2025-11-12 10:13:26,002\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,007\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. 
Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,014\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,017\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,021\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. 
Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,023\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,027\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,030\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,033\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. 
Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:26,035\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,013\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n", "2025-11-12 10:13:33,016\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. 
Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n", "2025-11-12 10:13:33,162\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n", "2025-11-12 10:13:33,164\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n", "2025-11-12 10:13:33,179\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n", "2025-11-12 10:13:33,183\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. 
Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n", "2025-11-12 10:13:33,296\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,303\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,322\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. 
Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,325\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,339\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,341\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,349\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. 
Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,352\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,388\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n", "2025-11-12 10:13:33,390\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. 
Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n" ] } ], "source": [ "def trainable_paddle_ocr(config):\n", " args = [sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"1\"]\n", " for k, v in config.items():\n", " args += [f\"--{KEYMAP[k]}\", str(v)]\n", " proc = subprocess.run([sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"1\"], capture_output=True, text=True, cwd=SCRIPT_DIR)\n", "\n", " if proc.returncode != 0:\n", " tune.report(CER=1.0, WER=1.0, time=0.0, error=proc.stderr[:500])\n", " return\n", " # 煤ltima l铆nea = JSON con m茅tricas\n", " last = proc.stdout.strip().splitlines()[-1]\n", " \n", " metrics = json.loads(last)\n", " tune.report(**metrics)\n", "\n", "scheduler = ASHAScheduler(grace_period=1, reduction_factor=2)\n", "\n", "tuner = tune.Tuner(\n", " trainable_paddle_ocr,\n", " tune_config=tune.TuneConfig(metric=\"CER\", mode=\"min\", scheduler=scheduler, num_samples=8),\n", " param_space=search_space,\n", " run_config=air.RunConfig(\n", " log_to_file=False, # <- stream stdout/stderr to the notebook instead of files\n", " verbose=2 # 0=silent, 1=brief, 2=default, 3=debuggy\n", " ),\n", ")\n", "\n", "results = tuner.fit()\n", "df = results.get_dataframe().sort_values(\"CER\", ascending=True)\n", "cols = [\"dpi\",\"textline_orientation\",\"text_det_box_thresh\",\"text_det_unclip_ratio\",\n", " \"text_rec_score_thresh\",\"line_tolerance\",\"pages_per_pdf\",\"lang\",\"CER\",\"WER\",\"time\"]\n", "print(df[cols].head(10))\n", "df.to_csv(\"raytune_paddle_subproc_results.csv\", index=False)\n", "print(\" Guardado: raytune_paddle_subproc_results.csv\")" ] } ], "metadata": { "kernelspec": { "display_name": 
".venv (3.11.9)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }