{ "cells": [ { "cell_type": "markdown", "id": "be3c1872", "metadata": {}, "source": [ "# AI-based OCR Benchmark Notebook\n", "\n", "This notebook benchmarks **AI-based OCR models** on scanned PDF documents/images in Spanish.\n", "It excludes traditional OCR engines like Tesseract that require external installations." ] }, { "cell_type": "code", "execution_count": null, "id": "6a1e98fe", "metadata": {}, "outputs": [], "source": [ "%pip install --upgrade pip\n", "%pip install --upgrade jupyter\n", "%pip install --upgrade ipywidgets\n", "%pip install --upgrade ipykernel\n", "\n", "# Install necessary packages\n", "%pip install transformers torch pdf2image pillow jiwer paddleocr hf_xet paddlepaddle\n", "# pdf reading\n", "%pip install PyMuPDF\n", "\n", "# Data analysis and visualization\n", "%pip install pandas\n", "%pip install matplotlib\n", "%pip install seaborn" ] }, { "cell_type": "code", "execution_count": 1, "id": "ae33632a", "metadata": {}, "outputs": [], "source": [ "# Imports\n", "import os, json\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from pdf2image import convert_from_path\n", "from PIL import Image, ImageOps\n", "import torch\n", "from jiwer import wer, cer\n", "from paddleocr import PaddleOCR\n", "import fitz # PyMuPDF\n", "import re\n", "from datetime import datetime" ] }, { "cell_type": "markdown", "id": "0e00f1b0", "metadata": {}, "source": [ "## 1 Configuration" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "PDF_FOLDER = './instructions' # Folder containing PDF files\n", "OUTPUT_FOLDER = 'results'\n", "os.makedirs(OUTPUT_FOLDER, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 3, "id": "8bd4ca23", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "c:\\Users\\sji\\Desktop\\MastersThesis\\instructions\n", "c:\\Users\\sji\\Desktop\\MastersThesis\\paddle_ocr_tuning.py\n", "c:\\Users\\sji\\Desktop\\MastersThesis\n" ] } ], "source": [ "PDF_FOLDER_ABS = os.path.abspath(PDF_FOLDER) # ./instructions -> C:\\...\\instructions\n", "SCRIPT_ABS = os.path.abspath(\"paddle_ocr_tuning.py\") # paddle_ocr_tuning.py -> C:\\...\\paddle_ocr_tuning.py\n", "SCRIPT_DIR = os.path.dirname(SCRIPT_ABS)\n", "\n", "print(PDF_FOLDER_ABS)\n", "print(SCRIPT_ABS)\n", "print(SCRIPT_DIR)" ] }, { "cell_type": "code", "execution_count": null, "id": "243849b9", "metadata": {}, "outputs": [], "source": [ "# 3. PaddleOCR \n", "# https://www.paddleocr.ai/v3.0.0/en/version3.x/pipeline_usage/OCR.html?utm_source=chatgpt.com#21-command-line\n", "from paddleocr import PaddleOCR\n", "\n", "# Initialize with better settings for Spanish/Latin text\n", "# https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html?utm_source=chatgpt.com#5-models-and-their-supported-languages\n", "paddleocr_model = PaddleOCR(\n", " text_detection_model_name=\"PP-OCRv5_server_det\",\n", " text_recognition_model_name=\"PP-OCRv5_server_rec\"\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "id": "329da34a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
3.3.1\n",
       "
\n" ], "text/plain": [ "\u001b[1;36m3.3\u001b[0m.\u001b[1;36m1\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import paddleocr\n", "\n", "print(paddleocr.__version__)" ] }, { "cell_type": "code", "execution_count": 10, "id": "b1541bb6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\paddleocr\n",
       "
\n" ], "text/plain": [ "c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\paddleocr\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 1) Locate the installed PaddleOCR package\n", "pkg_dir = os.path.dirname(paddleocr.__file__)\n", "print(pkg_dir)" ] }, { "cell_type": "markdown", "id": "84c999e2", "metadata": {}, "source": [ "## 2 Helper Functions" ] }, { "cell_type": "code", "execution_count": null, "id": "9596c7df", "metadata": {}, "outputs": [], "source": [ "from typing import List, Optional\n", "from paddle_ocr_tuning import pdf_to_images, pdf_extract_text, evaluate_text, assemble_from_paddle_result\n", "\n", "def show_page(img: Image.Image, text: str, scale: float = 1):\n", " \"\"\"\n", " Displays a smaller version of the image with text as a footer.\n", " \"\"\"\n", " # Compute plot size based on image dimensions (but without resizing the image)\n", " w, h = img.size\n", " figsize = (w * scale / 100, h * scale / 100) # convert pixels to inches approx\n", "\n", " fig, ax = plt.subplots(figsize=figsize)\n", " ax.imshow(img)\n", " ax.axis(\"off\")\n", "\n", "\n", " # Add OCR text below the image (footer)\n", " # plt.figtext(0.5, 0.02, text.strip(), wrap=True, ha='center', va='bottom', fontsize=10)\n", " plt.tight_layout()\n", " plt.show()" ] }, { "cell_type": "markdown", "id": "e42cae29", "metadata": {}, "source": [ "## Run AI OCR Benchmark" ] }, { "cell_type": "code", "execution_count": null, "id": "9b55c154", "metadata": {}, "outputs": [], "source": [ "results = []\n", "\n", "for pdf_file in os.listdir(PDF_FOLDER):\n", " if not pdf_file.lower().endswith('.pdf'):\n", " continue\n", " pdf_path = os.path.join(PDF_FOLDER, pdf_file)\n", " page_range = range(5, 10)\n", " \n", " images = pdf_to_images(pdf_path, 300, page_range)\n", " \n", " for i, img in enumerate(images):\n", " # img = preprocess_for_ocr(img)\n", " page_num = page_range[i]\n", " ref = pdf_extract_text(pdf_path, page_num=page_num)\n", " show_page(img, f\"page: {page_num}\", 0.15)\n", " print(f\"ref: \\n{ref}\")\n", " \n", " # Convert PIL image to numpy array\n", " image_array = np.array(img)\n", " out = paddleocr_model.predict(\n", " image_array,\n", " use_doc_orientation_classify=False,\n", " use_doc_unwarping=False,\n", " use_textline_orientation=True\n", " )\n", " # PaddleOCR\n", " paddle_text = assemble_from_paddle_result(out)\n", " print(f\"paddle_text: \\n{paddle_text}\")\n", " results.append({'PDF': pdf_file, 'Page': page_num, 'Model': 'PaddleOCR', 'Prediction': paddle_text, **evaluate_text(ref, paddle_text)})\n", " " ] }, { "cell_type": "markdown", "id": "0db6dc74", "metadata": {}, "source": [ "## 5 Save and Analyze Results" ] }, { "cell_type": "code", "execution_count": null, "id": "da3155e3", "metadata": {}, "outputs": [], "source": [ "df_results = pd.DataFrame(results)\n", "\n", "# Generate a unique filename with timestamp\n", "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", "filename = f\"ai_ocr_benchmark_finetune_results_{timestamp}.csv\"\n", "filepath = os.path.join(OUTPUT_FOLDER, filename)\n", "\n", "df_results.to_csv(filepath, index=False)\n", "print(f\"Benchmark results saved as {filename}\")\n", "\n", "# Summary by model\n", "summary = df_results.groupby('Model')[['WER', 'CER']].mean()\n", "print(summary)\n", "\n", "# Plot\n", "summary.plot(kind='bar', figsize=(8,5), title='AI OCR Benchmark (WER & CER)')\n", "plt.ylabel('Error Rate')\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "3e0f00c0", "metadata": {}, "source": [ "### How to read this chart:\n", "- CER (Character Error Rate) focus on raw transcription quality\n", "- WER (Word Error Rate) penalizes incorrect tokenization or missing spaces\n", "- CER and WER are error metrics, which means:\n", " - Higher values = worse performance\n", " - Lower values = better accuracy" ] }, { "cell_type": "markdown", "id": "830b0e25", "metadata": {}, "source": [ "# Busqueda de hyperparametros\n", "https://docs.ray.io/en/latest/tune/index.html" ] }, { "cell_type": "code", "execution_count": null, "id": "3a4bd700", "metadata": {}, "outputs": [], "source": [ "!python --version\n", "!pip --version" ] }, { "cell_type": "code", "execution_count": 18, "id": "b0cf4bcf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", "Collecting rich\n", " Downloading rich-14.2.0-py3-none-any.whl.metadata (18 kB)\n", "Requirement already satisfied: ray[tune] in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (2.51.1)\n", "Requirement already satisfied: click!=8.3.0,>=7.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (8.2.1)\n", "Requirement already satisfied: filelock in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (3.20.0)\n", "Requirement already satisfied: jsonschema in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (4.25.1)\n", "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (1.1.2)\n", "Requirement already satisfied: packaging in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (25.0)\n", "Requirement already satisfied: protobuf>=3.20.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (6.33.0)\n", "Requirement already satisfied: pyyaml in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (6.0.2)\n", "Requirement already satisfied: requests in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.32.5)\n", "Requirement already satisfied: pandas in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.3.3)\n", "Requirement already satisfied: tensorboardX>=1.9 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.6.4)\n", "Requirement already satisfied: pyarrow>=9.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (22.0.0)\n", "Requirement already satisfied: fsspec in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2025.10.0)\n", "Collecting markdown-it-py>=2.2.0 (from rich)\n", " Downloading markdown_it_py-4.0.0-py3-none-any.whl.metadata (7.3 kB)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from rich) (2.19.2)\n", "Requirement already satisfied: colorama in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from click!=8.3.0,>=7.0->ray[tune]) (0.4.6)\n", "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich)\n", " Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n", "Requirement already satisfied: numpy in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from tensorboardX>=1.9->ray[tune]) (2.3.4)\n", "Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (25.4.0)\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (2025.9.1)\n", "Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (0.37.0)\n", "Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (0.28.0)\n", "Requirement already satisfied: typing-extensions>=4.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from referencing>=0.28.4->jsonschema->ray[tune]) (4.15.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->ray[tune]) (1.17.0)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (3.4.4)\n", "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (3.11)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (2.5.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (2025.10.5)\n", "Downloading rich-14.2.0-py3-none-any.whl (243 kB)\n", "Downloading markdown_it_py-4.0.0-py3-none-any.whl (87 kB)\n", "Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n", "Installing collected packages: mdurl, markdown-it-py, rich\n", "\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ---------------------------------------- 0/3 [mdurl]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " ------------- -------------------------- 1/3 [markdown-it-py]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " -------------------------- ------------- 2/3 [rich]\n", " ---------------------------------------- 3/3 [rich]\n", "\n", "Successfully installed markdown-it-py-4.0.0 mdurl-0.1.2 rich-14.2.0\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# Instalaci贸n de Ray y Ray Tune\n", "%pip install -U \"ray[tune]\" rich" ] }, { "cell_type": "code", "execution_count": 6, "id": "f3ca0b9b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2025-11-12 22:30:42,267\tINFO worker.py:1850 -- Calling ray.init() again after it has already been called.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Ray Tune listo (versi贸n: 2.51.1 )\n" ] } ], "source": [ "import ray\n", "from ray import tune\n", "from ray.tune.schedulers import ASHAScheduler\n", "\n", "ray.init(ignore_reinit_error=True)\n", "print(\"Ray Tune listo (versi贸n:\", ray.__version__, \")\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "ae5a10c4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2025-11-12 22:30:48,318\tINFO worker.py:1850 -- Calling ray.init() again after it has already been called.\n" ] } ], "source": [ "# ===============================================================\n", "# 馃攳 RAY TUNE: OPTIMIZACI脫N AUTOM脕TICA DE HIPERPAR脕METROS OCR\n", "# ===============================================================\n", "\n", "from ray import tune, air\n", "from ray.tune.schedulers import ASHAScheduler\n", "import pandas as pd\n", "import time\n", "import colorama\n", "from rich import print\n", "import sys, subprocess \n", "from rich.console import Console\n", "\n", "colorama.just_fix_windows_console()\n", "ray.init(ignore_reinit_error=True)\n", "\n", "# Tell Ray Tune to use a Jupyter-compatible console\n", "console = Console(force_jupyter=True)" ] }, { "cell_type": "code", "execution_count": 8, "id": "96c320e8", "metadata": {}, "outputs": [], "source": [ "\n", "\n", "# --- Configuraci贸n base del experimento ---\n", "search_space = {\n", " \"dpi\": tune.choice([240, 300, 360]),\n", " \"textline_orientation\": tune.choice([True, False]),\n", " \"text_det_box_thresh\": tune.uniform(0.4, 0.7),\n", " \"text_det_unclip_ratio\": tune.uniform(1.2, 2.0),\n", " \"text_rec_score_thresh\": tune.choice([0.0, 0.2, 0.4]),\n", " \"line_tolerance\": tune.choice([0.5, 0.6, 0.7]),\n", " \"min_box_score\": tune.choice([0, 0.5, 0.6])\n", "}\n", "KEYMAP = {\n", " \"dpi\": \"dpi\",\n", " \"textline_orientation\": \"textline-orientation\",\n", " \"text_det_box_thresh\": \"text-det-box-thresh\",\n", " \"text_det_unclip_ratio\": \"text-det-unclip-ratio\",\n", " \"text_rec_score_thresh\": \"text-rec-score-thresh\",\n", " \"line_tolerance\": \"line-tolerance\",\n", " \"pages_per_pdf\": \"pages-per-pdf\",\n", " \"min_box_score\": \"min-box-score\",\n", "}" ] }, { "cell_type": "code", "execution_count": 7, "id": "accb4e9d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Notebook Python: c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Scripts\\python.exe\n",
       "
\n" ], "text/plain": [ "Notebook Python: c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Scripts\\python.exe\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
{'CER': 0.019801980198019802, 'WER': 0.09090909090909091, 'TIME': 38.859522104263306, 'PAGES': 1}\n",
       "
\n" ], "text/plain": [ "\u001b[1m{\u001b[0m\u001b[32m'CER'\u001b[0m: \u001b[1;36m0.019801980198019802\u001b[0m, \u001b[32m'WER'\u001b[0m: \u001b[1;36m0.09090909090909091\u001b[0m, \u001b[32m'TIME'\u001b[0m: \u001b[1;36m38.859522104263306\u001b[0m, \u001b[32m'PAGES'\u001b[0m: \u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
return code: 0\n",
       "
\n" ], "text/plain": [ "return code: \u001b[1;36m0\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
args: ['c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\.venv\\\\Scripts\\\\python.exe', \n",
       "'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\paddle_ocr_tuning.py', '--pdf-folder', \n",
       "'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\instructions', '--pages-per-pdf', '1', '--dpi', '360', \n",
       "'--textline-orientation', 'True', '--text-det-box-thresh', '0.46611732611383844', '--text-det-unclip-ratio', \n",
       "'1.3598680409827462', '--text-rec-score-thresh', '0.0', '--line-tolerance', '0.5', '--min-box-score', '0.6']\n",
       "
\n" ], "text/plain": [ "args: \u001b[1m[\u001b[0m\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\.venv\\\\Scripts\\\\python.exe'\u001b[0m, \n", "\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\paddle_ocr_tuning.py'\u001b[0m, \u001b[32m'--pdf-folder'\u001b[0m, \n", "\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\instructions'\u001b[0m, \u001b[32m'--pages-per-pdf'\u001b[0m, \u001b[32m'1'\u001b[0m, \u001b[32m'--dpi'\u001b[0m, \u001b[32m'360'\u001b[0m, \n", "\u001b[32m'--textline-orientation'\u001b[0m, \u001b[32m'True'\u001b[0m, \u001b[32m'--text-det-box-thresh'\u001b[0m, \u001b[32m'0.46611732611383844'\u001b[0m, \u001b[32m'--text-det-unclip-ratio'\u001b[0m, \n", "\u001b[32m'1.3598680409827462'\u001b[0m, \u001b[32m'--text-rec-score-thresh'\u001b[0m, \u001b[32m'0.0'\u001b[0m, \u001b[32m'--line-tolerance'\u001b[0m, \u001b[32m'0.5'\u001b[0m, \u001b[32m'--min-box-score'\u001b[0m, \u001b[32m'0.6'\u001b[0m\u001b[1m]\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import sys, subprocess\n", "print(\"Notebook Python:\", sys.executable)\n", "# test paddle ocr run with params\n", "args = [sys.executable, \n", " SCRIPT_ABS, \n", " \"--pdf-folder\", PDF_FOLDER_ABS, \n", " \"--pages-per-pdf\", \"1\",\n", " \"--dpi\",\"360\" ,\n", " \"--textline-orientation\",\"True\",\n", " \"--text-det-box-thresh\",\"0.46611732611383844\",\n", " \"--text-det-unclip-ratio\",\"1.3598680409827462\",\n", " \"--text-rec-score-thresh\",\"0.0\",\n", " \"--line-tolerance\", \"0.5\",\n", " \"--min-box-score\",\"0.6\"]\n", "test_proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n", "if test_proc.returncode != 0:\n", " print(test_proc.stderr)\n", "last = test_proc.stdout.strip().splitlines()[-1]\n", "\n", "metrics = json.loads(last)\n", "print(metrics)\n", "\n", "print(f\"return code: {test_proc.returncode}\")\n", "print(f\"args: {args}\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "8df28468", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\ray\\tune\\impl\\tuner_internal.py:144: RayDeprecationWarning: The `RunConfig` class should be imported from `ray.tune` when passing it to the Tuner. Please update your imports. See this issue for more context and migration options: https://github.com/ray-project/ray/issues/49454. Disable these warnings by setting the environment variable: RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS=0\n", " _log_deprecation_warning(\n", "2025-11-12 22:31:01,166\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n" ] }, { "data": { "text/html": [ "
\n", "
\n", "
\n", "

Tune Status

\n", " \n", "\n", "\n", "\n", "\n", "\n", "
Current time:2025-11-12 22:39:26
Running for: 00:08:25.78
Memory: 9.9/31.8 GiB
\n", "
\n", "
\n", "
\n", "

System Info

\n", " Using AsyncHyperBand: num_stopped=1
Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -0.062382927481937384
Logical resource usage: 1.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)\n", "
\n", " \n", "
\n", "
\n", "
\n", "

Trial Status

\n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Trial name status loc dpi line_tolerance min_box_score text_det_box_thresh text_det_unclip_rati\n", "o text_rec_score_thres\n", "htextline_orientation iter total time (s) CER WER TIME
trainable_paddle_ocr_3632f_00000TERMINATED127.0.0.1:22388 360 0.6 0.6 0.5981391.595 0.2True 1 500.4 0.06845950.414935473.74
trainable_paddle_ocr_3632f_00001TERMINATED127.0.0.1:10796 300 0.6 0.5 0.4180691.618570.2True 1 465.4740.05630630.285714438.892
\n", "
\n", "
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "2025-11-12 22:31:01,216\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:31:01,216\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:31:01,265\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:31:01,265\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:31:06,561\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:31:06,563\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:31:06,605\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:31:06,605\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n" ] }, { "data": { "text/html": [ "
\n", "

Trial Progress

\n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Trial name CER PAGES TIME TIME_PER_PAGE WER
trainable_paddle_ocr_3632f_000000.0684595 2473.74 236.7680.414935
trainable_paddle_ocr_3632f_000010.0563063 2438.892 219.3720.285714
\n", "
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "2025-11-12 22:38:52,093\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:39:26,972\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n", "2025-11-12 22:39:26,988\tINFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/sji/ray_results/trainable_paddle_ocr_2025-11-12_22-31-01' in 0.0087s.\n", "2025-11-12 22:39:26,994\tINFO tune.py:1041 -- Total run time: 505.83 seconds (505.77 seconds for the tuning loop).\n" ] } ], "source": [ "def trainable_paddle_ocr(config):\n", " args = [sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"2\"]\n", " for k, v in config.items():\n", " args += [f\"--{KEYMAP[k]}\", str(v)]\n", " proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n", "\n", " if proc.returncode != 0:\n", " tune.report(CER=1.0, WER=1.0, TIME=0.0, ERROR=proc.stderr[:500])\n", " return\n", " # 煤ltima l铆nea = JSON con m茅tricas\n", " last = proc.stdout.strip().splitlines()[-1]\n", " \n", " metrics = json.loads(last)\n", " tune.report(metrics=metrics)\n", "\n", "scheduler = ASHAScheduler(grace_period=1, reduction_factor=2)\n", "\n", "tuner = tune.Tuner(\n", " trainable_paddle_ocr,\n", " tune_config=tune.TuneConfig(metric=\"CER\", \n", " mode=\"min\", \n", " scheduler=scheduler, \n", " num_samples=2, \n", " max_concurrent_trials=4),\n", " run_config=air.RunConfig(verbose=2, log_to_file=False),\n", " param_space=search_space\n", ")\n", "\n", "results = tuner.fit()\n", "\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "710a67ce", "metadata": {}, "outputs": [], "source": [ "df = results.get_dataframe().sort_values(\"CER\", ascending=True)" ] }, { "cell_type": "code", "execution_count": 11, "id": "1ab345a3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Guardado: raytune_paddle_subproc_results_20251112_223927.csv\n",
       "
\n" ], "text/plain": [ "Guardado: raytune_paddle_subproc_results_20251112_223927.csv\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Generate a unique filename with timestamp\n", "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", "filename = f\"raytune_paddle_subproc_results_{timestamp}.csv\"\n", "filepath = os.path.join(OUTPUT_FOLDER, filename)\n", "\n", "\n", "df.to_csv(filename, index=False)\n", "print(f\"Guardado: {filename}\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "3e3a34e4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CERWERTIMEPAGESTIME_PER_PAGEtimestamptraining_iterationtime_this_iter_stime_total_spidtime_since_restoreiterations_since_restoreconfig/dpiconfig/text_det_box_threshconfig/text_det_unclip_ratioconfig/text_rec_score_threshconfig/line_toleranceconfig/min_box_score
count2.0000002.0000002.0000002.02.0000002.000000e+002.02.0000002.0000002.0000002.0000002.02.0000002.0000002.0000002.02.02.000000
mean0.0623830.350325456.3158702.0228.0702881.762958e+091.0482.937319482.93731916592.000000482.9373191.0330.0000000.5081041.6067870.20.60.550000
std0.0085940.09137324.6417090.012.3005732.404163e+010.024.69645124.6964518196.78180824.6964510.042.4264070.1273290.0166660.00.00.070711
min0.0563060.285714438.8915502.0219.3724691.762958e+091.0465.474291465.47429110796.000000465.4742911.0300.0000000.4180691.5950030.20.60.500000
25%0.0593450.318019447.6037102.0223.7213781.762958e+091.0474.205805474.20580513694.000000474.2058051.0315.0000000.4630861.6008950.20.60.525000
50%0.0623830.350325456.3158702.0228.0702881.762958e+091.0482.937319482.93731916592.000000482.9373191.0330.0000000.5081041.6067870.20.60.550000
75%0.0654210.382630465.0280302.0232.4191971.762958e+091.0491.668833491.66883319490.000000491.6688331.0345.0000000.5531211.6126800.20.60.575000
max0.0684600.414935473.7401902.0236.7681071.762958e+091.0500.400347500.40034722388.000000500.4003471.0360.0000000.5981391.6185720.20.60.600000
\n", "
" ], "text/plain": [ " CER WER TIME PAGES TIME_PER_PAGE timestamp \\\n", "count 2.000000 2.000000 2.000000 2.0 2.000000 2.000000e+00 \n", "mean 0.062383 0.350325 456.315870 2.0 228.070288 1.762958e+09 \n", "std 0.008594 0.091373 24.641709 0.0 12.300573 2.404163e+01 \n", "min 0.056306 0.285714 438.891550 2.0 219.372469 1.762958e+09 \n", "25% 0.059345 0.318019 447.603710 2.0 223.721378 1.762958e+09 \n", "50% 0.062383 0.350325 456.315870 2.0 228.070288 1.762958e+09 \n", "75% 0.065421 0.382630 465.028030 2.0 232.419197 1.762958e+09 \n", "max 0.068460 0.414935 473.740190 2.0 236.768107 1.762958e+09 \n", "\n", " training_iteration time_this_iter_s time_total_s pid \\\n", "count 2.0 2.000000 2.000000 2.000000 \n", "mean 1.0 482.937319 482.937319 16592.000000 \n", "std 0.0 24.696451 24.696451 8196.781808 \n", "min 1.0 465.474291 465.474291 10796.000000 \n", "25% 1.0 474.205805 474.205805 13694.000000 \n", "50% 1.0 482.937319 482.937319 16592.000000 \n", "75% 1.0 491.668833 491.668833 19490.000000 \n", "max 1.0 500.400347 500.400347 22388.000000 \n", "\n", " time_since_restore iterations_since_restore config/dpi \\\n", "count 2.000000 2.0 2.000000 \n", "mean 482.937319 1.0 330.000000 \n", "std 24.696451 0.0 42.426407 \n", "min 465.474291 1.0 300.000000 \n", "25% 474.205805 1.0 315.000000 \n", "50% 482.937319 1.0 330.000000 \n", "75% 491.668833 1.0 345.000000 \n", "max 500.400347 1.0 360.000000 \n", "\n", " config/text_det_box_thresh config/text_det_unclip_ratio \\\n", "count 2.000000 2.000000 \n", "mean 0.508104 1.606787 \n", "std 0.127329 0.016666 \n", "min 0.418069 1.595003 \n", "25% 0.463086 1.600895 \n", "50% 0.508104 1.606787 \n", "75% 0.553121 1.612680 \n", "max 0.598139 1.618572 \n", "\n", " config/text_rec_score_thresh config/line_tolerance \\\n", "count 2.0 2.0 \n", "mean 0.2 0.6 \n", "std 0.0 0.0 \n", "min 0.2 0.6 \n", "25% 0.2 0.6 \n", "50% 0.2 0.6 \n", "75% 0.2 0.6 \n", "max 0.2 0.6 \n", "\n", " config/min_box_score \n", "count 2.000000 \n", "mean 0.550000 \n", "std 0.070711 \n", "min 0.500000 \n", "25% 0.525000 \n", "50% 0.550000 \n", "75% 0.575000 \n", "max 0.600000 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 16, "id": "4ce5eb6a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Correlaci贸n con CER:\n",
       " config/min_box_score            1.0\n",
       "CER                             1.0\n",
       "config/text_det_box_thresh      1.0\n",
       "config/dpi                      1.0\n",
       "config/text_det_unclip_ratio   -1.0\n",
       "config/text_rec_score_thresh    NaN\n",
       "config/line_tolerance           NaN\n",
       "Name: CER, dtype: float64\n",
       "
\n" ], "text/plain": [ "Correlaci贸n con CER:\n", " config/min_box_score \u001b[1;36m1.0\u001b[0m\n", "CER \u001b[1;36m1.0\u001b[0m\n", "config/text_det_box_thresh \u001b[1;36m1.0\u001b[0m\n", "config/dpi \u001b[1;36m1.0\u001b[0m\n", "config/text_det_unclip_ratio \u001b[1;36m-1.0\u001b[0m\n", "config/text_rec_score_thresh NaN\n", "config/line_tolerance NaN\n", "Name: CER, dtype: float64\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Correlaci贸n con WER:\n",
       " config/min_box_score            1.0\n",
       "config/dpi                      1.0\n",
       "config/text_det_box_thresh      1.0\n",
       "WER                             1.0\n",
       "config/text_det_unclip_ratio   -1.0\n",
       "config/text_rec_score_thresh    NaN\n",
       "config/line_tolerance           NaN\n",
       "Name: WER, dtype: float64\n",
       "
\n" ], "text/plain": [ "Correlaci贸n con WER:\n", " config/min_box_score \u001b[1;36m1.0\u001b[0m\n", "config/dpi \u001b[1;36m1.0\u001b[0m\n", "config/text_det_box_thresh \u001b[1;36m1.0\u001b[0m\n", "WER \u001b[1;36m1.0\u001b[0m\n", "config/text_det_unclip_ratio \u001b[1;36m-1.0\u001b[0m\n", "config/text_rec_score_thresh NaN\n", "config/line_tolerance NaN\n", "Name: WER, dtype: float64\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "param_cols = [\n", " \"config/dpi\",\n", " \"config/text_det_box_thresh\",\n", " \"config/text_det_unclip_ratio\",\n", " \"config/text_rec_score_thresh\",\n", " \"config/line_tolerance\",\n", " \"config/min_box_score\",\n", "]\n", "# Correlaci贸n de Pearson con CER y WER\n", "corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n", "corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n", "\n", "print(\"Correlaci贸n con CER:\\n\", corr_cer)\n", "print(\"Correlaci贸n con WER:\\n\", corr_wer)" ] }, { "cell_type": "code", "execution_count": 13, "id": "02fc0a87", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.scatter(df[\"config/text_det_box_thresh\"], df[\"CER\"])\n", "plt.xlabel(\"Detection Box Threshold\")\n", "plt.ylabel(\"CER\")\n", "plt.title(\"Effect of Detection Threshold on Character Error Rate\")\n", "plt.show()\n", "\n", "plt.scatter(df[\"config/line_tolerance\"], df[\"WER\"])\n", "plt.xlabel(\"Line Tolerance\")\n", "plt.ylabel(\"WER\")\n", "plt.title(\"Effect of Line Tolerance on Word Error Rate\")\n", "plt.show()\n" ] } ], "metadata": { "kernelspec": { "display_name": ".venv (3.11.9)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }