1320 lines
108 KiB
Plaintext
1320 lines
108 KiB
Plaintext
|
|
{
|
||
|
|
"cells": [
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "be3c1872",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"# AI-based OCR Benchmark Notebook\n",
|
||
|
|
"\n",
|
||
|
|
"This notebook benchmarks **AI-based OCR models** on scanned PDF documents/images in Spanish.\n",
|
||
|
|
"It excludes traditional OCR engines like Tesseract that require external installations."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "6a1e98fe",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"%pip install --upgrade pip\n",
|
||
|
|
"%pip install --upgrade jupyter\n",
|
||
|
|
"%pip install --upgrade ipywidgets\n",
|
||
|
|
"%pip install --upgrade ipykernel\n",
|
||
|
|
"\n",
|
||
|
|
"# Install necessary packages\n",
|
||
|
|
"%pip install transformers torch pdf2image pillow jiwer paddleocr hf_xet paddlepaddle\n",
|
||
|
|
"# pdf reading\n",
|
||
|
|
"%pip install PyMuPDF\n",
|
||
|
|
"\n",
|
||
|
|
"# Data analysis and visualization\n",
|
||
|
|
"%pip install pandas\n",
|
||
|
|
"%pip install matplotlib\n",
|
||
|
|
"%pip install seaborn"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 1,
|
||
|
|
"id": "ae33632a",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Imports\n",
|
||
|
|
"import os, json\n",
|
||
|
|
"import numpy as np\n",
|
||
|
|
"import pandas as pd\n",
|
||
|
|
"import matplotlib.pyplot as plt\n",
|
||
|
|
"from pdf2image import convert_from_path\n",
|
||
|
|
"from PIL import Image, ImageOps\n",
|
||
|
|
"import torch\n",
|
||
|
|
"from jiwer import wer, cer\n",
|
||
|
|
"from paddleocr import PaddleOCR\n",
|
||
|
|
"import fitz # PyMuPDF\n",
|
||
|
|
"import re\n",
|
||
|
|
"from datetime import datetime"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "0e00f1b0",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 1 Configuration"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 2,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"PDF_FOLDER = './instructions' # Folder containing PDF files\n",
|
||
|
|
"OUTPUT_FOLDER = 'results'\n",
|
||
|
|
"os.makedirs(OUTPUT_FOLDER, exist_ok=True)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 3,
|
||
|
|
"id": "8bd4ca23",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"c:\\Users\\sji\\Desktop\\MastersThesis\\instructions\n",
|
||
|
|
"c:\\Users\\sji\\Desktop\\MastersThesis\\paddle_ocr_tuning.py\n",
|
||
|
|
"c:\\Users\\sji\\Desktop\\MastersThesis\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"PDF_FOLDER_ABS = os.path.abspath(PDF_FOLDER) # ./instructions -> C:\\...\\instructions\n",
|
||
|
|
"SCRIPT_ABS = os.path.abspath(\"paddle_ocr_tuning.py\") # paddle_ocr_tuning.py -> C:\\...\\paddle_ocr_tuning.py\n",
|
||
|
|
"SCRIPT_DIR = os.path.dirname(SCRIPT_ABS)\n",
|
||
|
|
"\n",
|
||
|
|
"print(PDF_FOLDER_ABS)\n",
|
||
|
|
"print(SCRIPT_ABS)\n",
|
||
|
|
"print(SCRIPT_DIR)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "243849b9",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# 3. PaddleOCR \n",
|
||
|
|
"# https://www.paddleocr.ai/v3.0.0/en/version3.x/pipeline_usage/OCR.html?utm_source=chatgpt.com#21-command-line\n",
|
||
|
|
"from paddleocr import PaddleOCR\n",
|
||
|
|
"\n",
|
||
|
|
"# Initialize with better settings for Spanish/Latin text\n",
|
||
|
|
"# https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html?utm_source=chatgpt.com#5-models-and-their-supported-languages\n",
|
||
|
|
"paddleocr_model = PaddleOCR(\n",
|
||
|
|
" text_detection_model_name=\"PP-OCRv5_server_det\",\n",
|
||
|
|
" text_recognition_model_name=\"PP-OCRv5_server_rec\"\n",
|
||
|
|
")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 9,
|
||
|
|
"id": "329da34a",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.3</span>.<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"\u001b[1;36m3.3\u001b[0m.\u001b[1;36m1\u001b[0m\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"import paddleocr\n",
|
||
|
|
"\n",
|
||
|
|
"print(paddleocr.__version__)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 10,
|
||
|
|
"id": "b1541bb6",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\paddleocr\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\paddleocr\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# 1) Locate the installed PaddleOCR package\n",
|
||
|
|
"pkg_dir = os.path.dirname(paddleocr.__file__)\n",
|
||
|
|
"print(pkg_dir)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "84c999e2",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 2 Helper Functions"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "9596c7df",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"from typing import List, Optional\n",
|
||
|
|
"from paddle_ocr_tuning import pdf_to_images, pdf_extract_text, evaluate_text, assemble_from_paddle_result\n",
|
||
|
|
"\n",
|
||
|
|
"def show_page(img: Image.Image, text: str, scale: float = 1):\n",
|
||
|
|
" \"\"\"\n",
|
||
|
|
" Displays a smaller version of the image with text as a footer.\n",
|
||
|
|
" \"\"\"\n",
|
||
|
|
" # Compute plot size based on image dimensions (but without resizing the image)\n",
|
||
|
|
" w, h = img.size\n",
|
||
|
|
" figsize = (w * scale / 100, h * scale / 100) # convert pixels to inches approx\n",
|
||
|
|
"\n",
|
||
|
|
" fig, ax = plt.subplots(figsize=figsize)\n",
|
||
|
|
" ax.imshow(img)\n",
|
||
|
|
" ax.axis(\"off\")\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
" # Add OCR text below the image (footer)\n",
|
||
|
|
" # plt.figtext(0.5, 0.02, text.strip(), wrap=True, ha='center', va='bottom', fontsize=10)\n",
|
||
|
|
" plt.tight_layout()\n",
|
||
|
|
" plt.show()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "e42cae29",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## Run AI OCR Benchmark"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "9b55c154",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"results = []\n",
|
||
|
|
"\n",
|
||
|
|
"for pdf_file in os.listdir(PDF_FOLDER):\n",
|
||
|
|
" if not pdf_file.lower().endswith('.pdf'):\n",
|
||
|
|
" continue\n",
|
||
|
|
" pdf_path = os.path.join(PDF_FOLDER, pdf_file)\n",
|
||
|
|
" page_range = range(5, 10)\n",
|
||
|
|
" \n",
|
||
|
|
" images = pdf_to_images(pdf_path, 300, page_range)\n",
|
||
|
|
" \n",
|
||
|
|
" for i, img in enumerate(images):\n",
|
||
|
|
" # img = preprocess_for_ocr(img)\n",
|
||
|
|
" page_num = page_range[i]\n",
|
||
|
|
" ref = pdf_extract_text(pdf_path, page_num=page_num)\n",
|
||
|
|
" show_page(img, f\"page: {page_num}\", 0.15)\n",
|
||
|
|
" print(f\"ref: \\n{ref}\")\n",
|
||
|
|
" \n",
|
||
|
|
" # Convert PIL image to numpy array\n",
|
||
|
|
" image_array = np.array(img)\n",
|
||
|
|
" out = paddleocr_model.predict(\n",
|
||
|
|
" image_array,\n",
|
||
|
|
" use_doc_orientation_classify=False,\n",
|
||
|
|
" use_doc_unwarping=False,\n",
|
||
|
|
" use_textline_orientation=True\n",
|
||
|
|
" )\n",
|
||
|
|
" # PaddleOCR\n",
|
||
|
|
" paddle_text = assemble_from_paddle_result(out)\n",
|
||
|
|
" print(f\"paddle_text: \\n{paddle_text}\")\n",
|
||
|
|
" results.append({'PDF': pdf_file, 'Page': page_num, 'Model': 'PaddleOCR', 'Prediction': paddle_text, **evaluate_text(ref, paddle_text)})\n",
|
||
|
|
" "
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "0db6dc74",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 5 Save and Analyze Results"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "da3155e3",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"df_results = pd.DataFrame(results)\n",
|
||
|
|
"\n",
|
||
|
|
"# Generate a unique filename with timestamp\n",
|
||
|
|
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
|
||
|
|
"filename = f\"ai_ocr_benchmark_finetune_results_{timestamp}.csv\"\n",
|
||
|
|
"filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
|
||
|
|
"\n",
|
||
|
|
"df_results.to_csv(filepath, index=False)\n",
|
||
|
|
"print(f\"Benchmark results saved as {filename}\")\n",
|
||
|
|
"\n",
|
||
|
|
"# Summary by model\n",
|
||
|
|
"summary = df_results.groupby('Model')[['WER', 'CER']].mean()\n",
|
||
|
|
"print(summary)\n",
|
||
|
|
"\n",
|
||
|
|
"# Plot\n",
|
||
|
|
"summary.plot(kind='bar', figsize=(8,5), title='AI OCR Benchmark (WER & CER)')\n",
|
||
|
|
"plt.ylabel('Error Rate')\n",
|
||
|
|
"plt.show()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "3e0f00c0",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"### How to read this chart:\n",
|
||
|
|
"- CER (Character Error Rate) focus on raw transcription quality\n",
|
||
|
|
"- WER (Word Error Rate) penalizes incorrect tokenization or missing spaces\n",
|
||
|
|
"- CER and WER are error metrics, which means:\n",
|
||
|
|
" - Higher values = worse performance\n",
|
||
|
|
" - Lower values = better accuracy"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "830b0e25",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"# Busqueda de hyperparametros\n",
|
||
|
|
"https://docs.ray.io/en/latest/tune/index.html"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "3a4bd700",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"!python --version\n",
|
||
|
|
"!pip --version"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 18,
|
||
|
|
"id": "b0cf4bcf",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Collecting rich\n",
|
||
|
|
" Downloading rich-14.2.0-py3-none-any.whl.metadata (18 kB)\n",
|
||
|
|
"Requirement already satisfied: ray[tune] in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (2.51.1)\n",
|
||
|
|
"Requirement already satisfied: click!=8.3.0,>=7.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (8.2.1)\n",
|
||
|
|
"Requirement already satisfied: filelock in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (3.20.0)\n",
|
||
|
|
"Requirement already satisfied: jsonschema in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (4.25.1)\n",
|
||
|
|
"Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (1.1.2)\n",
|
||
|
|
"Requirement already satisfied: packaging in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (25.0)\n",
|
||
|
|
"Requirement already satisfied: protobuf>=3.20.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (6.33.0)\n",
|
||
|
|
"Requirement already satisfied: pyyaml in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (6.0.2)\n",
|
||
|
|
"Requirement already satisfied: requests in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.32.5)\n",
|
||
|
|
"Requirement already satisfied: pandas in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.3.3)\n",
|
||
|
|
"Requirement already satisfied: tensorboardX>=1.9 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2.6.4)\n",
|
||
|
|
"Requirement already satisfied: pyarrow>=9.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (22.0.0)\n",
|
||
|
|
"Requirement already satisfied: fsspec in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ray[tune]) (2025.10.0)\n",
|
||
|
|
"Collecting markdown-it-py>=2.2.0 (from rich)\n",
|
||
|
|
" Downloading markdown_it_py-4.0.0-py3-none-any.whl.metadata (7.3 kB)\n",
|
||
|
|
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from rich) (2.19.2)\n",
|
||
|
|
"Requirement already satisfied: colorama in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from click!=8.3.0,>=7.0->ray[tune]) (0.4.6)\n",
|
||
|
|
"Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich)\n",
|
||
|
|
" Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n",
|
||
|
|
"Requirement already satisfied: numpy in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from tensorboardX>=1.9->ray[tune]) (2.3.4)\n",
|
||
|
|
"Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (25.4.0)\n",
|
||
|
|
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (2025.9.1)\n",
|
||
|
|
"Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (0.37.0)\n",
|
||
|
|
"Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema->ray[tune]) (0.28.0)\n",
|
||
|
|
"Requirement already satisfied: typing-extensions>=4.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from referencing>=0.28.4->jsonschema->ray[tune]) (4.15.0)\n",
|
||
|
|
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2.9.0.post0)\n",
|
||
|
|
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n",
|
||
|
|
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n",
|
||
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->ray[tune]) (1.17.0)\n",
|
||
|
|
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (3.4.4)\n",
|
||
|
|
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (3.11)\n",
|
||
|
|
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (2.5.0)\n",
|
||
|
|
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->ray[tune]) (2025.10.5)\n",
|
||
|
|
"Downloading rich-14.2.0-py3-none-any.whl (243 kB)\n",
|
||
|
|
"Downloading markdown_it_py-4.0.0-py3-none-any.whl (87 kB)\n",
|
||
|
|
"Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
|
||
|
|
"Installing collected packages: mdurl, markdown-it-py, rich\n",
|
||
|
|
"\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ---------------------------------------- 0/3 [mdurl]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" ------------- -------------------------- 1/3 [markdown-it-py]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" -------------------------- ------------- 2/3 [rich]\n",
|
||
|
|
" ---------------------------------------- 3/3 [rich]\n",
|
||
|
|
"\n",
|
||
|
|
"Successfully installed markdown-it-py-4.0.0 mdurl-0.1.2 rich-14.2.0\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# Instalación de Ray y Ray Tune\n",
|
||
|
|
"%pip install -U \"ray[tune]\" rich"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 6,
|
||
|
|
"id": "f3ca0b9b",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"2025-11-12 22:30:42,267\tINFO worker.py:1850 -- Calling ray.init() again after it has already been called.\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"Ray Tune listo (versión: 2.51.1 )\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"import ray\n",
|
||
|
|
"from ray import tune\n",
|
||
|
|
"from ray.tune.schedulers import ASHAScheduler\n",
|
||
|
|
"\n",
|
||
|
|
"ray.init(ignore_reinit_error=True)\n",
|
||
|
|
"print(\"Ray Tune listo (versión:\", ray.__version__, \")\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 7,
|
||
|
|
"id": "ae5a10c4",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"2025-11-12 22:30:48,318\tINFO worker.py:1850 -- Calling ray.init() again after it has already been called.\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# ===============================================================\n",
|
||
|
|
"# 🔍 RAY TUNE: OPTIMIZACIÓN AUTOMÁTICA DE HIPERPARÁMETROS OCR\n",
|
||
|
|
"# ===============================================================\n",
|
||
|
|
"\n",
|
||
|
|
"from ray import tune, air\n",
|
||
|
|
"from ray.tune.schedulers import ASHAScheduler\n",
|
||
|
|
"import pandas as pd\n",
|
||
|
|
"import time\n",
|
||
|
|
"import colorama\n",
|
||
|
|
"from rich import print\n",
|
||
|
|
"import sys, subprocess \n",
|
||
|
|
"from rich.console import Console\n",
|
||
|
|
"\n",
|
||
|
|
"colorama.just_fix_windows_console()\n",
|
||
|
|
"ray.init(ignore_reinit_error=True)\n",
|
||
|
|
"\n",
|
||
|
|
"# Tell Ray Tune to use a Jupyter-compatible console\n",
|
||
|
|
"console = Console(force_jupyter=True)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 8,
|
||
|
|
"id": "96c320e8",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"# --- Configuración base del experimento ---\n",
|
||
|
|
"search_space = {\n",
|
||
|
|
" \"dpi\": tune.choice([240, 300, 360]),\n",
|
||
|
|
" \"textline_orientation\": tune.choice([True, False]),\n",
|
||
|
|
" \"text_det_box_thresh\": tune.uniform(0.4, 0.7),\n",
|
||
|
|
" \"text_det_unclip_ratio\": tune.uniform(1.2, 2.0),\n",
|
||
|
|
" \"text_rec_score_thresh\": tune.choice([0.0, 0.2, 0.4]),\n",
|
||
|
|
" \"line_tolerance\": tune.choice([0.5, 0.6, 0.7]),\n",
|
||
|
|
" \"min_box_score\": tune.choice([0, 0.5, 0.6])\n",
|
||
|
|
"}\n",
|
||
|
|
"KEYMAP = {\n",
|
||
|
|
" \"dpi\": \"dpi\",\n",
|
||
|
|
" \"textline_orientation\": \"textline-orientation\",\n",
|
||
|
|
" \"text_det_box_thresh\": \"text-det-box-thresh\",\n",
|
||
|
|
" \"text_det_unclip_ratio\": \"text-det-unclip-ratio\",\n",
|
||
|
|
" \"text_rec_score_thresh\": \"text-rec-score-thresh\",\n",
|
||
|
|
" \"line_tolerance\": \"line-tolerance\",\n",
|
||
|
|
" \"pages_per_pdf\": \"pages-per-pdf\",\n",
|
||
|
|
" \"min_box_score\": \"min-box-score\",\n",
|
||
|
|
"}"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 7,
|
||
|
|
"id": "accb4e9d",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Notebook Python: c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Scripts\\python.exe\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"Notebook Python: c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Scripts\\python.exe\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'CER'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.019801980198019802</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'WER'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.09090909090909091</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'TIME'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">38.859522104263306</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'PAGES'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span><span style=\"font-weight: bold\">}</span>\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"\u001b[1m{\u001b[0m\u001b[32m'CER'\u001b[0m: \u001b[1;36m0.019801980198019802\u001b[0m, \u001b[32m'WER'\u001b[0m: \u001b[1;36m0.09090909090909091\u001b[0m, \u001b[32m'TIME'\u001b[0m: \u001b[1;36m38.859522104263306\u001b[0m, \u001b[32m'PAGES'\u001b[0m: \u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">return code: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"return code: \u001b[1;36m0\u001b[0m\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">args: <span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\.venv\\\\Scripts\\\\python.exe'</span>, \n",
|
||
|
|
"<span style=\"color: #008000; text-decoration-color: #008000\">'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\paddle_ocr_tuning.py'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--pdf-folder'</span>, \n",
|
||
|
|
"<span style=\"color: #008000; text-decoration-color: #008000\">'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\instructions'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--pages-per-pdf'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'1'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--dpi'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'360'</span>, \n",
|
||
|
|
"<span style=\"color: #008000; text-decoration-color: #008000\">'--textline-orientation'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'True'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--text-det-box-thresh'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'0.46611732611383844'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--text-det-unclip-ratio'</span>, \n",
|
||
|
|
"<span style=\"color: #008000; text-decoration-color: #008000\">'1.3598680409827462'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--text-rec-score-thresh'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'0.0'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--line-tolerance'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'0.5'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--min-box-score'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'0.6'</span><span style=\"font-weight: bold\">]</span>\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"args: \u001b[1m[\u001b[0m\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\.venv\\\\Scripts\\\\python.exe'\u001b[0m, \n",
|
||
|
|
"\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\paddle_ocr_tuning.py'\u001b[0m, \u001b[32m'--pdf-folder'\u001b[0m, \n",
|
||
|
|
"\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\instructions'\u001b[0m, \u001b[32m'--pages-per-pdf'\u001b[0m, \u001b[32m'1'\u001b[0m, \u001b[32m'--dpi'\u001b[0m, \u001b[32m'360'\u001b[0m, \n",
|
||
|
|
"\u001b[32m'--textline-orientation'\u001b[0m, \u001b[32m'True'\u001b[0m, \u001b[32m'--text-det-box-thresh'\u001b[0m, \u001b[32m'0.46611732611383844'\u001b[0m, \u001b[32m'--text-det-unclip-ratio'\u001b[0m, \n",
|
||
|
|
"\u001b[32m'1.3598680409827462'\u001b[0m, \u001b[32m'--text-rec-score-thresh'\u001b[0m, \u001b[32m'0.0'\u001b[0m, \u001b[32m'--line-tolerance'\u001b[0m, \u001b[32m'0.5'\u001b[0m, \u001b[32m'--min-box-score'\u001b[0m, \u001b[32m'0.6'\u001b[0m\u001b[1m]\u001b[0m\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"import sys, subprocess\n",
|
||
|
|
"print(\"Notebook Python:\", sys.executable)\n",
|
||
|
|
"# test paddle ocr run with params\n",
|
||
|
|
"args = [sys.executable, \n",
|
||
|
|
" SCRIPT_ABS, \n",
|
||
|
|
" \"--pdf-folder\", PDF_FOLDER_ABS, \n",
|
||
|
|
" \"--pages-per-pdf\", \"1\",\n",
|
||
|
|
" \"--dpi\",\"360\" ,\n",
|
||
|
|
" \"--textline-orientation\",\"True\",\n",
|
||
|
|
" \"--text-det-box-thresh\",\"0.46611732611383844\",\n",
|
||
|
|
" \"--text-det-unclip-ratio\",\"1.3598680409827462\",\n",
|
||
|
|
" \"--text-rec-score-thresh\",\"0.0\",\n",
|
||
|
|
" \"--line-tolerance\", \"0.5\",\n",
|
||
|
|
" \"--min-box-score\",\"0.6\"]\n",
|
||
|
|
"test_proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
|
||
|
|
"if test_proc.returncode != 0:\n",
|
||
|
|
" print(test_proc.stderr)\n",
|
||
|
|
"last = test_proc.stdout.strip().splitlines()[-1]\n",
|
||
|
|
"\n",
|
||
|
|
"metrics = json.loads(last)\n",
|
||
|
|
"print(metrics)\n",
|
||
|
|
"\n",
|
||
|
|
"print(f\"return code: {test_proc.returncode}\")\n",
|
||
|
|
"print(f\"args: {args}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 9,
|
||
|
|
"id": "8df28468",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\ray\\tune\\impl\\tuner_internal.py:144: RayDeprecationWarning: The `RunConfig` class should be imported from `ray.tune` when passing it to the Tuner. Please update your imports. See this issue for more context and migration options: https://github.com/ray-project/ray/issues/49454. Disable these warnings by setting the environment variable: RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS=0\n",
|
||
|
|
" _log_deprecation_warning(\n",
|
||
|
|
"2025-11-12 22:31:01,166\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<div class=\"tuneStatus\">\n",
|
||
|
|
" <div style=\"display: flex;flex-direction: row\">\n",
|
||
|
|
" <div style=\"display: flex;flex-direction: column;\">\n",
|
||
|
|
" <h3>Tune Status</h3>\n",
|
||
|
|
" <table>\n",
|
||
|
|
"<tbody>\n",
|
||
|
|
"<tr><td>Current time:</td><td>2025-11-12 22:39:26</td></tr>\n",
|
||
|
|
"<tr><td>Running for: </td><td>00:08:25.78 </td></tr>\n",
|
||
|
|
"<tr><td>Memory: </td><td>9.9/31.8 GiB </td></tr>\n",
|
||
|
|
"</tbody>\n",
|
||
|
|
"</table>\n",
|
||
|
|
" </div>\n",
|
||
|
|
" <div class=\"vDivider\"></div>\n",
|
||
|
|
" <div class=\"systemInfo\">\n",
|
||
|
|
" <h3>System Info</h3>\n",
|
||
|
|
" Using AsyncHyperBand: num_stopped=1<br>Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -0.062382927481937384<br>Logical resource usage: 1.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)\n",
|
||
|
|
" </div>\n",
|
||
|
|
" \n",
|
||
|
|
" </div>\n",
|
||
|
|
" <div class=\"hDivider\"></div>\n",
|
||
|
|
" <div class=\"trialStatus\">\n",
|
||
|
|
" <h3>Trial Status</h3>\n",
|
||
|
|
" <table>\n",
|
||
|
|
"<thead>\n",
|
||
|
|
"<tr><th>Trial name </th><th>status </th><th>loc </th><th style=\"text-align: right;\"> dpi</th><th style=\"text-align: right;\"> line_tolerance</th><th style=\"text-align: right;\"> min_box_score</th><th style=\"text-align: right;\"> text_det_box_thresh</th><th style=\"text-align: right;\"> text_det_unclip_rati\n",
|
||
|
|
"o</th><th style=\"text-align: right;\"> text_rec_score_thres\n",
|
||
|
|
"h</th><th>textline_orientation </th><th style=\"text-align: right;\"> iter</th><th style=\"text-align: right;\"> total time (s)</th><th style=\"text-align: right;\"> CER</th><th style=\"text-align: right;\"> WER</th><th style=\"text-align: right;\"> TIME</th></tr>\n",
|
||
|
|
"</thead>\n",
|
||
|
|
"<tbody>\n",
|
||
|
|
"<tr><td>trainable_paddle_ocr_3632f_00000</td><td>TERMINATED</td><td>127.0.0.1:22388</td><td style=\"text-align: right;\"> 360</td><td style=\"text-align: right;\"> 0.6</td><td style=\"text-align: right;\"> 0.6</td><td style=\"text-align: right;\"> 0.598139</td><td style=\"text-align: right;\">1.595 </td><td style=\"text-align: right;\">0.2</td><td>True </td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 500.4 </td><td style=\"text-align: right;\">0.0684595</td><td style=\"text-align: right;\">0.414935</td><td style=\"text-align: right;\">473.74 </td></tr>\n",
|
||
|
|
"<tr><td>trainable_paddle_ocr_3632f_00001</td><td>TERMINATED</td><td>127.0.0.1:10796</td><td style=\"text-align: right;\"> 300</td><td style=\"text-align: right;\"> 0.6</td><td style=\"text-align: right;\"> 0.5</td><td style=\"text-align: right;\"> 0.418069</td><td style=\"text-align: right;\">1.61857</td><td style=\"text-align: right;\">0.2</td><td>True </td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 465.474</td><td style=\"text-align: right;\">0.0563063</td><td style=\"text-align: right;\">0.285714</td><td style=\"text-align: right;\">438.892</td></tr>\n",
|
||
|
|
"</tbody>\n",
|
||
|
|
"</table>\n",
|
||
|
|
" </div>\n",
|
||
|
|
"</div>\n",
|
||
|
|
"<style>\n",
|
||
|
|
".tuneStatus {\n",
|
||
|
|
" color: var(--jp-ui-font-color1);\n",
|
||
|
|
"}\n",
|
||
|
|
".tuneStatus .systemInfo {\n",
|
||
|
|
" display: flex;\n",
|
||
|
|
" flex-direction: column;\n",
|
||
|
|
"}\n",
|
||
|
|
".tuneStatus td {\n",
|
||
|
|
" white-space: nowrap;\n",
|
||
|
|
"}\n",
|
||
|
|
".tuneStatus .trialStatus {\n",
|
||
|
|
" display: flex;\n",
|
||
|
|
" flex-direction: column;\n",
|
||
|
|
"}\n",
|
||
|
|
".tuneStatus h3 {\n",
|
||
|
|
" font-weight: bold;\n",
|
||
|
|
"}\n",
|
||
|
|
".tuneStatus .hDivider {\n",
|
||
|
|
" border-bottom-width: var(--jp-border-width);\n",
|
||
|
|
" border-bottom-color: var(--jp-border-color0);\n",
|
||
|
|
" border-bottom-style: solid;\n",
|
||
|
|
"}\n",
|
||
|
|
".tuneStatus .vDivider {\n",
|
||
|
|
" border-left-width: var(--jp-border-width);\n",
|
||
|
|
" border-left-color: var(--jp-border-color0);\n",
|
||
|
|
" border-left-style: solid;\n",
|
||
|
|
" margin: 0.5em 1em 0.5em 1em;\n",
|
||
|
|
"}\n",
|
||
|
|
"</style>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"<IPython.core.display.HTML object>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"2025-11-12 22:31:01,216\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:31:01,216\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:31:01,265\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:31:01,265\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:31:06,561\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:31:06,563\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:31:06,605\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:31:06,605\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<div class=\"trialProgress\">\n",
|
||
|
|
" <h3>Trial Progress</h3>\n",
|
||
|
|
" <table>\n",
|
||
|
|
"<thead>\n",
|
||
|
|
"<tr><th>Trial name </th><th style=\"text-align: right;\"> CER</th><th style=\"text-align: right;\"> PAGES</th><th style=\"text-align: right;\"> TIME</th><th style=\"text-align: right;\"> TIME_PER_PAGE</th><th style=\"text-align: right;\"> WER</th></tr>\n",
|
||
|
|
"</thead>\n",
|
||
|
|
"<tbody>\n",
|
||
|
|
"<tr><td>trainable_paddle_ocr_3632f_00000</td><td style=\"text-align: right;\">0.0684595</td><td style=\"text-align: right;\"> 2</td><td style=\"text-align: right;\">473.74 </td><td style=\"text-align: right;\"> 236.768</td><td style=\"text-align: right;\">0.414935</td></tr>\n",
|
||
|
|
"<tr><td>trainable_paddle_ocr_3632f_00001</td><td style=\"text-align: right;\">0.0563063</td><td style=\"text-align: right;\"> 2</td><td style=\"text-align: right;\">438.892</td><td style=\"text-align: right;\"> 219.372</td><td style=\"text-align: right;\">0.285714</td></tr>\n",
|
||
|
|
"</tbody>\n",
|
||
|
|
"</table>\n",
|
||
|
|
"</div>\n",
|
||
|
|
"<style>\n",
|
||
|
|
".trialProgress {\n",
|
||
|
|
" display: flex;\n",
|
||
|
|
" flex-direction: column;\n",
|
||
|
|
" color: var(--jp-ui-font-color1);\n",
|
||
|
|
"}\n",
|
||
|
|
".trialProgress h3 {\n",
|
||
|
|
" font-weight: bold;\n",
|
||
|
|
"}\n",
|
||
|
|
".trialProgress td {\n",
|
||
|
|
" white-space: nowrap;\n",
|
||
|
|
"}\n",
|
||
|
|
"</style>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"<IPython.core.display.HTML object>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"2025-11-12 22:38:52,093\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:39:26,972\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
|
||
|
|
"2025-11-12 22:39:26,988\tINFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/sji/ray_results/trainable_paddle_ocr_2025-11-12_22-31-01' in 0.0087s.\n",
|
||
|
|
"2025-11-12 22:39:26,994\tINFO tune.py:1041 -- Total run time: 505.83 seconds (505.77 seconds for the tuning loop).\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"def trainable_paddle_ocr(config):\n",
|
||
|
|
" args = [sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"2\"]\n",
|
||
|
|
" for k, v in config.items():\n",
|
||
|
|
" args += [f\"--{KEYMAP[k]}\", str(v)]\n",
|
||
|
|
" proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
|
||
|
|
"\n",
|
||
|
|
" if proc.returncode != 0:\n",
|
||
|
|
" tune.report(CER=1.0, WER=1.0, TIME=0.0, ERROR=proc.stderr[:500])\n",
|
||
|
|
" return\n",
|
||
|
|
" # última línea = JSON con métricas\n",
|
||
|
|
" last = proc.stdout.strip().splitlines()[-1]\n",
|
||
|
|
" \n",
|
||
|
|
" metrics = json.loads(last)\n",
|
||
|
|
" tune.report(metrics=metrics)\n",
|
||
|
|
"\n",
|
||
|
|
"scheduler = ASHAScheduler(grace_period=1, reduction_factor=2)\n",
|
||
|
|
"\n",
|
||
|
|
"tuner = tune.Tuner(\n",
|
||
|
|
" trainable_paddle_ocr,\n",
|
||
|
|
" tune_config=tune.TuneConfig(metric=\"CER\", \n",
|
||
|
|
" mode=\"min\", \n",
|
||
|
|
" scheduler=scheduler, \n",
|
||
|
|
" num_samples=2, \n",
|
||
|
|
" max_concurrent_trials=4),\n",
|
||
|
|
" run_config=air.RunConfig(verbose=2, log_to_file=False),\n",
|
||
|
|
" param_space=search_space\n",
|
||
|
|
")\n",
|
||
|
|
"\n",
|
||
|
|
"results = tuner.fit()\n",
|
||
|
|
"\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 10,
|
||
|
|
"id": "710a67ce",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"df = results.get_dataframe().sort_values(\"CER\", ascending=True)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 11,
|
||
|
|
"id": "1ab345a3",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Guardado: raytune_paddle_subproc_results_20251112_223927.csv\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"Guardado: raytune_paddle_subproc_results_20251112_223927.csv\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# Generate a unique filename with timestamp\n",
|
||
|
|
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
|
||
|
|
"filename = f\"raytune_paddle_subproc_results_{timestamp}.csv\"\n",
|
||
|
|
"filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"df.to_csv(filename, index=False)\n",
|
||
|
|
"print(f\"Guardado: {filename}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 12,
|
||
|
|
"id": "3e3a34e4",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<div>\n",
|
||
|
|
"<style scoped>\n",
|
||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
|
" vertical-align: middle;\n",
|
||
|
|
" }\n",
|
||
|
|
"\n",
|
||
|
|
" .dataframe tbody tr th {\n",
|
||
|
|
" vertical-align: top;\n",
|
||
|
|
" }\n",
|
||
|
|
"\n",
|
||
|
|
" .dataframe thead th {\n",
|
||
|
|
" text-align: right;\n",
|
||
|
|
" }\n",
|
||
|
|
"</style>\n",
|
||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
|
" <thead>\n",
|
||
|
|
" <tr style=\"text-align: right;\">\n",
|
||
|
|
" <th></th>\n",
|
||
|
|
" <th>CER</th>\n",
|
||
|
|
" <th>WER</th>\n",
|
||
|
|
" <th>TIME</th>\n",
|
||
|
|
" <th>PAGES</th>\n",
|
||
|
|
" <th>TIME_PER_PAGE</th>\n",
|
||
|
|
" <th>timestamp</th>\n",
|
||
|
|
" <th>training_iteration</th>\n",
|
||
|
|
" <th>time_this_iter_s</th>\n",
|
||
|
|
" <th>time_total_s</th>\n",
|
||
|
|
" <th>pid</th>\n",
|
||
|
|
" <th>time_since_restore</th>\n",
|
||
|
|
" <th>iterations_since_restore</th>\n",
|
||
|
|
" <th>config/dpi</th>\n",
|
||
|
|
" <th>config/text_det_box_thresh</th>\n",
|
||
|
|
" <th>config/text_det_unclip_ratio</th>\n",
|
||
|
|
" <th>config/text_rec_score_thresh</th>\n",
|
||
|
|
" <th>config/line_tolerance</th>\n",
|
||
|
|
" <th>config/min_box_score</th>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" </thead>\n",
|
||
|
|
" <tbody>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>count</th>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.000000e+00</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>2.000000</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>mean</th>\n",
|
||
|
|
" <td>0.062383</td>\n",
|
||
|
|
" <td>0.350325</td>\n",
|
||
|
|
" <td>456.315870</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>228.070288</td>\n",
|
||
|
|
" <td>1.762958e+09</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>482.937319</td>\n",
|
||
|
|
" <td>482.937319</td>\n",
|
||
|
|
" <td>16592.000000</td>\n",
|
||
|
|
" <td>482.937319</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>330.000000</td>\n",
|
||
|
|
" <td>0.508104</td>\n",
|
||
|
|
" <td>1.606787</td>\n",
|
||
|
|
" <td>0.2</td>\n",
|
||
|
|
" <td>0.6</td>\n",
|
||
|
|
" <td>0.550000</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>std</th>\n",
|
||
|
|
" <td>0.008594</td>\n",
|
||
|
|
" <td>0.091373</td>\n",
|
||
|
|
" <td>24.641709</td>\n",
|
||
|
|
" <td>0.0</td>\n",
|
||
|
|
" <td>12.300573</td>\n",
|
||
|
|
" <td>2.404163e+01</td>\n",
|
||
|
|
" <td>0.0</td>\n",
|
||
|
|
" <td>24.696451</td>\n",
|
||
|
|
" <td>24.696451</td>\n",
|
||
|
|
" <td>8196.781808</td>\n",
|
||
|
|
" <td>24.696451</td>\n",
|
||
|
|
" <td>0.0</td>\n",
|
||
|
|
" <td>42.426407</td>\n",
|
||
|
|
" <td>0.127329</td>\n",
|
||
|
|
" <td>0.016666</td>\n",
|
||
|
|
" <td>0.0</td>\n",
|
||
|
|
" <td>0.0</td>\n",
|
||
|
|
" <td>0.070711</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>min</th>\n",
|
||
|
|
" <td>0.056306</td>\n",
|
||
|
|
" <td>0.285714</td>\n",
|
||
|
|
" <td>438.891550</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>219.372469</td>\n",
|
||
|
|
" <td>1.762958e+09</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>465.474291</td>\n",
|
||
|
|
" <td>465.474291</td>\n",
|
||
|
|
" <td>10796.000000</td>\n",
|
||
|
|
" <td>465.474291</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>300.000000</td>\n",
|
||
|
|
" <td>0.418069</td>\n",
|
||
|
|
" <td>1.595003</td>\n",
|
||
|
|
" <td>0.2</td>\n",
|
||
|
|
" <td>0.6</td>\n",
|
||
|
|
" <td>0.500000</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>25%</th>\n",
|
||
|
|
" <td>0.059345</td>\n",
|
||
|
|
" <td>0.318019</td>\n",
|
||
|
|
" <td>447.603710</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>223.721378</td>\n",
|
||
|
|
" <td>1.762958e+09</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>474.205805</td>\n",
|
||
|
|
" <td>474.205805</td>\n",
|
||
|
|
" <td>13694.000000</td>\n",
|
||
|
|
" <td>474.205805</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>315.000000</td>\n",
|
||
|
|
" <td>0.463086</td>\n",
|
||
|
|
" <td>1.600895</td>\n",
|
||
|
|
" <td>0.2</td>\n",
|
||
|
|
" <td>0.6</td>\n",
|
||
|
|
" <td>0.525000</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>50%</th>\n",
|
||
|
|
" <td>0.062383</td>\n",
|
||
|
|
" <td>0.350325</td>\n",
|
||
|
|
" <td>456.315870</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>228.070288</td>\n",
|
||
|
|
" <td>1.762958e+09</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>482.937319</td>\n",
|
||
|
|
" <td>482.937319</td>\n",
|
||
|
|
" <td>16592.000000</td>\n",
|
||
|
|
" <td>482.937319</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>330.000000</td>\n",
|
||
|
|
" <td>0.508104</td>\n",
|
||
|
|
" <td>1.606787</td>\n",
|
||
|
|
" <td>0.2</td>\n",
|
||
|
|
" <td>0.6</td>\n",
|
||
|
|
" <td>0.550000</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>75%</th>\n",
|
||
|
|
" <td>0.065421</td>\n",
|
||
|
|
" <td>0.382630</td>\n",
|
||
|
|
" <td>465.028030</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>232.419197</td>\n",
|
||
|
|
" <td>1.762958e+09</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>491.668833</td>\n",
|
||
|
|
" <td>491.668833</td>\n",
|
||
|
|
" <td>19490.000000</td>\n",
|
||
|
|
" <td>491.668833</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>345.000000</td>\n",
|
||
|
|
" <td>0.553121</td>\n",
|
||
|
|
" <td>1.612680</td>\n",
|
||
|
|
" <td>0.2</td>\n",
|
||
|
|
" <td>0.6</td>\n",
|
||
|
|
" <td>0.575000</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>max</th>\n",
|
||
|
|
" <td>0.068460</td>\n",
|
||
|
|
" <td>0.414935</td>\n",
|
||
|
|
" <td>473.740190</td>\n",
|
||
|
|
" <td>2.0</td>\n",
|
||
|
|
" <td>236.768107</td>\n",
|
||
|
|
" <td>1.762958e+09</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>500.400347</td>\n",
|
||
|
|
" <td>500.400347</td>\n",
|
||
|
|
" <td>22388.000000</td>\n",
|
||
|
|
" <td>500.400347</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>360.000000</td>\n",
|
||
|
|
" <td>0.598139</td>\n",
|
||
|
|
" <td>1.618572</td>\n",
|
||
|
|
" <td>0.2</td>\n",
|
||
|
|
" <td>0.6</td>\n",
|
||
|
|
" <td>0.600000</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" </tbody>\n",
|
||
|
|
"</table>\n",
|
||
|
|
"</div>"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
" CER WER TIME PAGES TIME_PER_PAGE timestamp \\\n",
|
||
|
|
"count 2.000000 2.000000 2.000000 2.0 2.000000 2.000000e+00 \n",
|
||
|
|
"mean 0.062383 0.350325 456.315870 2.0 228.070288 1.762958e+09 \n",
|
||
|
|
"std 0.008594 0.091373 24.641709 0.0 12.300573 2.404163e+01 \n",
|
||
|
|
"min 0.056306 0.285714 438.891550 2.0 219.372469 1.762958e+09 \n",
|
||
|
|
"25% 0.059345 0.318019 447.603710 2.0 223.721378 1.762958e+09 \n",
|
||
|
|
"50% 0.062383 0.350325 456.315870 2.0 228.070288 1.762958e+09 \n",
|
||
|
|
"75% 0.065421 0.382630 465.028030 2.0 232.419197 1.762958e+09 \n",
|
||
|
|
"max 0.068460 0.414935 473.740190 2.0 236.768107 1.762958e+09 \n",
|
||
|
|
"\n",
|
||
|
|
" training_iteration time_this_iter_s time_total_s pid \\\n",
|
||
|
|
"count 2.0 2.000000 2.000000 2.000000 \n",
|
||
|
|
"mean 1.0 482.937319 482.937319 16592.000000 \n",
|
||
|
|
"std 0.0 24.696451 24.696451 8196.781808 \n",
|
||
|
|
"min 1.0 465.474291 465.474291 10796.000000 \n",
|
||
|
|
"25% 1.0 474.205805 474.205805 13694.000000 \n",
|
||
|
|
"50% 1.0 482.937319 482.937319 16592.000000 \n",
|
||
|
|
"75% 1.0 491.668833 491.668833 19490.000000 \n",
|
||
|
|
"max 1.0 500.400347 500.400347 22388.000000 \n",
|
||
|
|
"\n",
|
||
|
|
" time_since_restore iterations_since_restore config/dpi \\\n",
|
||
|
|
"count 2.000000 2.0 2.000000 \n",
|
||
|
|
"mean 482.937319 1.0 330.000000 \n",
|
||
|
|
"std 24.696451 0.0 42.426407 \n",
|
||
|
|
"min 465.474291 1.0 300.000000 \n",
|
||
|
|
"25% 474.205805 1.0 315.000000 \n",
|
||
|
|
"50% 482.937319 1.0 330.000000 \n",
|
||
|
|
"75% 491.668833 1.0 345.000000 \n",
|
||
|
|
"max 500.400347 1.0 360.000000 \n",
|
||
|
|
"\n",
|
||
|
|
" config/text_det_box_thresh config/text_det_unclip_ratio \\\n",
|
||
|
|
"count 2.000000 2.000000 \n",
|
||
|
|
"mean 0.508104 1.606787 \n",
|
||
|
|
"std 0.127329 0.016666 \n",
|
||
|
|
"min 0.418069 1.595003 \n",
|
||
|
|
"25% 0.463086 1.600895 \n",
|
||
|
|
"50% 0.508104 1.606787 \n",
|
||
|
|
"75% 0.553121 1.612680 \n",
|
||
|
|
"max 0.598139 1.618572 \n",
|
||
|
|
"\n",
|
||
|
|
" config/text_rec_score_thresh config/line_tolerance \\\n",
|
||
|
|
"count 2.0 2.0 \n",
|
||
|
|
"mean 0.2 0.6 \n",
|
||
|
|
"std 0.0 0.0 \n",
|
||
|
|
"min 0.2 0.6 \n",
|
||
|
|
"25% 0.2 0.6 \n",
|
||
|
|
"50% 0.2 0.6 \n",
|
||
|
|
"75% 0.2 0.6 \n",
|
||
|
|
"max 0.2 0.6 \n",
|
||
|
|
"\n",
|
||
|
|
" config/min_box_score \n",
|
||
|
|
"count 2.000000 \n",
|
||
|
|
"mean 0.550000 \n",
|
||
|
|
"std 0.070711 \n",
|
||
|
|
"min 0.500000 \n",
|
||
|
|
"25% 0.525000 \n",
|
||
|
|
"50% 0.550000 \n",
|
||
|
|
"75% 0.575000 \n",
|
||
|
|
"max 0.600000 "
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 12,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"df.describe()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 16,
|
||
|
|
"id": "4ce5eb6a",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Correlación con CER:\n",
|
||
|
|
" config/min_box_score <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.0</span>\n",
|
||
|
|
"CER <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.0</span>\n",
|
||
|
|
"config/text_det_box_thresh <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.0</span>\n",
|
||
|
|
"config/dpi <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.0</span>\n",
|
||
|
|
"config/text_det_unclip_ratio <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">-1.0</span>\n",
|
||
|
|
"config/text_rec_score_thresh NaN\n",
|
||
|
|
"config/line_tolerance NaN\n",
|
||
|
|
"Name: CER, dtype: float64\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"Correlación con CER:\n",
|
||
|
|
" config/min_box_score \u001b[1;36m1.0\u001b[0m\n",
|
||
|
|
"CER \u001b[1;36m1.0\u001b[0m\n",
|
||
|
|
"config/text_det_box_thresh \u001b[1;36m1.0\u001b[0m\n",
|
||
|
|
"config/dpi \u001b[1;36m1.0\u001b[0m\n",
|
||
|
|
"config/text_det_unclip_ratio \u001b[1;36m-1.0\u001b[0m\n",
|
||
|
|
"config/text_rec_score_thresh NaN\n",
|
||
|
|
"config/line_tolerance NaN\n",
|
||
|
|
"Name: CER, dtype: float64\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Correlación con WER:\n",
|
||
|
|
" config/min_box_score <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.0</span>\n",
|
||
|
|
"config/dpi <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.0</span>\n",
|
||
|
|
"config/text_det_box_thresh <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.0</span>\n",
|
||
|
|
"WER <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.0</span>\n",
|
||
|
|
"config/text_det_unclip_ratio <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">-1.0</span>\n",
|
||
|
|
"config/text_rec_score_thresh NaN\n",
|
||
|
|
"config/line_tolerance NaN\n",
|
||
|
|
"Name: WER, dtype: float64\n",
|
||
|
|
"</pre>\n"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
"Correlación con WER:\n",
|
||
|
|
" config/min_box_score \u001b[1;36m1.0\u001b[0m\n",
|
||
|
|
"config/dpi \u001b[1;36m1.0\u001b[0m\n",
|
||
|
|
"config/text_det_box_thresh \u001b[1;36m1.0\u001b[0m\n",
|
||
|
|
"WER \u001b[1;36m1.0\u001b[0m\n",
|
||
|
|
"config/text_det_unclip_ratio \u001b[1;36m-1.0\u001b[0m\n",
|
||
|
|
"config/text_rec_score_thresh NaN\n",
|
||
|
|
"config/line_tolerance NaN\n",
|
||
|
|
"Name: WER, dtype: float64\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"param_cols = [\n",
|
||
|
|
" \"config/dpi\",\n",
|
||
|
|
" \"config/text_det_box_thresh\",\n",
|
||
|
|
" \"config/text_det_unclip_ratio\",\n",
|
||
|
|
" \"config/text_rec_score_thresh\",\n",
|
||
|
|
" \"config/line_tolerance\",\n",
|
||
|
|
" \"config/min_box_score\",\n",
|
||
|
|
"]\n",
|
||
|
|
"# Correlación de Pearson con CER y WER\n",
|
||
|
|
"corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n",
|
||
|
|
"corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n",
|
||
|
|
"\n",
|
||
|
|
"print(\"Correlación con CER:\\n\", corr_cer)\n",
|
||
|
|
"print(\"Correlación con WER:\\n\", corr_wer)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 13,
|
||
|
|
"id": "02fc0a87",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkoAAAHHCAYAAABA5XcCAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAASQRJREFUeJzt3QncjXX+//GP/ZatspM1+y67DClZImuFaSI/07QIw4xCojITLcr8oqRF2zSMlLIkIoYoa0lZSkqbLdlDcf0f7+/vf505577PdW/d7nOf2+v5eBzuc13fa98+57tdOTzP8wwAAABJ5Ew6CAAAAARKAAAAySBHCQAAIACBEgAAQAACJQAAgAAESgAAAAEIlAAAAAIQKAEAAAQgUAIAAAhAoJQNHTt2zP74xz9aqVKlLEeOHPbnP//ZDd+7d69dd911VrRoUTd88uTJFu/bdL554YUX3PZ/9dVXllVdccUVVqdOHcvO66NjcN9996WYTmmUNp7PtfXr18d6VYCYIlCKs5tW0OeDDz4IpX3wwQdd+ttvv91efvllu+mmm9zwYcOG2TvvvGOjRo1ywzt27Jjh66llz50795zMN9o2RVOxYsXQfsmZM6ddeOGFVrduXfvTn/5kH3744W9ajyeffNKtx7l2rvZjevgP+5Q+CkgQH9544w3r1KmTFStWzPLmzWtlypSxG264wZYtW2bZwcKFC1MVyGakm2++OfDaSEhIsKwq/H6pT4ECBaxp06b20ksvxdX+P5dyn9O5I8M98MADVqlSpSTDq1SpEvpbN7vmzZvbuHHjItJoeLdu3eyvf/3rOX3AK9eqe/fuGTrfoG0K0qBBA/vLX/7i/j569Kht3brVZs+ebc8884wLGB977LF0B0p6uOimeC4F7UcFiH369LF8+fJZZunZs2fE+aXcPQWsPXr0cON8JUuWzLR1Qvro1Z7/8z//44L9hg0b2vDhw10u7Q8//OCCp6uuusref/99a9myZVzvYj2op06dmukPa12Xzz77bJLhuXLlsqws/H6pc0Hb0L9/fzt16pTdcsstcbP/zxUCpTijX4GNGzdONs2+ffusVq1aUYcrdyUeBW1TkLJly9of/vCHiGEPPfSQ/f73v7fHH3/cqlat6h728UY33My+6darV899fAcOHHD7TsMS7+Pf6uTJky6HQzmByHiTJk1yQZKKrvVjIbxY8J577nG5tblzZ+5j4cSJE3bBBRdYPASZOj/z588fmEb7Lj3XxPHjx11OzrnYP7/++qudPXvWXVepvV/qh2DlypXdvfKWdARK2Q13o2xk+fLl7sa3a9cuW7BgQSgr1S+204WuKN8f7jt06JC7cZYrV879IlLugYIKXVzh9P0f//iHK8ZSVnLx4sVd8Z1fh0Hz1AX/4osvhpaRUs6LAqCBAwe63AjNs379+m76lLYpPXV0dIPTg+Diiy+2v//9725/hG+b6mzVrl3brYfW59Zbb7WffvopIov6008/tRUrVkQtbsqM/RhUR0k5XVp3LVfFKIMGDXLrE62uzmeffWZt27Z1N1/dIB9++GE7F1Jajn9sZ86caWPGjHFplPbIkSNuvIpJtV+KFCnihrdp08bldoRTbqH2uY6Ntr1EiRJ29dVX28aNG9O8Pqk5H5OzatUqa9KkiZvu0ksvtaeffjpN+0s5no0aNXLnqXIt9eD67rvvItLoPChYsKAbrtxG/a3zR7nEZ86cSXb+P//8s02YMMFq1Khhjz76aNS6U8qxVLFLOOUqKOdJy9HDXDmJ+/fvj0jz5ptvWufOnd25p+Og7R8/fnySdfLPwQ0bNljr1q3dsRg9enSa5uGfG9dcc41ddNFFbp0UtOua8veR7nMSXqSUlmtddE516dLFVVfQj1Mdl7Qe02j8a1j3kTvuuMOds5dcckmK+yc156buC5q3jq+2UftQ+1LnflroWOs82blzZ8TwlStX2vXXX2/ly5d389W9Tjn0Ord8Gb
X/sxJylOLM4cOH3S/6cDoJVUG7Zs2aLhDQiasLz89KVRa7X69HD5F+/fpF/FrRA0g3Xp2sugBWr17t6jEpCza8wrcuUl3kytVSxWr9UtGFo/pRupFoGRquG63qA4ku1CC6uHRj+OKLL+zOO+90RYp6WOhC00N+6NChgdukCzk99GDRjf65555zNw9drKJt17YNGDDAhgwZ4gKzKVOm2KZNm9zDOU+ePG5fDB482M1Dv77Di5tiuR+VvX3//fdbu3btXE7P9u3b7amnnrJ169aF1t2nm5GCDxWZqU7Ka6+9ZnfffbcL2rQ+GSUty9HDUL929bDXQ1l/q6hV6RQ4qLhVOUwzZsywK6+80u0r/2F+2223uXnr/FGO448//ugCFhW1XnbZZWlan9Scj0E++eQTa9++vTsvdTx0TLXeqS2O9M89BVoKZtTwQg9+HT+dg+E5wQocOnToYM2aNXMPxHfffdflFOkcSS6XVPvl4MGDLrBMS66kznkFJNoePYh1Lmv/zJo1K2L9dV0ooNL/On5jx451Qe8jjzwSMT8dI+1zFSErGPT3UWrnsWTJEhfAlC5d2h0TFR3qeM+fP9991/X3/fffu3S6lhJLzbXu07XUt29fN41yVqpXr57i/kp8fxad04ULF44YpiBJ54u2UT+Mkts/aT03da0o90v3DwU0+nGYFjp/v/32W3fcw2mZutfpPNMzZ+3atfbEE0+4tBonGbn/swwPcWHGjBnK/oj6yZcvX0TaChUqeJ07d04yD6UdNGhQxLDx48d7BQoU8Hbs2BExfOTIkV6uXLm83bt3u+/Lli1z0w8ZMiTJfM+ePRv6W/Pq379/qrZp8uTJbp6vvPJKaNjp06e9Fi1aeAULFvSOHDmS4jZFk1Laxx9/3C33zTffdN9Xrlzpvv/zn/+MSLdo0aIkw2vXru21adMmyTwzaz/658GuXbvc93379nl58+b12rdv7505cyaUbsqUKS7d888/Hxqm9dawl156KTTs1KlTXqlSpbxevXp5qbV//343n3HjxkUdn9rlvPfeey5d5cqVvRMnTkTsh6pVq3odOnSI2CdKU6lSJe/qq68ODStSpEiSczq965OW8zHx9nfv3t1LSEjwvv7669Cwzz77zB37lG6zWkaJEiW8OnXqeD///HNo+Pz58920Y8eODQ3TOaFhDzzwQMQ8GjZs6DVq1CjZ5fzjH/9w077xxhteavjnWrt27SKOw7Bhw9x2HTp0KDQs/Pj5br31Vu+CCy7wTp48meRYTJs2LUn61Mzj119/deeArvGffvopIm34OuqciLbf03KtaxkapnGp4R+baB+dy4n3a6tWrdz2hAvaP6k9N3VfULrChQu7e0NqaDt1/9B1rc8nn3zi3XTTTVGfFyeiHKMJEyZ4OXLkiDj3M2L/ZyUUvcUZZWkqUg//vP322+men34F/O53v3O/HPRLyP8od0K/XP/zn/+4dHPmzHE5V9EqU6e3+bMq/OnXoH6x+fRrQr8yVGFYWdPngn6t+sU2/j5Q8Y5y28L3gXIzlPa9997LsvtRuQmnT592uQTh9Xr061e/YFVcmXjbw+si6Jeucme+/PJLy0hpWY4qjYbX+/joo4/s888/d/XJ9Ova35f61a3KxtqXfnGmclpUDKNfsL91fdJ7Pur4qnhGRWHKSfQpN1Q5PylRkauKVZTDEN46SsVQKv5IfAz9nLRwOvdSOoZ+kWahQoUsLZQrEX5ualna5q+//jo0LPz46brS8VI65T5s27YtYn7K4VBuQmKpmYdyHZQDofM9cX3L1Fw/ab3WlXOTmmPo0/FLfH/WZ+LEiUnS6hqNlrMXbf+k9dzs1atXmnLdFy9e7NLro1xW5QRpHRLnBuYPO0a6HrXvVPFfvx10bFKSEffaWKDoLc7o5p5SZe600ANp8+bNgReVbuCismrVHUhrFm5ydKNVperEFXf1gPHHnwu6sYQ/MLQPVKSpugLJ7YOsuB/9fZS4SECBgCpjJt
6HKr5M/EBRcKd1z0hpWU7iVpzal34AFUTHS/NTPSOlU10J3WxVb0VFy9r2tK5Pes9H1ddR0YimTUzHRQ+59BxDUaC
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHHCAYAAABXx+fLAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAQJZJREFUeJzt3Ql4VOXd/vFfCEvYEZBVdlREFGQVF0ALokUELRX9W4m4oEWwFjcQAREsdSlFWRWLVGlf6IugFBGsgBQURKEIIgSKKIsCocoqm3D+1/2810xnQhImIckkeb6f6xplzpw5c84zJzP3PNtJCIIgMAAAAI8UifcOAAAA5DUCEAAA8A4BCAAAeIcABAAAvEMAAgAA3iEAAQAA7xCAAACAdwhAAADAOwQgAADgHQIQctWhQ4fs3nvvtWrVqllCQoI9/PDDbvnu3butR48eVqlSJbd8zJgxBf6Ysuuuu+6yunXrWkH19NNPu3IAOBdQkBCAkGVTp051X3gZ3VasWBFe93e/+51b/9e//rW9+eabduedd7rlv/3tb23BggU2aNAgt/z666/P8XdCr/3222/nynbTO6b0KNjceOONlt8odGX2HoZuWg/5S9++fa1IkSL2/fffRy3XfS0vUaKEHT16NOqxr776yr2fTz75pOX38y8pKcnyK/09R+5r6dKlrXXr1vbGG29ke5vz5s1zwRF5r2gcXhOFxDPPPGP16tU7bXnDhg3D/160aJFdfvnlNmzYsKh1tLxbt2726KOP5tr+Kaiolql79+45ut2Mjim7Jk+ebKdOnbK8dP/991vHjh3D97du3WpDhw61Pn362NVXXx1e3qBBgzzdL5zZVVddZRMnTrSPPvrIunbtGl7+8ccfuwB04sQJ++yzz9x6IVo39Nz8QkHttddeO215YmKi5WfNmjWzRx55xP37u+++c8eQnJxsx44ds/vuuy9bAWj8+PGEoDggACHbbrjhBmvZsmWm6+zZs8caN26c7vIKFSoUyNLP6Jiyq1ixYpbX2rZt624h+sJUANKyX/3qVxZPP/74o5UqVSqu+5CfhULMsmXLogKQQs6ll15qR44ccY9Fhh3dVzi64oorzuq1f/rpJxfWixcvbmeraNGi2TrXDh8+7GpecuPcieX4atasGbXfqs2qX7++/fGPf8xWAEL80ASGXPHhhx+6KmLVLLz77rvhKuNQ81kQBO5XT2h5yL59+1yfmlq1arlfiKpNeu65506rIdH9l156yS655BJXZX7uuee6ZjR9kYu2qQ/KP//5zzE35yjY3HPPPVa1alW3zaZNm7rnn+mYvv766xztA6Ttabsvvviivfrqq64WRmXRqlUr+/TTT097/saNG11NV8WKFd1+K5TOmTPHcsL//u//WosWLaxkyZJWuXJl98G/c+fOmJ47bdq08HO1b7fddptt3749ap0OHTpYkyZNbNWqVdauXTv35RVqpnnnnXesS5cuVqNGDXf8KocRI0bYyZMn093Gl19+addcc43bhr6knn/++dP2SU1Dam644IILXFlVr17dbrnlFtuyZUvUuaU+aRdffLFbR+eDasx++OGHmGsIVYumL2mFfNV0btiwId2+Mv/+97/d+6/1ypcvb71793Zf4pmpXbu2+/sI1eqE6P6VV17pQk56j+l4Qj86znSupz0PVR6h81DlHApVOif1fD32yiuvWE4LfV4sWbLENf1VqVLFzjvvvDOeOzlxfLHSZ0+jRo2iziFZunSp/fKXv3Tvl7ar90xN/wqoIXrv9TkokU1rOXUuInPUACHb9u/fb3v37o1apj9edWy+6KKLXP8Y/cHrAytUZXzZZZeF+8106tTJevXqFX6uPvjbt2/vvmD1R64PDlXrq5+QqpojO0rrw00fjqqFUodk/XLTB476HykA6DW0XO3zatY5U3OOPpT0gaovpH79+rmmPX356wNKoew3v/lNhsekD8Dc8Ne//tUOHjzoykLlqi
90fVmrP0eo1mj9+vXuS09f+AMHDnRfun/7299cs99bb71lN998c7ZfX+WrL2R9yY0aNcp1XFfo1Jfpv/71r0xr8J599lkbMmSI3Xrrre59SE1NtbFjx7ovqrTP/c9//uPeRwUkBSx9yIdev0yZMjZgwAD3fwUL1VIdOHDAXnjhhajX0xeCArDKR685c+ZMe+KJJ1xA1rZFwUn9sRYuXOheS++pyvcf//iHffHFF+HzQ+UdOvaHHnrIBd5x48a5/daxZ1Zj98EHH7jXU42AQo7OKx233qPVq1ef1tld+6pzTeWrx9Wcoi95hf7MqHZn1qxZrtlFX67Hjx934Vj90vR39Pjjj7sfGTpvVDb6Un/ggQdiPtcjvf766y446u9Ir6Uwu27dOrvuuuvcua/j1N+fmoRD712s0n5+iGpfypUrF7VM4UevpfdfP2wyO3dy4viyQse+Y8cOO+ecc6KW6zX1Xug90WfiypUr3bmgdfVY6Fz79ttv3Tmoz5a0zuZcRAwCIItef/31QKdOercSJUpErVunTp2gS5cup21D6z744INRy0aMGBGULl062LRpU9TygQMHBomJicG2bdvc/UWLFrnnP/TQQ6dt99SpU+F/a1vJyckxHdOYMWPcNqdNmxZedvz48aBt27ZBmTJlggMHDpzxmNITy7raR60XsnXrVrcvlSpVCr7//vvw8nfeecct//vf/x5e9rOf/Sy45JJLgqNHj0aVwRVXXBGcf/75Qaw+/fRTt229t6Fjr1KlStCkSZPgyJEj4fXmzp3r1hs6dGh42bBhw9yykK+//tq9X88++2zUa6xbty4oWrRo1PL27du7506aNOm0ffrxxx9PW3b//fcHpUqVijre0DbeeOON8LJjx44F1apVC37xi1+El02ZMsWtN3r06AzPm6VLl7p1/vKXv0Q9Pn/+/HSXp9WsWTNXbv/5z3/Cyz7//POgSJEiQa9evU4rs7vvvjvq+TfffLN7389k/Pjx7vnaX1m+fLm7/8033wRffvml+/f69euj3rPQvsd6rofOw3LlygV79uyJev3u3bsHSUlJ7vVC9Lp632P5WtE5n9FnSOfOnU/7rLnqqquCn376KWobGZ07OXF8GdHf6XXXXRekpqa6m87pO++8M93Ps/TO31GjRgUJCQlR5abnpVdmZ3su4sxoAkO2qepWv1wib++99162t6dfRWo60C8p/TIM3dRZV7/e//nPf7r1VLOhX7bpdULO7nBsdUTUsPbbb789vEy/rvSrS8PeVQWf13r27Bn1qzLUOVk1QKFRP6oVUS2CajJC5aVfxZ07d7bNmzfH3FyVlpoS1YygX96Ro3LUJKXqfjUBZkQ1E6q6135Fvo8q3/PPP98WL14ctb5+desXblpqOgsJHZ/KQL+q1ewXSTVEkf0yVIug2r9QWYXOGzXj9e/fP8PzRuegmqJUOxm572rK02uk3fdIqqVcs2aNq2mIrEVQvxxtT+dYWqFamRAdn94/1XLF2g9IVBugWkDVmur90euHmsHSdoDO6rn+i1/8IqqWU3+LGsGpWka9XohqSHXexUrnVdrPD91+//vfn7au+tak1zk6vXPnbI/vTN5//323vm6qYVTNjfYhba1k5PmrWiudR2qe1O8/1eCcydmci4gNTWDINn3BnKkTdFboC3vt2rUZfhjpC1nU1q5+IVmtqs7MN998476c1VE0kj7UQ4/ntcgvFwmFoVD7v6r49WGqpibdMiozfTFmVeh4L7zwwtMe0xds6Is3o/dR+6XyTE/aanvtX3qdTtW899RTT7mQlzYQqPk1kpok04ZflZfOpxCdNzoedb7NbN+1bTVDZXYOZrXMdB4pNKTtwJvZe5y2GSiS+r6oGTEy5KiZTVQO6syuZQoO+r/6n4ReK6vnetqRnmrOVDNTeu+vjj29oJceBZrIkYiZSW+0aUbnztke35m0adPGRo4c6YKgmk71b71fafdj27ZtrslO/fHS9tlJe/7m9LmI2B
CAkG+o1kC/dtR/IT3quOqTjIYD/18L4v+Vl2gqgYx+eUdOSZBXtF/6ElZtYHrHoF+vGf1SDlFfDfUHUwjQdAvqn6M
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# `plt` comes from the imports cell at the top of the notebook.\n",
"def plot_metric_vs_param(results, param_col, metric_col, param_label, title):\n",
"    \"\"\"Scatter an error metric against a tuned parameter and show the figure.\n",
"\n",
"    Args:\n",
"        results: DataFrame containing both `param_col` and `metric_col`.\n",
"        param_col: Column plotted on the x axis.\n",
"        metric_col: Column plotted on the y axis; also used as the y-axis label.\n",
"        param_label: Text for the x-axis label.\n",
"        title: Figure title.\n",
"    \"\"\"\n",
"    # Explicit figure/axes instead of the implicit pyplot state machine.\n",
"    fig, ax = plt.subplots()\n",
"    ax.scatter(results[param_col], results[metric_col])\n",
"    ax.set(xlabel=param_label, ylabel=metric_col, title=title)\n",
"    plt.show()\n",
"\n",
"\n",
"plot_metric_vs_param(\n",
"    df,\n",
"    \"config/text_det_box_thresh\",\n",
"    \"CER\",\n",
"    \"Detection Box Threshold\",\n",
"    \"Effect of Detection Threshold on Character Error Rate\",\n",
")\n",
"plot_metric_vs_param(\n",
"    df,\n",
"    \"config/line_tolerance\",\n",
"    \"WER\",\n",
"    \"Line Tolerance\",\n",
"    \"Effect of Line Tolerance on Word Error Rate\",\n",
")"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"metadata": {
|
||
|
|
"kernelspec": {
|
||
|
|
"display_name": ".venv (3.11.9)",
|
||
|
|
"language": "python",
|
||
|
|
"name": "python3"
|
||
|
|
},
|
||
|
|
"language_info": {
|
||
|
|
"codemirror_mode": {
|
||
|
|
"name": "ipython",
|
||
|
|
"version": 3
|
||
|
|
},
|
||
|
|
"file_extension": ".py",
|
||
|
|
"mimetype": "text/x-python",
|
||
|
|
"name": "python",
|
||
|
|
"nbconvert_exporter": "python",
|
||
|
|
"pygments_lexer": "ipython3",
|
||
|
|
"version": "3.11.9"
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"nbformat": 4,
|
||
|
|
"nbformat_minor": 5
|
||
|
|
}
|