2025-11-17 10:52:00 +00:00
{
"cells": [
{
"cell_type": "markdown",
"id": "be3c1872",
"metadata": {},
"source": [
"# AI-based OCR Benchmark Notebook\n",
"\n",
"This notebook benchmarks **AI-based OCR models** on scanned PDF documents/images in Spanish.\n",
"It excludes traditional OCR engines like Tesseract that require external installations."
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 1,
2025-11-17 10:52:00 +00:00
"id": "6a1e98fe",
"metadata": {},
2025-12-06 21:15:49 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pip in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (25.3)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: jupyter in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (1.1.1)\n",
"Requirement already satisfied: notebook in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (7.5.0)\n",
"Requirement already satisfied: jupyter-console in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (6.6.3)\n",
"Requirement already satisfied: nbconvert in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (7.16.6)\n",
"Requirement already satisfied: ipykernel in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter) (7.1.0)\n",
"Requirement already satisfied: ipywidgets in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (8.1.8)\n",
"Requirement already satisfied: jupyterlab in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (4.5.0)\n",
"Requirement already satisfied: comm>=0.1.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (0.2.3)\n",
"Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (1.8.17)\n",
"Requirement already satisfied: ipython>=7.23.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (9.8.0)\n",
"Requirement already satisfied: jupyter-client>=8.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (8.6.3)\n",
"Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (5.9.1)\n",
"Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (0.2.1)\n",
"Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (1.6.0)\n",
"Requirement already satisfied: packaging>=22 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (25.0)\n",
"Requirement already satisfied: psutil>=5.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (7.1.3)\n",
"Requirement already satisfied: pyzmq>=25 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (27.1.0)\n",
"Requirement already satisfied: tornado>=6.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (6.5.2)\n",
"Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (5.14.3)\n",
"Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.4.6)\n",
"Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (5.2.1)\n",
"Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (1.1.1)\n",
"Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.19.2)\n",
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (3.0.52)\n",
"Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (2.19.2)\n",
"Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.6.3)\n",
"Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (4.15.0)\n",
"Requirement already satisfied: wcwidth in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel->jupyter) (0.2.14)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jedi>=0.18.1->ipython>=7.23.1->ipykernel->jupyter) (0.8.5)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter-client>=8.0.0->ipykernel->jupyter) (2.9.0.post0)\n",
"Requirement already satisfied: platformdirs>=2.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel->jupyter) (4.5.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->jupyter-client>=8.0.0->ipykernel->jupyter) (1.17.0)\n",
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (2.2.1)\n",
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (3.0.1)\n",
"Requirement already satisfied: pure-eval in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (0.2.3)\n",
"Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ipywidgets->jupyter) (4.0.15)\n",
"Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ipywidgets->jupyter) (3.0.16)\n",
"Requirement already satisfied: async-lru>=1.0.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (2.0.5)\n",
"Requirement already satisfied: httpx<1,>=0.25.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (0.28.1)\n",
"Requirement already satisfied: jinja2>=3.0.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (3.1.6)\n",
"Requirement already satisfied: jupyter-lsp>=2.0.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (2.3.0)\n",
"Requirement already satisfied: jupyter-server<3,>=2.4.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (2.17.0)\n",
"Requirement already satisfied: jupyterlab-server<3,>=2.28.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (2.28.0)\n",
"Requirement already satisfied: notebook-shim>=0.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (0.2.4)\n",
"Requirement already satisfied: setuptools>=41.1.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (65.5.0)\n",
"Requirement already satisfied: anyio in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (4.12.0)\n",
"Requirement already satisfied: certifi in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (2025.11.12)\n",
"Requirement already satisfied: httpcore==1.* in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (1.0.9)\n",
"Requirement already satisfied: idna in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (3.11)\n",
"Requirement already satisfied: h11>=0.16 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpcore==1.*->httpx<1,>=0.25.0->jupyterlab->jupyter) (0.16.0)\n",
"Requirement already satisfied: argon2-cffi>=21.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)\n",
"Requirement already satisfied: jupyter-events>=0.11.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.12.0)\n",
"Requirement already satisfied: jupyter-server-terminals>=0.4.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.5.3)\n",
"Requirement already satisfied: nbformat>=5.3.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (5.10.4)\n",
"Requirement already satisfied: overrides>=5.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (7.7.0)\n",
"Requirement already satisfied: prometheus-client>=0.9 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.23.1)\n",
"Requirement already satisfied: pywinpty>=2.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.2)\n",
"Requirement already satisfied: send2trash>=1.8.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.8.3)\n",
"Requirement already satisfied: terminado>=0.8.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.18.1)\n",
"Requirement already satisfied: websocket-client>=1.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.9.0)\n",
"Requirement already satisfied: babel>=2.10 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.17.0)\n",
"Requirement already satisfied: json5>=0.9.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.12.1)\n",
"Requirement already satisfied: jsonschema>=4.18.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (4.25.1)\n",
"Requirement already satisfied: requests>=2.31 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.32.5)\n",
"Requirement already satisfied: argon2-cffi-bindings in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jinja2>=3.0.3->jupyterlab->jupyter) (3.0.3)\n",
"Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (25.4.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2025.9.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.37.0)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.30.0)\n",
"Requirement already satisfied: python-json-logger>=2.0.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (4.0.0)\n",
"Requirement already satisfied: pyyaml>=5.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (6.0.2)\n",
"Requirement already satisfied: rfc3339-validator in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.4)\n",
"Requirement already satisfied: rfc3986-validator>=0.1.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.1)\n",
"Requirement already satisfied: fqdn in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.5.1)\n",
"Requirement already satisfied: isoduration in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (20.11.0)\n",
"Requirement already satisfied: jsonpointer>1.13 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.0)\n",
"Requirement already satisfied: rfc3987-syntax>=1.1.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.1.0)\n",
"Requirement already satisfied: uri-template in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.0)\n",
"Requirement already satisfied: webcolors>=24.6.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.10.0)\n",
"Requirement already satisfied: beautifulsoup4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (4.14.3)\n",
"Requirement already satisfied: bleach!=5.0.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (6.3.0)\n",
"Requirement already satisfied: defusedxml in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (0.7.1)\n",
"Requirement already satisfied: jupyterlab-pygments in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (0.3.0)\n",
"Requirement already satisfied: mistune<4,>=2.0.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (3.1.4)\n",
"Requirement already satisfied: nbclient>=0.5.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (0.10.2)\n",
"Requirement already satisfied: pandocfilters>=1.4.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (1.5.1)\n",
"Requirement already satisfied: webencodings in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert->jupyter) (0.5.1)\n",
"Requirement already satisfied: tinycss2<1.5,>=1.1.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (1.4.0)\n",
"Requirement already satisfied: fastjsonschema>=2.15 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.21.2)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (3.4.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.6.0)\n",
"Requirement already satisfied: lark>=1.2.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from rfc3987-syntax>=1.1.0->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.1)\n",
"Requirement already satisfied: cffi>=1.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.0.0)\n",
"Requirement already satisfied: pycparser in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.23)\n",
"Requirement already satisfied: soupsieve>=1.6.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from beautifulsoup4->nbconvert->jupyter) (2.8)\n",
"Requirement already satisfied: arrow>=0.15.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.4.0)\n",
"Requirement already satisfied: tzdata in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2025.2)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: ipywidgets in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (8.1.8)\n",
"Requirement already satisfied: comm>=0.1.3 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipywidgets) (0.2.3)\n",
"Requirement already satisfied: ipython>=6.1.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipywidgets) (9.8.0)\n",
"Requirement already satisfied: traitlets>=4.3.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipywidgets) (5.14.3)\n",
"Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ipywidgets) (4.0.15)\n",
"Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ipywidgets) (3.0.16)\n",
"Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)\n",
"Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)\n",
"Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (1.1.1)\n",
"Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)\n",
"Requirement already satisfied: matplotlib-inline>=0.1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (0.2.1)\n",
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.52)\n",
"Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (2.19.2)\n",
"Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
"Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (4.15.0)\n",
"Requirement already satisfied: wcwidth in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.14)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jedi>=0.18.1->ipython>=6.1.0->ipywidgets) (0.8.5)\n",
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (2.2.1)\n",
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (3.0.1)\n",
"Requirement already satisfied: pure-eval in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (0.2.3)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: ipykernel in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (7.1.0)\n",
"Requirement already satisfied: comm>=0.1.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (0.2.3)\n",
"Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (1.8.17)\n",
"Requirement already satisfied: ipython>=7.23.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (9.8.0)\n",
"Requirement already satisfied: jupyter-client>=8.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (8.6.3)\n",
"Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (5.9.1)\n",
"Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (0.2.1)\n",
"Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (1.6.0)\n",
"Requirement already satisfied: packaging>=22 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (25.0)\n",
"Requirement already satisfied: psutil>=5.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (7.1.3)\n",
"Requirement already satisfied: pyzmq>=25 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (27.1.0)\n",
"Requirement already satisfied: tornado>=6.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (6.5.2)\n",
"Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (5.14.3)\n",
"Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (0.4.6)\n",
"Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (5.2.1)\n",
"Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (1.1.1)\n",
"Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (0.19.2)\n",
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (3.0.52)\n",
"Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (2.19.2)\n",
"Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (0.6.3)\n",
"Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (4.15.0)\n",
"Requirement already satisfied: wcwidth in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel) (0.2.14)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jedi>=0.18.1->ipython>=7.23.1->ipykernel) (0.8.5)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter-client>=8.0.0->ipykernel) (2.9.0.post0)\n",
"Requirement already satisfied: platformdirs>=2.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel) (4.5.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->jupyter-client>=8.0.0->ipykernel) (1.17.0)\n",
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (2.2.1)\n",
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (3.0.1)\n",
"Requirement already satisfied: pure-eval in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (0.2.3)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
2025-11-17 10:52:00 +00:00
"source": [
"%pip install --upgrade pip\n",
"%pip install --upgrade jupyter\n",
"%pip install --upgrade ipywidgets\n",
2025-12-06 21:15:49 +01:00
"%pip install --upgrade ipykernel"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "13103c58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: transformers in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (4.57.3)\n",
"Requirement already satisfied: pillow in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (12.0.0)\n",
"Requirement already satisfied: paddleocr in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (3.3.2)\n",
"Requirement already satisfied: hf_xet in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (1.2.0)\n",
"Requirement already satisfied: paddlepaddle in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (3.2.2)\n",
"Requirement already satisfied: filelock in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (3.20.0)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (0.36.0)\n",
"Requirement already satisfied: numpy>=1.17 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (2.3.5)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from transformers) (25.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (6.0.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (2025.11.3)\n",
"Requirement already satisfied: requests in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (2.32.5)\n",
"Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (0.22.1)\n",
"Requirement already satisfied: safetensors>=0.4.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (0.7.0)\n",
"Requirement already satisfied: tqdm>=4.27 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from transformers) (4.67.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (2025.12.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (4.15.0)\n",
"Requirement already satisfied: paddlex<3.4.0,>=3.3.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (3.3.10)\n",
"Requirement already satisfied: aistudio-sdk>=0.3.5 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (0.3.8)\n",
"Requirement already satisfied: chardet in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (5.2.0)\n",
"Requirement already satisfied: colorlog in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (6.10.1)\n",
"Requirement already satisfied: modelscope>=1.28.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (1.32.0)\n",
"Requirement already satisfied: pandas>=1.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (2.3.3)\n",
"Requirement already satisfied: prettytable in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (3.17.0)\n",
"Requirement already satisfied: py-cpuinfo in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (9.0.0)\n",
"Requirement already satisfied: pydantic>=2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (2.12.5)\n",
"Requirement already satisfied: ruamel.yaml in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (0.18.16)\n",
"Requirement already satisfied: ujson in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (5.11.0)\n",
"Requirement already satisfied: imagesize in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (1.4.1)\n",
"Requirement already satisfied: opencv-contrib-python==4.10.0.84 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (4.10.0.84)\n",
"Requirement already satisfied: pyclipper in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (1.4.0)\n",
"Requirement already satisfied: pypdfium2>=4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (5.1.0)\n",
"Requirement already satisfied: python-bidi in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (0.6.7)\n",
"Requirement already satisfied: shapely in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (2.1.2)\n",
"Requirement already satisfied: httpx in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlepaddle) (0.28.1)\n",
"Requirement already satisfied: protobuf>=3.20.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlepaddle) (6.33.2)\n",
"Requirement already satisfied: opt-einsum==3.3.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlepaddle) (3.3.0)\n",
"Requirement already satisfied: networkx in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from paddlepaddle) (3.6)\n",
"Requirement already satisfied: psutil in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from aistudio-sdk>=0.3.5->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (7.1.3)\n",
"Requirement already satisfied: bce-python-sdk in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from aistudio-sdk>=0.3.5->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (0.9.55)\n",
"Requirement already satisfied: click in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from aistudio-sdk>=0.3.5->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (8.2.1)\n",
"Requirement already satisfied: setuptools in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from modelscope>=1.28.0->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (65.5.0)\n",
"Requirement already satisfied: urllib3>=1.26 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from modelscope>=1.28.0->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (2.6.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from pandas>=1.3->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas>=1.3->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas>=1.3->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (2025.2)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pydantic>=2->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.41.5 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pydantic>=2->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (2.41.5)\n",
"Requirement already satisfied: typing-inspection>=0.4.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pydantic>=2->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (0.4.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->pandas>=1.3->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (1.17.0)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests->transformers) (3.4.4)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests->transformers) (3.11)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests->transformers) (2025.11.12)\n",
"Requirement already satisfied: colorama in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from tqdm>=4.27->transformers) (0.4.6)\n",
"Requirement already satisfied: pycryptodome>=3.8.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from bce-python-sdk->aistudio-sdk>=0.3.5->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (3.23.0)\n",
"Requirement already satisfied: future>=0.6.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from bce-python-sdk->aistudio-sdk>=0.3.5->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (1.0.0)\n",
"Requirement already satisfied: anyio in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx->paddlepaddle) (4.12.0)\n",
"Requirement already satisfied: httpcore==1.* in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx->paddlepaddle) (1.0.9)\n",
"Requirement already satisfied: h11>=0.16 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpcore==1.*->httpx->paddlepaddle) (0.16.0)\n",
"Requirement already satisfied: wcwidth in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from prettytable->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (0.2.14)\n",
"Requirement already satisfied: ruamel.yaml.clib>=0.2.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ruamel.yaml->paddlex<3.4.0,>=3.3.0->paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr) (0.2.15)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: pandas in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (2.3.3)\n",
"Requirement already satisfied: numpy>=1.23.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2.3.5)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: matplotlib in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (3.10.7)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (1.3.3)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (4.61.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (1.4.9)\n",
"Requirement already satisfied: numpy>=1.23 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (2.3.5)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib) (25.0)\n",
"Requirement already satisfied: pillow>=8 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (12.0.0)\n",
"Requirement already satisfied: pyparsing>=3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (3.2.5)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib) (2.9.0.post0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: seaborn in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (0.13.2)\n",
"Requirement already satisfied: numpy!=1.24.0,>=1.20 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (2.3.5)\n",
"Requirement already satisfied: pandas>=1.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (2.3.3)\n",
"Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (3.10.7)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.3)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.61.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.9)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0)\n",
"Requirement already satisfied: pillow>=8 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (12.0.0)\n",
"Requirement already satisfied: pyparsing>=3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.5)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
2025-11-17 10:52:00 +00:00
"# Install necessary packages\n",
2025-12-06 21:15:49 +01:00
"%pip install transformers pillow paddleocr hf_xet paddlepaddle\n",
"\n",
"\n",
2025-11-17 10:52:00 +00:00
"\n",
"# Data analysis and visualization\n",
"%pip install pandas\n",
"%pip install matplotlib\n",
"%pip install seaborn"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 47,
2025-11-17 10:52:00 +00:00
"id": "ae33632a",
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"import os, json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
2025-12-06 21:15:49 +01:00
"\n",
2025-11-17 10:52:00 +00:00
"import re\n",
"from datetime import datetime"
]
},
{
"cell_type": "markdown",
"id": "0e00f1b0",
"metadata": {},
"source": [
"## 1 Configuration"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 39,
2025-11-17 10:52:00 +00:00
"metadata": {},
"outputs": [],
"source": [
2025-12-06 21:15:49 +01:00
"PDF_FOLDER = './dataset' # Folder containing PDF files\n",
2025-11-17 10:52:00 +00:00
"OUTPUT_FOLDER = 'results'\n",
"os.makedirs(OUTPUT_FOLDER, exist_ok=True)"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 40,
2025-11-17 10:52:00 +00:00
"id": "8bd4ca23",
"metadata": {},
"outputs": [
{
2025-12-06 21:15:49 +01:00
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">c:\\Users\\Sergio\\Desktop\\MastersThesis\\dataset\n",
"</pre>\n"
],
"text/plain": [
"c:\\Users\\Sergio\\Desktop\\MastersThesis\\dataset\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">c:\\Users\\Sergio\\Desktop\\MastersThesis\\paddle_ocr_tuning.py\n",
"</pre>\n"
],
"text/plain": [
"c:\\Users\\Sergio\\Desktop\\MastersThesis\\paddle_ocr_tuning.py\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">c:\\Users\\Sergio\\Desktop\\MastersThesis\n",
"</pre>\n"
],
"text/plain": [
"c:\\Users\\Sergio\\Desktop\\MastersThesis\n"
]
},
"metadata": {},
"output_type": "display_data"
2025-11-17 10:52:00 +00:00
}
],
"source": [
"PDF_FOLDER_ABS = os.path.abspath(PDF_FOLDER) # ./dataset -> C:\\...\\dataset\n",
"SCRIPT_ABS = os.path.abspath(\"paddle_ocr_tuning.py\") # paddle_ocr_tuning.py -> C:\\...\\paddle_ocr_tuning.py\n",
"SCRIPT_DIR = os.path.dirname(SCRIPT_ABS)\n",
"\n",
"print(PDF_FOLDER_ABS)\n",
"print(SCRIPT_ABS)\n",
"print(SCRIPT_DIR)"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 41,
2025-11-17 10:52:00 +00:00
"id": "243849b9",
"metadata": {},
2025-12-06 21:15:49 +01:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)\u001b[0m\n",
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\Sergio\\.paddlex\\official_models\\PP-LCNet_x1_0_doc_ori`.\u001b[0m\n",
"\u001b[32mCreating model: ('UVDoc', None)\u001b[0m\n",
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\Sergio\\.paddlex\\official_models\\UVDoc`.\u001b[0m\n",
"\u001b[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)\u001b[0m\n",
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\Sergio\\.paddlex\\official_models\\PP-LCNet_x1_0_textline_ori`.\u001b[0m\n",
"\u001b[32mCreating model: ('PP-OCRv5_server_det', None)\u001b[0m\n",
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\Sergio\\.paddlex\\official_models\\PP-OCRv5_server_det`.\u001b[0m\n",
"\u001b[32mCreating model: ('PP-OCRv5_server_rec', None)\u001b[0m\n",
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\Sergio\\.paddlex\\official_models\\PP-OCRv5_server_rec`.\u001b[0m\n"
]
}
],
2025-11-17 10:52:00 +00:00
"source": [
"# PaddleOCR pipeline initialization\n",
"# https://www.paddleocr.ai/v3.0.0/en/version3.x/pipeline_usage/OCR.html?utm_source=chatgpt.com#21-command-line\n",
"from paddleocr import PaddleOCR\n",
"\n",
"# Initialize with the PP-OCRv5 server detection/recognition models (chosen for Spanish/Latin text; see the supported-languages table linked below)\n",
"# https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html?utm_source=chatgpt.com#5-models-and-their-supported-languages\n",
"paddleocr_model = PaddleOCR(\n",
" text_detection_model_name=\"PP-OCRv5_server_det\",\n",
" text_recognition_model_name=\"PP-OCRv5_server_rec\"\n",
")"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 42,
2025-11-17 10:52:00 +00:00
"id": "329da34a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
2025-12-06 21:15:49 +01:00
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3.3</span>.<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>\n",
2025-11-17 10:52:00 +00:00
"</pre>\n"
],
"text/plain": [
2025-12-06 21:15:49 +01:00
"\u001b[1;36m3.3\u001b[0m.\u001b[1;36m2\u001b[0m\n"
2025-11-17 10:52:00 +00:00
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import paddleocr\n",
"\n",
"print(paddleocr.__version__)"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 43,
2025-11-17 10:52:00 +00:00
"id": "b1541bb6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
2025-12-06 21:15:49 +01:00
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">c:\\Users\\Sergio\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\paddleocr\n",
2025-11-17 10:52:00 +00:00
"</pre>\n"
],
"text/plain": [
2025-12-06 21:15:49 +01:00
"c:\\Users\\Sergio\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\paddleocr\n"
2025-11-17 10:52:00 +00:00
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 1) Locate the installed PaddleOCR package\n",
"pkg_dir = os.path.dirname(paddleocr.__file__)\n",
"print(pkg_dir)"
]
},
{
"cell_type": "markdown",
"id": "84c999e2",
"metadata": {},
"source": [
"## 2 Helper Functions"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 48,
2025-11-17 10:52:00 +00:00
"id": "9596c7df",
"metadata": {},
"outputs": [],
"source": [
"from typing import List, Optional\n",
2025-12-06 21:15:49 +01:00
"from paddle_ocr_tuning import evaluate_text, assemble_from_paddle_result\n",
"from dataset_manager import ImageTextDataset"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "b7c1bbf8",
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
2025-11-17 10:52:00 +00:00
"\n",
2025-12-06 21:15:49 +01:00
"def show_page(img: Image.Image, scale: float = 1):\n",
2025-11-17 10:52:00 +00:00
" \"\"\"\n",
" Display the image in a matplotlib figure sized from the image dimensions times `scale` (axes hidden; no text is drawn).\n",
" \"\"\"\n",
" # Compute plot size based on image dimensions (but without resizing the image)\n",
" w, h = img.size\n",
" figsize = (w * scale / 100, h * scale / 100) # convert pixels to inches approx\n",
"\n",
" fig, ax = plt.subplots(figsize=figsize)\n",
" ax.imshow(img)\n",
" ax.axis(\"off\")\n",
"\n",
"\n",
" # Disabled: drawing OCR text below the image as a footer (`text` is not a parameter of this function)\n",
" # plt.figtext(0.5, 0.02, text.strip(), wrap=True, ha='center', va='bottom', fontsize=10)\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
2025-12-06 21:15:49 +01:00
{
"cell_type": "code",
"execution_count": 44,
"id": "b9d3fe25",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAH3CAYAAACSIBV+AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAZpVJREFUeJztvXewFlea5vle7zEX7723wgmQBJIQIG9LKqOaUtV2b8XE7sT+s7sxEzuxOzGzsRM7sTPdMz3d1eW6nISQEAgrARLee++991zgwvVu4/d+nNuXK4QAfUgJ+fwiPr5LmpMn88t8zpsnz/tkSl1dXZ0JIYSILKnfdwWEEELcGQm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHAm1EEJEHA
m1EEJEHAm1EEJEnPTvuwLi4aWurs7qzCyFTwr/CiEeBIqoxX2L9IKTxfaftp63atRaCPHAkFCL+xLpNedL7Pd7L9tb3ZtZuoJpIR4oEmpxz9TWmZ28UWn/ZWwH69U06766PSprau3Lk8VWXlNbL/61dKXUKTwXojESanHPVNfV2YGrFZabnnrffdM3qmrt3206a9cqavz/n50otv+w6az3eQshbkVCLe6ZqtpE/3RxZUJk75eGotwuN8P6Nc/RryHEbdCoD/GtocuirLrOctNT7HpVrZVV11rzrDTLSE2pj7jp0kDgr1TUWHZa6lci58Etcqx/82wfQRKW5yHllYpqS0tJsWaZaZaakhhd0rCs9NQUa5aV5hGHRp6IRxUJtfjWnCutsn+97rQ926HAuzDKq2tdPP/LmI7WJjfDhfXo9Ur7PzeccSGvqauz8e3yrZrO7pssPX3dVpy9Yf/3yHa+/LHrlfbvN521S+XVVlNn1rtZlv37Ee2saWaa7Swqs/+45bz3b1fU1Nmgwmz7t8MT84R4FFHXh/jWEN0uP3vDtl8us1891ck+mtjNRfP3ey/VR7//x/ozNqp1nn30XDf73fjOduBahV0oq6ov43pVjZ0vTfy/srbO/s360zaida6X9eFzXW1oi1y7VF7jUfT/uua0/ahnc5v6XDebMqGri/9v9lzUg0jxyCKhFkmhbU6G/dth7axVdrrlpafaa12b2bbLZd7FcfJGlR0prrCf9i603IxU74/+DyPbWeucjNuWRfR94kaV/bRXC8vPSLPmWen28z6F1qNJpq07X2IFmak2sWOB0atCeT/p1dwWn7ruDYIQjyLq+hBJISstxbLT/rlPmhEhCCej7c6XVXmEXZCR6JpgGfqp6V/+uq6U5plplp/xz3FE6Js+cr3Cdlwus58vPV4/jz7xvIxUSwz0E+LRQ0ItHggNJZiHgS7a9Qnn33BShuVvEyDzgHJs23z727Edb5memZZiWV8j/EI87KjrQzxwOuRl+FC+s6VVLr5ExvuvldePoW5Mp/xMu1xRbWdKKxN+InV1VlJVY+U1dTawMMeOX690wS7MSrMW2en+nZ+uU1k8uiiiFg8c+qSf6VBg/9fGs/Y/DWhlh4srbNqhK1/b9dExL8OebV9g/9va0758SXWtfXCgyP7NY21sWMtc61aQaf/72lP2bq9Cy0xLtR2XS61ZVrq93b2Zfk3xSCKhjjF0LxCHpt1jl0Fmaor9uGdzf8gH9D//sGdz734IdMjP8AeKdFnTv/zvRrSzvxy4bP+095J1bZJpf/tER/vyVLH3LUPvptlW2ymxLtX5dyPb2dSDV2zKwSJrkplmv+zf0qNpNvHfnuhk0w5fsQ8OFlmKpVjfZlk2qVNuMg+NEJEipU7mCrGF4XDZ6an3PP644SkTHvI1/P+ty/CA8avrNeSby7h1+jfNE+JRQxG1uGcai+LtRPJupyVzeSEeVfQERgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo6EWgghIo5eHCDuGX/h7M33iTc28L/dm1oaUllT6+vyvsTUu3gBAOXV1iVezxWFlwVQnxtVNZafkRaJ+sTlXLOvOd/igiJqcc8XzraLN+w/bjhulSjobZh24IKLWXiDeMPP+nPFtuL0VZt/9LJdq7z9MmE7/vbx6lqbd/TSLdPutPy9zLufZStq6uxX209bbYNlG68nksf50kr7u62n7P/ZcMyuVVbH9tAqohb3BDJUW1dnl8oq/ft27Ckqsac6NLPPjl62ovIqe6x1gfVqlmufHDhvp25U2Oh2TS0nPdVqauvsk4MJUX+uU6EtOXXFhe6HvdvY2rPX7MCVUnu2c6Flp6VacWWNfXroglXU1NrrPVrZlgvX7VxJpTXPTrdXure0RSeu2PHicnu2U3M7cb
3cTl6vsNd7tvL3QfL3guOXvYyf9Gljm84X27nSSpvcpYXtvHTDxaBVToa90K2FfXm8yJd/rnOhHbhaaqeul9vTHZv
"text/plain": [
"<Figure size 372.15x526.2 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Índice\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>. Indicaciones generales <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.1</span>. Línea de discurso <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.2</span>. Estructura general y extensión del TFE <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">4</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.3</span>. Formatos y plantilla de trabajo <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.4</span>. Estética y estilo de redacción <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">7</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1.5</span>. Normativa de citas <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">8</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>. Estructura del documento <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">9</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.1</span>. Resumen <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">10</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.2</span>. Organización del trabajo en grupo <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">11</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.3</span>. Introducción <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">11</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.4</span>. Contexto y estado del arte <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">13</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.5</span>. Objetivos concretos y metodología de trabajo <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.6</span>. Desarrollo específico de la contribución <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">17</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.7</span>. Conclusiones y trabajo futuro <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">20</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.8</span>. Referencias bibliográficas <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">21</span>\n",
"© Universidad Internacional de La Rioja <span style=\"font-weight: bold\">(</span>UNIR<span style=\"font-weight: bold\">)</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.8</span>.<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>. Herramientas para buscar bibliografía <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">22</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.9</span>. Anexos <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">23</span>\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2.10</span>. Índice de acrónimos <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">24</span>\n",
"</pre>\n"
],
"text/plain": [
"Índice\n",
"\u001b[1;36m1\u001b[0m. Indicaciones generales \u001b[1;36m3\u001b[0m\n",
"\u001b[1;36m1.1\u001b[0m. Línea de discurso \u001b[1;36m3\u001b[0m\n",
"\u001b[1;36m1.2\u001b[0m. Estructura general y extensión del TFE \u001b[1;36m4\u001b[0m\n",
"\u001b[1;36m1.3\u001b[0m. Formatos y plantilla de trabajo \u001b[1;36m5\u001b[0m\n",
"\u001b[1;36m1.4\u001b[0m. Estética y estilo de redacción \u001b[1;36m7\u001b[0m\n",
"\u001b[1;36m1.5\u001b[0m. Normativa de citas \u001b[1;36m8\u001b[0m\n",
"\u001b[1;36m2\u001b[0m. Estructura del documento \u001b[1;36m9\u001b[0m\n",
"\u001b[1;36m2.1\u001b[0m. Resumen \u001b[1;36m10\u001b[0m\n",
"\u001b[1;36m2.2\u001b[0m. Organización del trabajo en grupo \u001b[1;36m11\u001b[0m\n",
"\u001b[1;36m2.3\u001b[0m. Introducción \u001b[1;36m11\u001b[0m\n",
"\u001b[1;36m2.4\u001b[0m. Contexto y estado del arte \u001b[1;36m13\u001b[0m\n",
"\u001b[1;36m2.5\u001b[0m. Objetivos concretos y metodología de trabajo \u001b[1;36m14\u001b[0m\n",
"\u001b[1;36m2.6\u001b[0m. Desarrollo específico de la contribución \u001b[1;36m17\u001b[0m\n",
"\u001b[1;36m2.7\u001b[0m. Conclusiones y trabajo futuro \u001b[1;36m20\u001b[0m\n",
"\u001b[1;36m2.8\u001b[0m. Referencias bibliográficas \u001b[1;36m21\u001b[0m\n",
"© Universidad Internacional de La Rioja \u001b[1m(\u001b[0mUNIR\u001b[1m)\u001b[0m\n",
"\u001b[1;36m2.8\u001b[0m.\u001b[1;36m1\u001b[0m. Herramientas para buscar bibliografía \u001b[1;36m22\u001b[0m\n",
"\u001b[1;36m2.9\u001b[0m. Anexos \u001b[1;36m23\u001b[0m\n",
"\u001b[1;36m2.10\u001b[0m. Índice de acrónimos \u001b[1;36m24\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Quick check: display dataset item 1 (page image and its paired text)\n",
"dataset = ImageTextDataset(PDF_FOLDER_ABS)\n",
"img, txt = dataset[1]\n",
"show_page(img, 0.15)\n",
"print(txt)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "dcd27755",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAH3CAYAAACSIBV+AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAxGpJREFUeJzt/WdsXVm2rQkueiNREuW9lyjvvTcRCpsRkZmRkfnuzfuqHwqo7kLhoVH1gKruRvefAqoLaPSPaqB+VN2672Xed2/6DCvvvffee28piUb0bHxDZ57cOkFSIg8pHVFzBBjnaO+1l9t7jTX3OmvMmVZfX18fHA6Hw5GySH/TFXA4HA5H03CidjgcjhSHE7XD4XCkOJyoHQ6HI8XhRO1wOBwpDidqh8PhSHE4UTscDkeKw4na4XA4UhxO1A6Hw5HicKJ2OByOFIcTtcPhcKQ4nKgdDocjxeFE7XA4HCkOJ2qHw+FIcThROxwOR4rDidrhcDhSHE7UDofDkeJwonY4HI4UhxO1w+FwpDicqB0OhyPF4UTtcDgcKQ4naofD4UhxOFE7HA5HisOJ2uFwOFIcTtQOh8OR4nCidjgcjhSHE7XD4XCkOJyoHQ6HI8XhRO1wOBwpDidqh8PhSHE4UTscDkeKw4na4XA4UhxO1A6Hw5HicKJ2OByOFIcTtcPhcKQ4nKgdDocjxeFE7XA4HCkOJ2qHw+FIcThROxwOR4rDidrhcDhSHE7UDofDkeJwonY4HI4UhxO1w+FwpDicqB0OhyPF4UTtcDgcKQ4naofD4UhxOFE7HA5HisOJ2uFwOFIcmW+6Am8r6urq9OdwOF4d6enp+nM0D07ULUR1TXWorKxs6eUOxzuJnJyckJOd86ar8dbBp7YkUF9fH/9r7Pzly5fDyZMnQ011dbPyZhK4du1ao3k3hOrq6nD16tUXrqmpqQkbN24MN2/c+FH62tpa1e9lZXD+1s2b4fDhw+FZebmO8TZx7uzZcPzYsVBWVtastjkcjubBLeoksfnOs9AxMy1M7ZYb0tNCSEtLi597XFwcNm3aFGbPnh2eVVSEmtLSkJeXJxKuqqoKT548CV27dg0FBQUi2E6dOun7vXv3QkZGhgiQtFevXAmFXbvq/N27d0N+fn7o0aPH83NXr4bu3buHwsJCEWppaWl4/Pix8s7OzlYaCHXatGnhypUrITMzM/Ts2VP5UNPSkhKROfnk5uaGXr166RygvI4dOz5vy+PH4c6dOzr34YcfalLYuXNnmDFzpvIx4qcMruH19umTJ6GgU6fw5PHj0K9/f9Xb4XA0H07USWL/g4rwv55+HP5hWEH4H8Z3DZ2zM+Ln8vLzQ1ZWVrh+7Vro0rlzOHT4cBg3bly4fOlSOH/+vL5v37YtDBo8ODx79kxEOHny5LB///6weNGicPbs2XDu3LnQu1evsGvXrjBt+nSl/+KnPxUpb9ywQflDmL/+9a9FnqdPnw4XL1wQSd+8efM5kaanK58rly+H8vLyMHXatLB+/frw6aefytp/VFwc6mprw61bt8LkKVPCmtWrw2effx5uXL8epkydqjJGjxkTHjx8qFdXABHznfKYJGjr0aNHReQDBw4Me/fuDUMGDw4HDh4MkydNCpcuXw7Lli17YSJzOByvBl/6SBLQTu+8jDC2S07IyXiRhCC4r776KlRUVobTZ86ISLFwsTyxXidMnCjighCxoEcVFekTAsfqra2pkVU+fsIEWaYsO0CYffr0Uf63bt8OmVlZYdSoUcoH8oZwMzIzw6TJk0Onzp1l3fbp3VuWMwscQ4cNk1U/bNiwMGjQoFBbVxfu3b0bxo4dG/r26xeePn0aBg8erPNG0uDChQvhwYMHmkiwwCHqX3z1la47dOiQ0vAmUFRUFEaOHBny8/LC1KlTQ9++fcOkSZM0EXGd/wDrcDQfTtRJ4tMBHcLypf3C3w0tCLkZL3ZnSUlJWLVqlazYMWPGhMqKinDk8O
HQuUsXnVu5YkUYMGBAmDN3rpYsIDKWPjp37iwChqzHjhundCwb9OnbNxR26RK3SmfOnBmKi4tF0BAnJN+jZ8/QrWtXWdQ9e/QIebm5WhohHwBhkhfHuKZXz54i0s2bN2tSgKBZViHPPbt3a3KBXLHaqT9WM98vXrwY1q1bF44dPRomTZyovMeNHRt2794d9uzZo3owYbCUkpWdrfKOHjkSHj16lGyXOxzvHNLqm/NrlSOOyqrKUFFR8beObOCVXhZubAsfpGjfSbt61aqw9L334ksJWNkcj25d4nqOcY7jZjVbGr5Hz9kxK8Out89oXRLPJZYR/QRRSxiyHj9+fLwe0essn4bqYMd8+ePdhe/6aBmcqJMk6paQjhEaFvDbRlrRef1tq7vjzcOJumXwHxPfACA4dl+8jXBydjheP3yN2uFwOFIcTtRJwK1Lh8PxOuBE7XA4HCkOJ+ok4BtmHA7H64ATtcPhcKQ4nKgdDocjxeFE7XA4HCkOJ2qHw+FIcThROxwOR4rDidrhcDhSHE7UDofDkeJwonY4HI4UhxO1w+FwpDicqJOA+/pwOByvA07UDofDkeJwok4COP8nmCt/fG8JiJzSlM8Qzl2+fPmlsQZJZ5HCX5YfkcIJnNtcXyWkf1nezcmTNj18+LDRawhXFo2ik1jWtWvXFM0dEOvx1s2bjaalLEKdEQrs+vXril3ZGAgK3FC5t2/fVmT4pkCduQcv6yPSkReBjnl+qH8iCOFGXW7fuqWo8tSf9jbWJy8D19F2nidCqdEXly5dcp81bwHeTu/1KQKCuX7/3XeKOdilSxfFPQTEPVQw1+pqxQuEFAhmy7FOnTqJ1BmYxEYkviBxE4nczWAk1mGHDh20rMKg5Nj27dtD//794+eJeQgqnj1T4FyC1RKIgEjjBKwlziFlEZjWAtFCVJTLtevWrg0ffvSRiIDBSxRxC89FGUSeIR15UAfaRn0uxUhlxowZupb0lA1qY1HMITNiOXItZdJGawsxF+vq6xVwl+u5BrKlvUa49B39yLWPHj4MHQsKlI56khftgeSo88EDB8LiJUvUzsOHD6vt9LWdpx8oh7wOHjwYZs2aJTKHpObNn68yuUfEsLQ+YrLbt3ev4lhSLvl1iZ0nYvvIESMUQLi6pkZlANLQNuJc3r9/X23iPlM2aThP/WnnmdOnWTNTO4l2smv3bt3bvrGAxZRp/b1/3z6R6bDhw8OGP/whTJs2LfTu00f3iGeOutIn3Af6m/y4nnbznPCdvrZ7WlVZGQ4cOKCAzNwHIt3Th5RvwSx8OS814USdBBjYBHBlYGGlEkuQgT9t+vSwZvVqDSyidxMN/Mrly2HAwIEKHEvQWqwaBhLWEoMMK4xBTrDazz77THmuWrlSg+f+vXvh1KlTsr6wwr746U9FkH/805+UFwN2zpw5oeTp07BlyxYRFqQyedKksH3HjvDee+8pDQQ1dcoURROHBLdv2ybC6Na9u66nTocOHhSxzJg5M6xdsyZ07dYtTJw4UcF5scbOnT2rNpCO9hMV/fz58yICyOnC+fMKwHv8xAnMR0UyJyo5REn9mVjmzJ4d1qxZExYuWiTyh5ghN8iMuu5gYhowQKRBUNxNmzZpIoN8IBX6mokkaoczSUBey3/4Qefy8vPVn0RB71RQoCC8I0aM0LXgzu3b4dixY/Eo7kR+h+xlbV64oAjxGzduDLk5OWHU6NFqP+2BoI8dP656L1y4UJMs95q6Pnz0KMyaOVPEeebMGd1z6kt6gghzLaB/B/TvH7p16/a88rF8jx8/rnu0ZMkSXUcd6uvq1JbDhw7pXE5uriar5cuXi4R5XghizL0hQjz3G8LukJ+vZ+WTTz/V2xPXLlq0SCTN5Aa4XxgSPAtnzp4N8+fP1310pB586SNJZGZkhOycHA3wKVOmhPETJmjQQHCzZs/WQFi8eLHIcMGCBRoUDK7qqiq99kPeY8eODdeuXtVA6d+vnwiGwfj4yZPw/rJloX
uPHiLImtpaEastg2RlZmpQMzmQF4TPtVhUs2fNEsmNHj06DBkyROVigUMiAwcN0oDv1bt3mL9ggeoLGMyVVVUqgzw
"text/plain": [
"<Figure size 372.15x526.2 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Superior e inferior: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>,<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span> cm.\n",
"Formato de párrafo en texto principal <span style=\"font-weight: bold\">(</span>estilo de la plantilla “Normal”<span style=\"font-weight: bold\">)</span>:\n",
" Calibri <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">12</span>, justificado, interlineado <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>,<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>, espacio entre párrafos <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> puntos\n",
"anterior y <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> puntos posterior, sin sangría.\n",
"Títulos:\n",
" Primer nivel <span style=\"font-weight: bold\">(</span>estilo de la plantilla “Título <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>”<span style=\"font-weight: bold\">)</span>: Calibri Light <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">18</span>, azul, justificado,\n",
"interlineado <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>,<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>, espacio entre párrafos <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> puntos anterior y <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> puntos\n",
"posterior, sin sangría.\n",
" Segundo nivel <span style=\"font-weight: bold\">(</span>estilo de la plantilla “Título <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>”<span style=\"font-weight: bold\">)</span>: Calibri Light <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>, azul,\n",
"justificado, interlineado <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>,<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>, espacio entre párrafos <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> puntos anterior y <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span>\n",
"puntos posterior, sin sangría.\n",
" Tercer nivel <span style=\"font-weight: bold\">(</span>estilo de la plantilla “Título <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span>”: Calibri Light <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">12</span>, justificado,\n",
"interlineado <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>,<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>, espacio entre párrafos <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> puntos anterior y <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span> puntos\n",
"posterior, sin sangría.\n",
"Notas al pie:\n",
" Calibri <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">10</span>, justificado, interlineado sencillo, espacio entre párrafos <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> puntos\n",
"anterior y <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> puntos posterior, sin sangría.\n",
"Tablas y figuras:\n",
" Título en la parte superior de la tabla o figura.\n",
" Numeración tabla o figura <span style=\"font-weight: bold\">(</span>Tabla <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>/ Figura1<span style=\"font-weight: bold\">)</span>: Calibri <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">12</span>, negrita, justificado.\n",
" Nombre tabla o figura: Calibri <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">12</span>, cursiva, justificado.\n",
" Cuerpo: la tipografía de las tablas o figuras se pueden reducir hasta los <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">9</span>\n",
"puntos si estas contienen mucha información. Si la tabla o figura es muy\n",
"grande, también se puede colocar en apaisado dentro de la hoja.\n",
" Fuente de la tabla o figura en la parte inferior. Calibri <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">9</span>,<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>, centrado.\n",
"Encabezado y pie de página:\n",
" Todas las páginas llevarán un encabezado con el nombre completo del\n",
"estudiante y el título del TFE.\n",
"© Universidad Internacional de La Rioja <span style=\"font-weight: bold\">(</span>UNIR<span style=\"font-weight: bold\">)</span>\n",
" Todas las páginas llevarán también un pie de página con el número de página.\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">6</span>\n",
"Máster Universitario en Inteligencia Artificial\n",
"</pre>\n"
],
"text/plain": [
"Superior e inferior: \u001b[1;36m2\u001b[0m,\u001b[1;36m5\u001b[0m cm.\n",
"Formato de párrafo en texto principal \u001b[1m(\u001b[0mestilo de la plantilla “Normal”\u001b[1m)\u001b[0m:\n",
" Calibri \u001b[1;36m12\u001b[0m, justificado, interlineado \u001b[1;36m1\u001b[0m,\u001b[1;36m5\u001b[0m, espacio entre párrafos \u001b[1;36m6\u001b[0m puntos\n",
"anterior y \u001b[1;36m6\u001b[0m puntos posterior, sin sangría.\n",
"Títulos:\n",
" Primer nivel \u001b[1m(\u001b[0mestilo de la plantilla “Título \u001b[1;36m1\u001b[0m”\u001b[1m)\u001b[0m: Calibri Light \u001b[1;36m18\u001b[0m, azul, justificado,\n",
"interlineado \u001b[1;36m1\u001b[0m,\u001b[1;36m5\u001b[0m, espacio entre párrafos \u001b[1;36m6\u001b[0m puntos anterior y \u001b[1;36m6\u001b[0m puntos\n",
"posterior, sin sangría.\n",
" Segundo nivel \u001b[1m(\u001b[0mestilo de la plantilla “Título \u001b[1;36m2\u001b[0m”\u001b[1m)\u001b[0m: Calibri Light \u001b[1;36m14\u001b[0m, azul,\n",
"justificado, interlineado \u001b[1;36m1\u001b[0m,\u001b[1;36m5\u001b[0m, espacio entre párrafos \u001b[1;36m6\u001b[0m puntos anterior y \u001b[1;36m6\u001b[0m\n",
"puntos posterior, sin sangría.\n",
" Tercer nivel \u001b[1m(\u001b[0mestilo de la plantilla “Título \u001b[1;36m3\u001b[0m”: Calibri Light \u001b[1;36m12\u001b[0m, justificado,\n",
"interlineado \u001b[1;36m1\u001b[0m,\u001b[1;36m5\u001b[0m, espacio entre párrafos \u001b[1;36m6\u001b[0m puntos anterior y \u001b[1;36m6\u001b[0m puntos\n",
"posterior, sin sangría.\n",
"Notas al pie:\n",
" Calibri \u001b[1;36m10\u001b[0m, justificado, interlineado sencillo, espacio entre párrafos \u001b[1;36m0\u001b[0m puntos\n",
"anterior y \u001b[1;36m0\u001b[0m puntos posterior, sin sangría.\n",
"Tablas y figuras:\n",
" Título en la parte superior de la tabla o figura.\n",
" Numeración tabla o figura \u001b[1m(\u001b[0mTabla \u001b[1;36m1\u001b[0m/ Figura1\u001b[1m)\u001b[0m: Calibri \u001b[1;36m12\u001b[0m, negrita, justificado.\n",
" Nombre tabla o figura: Calibri \u001b[1;36m12\u001b[0m, cursiva, justificado.\n",
" Cuerpo: la tipografía de las tablas o figuras se pueden reducir hasta los \u001b[1;36m9\u001b[0m\n",
"puntos si estas contienen mucha información. Si la tabla o figura es muy\n",
"grande, también se puede colocar en apaisado dentro de la hoja.\n",
" Fuente de la tabla o figura en la parte inferior. Calibri \u001b[1;36m9\u001b[0m,\u001b[1;36m5\u001b[0m, centrado.\n",
"Encabezado y pie de página:\n",
" Todas las páginas llevarán un encabezado con el nombre completo del\n",
"estudiante y el título del TFE.\n",
"© Universidad Internacional de La Rioja \u001b[1m(\u001b[0mUNIR\u001b[1m)\u001b[0m\n",
" Todas las páginas llevarán también un pie de página con el número de página.\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"\u001b[1;36m6\u001b[0m\n",
"Máster Universitario en Inteligencia Artificial\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset = ImageTextDataset(PDF_FOLDER_ABS)\n",
"img, txt = dataset[5]\n",
"show_page(img, 0.15)\n",
"print(txt)"
]
},
2025-11-17 10:52:00 +00:00
{
"cell_type": "markdown",
"id": "e42cae29",
"metadata": {},
"source": [
"## Run AI OCR Benchmark"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 23,
2025-11-17 10:52:00 +00:00
"id": "9b55c154",
"metadata": {},
2025-12-06 21:15:49 +01:00
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAH3CAYAAACSIBV+AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAxGpJREFUeJzt/WdsXVm2rQkueiNREuW9lyjvvTcRCpsRkZmRkfnuzfuqHwqo7kLhoVH1gKruRvefAqoLaPSPaqB+VN2672Xed2/6DCvvvffee28piUb0bHxDZ57cOkFSIg8pHVFzBBjnaO+1l9t7jTX3OmvMmVZfX18fHA6Hw5GySH/TFXA4HA5H03CidjgcjhSHE7XD4XCkOJyoHQ6HI8XhRO1wOBwpDidqh8PhSHE4UTscDkeKw4na4XA4UhxO1A6Hw5HicKJ2OByOFIcTtcPhcKQ4nKgdDocjxeFE7XA4HCkOJ2qHw+FIcThROxwOR4rDidrhcDhSHE7UDofDkeJwonY4HI4UhxO1w+FwpDicqB0OhyPF4UTtcDgcKQ4naofD4UhxOFE7HA5HisOJ2uFwOFIcTtQOh8OR4nCidjgcjhSHE7XD4XCkOJyoHQ6HI8XhRO1wOBwpDidqh8PhSHE4UTscDkeKw4na4XA4UhxO1A6Hw5HicKJ2OByOFIcTtcPhcKQ4nKgdDocjxeFE7XA4HCkOJ2qHw+FIcThROxwOR4rDidrhcDhSHE7UDofDkeJwonY4HI4UhxO1w+FwpDicqB0OhyPF4UTtcDgcKQ4naofD4UhxOFE7HA5HisOJ2uFwOFIcmW+6Am8r6urq9OdwOF4d6enp+nM0D07ULUR1TXWorKxs6eUOxzuJnJyckJOd86ar8dbBp7YkUF9fH/9r7Pzly5fDyZMnQ011dbPyZhK4du1ao3k3hOrq6nD16tUXrqmpqQkbN24MN2/c+FH62tpa1e9lZXD+1s2b4fDhw+FZebmO8TZx7uzZcPzYsVBWVtastjkcjubBLeoksfnOs9AxMy1M7ZYb0tNCSEtLi597XFwcNm3aFGbPnh2eVVSEmtLSkJeXJxKuqqoKT548CV27dg0FBQUi2E6dOun7vXv3QkZGhgiQtFevXAmFXbvq/N27d0N+fn7o0aPH83NXr4bu3buHwsJCEWppaWl4/Pix8s7OzlYaCHXatGnhypUrITMzM/Ts2VP5UNPSkhKROfnk5uaGXr166RygvI4dOz5vy+PH4c6dOzr34YcfalLYuXNnmDFzpvIx4qcMruH19umTJ6GgU6fw5PHj0K9/f9Xb4XA0H07USWL/g4rwv55+HP5hWEH4H8Z3DZ2zM+Ln8vLzQ1ZWVrh+7Vro0rlzOHT4cBg3bly4fOlSOH/+vL5v37YtDBo8ODx79kxEOHny5LB///6weNGicPbs2XDu3LnQu1evsGvXrjBt+nSl/+KnPxUpb9ywQflDmL/+9a9FnqdPnw4XL1wQSd+8efM5kaanK58rly+H8vLyMHXatLB+/frw6aefytp/VFwc6mprw61bt8LkKVPCmtWrw2effx5uXL8epkydqjJGjxkTHjx8qFdXABHznfKYJGjr0aNHReQDBw4Me/fuDUMGDw4HDh4MkydNCpcuXw7Lli17YSJzOByvBl/6SBLQTu+8jDC2S07IyXiRhCC4r776KlRUVobTZ86ISLFwsTyxXidMnCjighCxoEcVFekTAsfqra2pkVU+fsIEWaYsO0CYffr0Uf63bt8OmVlZYdSoUcoH8oZwMzIzw6TJk0Onzp1l3fbp3VuWMwscQ4cNk1U/bNiwMGjQoFBbVxfu3b0bxo4dG/r26xeePn0aBg8erPNG0uDChQvhwYMHmkiwwCHqX3z1la47dOiQ0vAmUFRUFEaOHBny8/LC1KlTQ9++fcOkSZM0EXGd/wDrcDQfTtRJ4tMBHcLypf3C3w0tCLkZL3ZnSUlJWLVqlazYMWPGhMqKinDk8O
HQuUsXnVu5YkUYMGBAmDN3rpYsIDKWPjp37iwChqzHjhundCwb9OnbNxR26RK3SmfOnBmKi4tF0BAnJN+jZ8/QrWtXWdQ9e/QIebm5WhohHwBhkhfHuKZXz54i0s2bN2tSgKBZViHPPbt3a3KBXLHaqT9WM98vXrwY1q1bF44dPRomTZyovMeNHRt2794d9uzZo3owYbCUkpWdrfKOHjkSHj16lGyXOxzvHNLqm/NrlSOOyqrKUFFR8beObOCVXhZubAsfpGjfSbt61aqw9L334ksJWNkcj25d4nqOcY7jZjVbGr5Hz9kxK8Out89oXRLPJZYR/QRRSxiyHj9+fLwe0essn4bqYMd8+ePdhe/6aBmcqJMk6paQjhEaFvDbRlrRef1tq7vjzcOJumXwHxPfACA4dl+8jXBydjheP3yN2uFwOFIcTtRJwK1Lh8PxOuBE7XA4HCkOJ+ok4BtmHA7H64ATtcPhcKQ4nKgdDocjxeFE7XA4HCkOJ2qHw+FIcThROxwOR4rDidrhcDhSHE7UDofDkeJwonY4HI4UhxO1w+FwpDicqJOA+/pwOByvA07UDofDkeJwok4COP8nmCt/fG8JiJzSlM8Qzl2+fPmlsQZJZ5HCX5YfkcIJnNtcXyWkf1nezcmTNj18+LDRawhXFo2ik1jWtWvXFM0dEOvx1s2bjaalLEKdEQrs+vXril3ZGAgK3FC5t2/fVmT4pkCduQcv6yPSkReBjnl+qH8iCOFGXW7fuqWo8tSf9jbWJy8D19F2nidCqdEXly5dcp81bwHeTu/1KQKCuX7/3XeKOdilSxfFPQTEPVQw1+pqxQuEFAhmy7FOnTqJ1BmYxEYkviBxE4nczWAk1mGHDh20rMKg5Nj27dtD//794+eJeQgqnj1T4FyC1RKIgEjjBKwlziFlEZjWAtFCVJTLtevWrg0ffvSRiIDBSxRxC89FGUSeIR15UAfaRn0uxUhlxowZupb0lA1qY1HMITNiOXItZdJGawsxF+vq6xVwl+u5BrKlvUa49B39yLWPHj4MHQsKlI56khftgeSo88EDB8LiJUvUzsOHD6vt9LWdpx8oh7wOHjwYZs2aJTKHpObNn68yuUfEsLQ+YrLbt3ev4lhSLvl1iZ0nYvvIESMUQLi6pkZlANLQNuJc3r9/X23iPlM2aThP/WnnmdOnWTNTO4l2smv3bt3bvrGAxZRp/b1/3z6R6bDhw8OGP/whTJs2LfTu00f3iGeOutIn3Af6m/y4nnbznPCdvrZ7WlVZGQ4cOKCAzNwHIt3Th5RvwSx8OS814USdBBjYBHBlYGGlEkuQgT9t+vSwZvVqDSyidxMN/Mrly2HAwIEKHEvQWqwaBhLWEoMMK4xBTrDazz77THmuWrlSg+f+vXvh1KlTsr6wwr746U9FkH/805+UFwN2zpw5oeTp07BlyxYRFqQyedKksH3HjvDee+8pDQQ1dcoURROHBLdv2ybC6Na9u66nTocOHhSxzJg5M6xdsyZ07dYtTJw4UcF5scbOnT2rNpCO9hMV/fz58yICyOnC+fMKwHv8xAnMR0UyJyo5REn9mVjmzJ4d1qxZExYuWiTyh5ghN8iMuu5gYhowQKRBUNxNmzZpIoN8IBX6mokkaoczSUBey3/4Qefy8vPVn0RB71RQoCC8I0aM0LXgzu3b4dixY/Eo7kR+h+xlbV64oAjxGzduDLk5OWHU6NFqP+2BoI8dP656L1y4UJMs95q6Pnz0KMyaOVPEeebMGd1z6kt6gghzLaB/B/TvH7p16/a88rF8jx8/rnu0ZMkSXUcd6uvq1JbDhw7pXE5uriar5cuXi4R5XghizL0hQjz3G8LukJ+vZ+WTTz/V2xPXLlq0SCTN5Aa4XxgSPAtnzp4N8+fP1310pB586SNJZGZkhOycHA3wKVOmhPETJmjQQHCzZs/WQFi8eLHIcMGCBRoUDK7qqiq99kPeY8eODdeuXtVA6d+vnwiGwfj4yZPw/rJloX
uPHiLImtpaEastg2RlZmpQMzmQF4TPtVhUs2fNEsmNHj06DBkyROVigUMiAwcN0oDv1bt3mL9ggeoLGMyVVVUqgzw
"text/plain": [
"<Figure size 372.15x526.2 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ref: \n",
"Superior e inferior: 2,5 cm.\n",
"Formato de párrafo en texto principal (estilo de la plantilla “Normal”):\n",
" Calibri 12, justificado, interlineado 1,5, espacio entre párrafos 6 puntos\n",
"anterior y 6 puntos posterior, sin sangría.\n",
"Títulos:\n",
" Primer nivel (estilo de la plantilla “Título 1”): Calibri Light 18, azul, justificado,\n",
"interlineado 1,5, espacio entre párrafos 6 puntos anterior y 6 puntos\n",
"posterior, sin sangría.\n",
" Segundo nivel (estilo de la plantilla “Título 2”): Calibri Light 14, azul,\n",
"justificado, interlineado 1,5, espacio entre párrafos 6 puntos anterior y 6\n",
"puntos posterior, sin sangría.\n",
" Tercer nivel (estilo de la plantilla “Título 3”: Calibri Light 12, justificado,\n",
"interlineado 1,5, espacio entre párrafos 6 puntos anterior y 6 puntos\n",
"posterior, sin sangría.\n",
"Notas al pie:\n",
" Calibri 10, justificado, interlineado sencillo, espacio entre párrafos 0 puntos\n",
"anterior y 0 puntos posterior, sin sangría.\n",
"Tablas y figuras:\n",
" Título en la parte superior de la tabla o figura.\n",
" Numeración tabla o figura (Tabla 1/ Figura1): Calibri 12, negrita, justificado.\n",
" Nombre tabla o figura: Calibri 12, cursiva, justificado.\n",
" Cuerpo: la tipografía de las tablas o figuras se pueden reducir hasta los 9\n",
"puntos si estas contienen mucha información. Si la tabla o figura es muy\n",
"grande, también se puede colocar en apaisado dentro de la hoja.\n",
" Fuente de la tabla o figura en la parte inferior. Calibri 9,5, centrado.\n",
"Encabezado y pie de página:\n",
" Todas las páginas llevarán un encabezado con el nombre completo del\n",
"estudiante y el título del TFE.\n",
"© Universidad Internacional de La Rioja (UNIR)\n",
" Todas las páginas llevarán también un pie de página con el número de página.\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"6\n",
"Máster Universitario en Inteligencia Artificial\n",
"paddle_text: \n",
"Superior e inferior: 2,5 cm.\n",
"Formato de párrafo en texto principal (estilo de la plantilla “Normal\"):\n",
"Calibri 12, justificado, interlineado 1,5, espacio entre párrafos 6 puntos\n",
"anterior y 6 puntos posterior, sin sangría.\n",
"Títulos:\n",
"Primer nivel (estilo de la plantillaTítulo 1\"): Calibri Light 18, azul, justificado,\n",
"interlineado 1,5,espacio entre párrafos 6 puntos anterior y 6 puntos\n",
"posterior, sin sangría.\n",
"Segundo nivel (estilo de la plantilla Titulo 2\"): Calibri Light 14, azul,\n",
"justificado, interlineado 1,5, espacio entre párrafos 6 puntos anterior y 6\n",
"puntos posterior, sin sangría.\n",
"Tercer nivel (estilo de la plantilla Título 3\": Calibri Light 12, justificado,\n",
"interlineado 1,5,espacio entre párrafos 6 puntos anterior y 6 puntos\n",
"posterior, sin sangría.\n",
"Notas al pie:\n",
"Calibri 10, justificado, interlineado sencillo, espacio entre párrafos O puntos\n",
"anterior y O puntos posterior, sin sangra.\n",
"Tablas y figuras:\n",
"Título en la parte superior de la tabla o figura.\n",
"Numeración tabla o figura (Tabla 1/ Figura1): Calibri 12, negrita, justificado.\n",
"Nombre tabla o figura: Calibri 12, cursiva, justificado.\n",
"Cuerpo: la tipografía de las tablas o figuras se pueden reducir hasta los 9\n",
"puntos si estas contienen mucha información. Si la tabla o figura es muy\n",
"grande, también se puede colocar en apaisado dentro de la hoja.\n",
"Fuente de la tabla o figura en la parte inferior. Calibri 9,5, centrado.\n",
"Encabezado y pie de página:\n",
"Todas las páginas llevarán un encabezado con el nombre completo del\n",
"estudiante y el título del TFE.\n",
"© Universidad Internacional de La Rioja (UNiR)\n",
"Todas las páginas llevarán también un pie de página con el número de página.\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"Máster Universitario en Inteligencia Artificial 9\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAH3CAYAAACSIBV+AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAA3gRJREFUeJzs/Qe4XVWeJXhuee+RQxY5hHByeCS8dxEBBBGZkRGRPrOqe7KqZ7pMVtZM93RXV+Z0mcnqmozMjMjwQeC9F0bCI4RAIEAGGSQhCUkIeW/m+/2P9uPq8WQRcOHt9X1P9+rec7bfa//Pvmet02Lv3r17U0FBQUFB3aLll12AgoKCgoKDoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaMQdUFBQUGdoxB1QUFBQZ2jEHVBQUFBnaN1aobYu3fvl12EgoKCz4AWLVo0q/ZrlkQNr67akj7atuvLLkZBQcERoHeH1mlC747Nrs1a7G2G4aUqr9m6M23fvefLLkpBQcERoH3rlqlX+zYlom4uaNkipVb+KSgo+MqgZTOdsuXHxIKCgoI6R7Mmalsgh7Pz45jNmzalPXuOfqsk0th8dGk4d/fu3Z/bj6DKtGXz5iPKI59Te/zOHTvStm3bDnmuczZuWJ/2fob2/DKh7h+tXh31PdbQNtLPYzO/z68Hw9atW9OuXbuaHDv+du3ambZu2fKp7/XjZxnbnwXyX7tmTZTrYGXYsWN72n4YY+vrimZN1O+8OSstXrAg3ueJ0fgv4/GH7k9bt3xCTAc6/mB/jz1wX9q2besRnZMxc/orafOmjft9t+z999OmjRuPOL3G703Uxx+8P73+6vS0ccOGQ5alOmdTtEltegvfm59mvfbqIcuApGdOn56keKD0D/S3ZPGiWPCOpv0Pp50PNRa8Ll7wXnri4QePuhxI8725cw5YhqlTHk8b1q9PixcuSI8/9EB89uxTUw6Z30vPTk0rl3+wX11mv/F6uvVnP4mxt3jhwvTis1P3q2c1th+I/jzWbepvx44daeH8eQdsS+V96N670vQXn0+rV314wHTmz5mTZs96PTVXNNu7PuDjtWvT7n2RyoL589Lbs95I3Xv0TGMnTkwvPjst9et/fBo78Yw4duOGjempxx5NHTp2TJMuviRNf+H5tPajj9KpY8dFOgbcCcNHpEUL3ksDBg2OSSXqOvv8yWn1hyvTgvlzg1hF5s8+9WTasX17OmfSBem9eXMiohh98ilp7rvvpI4dO0War7zwXBo09IR0yuljI/9WrVoFob44bWravHlzGnPqaenJRx5KffsfH+VBsm3ato3yvvL8c6lrt25+K07r169L7du1S9t3bE+9+/RNJ445Ob307LTUsmXLdM7kC9P0F19I6z9emzZsWB957N27Jz0/9eko+8mnnR4/2s
x99+004cxzUr/jj4+IDWms//jjIPUPli5Js2bOSD169kpdu3YLYli+dGk6Z/IFacuWLendt95Mffv3T8f16Zveen1mtI32EHkjoxkvvxhtMmLUiWnMaafv65eP0isvPJ/atWuXTh8/Mc145aW0ZdOmdMrYcemR++9Ng4eekEaNPikWhpNOOTWt+GBZpDXx7HPT0vcXRTt36dYtDRg4KM2fOycNGjI0de7SJb07+63oo2VL34+20HazZr6WevTomc46f1LUf9PGDdHGcMY556UZL78UfSmfUSeNibHy6ksvpi1bNkcU+MLUZ+IzfTlz+supRYuWqVuP7tF+Pm/Tpm1q1bpVGnfGWdGne3bvTsNHnZhu/dk/pRu/+7vRvsZGv+MHRH7+36pV6/T+wgVp5YrlEUycf+HFadmS99PYCWekx566L/rg7EmT05szX4v32uKN12ak5UuXRNq1MEb140mnnpZWf/hh2rlzZ3yubHNmvxVjbsmihWnL5i3pxWnT0tatW2LMaofZs95II04cHSSu/OrXrXuPWKi2b9+eunXvHn13yZVXp7ffnJWWL1sa7aQvVq1cEeOv34AB6YE7b0/f+cHvp/Xr1kU9hg4fHv2aA5AN69bFuN+9a3d6+vFHI82hw0ek7t27pzdfn5l6Hndc6tSp86euFpoTmnVEnbF71670/DNPpQsvvTxI5J0330xrV69Oxw8c2HCM3zDGnXFmWvfx2orc1q1Lky++ND379JNBAIi2urzcFe/7DxiY2rZtm15/9ZX06ssvpskXX5bad+gQA/O43n1i8iBEk+XUcePTqy+9EMS0cP7ciByQIGLMMDnWffxxWv7BsjTx7HPSm6+/FoN57BlnBqkB0pz/7rtpxfIPYmLNfeftNG7imendt2encRPPion3/NRnYpKtWb0qoqt1az8KgkMQIrj35s6NheOCSy9Lzz3zVJrzzttBwBXxp1iQlGPC2eekvWlvmvLIQ6lr1+5Bwtqkd99+6dTxE6JuUx5+MPXsdVyQ3aL35kf9Rp10UhoybHhav+7jICPRVpDcq680RFoWMgvJ0sWL0/y576ZVK1akcWeeld564/U0dNjwqJM69jqud7Q5spH2C9OeTu+89WYaNnJU9GmPXr3SwMGDI1oTiXfv0SMIHGH1HzAoFrquXbtGusoO+tYCY/F9+61ZafGiBens8yel1155qRoHLVpEu+u/1155OY0cfVIsPi8//2wsCmeed15aOH9+EJy+Gj5qVCwcO3fuiAXGZ4CsLMQWoUuuvCb6ShmkP3L06CDvDevXpRNGjEjvzn4z2vW16S9HXtK2QDjnzHPOi/4/89zzUq/evT81ti28M1+dnt54tbrSaRjzu3fFwjH5kktTh46d0pszZ6Qu3bqmsRMmRkSv/yZfclnq269/5GOB8Rrt2LNntLvFz1hS1henPROEaszMn/NO9IHyt2/fPg0bOTICoLnvzE6XXnVNmvnKy2nb1q0N7TD6lFMjoLBAu9IwHt54dXrq2KlzGnLCsAhCPo9tpq8Smj1Ri2xFdwaeyKdlq1ZBkIhI9GbCgwnUunXrIBCD1mvLVi3jtV27tum4Pn3i/D79+kV6LzzzdES4iNtebKt95+7aubPKp6W/lkHeCMcc6tqte5p86eVBRCeffnp67P77PrVvh2yCNPemKE+LfZMOAZ07+cLUf+DAWAhE/u07tE89evaMyKd7zx5xvPy7dO0a0ZtIUxmUJwsIpOW98rVq2SrStDeI7PJlu3Mi7xYtom7KZAJ27to1Pm/VsmVqkVrE8Sb15ddenzp06Jj69O+fPli6NIindes2kZaydO3eLSLRDGWwOJx34UURaUpDnaXXuk3rWDWVQVsrQ4cOHdIJI0amM889P6KzOe/MjsXv6ccfa9h3r47vHwvoWedNSo/ef0/avn1HRIiXXnV1lKPKuyKgCWednYaNGJW6dese/VIt1Z+MA5FyNWZaRyTu81gIunRNbdq2iUVCmV1ptG
vfLs19e3ZaMG9unBsTL+pbXdY7v0qjqr+rj5XLl8dno8eckqZOeSINHzkqxlIeRy1btoj0u3bvnvbs2b1vbLoi2hu
"text/plain": [
"<Figure size 372.15x526.2 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ref: \n",
"Los borradores intermedios deberán entregarse en formato Word. El documento final\n",
"deberá depositarse en formato PDF.\n",
"1.4. Estética y estilo de redacción\n",
"Es fundamental que el TFE presente un aspecto elegante y correcto. Se trata de un\n",
"trabajo académico y debe reflejar la madurez y el nivel formativo de una persona que\n",
"ha finalizado un estudio de grado o postgrado. Ten en cuenta las siguientes\n",
"recomendaciones en todas y cada una de las entregas que realices y, en especial, en\n",
"el depósito final del documento:\n",
" Verifica la originalidad del documento, asegurándote de que citas todas las\n",
"fuentes consultadas y no existen textos de autoría ajena sin referenciar\n",
"correctamente.\n",
" Cuida la presentación del trabajo. Comprueba que formatos como tipo y tamaño\n",
"de letra, número de páginas, encabezados, justificación de párrafos, interlineado,\n",
"etc., son correctos.\n",
" Revisa la ortografía y la redacción. Utiliza el corrector de Word para asegurarte de\n",
"que no has dejado ninguna errata. Una lectura detenida del documento también\n",
"te ayudará a detectar erratas, omisiones o redundancias. Si es posible, pide a\n",
"alguien cercano que lo lea y te dé su opinión sobre la redacción. Presta especial\n",
"atención a los siguientes aspectos:\n",
"- Que los párrafos sigan un orden o hilo argumental lógico.\n",
"- Que la información se presente de una manera que facilite su\n",
"comprensión, definiendo los conceptos necesarios e incluyendo las citas\n",
"bibliográficas pertinentes.\n",
"- Elimina párrafos demasiado cortos. Cada párrafo debería tener al menos\n",
"© Universidad Internacional de La Rioja (UNIR)\n",
"tres oraciones.\n",
"- Elimina frases superfluas y repeticiones de ideas.\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"7\n",
"Máster Universitario en Inteligencia Artificial\n",
"paddle_text: \n",
"Los borradores intermedios deberán entregarse en formato Word. El documento final\n",
"deberá depositarse en formato PDf.\n",
"1.4. Estética y estilo de redacción\n",
"Es fundamental que el TFE presente un aspecto elegante y correcto. Se trata de un\n",
"trabajo académico y debe reflejar la madurez y el nivel formativo de una persona que\n",
"ha finalizado un estudio de grado o postgrado. Ten en cuenta las siguientes\n",
"recomendaciones en todas y cada una de las entregas que realices y, en especial, en\n",
"el deposito final del documento:\n",
"Verifica la originalidad del documento,asegurándote de que citas todas las\n",
"fuentes consultadas y no existen textos de autoría ajena sin referenciar\n",
"correctamente.\n",
"Cuida la presentación del trabajo. Comprueba que formatos como tipo y tamaño\n",
"de letra, número de páginas, encabezados, justificación de párrafos, interlineado,\n",
"etc., son correctos.\n",
"Revisa la ortografía y la redacción. Utiliza el corrector de Word para asegurarte de\n",
"que no has dejado ninguna errata. Una lectura detenida del documento también\n",
"te ayudará a detectar erratas, omisiones o redundancias. Si es posible, pide a\n",
"alguien cercano que lo lea y te dé su opinión sobre la redacción. Presta especial\n",
"atención a los siguientes aspectos:\n",
"Que los párrafos sigan un orden o hilo argumental lógico.\n",
"Que la información se presente de una manera que facilite su\n",
"comprensión, definiendo los conceptos necesarios e incluyendo las citas\n",
"bibliograficas pertinentes.\n",
"Elimina párrafos demasiado cortos. Cada párrafo debería tener al menos\n",
"© Universidad Internacional de La Rioja (UNiR) tres oraciones.\n",
"Elimina frases superfluas y repeticiones de ideas.\n",
"Instrucciones para la redacción y elaboración del TfE 7\n",
"Máster Universitario en Inteligencia Artificial\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAH3CAYAAACSIBV+AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAf+VJREFUeJztvQeUVdeV570rZ6qgoKDISYBIIgkJUM7BSlayLQepg3u67e52zzfzrZn5Znp6utdMu9vttt0O07Ys56AMkkARkZNAiJxTkUNRUDmHb/324xSPMqGQSuii9/+tBe/VDSfdc/9nn/Pu3jepra2tzYQQQkSW5E+6AEIIIc6PhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhFoIISKOhPoS0tbWZidPnrTDhw9beflJ//t8VFVVWXV1tW3bttVaW1s7nUdpaekF0/6oUI9Dhw5aFGlqarItW7Z0qs2OHTtmy5Yt9XPOBels3779rOnRzmVlZX9wPts7cw0aGxtt165dnTqWPMiLYy+mjh8F+hL/Pgr19fXt7dexnmy7mP6dqEioLyF0xp///Oe2cuVK27OnxP9GiPlsaWnx73xy89bW1trRo0ettPSYvfvuPN/HzUlHZ39NTU17p+ezubm5/fwdO3Z4mg0NDVZbGzsupMlxbOMz5BXSIv26ujrfx83F8fF5cDyDB/sPHjxoGzZsbK9bc3OTn8M/8uUzpBnqyPfYMXXtZSY90g1twbbTacbKyD++hzo0NTWeUa7T22PtQ9n4Oykp6Yx9oW78zXfyfPPNNyw1Nc2Skszz5/iO7UoZ5859x8tB+Tk3XnD27Nnj20Nbku66dets06ZNfj7tEfKLz4PtCO/ChQv8OPbH1zlcT8pOuuRRUlLieR46dMhqaqr/oI4dr1do+1CO+Gsa+mRo9/jrwTEcu3nzZtu5c6enzb54QeWYcM1DOuQR30dJiwGdz9mzZ7eXM/QH8pw3b561tsbatmP5+F5zqu4ft/ERZVI/6QJczsR3HG6YzkCHHThwgA0cONDefPNN75gzZsywFStW+M08evRoW7hwofXp09v69etveXm5dvToEXvllVe88z/00EP26quvWnJysl111VU2fvx4T/e111718kyfPsP27t1rV1xxhc2ZM9vLNWnSZFuwYIHl5eVZWdlxKyzs6d+HDx9uixYttJSUVC8DNyQDw7XXTrO1a9d4ejfeeJMNHTrU80B8OKalpdnGjh17Rju88847duDAQbe0i4uLrby83L74xS/anDlzLCUl2evCjX74MAJTa1/4whe8/tQjJSXFhgwZYjt2bLerr55qI0eO9HQ/+OADy87Obh9Q+DsrK8u/P/nkk5aamur7fvrTp61bt3yrq6u1e++911avXm1Hjhy2AQMG2K9//SsbOHAQpbTS0uM2Y8Z0t2CZ1dx//wO2detW69Gjh82dO9eOHz/uovDEE1+03NxcT/u3v/2tl+XIkSM+ANCOiMrtt9/h6cPBgwesoKDAZs6cad27F9iQIUNt584dVldXb2lpafbyyy/ZNddc6+mRDgL7wAMP2qxZsyw1NcWP45pzzfbv32+9evXyclVXV9kVV4ywTZs2WmZmll/rAwcOWE5Oti1ZstQHR+rNIJ
Kbm+fi96Uvfcnbk7woD9u41t26dbP169e7OD766KPWt29fv27U58CB/TZs2DBrbW2z/fv3WX5+gQ0dOsQWLFhox4+X2s0332KLFy/y60vfpU/Qr2gPrm9RUS8bPXqMbdy4wduYPvqb3/zaevbs6eWjfblW7777rrcTs8kZM66zVatW+nncRnv37rN5897179OmTWvvX1zLDRs2+L1BuvStREQW9UcAQXzxxRfc8u0sbW2tVlsbs+wyMjLcAuGmQsA+97nP2ZVXXuk3wWc/+7ALAjdxz5697LHHHrPGxgZbs2aNixACwA0WIC0EjO2IJdNibhpuEG5+eOSRR/zvhx9+2MvMIDF27Di75557/GZApD7zmft8OaCystJFNH55IycnxzIzM23Xrt3W0nLmVJVzb7nlZuvXr59df/31LlhMdxHf9PR0LwMW1y233GqDBg308lFv6szUOtSVeoQBEMs/WLCIMMJE2YN1GKAs1A2h4ViEjIECazUtLd1v8MbGJhfxwYOHWHZ2jlVUVHgaI0eO8oGMsj722OMukqG9aAOOoYxsX7PmAy+TWZILfSCkxTEPPviQ7du3z668crQPCgwsAwYM9DbZsmWzX8fi4r62dOkS69Onjz300GctOzvr1IwJK7Ta80UwH3/8cy6CGRmZ9vjjsbKR16pVq+yuu+50wVy7dm17/YO1GsrOvoyMdBdURP/aa6+xCRMmuNgDIr558yZ79NHHfHBG0MmLQYZz77rrLrvhhhu8PzIQcB0ZqAP0mWuuucYHnfz8bl5X+iYDMgJN+blHKHN+fr6NGjXSj6dsXMPKytjsDD74YLVdd931dvfdd3s7B1avft8efPABmzhxordfoiKh/ghgUd133/1WVFTU6XOwdrCSuaGwIrk56PBNTc0+zUQkuCEQSQQBQ72iotw2btxo9fUNfnMXFhbapEkT3foMjBgx0tPmOM7p3r27Hzd58hQ/FsuN/aTNZ3Jykgsi64Pki/XDdoSe71jdWIFjxsQsG47FIurVq6elpcUmYvGTiKSkZF9CYCDgH2kh7KSDdXTjjTe6ZR3bl+o3NWKK5Uhd+/Qp9kEKCz+Qk5Pr1i9ignbHyo0FnnzGbAYLjXojVDt27PRBA4sYQltyTnp6mh9XVVXp7R4rd5KnS7kYPI4dK/W2AwYNBJQljIqKSh8waZvp06e1W/2nau//03akRbvwHYFkcM3MzPD8GTTIgxkSVj5iT9oNDY0+uGzbtt22bdt2aqaAiG72wZaBjO8MPOHabt68xbZv32bFxX3a2zu+XSg714qB6LbbbvM25/pwXDiGuqenZ3gZKBcWLf2L9kJY6Rvbt+/wwRKLmn3xM8cwGNOmzAhIH8s/zDLYHqs7bZJkzc0YJIf9GER39+5d7f0Iq5u6s518Anl53byue/bstqKi3paoJLUl8sLPJYamjt305S6i3DxYryxhYHXs2rXTLTGEhOkzU3FuGj5ZsmD5gMGBm+rEiRM2ZswY7+BBcLGkx4+/ql0IOI50Oe7kyRM2aNBgX+McPHiwWzrkjRAyzZ04cZL/zaDDjc80liUKptuIGnlgifEPoeBmwors3bu372PNlO0nTpS5OFNmRA2xYh/TeixoykuZEFIEiIEJgcayitX/Sj+fG5v0sbQ4FisUS2zQoEFusdIOlJPzvv/979uUKVNswID+LqYs0XAOSzbkwYBIuSlPbP14rQ8CTPcRP+qCFci1YUbAdvKnXrt37/Y0KDftRrswYHLNGIg4hvJQdwYe0qK+WL+UY8SIES50pMs1YxmDulA28mPALizsYb16FflsqaAg3y1wBh1EkPbgutKeLIvRBlyjmLWcYePGjXPrm2vLNaV9ghhTZ8rPNgZGZkPkR58K/YZ+wW8N1JnZCGWknVhioDwsmbBMwgyMARALn75F+5AWx1AO6kcfxJKm7s8++6xNnjy5/dqyhEU7hrbAOOnRo7vPcOiv9G3aC9GmbRlAgIFy7dq1PnBQf+qWiE
ioExhEEvFEgC5XEF7WOqdMuTphb+KowSCzdesWH/w7+9uNOD8SaiGEiDhaoxZCiIgjoRZCiIgjoRZCiIgjoRZCiIg
"text/plain": [
"<Figure size 372.15x526.2 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ref: \n",
"- Escribe siempre al menos un párrafo de introducción en cada capítulo o\n",
"apartado, explicando de qué vas a tratar en esa sección. Evita que\n",
"aparezcan dos encabezados de nivel consecutivos sin ningún texto entre\n",
"medias.\n",
" Repasa las citas bibliográficas. Comprueba que todas ellas son correctas y siguen\n",
"la normativa que exige la titulación.\n",
" Asegúrate de que las figuras y las tablas se ven clara y correctamente, e incluyen\n",
"número y título, así como su procedencia o fuente.\n",
" Comprueba que los índices se generan correctamente.\n",
"1.5. Normativa de citas\n",
"En esta titulación se cita de acuerdo con la normativa APA.\n",
"Recuerda que tienes una guía con explicaciones y ejemplos en el apartado Citas y\n",
"bibliografía del aula virtual: https://bibliografiaycitas.unir.net/\n",
"© Universidad Internacional de La Rioja (UNIR)\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"8\n",
"Máster Universitario en Inteligencia Artificial\n",
"paddle_text: \n",
"Escribe siempre al menos un párrafo de introducción en cada capítulo o\n",
"apartado,explicando de qué vas a tratar en esa sección. Evita que\n",
"aparezcan dos encabezados de nivel consecutivos sin ningún texto entre\n",
"medias.\n",
"Repasa las citas bibliográficas. Comprueba que todas ellas son correctas y siguen\n",
"la normativa que exige la titulación.\n",
"Asegúrate de que las figuras y las tablas se ven clara y correctamente, e incluyen\n",
"número y título, así como su procedencia o fuente.\n",
"Comprueba que los índices se generan correctamente.\n",
"1.5. Normativa adecitas\n",
"En esta titulacióon se cita de acuerdo con la normativa Apa.\n",
"Recuerda que tienes una guía con explicaciones y ejemplos en el apartado Citas y\n",
"bibliografía del aula virtual: https://bibliografiaycitas.unir.net/\n",
"© Universidad Internacional de La Rioja (UNIR)\n",
"Instrucciones para la redacción y elaboración del TfE\n",
"Máster Universitario en lnteligencia Artificial ∞\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAH3CAYAAACSIBV+AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAA9ZRJREFUeJzs/QfYlVWaJvovcg4iSBAUBQyYMGACjJhz1spVXeH0zPR0mJ4550z6z/SZOV1zrpkOU9WhuqurqqvMOUfMYkDBBIqZpCA55/C/fs/L+th+fiRF3bjXc10fe7P3+67wrGfd61nvfu/7bbVp06ZNqVixYsWK1a21/qobUKxYsWLFtm0FqIsVK1aszq0AdbFixYrVuRWgLlasWLE6twLUxYoVK1bnVoC6WLFixercClAXK1asWJ1bAepixYoVq3MrQF2sWLHd2tZv3JTeWrw6fZ2tAHWxYsV2a2uVUmrX2r9fXytAXaxYsWJ1bgWoixUrVqzOrQB1sWLFitW5FaAuVqxYsTq3AtTFihUrVudWgLpYsWLF6twKUBcrVqxYnVsB6mLFihWrcytAXaxYsWJ1bgWoixUrVqzOrQB1sWLFitW5FaAuVqxYsTq3AtTFihUrVudWgLpYsWLF6twKUBcrVqxYnVsB6mLFihWrcytAXaxYsd3eNqWvtxWgLlas2G5vrdLX2wpQFytWbLe3VunrbQWoixUrttvbpvT1tgLUxYoVK1bnVoC6WLFixercClAXK1asWJ1bAepixYoVq3MrQF2sWLFidW4FqIsVK7bbW6v09bYC1MWKFdvtbVP6elsB6mLFihWrcytAXaxYsWJ1bgWoixUrVqzOrQB1sWLFitW5FaAuVqzYbm+b0tfbClAXK1Zst7dW6ettBaiLFStWrM6tAHWxYsWK1bkVoC5WrFixOrcC1MWKFStW51aAulixYsXq3ApQFytWrFidWwHqYsWKFatzK0BdrFixYnVuBaiLFStWrM6t7VfdgGLFdlfbuKkiLrdq1Wq7zLhNmzalDZtSatOqOj7twPEbN2dSO3J8sa+3lYz6a2Am9fqNm9LM5WvT5IWr0txV6+Kzz1LOwtXr04cr1jb9fbRibVq2bsNnKu/rbvdNX5KufXvhDglNrFy/Mf3R+FlpweoNO1S24//tcx+mRWt27PhiX28rGfXXwOavXp/+44TZAayd2rZOH65Yl35/eO/0zQN6pdY7kY2t3bgpfffxaWnZ2o2pY9vqPP9+Y1iv9K1hvbZ6HhB/f+na1LNDm7Rnxy8upNSjb61bpTSgS/v0Vdv0ZWvTx6vW79Cx6zel9PL8lWn1ho07dvzGlJ7/eMUOH7872oLV69PiNRvS/t3bl13DdqwA9dfAVqzfmC4Y3COdOqBr6tCmdUzw3396Rjpl725pUNcdB7SNm1JasnZj+ssT907D9+jU9Hk7yLgd+5+vfpzOGNg9Xbp/z/RF2u/eXpg6tm2d/vjwvb7Qeop98fbkR8vSuFnL0t+MGVTcvR0rQP01sH27to+/fC3z6D6dU/d2bdJHK9btFFBnA/Yy85auyb69eE16Ye6K1LFN63RS/66pT6e26dk5y9MHS9ekl+atSB3atEoH9uyYhnRvn95avCZ1atsqzV21Pk1ZuDqdtne3tFentunFeSvTCX27pLabF4DFa9anqYvXpOP26hx9UM+7S6p62rRqlU7s1yXt07V9emnuyvTGotWpfZtWcdnBZ4f26hifdWvfJv6fM291DO3eIfXq2DatWLchvb5wVdqvW4c07sNlqV+ntunUvbulNRs2pVfmr0xvLl6dOrdtnUb165oGdmnXYnanzGXrNqanZy9PH69al0b26RLXkGttw8ZN0ZaJ81emjm1abbO8lnw7ddHqNGHeyrRH+zbpyN6dP1X/0nUb03NzlqeZy9el/l3ahV8cq3zfr9u4Kb00b2V6c9Hq1LdzuzS6X5fUo32bWMhfXbAqfJ53WC6PzVq+Lh
3Zu1NcuZkwd2U6qGeHaP9bi1enEb07pyP27BQZ72MfLkvrN21Kpw6oxi/3Z1v9XbV+Y/j20F6d0vg5y9OclevTyL06p4P36Bi7NOcZo/eXrUn3zViaendsk47dq0t8p73PfbwiTVu2Nu3VsW06sV/X+L6Rr9WXa9RfA4sfs2qC2CRZs2FjXIr4rGbiN/8zmf/gmZkxed9Zsjr989sL0oZNm9LkhavTvNXrA1xfnLsizV65Nsq4/t2F6f98/sP0/06ak95duia9u3R1Wrhmffq/X/gwJmO2d5asSf9t4uz4sU0990xfkv6Pp2YEkLy9eHX6yZMz0tTFq9NbS1bH5R3X4tXzwbI1cf6vpi5Ij85a9omdwZ9PmhNgwCxYfzx+VrQdeAAIoGYX8Ku3FqRV6zelSfNXpase+SBNX161vbktXbsx/fjJGenuaUvi/X988aN02/uLmy5PA9p/fHN+9Jd/gOV3HpuWXlmwarvX932v3N9/emaav2p9LAb/4ukZUU7tJR/l3TVtSVq1YWO684PF6ZpHPgjQrn6jSOm/vDQ7/a9X58aC8sCMJelfPzMrrd5Q/Xbxnyd8FJe2sk2ctzL95Wtz473fN/7LSx+lf/Pch+n6dxalxWs3pN9/akb65Zvz0x+Onxlj98LHK9L3H5+elqzdsEP9tTj/4WafPzNnRZq7el36wRPTA4DzZaP3lqxJ81atTy/NXZGmLlqTuEkc/eDx6emmdxdFHD80c2m68pH3I7Ya+XeSklF/zUwwP/rhsjSgS7vIsnfW1m7YmH768py0Z4cqNFxz/tMj+kZmPG7W0nT2Pt3Tvz6sT3wH7FwW+f1DeqeXF6xMZw3sni4fskdTO+DCB8vWpjvO2j/179wuPv9o5br4/BNt3gyuTNb4F6/OTX9+3IDIANmsFetSl3at4zq5Ce7Shzblepxa/fvJMmvfT1u+Nv3rw/ZKVw3do+kOjd8f3icWM3diqP+HT05Pz8xengZ36/Apn949fXG8/+vRA1OH1q3SN4ftkb73+PSmYz5YujZd+87CdN3pg5sy+9++vTD979fnpl+esu82fb583cYAzT8b2T92KZYwQOzHxNz+v359bhqxZ6f0n47uH+11zRtI/u2UeeGrSfNXpmfnrEi3nbV/6tWhTfQHwNq1wLfm2X9zn61evynt3aVd+g9H9UvtW7dKe3Rom/7rS7PT9acPTsf37RIgf+lD78eCZme0/f5uCtC9eL+e6cLBPcLnXdu2iR9fT+zbJZ27b49YrB+ZtTT9/47p37Qr+MUb89PAru3TT48bELFl8f7vk+aEf34+ZtDXXnd6a1aA+mtkAn3minXpbybPS//9uAFxGWJnzcQ2EV0mYC6BtGuz5ZLK//fKx+mYPp1jm2q7myfY1uyqIXsESO/otnX6sjWRrdl653Py5ZvPk1EN6d4hruPX/ri6Z8c2cXeFbfnKDRtT+9att3qXxVMfLU9nD+oeIK1dvTu2jf/LHNmL81akrm1bR4ZrJ8C4zaUil1i2ZTNXrI1Fj3+VbR90+t7dUr/Ni5vMEgj/7UmDmi4XmbgXD+6R/vOLswNEZarH9e0SIB1ltEpxCWpHzeWkS/brGZe92EE9O6aDe3ZMx7gc0apVat86pcHd2qfZK9ftcH8Bv35knx/Sq2MAs0WkpdDUjyc+Wpb+bOSA1H5zO/ymfeHgHpHh80OXdi3vEr/uAF6A+mtiQAzI/J/PfZi+OaxXZKOf5ZqeSQWEXVtsbn4sVKIsTtb7J0fsFRng9gBgZ9ph2x6Lww78gLkz1jbAq9Un/PXgzKXp76fMT3t0aBNZussBwKS5gZ1FazekXjXXSZvfO833c1atj0sYuRrrigxeXwDx1izusmnT6hN9ri0d8Ln7w/Xmpu9bpch6XeKCi4vWrA+Q/qymttr+VPd7f/IY/sv3jm+vv7nM2kKdv60li4/sLlx3r7Ue7dvE5Zl127gB5ut+UaQA9dfA8g9d/+75D9OI3p3S9w7ac6
duy9tRk82dNah7ZNyPzFqW/q/nP0z3njt0pwBCu0xoP0TltrvOna1buzYxWQFQu9ZtdmhWyrrWbqiuo0eG74eu7WT
"text/plain": [
"<Figure size 372.15x526.2 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ref: \n",
"2. Estructura del documento\n",
"En esta sección se describe con mayor profundidad la estructura y los contenidos\n",
"esperados en cada apartado de tu TFE.\n",
"Léela con detenimiento y compárala con la programación semanal que encontrarás\n",
"en el aula virtual, pues en cada borrador deberás entregar completados diferentes\n",
"apartados que se explican a continuación, y que se elaboran de una manera no\n",
"necesariamente lineal.\n",
"Como ya se ha mencionado, la memoria debe estar estructurada en capítulos. Por\n",
"norma general, la estructura de capítulos suele reflejar la línea de discurso del\n",
"trabajo, empezando por una introducción donde se plantea el problema, seguida de\n",
"un estudio de la literatura donde se estudia y describe el contexto. Posteriormente\n",
"se establecen claramente la hipótesis de trabajo y los objetivos concretos de\n",
"investigación, así como la descripción de la metodología seguida para alcanzar los\n",
"objetivos. Posteriormente se describe la contribución del trabajo, seguida de una\n",
"evaluación de la misma. La evaluación da pie a la elaboración de las conclusiones,\n",
"que deben relacionar los resultados obtenidos con los objetivos planteados\n",
"inicialmente. Finalmente, se describen las líneas de trabajo futuro necesarias para\n",
"seguir avanzando hacia la consecución de los objetivos.\n",
"A continuación, te dejamos algunos consejos generales sobre cómo organizar los\n",
"capítulos, pero ten en cuenta que cada trabajo es único y esta organización es una\n",
"guía general adaptable. El director específico de tu TFE podrá aportarte consejos\n",
"sobre cómo organizar la memoria adaptándote al contexto de tu trabajo concreto.\n",
"Como recomendación general, la estructura de capítulos de tu memoria debería ser\n",
"similar a la siguiente propuesta:\n",
"© Universidad Internacional de La Rioja (UNIR)\n",
" Organización del trabajo en grupo (solo en trabajos grupales)\n",
" Capítulo 1 – Introducción\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"9\n",
"Máster Universitario en Inteligencia Artificial\n",
"paddle_text: \n",
"2.E Estructura del documento\n",
"En esta sección se describe con mayor profundidad la estructura y los contenidos\n",
"esperados en cada apartado de tu Tfe.\n",
"Léela con detenimiento y compárala con la programación semanal que encontraras\n",
"en el aula virtual, pues en cada borrador deberás entregar completados diferentes\n",
"apartados que se explican a continuación,y que se elaboran de una manera no\n",
"necesariamente lineal.\n",
"Como ya se ha mencionado, la memoria debe estar estructurada en capítulos. Por\n",
"norma general, la estructura de capitulos suele reflejar la linea de discurso del\n",
"trabajo, empezando por una introducción donde se plantea el problema, seguida de\n",
"un estudio de la literatura donde se estudia y describe el contexto. Posteriormente\n",
"se establecen claramente la hipótesis de trabajo y los objetivos concretos de\n",
"investigación, así como la descripción de la metodología seguida para alcanzar los\n",
"objetivos. Posteriormente se describe la contribución del trabajo, seguida de una\n",
"evaluación de la misma. La evaluación da pie a la elaboración de las conclusiones,\n",
"que deben relacionar los resultados obtenidos con los objetivos planteados\n",
"inicialmente. Finalmente, se describen las líneas de trabajo futuro necesarias para\n",
"seguir avanzando hacia la consecución de los objetivos.\n",
"A continuación, te dejamos algunos consejos generales sobre cómo organizar los\n",
"capítulos, pero ten en cuenta que cada trabajo es único y esta organización es una\n",
"guía general adaptable. El director especifico de tu TFE podrá aportarte consejos\n",
"sobre cómo organizar la memoria adaptándote al contexto de tu trabajo concreto.\n",
"Como recomendación general, la estructura de capítulos de tu memoria debería ser\n",
"similar a la siguiente propuesta:\n",
"© Universidad Internacional de La Rioja (UNiR)\n",
"Organización del trabajo en grupo (solo en trabajos grupales)\n",
"Capítulo1– Introducción\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"Máster Universitario en Inteligencia Artificial 6\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAH3CAYAAACSIBV+AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAsYtJREFUeJzt/Qd0XUeWpolueEsSIAB677333kukDCmJ8qlUuqrK6Z7pWTOre6ZedXX3vHpTNd1dVbOyqrIyOzOlTGXKkBQ9KXrvvffeW9DAG8K99e3DAA+vABAkIfJQd/9rYd17z4kTPv7YEYj9n4iKiooKMRgMBkNgEfm8M2AwGAyGmmFEbTAYDAGHEbXBYDAEHEbUBoPBEHAYURsMBkPAYURtMBgMAYcRtcFgMAQcRtQGg8EQcBhRGwwGQ8BhRG0wGAwBhxG1wWAwBBxG1AaDwRBwGFEbDAZDwGFEbTAYDAGHEbXBYDAEHEbUBoPBEHAYURsMBkPAYURtMBgMAYcRtcFgMAQcRtQGg8EQcBhRGwwGQ8BhRG0wGAwBhxG1wWAwBBxG1AaDwRBwGFEbDAZDwGFEbTAYDAGHEbXBYDAEHEbUBoPBEHAYURsMBkPAYURtMBgMAYcRtcFgMAQcRtQGg8EQcBhRGwwGQ8BhRG0wGAwBhxG1wWAwBBxG1AaDwRBwGFEbDAZDwGFEbTAYDAGHEbXBYDAEHEbUBoPBEHAYURsMBkPAYURtMBgMAYcRtcFgMAQcRtQGg8EQcBhRGwwGQ8BhRG0wGAwBhxG1wWAwBBxG1AaDwRBwGFEbDAZDwGFEbTAYDAGHEfVzQkVFhdy5c0euX78uZWVlNYY7ffq05Ofn6/fQe+fPn5f8/LwqnysoyJfLly9LcXHxI/OSnZ0tV69elZKSkm+lExq2NvdrClNb1BRPaWmpnDp1ssZ0ioqK5Ny5cw+F4dqZM2eeqowGw7OGEXUdI7ekTErKHk1m+/btla+//lq2bNmsRAyBODKGLAsLC5VgIaSrV6/IyZMn5eDBg/qb++Xl5fp99+5dcvv2bb2Wm5sr5eUe6RPP9u3bZd26tTJjxoxq8+Mmgs8//1x27Nguhw4d0omDuPjkj3yQN9Lctm2rkh/fyXNBQYF+v3fvXuW1b75ZLHl5eRo3z3HNkR/h+CTvboLiOcJzzXsmT8Nt2rRJLl26pPf98XCPiWXNmjX6m7oiH66MfBYXF8nNmzdl8+bNeo3neS4nJ0e2bt1aGY5rpE0art5v3Lghq1evrsyju28wPC9EP7eUv6eYeyZL1l3Nlf+jT2PpmhovkRER3woDQWzatFl+8IMfSMOGDZVkVqxYIdeuXZWJEyfJxo0bJCYmRrKzc+Sdd96Ra9euy6lTpyQrK1vi4uKUKAcNGlRJQkVFxfLFF1/oM0lJSTJ16lSJiIiQsWPHqZU8f/58DefIyCE6OloiIyNlw4YN8tprr0mrVq2UuGbOnCEREZESFRUpnTp1ks2btwjFIG+kmZaWJuPGjZcNG9ZrWYYOHSY7duyQ9u3bK8GvWbNWkpOTJSEhUY4fP67lmz79LcnIaCSrVq2SXr166XXiJk3KnpubI/Bsly5dZP/+fdK9ew/ZtGmjnD9/TkaOHCV79+6VGzeuy3vvvSdz586T6OgouXv3rtYL4SDUsWPHSseOnZTEv/rqS60Dyrh79261vplwyLcDz6xdu0brqHfv3nLhwkVNg/Lt2bNH2rRpo3kpL6+QevXqyeTJkzVOg+FZwyzqOkZBabl8fvKO/HzjRbmS/4AU/XBWM6QLoqKilNhKSkp1qwLLcerUadKgQQPdGoHEunbtKkOHDlUixiosKyuVnJxsfR6iSUxMVBK7ePGCxg2hQLoQ46RJEzUcJDp//jy15GfP/louXryo10kvPj5ev7MdgyVJXLdu3ZLbt+9I7969ZPjw4ZpO167dZOTIkXLy5AmND0K/c+e2NG3aVObNm6v3unTpLIMHD9FVwx
tvvCE9e/aUY8eOa/zt2rVTS//cubPSuHFjzSNWemxsrGRm3tR8k5/69etrmSFpSDIxMUHzw7ZFQkK8vP32O1K/fgMl4QkTJsj48RNk3779mgZxM2lQh7GxcWqZU8d5eflad34kJ9fTe2fPnlWr+5VXXpVRo0brZAJh799/QOLiYnVVY1a14XnBLOo6RkpclFrT/3OPDGmeFFNlGKzYtm3bysqVK6Vt2zZy926Wbj9gXQMI4/DhQ7qlAUmJREhMTKxcuHBBWrZsqUt6tkGKi++p1QrJs2Vx7NgxiYuLVysSUoGQIaHIyCidHCC/d99971v56datq1q1WJWQJJY7Fi+kyWRCfMRDHDEx0XLp0mVp0CBF0tPTpW/ffkpobHd07NhRLWDSvnbtmiQlJcvRo0d1H33gwAEaH0Q9a9Ys6dWrp8aNVduoUWO93r17d32GONetWydNmjTRrY+8PLZhyiUxMUnLS70cOXJE9+DbtWurkwDxEB7Uq1dfLWLKXVpaIk2aNJbmzZtLr169tQ6cUQxp79q1S614tjcoIxMG5aVNqIuMjHTp3LmL9O4dr+1mMDwPRFTYf03qFMVl5RIdGaFLlZqWyViSBw4ckJKSe2qlsoSHBFu0aKkE26NHD7VSIXSsbKzP/fv3q3WK9QcxQZCQS2pqqly5ckWtb57jN81KeLYdsLYhYcJWBSzow4cPq+XO81lZWXLx4iW1aN0zfJJniIwJgfhOnDih8Xfu3FmfTU/PUIu8oqJcre3WrdvIwYMHND+UEaIjX7/97W/lpZcmSZs2bfU3ljJxNmrUSMMSb6dOHdUaZm+eCeD48WNKwN5kcF7JOzW1obRo0ULLybYPVjCf1A0TGaCOmACpa+qB7RaIngmPtA8dYt/fI3lv0mikcezbt0/JHaI/e/actGnTWlq1am1bH4bnAiPqgAHywMrr06ePkuL3DZAhWx+DBw9Wy9VgMDwaRtQGg8EQcNimm8FgMAQcRtQGg8EQcBhRGwwGQ8BhRG0wGAwBh/3b/QWAO0FZ1XG/mu49bXp1He93ieryXJv6eVSYJ63jF7EeDcGEWdR1jNoeS+dMMm7XOHaEesv5gfMILuWcT+aZ0LQ46nb79q0q84ETDU41eBTWlC/OUeOivWLFcnVPx9V6zZrVlfdxXnFejP5jduhi8Cxnjr9rrz1PZMrTFanuvtPn8IOjjjgIVQfKgR4Kzi3VgXbi7HVVaVYlluXg1eOaatN9FvVm+H7AiLqOcSanWK7ke8JD1YF7kCIEkpGRoZoVuEbjxYf7N6JAOHrglcdvPPIQTPrmm29UxwIHEa7zeeXKZcnPL9BncOqA9F3anlNMw0qtj+rygn4H6eMdiBs4jjN4JuL9xwSBNcgf3nt4QJIu9z/77DN1juGeUwGE0CBUvCjxroQAiQOHGieqRFgIlXI7ort+/ZrGyaRy4sRxyczMVBLDuQUHGCaF3//+U3VQYcLiGnXk0jxy5LAcPXpE4+UZ0oMocQQiDeI7cGC/1h9hSAOnGK7jOENalA3nIicgRZpcI17ioL347QSgyP+vfvUrfYZ75In6d3kjfVePd+96E+2xY0cr1fswskkXBybChE7EBoODbX3UMVZdzpVfHLwp/753Y/mwY0NJjPn2XMjgxBPxz//8L9T9G0KARBjUkBkiSbhTQ3RvvTVdCSUlpYEUFHjKbnj7oUeBjgc6FJDl5s2bpHfvPipG9Gd/9ufqSZiQkKDWIt57ACLAMsZzELd0vPFIH6/A6dOnq0t4p06dNQxCUBAyzw8aNFjziIYG3obkb8CAAeod6FYGxDNs2DC9161bNyW0Fi2ay9Gjx5Qc8XIkLG7j27dvU7dsrPjx48drPhGV6tmzlz4/ZswYXQmg38EEhQ4JYkl4OyJAhXYHhAhhQpBYpnhUQroQJBojuKVTx4CwS5culb59+8qXX34hAwYMVHJlxYEOCiRJ2ZmIVq1aKT//+f+keVq8eJHqiTDp4Hnpucl30vDvvv
vufZXBHC0X9xo39uqTiYk8oXuC0NatW5myfv06eeedd9U9f+fOHdqu27fv0HZg8klJSdXVzCuvvGLbJIZvwSzqOgb
"text/plain": [
"<Figure size 372.15x526.2 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ref: \n",
"Capítulo 2 – Contexto y estado del arte\n",
" Capítulo 3 – Objetivos y metodología de trabajo\n",
" Capítulo 4 – capítulo de desarrollo de la contribución, título del capítulo\n",
"dependiendo de la tipología del trabajo\n",
" Capítulo 5 – capítulo de desarrollo de la contribución, título del capítulo\n",
"dependiendo de la tipología del trabajo\n",
" Capítulo 6 – capítulo de desarrollo de la contribución, título del capítulo\n",
"dependiendo de la tipología del trabajo\n",
" Capítulo 7 – Conclusiones y trabajo futuro\n",
"2.1. Resumen\n",
"El resumen se redacta en último lugar ya que recoge las contribuciones más\n",
"importantes del trabajo. Es necesario tener muy clara y completa del documento para\n",
"poder resumirlo correctamente.\n",
"Tendrá una extensión de 150 a 300 palabras y deberá ofrecer una visión global de lo\n",
"que el lector encontrará en el trabajo, destacando sus aspectos fundamentales.\n",
"Deberás indicar claramente cuál es el objetivo principal del trabajo, la metodología\n",
"seguida para alcanzarlo, los resultados obtenidos y la principal conclusión alcanzada.\n",
"A continuación, indicarás de 3 a 5 palabras clave o keywords como descriptores del\n",
"trabajo que lo enmarcan en unas temáticas determinadas. Serán los utilizados para\n",
"localizar tu trabajo si llega a ser publicado.\n",
"© Universidad Internacional de La Rioja (UNIR)\n",
"Instrucciones para la redacción y elaboración del TFE\n",
"10\n",
"Máster Universitario en Inteligencia Artificial\n",
"paddle_text: \n",
"Capitulo 2 – Contexto y estado del arte\n",
"Capítulo 3 – Objetivos y metodología de trabajo\n",
"Capítulo 4 – capítulo de desarrollo de la contribución, título del capítulo\n",
"dependiendo de la tipología del trabajo\n",
"Capítulo 5 – capítulo de desarrollo de la contribución, título del capítulo\n",
"dependiendo de la tipología del trabajo\n",
"Capítulo 6 – capítulo de desarrollo de la contribución, título del capítulo\n",
"dependiendo de la tipología del trabajo\n",
"Capítulo 7 – Conclusiones y trabajo futuro\n",
"2.1. Resumen\n",
"El resumen se redacta en último lugar ya que recoge las contribuciones más\n",
"importantes del trabajo. Es necesario tener muy clara y completa del documento para\n",
"poder resumirlo correctamente.\n",
"Tendrá una extensión de 150 a 300 palabras y deberá ofrecer una visión global de lo\n",
"que el lector encontrará en el trabajo,destacando sus aspectos fundamentales.\n",
"Deberás indicar claramente cuál es el objetivo principal del trabajo, la metodología\n",
"seguida para alcanzarlo, los resultados obtenidos y la principal conclusión alcanzada.\n",
"A continuación, indicarás de 3 a 5 palabras clave o keywords como descriptores del\n",
"trabajo que lo enmarcan en unas temáticas determinadas. Serán los utilizados para\n",
"localizar tu trabajo si llega a ser publicado.\n",
"© Universidad Internacional de La Rioja (UNIR)\n",
"Instrucciones para la redacción y elaboración del TFE 10\n",
"Máster Universitario en lnteligencia Artificial\n"
]
}
],
2025-11-17 10:52:00 +00:00
"source": [
2025-12-06 21:15:49 +01:00
"from itertools import islice\n",
2025-11-17 10:52:00 +00:00
"\n",
2025-12-06 21:15:49 +01:00
"results = []\n",
"for img, txt in islice(dataset, 5, 10):\n",
" image_array = np.array(img)\n",
" out = paddleocr_model.predict(\n",
" image_array,\n",
" use_doc_orientation_classify=False,\n",
" use_doc_unwarping=False,\n",
" use_textline_orientation=True\n",
" )\n",
" show_page(img, 0.15)\n",
" print(f\"ref: \\n{txt}\")\n",
" paddle_text = assemble_from_paddle_result(out)\n",
" print(f\"paddle_text: \\n{paddle_text}\")\n",
" results.append({'Model': 'PaddleOCR', 'Prediction': paddle_text, **evaluate_text(txt, paddle_text)})\n",
2025-11-17 10:52:00 +00:00
" "
]
},
{
"cell_type": "markdown",
"id": "0db6dc74",
"metadata": {},
"source": [
"## 5 Save and Analyze Results"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 24,
2025-11-17 10:52:00 +00:00
"id": "da3155e3",
"metadata": {},
2025-12-06 21:15:49 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Benchmark results saved as ai_ocr_benchmark_finetune_results_20251206_200806.csv\n",
" WER CER\n",
"Model \n",
"PaddleOCR 0.104067 0.012581\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAArwAAAIVCAYAAAAzqSxlAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAQKNJREFUeJzt3Qd4VNX29/EVEkpAKdJ7Db1KRxAQNCoWbBRFqgpIFQUCArEg4FUQMRTFi4iXLogICAIConQCKkpVSi5IE+kQIJn3Wfv/ztyZZBKSkMyEne/nec4lZ8+Zkz0nk+sve9bZO8DhcDgEAAAAsFQmf3cAAAAASEsEXgAAAFiNwAsAAACrEXgBAABgNQIvAAAArEbgBQAAgNUIvAAAALAagRcAAABWI/ACAADAagReAPCh6dOnS0BAgGzbtu22uO6dO3eWO+6445bO8fLLL8v999+fan3CrWnQoIEMGjSIy4gMhcALWGzSpEkmXNWvXz/BY/Tx3r17J+l8169flwkTJkjdunXlzjvvNEFIv9Y2fcybmJgY+eyzz6RZs2Zy1113SdasWaVUqVLSpUsXj9DnDILOLSgoSIoWLWoC19GjR5PUvzfeeMPjHJkyZZLChQvLI488Ips2bUrSOZC6Dh48KJ9++qkMHTrU7J88edL8bPr16xfvWG3Tx8LDw+M91rFjR8mcObNcvnzZ7Ov7wv1n7b5ly5bN9by1a9d6PBYYGCgFChSQp59+Wnbv3p2s17J06VLzfs+RI4d5Xz311FPy+++/J/uanDhxQl577TWpWLGiZM+e3Zyvdu3aMnLkSDl79qzrOP2dSeg16nNT+rszePBgmThxohw/fjzZfQduV0H+7gCAtDNz5kwTLrds2SIHDhyQcuXKpfhcly5dklatWsm6detMgNT/mGqgXL58uQkqCxcuNIFA/+PtdOXKFXnyySfNMffee68JPRp6Dx06JPPmzZPPP/9cjhw5IsWKFXM956233pLSpUvL1atXTUjV/5j/+OOPsmvXLo8gk5jJkyebMB4bGytRUVEydepU8/31OtSsWTPF1wDJ9+GHH5qfZ/Pmzc2+hs2QkBDzM43rp59+MmFN//X2WK1atUxAdNI/njRMx6WhNq6+ffuasKp/mP3yyy8yZcoUE4b1fVWoUKGbvo6tW7fK448/LlWqVJF//etfcv78eVmyZIlpr1y5cpKuhfM8Dz/8sFy8eFE6dOhggq7SP/7GjBkjP/zwg3z33Xeu4/V3Y/To0fHOkytXrnhtSf3d0deRM2dO8wexPgfIEBwArPTnn3869Fd84cKFjvz58zveeOMNr8fpMb169brp+V566SVz7EcffRTvsYiICPNYjx49PNr1vNr+wQcfxHvOjRs3HO+9954jKirK7H/22Wfm2K1bt3ocN3jwYNM+d+7cm/YxPDzcHHvq1CmP9l27dpn2oUOHOvwtodeZ3ly8eNH826lTJ0eOHDlSdI5r16458uXL5xg2bJhHe5cuXRyBgYGOCxcueHy/oKAgx7PPPuu44447zPvD6dixY+aavfLKK662pPZrzZo15rnz58/3aJ88ebJpf/fdd5P0WgYNGuQICAhwHD9+3KP96tWrjqT6559/HEWLFnUULFjQsXv37niP67nffvtt137Tpk0dVapUuel5U/K707t3b0fJkiUdsbGxSe4/cDujpAGweHQ3T548ZlRWP77V/ZT673//K//+97/lvvvu81r+0KtXLzOCp6NteqzzOR9//LGp3ezfv7/XUTj9WNd9dNebJk2amH//+OOPFPffOYKno4fuoqOjzcfnOvKto4XFixc3tY3a7q3sY9GiRVK1alVzrI706ch1XPoRcrdu3aRIkSLmOB1x69mzp1y7di3e9x4wYIDkz5/fjIo/8cQTcurUKY9jdHReR9N1JLJOnToSHBws1apVM/tKR9V1X0fvdKRwx44dHs/XkUwdiS9Tpow5Rq9D165d5e+///ZaCq
Ifzz/77LPmfdO4ceMEr+fOnTtNv/Ujdx2pTIiOLp4+fVpatmzp0a7n1lIX9zKTzZs3y40bN8x7Qs+p38PJOeKbWJ+SK7nvK/00wxv9GSeV/j7o+2PcuHEeJQlOBQsWlGHDhokvXqP+Xh4+fNjjOgM2I/ACltKAq+UEWbJkkfbt28v+/fvNx6kp8e2335qAonWUCdHHNLA4Q6A+R/eff/55uRVa/qA0hCXVmTNnTNDSelENgS+++KIJfG3atHEdo+UOjz32mLz//vvy6KOPykcffSStW7eWDz74QNq2bes1vOnNV+3atTMfaevHxlrD6R4ejx07JvXq1ZM5c+aYc2hts75+LQNx1p469enTR37++WcTuDUQf/PNN17/mNBSFA2h2kf9aPuff/4xX+vP95VXXjEfi7/55psm1Ojr09fltHLlSvnzzz9NvbS+Pu279k0/Uv+/wX1PzzzzjOnnqFGjzDXzRt9D+oePlhfozzixG9o2bNhggrQe684ZXN3LGjTUli9f3hyrfwS5lzUkFnj15xx303KD1H5f6c9R/0jTa+7t2iXF4sWLzR8t+gdoUunvnbfXqCVGt/IanaUU3spHACv5e4gZQOrbtm2b+Shz5cqVZl8/tixWrJijX79+KSpp6N+/vzlux44dCR4TGRlpjhkwYIDZ14+fb/Ycbx/Lrlq1ypQkaKnDl19+acoxsmbN6ip9SEpJQ9wtd+7cjuXLl3sc+8UXXzgyZcrkWL9+vUf7lClTzHN++uknV5vuZ8mSxXHgwAFX288//xyvxKNjx47mnN7KFZwfHTtfZ8uWLT0+TtbrpR/znz171tWmHznrsRs2bHC1rVixwrQFBwc7Dh8+7Gr/+OOPTbt+hO90+fLleP2YPXu2Oe6HH36Id93at28f73j30oEff/zRkTNnTkerVq2S9FF+hw4dHHnz5vX6WIECBRwtWrRw7YeGhppSB9WmTRvHM88843qsTp06jpCQkHj98vaz1k3PFbekYdq0aeZ9peUR+l4oV66cKVHYsmWLIykWLVrkyJ49u/kZOd/jyZUnTx5HjRo1kny8ljQk9Bq7d+9+y787+p7u2bNnil4LcLvhpjXAQjr6px+POm8U0lE2HXH8z3/+I2PHjvV6U09iLly4YP7VmRkS4nzMObrm/Dex53gT9+Nv/Vhf+32z0gd3CxYsMDflaFbVj5D1JjYdjdWbgRo1amSOmT9/vlSqVMl8tKwjZk46eqnWrFnjOtbZr7Jly7r2q1evbr6HjqAqHVnVkgcdfdXyg7j0Z+DupZde8mjTj591dFk/ZtZzO+kNUQ0bNnTtO2fc0H6WKFEiXrv2R0sNlI4mOumItJYK6JRUKjIy0vWRt1OPHj0SvKZ6PfS1PfDAA2aUWD85uBkd/U5oBPWee+4xI9A6gqnXQcsbdLTd+di7775rvtYRZ/3Y3dsnBTpqryPjceXLly9em5ZyuNOSjC+++MLcyHYzekOZjp5rKYK+bv3Z6ftaS0GcQkNDTV/Xr1+f4Hn0dyK5vw/6/tebLuPy9vuQ3N8d/dm4v/cBmxF4ActogNBAomFXp4RyD0QadlevXm1CS3I4/yPtDL5JCcUaBm/2HG90uiT9aPvcuXMybdo0c9d6cuoklc7I4B569CNknRlAywi2b99u2rTEQ6el0uDjjZZDuHMPl+6BQUsMlNbfaqDRGt+kiHs+ZzB0ni+h45x352u9sbd29+draYeWO+j7Ie7r0esbl9Ybe6NhWWvB9WNwnV0jbi10YhL6+F/LE7766isTZnW6Me2PBl2lf2hoeYh+JK/vYS2N8VbOoH+4xQ15CRkxYoQJ+Br69fvqNUmoLjcuravV94/WqjunFRs+fLi55lrioH777TdTMpIY/Z1I7u+D1ncn9TUm93dHfzZx/xADbEXgBSzz/fffy19//WX+g66bt9Hf5AZeHQl13gSV0LRe+phyTtHkvCnn119/TdZUYFoD6xwh1ZpaDTpaw7p3794UL4Cgz9
PA//XXX5vaRw0ROiKrN3zpqJ03cQNlQqPiKa3nTOr5EjouKc/XUUmtox04cKD5GTinanvwwQc9an2d3EeE3Wlo0rp
"text/plain": [
"<Figure size 800x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
2025-11-17 10:52:00 +00:00
"source": [
"df_results = pd.DataFrame(results)\n",
"\n",
"# Generate a unique filename with timestamp\n",
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"filename = f\"ai_ocr_benchmark_finetune_results_{timestamp}.csv\"\n",
"filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
"\n",
"df_results.to_csv(filepath, index=False)\n",
"print(f\"Benchmark results saved as {filename}\")\n",
"\n",
"# Summary by model\n",
"summary = df_results.groupby('Model')[['WER', 'CER']].mean()\n",
"print(summary)\n",
"\n",
"# Plot\n",
"summary.plot(kind='bar', figsize=(8,5), title='AI OCR Benchmark (WER & CER)')\n",
"plt.ylabel('Error Rate')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "3e0f00c0",
"metadata": {},
"source": [
"### How to read this chart:\n",
"- CER (Character Error Rate) focus on raw transcription quality\n",
"- WER (Word Error Rate) penalizes incorrect tokenization or missing spaces\n",
"- CER and WER are error metrics, which means:\n",
" - Higher values = worse performance\n",
" - Lower values = better accuracy"
]
},
{
"cell_type": "markdown",
"id": "830b0e25",
"metadata": {},
"source": [
"# Busqueda de hyperparametros\n",
"https://docs.ray.io/en/latest/tune/index.html"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 25,
2025-11-17 10:52:00 +00:00
"id": "3a4bd700",
"metadata": {},
2025-12-06 21:15:49 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Python 3.11.9\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\"pip\" no se reconoce como un comando interno o externo,\n",
"programa o archivo por lotes ejecutable.\n"
]
}
],
2025-11-17 10:52:00 +00:00
"source": [
"!python --version\n",
"!pip --version"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 26,
2025-11-17 10:52:00 +00:00
"id": "b0cf4bcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2025-12-06 21:15:49 +01:00
"Requirement already satisfied: rich in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (14.2.0)\n",
"Requirement already satisfied: ray[tune] in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (2.52.1)\n",
"Requirement already satisfied: click!=8.3.*,>=7.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (8.2.1)\n",
"Requirement already satisfied: filelock in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (3.20.0)\n",
"Requirement already satisfied: jsonschema in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (4.25.1)\n",
"Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (1.1.2)\n",
"Requirement already satisfied: packaging in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ray[tune]) (25.0)\n",
"Requirement already satisfied: protobuf>=3.20.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (6.33.2)\n",
"Requirement already satisfied: pyyaml in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (6.0.2)\n",
"Requirement already satisfied: requests in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (2.32.5)\n",
"Requirement already satisfied: pandas in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (2.3.3)\n",
"Requirement already satisfied: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (2.12.5)\n",
"Requirement already satisfied: tensorboardX>=1.9 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (2.6.4)\n",
"Requirement already satisfied: pyarrow>=9.0.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (22.0.0)\n",
"Requirement already satisfied: fsspec in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ray[tune]) (2025.12.0)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3->ray[tune]) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.41.5 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3->ray[tune]) (2.41.5)\n",
"Requirement already satisfied: typing-extensions>=4.14.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3->ray[tune]) (4.15.0)\n",
"Requirement already satisfied: typing-inspection>=0.4.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3->ray[tune]) (0.4.2)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from rich) (4.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from rich) (2.19.2)\n",
"Requirement already satisfied: colorama in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from click!=8.3.*,>=7.0->ray[tune]) (0.4.6)\n",
"Requirement already satisfied: mdurl~=0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from markdown-it-py>=2.2.0->rich) (0.1.2)\n",
"Requirement already satisfied: numpy in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from tensorboardX>=1.9->ray[tune]) (2.3.5)\n",
"Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema->ray[tune]) (25.4.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema->ray[tune]) (2025.9.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema->ray[tune]) (0.37.0)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema->ray[tune]) (0.30.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from pandas->ray[tune]) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas->ray[tune]) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->pandas->ray[tune]) (1.17.0)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests->ray[tune]) (3.4.4)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests->ray[tune]) (3.11)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests->ray[tune]) (2.6.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests->ray[tune]) (2025.11.12)\n",
2025-11-17 10:52:00 +00:00
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Instalación de Ray y Ray Tune\n",
"%pip install -U \"ray[tune]\" rich"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 27,
2025-11-17 10:52:00 +00:00
"id": "f3ca0b9b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2025-12-06 21:15:49 +01:00
"2025-12-06 20:08:33,299\tINFO worker.py:2023 -- Started a local Ray instance.\n"
2025-11-17 10:52:00 +00:00
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2025-12-06 21:15:49 +01:00
"Ray Tune listo (versión: 2.52.1 )\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Sergio\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\ray\\_private\\worker.py:2062: FutureWarning: Tip: In future versions of Ray, Ray will no longer override accelerator visible devices env var if num_gpus=0 or num_gpus=None (default). To enable this behavior and turn off this error message, set RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO=0\n",
" warnings.warn(\n"
2025-11-17 10:52:00 +00:00
]
}
],
"source": [
"import ray\n",
"from ray import tune\n",
"from ray.tune.schedulers import ASHAScheduler\n",
"\n",
"ray.init(ignore_reinit_error=True)\n",
"print(\"Ray Tune listo (versión:\", ray.__version__, \")\")"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 28,
2025-11-17 10:52:00 +00:00
"id": "ae5a10c4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2025-12-06 21:15:49 +01:00
"2025-12-06 20:08:38,850\tINFO worker.py:1855 -- Calling ray.init() again after it has already been called.\n"
2025-11-17 10:52:00 +00:00
]
}
],
"source": [
"# ===============================================================\n",
"# 🔍 RAY TUNE: OPTIMIZACIÓN AUTOMÁTICA DE HIPERPARÁMETROS OCR\n",
"# ===============================================================\n",
"\n",
"from ray import tune, air\n",
"from ray.tune.schedulers import ASHAScheduler\n",
"import pandas as pd\n",
"import time\n",
"import colorama\n",
"from rich import print\n",
"import sys, subprocess \n",
"from rich.console import Console\n",
"\n",
"colorama.just_fix_windows_console()\n",
"ray.init(ignore_reinit_error=True)\n",
"\n",
"# Tell Ray Tune to use a Jupyter-compatible console\n",
"console = Console(force_jupyter=True)"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": null,
2025-11-17 10:52:00 +00:00
"id": "96c320e8",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"# --- Configuración base del experimento ---\n",
"search_space = {\n",
" \"textline_orientation\": tune.choice([True, False]),\n",
" \"text_det_box_thresh\": tune.uniform(0.4, 0.7),\n",
" \"text_det_unclip_ratio\": tune.uniform(1.2, 2.0),\n",
" \"text_rec_score_thresh\": tune.choice([0.0, 0.2, 0.4]),\n",
" \"line_tolerance\": tune.choice([0.5, 0.6, 0.7]),\n",
" \"min_box_score\": tune.choice([0, 0.5, 0.6])\n",
"}\n",
"KEYMAP = {\n",
" \"textline_orientation\": \"textline-orientation\",\n",
" \"text_det_box_thresh\": \"text-det-box-thresh\",\n",
" \"text_det_unclip_ratio\": \"text-det-unclip-ratio\",\n",
" \"text_rec_score_thresh\": \"text-rec-score-thresh\",\n",
" \"line_tolerance\": \"line-tolerance\",\n",
" \"min_box_score\": \"min-box-score\",\n",
"}"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 51,
2025-11-17 10:52:00 +00:00
"id": "accb4e9d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
2025-12-06 21:15:49 +01:00
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Notebook Python: c:\\Users\\Sergio\\AppData\\Local\\Programs\\Python\\Python311\\python.exe\n",
2025-11-17 10:52:00 +00:00
"</pre>\n"
],
"text/plain": [
2025-12-06 21:15:49 +01:00
"Notebook Python: c:\\Users\\Sergio\\AppData\\Local\\Programs\\Python\\Python311\\python.exe\n"
2025-11-17 10:52:00 +00:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
2025-12-06 21:15:49 +01:00
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">{</span>\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'CER'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.06378901032380213</span>,\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'WER'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.15152788564460193</span>,\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'TIME'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">339.62537813186646</span>,\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'PAGES'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5</span>,\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'TIME_PER_PAGE'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">67.83050785064697</span>\n",
"<span style=\"font-weight: bold\">}</span>\n",
2025-11-17 10:52:00 +00:00
"</pre>\n"
],
"text/plain": [
2025-12-06 21:15:49 +01:00
"\u001b[1m{\u001b[0m\n",
" \u001b[32m'CER'\u001b[0m: \u001b[1;36m0.06378901032380213\u001b[0m,\n",
" \u001b[32m'WER'\u001b[0m: \u001b[1;36m0.15152788564460193\u001b[0m,\n",
" \u001b[32m'TIME'\u001b[0m: \u001b[1;36m339.62537813186646\u001b[0m,\n",
" \u001b[32m'PAGES'\u001b[0m: \u001b[1;36m5\u001b[0m,\n",
" \u001b[32m'TIME_PER_PAGE'\u001b[0m: \u001b[1;36m67.83050785064697\u001b[0m\n",
"\u001b[1m}\u001b[0m\n"
2025-11-17 10:52:00 +00:00
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">return code: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>\n",
"</pre>\n"
],
"text/plain": [
"return code: \u001b[1;36m0\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
2025-12-06 21:15:49 +01:00
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">args: <span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'c:\\\\Users\\\\Sergio\\\\AppData\\\\Local\\\\Programs\\\\Python\\\\Python311\\\\python.exe'</span>, \n",
"<span style=\"color: #008000; text-decoration-color: #008000\">'c:\\\\Users\\\\Sergio\\\\Desktop\\\\MastersThesis\\\\paddle_ocr_tuning.py'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--pdf-folder'</span>, \n",
"<span style=\"color: #008000; text-decoration-color: #008000\">'c:\\\\Users\\\\Sergio\\\\Desktop\\\\MastersThesis\\\\dataset'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--textline-orientation'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'True'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--text-det-box-thresh'</span>, \n",
"<span style=\"color: #008000; text-decoration-color: #008000\">'0.46611732611383844'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--text-det-unclip-ratio'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'1.3598680409827462'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--text-rec-score-thresh'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'0.0'</span>, \n",
"<span style=\"color: #008000; text-decoration-color: #008000\">'--line-tolerance'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'0.5'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'--min-box-score'</span>, <span style=\"color: #008000; text-decoration-color: #008000\">'0.6'</span><span style=\"font-weight: bold\">]</span>\n",
2025-11-17 10:52:00 +00:00
"</pre>\n"
],
"text/plain": [
2025-12-06 21:15:49 +01:00
"args: \u001b[1m[\u001b[0m\u001b[32m'c:\\\\Users\\\\Sergio\\\\AppData\\\\Local\\\\Programs\\\\Python\\\\Python311\\\\python.exe'\u001b[0m, \n",
"\u001b[32m'c:\\\\Users\\\\Sergio\\\\Desktop\\\\MastersThesis\\\\paddle_ocr_tuning.py'\u001b[0m, \u001b[32m'--pdf-folder'\u001b[0m, \n",
"\u001b[32m'c:\\\\Users\\\\Sergio\\\\Desktop\\\\MastersThesis\\\\dataset'\u001b[0m, \u001b[32m'--textline-orientation'\u001b[0m, \u001b[32m'True'\u001b[0m, \u001b[32m'--text-det-box-thresh'\u001b[0m, \n",
"\u001b[32m'0.46611732611383844'\u001b[0m, \u001b[32m'--text-det-unclip-ratio'\u001b[0m, \u001b[32m'1.3598680409827462'\u001b[0m, \u001b[32m'--text-rec-score-thresh'\u001b[0m, \u001b[32m'0.0'\u001b[0m, \n",
"\u001b[32m'--line-tolerance'\u001b[0m, \u001b[32m'0.5'\u001b[0m, \u001b[32m'--min-box-score'\u001b[0m, \u001b[32m'0.6'\u001b[0m\u001b[1m]\u001b[0m\n"
2025-11-17 10:52:00 +00:00
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import sys, subprocess\n",
"print(\"Notebook Python:\", sys.executable)\n",
"# test paddle ocr run with params\n",
"args = [sys.executable, \n",
" SCRIPT_ABS, \n",
" \"--pdf-folder\", PDF_FOLDER_ABS, \n",
" \"--textline-orientation\",\"True\",\n",
" \"--text-det-box-thresh\",\"0.46611732611383844\",\n",
" \"--text-det-unclip-ratio\",\"1.3598680409827462\",\n",
" \"--text-rec-score-thresh\",\"0.0\",\n",
" \"--line-tolerance\", \"0.5\",\n",
" \"--min-box-score\",\"0.6\"]\n",
"test_proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
"if test_proc.returncode != 0:\n",
" print(test_proc.stderr)\n",
"last = test_proc.stdout.strip().splitlines()[-1]\n",
"\n",
"metrics = json.loads(last)\n",
"print(metrics)\n",
"\n",
"print(f\"return code: {test_proc.returncode}\")\n",
"print(f\"args: {args}\")"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": null,
2025-11-17 10:52:00 +00:00
"id": "8df28468",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2025-12-06 21:15:49 +01:00
"c:\\Users\\Sergio\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\ray\\tune\\impl\\tuner_internal.py:144: RayDeprecationWarning: The `RunConfig` class should be imported from `ray.tune` when passing it to the Tuner. Please update your imports. See this issue for more context and migration options: https://github.com/ray-project/ray/issues/49454. Disable these warnings by setting the environment variable: RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS=0\n",
2025-11-17 10:52:00 +00:00
" _log_deprecation_warning(\n",
2025-12-06 21:15:49 +01:00
"2025-12-06 20:56:49,361\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n"
2025-11-17 10:52:00 +00:00
]
},
{
"data": {
"text/html": [
"<div class=\"tuneStatus\">\n",
" <div style=\"display: flex;flex-direction: row\">\n",
" <div style=\"display: flex;flex-direction: column;\">\n",
" <h3>Tune Status</h3>\n",
" <table>\n",
"<tbody>\n",
2025-12-06 21:15:49 +01:00
"<tr><td>Current time:</td><td>2025-12-06 21:03:49</td></tr>\n",
"<tr><td>Running for: </td><td>00:06:59.77 </td></tr>\n",
"<tr><td>Memory: </td><td>7.3/15.9 GiB </td></tr>\n",
2025-11-17 10:52:00 +00:00
"</tbody>\n",
"</table>\n",
" </div>\n",
" <div class=\"vDivider\"></div>\n",
" <div class=\"systemInfo\">\n",
" <h3>System Info</h3>\n",
2025-12-06 21:15:49 +01:00
" Using AsyncHyperBand: num_stopped=0<br>Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -0.09990138305449689<br>Logical resource usage: 2.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)\n",
2025-11-17 10:52:00 +00:00
" </div>\n",
" \n",
" </div>\n",
" <div class=\"hDivider\"></div>\n",
" <div class=\"trialStatus\">\n",
" <h3>Trial Status</h3>\n",
" <table>\n",
"<thead>\n",
2025-12-06 21:15:49 +01:00
"<tr><th>Trial name </th><th>status </th><th>loc </th><th style=\"text-align: right;\"> line_tolerance</th><th style=\"text-align: right;\"> min_box_score</th><th style=\"text-align: right;\"> text_det_box_thresh</th><th style=\"text-align: right;\"> text_det_unclip_rati\n",
2025-11-17 10:52:00 +00:00
"o</th><th style=\"text-align: right;\"> text_rec_score_thres\n",
"h</th><th>textline_orientation </th><th style=\"text-align: right;\"> iter</th><th style=\"text-align: right;\"> total time (s)</th><th style=\"text-align: right;\"> CER</th><th style=\"text-align: right;\"> WER</th><th style=\"text-align: right;\"> TIME</th></tr>\n",
"</thead>\n",
"<tbody>\n",
2025-12-06 21:15:49 +01:00
"<tr><td>trainable_paddle_ocr_b3bdc_00002</td><td>RUNNING </td><td>127.0.0.1:10864</td><td style=\"text-align: right;\"> 0.6</td><td style=\"text-align: right;\"> 0 </td><td style=\"text-align: right;\"> 0.532972</td><td style=\"text-align: right;\">1.9115 </td><td style=\"text-align: right;\">0.4</td><td>False </td><td style=\"text-align: right;\"> </td><td style=\"text-align: right;\"> </td><td style=\"text-align: right;\"> </td><td style=\"text-align: right;\"> </td><td style=\"text-align: right;\"> </td></tr>\n",
"<tr><td>trainable_paddle_ocr_b3bdc_00003</td><td>RUNNING </td><td>127.0.0.1:11400</td><td style=\"text-align: right;\"> 0.7</td><td style=\"text-align: right;\"> 0.5</td><td style=\"text-align: right;\"> 0.6633 </td><td style=\"text-align: right;\">1.69526</td><td style=\"text-align: right;\">0.4</td><td>False </td><td style=\"text-align: right;\"> </td><td style=\"text-align: right;\"> </td><td style=\"text-align: right;\"> </td><td style=\"text-align: right;\"> </td><td style=\"text-align: right;\"> </td></tr>\n",
"<tr><td>trainable_paddle_ocr_b3bdc_00000</td><td>TERMINATED</td><td>127.0.0.1:19504</td><td style=\"text-align: right;\"> 0.7</td><td style=\"text-align: right;\"> 0 </td><td style=\"text-align: right;\"> 0.565297</td><td style=\"text-align: right;\">1.28249</td><td style=\"text-align: right;\">0.2</td><td>True </td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 399.525</td><td style=\"text-align: right;\">0.0639132</td><td style=\"text-align: right;\">0.148775</td><td style=\"text-align: right;\">376.277</td></tr>\n",
"<tr><td>trainable_paddle_ocr_b3bdc_00001</td><td>TERMINATED</td><td>127.0.0.1:18012</td><td style=\"text-align: right;\"> 0.7</td><td style=\"text-align: right;\"> 0 </td><td style=\"text-align: right;\"> 0.610761</td><td style=\"text-align: right;\">1.78824</td><td style=\"text-align: right;\">0 </td><td>True </td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 386.487</td><td style=\"text-align: right;\">0.13589 </td><td style=\"text-align: right;\">0.304316</td><td style=\"text-align: right;\">362.611</td></tr>\n",
2025-11-17 10:52:00 +00:00
"</tbody>\n",
"</table>\n",
" </div>\n",
"</div>\n",
"<style>\n",
".tuneStatus {\n",
" color: var(--jp-ui-font-color1);\n",
"}\n",
".tuneStatus .systemInfo {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
".tuneStatus td {\n",
" white-space: nowrap;\n",
"}\n",
".tuneStatus .trialStatus {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
".tuneStatus h3 {\n",
" font-weight: bold;\n",
"}\n",
".tuneStatus .hDivider {\n",
" border-bottom-width: var(--jp-border-width);\n",
" border-bottom-color: var(--jp-border-color0);\n",
" border-bottom-style: solid;\n",
"}\n",
".tuneStatus .vDivider {\n",
" border-left-width: var(--jp-border-width);\n",
" border-left-color: var(--jp-border-color0);\n",
" border-left-style: solid;\n",
" margin: 0.5em 1em 0.5em 1em;\n",
"}\n",
"</style>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
2025-12-06 21:15:49 +01:00
"2025-12-06 20:56:49,376\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00000_0_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5653,text_det_unclip_ratio=1.2825,t_2025-12-06_20-56-49\n",
"2025-12-06 20:56:49,380\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00000_0_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5653,text_det_unclip_ratio=1.2825,t_2025-12-06_20-56-49\n",
"2025-12-06 20:56:49,384\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00001_1_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.6108,text_det_unclip_ratio=1.7882,t_2025-12-06_20-56-49\n",
"2025-12-06 20:56:49,387\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00001_1_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.6108,text_det_unclip_ratio=1.7882,t_2025-12-06_20-56-49\n",
"2025-12-06 20:56:54,158\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00001_1_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.6108,text_det_unclip_ratio=1.7882,t_2025-12-06_20-56-49\n",
"2025-12-06 20:56:54,158\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00001_1_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.6108,text_det_unclip_ratio=1.7882,t_2025-12-06_20-56-49\n",
"2025-12-06 20:56:54,163\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00000_0_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5653,text_det_unclip_ratio=1.2825,t_2025-12-06_20-56-49\n",
"2025-12-06 20:56:54,163\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00000_0_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5653,text_det_unclip_ratio=1.2825,t_2025-12-06_20-56-49\n",
"\u001b[36m(trainable_paddle_ocr pid=19504)\u001b[0m [2025-12-06 20:57:24,408 E 19504 14856] core_worker_process.cc:837: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14\n"
2025-11-17 10:52:00 +00:00
]
},
{
"data": {
"text/html": [
"<div class=\"trialProgress\">\n",
" <h3>Trial Progress</h3>\n",
" <table>\n",
"<thead>\n",
"<tr><th>Trial name </th><th style=\"text-align: right;\"> CER</th><th style=\"text-align: right;\"> PAGES</th><th style=\"text-align: right;\"> TIME</th><th style=\"text-align: right;\"> TIME_PER_PAGE</th><th style=\"text-align: right;\"> WER</th></tr>\n",
"</thead>\n",
"<tbody>\n",
2025-12-06 21:15:49 +01:00
"<tr><td>trainable_paddle_ocr_b3bdc_00000</td><td style=\"text-align: right;\">0.0639132</td><td style=\"text-align: right;\"> 5</td><td style=\"text-align: right;\">376.277</td><td style=\"text-align: right;\"> 75.1485</td><td style=\"text-align: right;\">0.148775</td></tr>\n",
"<tr><td>trainable_paddle_ocr_b3bdc_00001</td><td style=\"text-align: right;\">0.13589 </td><td style=\"text-align: right;\"> 5</td><td style=\"text-align: right;\">362.611</td><td style=\"text-align: right;\"> 72.4062</td><td style=\"text-align: right;\">0.304316</td></tr>\n",
2025-11-17 10:52:00 +00:00
"</tbody>\n",
"</table>\n",
"</div>\n",
"<style>\n",
".trialProgress {\n",
" display: flex;\n",
" flex-direction: column;\n",
" color: var(--jp-ui-font-color1);\n",
"}\n",
".trialProgress h3 {\n",
" font-weight: bold;\n",
"}\n",
".trialProgress td {\n",
" white-space: nowrap;\n",
"}\n",
"</style>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
2025-12-06 21:15:49 +01:00
"2025-12-06 21:03:20,722\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00001_1_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.6108,text_det_unclip_ratio=1.7882,t_2025-12-06_20-56-49\n",
"2025-12-06 21:03:20,823\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00002_2_line_tolerance=0.6000,min_box_score=0,text_det_box_thresh=0.5330,text_det_unclip_ratio=1.9115,t_2025-12-06_21-03-20\n",
"2025-12-06 21:03:20,826\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00002_2_line_tolerance=0.6000,min_box_score=0,text_det_box_thresh=0.5330,text_det_unclip_ratio=1.9115,t_2025-12-06_21-03-20\n",
"2025-12-06 21:03:27,092\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00002_2_line_tolerance=0.6000,min_box_score=0,text_det_box_thresh=0.5330,text_det_unclip_ratio=1.9115,t_2025-12-06_21-03-20\n",
"2025-12-06 21:03:27,093\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00002_2_line_tolerance=0.6000,min_box_score=0,text_det_box_thresh=0.5330,text_det_unclip_ratio=1.9115,t_2025-12-06_21-03-20\n",
"2025-12-06 21:03:33,718\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00000_0_line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5653,text_det_unclip_ratio=1.2825,t_2025-12-06_20-56-49\n",
"2025-12-06 21:03:33,736\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00003_3_line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.6633,text_det_unclip_ratio=1.6_2025-12-06_21-03-33\n",
"2025-12-06 21:03:33,737\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00003_3_line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.6633,text_det_unclip_ratio=1.6_2025-12-06_21-03-33\n",
"2025-12-06 21:03:38,480\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00003_3_line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.6633,text_det_unclip_ratio=1.6_2025-12-06_21-03-33\n",
"2025-12-06 21:03:38,481\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\Sergio\\AppData\\Local\\Temp\\ray\\session_2025-12-06_20-08-28_976013_10020\\artifacts\\2025-12-06_20-56-49\\trainable_paddle_ocr_2025-12-06_20-56-49\\driver_artifacts\\trainable_paddle_ocr_b3bdc_00003_3_line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.6633,text_det_unclip_ratio=1.6_2025-12-06_21-03-33\n"
2025-11-17 10:52:00 +00:00
]
}
],
"source": [
"def trainable_paddle_ocr(config):\n",
2025-12-06 21:15:49 +01:00
" args = [sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS]\n",
2025-11-17 10:52:00 +00:00
" for k, v in config.items():\n",
" args += [f\"--{KEYMAP[k]}\", str(v)]\n",
" proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
"\n",
" if proc.returncode != 0:\n",
2025-12-06 21:15:49 +01:00
" tune.report({\"CER\": 1.0, \"WER\": 1.0, \"TIME\": 0.0, \"ERROR\": proc.stderr[:500]})\n",
2025-11-17 10:52:00 +00:00
" return\n",
" # última línea = JSON con métricas\n",
" last = proc.stdout.strip().splitlines()[-1]\n",
" \n",
" metrics = json.loads(last)\n",
" tune.report(metrics=metrics)\n",
"\n",
"scheduler = ASHAScheduler(grace_period=1, reduction_factor=2)\n",
"\n",
"tuner = tune.Tuner(\n",
" trainable_paddle_ocr,\n",
" tune_config=tune.TuneConfig(metric=\"CER\", \n",
" mode=\"min\", \n",
" scheduler=scheduler, \n",
2025-12-06 21:15:49 +01:00
" num_samples=32, \n",
" max_concurrent_trials=2),\n",
2025-11-17 10:52:00 +00:00
" run_config=air.RunConfig(verbose=2, log_to_file=False),\n",
" param_space=search_space\n",
")\n",
"\n",
"results = tuner.fit()\n",
"\n"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": null,
2025-11-17 10:52:00 +00:00
"id": "710a67ce",
"metadata": {},
"outputs": [],
"source": [
2025-12-06 21:15:49 +01:00
"df = results.get_dataframe()"
2025-11-17 10:52:00 +00:00
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": null,
2025-11-17 10:52:00 +00:00
"id": "1ab345a3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
2025-12-06 21:15:49 +01:00
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Guardado: raytune_paddle_subproc_results_20251206_205059.csv\n",
2025-11-17 10:52:00 +00:00
"</pre>\n"
],
"text/plain": [
2025-12-06 21:15:49 +01:00
"Guardado: raytune_paddle_subproc_results_20251206_205059.csv\n"
2025-11-17 10:52:00 +00:00
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Generate a unique filename with timestamp\n",
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"filename = f\"raytune_paddle_subproc_results_{timestamp}.csv\"\n",
"filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
"\n",
"\n",
"df.to_csv(filename, index=False)\n",
"print(f\"Guardado: {filename}\")"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": 68,
2025-11-17 10:52:00 +00:00
"id": "3e3a34e4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CER</th>\n",
" <th>WER</th>\n",
" <th>TIME</th>\n",
" <th>timestamp</th>\n",
" <th>training_iteration</th>\n",
" <th>time_this_iter_s</th>\n",
" <th>time_total_s</th>\n",
" <th>pid</th>\n",
" <th>time_since_restore</th>\n",
" <th>iterations_since_restore</th>\n",
" <th>config/text_det_box_thresh</th>\n",
" <th>config/text_det_unclip_ratio</th>\n",
" <th>config/text_rec_score_thresh</th>\n",
" <th>config/line_tolerance</th>\n",
" <th>config/min_box_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
2025-12-06 21:15:49 +01:00
" <td>32.0</td>\n",
" <td>32.0</td>\n",
" <td>32.0</td>\n",
" <td>3.200000e+01</td>\n",
" <td>32.0</td>\n",
" <td>32.000000</td>\n",
" <td>32.000000</td>\n",
" <td>32.000000</td>\n",
" <td>32.000000</td>\n",
" <td>32.0</td>\n",
" <td>32.000000</td>\n",
" <td>32.000000</td>\n",
" <td>32.000000</td>\n",
" <td>32.000000</td>\n",
" <td>32.000000</td>\n",
2025-11-17 10:52:00 +00:00
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.0</td>\n",
" <td>1.765050e+09</td>\n",
" <td>1.0</td>\n",
" <td>8.467219</td>\n",
" <td>8.467219</td>\n",
" <td>12978.500000</td>\n",
" <td>8.467219</td>\n",
" <td>1.0</td>\n",
" <td>0.519336</td>\n",
" <td>1.587766</td>\n",
" <td>0.243750</td>\n",
" <td>0.612500</td>\n",
" <td>0.378125</td>\n",
2025-11-17 10:52:00 +00:00
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>6.192431e+01</td>\n",
2025-11-17 10:52:00 +00:00
" <td>0.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.149528</td>\n",
" <td>0.149528</td>\n",
" <td>6630.854934</td>\n",
" <td>0.149528</td>\n",
2025-11-17 10:52:00 +00:00
" <td>0.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.081852</td>\n",
" <td>0.214312</td>\n",
" <td>0.174018</td>\n",
" <td>0.090696</td>\n",
" <td>0.262414</td>\n",
2025-11-17 10:52:00 +00:00
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.0</td>\n",
" <td>1.765050e+09</td>\n",
" <td>1.0</td>\n",
" <td>8.143713</td>\n",
" <td>8.143713</td>\n",
" <td>928.000000</td>\n",
" <td>8.143713</td>\n",
" <td>1.0</td>\n",
" <td>0.405465</td>\n",
" <td>1.223846</td>\n",
" <td>0.000000</td>\n",
2025-11-17 10:52:00 +00:00
" <td>0.500000</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.000000</td>\n",
2025-11-17 10:52:00 +00:00
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.0</td>\n",
" <td>1.765050e+09</td>\n",
" <td>1.0</td>\n",
" <td>8.376062</td>\n",
" <td>8.376062</td>\n",
" <td>7607.000000</td>\n",
" <td>8.376062</td>\n",
" <td>1.0</td>\n",
" <td>0.450677</td>\n",
" <td>1.414576</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.000000</td>\n",
2025-11-17 10:52:00 +00:00
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.0</td>\n",
" <td>1.765050e+09</td>\n",
" <td>1.0</td>\n",
" <td>8.466036</td>\n",
" <td>8.466036</td>\n",
" <td>12822.000000</td>\n",
" <td>8.466036</td>\n",
" <td>1.0</td>\n",
" <td>0.517449</td>\n",
" <td>1.591365</td>\n",
" <td>0.300000</td>\n",
" <td>0.600000</td>\n",
" <td>0.500000</td>\n",
2025-11-17 10:52:00 +00:00
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.0</td>\n",
" <td>1.765051e+09</td>\n",
" <td>1.0</td>\n",
" <td>8.534602</td>\n",
" <td>8.534602</td>\n",
" <td>18918.000000</td>\n",
" <td>8.534602</td>\n",
" <td>1.0</td>\n",
" <td>0.549884</td>\n",
" <td>1.794517</td>\n",
" <td>0.400000</td>\n",
" <td>0.700000</td>\n",
" <td>0.600000</td>\n",
2025-11-17 10:52:00 +00:00
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2025-12-06 21:15:49 +01:00
" <td>0.0</td>\n",
" <td>1.765051e+09</td>\n",
" <td>1.0</td>\n",
" <td>8.895011</td>\n",
" <td>8.895011</td>\n",
" <td>23328.000000</td>\n",
" <td>8.895011</td>\n",
" <td>1.0</td>\n",
" <td>0.685410</td>\n",
" <td>1.957707</td>\n",
" <td>0.400000</td>\n",
" <td>0.700000</td>\n",
2025-11-17 10:52:00 +00:00
" <td>0.600000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2025-12-06 21:15:49 +01:00
" CER WER TIME timestamp training_iteration time_this_iter_s \\\n",
"count 32.0 32.0 32.0 3.200000e+01 32.0 32.000000 \n",
"mean 1.0 1.0 0.0 1.765050e+09 1.0 8.467219 \n",
"std 0.0 0.0 0.0 6.192431e+01 0.0 0.149528 \n",
"min 1.0 1.0 0.0 1.765050e+09 1.0 8.143713 \n",
"25% 1.0 1.0 0.0 1.765050e+09 1.0 8.376062 \n",
"50% 1.0 1.0 0.0 1.765050e+09 1.0 8.466036 \n",
"75% 1.0 1.0 0.0 1.765051e+09 1.0 8.534602 \n",
"max 1.0 1.0 0.0 1.765051e+09 1.0 8.895011 \n",
2025-11-17 10:52:00 +00:00
"\n",
2025-12-06 21:15:49 +01:00
" time_total_s pid time_since_restore \\\n",
"count 32.000000 32.000000 32.000000 \n",
"mean 8.467219 12978.500000 8.467219 \n",
"std 0.149528 6630.854934 0.149528 \n",
"min 8.143713 928.000000 8.143713 \n",
"25% 8.376062 7607.000000 8.376062 \n",
"50% 8.466036 12822.000000 8.466036 \n",
"75% 8.534602 18918.000000 8.534602 \n",
"max 8.895011 23328.000000 8.895011 \n",
2025-11-17 10:52:00 +00:00
"\n",
2025-12-06 21:15:49 +01:00
" iterations_since_restore config/text_det_box_thresh \\\n",
"count 32.0 32.000000 \n",
"mean 1.0 0.519336 \n",
"std 0.0 0.081852 \n",
"min 1.0 0.405465 \n",
"25% 1.0 0.450677 \n",
"50% 1.0 0.517449 \n",
"75% 1.0 0.549884 \n",
"max 1.0 0.685410 \n",
2025-11-17 10:52:00 +00:00
"\n",
2025-12-06 21:15:49 +01:00
" config/text_det_unclip_ratio config/text_rec_score_thresh \\\n",
"count 32.000000 32.000000 \n",
"mean 1.587766 0.243750 \n",
"std 0.214312 0.174018 \n",
"min 1.223846 0.000000 \n",
"25% 1.414576 0.000000 \n",
"50% 1.591365 0.300000 \n",
"75% 1.794517 0.400000 \n",
"max 1.957707 0.400000 \n",
2025-11-17 10:52:00 +00:00
"\n",
2025-12-06 21:15:49 +01:00
" config/line_tolerance config/min_box_score \n",
"count 32.000000 32.000000 \n",
"mean 0.612500 0.378125 \n",
"std 0.090696 0.262414 \n",
"min 0.500000 0.000000 \n",
"25% 0.500000 0.000000 \n",
"50% 0.600000 0.500000 \n",
"75% 0.700000 0.600000 \n",
"max 0.700000 0.600000 "
2025-11-17 10:52:00 +00:00
]
},
2025-12-06 21:15:49 +01:00
"execution_count": 68,
2025-11-17 10:52:00 +00:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df\n",
"df.describe()"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": null,
2025-11-17 10:52:00 +00:00
"id": "4ce5eb6a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Correlación con CER:\n",
2025-12-06 21:15:49 +01:00
" config/text_det_box_thresh NaN\n",
"config/text_det_unclip_ratio NaN\n",
"config/text_rec_score_thresh NaN\n",
"config/line_tolerance NaN\n",
"config/min_box_score NaN\n",
"CER NaN\n",
2025-11-17 10:52:00 +00:00
"Name: CER, dtype: float64\n",
"</pre>\n"
],
"text/plain": [
"Correlación con CER:\n",
2025-12-06 21:15:49 +01:00
" config/text_det_box_thresh NaN\n",
"config/text_det_unclip_ratio NaN\n",
"config/text_rec_score_thresh NaN\n",
"config/line_tolerance NaN\n",
"config/min_box_score NaN\n",
"CER NaN\n",
2025-11-17 10:52:00 +00:00
"Name: CER, dtype: float64\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Correlación con WER:\n",
2025-12-06 21:15:49 +01:00
" config/text_det_box_thresh NaN\n",
"config/text_det_unclip_ratio NaN\n",
"config/text_rec_score_thresh NaN\n",
"config/line_tolerance NaN\n",
"config/min_box_score NaN\n",
"WER NaN\n",
2025-11-17 10:52:00 +00:00
"Name: WER, dtype: float64\n",
"</pre>\n"
],
"text/plain": [
"Correlación con WER:\n",
2025-12-06 21:15:49 +01:00
" config/text_det_box_thresh NaN\n",
"config/text_det_unclip_ratio NaN\n",
"config/text_rec_score_thresh NaN\n",
"config/line_tolerance NaN\n",
"config/min_box_score NaN\n",
"WER NaN\n",
2025-11-17 10:52:00 +00:00
"Name: WER, dtype: float64\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Hyperparameter columns whose relationship with the error metrics is analysed\n",
"param_cols = [\n",
"    \"config/text_det_box_thresh\",\n",
"    \"config/text_det_unclip_ratio\",\n",
"    \"config/text_rec_score_thresh\",\n",
"    \"config/line_tolerance\",\n",
"    \"config/min_box_score\",\n",
"]\n",
"\n",
"# A metric that takes a single value across all rows has zero variance, so its\n",
"# Pearson correlation is undefined and pandas returns NaN for every row.\n",
"# Flag that explicitly instead of leaving an unexplained column of NaN.\n",
"for metric in (\"CER\", \"WER\"):\n",
"    if df[metric].nunique(dropna=True) <= 1:\n",
"        print(f\"Aviso: {metric} es constante en todos los trials; la correlación de Pearson no está definida (NaN).\")\n",
"\n",
"# Pearson correlation with CER and WER, strongest positive correlation first\n",
"corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n",
"corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n",
"\n",
"print(\"Correlación con CER:\\n\", corr_cer)\n",
"print(\"Correlación con WER:\\n\", corr_wer)"
]
},
{
"cell_type": "code",
2025-12-06 21:15:49 +01:00
"execution_count": null,
2025-11-17 10:52:00 +00:00
"id": "02fc0a87",
"metadata": {},
"outputs": [
{
"data": {
2025-12-06 21:15:49 +01:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHHCAYAAABXx+fLAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAATY9JREFUeJzt3Ql4FFXW8PETgklQIIoRAojsghA2WYMLOoZNRVB0kFFBXnRGBAQZGYFBAuIIDjKDIwjuiAwDLyqgiFEEwYVoZFPZRXFYJAREExYBTep7zn2/aruTTtIdutNL/X/P05Cqul1ddWs7dZeqGMuyLAEAAHCQCqFeAAAAgPJGAAQAAByHAAgAADgOARAAAHAcAiAAAOA4BEAAAMBxCIAAAIDjEAABAADHIQACAACOQwAUYY4fPy733HOPJCcnS0xMjIwcOdKMP3TokNx6661y4YUXmvEzZsyQSF8np5k7d65Z/++++07C1TXXXCMpKSkSzcuj22DixImlptM0mjaS97X169eHelGAkCEACqOTUXGfTz/91JX28ccfN+mHDBkir776qtx1111m/IMPPijvvvuujB071ozv0aNHwJdTf3vp0qVBma+3dfKmXr16rnypUKGCnH/++dKiRQv54x//KJ999tlZLcczzzxjliPYgpWPZWFfxEv7aKCByLBkyRLp2bOnJCUlSVxcnNSqVUt+//vfy+rVqyUarFixwqcANZDuvvvuYo+NhIQECVfu50v9nHfeedKhQweZN29eROV/sFQM9QLgN48++qjUr1+/SJY0atTI9beexDp16iTp6ekeaXR879695aGHHgrqhVtLmfr06RPQ+Ra3TsVp3bq1/PnPfzZ/Hzt2TLZv3y6LFy+W559/3gSC//jHP8ocAOlFQ092wVRcPmrgd/vtt0t8fLyUl1tuucVj/9LSOA1Eb775ZjPNVqNGjXJbJpSNvtbxf/7nf0wQ36ZNGxk1apQpVT148KAJiq677jr55JNPpHPnzhGdxXoBnjVrVrlfhPW4fOGFF4qMj42NlXDmfr7UfUHXYeDAgXL69Gm59957Iyb/g4EAKIzoXVu7du1KTJOTkyPNmjXzOl5LQyJRcetUnNq1a8udd97pMe6JJ56QP/zhD/LPf/5TGjdubC7ikUZPpOV9Mm3ZsqX52I4cOWLyTscVzuOzderUKVMioSV3CLzp06eb4EerkPUmwL167q9//aspXa1YsXxP+SdPnpRzzz1XIiF41P2zUqVKxabRvCvLMXHixAlT8hKM/Pn111+loKDAHFe+ni/1Bq9BgwbmXFmWACiacCaKEGvWrDEntD179sjbb7/tKtK0q8/0ANao3B5v++mnn8wJsU6dOuYORu/2NVjQg8adDj/11FOmOkmLdC+66CJTjWa3EdB56oH8yiuvuH6jtJISDWwGDx5sSg90nq1atTLfL22dytIGRk9ceoKvVq2a/O1vfzP54b5u2iaqefPmZjl0ef70pz/Jjz/+6FFUvHXrVlm7dq3Xap/yyMfi2gBpyZQuu/6uVmcMHTrULI+3tjDbtm2Ta6+91pxU9cT397//XYKhtN+xt+3ChQtl/PjxJo2mzcvLM9O1ulLzJTEx0Yzv0qWLKZ1wp6V7mue6bXTdq1evLl27dpWNGzf6vTy+7I8l+fjjj6V9+/bmew0bNpRnn33Wr/zSEsq2bdua/VRLGfWCdODAAY80uh9UrlzZjNfSQf1b9x8t1c3Pzy9x/j///LNMmTJFmjZtKk8++aTXtklawqjVH+60FEBLivR39CKtJX+HDx/2SLNs2TK54YYbzL6n20HXf/LkyUWWyd4HN2zYIFdffbXZFuPGjfNrHva+cf3118sFF1xglkmDcT2m7DzS85xyr9rx51hXuk/deOONptmA3nTqdvF3m3pjH8N6Hrn//vvNPnvxxReXmj++7Jt6XtB56/bVddQ81LzUfd8fuq11P/nmm288xn/00Udy2223yS
WXXGLmq+c6LVHXfcsWqPwPF5QAhZHc3FxzB+5Ody5t2HzZZZeZC7zukHpA2UWaWtRtt5vRi8OAAQM87i70wqInVN0Jdcdet26daSekRaHuDaX14NODV0uhtEGy3lnoAaHtj/QEob+h4/UEqu1tlB6AxdGDRg/43bt3y7Bhw0zVnl4E9ADSi/eIESOKXSc9QMtCLxh6An/xxRfNSUEPQqXrrus2aNAgeeCBB0zANXPmTNm0aZO56J5zzjkmL4YPH27moXfL7tU+ocxHLWaeNGmSpKWlmZKZnTt3yuzZs+Xzzz93LbtNTzIaVGjVlbb5eO211+Thhx82wZguT6D48zt6kdO7U72I68VW/9YqT02nAYFWe2qJ0Msvvyy/+93vTF7ZF+n77rvPzFv3Hy0h/OGHH0wgolWel19+uV/L48v+WJyvvvpKunXrZvZL3R66TXW5fa0WtPc9DaA0SNEOC3pB1+2n+6B7ya0GBN27d5eOHTuaC937779vSnZ0HympVFPz5ejRoyZg9KcUUfd5DTR0ffQCq/uy5s+iRYs8ll+PCw2U9H/dfhMmTDDB7LRp0zzmp9tI81yrcjXIs/PI13msXLnSBCY1a9Y020Sr8HR7L1++3Azr8ff999+bdHosFebLsW7TY6l///7mO1oS0qRJk1Lzq/D5Wek+XbVqVY9xGvzo/qLrqDc8JeWPv/umHitaWqXnDw1U9KbPH7r/7t+/32x3d/qbeq7T/UyvOVlZWfL000+btDpNBTL/w4KFkHv55Ze1uMLrJz4+3iNt3bp1rRtuuKHIPDTt0KFDPcZNnjzZOu+886xdu3Z5jB8zZowVGxtr7d271wyvXr3afP+BBx4oMt+CggLX3zqvgQMH+rROM2bMMPOcP3++a9yZM2es1NRUq3LlylZeXl6p6+RNaWn/+c9/mt9dtmyZGf7oo4/M8L///W+PdBkZGUXGN2/e3OrSpUuReZZXPtr7wZ49e8xwTk6OFRcXZ3Xr1s3Kz893pZs5c6ZJ99JLL7nG6XLruHnz5rnGnT592kpOTrb69u1r+erw4cNmPunp6V6n+/o7H3zwgUnXoEED6+TJkx750LhxY6t79+4eeaJp6tevb3Xt2tU1LjExscg+Xdbl8Wd/LLz+ffr0sRISEqz//ve/rnHbtm0z2760U6j+RvXq1a2UlBTr559/do1fvny5+e6ECRNc43Sf0HGPPvqoxzzatGljtW3btsTfeeqpp8x3lyxZYvnC3tfS0tI8tsODDz5o1uunn35yjXPffrY//elP1rnnnmudOnWqyLaYM2dOkfS+zOPXX381+4Ae4z/++KNHWvdl1H3CW777c6zrb+g4neYLe9t4++i+XDhfr7zySrM+7orLH1/3TT0vaLqqVauac4MvdD31/KHHtX6++uor66677vJ6vTjpZRtNmTLFiomJ8dj3A5H/4YIqsDCiRYsaWbt/3nnnnTLPT6P2q666ykT6eudif7Q0Qe80P/zwQ5Pu9ddfNyVN3hohl7WbrzaU07s3vcOyafSvdwXa0FaLiINB7y7t6hM7D7SaRUvH3PNASx807QcffBC2+ah3/2fOnDF39e7tZvRuVe84tdqw8Lq71/XrnamWpnz77bcSSP78jja2dG9XsXnzZvn6669Ney29G7bzUu+StZGu5qVdraglI1odonecZ7s8Zd0fdftqNYlWSWnJn01LL7WkpjRa9anVG1oi4N5bSKuDtBqi8Da0S77c6b5X2ja0qxarVKki/tBSBPd9U39L1/m///2va5z79tPjSreXptPSgh07dnjMT0sk9O6/MF/moaUEWmKg+3vh9oy+HD/+Huta0uLLNrTp9it8ftbP1KlTi6TVY9RbSZy3/PF33+zbt69fpeTvvfeeSa8fLRXVkhtdhmmFSu/ct5Eej5p32mBe7wl025QmEOfa8kYVWBjRk3ZpjaD9oReaL7/8stiDRU/MSuuCtW7e36LUkugJVBsjF27wqhcOe3ow6AnD/U
KgeaBVi1oXX1IehGM+2nlUuGheL/DaiLFwHmo1YuELhQZtuuyB5M/vFO7VqHlpB0bF0e2l89N2PJpO2yLoSVTbhWg
2025-11-17 10:52:00 +00:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
2025-12-06 21:15:49 +01:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAHHCAYAAABDUnkqAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAQ1NJREFUeJzt3Ql8FPX9//FPEo6AQriEBEQu6xFRTsGooFUQPPiptS3aKpQiKni0pbaCVhCpomKVtiIqVrHFVlrrUStiFdGiorFQLy6LIqASAiLhMlyZ/+P97X/S3c0mbLKb3c3k9Xw8lrAz352Z78zszme+12R4nucZAABAQGSmegMAAAASieAGAAAECsENAAAIFIIbAAAQKAQ3AAAgUAhuAABAoBDcAACAQCG4AQAAgUJwAwAAAoXgBjW2c+dOu/zyyy03N9cyMjLsxz/+sZu+adMm+/a3v22tW7d202fMmFHn81RTP/jBD6xz585WV91yyy1uPwCcC6hLCG4QZs6cOe5iVtnrrbfeKk97++23u/Rjx461P/zhD3bZZZe56T/5yU/sxRdftIkTJ7rpQ4cOTfhe1rqfeeaZWllutDxFo6DlvPPOs3SjgKqqY+i/lA7pZdy4cZaZmWlbt24Nm673mt64cWMrLS0Nm/fJJ5+443njjTdaup9/2dnZlq70fQ7d1kMOOcT69etnv//972u8zPnz57ugEMnXIAXrRB1w6623WpcuXSpMP/LII8v//8orr9hJJ51kkydPDkuj6eeff75df/31tbZ9CkJUOnTBBRckdLmV5ammZs+ebWVlZZZMV155pQ0aNKj8/dq1a23SpEl2xRVX2IABA8qnd+vWLanbhYM79dRTbdasWfbGG2/YsGHDyqe/+eabLrjZt2+f/etf/3LpfErrfzZdKAh7+OGHK0zPysqydNazZ0/76U9/6v6/ceNGl4eRI0fanj17bMyYMTUKbmbOnEmAkwIEN4jq7LPPtr59+1a5d4qLiy0/Pz/q9BYtWtTJPVtZnmqqYcOGlmwFBQXu5dPFUMGNpl166aWWSrt377amTZumdBvSmR+gvP7662HBjQKYE044wb7++ms3LzSQ0XsFPieffHJc696/f78LxBs1amTxatCgQY3OtV27drkSk9o4d2LJX4cOHcK2W6VQXbt2tXvvvbdGwQ1Sh2opVNurr77qim1VIvD888+XF+P6VVp60LzuVvzpvm3btrk2LB07dnR3dioFuvPOOyuUbOj9r3/9azv++ONdMfZhhx3mqrZ0kRYtUz+Cjz32WMxVLApaRo8ebe3atXPL7NGjh/v8wfL06aefJrTNjZan5d5999320EMPudIT7YsTTzzR3nnnnQqfX7VqlSuhatWqldtuBZx/+9vfLBH+8pe/WJ8+faxJkybWpk0b96P++eefx/TZuXPnln9W23bxxRfbhg0bwtKcfvrp1r17d1u6dKkNHDjQXZj8qpNnn33Wzj33XGvfvr3Lv/bD1KlT7cCBA1GXsWLFCvvmN7/plqEL0F133VVhm1RdoyqAo446yu2rvLw8+9a3vmUff/xx2LmlNmDHHXecS6PzQSVdX331Vcwleyr90gVYAbxKKFeuXBm1bcqaNWvc8Ve6nJwcGzVqlLtAV+WII45w3w+/NMan96eccooLYKLNU378G4qDneuR56H2h38eaj/7AZPOSX1e8x588EFLNP/34rXXXnPVcW3btrXDDz/8oOdOIvIXK/32HHPMMWHnkCxevNi+853vuOOl5eqYqTpewadPx16/gxJa3ZWocxFVo+QGUZWUlNiWLVvCpumLqUbCxx57rGuPoi+zfoz8YtxevXqVt1MZPHiwjRgxovyz+lE/7bTT3MVTX2D9KKioXe1yVPwb2uhYP1z64VPpkRr36o5LPyZq76OLu9ah6aoPV1XLwapY9IOjH0tdbK655hpX3aYLu358FHD96Ec/qjRP+nGrDX/84x9tx44dbl9ov+
pirQux2k/4pT3Lly93FzRdzCdMmOAuqH/+859dVdxf//pXu/DCC2u8fu1fXWx1AZs2bZprBK6AUhfKf//731WWvN122212880323e/+113HDZv3my//e1v3UUo8rNffvmlO44KfhQ86QfcX/+hhx5q48ePd38VNKh0afv27TZ9+vSw9enHXsGt9o/W+eSTT9oNN9zggl8tWxQUqf3TwoUL3bp0TLV/X3rpJfvwww/Lzw/tbz/v1113nQtm77vvPrfdyntVJW0vv/yyW5/u5BXA6LxSvnWMli1bVqHhuLZV55r2r+arikMXcAX0VVGpzFNPPeWqQnTh3Lt3rwt81Q5M36Of//zn7gZC5432jS7YV111VczneqhHH33UBYX6HmldClQ/+OADO+uss9y5r3zq+6dqWv/YxSry90NUatK8efOwaQpstC4df920VHXuJCJ/1aG8f/bZZ9ayZcuw6VqnjoWOiX4TCwsL3bmgtJrnn2tffPGFOwf12xIpnnMRMfCAEI8++qin0yLaq3HjxmH7qlOnTt65555bYf8p7dVXXx02berUqd4hhxziffTRR2HTJ0yY4GVlZXnr169371955RX3+euuu67CcsvKysr/r2WNHDkypmM3Y8YMt8y5c+eWT9u7d69XUFDgHXrood727dsPmqdoYkmrbVQ639q1a922tG7d2tu6dWv59GeffdZNf+6558qnnXnmmd7xxx/vlZaWhu2Dk08+2fvGN77hxeqdd95xy9ax9fPetm1br3v37t7XX39dnu7vf/+7Szdp0qTyaZMnT3bTfJ9++qk7XrfddlvYOj744AOvQYMGYdNPO+0099kHHnigwjbt3r27wrQrr7zSa9q0aVh+/WX8/ve/L5+2Z88eLzc317vooovKpz3yyCMu3T333FPpebN48WKX5vHHHw+bv2DBgqjTI/Xs2dPtty+//LJ82nvvvedlZmZ6I0aMqLDPfvjDH4Z9/sILL3TH/WBmzpzpPq/tlSVLlrj369at81asWOH+v3z58rBj5m97rOe6fx42b97cKy4uDlv/BRdc4GVnZ7v1+bReHfdYLhk65yv7DRkyZEiF35pTTz3V279/f9gyKjt3EpG/yuh7etZZZ3mbN292L53Tl112WdTfs2jn77Rp07yMjIyw/abPRdtn8Z6LODiqpRCVilN1xxH6euGFF2q8t3Q3o+J83QHpjs5/qeGr7rr/+c9/unQqkdAdabQGvTXtkqxGferafckll5RP012R7pbU9VvF4sk2fPjwsLtBv6GvSm783jEqzdDdv0og/P2lu9khQ4bYf/7zn5irkCKpek9F+7pjDu29omoiFcGrWq4yKlFQcbq2K/Q4av9+4xvfsEWLFoWl192y7kwjqTrL5+dP+0B3w6qKC6WSndB2ELr7V6mdv6/880ZVa9dee22l543OQVUPqVQxdNtVvaZ1RG57KJUuvvvuu66EIPTuX+1gtDydY5H80hSf8qfjp9KpWNvdiO7iVXqn0k4dH63fr5qKbExc3XP9oosuCiud1HdRPR1VOqj1+VSyqfMuVjqvIn8/9LrjjjsqpFVblmgNjaOdO/Hm72D+8Y9/uPR6qWRQJS7ahsjSxNDzV6VNOo9UZah7O5W8HEw85yJiQ7UUotLF42ANiqtDF+P333+/0h8aXWxFddtqh1Hd4uOqrFu3zl141egylH6w/fnJFnrhED/Q8evbVeyuH0pV/+hV2T7TRa+6/PweffTRFebp4ulfVCs7jtou7c9oIovStX3RGnCqyu0Xv/iFC+AiL/aqEg2lasLIwFb7S+eTT+eN8qOGrFVtu5atqqGqzsHq7jOdRwoIIhvDVnWMI6tmQqmtiar2QgMYVX2J9oMahmuaggL9VXsPf13VPdcje0SqilFVP9GOr/IeLYiLRsFKaI+9qkTrlVnZuRNv/g6mf//+9stf/tIFearO1P91vCK3Y/369a4aTe3fItvIRJ6/iT4XERuCGySF7vZ1l6L2At
GoEWh9UlmX2P/W6v13f4m601d2xxzaLT9ZtF26wKoUL1oedNdZ2R2uT20j1P5KF3gNOaD2MLrTV7sUtaWJbGB+sH1
2025-11-17 10:52:00 +00:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# One entry per figure: (x column, y column, x-axis label, title), drawn in order\n",
"scatter_specs = [\n",
"    (\n",
"        \"config/text_det_box_thresh\",\n",
"        \"CER\",\n",
"        \"Detection Box Threshold\",\n",
"        \"Effect of Detection Threshold on Character Error Rate\",\n",
"    ),\n",
"    (\n",
"        \"config/line_tolerance\",\n",
"        \"WER\",\n",
"        \"Line Tolerance\",\n",
"        \"Effect of Line Tolerance on Word Error Rate\",\n",
"    ),\n",
"]\n",
"\n",
"# Each spec gets its own figure; the y-axis label is the metric column name\n",
"for x_col, y_col, x_label, plot_title in scatter_specs:\n",
"    fig, ax = plt.subplots()\n",
"    ax.scatter(df[x_col], df[y_col])\n",
"    ax.set_xlabel(x_label)\n",
"    ax.set_ylabel(y_col)\n",
"    ax.set_title(plot_title)\n",
"    plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
2025-12-06 21:15:49 +01:00
"display_name": "Python 3",
2025-11-17 10:52:00 +00:00
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}