653 lines
252 KiB
Plaintext
653 lines
252 KiB
Plaintext
|
|
{
|
||
|
|
"cells": [
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "be3c1872",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"# AI-based OCR Benchmark Notebook\n",
|
||
|
|
"\n",
|
||
|
|
"This notebook benchmarks **AI-based OCR models** on scanned PDF documents/images in Spanish.\n",
|
||
|
|
"It excludes traditional OCR engines like Tesseract that require external installations."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 1,
|
||
|
|
"id": "6a1e98fe",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: pip in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (25.2)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: jupyter in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (1.1.1)\n",
|
||
|
|
"Requirement already satisfied: notebook in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (7.4.7)\n",
|
||
|
|
"Requirement already satisfied: jupyter-console in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (6.6.3)\n",
|
||
|
|
"Requirement already satisfied: nbconvert in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (7.16.6)\n",
|
||
|
|
"Requirement already satisfied: ipykernel in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (6.30.1)\n",
|
||
|
|
"Requirement already satisfied: ipywidgets in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (8.1.7)\n",
|
||
|
|
"Requirement already satisfied: jupyterlab in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (4.4.9)\n",
|
||
|
|
"Requirement already satisfied: comm>=0.1.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (0.2.3)\n",
|
||
|
|
"Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (1.8.17)\n",
|
||
|
|
"Requirement already satisfied: ipython>=7.23.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (9.6.0)\n",
|
||
|
|
"Requirement already satisfied: jupyter-client>=8.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (8.6.3)\n",
|
||
|
|
"Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (5.8.1)\n",
|
||
|
|
"Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (0.1.7)\n",
|
||
|
|
"Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (1.6.0)\n",
|
||
|
|
"Requirement already satisfied: packaging>=22 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (25.0)\n",
|
||
|
|
"Requirement already satisfied: psutil>=5.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (7.1.0)\n",
|
||
|
|
"Requirement already satisfied: pyzmq>=25 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (27.1.0)\n",
|
||
|
|
"Requirement already satisfied: tornado>=6.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (6.5.2)\n",
|
||
|
|
"Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (5.14.3)\n",
|
||
|
|
"Requirement already satisfied: colorama in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.4.6)\n",
|
||
|
|
"Requirement already satisfied: decorator in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (5.2.1)\n",
|
||
|
|
"Requirement already satisfied: ipython-pygments-lexers in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (1.1.1)\n",
|
||
|
|
"Requirement already satisfied: jedi>=0.16 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.19.2)\n",
|
||
|
|
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (3.0.52)\n",
|
||
|
|
"Requirement already satisfied: pygments>=2.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (2.19.2)\n",
|
||
|
|
"Requirement already satisfied: stack_data in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.6.3)\n",
|
||
|
|
"Requirement already satisfied: wcwidth in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel->jupyter) (0.2.14)\n",
|
||
|
|
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel->jupyter) (0.8.5)\n",
|
||
|
|
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-client>=8.0.0->ipykernel->jupyter) (2.9.0.post0)\n",
|
||
|
|
"Requirement already satisfied: platformdirs>=2.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel->jupyter) (4.4.0)\n",
|
||
|
|
"Requirement already satisfied: pywin32>=300 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel->jupyter) (311)\n",
|
||
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->jupyter-client>=8.0.0->ipykernel->jupyter) (1.17.0)\n",
|
||
|
|
"Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets->jupyter) (4.0.14)\n",
|
||
|
|
"Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets->jupyter) (3.0.15)\n",
|
||
|
|
"Requirement already satisfied: async-lru>=1.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (2.0.5)\n",
|
||
|
|
"Requirement already satisfied: httpx<1,>=0.25.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (0.28.1)\n",
|
||
|
|
"Requirement already satisfied: jinja2>=3.0.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (3.1.6)\n",
|
||
|
|
"Requirement already satisfied: jupyter-lsp>=2.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (2.3.0)\n",
|
||
|
|
"Requirement already satisfied: jupyter-server<3,>=2.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (2.17.0)\n",
|
||
|
|
"Requirement already satisfied: jupyterlab-server<3,>=2.27.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (2.27.3)\n",
|
||
|
|
"Requirement already satisfied: notebook-shim>=0.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (0.2.4)\n",
|
||
|
|
"Requirement already satisfied: setuptools>=41.1.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (80.9.0)\n",
|
||
|
|
"Requirement already satisfied: anyio in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (4.11.0)\n",
|
||
|
|
"Requirement already satisfied: certifi in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (2025.10.5)\n",
|
||
|
|
"Requirement already satisfied: httpcore==1.* in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (1.0.9)\n",
|
||
|
|
"Requirement already satisfied: idna in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (3.10)\n",
|
||
|
|
"Requirement already satisfied: h11>=0.16 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpcore==1.*->httpx<1,>=0.25.0->jupyterlab->jupyter) (0.16.0)\n",
|
||
|
|
"Requirement already satisfied: argon2-cffi>=21.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)\n",
|
||
|
|
"Requirement already satisfied: jupyter-events>=0.11.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.12.0)\n",
|
||
|
|
"Requirement already satisfied: jupyter-server-terminals>=0.4.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.5.3)\n",
|
||
|
|
"Requirement already satisfied: nbformat>=5.3.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (5.10.4)\n",
|
||
|
|
"Requirement already satisfied: prometheus-client>=0.9 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.23.1)\n",
|
||
|
|
"Requirement already satisfied: pywinpty>=2.0.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.2)\n",
|
||
|
|
"Requirement already satisfied: send2trash>=1.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.8.3)\n",
|
||
|
|
"Requirement already satisfied: terminado>=0.8.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.18.1)\n",
|
||
|
|
"Requirement already satisfied: websocket-client>=1.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.9.0)\n",
|
||
|
|
"Requirement already satisfied: babel>=2.10 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (2.17.0)\n",
|
||
|
|
"Requirement already satisfied: json5>=0.9.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (0.12.1)\n",
|
||
|
|
"Requirement already satisfied: jsonschema>=4.18.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (4.25.1)\n",
|
||
|
|
"Requirement already satisfied: requests>=2.31 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (2.32.5)\n",
|
||
|
|
"Requirement already satisfied: sniffio>=1.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from anyio->httpx<1,>=0.25.0->jupyterlab->jupyter) (1.3.1)\n",
|
||
|
|
"Requirement already satisfied: argon2-cffi-bindings in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)\n",
|
||
|
|
"Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jinja2>=3.0.3->jupyterlab->jupyter) (3.0.3)\n",
|
||
|
|
"Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (25.4.0)\n",
|
||
|
|
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (2025.9.1)\n",
|
||
|
|
"Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (0.36.2)\n",
|
||
|
|
"Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (0.27.1)\n",
|
||
|
|
"Requirement already satisfied: python-json-logger>=2.0.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (4.0.0)\n",
|
||
|
|
"Requirement already satisfied: pyyaml>=5.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (6.0.2)\n",
|
||
|
|
"Requirement already satisfied: rfc3339-validator in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.4)\n",
|
||
|
|
"Requirement already satisfied: rfc3986-validator>=0.1.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.1)\n",
|
||
|
|
"Requirement already satisfied: fqdn in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.5.1)\n",
|
||
|
|
"Requirement already satisfied: isoduration in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (20.11.0)\n",
|
||
|
|
"Requirement already satisfied: jsonpointer>1.13 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.0)\n",
|
||
|
|
"Requirement already satisfied: rfc3987-syntax>=1.1.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.1.0)\n",
|
||
|
|
"Requirement already satisfied: uri-template in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.0)\n",
|
||
|
|
"Requirement already satisfied: webcolors>=24.6.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (24.11.1)\n",
|
||
|
|
"Requirement already satisfied: beautifulsoup4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (4.14.2)\n",
|
||
|
|
"Requirement already satisfied: bleach!=5.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (6.2.0)\n",
|
||
|
|
"Requirement already satisfied: defusedxml in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (0.7.1)\n",
|
||
|
|
"Requirement already satisfied: jupyterlab-pygments in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (0.3.0)\n",
|
||
|
|
"Requirement already satisfied: mistune<4,>=2.0.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (3.1.4)\n",
|
||
|
|
"Requirement already satisfied: nbclient>=0.5.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (0.10.2)\n",
|
||
|
|
"Requirement already satisfied: pandocfilters>=1.4.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (1.5.1)\n",
|
||
|
|
"Requirement already satisfied: webencodings in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert->jupyter) (0.5.1)\n",
|
||
|
|
"Requirement already satisfied: tinycss2<1.5,>=1.1.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (1.4.0)\n",
|
||
|
|
"Requirement already satisfied: fastjsonschema>=2.15 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.21.2)\n",
|
||
|
|
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (3.4.3)\n",
|
||
|
|
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter) (2.5.0)\n",
|
||
|
|
"Requirement already satisfied: lark>=1.2.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from rfc3987-syntax>=1.1.0->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.0)\n",
|
||
|
|
"Requirement already satisfied: cffi>=1.0.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.0.0)\n",
|
||
|
|
"Requirement already satisfied: pycparser in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.23)\n",
|
||
|
|
"Requirement already satisfied: soupsieve>1.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from beautifulsoup4->nbconvert->jupyter) (2.8)\n",
|
||
|
|
"Requirement already satisfied: typing-extensions>=4.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from beautifulsoup4->nbconvert->jupyter) (4.15.0)\n",
|
||
|
|
"Requirement already satisfied: arrow>=0.15.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.0)\n",
|
||
|
|
"Requirement already satisfied: types-python-dateutil>=2.8.10 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.9.0.20251008)\n",
|
||
|
|
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=7.23.1->ipykernel->jupyter) (2.2.1)\n",
|
||
|
|
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=7.23.1->ipykernel->jupyter) (3.0.0)\n",
|
||
|
|
"Requirement already satisfied: pure-eval in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=7.23.1->ipykernel->jupyter) (0.2.3)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: ipywidgets in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (8.1.7)\n",
|
||
|
|
"Requirement already satisfied: comm>=0.1.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (0.2.3)\n",
|
||
|
|
"Requirement already satisfied: ipython>=6.1.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (9.6.0)\n",
|
||
|
|
"Requirement already satisfied: traitlets>=4.3.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (5.14.3)\n",
|
||
|
|
"Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (4.0.14)\n",
|
||
|
|
"Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (3.0.15)\n",
|
||
|
|
"Requirement already satisfied: colorama in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)\n",
|
||
|
|
"Requirement already satisfied: decorator in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)\n",
|
||
|
|
"Requirement already satisfied: ipython-pygments-lexers in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (1.1.1)\n",
|
||
|
|
"Requirement already satisfied: jedi>=0.16 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)\n",
|
||
|
|
"Requirement already satisfied: matplotlib-inline in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)\n",
|
||
|
|
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.52)\n",
|
||
|
|
"Requirement already satisfied: pygments>=2.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (2.19.2)\n",
|
||
|
|
"Requirement already satisfied: stack_data in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
|
||
|
|
"Requirement already satisfied: wcwidth in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.14)\n",
|
||
|
|
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.5)\n",
|
||
|
|
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (2.2.1)\n",
|
||
|
|
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (3.0.0)\n",
|
||
|
|
"Requirement already satisfied: pure-eval in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (0.2.3)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: ipykernel in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (6.30.1)\n",
|
||
|
|
"Requirement already satisfied: comm>=0.1.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (0.2.3)\n",
|
||
|
|
"Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (1.8.17)\n",
|
||
|
|
"Requirement already satisfied: ipython>=7.23.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (9.6.0)\n",
|
||
|
|
"Requirement already satisfied: jupyter-client>=8.0.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (8.6.3)\n",
|
||
|
|
"Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (5.8.1)\n",
|
||
|
|
"Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (0.1.7)\n",
|
||
|
|
"Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (1.6.0)\n",
|
||
|
|
"Requirement already satisfied: packaging>=22 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (25.0)\n",
|
||
|
|
"Requirement already satisfied: psutil>=5.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (7.1.0)\n",
|
||
|
|
"Requirement already satisfied: pyzmq>=25 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (27.1.0)\n",
|
||
|
|
"Requirement already satisfied: tornado>=6.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (6.5.2)\n",
|
||
|
|
"Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (5.14.3)\n",
|
||
|
|
"Requirement already satisfied: colorama in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (0.4.6)\n",
|
||
|
|
"Requirement already satisfied: decorator in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (5.2.1)\n",
|
||
|
|
"Requirement already satisfied: ipython-pygments-lexers in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (1.1.1)\n",
|
||
|
|
"Requirement already satisfied: jedi>=0.16 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (0.19.2)\n",
|
||
|
|
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (3.0.52)\n",
|
||
|
|
"Requirement already satisfied: pygments>=2.4.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (2.19.2)\n",
|
||
|
|
"Requirement already satisfied: stack_data in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (0.6.3)\n",
|
||
|
|
"Requirement already satisfied: wcwidth in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel) (0.2.14)\n",
|
||
|
|
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel) (0.8.5)\n",
|
||
|
|
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-client>=8.0.0->ipykernel) (2.9.0.post0)\n",
|
||
|
|
"Requirement already satisfied: platformdirs>=2.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel) (4.4.0)\n",
|
||
|
|
"Requirement already satisfied: pywin32>=300 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel) (311)\n",
|
||
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->jupyter-client>=8.0.0->ipykernel) (1.17.0)\n",
|
||
|
|
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=7.23.1->ipykernel) (2.2.1)\n",
|
||
|
|
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=7.23.1->ipykernel) (3.0.0)\n",
|
||
|
|
"Requirement already satisfied: pure-eval in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data->ipython>=7.23.1->ipykernel) (0.2.3)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: easyocr in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (1.7.2)\n",
|
||
|
|
"Requirement already satisfied: transformers in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (4.57.0)\n",
|
||
|
|
"Requirement already satisfied: torch in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (2.8.0+cpu)\n",
|
||
|
|
"Requirement already satisfied: pdf2image in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (1.17.0)\n",
|
||
|
|
"Requirement already satisfied: pillow in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (11.3.0)\n",
|
||
|
|
"Requirement already satisfied: jiwer in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (4.0.0)\n",
|
||
|
|
"Requirement already satisfied: paddleocr in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (3.2.0)\n",
|
||
|
|
"Requirement already satisfied: hf_xet in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (1.1.10)\n",
|
||
|
|
"Requirement already satisfied: paddlepaddle in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (3.2.0)\n",
|
||
|
|
"Requirement already satisfied: torchvision>=0.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (0.23.0+cpu)\n",
|
||
|
|
"Requirement already satisfied: opencv-python-headless in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (4.12.0.88)\n",
|
||
|
|
"Requirement already satisfied: scipy in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (1.16.2)\n",
|
||
|
|
"Requirement already satisfied: numpy in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (2.1.2)\n",
|
||
|
|
"Requirement already satisfied: scikit-image in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (0.25.2)\n",
|
||
|
|
"Requirement already satisfied: python-bidi in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (0.6.6)\n",
|
||
|
|
"Requirement already satisfied: PyYAML in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (6.0.2)\n",
|
||
|
|
"Requirement already satisfied: Shapely in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (2.1.2)\n",
|
||
|
|
"Requirement already satisfied: pyclipper in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (1.3.0.post6)\n",
|
||
|
|
"Requirement already satisfied: ninja in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from easyocr) (1.13.0)\n",
|
||
|
|
"Requirement already satisfied: filelock in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from transformers) (3.13.1)\n",
|
||
|
|
"Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from transformers) (0.35.3)\n",
|
||
|
|
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from transformers) (25.0)\n",
|
||
|
|
"Requirement already satisfied: regex!=2019.12.17 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from transformers) (2025.9.18)\n",
|
||
|
|
"Requirement already satisfied: requests in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from transformers) (2.32.5)\n",
|
||
|
|
"Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from transformers) (0.22.1)\n",
|
||
|
|
"Requirement already satisfied: safetensors>=0.4.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from transformers) (0.6.2)\n",
|
||
|
|
"Requirement already satisfied: tqdm>=4.27 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from transformers) (4.67.1)\n",
|
||
|
|
"Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (2024.6.1)\n",
|
||
|
|
"Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (4.15.0)\n",
|
||
|
|
"Requirement already satisfied: sympy>=1.13.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from torch) (1.13.3)\n",
|
||
|
|
"Requirement already satisfied: networkx in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from torch) (3.3)\n",
|
||
|
|
"Requirement already satisfied: jinja2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from torch) (3.1.6)\n",
|
||
|
|
"Requirement already satisfied: setuptools in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from torch) (80.9.0)\n",
|
||
|
|
"Requirement already satisfied: click>=8.1.8 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jiwer) (8.3.0)\n",
|
||
|
|
"Requirement already satisfied: rapidfuzz>=3.9.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jiwer) (3.14.1)\n",
|
||
|
|
"Requirement already satisfied: paddlex<3.3.0,>=3.2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (3.2.1)\n",
|
||
|
|
"Requirement already satisfied: aistudio_sdk>=0.3.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (0.3.8)\n",
|
||
|
|
"Requirement already satisfied: chardet in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (5.2.0)\n",
|
||
|
|
"Requirement already satisfied: colorlog in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (6.9.0)\n",
|
||
|
|
"Requirement already satisfied: modelscope>=1.28.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (1.30.0)\n",
|
||
|
|
"Requirement already satisfied: pandas>=1.3 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (2.3.3)\n",
|
||
|
|
"Requirement already satisfied: prettytable in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (3.16.0)\n",
|
||
|
|
"Requirement already satisfied: py-cpuinfo in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (9.0.0)\n",
|
||
|
|
"Requirement already satisfied: pydantic>=2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (2.12.0)\n",
|
||
|
|
"Requirement already satisfied: ruamel.yaml in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (0.18.15)\n",
|
||
|
|
"Requirement already satisfied: ujson in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (5.11.0)\n",
|
||
|
|
"Requirement already satisfied: imagesize in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (1.4.1)\n",
|
||
|
|
"Requirement already satisfied: opencv-contrib-python==4.10.0.84 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (4.10.0.84)\n",
|
||
|
|
"Requirement already satisfied: pypdfium2>=4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (4.30.0)\n",
|
||
|
|
"Requirement already satisfied: httpx in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlepaddle) (0.28.1)\n",
|
||
|
|
"Requirement already satisfied: protobuf>=3.20.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlepaddle) (6.32.1)\n",
|
||
|
|
"Requirement already satisfied: opt-einsum==3.3.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from paddlepaddle) (3.3.0)\n",
|
||
|
|
"Requirement already satisfied: psutil in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from aistudio_sdk>=0.3.5->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (7.1.0)\n",
|
||
|
|
"Requirement already satisfied: bce-python-sdk in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from aistudio_sdk>=0.3.5->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (0.9.46)\n",
|
||
|
|
"Requirement already satisfied: colorama in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from click>=8.1.8->jiwer) (0.4.6)\n",
|
||
|
|
"Requirement already satisfied: urllib3>=1.26 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from modelscope>=1.28.0->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (2.5.0)\n",
|
||
|
|
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas>=1.3->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (2.9.0.post0)\n",
|
||
|
|
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas>=1.3->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (2025.2)\n",
|
||
|
|
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas>=1.3->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (2025.2)\n",
|
||
|
|
"Requirement already satisfied: annotated-types>=0.6.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pydantic>=2->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (0.7.0)\n",
|
||
|
|
"Requirement already satisfied: pydantic-core==2.41.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pydantic>=2->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (2.41.1)\n",
|
||
|
|
"Requirement already satisfied: typing-inspection>=0.4.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pydantic>=2->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (0.4.2)\n",
|
||
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas>=1.3->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (1.17.0)\n",
|
||
|
|
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->transformers) (3.4.3)\n",
|
||
|
|
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->transformers) (3.10)\n",
|
||
|
|
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests->transformers) (2025.10.5)\n",
|
||
|
|
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from sympy>=1.13.3->torch) (1.3.0)\n",
|
||
|
|
"Requirement already satisfied: pycryptodome>=3.8.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from bce-python-sdk->aistudio_sdk>=0.3.5->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (3.23.0)\n",
|
||
|
|
"Requirement already satisfied: future>=0.6.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from bce-python-sdk->aistudio_sdk>=0.3.5->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (1.0.0)\n",
|
||
|
|
"Requirement already satisfied: anyio in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx->paddlepaddle) (4.11.0)\n",
|
||
|
|
"Requirement already satisfied: httpcore==1.* in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx->paddlepaddle) (1.0.9)\n",
|
||
|
|
"Requirement already satisfied: h11>=0.16 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpcore==1.*->httpx->paddlepaddle) (0.16.0)\n",
|
||
|
|
"Requirement already satisfied: sniffio>=1.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from anyio->httpx->paddlepaddle) (1.3.1)\n",
|
||
|
|
"Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jinja2->torch) (3.0.3)\n",
|
||
|
|
"Requirement already satisfied: wcwidth in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from prettytable->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (0.2.14)\n",
|
||
|
|
"Requirement already satisfied: ruamel.yaml.clib>=0.2.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ruamel.yaml->paddlex<3.3.0,>=3.2.0->paddlex[ocr-core]<3.3.0,>=3.2.0->paddleocr) (0.2.14)\n",
|
||
|
|
"Requirement already satisfied: imageio!=2.35.0,>=2.33 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from scikit-image->easyocr) (2.37.0)\n",
|
||
|
|
"Requirement already satisfied: tifffile>=2022.8.12 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from scikit-image->easyocr) (2025.10.4)\n",
|
||
|
|
"Requirement already satisfied: lazy-loader>=0.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from scikit-image->easyocr) (0.4)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: PyMuPDF in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (1.26.4)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: pandas in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (2.3.3)\n",
|
||
|
|
"Requirement already satisfied: numpy>=1.26.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas) (2.1.2)\n",
|
||
|
|
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas) (2.9.0.post0)\n",
|
||
|
|
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas) (2025.2)\n",
|
||
|
|
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas) (2025.2)\n",
|
||
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: matplotlib in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (3.10.6)\n",
|
||
|
|
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (1.3.3)\n",
|
||
|
|
"Requirement already satisfied: cycler>=0.10 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (0.12.1)\n",
|
||
|
|
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (4.60.1)\n",
|
||
|
|
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (1.4.9)\n",
|
||
|
|
"Requirement already satisfied: numpy>=1.23 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (2.1.2)\n",
|
||
|
|
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (25.0)\n",
|
||
|
|
"Requirement already satisfied: pillow>=8 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (11.3.0)\n",
|
||
|
|
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (3.2.5)\n",
|
||
|
|
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (2.9.0.post0)\n",
|
||
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||
|
|
"Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n",
|
||
|
|
"Requirement already satisfied: seaborn in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (0.13.2)\n",
|
||
|
|
"Requirement already satisfied: numpy!=1.24.0,>=1.20 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from seaborn) (2.1.2)\n",
|
||
|
|
"Requirement already satisfied: pandas>=1.2 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from seaborn) (2.3.3)\n",
|
||
|
|
"Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from seaborn) (3.10.6)\n",
|
||
|
|
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.3)\n",
|
||
|
|
"Requirement already satisfied: cycler>=0.10 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)\n",
|
||
|
|
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.60.1)\n",
|
||
|
|
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.9)\n",
|
||
|
|
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0)\n",
|
||
|
|
"Requirement already satisfied: pillow>=8 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (11.3.0)\n",
|
||
|
|
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.5)\n",
|
||
|
|
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\n",
|
||
|
|
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n",
|
||
|
|
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n",
|
||
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\sji\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.17.0)\n",
|
||
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"%pip install --upgrade pip\n",
|
||
|
|
"%pip install --upgrade jupyter\n",
|
||
|
|
"%pip install --upgrade ipywidgets\n",
|
||
|
|
"%pip install -U ipykernel\n",
|
||
|
|
"\n",
|
||
|
|
"# Install necessary packages\n",
|
||
|
|
"%pip install easyocr transformers torch pdf2image pillow jiwer paddleocr hf_xet paddlepaddle\n",
|
||
|
|
"# pdf reading\n",
|
||
|
|
"%pip install PyMuPDF\n",
|
||
|
|
"\n",
|
||
|
|
"# Data analysis and visualization\n",
|
||
|
|
"%pip install pandas\n",
|
||
|
|
"%pip install matplotlib\n",
|
||
|
|
"%pip install seaborn"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 2,
|
||
|
|
"id": "ae33632a",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Imports\n",
|
||
|
|
"import os\n",
|
||
|
|
"import numpy as np\n",
|
||
|
|
"import pandas as pd\n",
|
||
|
|
"import matplotlib.pyplot as plt\n",
|
||
|
|
"from pdf2image import convert_from_path\n",
|
||
|
|
"from PIL import Image, ImageOps\n",
|
||
|
|
"import easyocr\n",
|
||
|
|
"from transformers import TrOCRProcessor, VisionEncoderDecoderModel\n",
|
||
|
|
"import torch\n",
|
||
|
|
"from jiwer import wer, cer\n",
|
||
|
|
"from paddleocr import PaddleOCR\n",
|
||
|
|
"import fitz # PyMuPDF"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "0e00f1b0",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 1 Configuration"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 3,
|
||
|
|
"id": "dda5534d",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n",
|
||
|
|
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n",
|
||
|
|
"Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']\n",
|
||
|
|
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
|
||
|
|
"C:\\Users\\sji\\AppData\\Local\\Temp\\ipykernel_17700\\3778845089.py:12: DeprecationWarning: The parameter `use_angle_cls` has been deprecated and will be removed in the future. Please use `use_textline_orientation` instead.\n",
|
||
|
|
" paddleocr_model = PaddleOCR(lang='es', use_angle_cls=True) # PaddleOCR in Spanish\n",
|
||
|
|
"c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\paddle\\utils\\cpp_extension\\extension_utils.py:718: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md\n",
|
||
|
|
" warnings.warn(warning_message)\n",
|
||
|
|
"\u001b[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)\u001b[0m\n",
|
||
|
|
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\sji\\.paddlex\\official_models\\PP-LCNet_x1_0_doc_ori`.\u001b[0m\n",
|
||
|
|
"\u001b[32mCreating model: ('UVDoc', None)\u001b[0m\n",
|
||
|
|
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\sji\\.paddlex\\official_models\\UVDoc`.\u001b[0m\n",
|
||
|
|
"\u001b[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)\u001b[0m\n",
|
||
|
|
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\sji\\.paddlex\\official_models\\PP-LCNet_x1_0_textline_ori`.\u001b[0m\n",
|
||
|
|
"\u001b[32mCreating model: ('PP-OCRv5_server_det', None)\u001b[0m\n",
|
||
|
|
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\sji\\.paddlex\\official_models\\PP-OCRv5_server_det`.\u001b[0m\n",
|
||
|
|
"\u001b[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)\u001b[0m\n",
|
||
|
|
"\u001b[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\\Users\\sji\\.paddlex\\official_models\\latin_PP-OCRv5_mobile_rec`.\u001b[0m\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"PDF_FOLDER = './instructions' # Folder containing PDF files\n",
|
||
|
|
"OUTPUT_FOLDER = 'results'\n",
|
||
|
|
"os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n",
|
||
|
|
"\n",
|
||
|
|
"LANGUAGES = ['es'] # OCR language(s)\n",
|
||
|
|
"#device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
|
||
|
|
"device = 'cpu'\n",
|
||
|
|
"# Initialize AI OCR models\n",
|
||
|
|
"easyocr_reader = easyocr.Reader(LANGUAGES)\n",
|
||
|
|
"trocr_processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-stage1')\n",
|
||
|
|
"trocr_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-stage1').to(device)\n",
|
||
|
|
"paddleocr_model = PaddleOCR(lang='es', use_angle_cls=True) # PaddleOCR in Spanish"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "84c999e2",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 2 Helper Functions"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 4,
|
||
|
|
"id": "9596c7df",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"def pdf_to_images(pdf_path):\n",
|
||
|
|
" return convert_from_path(pdf_path)\n",
|
||
|
|
"\n",
|
||
|
|
"def ocr_easyocr(img):\n",
|
||
|
|
" result = easyocr_reader.readtext(np.array(img))\n",
|
||
|
|
" res = ' '.join([r[1] for r in result])\n",
|
||
|
|
" plt.figure(figsize=(10, 12))\n",
|
||
|
|
" plt.imshow(img)\n",
|
||
|
|
" plt.axis('off')\n",
|
||
|
|
" plt.title(res, fontsize=10)\n",
|
||
|
|
" plt.show()\n",
|
||
|
|
" return res\n",
|
||
|
|
"\n",
|
||
|
|
"def pdf_extract_text(pdf_path, page_num) -> str:\n",
|
||
|
|
" \"\"\"\n",
|
||
|
|
" Extracts text from a specific PDF page in proper reading order.\n",
|
||
|
|
" \"\"\"\n",
|
||
|
|
" doc = fitz.open(pdf_path)\n",
|
||
|
|
" \n",
|
||
|
|
" if page_num < 1 or page_num > len(doc):\n",
|
||
|
|
" return \"\"\n",
|
||
|
|
" \n",
|
||
|
|
" page = doc[page_num - 1]\n",
|
||
|
|
" blocks = page.get_text(\"blocks\") # returns list of (x0, y0, x1, y1, \"text\", block_no, block_type)\n",
|
||
|
|
" \n",
|
||
|
|
" # Sort blocks top-to-bottom, left-to-right\n",
|
||
|
|
" blocks_sorted = sorted(blocks, key=lambda b: (b[1], b[0])) # y0, then x0\n",
|
||
|
|
" \n",
|
||
|
|
" text = \" \".join([b[4].replace('\\n', ' ').strip() for b in blocks_sorted])\n",
|
||
|
|
" return text\n",
|
||
|
|
"\n",
|
||
|
|
"def evaluate_text(reference, prediction):\n",
|
||
|
|
" return {'WER': wer(reference, prediction), 'CER': cer(reference, prediction)}"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "e42cae29",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 3 Run AI OCR Benchmark"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 5,
|
||
|
|
"id": "9b55c154",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\torch\\utils\\data\\dataloader.py:666: UserWarning: 'pin_memory' argument is set as true but no accelerator is found, then device pinned memory won't be used.\n",
|
||
|
|
" warnings.warn(warn_msg)\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAsAAAAPDCAYAAAC9zL31AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAm9NJREFUeJzt3QV0HcfBhuGRmZnZjsPM3CRtkiZlTJmZ4S8zMzOlzE25ado0aZiZGjQzM0u+/3lHGuX6+pLAtpR5n3Oc2NKF3dnZ3W9nZ2YbCoVCIUiSJEmZ6LGvF0CSJEnamwzAkiRJyooBWJIkSVkxAEuSJCkrBmBJkiRlxQAsSZKkrBiAJUmSlBUDsCRJkrJiAJb2gK1bt4ZPf/rT4ZFHHtnXi5K9yy67LPzoRz/a14vxmPb9738/XHnllft6MSSpbgbgFi9/+cvDM57xjH29GN0KJ7yGhoawdu3a0BX89Kc/DcOGDesSdeatb31rDL8zZ84MXRHb7S9/+Uvdr//Yxz4WjjrqqNAVzZ07N67PnXfeudvvHnroobidTjjhhC5x3Ohq+0xn+MUvfhF++MMfhuOPPz48FnWFc0O1Ot4dj51dUem+2VXL5Mwzzwxvf/vbQ1cyt0z9vO6668Lhhx8eevfuHfefth772rqe7Tq2FtrgZS97WeHpT396obNMnTq18NWvfrXQFaxdu7awZs2afb0Y3coVV1zBY7S7TLn95Cc/KQwdOnSf15lf/vKXhXPOOaewffv2QlfFdvvzn/9c9+s/+tGPFo488shCV9TY2FhYsmRJYceOHbv8fPPmzYVjjz22cOmll+6x727rMbHaPpN+V+0Pr6Gel/vdD3/4w/g5lX7ft2/fmss1bNiwwpYtW3b53c0339z6GaUefPDBwmGHHRbLf1+eS+qtr3PmzInrcccdd3Src0NpHe/sYy/7yrJlywpdUVtzQnvLpvR9XbVMVq1aVVi/fn2nfNavf/3rQo8ePQpvfOMbO7TvljsGn3DCCYUXv/jFhQULFsQy3bZtW3zNzp0798h6tme79wpdXFNTU0z1PXrs2cbqoUOHhlzt2LEjXqXltL33ZJ150YteFP9o79S9nj17hnHjxu328/79+4dbb701dBennHJKWLJkSeu/3/a2t4X169eHn/zkJ60/GzFiRGxtGTJkSHjwwQcr1sdyv2e/qmXw4MHhz3/+c3jBC17Q+rMLL7wwTJkyJcyfP3+31x9wwAHhnnvuCfvK9u3bQ58+fbr9uaHWflCpjnfWd7Ov8EeP6qplwjGgs1x44YXhPe95T+zC9OUvfzn069ev5rm53vo5a9as8PrXvz5MmjSp9WdtqcOduZ6VdChl0ETNrV4KkIVl5bhVmtDQxL85ePbt2zdMmDAhvj69d968eeEd73hHLNRUsOm2w9/+9rdwyCGHxPdx4C3XHE6zOrenkm3btoX3vve9YfLkyfF93H5mAyf33XdfeMpTnhJPDhzoTz/99LiRyt3m4rNY1jFjxsRKcdppp4Vbbrllt+b2yy+/PBx33HFhwIAB8QRWetL561//Go455pj4GTNmzAgf//jHQ2NjY83yqXYbmsrKOvKdF1xwQVi3bl3ra1jGc845J4waNSoeuM8444xw++237/I5LPd3v/vd8LSnPS0MHDgw9lWlcr/qVa8K06dPjzv9gQceGL7+9a+Htli1alU8cU6cODEuG7c/fvOb31R9T6XtTfm/613vip/FMp544om79THkvZQd3/XMZz4zfn8xtu3Tn/70MHbs2DBo0KB4i5b+oMW6Yp0p9vjHPz68+c1v3uVnK1asiCd9PqeSavWuHMqAMMMy8foPf/jD8cRYqlrd27lzZ/jEJz4RD3iUJXX1X//61263yX73u9/Fesmy/epXv6qr3vDZX/jCF+L24bPZ7tTbSrffrrrqqtjtgdeOHz8+vO9979tl/Wsdu8phH3nnO98Z6+vIkSPje5sb03ddzs9+9rOt+9
GRRx4ZLrroolAPtinLkf7wfpa/+Gcp7LG+xT9Pr0/K/Z79oJaXvexl4cc//nHrv7ds2RJ++9vfxp8Xq2ebsd78nOWivM4+++ywadOmWM4/+9nPYh1Nx/60by9YsCDWK8qY7cL+y/ZN0j7Htud4yXGqI9Lx59///nc4+OCD43HivPPO2+VCpHg//8EPfhC/l+1cjOV85StfWff+V+4YvGbNmnjhPHr06Fhm+++/f+vFT3Ed5+9nnXVW/Pnw4cPjz9N5kP2N406qoxy70vGq2j5Y7nY/y7fffvvFOkc509WlFvraU4587kEHHRS+853v7Pbdf/rTn+LyU2/YP2644YY2bLHmsuN7OObzGZQT54/0HZXKpq37Zrky+dSnPhWP75wLXv3qV8fjSmnXsM4oA7oPcIzi96zHE5/4xFg/UJqF2C6cT1gm9vMXvvCFYfny5TXLcc6cOeH666+P68Cxn2WqdW6mjpfbd0vrJ3/nGMHr+TufVa6Lwt5Yz6rqbisu0/R9xhlnFIYMGVL42Mc+VnjooYcKP/vZzwoNDQ2ttxz/8Ic/xN//85//LMybN69w0003FX7wgx+0Nm9PmjSp8IlPfCI2i6dbaNy+6927d+GUU04pXHfddYUHHnigsGnTpvhdb3vb23ZZHpaFZUouuOCCwuTJkwt/+tOfCrNmzSpcdtllhd/+9rfxdwsXLiyMGDGi8KxnPatwyy23xFt3P/7xj+Pnl1u3t771rYUJEybEZb/vvvvi74cPHx6Xu7i5/cQTTyxceeWV8TWnn356XO7k6quvjuv/05/+NC4P5TJt2rRYXrXKp9JtvYEDBxYe//jHx1t4V111VWHmzJmFF77wha2vufzyywu/+MUvCvfff3/hf//7X+FVr3pVYezYsbvcSmC5x4wZE9ef5eK7uV3/kY98JJbN7Nmz4238AQMGFH73u9/VfcuBMv7iF78Yl43P/cY3vlHo2bNnXK9KKm3vV7/61fFnlOEjjzwSP5dbuNQz3HjjjfHWzec///m4Lb/+9a/H27fFXSDuvPPOwve+973CPffcE9/3oQ99qNCvX7+4vl21zpT61a9+FT9j69atrT/7yle+EutRpVtJtepduS4Qn/zkJ2P5c4v4b3/7W6wzlG1b6h7Lxff+5je/iWX0nve8J27btM3S7WeW5Y9//GOsZ4sXL66r3vBZlAPrRH245pprWm/5l97W5vOou9zWYz9gPUeNGhXXod5jVzmUB8vAsqd9a/DgwbvUgU996lOFgw46qPCvf/0rrgv1m3rL9m7rbbpK3QRqdfVpT1egtFzUcZY37SMcS+hKQBkWny5Kt9m3v/3tQq9evVq3GduVf1Mn2D533313fM2GDRviH/a78847r/XYz+1RjkEHH3xw4ZWvfGV8PWVM/TrwwAPj71OZDBo0qPCSl7ykcO+998Y/HekCkY4/Z599dtzHb7vttrgMxfW6eDusXr260KdPn3icSNi/i39W7/5Xegx+05veVDjqqKPicrCc//nPf+K+WLrc3G6mDqbtRfnRTQMXXXRR/N3DDz8cX/vUpz61cPjhhxeampqq7oOldYbjIeXCNuM7vvzlL8d98r///W/FOsQ5Y/z48a2fy/85flIOxd/N/vGPf/wjfu5znvOc2MWhtPtStS4QfAbZgdv3rCfHXeoE26Fa2bR13ywtE9aP8wfbjM/++Mc/HrdzcT3rjDJgu7Fcb3jDG+I5jDr+zW9+s7BixYr4+9IsdOGFF8ZzDut0ww03FE4++eTC+eefX6jlwx/+cPxu8Pkc22udm9etW1d23y2tn/ycsvna174W/053ktLy7ez1bE8XiA4H4NNOO22X1xx//PGF9773vfHv7DQHHHBAxb6Q5fr2pP5rFEixWgGYisT7OGiU8/73v78wffr0istSvG4bN26MG57wkfA+ws0XvvCFXQq7+EB48cUXx5+lPnRPeMITCp/5zGd2+R5OKOwg9ZRPuYM6ByFOPskll1wSg2ClPn
gc+DhJ//3vf2/9Gcv49re/veb3cUB+9rOfXfH39VS4Jz/5yYX/+7//q/j7ctubkwHruWjRol1eS3myHfGCF7yg8KQ
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 1000x1200 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"[2025-10-08 12:29:47,364] [ WARNING] image.py:661 - Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Got range [-0.96862745..1.0].\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAakAAAGzCAYAAACVYeimAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAXtlJREFUeJztnQm8ldP6x9dp1KBSadJAQpoMSdJFlJJkKPdGVOiiJAohInUvZZahXFOFErkyRCVNhlLqShMpUtFIGpTm9//5Pp//2p/37PY+U+c47zn9vp/PPvvsd1jzu571POt510oJgiBwQgghRAQpkNsJEEIIIZIhISWEECKySEgJIYSILBJSQgghIouElBBCiMgiISWEECKySEgJIYSILBJSQgghIouElBBCiMgiISVEPueRRx5xtWvXdvv378/UfT/99JNLSUlxI0aMyLG0ZTUdDzzwgB37qyFO4vY8//zzrnr16m7Xrl1/eVoOFSSkDgF4uHm45s6d66LAjh077EGfPn16hq5fsmSJXU9nlV1QHhn5pJVGVhR77bXX3Nlnn+3KlCnjihcv7urXr+8GDhzotm/fnvS+cePGudatW7vy5cu7IkWKuCpVqrh//OMfburUqbFriDecjoIFC7oKFSq4yy+/3H377bcZzufWrVvdww8/7O666y5XoIAe9+zmmmuucbt373b/+c9/cjsp+ZZCuZ0AceiBkBowYID936xZswwJKa7n2qOPPjpb0oBwCfPqq6+6yZMnH3D8xBNPTHj/vn37XMeOHd1bb73lzjrrLBOiCKnPPvvM0jp27Fj3ySefuIoVK6YSatddd50NGk455RR32223uUqVKrm1a9ea4GrevLn74osv3Jlnnhm755ZbbnGNGjVye/bscQsWLLCROwJs0aJFdm96vPLKK27v3r3uyiuvzHQZ1ahRw/3555+ucOHCLmr069fP3X333bmdDHfYYYe5Ll26uCeeeML17NkzV7S7fA8LzIr8zfDhw1lEOPjqq6+CKLBx40ZLT//+/TN0/dixY+36adOm5ViaevToYXGkx/bt2+37oYcesuvvuOOOA655//33gwIFCgQXXHBBquOPPvqo3dOrV69g//79B9z36quvBrNnz7b/ySvXkvcww4YNs+MPP/xwhvLVoEGD4Oqrrw7yMitWrLA8045zm0Ttdu7cuXZ8ypQpuZau/Iz0/0PYTFGyZEn3yy+/uEsvvdT+P/LII90dd9xhWkL8fMBjjz3mnnzySRtdFytWzJ1zzjk2mg+DppNIMyIurwERHvEAGoc3Z4Xt/GHQOv7+97/b/+eee25CM9zQoUNd3bp1XdGiRc101qNHD7d58+aDLiPyUq9ePTdv3jwz6aEp3XPPPaZdPProo+744493gwYNOuC+tm3b2uh64sSJ7ssvv7Rj3MO1zA1RlolG3J06dXKnn356mmlCa4Mffvgh3fSvWLHCtK8WLVrEjqGRlS1b1l177bUJTYNoBrSBZHNB69ats3urVq1q5V25cmV3ySWXpDLFJqtP2gBtwbNp0yaLCxMp7a9UqVJmBv3mm2/SzVv8nBThJjPZhtPC3FH//v1drVq1LP3VqlVzd9555wFzSvzu3bu3tdXDDz/cXXzxxe7nn39OmJaGDRtamb733nvppltkHpn7DmEQRq1atXKNGze2jhPz1OOPP+6OPfZY17179wPMYdu2bTMBsHPnTjdkyBB33nnnuYULF6YyaaUHD/2wYcMs/Msuu8y1a9fOjjdo0CDh9QgHTF5PP/20CQhvfvPfdEAIOzpiwly6dKmF/9VXX5np7GBNVb/99pt1nFdccYW7+uqrLa+ff/65+/33392tt97qChVK/Ah17tzZDR8+3I0fP96dccYZdg+dcq9evWx+Kat4YXDEEUeke+3MmTPt+9RTT40dozwo93feecfmUZgT87z77rvWOZPXZLRv394tXrzYTFsInQ0bNpiZdNWqVZk2xf7444
8WJ4OQY445xq1fv97SxAAIEy8Djoxy4403phLGwCBh1KhRNpcHOI4gbKiLG264wdoQ7ZfB1/fff29p8fzzn/90r7/+upl0Mb8yX9imTZuk8VPGtDeRA+S2Kidyx9zXpUsXOzZw4MBU155yyilBw4YNDzC1FCtWLPj5559jxzFLcbx3796xY+ecc4594iGuGjVqZLu5b8OGDUGRIkWCli1bBvv27Ysdf/bZZ+36V155JTgYcx954djzzz+f6vhTTz1lx8eNG5c0vE2bNtk17dq1s99DhgxJ954w3txHHiivNWvWBBMnTgxq1aoVpKSkBHPmzEk3jH79+lkY27ZtS3V80qRJdvyDDz5IdfzCCy8MatasmdTM9vvvv9tvzJZpkaxuaQO0Bc/OnTtT1ZuPs2jRoqnaZSJzH+Gn1X0tW7YsKF26dHD++ecHe/futWOvvfaamWE/++yzVNdSv4T1xRdf2O/58+fb75tuuinVdR07dkyatxtuuMGeEZH9yNx3iNOtW7cDzEmMcOPBJHjUUUfFfmOWQgP76KOPXG6B5odnFdpJ2HPt+uuvN9PRhx9+eNBxYBKKN42hUQJmoGT4c5jQwt9p3ZMIHC3QPtEqLrjgArdlyxZz7sCZIiNaIJoeprQwaMB4Fr755puxY2iGaEQdOnRIGh5mXjQvTK1cnx1l6+sNrZ70ktYTTjjB/e9//8tyuHhWoi2ibb7xxhsxzRVnFrQnTK6//vpr7EN5wLRp0+zbt2k0+DC0s2QQFyZdnIJE9iIhdQjD/IOfHwo/bIk6oOOOO+6AY8zJZKdbeGZZuXKlfdOphaEjrVmzZuz8wYBgDpvEwoLGC6tExAsyhGZ69yTi/vvvN+GB9x8mRITUwbqSI7gw2zGH4udiMP8xX5WWkEKo4M4+YcIEM3tiiuUdLOapsgLmN0xttC3CRnDSHplHI59ZhUEKc3aUWbly5WLHly1bZqZK4gh/aMeA6RJoN5QxZu8w8e0sjN/gXN592Y+E1CHMwcyNJCLZAxp2xMhroD3E4+fD6EyT4c/VqVPHvhm9A3MgmQGnAuZa0GRHjhxpcyp0wqtXr073Xjpo3M8TCUbmnTiOwAFc6UnjSSedlGaYaBPM3+AEwiDnvvvus/L4+uuv001PfDt46KGHzA0fYcf8z6RJk0wg4wST2RePPcyVoj29+OKL7uSTT051jjApT+JI9LnppptcVmFgh2NNovYiDg4JKZEhGIXGQ2cVnixHC0vkVRev0WR2tJnsejwNAWeJMJgA8Wzz57Obv/3tb/by7ujRo5MKYBxN4KKLLord481PByO0Bw8ebI4rDz74YLrXesFIWcSDYMAzD5MfJi8cA9LSosKgYdx+++3u448/Ng9PyhuHm7TaAdfwPliYt99+2zw2X375ZROaLVu2NIGcVc9M3lHDWxBBetVVVyVMN84rvI9GPPEfrynRbhBo8R6U8e0sDGWc7J06cXBISIkMgecT7uqeOXPmuNmzZ5vnW7gT+O6779zGjRtjx3Anjvd6YsQJGe2MSpQokfB6OhZMcXj+eXML0OlhLkrLG+tgIP10hnRa99577wHnmQvDbRvPSTz7/D2s+sBqEXyH0+tBm6Bc04IyxlRH+OmZ2Zo0aWLfiVYawZzF6hUffPCBzXGhcaUnpJhvQUDGpweTZtiFm2OffvppquteeOGFA4Qzmnx8OTBvFG5nGQUByKodDAZ4PSARnCdstKx4mE/yq4T4Nk27CvPUU08ljZ85tPBL2CL7kAu6yBC8V0IHgJs3HRIPLOYk3jEJT/Lz5j2dc9euXc3GzwoJmG+84wBgEsEMxiie+QDeMeF9JD6JwGxDh8Z8CMKH+Qsmu3Et7tu3r7mg41SAKQzBwXtTOBbgMp5TsNoBJi7SNGvWLBMc5Av3ZoQNo2rMc2H69OljcyJoHUzSIyRYNQJhwyAAAeXdxtOCcDDPUQdoVslgXo4yxcGEuokHofTMM8/Ye0OYwdLTBNCc0ULo7Kk/5r
aY98F1POy2jvs2DjmUyfnnn28DFUx5zDmFQctkCSkcU+jgMYXiMk66MwtODgyOaI9jxoxJdY7XG/jwHhrlRtoo/6Z
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqEAAAPDCAYAAACD6dK7AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAfGhJREFUeJzt3QW0HdX5sPEJ7u4Ud3eKk+JFihV3LxQoEKBAgeJOi5fiFC1FWrQFSnC3Utzd3X2+9ex/9/0mJ0fvvXmTmzy/te5KcnPOyJ49M++8W6ZfWZZlIUmSJAUaKXJlkiRJEgxCJUmSFM4gVJIkSeEMQiVJkhTOIFSSJEnhDEIlSZIUziBUkiRJ4QxCJUmSFM4gVJIkSeEMQiVJkhTOIFSSJEnhDEIlSZIUziBUkiRJ4QxCJUmSFM4gVJK64aKLLirGGWecrp877rhjaG+SJPUp/cqyLIf2RkhSX/PZZ58V77zzTte/p5566mLMMcccqtskSX2JQagkSZLC2RwvSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKk
mSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkiRJCmcQKkmSpHAGoZIkSQpnECpJkqRwBqGSJEkKZxAqSZKkcAahkqQhrizL9KPhn8da7TIIVQguSD/++KMXphqUx/fff2+5DEUvvPBCsd9++xUffPBB0Vc8/fTTxTHHHFN8/vnnQ/2c/uGHH5rWX/5v4MCBxR577FF88sknRV/08ccfFwcccEDx5JNPdvzdTz/9tPj9739f/Pe//y36Mo4jx5pj3uwz7777bjrWd999d9i2sU1nnHFGccstt3TrWvrtt98Wf/zjH4ubbrqpaVDNelr95PU3+3x1G8sWyx7e7w2jDO0N0IiBQOukk04qFlhggWK55ZYb2pszzHj22WeLQw89tNh9992LhRZaaGhvzgjp1VdfLc4///xi6623LiaeeOKiL3jmmWeK008/vdhkk02KccYZZ6htx5/+9KfilVdeKQ455JBijDHGqPsZgpJ99923WHvttYtxxx236IsIJC+44IJi0UUXLeacc86OvvvZZ58VF154YTHffPMV88wzT9FXff311ykQn3XWWYttt922GGmkwXNYBKlc5x977LHid7/7Xdi2sV7KuH///sXPfvazjr//3XffFVdeeWXxxRdfFCuuuOJg///NN98Uf/7zn4s33nij6XIoky222KKYY4450oMi28Syay211FLFL37xi666xXn04YcfDva50Ucfvdhuu+2KaaedthheGYQqLAi96KKL0sk8JINQTvz77ruv2HDDDdMJPKwj+3bvvfemG7XU1zz++OPF888/n87vRsHBaaedVkw//fTFTjvtVDdw6QuG92xUOwimHnzwwaZN7Q888EBx3XXXFWeffXafeaBr5/jy/wSJ1es0WfHnnnuuWG211YpRRvm/UGrkkUdO9zjcdtttKbhceeWVB7sX8WCSvfHGG8XRRx9dLL744sVkk002yOd4sKsXxA5PDEI1XLnnnntSM+Waa67ZJ4JQMit33XVXMdFEEw3tTZE6xs2Tm+TYY4/dMHs24YQTps+Rse3Xr1/4Nqp3kMW+4ooritFGG63hw8R7771XHHbYYcX8888/XB1rgsGDDjpokN/RfH/OOeekbgBjjjnmYN+hKX2qqaYqTjnllKbX97IsU5nSJWjJJZcsRjQGoRpqT5v8m6fGUUcdNV3UuGF99dVXKXjkpK690OU+SXyGGx+f4eLAxY4Tnn49/PC5vByWwQkOvsNneWrl/1k3N0/Wn7clf6eK7/H/fK56YeV3rI9l8Xu2p95n2Db2jeWzvfzkdfDneOONl56g65UXy//yyy/Tv1k+21d7cWf7WAf7SfnQT5DPjDXWWGlf690Mcjmy/Nptqt32Vp
9rdkz5O01c/JuyrrefrIPPUUZ8p1455uPLPvJ39pFlscy8PexTXg7fZzk5Q9EdebvYf5bDvtduV6Pv5W2lPPLxIAj
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 1000x1200 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqEAAAPDCAYAAACD6dK7AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAhAtJREFUeJzt3QWUHUX6sPEOFtzdggd31+DuiztBFpfgi7suzuKyuC/uBHdfnITg7hYs9Hee+m/dr+fm6kiN5PmdMyeZO7e7q6uru98u6155nueZJEmSlNBIKTcmSZIkwSBUkiRJyRmESpIkKTmDUEmSJCVnECpJkqTkDEIlSZKUnEGoJEmSkjMIlSRJUnIGoZIkSUrOIFSS1LTpppsuO+200zo7GZK6sV6+tlNSav369cvmnXdeg5hu7Msvv8zGGmusbMwxx+zspEjqpkbp7ARIkrqfSSaZpLOTIKmbszle6sG1jbvvvnu23377ZRNOOGE2+eSTZ4cffnjNZZ599tlsxRVXzCaeeOJsvPHGy5ZZZpnshRdeqLnMX3/9lR155JHZ1FNPnfXu3TvUcN59991Vv7/11ltnDz/8cHb66adnvXr1Cj/vvfdexe9+++232ZZbbplNMMEEocZt1VVXzd55552a6fnuu++y7bbbLgRJ4447brbccstlL7/8cs1lPvzww2zDDTfMxh9//JBXa6+9dtU0Ra+++mpIz9hjj51NNtlk2RZbbJF99dVXNZd57LHHsqWWWiobY4wxsmmmmSYcn59//rnmMrfccks2//zzZ6OPPno2wwwzZEcccUT2559/1lzmwgsvzGabbbawzKyzzpqdc845dY/hcccdl00//fQhbfPMM092ww03tLk5PkU6JHVjNMdL6nmWWWaZfNxxx80PP/zw/O23384vu+yyvFevXvm9995bdZkHHnggv/zyy/M33ngjf/311/P+/fvnk002Wf7DDz9UXeaf//xn2M7VV1+dv/nmm/l+++2XjzrqqGGblXz33Xf5Yostlm+//fb5p59+Gn7+/PPPit9da6218tlmmy1/5JFH8pdeeilfeeWV85lmmin//fffq6ZnhRVWyNdcc8382WefDWkYMGBAPtFEE+Vff/11xe+zLrax7bbb5q+88krY70033TTv27dv/ttvv1Vc5ttvv80nmWSS/MADDwx59cILL+Qrrrhivuyyy1ZN16BBg/KxxhorP/XUU0O6Hn/88Xy++ebLt95666rLsN/k7aWXXpoPHjw4HLvpppsuHNNqrrjiinyKKabIb7zxxvzdd98N/0444YRhHdUcffTR+ayzzprffffdYTuXXHJJ3rt37/yhhx6qukyfPn3CvnR2OiR1XwahUg8OQpdccskWny200EL5/vvv3/A6hg0blo8zzjj5bbfdVvU7U045ZX7MMccMt52dd965Ztr22GOPmtsmUOM5mWAt+uqrr/Ixxhgjv+666you8+ijj4ag7ddff23x+Ywzzpifd955FZch6Cbg/Ouvv0qfEXyynXvuuafiMkcddVS+0kortfjsww8/DOl96623Ki5DQL/DDjsMl96RRhopHzp0aMVlll9++fzYY48dLr0Ed9Wwr1ddddVw6SXwr4S8GnPMMfMnnnhiuPRusskmrQ5CU6VDUvdln1CpB5t77rlb/D7FFFNkX3zxRdXvf/7559nBBx+cPfTQQ+F7w4YNy3755Zfsgw8+qPj9H374Ifvkk0+yJZZYosXn/F6vCbyeN954IxtllFGyRRZZpPTZRBNNlPXt2zf8rRK2+dNPP4XvFQ0dOjQbPHhw1WUGDRqUjTPOOC0+//XXX2suM3DgwNAUX45lZplllorLvPLKK9mVV15Z+oyKAJqghwwZEpqtKy3z+OOPZ8ccc0zpM44JaeO4lA8Kommf7ffv3z/bfvvtS5/TfE/3ikrYd9ZFN4yi33//PZtvvvmy1ugq6ZDUtRmESj3YqKOO2uJ3+l8S9FSz1V
ZbZV9//XXor9mnT5/Qx3OxxRYLgUB3QABKoE0QXY7+ntWWWWCBBVoEh/UG37DMmmuumZ1wwgnD/Y3tV1tmxx13DP1Ay0077bRVl6EP6HrrrTfc3+hnWen7uOCCC1oE7xh55JGrbgN33HFHNtVUU7X4G8e/NbpKOiR1bQahkkqodWPwyGqrrVYasFNrsA0Df6accsqwHIOYiutZeOGFqy432mijhRq9WqgZpObs6aefzhZffPHwGQHyW2+9lc0+++wVl2EAz2effRZqUBk40wiWufbaa7NJJ5007E+jy9x4441hG2yr0WVef/31bKaZZmro+3EZ9rfRZRggxfF49913s80226yhZchLgjxqu4vHsC26SjokdW0GoZJKZp555uzyyy/PFlxwwdDUvu+++4ZRyrXwncMOOyybccYZw8j4Sy65JHvppZcq1ixGBG8El4xAp0mbEekjjTTScGlhlDrNueedd15oLj/ggANCLRmfV7LCCiuEmtt11lknO/HEE0OzON0FqF1bd911w36VI0g66aSTwjrjKP/3338/u+mmm8LMAvxebpdddgm1fJtssklp9gGak6+55powIrxSbd/++++fLbrootmuu+4aRu8zxyZB6X333ZedddZZFffn0EMPzdZYY41QU/q3v/0t5BFN9IzMP/rooysuQ80pta00e6+yyirZb7/9lj333HNhpoG99957uO+Tr/vss0+21157hVryJZdcMvv+++/DgwRBObXjrdFV0iGpC+vsTqmSOkalwT9rr712vtVWW1VdhlHeCy64YD766KPnM888c3799dfXHYDC4CVGa0811VRhVPw888yT33XXXTXTxuCdRRddNAz+4TI0ZMiQit/75ptv8i222CIfb7zxwncZHV9t1H3ESP7ddtstDJgiPdNMM02+2Wab5R988EHVZRihv+WWW+YTTzxxGI09wwwzhNH733//fdVlSMe6666bjz/++CFtjOrec889WwxwKvfMM8+EUfRjjz12GCk/99xzDzeoqxwjxRdffPGwDQZdLbzwwvn5559fc5krr7wyn3feefPRRhstn2CCCfKll146v+mmm6p+nzSfdtppYYAWecbIf/L64YcfrrpMvXKRKh2Sui/fmCRJkqTknKxekiRJyRmESpIkKTmDUEmSJCVnECpJkqTkDEIlSZKUnEGoJEmSkjMIlSRJUnIGoZIkSUrOIFSSJEnJGYRKkiQpOYNQSZIkJWcQKkmSpOQMQiVJkpScQagkSZKSMwiVJElScgahkiRJSs4gVJIkSckZhEqSJCk5g1BJkiQlZxAqSZKk5AxCJUmSlJxBqCRJkpIzCJUkSVJyBqGSJElKziBUkiRJyRmESpIkKTmDUEmSJCVnECpJkqTkDEIlSZKUnEGoJEmSkjMIlSRJUnIGoZIkSUrOIFSSJEnJGYRKkiQpOYNQSZIkJWcQKkmSpOQMQiVJkpScQagkSZKSMwiVJElScgahkiRJSs4gVJIkSckZhEqSJCk5g1BJkiQlZxAqSZKk5AxCJUmSlJxBqCRJkpIzCJUkSVJyBqGSJElKziBUkiRJyRmESpIkKTmDUEmSJCVnECpJkqTkDEIlSZKUnEGoJEmSkjMIlSRJUnIGoZIkSUrOIFSSJEnJGYRKkiQpOYNQSZIkJWcQKkmSpOQMQiVJkpScQagkSZKSMwiVJElScgahkiRJSs4gVJIkSckZhEqSJCk5g1BJkiQlZxAqSZKk5AxCJUmSlJxBqCRJkpIzCJUkSVJyBqGSJElKziBUkiRJyRmESpIkKTmDUEmSJCVnECpJkqTkDEIlSZKUnEGoJEmSkjMIlSRJUnIGoZIkSUrOIFSSJEnJGYRKkiQpOYNQSZIkJWcQKkmSpOQMQiVJkpScQagkSZKSMwiVJElScgahkiRJSs4gVJIkSckZhEqSJCk5g1BJkiQlZxAqSZKk5AxCJUmSlJxBqCRJkpIzCJUkSVJyBqGSJElKziBUkiRJyRmESp
IkKTmDUEmSJCVnECpJkqTkDEIlSZKUnEGoJEmSkjMIlSRJUnIGoZIkSUrOIFSSJEnJGYRKkiQpOYNQSZIkJWcQKkm
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 1000x1200 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
"def ocr_trocr(img):\n",
"    \"\"\"Run TrOCR on one page image, plot the model input and the result, and return the text.\n",
"\n",
"    Args:\n",
"        img: Page image exposing PIL's ``convert`` API.\n",
"\n",
"    Returns:\n",
"        The first string decoded by ``trocr_processor.batch_decode``.\n",
"\n",
"    NOTE(review): TrOCR checkpoints are usually trained on single text lines, so a\n",
"    whole page may need line segmentation first - TODO confirm for the checkpoint in use.\n",
"    \"\"\"\n",
"    # Grayscale + autocontrast to boost faint text, then back to RGB for the processor.\n",
"    img_gray = img.convert('L')\n",
"    img_bin = ImageOps.autocontrast(img_gray, cutoff=1)\n",
"    img_proc = img_bin.convert('RGB')\n",
"\n",
"    # TrOCR expects a list of images\n",
"    pixel_values = trocr_processor(images=[img_proc], return_tensors=\"pt\").pixel_values\n",
"\n",
"    # Generate text\n",
"    generated_ids = trocr_model.generate(pixel_values)\n",
"\n",
"    # Debug view of the tensor that is actually fed to the model. imshow clips float\n",
"    # RGB data outside [0, 1], and the processor output is presumably mean/std\n",
"    # normalised (so partly negative), hence rescale to [0, 1] for display only.\n",
"    model_input = pixel_values[0].permute(1, 2, 0).cpu().numpy()\n",
"    lo, hi = model_input.min(), model_input.max()\n",
"    span = hi - lo\n",
"    model_input = (model_input - lo) / span if span > 0 else model_input.clip(0, 1)\n",
"    plt.imshow(model_input)\n",
"    plt.title(\"Input to TrOCR (visualized)\")\n",
"    plt.show()\n",
"\n",
"    # Decode\n",
"    res = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
"\n",
"    # Show the preprocessed page with the OCR text as title\n",
"    plt.figure(figsize=(10, 12))\n",
"    plt.imshow(img_proc)\n",
"    plt.axis('off')\n",
"    plt.title(res, fontsize=10)\n",
"    plt.show()\n",
"    return res\n",
|
||
|
|
"\n",
|
||
|
"def ocr_paddle(img):\n",
"    \"\"\"Run PaddleOCR on one page image, plot the processed image, and return the text.\n",
"\n",
"    Args:\n",
"        img: Page image exposing PIL's ``convert`` API.\n",
"\n",
"    Returns:\n",
"        All recognised text fragments joined with single spaces ('' if none).\n",
"    \"\"\"\n",
"    img_gray = img.convert('L')\n",
"    img_bin = ImageOps.autocontrast(img_gray, cutoff=1)  # boost faint text\n",
"    img_proc = img_bin.convert('RGB')\n",
"    result = paddleocr_model.predict(np.array(img_proc))\n",
"\n",
"    # predict() in PaddleOCR 3.x is expected to yield one dict-like result per page\n",
"    # with the recognised strings under 'rec_texts'; the legacy ocr() layout is a\n",
"    # list of [box, (text, score)] entries per page. Treating a dict-like result as\n",
"    # the legacy list iterates its keys and picks single characters out of them, so\n",
"    # both layouts are handled explicitly.\n",
"    # NOTE(review): confirm the result layout against the installed paddleocr version.\n",
"    texts = []\n",
"    for page in result or []:\n",
"        if page is None:  # page without any detected text (legacy layout)\n",
"            continue\n",
"        if hasattr(page, 'get'):\n",
"            texts.extend(page.get('rec_texts') or [])\n",
"        else:\n",
"            texts.extend(line[1][0] for line in page)\n",
"    res = ' '.join(texts)\n",
"\n",
"    # Show the processed image with the OCR text as title\n",
"    plt.figure(figsize=(10, 12))\n",
"    plt.imshow(img_proc)\n",
"    plt.axis('off')\n",
"    plt.title(res, fontsize=10)\n",
"    plt.show()\n",
"    return res\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
"# Number of leading pages of each PDF to benchmark (only page 1 was used so far).\n",
"PAGES_PER_PDF = 1\n",
"\n",
"# (label stored in the 'Model' column, OCR callable), in evaluation order.\n",
"OCR_MODELS = [\n",
"    ('EasyOCR', ocr_easyocr),\n",
"    ('TrOCR', ocr_trocr),\n",
"    ('PaddleOCR', ocr_paddle),\n",
"]\n",
"\n",
"# Re-created on every run so that re-executing the cell does not duplicate rows.\n",
"results = []\n",
"\n",
"# sorted(): os.listdir returns entries in arbitrary order, which would make the\n",
"# row order of the results differ between runs and machines.\n",
"for pdf_file in sorted(os.listdir(PDF_FOLDER)):\n",
"    if not pdf_file.lower().endswith('.pdf'):\n",
"        continue\n",
"    pdf_path = os.path.join(PDF_FOLDER, pdf_file)\n",
"    images = pdf_to_images(pdf_path)\n",
"\n",
"    for page_num, img in enumerate(images, start=1):\n",
"        if page_num > PAGES_PER_PDF:\n",
"            break\n",
"        # Reference text for this page, against which every model is scored.\n",
"        ref = pdf_extract_text(pdf_path, page_num=page_num)\n",
"\n",
"        for model_name, run_ocr in OCR_MODELS:\n",
"            prediction = run_ocr(img)\n",
"            results.append({\n",
"                'PDF': pdf_file,\n",
"                'Page': page_num,\n",
"                'Model': model_name,\n",
"                'Prediction': prediction,\n",
"                **evaluate_text(ref, prediction),\n",
"            })"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "0db6dc74",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 4 Save and Analyze Results"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 6,
|
||
|
|
"id": "da3155e3",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"Benchmark results saved!\n",
|
||
|
|
" WER CER\n",
|
||
|
|
"Model \n",
|
||
|
|
"EasyOCR 0.000000 0.000000\n",
|
||
|
|
"PaddleOCR 1.153846 0.782178\n",
|
||
|
|
"TrOCR 1.000000 1.000000\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAArMAAAIVCAYAAADConfoAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAARsVJREFUeJzt3Qd8VFX6//EnlISiEJQmCIQqoDRhQZoSRSMgFkQQlC6CgrSfdKUoAi6CqERwQWBxlaIi6qKwgKBSBGkuKL0vGopIgAAJJPN/Pcf/jJlkEpKQzJ2TfN6v15XcO3cmZyYZ851zn3NOkMvlcgkAAABgoVxONwAAAADIKMIsAAAArEWYBQAAgLUIswAAALAWYRYAAADWIswCAADAWoRZAAAAWIswCwAAAGsRZgEAAGAtwiwAZKK5c+dKUFCQbN68WWzQtWtXueGGG67rMZ5//nm5//77M61NuD533XWXDBkyxOlmAH5DmAUs9u6775rg1KBBgxTP0dv79u2bpse7cuWKvP322/K3v/1NbrzxRhNy9Gs9prf5Eh8fL3PmzJFmzZrJTTfdJCEhIRIWFibdunXzCnTukOfe8uTJI6VLlzZh6vjx42lq35gxY7weI1euXHLLLbfIQw89JD/88EOaHgOZ69ChQzJr1iwZMWKE2T958qT52fTv3z/ZuXpMbxs9enSy2zp37ix58+aVixcvmn39vUj8s0685cuXz3O/NWvWeN2WO3duKV68uLRt21Z27dqVrueydOlS8/tesGBB83v1+OOPyy+//JLu1+TEiRPy4osvStWqVaVAgQLm8erWrSvjxo2Ts2fPes7T90xKz1Hvm9H3ztChQyUyMlKioqLS3XbARnmcbgCAjPvwww9NcNy0aZPs379fKlWqlOHHiomJkVatWsm3335rwqH+odSwuGzZMhNCFi9ebP7Y6x9mt0uXLkmbNm3MOXfffbcJNBpoDx8+LIsWLZJ//vOfcvToUbn11ls993nllVekfPnycvnyZRNA9Q/12rVrZefOnV4hJTXTp083QTshIUGOHTsmM2fONN9fX4fatWtn+DVA+r311lvm5xkeHm72NUhWrlzZ/EyTWrdunQli+q+v2+rUqWPCn5t+MNKgnJQG1qT69etngqh+6Prvf/8rM2bMMEFXf69Klix5zefx448/yiOPPCK33367/P3vf5dz587Jv//9b3O8evXqaXot3I/TsmVLuXDhgjz99NMmxCr9YDdx4kT57rvv5D//+Y/nfH1vTJgwIdnjFC5cONmxtL539HkUKlTIfNjV+wDZnguAlQ4ePOjSt/DixYtdxYoVc40ZM8bneXpOnz59rvl4zz77rDn3nXfeSXbbtGnTzG29e/f2Oq6Pq8fffPPNZPe5evWqa9KkSa5jx46Z/Tlz5phzf/zxR6/zhg4dao4vXLjwmm0cPXq0OffUqVNex3fu3GmOjxgxwuW0lJ5noLlw4YL5t0uXLq6CBQtm6DHi4uJcRYsWdb300ktex7t16+bKnTu36/z5817fL0+ePK6OHTu6brjhBvP74fbrr7+a12zgwIGeY2lt1+rVq819P/74Y6/j06dPN8dff/31ND2XIUOGuIKCglxRUVFexy9fvuxKqz/++MNVunRpV4kSJVy7du1Kdrs+9quvvurZv+eee1y33377NR83I++dvn37usqVK+dKSEhIc/sBW1FmAFjcK1ukSBHTm6qXVHU/o/73v//J+++/L/fee6/PkoQ+ffqYnjftJdNz3fd57733TK3kgAEDfPae6aXWxL2yvjRt2tT8e+DAgQy3393zpr1+icXGxppL2tpjrb18ZcqUMbWEetxXKcaSJUvkjjvuMOdqD532OCell3V79OghpUqVMudpT9lzzz0ncXFxyb73oEGDpFixYqY3+7HHHpNTp055naO96toLrj2I9erVk/z580uNGjXMvtLecN3XXjft4du2bZvX/bUHUnvQK1SoYM7R16
F79+7y+++/+yzP0EvmHTt2NL83TZo0SfH13L59u2m3XgbXHsaUaK/g6dOnpXnz5l7H9bG1/CRx6cfGjRvl6tWr5ndCH1O/h5u7pza1NqVXen+v9CqEL/ozTit9P+jvx5QpU7zKBNxKlCghL730kvjjOer78siRI16vM5BdEWYBS2l41Uv8wcHB0qFDB9m3b5+5xJkRX3/9tQkfWreYEr1Nw4g74Ol9dL9Tp05yPbQkQWnASqszZ86YEKX1mRrwevbsacJcu3btPOdoCcLDDz8sb7zxhrRu3VreeecdefTRR+XNN9+U9u3b+wxmOpDpySefNJeZ9VKu1kwmDoa//vqr1K9fXxYsWGAeQ2uJ9flraYa71tPthRdekJ9++smEaQ27X375pc8PCloeogFT26iXm//44w/ztf58Bw4caC5Vjx071gQWfX76vNxWrFghBw8eNPXJ+vy07do2vcz9Z6e8tyeeeMK0c/z48eY180V/h/RDjV7y159xaoPD1q9fb0KynpuYO5QmLjXQwFqlShVzrn7ASVxqkFqY1Z9z0k1LADL790p/jvoBTF9zX69dWnzxxRfmA4l+uEwrfd/5eo5a9nM9z9Fd3uCrpAPIdpzuGgaQfps3bzaXF1esWGH29VLirbfe6urfv3+GygwGDBhgztu2bVuK52zdutWcM2jQILOvl4SvdR9fl0pXrlxpygS0/OCTTz4xJRIhISGecoS0lBkk3UJDQ13Lli3zOveDDz5w5cqVy/X99997HZ8xY4a5z7p16zzHdD84ONi1f/9+z7GffvopWdlF586dzWP6KiFwX851P8/mzZt7XeLV10svvZ89e9ZzTC8D67nr16/3HFu+fLk5lj9/fteRI0c8x9977z1zXC+ru128eDFZO+bPn2/O++6775K9bh06dEh2fuLL+WvXrnUVKlTI1apVqzRdXn/66addN998s8/bihcv7rrvvvs8+xEREab8QLVr1871xBNPeG6rV6+eq3Llysna5etnrZs+VtIyg9mzZ5vfKy1Z0N+FSpUqmbKBTZs2udJiyZIlrgIFCpifkft3PL2KFCniqlWrVprP1zKDlJ5jr169rvu9o7/Tzz33XIaeC2ATBoABFtJeO71k6R50o71j2lP4r3/9SyZPnuxzgExqzp8/b/7VGQxS4r7N3Svm/je1+/iS9JK0XmrXdl+rHCGxTz/91Axw0Ryql3V1QJj2ourAmkaNGplzPv74Y6lWrZq53Ks9XW7a66hWr17tOdfdrooVK3r2a9asab6H9nwq7RHVMgTtNdWSgKT0Z5DYs88+63VMLwlrr7Be+tXHdtPBRQ0bNvTsu2em0HaWLVs22XFtj17+V9oL6KY9yXr5XqdlUlu3bvVchnbr3bt3iq+pvh763B544AHTu6s9/teivdYp9Xw2btzY9Bxrz6O+DlpyoL3k7ttef/1187X2FOulcF89/Nrbrj3aSRUtWjTZMS2vSEzLJD744AMzKOxadHCW9npreYA+b/3Z6e+1lme4RUREmLZ+//33KT6OvifS+37Q338dwJiUr/dDet87+rNJ/LsPZFeEWcAyGg40bGiQ1WmREocdDbKrVq0ygSQ93H+A3aE2LYFXg9617uOLThmkl5ujo6Nl9uzZZnR3euoSlc5ckDjQ6GVdHUGvl/a3bNlijmnZhU7NpKHGFy1RSCxxcEwcBvSyv9J6Vw0rWlObFkkfzx363I+X0nnuUexa3+vreOL7a7mFliDo70PS56Ovb1Ja3+uLBmGtvdZL0zoLRdLa49SkdEleSwY+++wzE1R1yi1tj4ZYpR8itGRDL5Pr77CWq/gqMdAPZUkDXEpGjRplwrsGev2++pqkVAeblNax6u+P1oa7p9Z6+eWXzWuuZQfq559/NmUcqdH3RHrfD1pPndbnmN73jv5skn7IArIjwixgmW+++UZ+++0388daN1+9tukNs9qD6R5QlNLUVnqbck9T5B7gsmPHjnRNh6U1p+6eTa1h1R
CjNaN79uzJ8OT9ej8N859//rmpNdSAoD2pOnhKe9t8SRoWU+rNzmj9ZFofL6Xz0nJ/7U3UutXBgwebn4F7urIHH3z
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 800x500 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
"df_results = pd.DataFrame(results)\n",
"\n",
"if df_results.empty:\n",
"    # Nothing was benchmarked: an empty frame has no 'Model'/'WER'/'CER' columns,\n",
"    # so the summary below would fail with a KeyError. Report it instead.\n",
"    print('No benchmark results to save - check that PDF_FOLDER contains PDF files.')\n",
"else:\n",
"    # to_csv does not create missing directories.\n",
"    os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n",
"    df_results.to_csv(os.path.join(OUTPUT_FOLDER, 'ai_ocr_benchmark_results.csv'), index=False)\n",
"    print('Benchmark results saved!')\n",
"\n",
"    # Summary by model: mean error rates over all benchmarked pages\n",
"    summary = df_results.groupby('Model')[['WER', 'CER']].mean()\n",
"    print(summary)\n",
"\n",
"    # Plot\n",
"    ax = summary.plot(kind='bar', figsize=(8, 5), title='AI OCR Benchmark (WER & CER)')\n",
"    ax.set_ylabel('Error Rate')\n",
"    plt.show()"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"metadata": {
|
||
|
|
"kernelspec": {
|
||
|
|
"display_name": "Python 3",
|
||
|
|
"name": "python3"
|
||
|
|
},
|
||
|
|
"language_info": {
|
||
|
|
"name": "python",
|
||
|
|
"version": "3.10"
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"nbformat": 4,
|
||
|
|
"nbformat_minor": 5
|
||
|
|
}
|