Files
MastersThesis/prepare_dataset.ipynb

497 lines
44 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "93809ffc",
"metadata": {},
"outputs": [],
"source": [
"# Environment bootstrap for this notebook.\n",
"# pip is upgraded on its own first; the Jupyter packages are then installed in\n",
"# a single invocation so pip resolves their dependencies together instead of\n",
"# once per package.\n",
"# --quiet keeps pip's per-dependency log out of the saved notebook output.\n",
"# NOTE(review): versions are unpinned, so results depend on the install date;\n",
"# pin them (pkg==x.y.z) if this notebook must be reproducible.\n",
"# pip itself advises restarting the kernel before using upgraded packages.\n",
"%pip install --quiet --upgrade pip\n",
"%pip install --quiet --upgrade jupyter ipywidgets ipykernel"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "48724594",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pdf2image in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (1.17.0)\n",
"Requirement already satisfied: pillow in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (12.0.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: PyMuPDF in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (1.26.6)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: pandas in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (2.3.3)\n",
"Requirement already satisfied: numpy>=1.23.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2.3.5)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: matplotlib in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (3.10.7)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (1.3.3)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (4.61.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (1.4.9)\n",
"Requirement already satisfied: numpy>=1.23 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (2.3.5)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib) (25.0)\n",
"Requirement already satisfied: pillow>=8 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (12.0.0)\n",
"Requirement already satisfied: pyparsing>=3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (3.2.5)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib) (2.9.0.post0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: seaborn in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (0.13.2)\n",
"Requirement already satisfied: numpy!=1.24.0,>=1.20 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (2.3.5)\n",
"Requirement already satisfied: pandas>=1.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (2.3.3)\n",
"Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (3.10.7)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.3)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.61.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.9)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0)\n",
"Requirement already satisfied: pillow>=8 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (12.0.0)\n",
"Requirement already satisfied: pyparsing>=3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.5)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Install the notebook's dependencies with a single quiet %pip call so the\n",
"# installer log does not flood the cell output. %pip (not !pip) installs into\n",
"# the environment of the running kernel.\n",
"#   pdf2image, pillow           -> image handling\n",
"#   PyMuPDF                     -> PDF reading (imported as `fitz`)\n",
"#   pandas, matplotlib, seaborn -> data analysis and visualization\n",
"%pip install -q pdf2image pillow PyMuPDF pandas matplotlib seaborn"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "e1f793b6",
"metadata": {},
"outputs": [],
"source": [
"import os, json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from pdf2image import convert_from_path\n",
"from PIL import Image, ImageOps\n",
"import fitz # PyMuPDF\n",
"import re\n",
"from datetime import datetime\n",
"from typing import List\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "1652a78e",
"metadata": {},
"outputs": [],
"source": [
"def pdf_to_images(pdf_path: str, output_dir: str, dpi: int = 300):\n",
"    \"\"\"\n",
"    Render every page of a PDF to a PNG file using PyMuPDF.\n",
"\n",
"    Pages are written to ``output_dir`` as ``page_0001.png``,\n",
"    ``page_0002.png``, ... (1-based page number, zero-padded to 4 digits).\n",
"\n",
"    Args:\n",
"        pdf_path: Path of the PDF to render.\n",
"        output_dir: Directory that receives the PNG files; created if missing.\n",
"        dpi: Render resolution. Each page is scaled by ``dpi / 72``.\n",
"\n",
"    Returns:\n",
"        List of the PNG paths written, in page order.\n",
"\n",
"    Raises:\n",
"        RuntimeError: If the module-level ``fitz`` name is None.\n",
"    \"\"\"\n",
"    if fitz is None:\n",
"        raise RuntimeError(\"Install PyMuPDF or pdf2image to convert PDFs.\")\n",
"\n",
"    os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"    # The zoom matrix is identical for every page, so build it once.\n",
"    zoom = dpi / 72.0\n",
"    mat = fitz.Matrix(zoom, zoom)\n",
"\n",
"    written = []\n",
"    doc = fitz.open(pdf_path)\n",
"    try:\n",
"        # PyMuPDF page indices are 0-based; file names use 1-based numbers.\n",
"        for i in range(len(doc)):\n",
"            page = doc.load_page(i)\n",
"            pix = page.get_pixmap(matrix=mat, alpha=False)\n",
"            img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n",
"            out_path = os.path.join(output_dir, f\"page_{i + 1:04d}.png\")\n",
"            img.save(out_path, \"PNG\")\n",
"            written.append(out_path)\n",
"    finally:\n",
"        # Close the document even if rendering fails part-way; an unclosed\n",
"        # handle can keep the PDF file locked (notably on Windows).\n",
"        doc.close()\n",
"    return written"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f523dd58",
"metadata": {},
"outputs": [],
"source": [
"import fitz\n",
"import re\n",
"import os\n",
"\n",
"def _pdf_extract_text_structured(page, margin_threshold=50, row_tolerance=2):\n",
"    \"\"\"\n",
"    Extract a page's text using PyMuPDF's \"dict\" mode, which exposes the\n",
"    block / line / span structure of the page.\n",
"\n",
"    Lines whose vertical centers lie within ``row_tolerance`` of the first\n",
"    line of a row are merged into that row (ordered left to right). Text that\n",
"    looks like margin or vertical text is collected separately and emitted as\n",
"    one line placed at the average vertical position of its parts.\n",
"\n",
"    Args:\n",
"        page: Object whose ``get_text(\"dict\")`` returns a dict with a\n",
"            ``\"blocks\"`` list (a PyMuPDF page in this notebook).\n",
"        margin_threshold: Blocks whose left edge is below this x value are\n",
"            treated as margin text.\n",
"        row_tolerance: Maximum difference between y-centers for two lines to\n",
"            be merged into the same output row.\n",
"\n",
"    Returns:\n",
"        The page text with one row per line, or \"\" if nothing was found.\n",
"    \"\"\"\n",
"    data = page.get_text(\"dict\")\n",
"\n",
"    body_lines = []    # (y_center, x_start, text)\n",
"    margin_parts = []  # (y_center, text) for vertical / margin text\n",
"\n",
"    for block in data.get(\"blocks\", []):\n",
"        if block.get(\"type\") != 0:  # Skip non-text blocks\n",
"            continue\n",
"\n",
"        block_bbox = block.get(\"bbox\", (0, 0, 0, 0))\n",
"        block_width = block_bbox[2] - block_bbox[0]\n",
"        block_height = block_bbox[3] - block_bbox[1]\n",
"\n",
"        # Margin text: starts left of the threshold, or is much taller than wide\n",
"        is_margin_block = (block_bbox[0] < margin_threshold or\n",
"                           block_height > block_width * 2)\n",
"\n",
"        for line in block.get(\"lines\", []):\n",
"            direction = line.get(\"dir\", (1, 0))\n",
"            bbox = line.get(\"bbox\", (0, 0, 0, 0))\n",
"            y_center = (bbox[1] + bbox[3]) / 2\n",
"\n",
"            raw_text = \"\".join(span.get(\"text\", \"\") for span in line.get(\"spans\", []))\n",
"            # Drop bullet glyphs first and strip afterwards, so a line such as\n",
"            # \"<bullet> item\" does not keep the space that followed the bullet.\n",
"            line_text = re.sub(r\"[•▪◦●❖▶■]\", \"\", raw_text).strip()\n",
"\n",
"            if not line_text:\n",
"                continue\n",
"\n",
"            # A writing direction that is not (close to) horizontal also counts\n",
"            # as margin/vertical text.\n",
"            if is_margin_block or abs(direction[0]) < 0.9:\n",
"                margin_parts.append((y_center, line_text))\n",
"            else:\n",
"                body_lines.append((y_center, bbox[0], line_text))\n",
"\n",
"    # Rebuild the margin text as a single line at its vertical center\n",
"    if margin_parts:\n",
"        avg_y = sum(part[0] for part in margin_parts) / len(margin_parts)\n",
"        margin_parts.sort(key=lambda part: part[0])  # top to bottom\n",
"        margin_text = \" \".join(part[1] for part in margin_parts)\n",
"        body_lines.append((avg_y, -1, margin_text))  # x=-1 sorts it first in its row\n",
"\n",
"    if not body_lines:\n",
"        return \"\"\n",
"\n",
"    # Sort by Y first, then by X\n",
"    body_lines.sort(key=lambda item: (item[0], item[1]))\n",
"\n",
"    # Group lines into rows; a row's reference Y is that of its first line\n",
"    rows = []  # list of (row_y, [(x_start, text), ...])\n",
"    for y_center, x_start, line_text in body_lines:\n",
"        if rows and abs(y_center - rows[-1][0]) <= row_tolerance:\n",
"            rows[-1][1].append((x_start, line_text))\n",
"        else:\n",
"            rows.append((y_center, [(x_start, line_text)]))\n",
"\n",
"    # Rows are already in ascending Y order; order each row left to right\n",
"    lines = [\n",
"        \" \".join(cell[1] for cell in sorted(cells, key=lambda cell: cell[0]))\n",
"        for _, cells in rows\n",
"    ]\n",
"\n",
"    # Join and clean up\n",
"    text = \"\\n\".join(lines)\n",
"    text = re.sub(r\" +\", \" \", text).strip()\n",
"    text = re.sub(r\"\\n{3,}\", \"\\n\\n\", text).strip()\n",
"\n",
"    return text\n",
"\n",
"def pdf_extract_text(pdf_path, output_dir, margin_threshold=50):\n",
"    \"\"\"\n",
"    Write the text of each page of a PDF to its own UTF-8 text file.\n",
"\n",
"    Files are named ``page_0001.txt``, ``page_0002.txt``, ... after the\n",
"    1-based page number. Pages whose extracted text is empty are skipped,\n",
"    so no file is written for them.\n",
"\n",
"    Args:\n",
"        pdf_path: Path of the PDF to read.\n",
"        output_dir: Directory that receives the text files; created if missing.\n",
"        margin_threshold: Passed through to ``_pdf_extract_text_structured``.\n",
"\n",
"    Returns:\n",
"        List of the text-file paths written, in page order.\n",
"    \"\"\"\n",
"    os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"    written = []\n",
"    doc = fitz.open(pdf_path)\n",
"    try:\n",
"        for i, page in enumerate(doc):\n",
"            text = _pdf_extract_text_structured(page, margin_threshold)\n",
"            if not text.strip():\n",
"                continue\n",
"            out_path = os.path.join(output_dir, f\"page_{i + 1:04d}.txt\")\n",
"            with open(out_path, \"w\", encoding=\"utf-8\") as f:\n",
"                f.write(text)\n",
"            written.append(out_path)\n",
"    finally:\n",
"        # The document was previously never closed; release the handle even\n",
"        # when extraction fails part-way.\n",
"        doc.close()\n",
"    return written"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "9f64a8c0",
"metadata": {},
"outputs": [],
"source": [
"# Locations used by the conversion loop\n",
"PDF_FOLDER = './instructions'   # input: folder containing the PDF files\n",
"OUTPUT_FOLDER = './dataset'     # output: root folder of the generated dataset\n",
"\n",
"# Make sure the output root exists before anything is written into it\n",
"os.makedirs(OUTPUT_FOLDER, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "41e4651d",
"metadata": {},
"outputs": [],
"source": [
"# Process the PDFs in sorted name order so the folder numbering is stable\n",
"pdf_files = sorted(\n",
"    fname for fname in os.listdir(PDF_FOLDER)\n",
"    if fname.lower().endswith(\".pdf\")\n",
")\n",
"\n",
"# Each PDF gets its own numbered folder: <OUTPUT_FOLDER>/<index>/{img,txt}\n",
"for doc_index, fname in enumerate(pdf_files):\n",
"    doc_dir = os.path.join(OUTPUT_FOLDER, str(doc_index))\n",
"\n",
"    # build output directories\n",
"    out_img_path = os.path.join(doc_dir, \"img\")\n",
"    out_txt_path = os.path.join(doc_dir, \"txt\")\n",
"    os.makedirs(out_img_path, exist_ok=True)\n",
"    os.makedirs(out_txt_path, exist_ok=True)\n",
"\n",
"    # copy the source PDF into its numbered folder and work on the copy\n",
"    src_pdf = os.path.join(PDF_FOLDER, fname)\n",
"    pdf_path = os.path.join(doc_dir, fname)\n",
"    shutil.copy(src_pdf, pdf_path)\n",
"\n",
"    # convert PDF -> page images\n",
"    pdf_to_images(\n",
"        pdf_path=pdf_path,\n",
"        output_dir=out_img_path,\n",
"        dpi=300\n",
"    )\n",
"    # extract PDF -> per-page text files\n",
"    pdf_extract_text(\n",
"        pdf_path=pdf_path,\n",
"        output_dir=out_txt_path,\n",
"        margin_threshold=40\n",
"    )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}