{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "93809ffc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pip in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (25.3)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: jupyter in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (1.1.1)\n", "Requirement already satisfied: notebook in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (7.5.0)\n", "Requirement already satisfied: jupyter-console in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (6.6.3)\n", "Requirement already satisfied: nbconvert in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (7.16.6)\n", "Requirement already satisfied: ipykernel in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (7.1.0)\n", "Requirement already satisfied: ipywidgets in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (8.1.8)\n", "Requirement already satisfied: jupyterlab in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter) (4.5.0)\n", "Requirement already satisfied: comm>=0.1.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (0.2.3)\n", "Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (1.8.17)\n", "Requirement already satisfied: ipython>=7.23.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (9.8.0)\n", "Requirement already satisfied: jupyter-client>=8.0.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (8.6.3)\n", "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (5.9.1)\n", "Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (0.2.1)\n", "Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (1.6.0)\n", "Requirement already satisfied: packaging>=22 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (25.0)\n", "Requirement already satisfied: psutil>=5.7 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (7.1.3)\n", "Requirement already satisfied: pyzmq>=25 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (27.1.0)\n", "Requirement already satisfied: tornado>=6.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (6.5.2)\n", "Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel->jupyter) (5.14.3)\n", "Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.4.6)\n", "Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (5.2.1)\n", "Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (1.1.1)\n", "Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.19.2)\n", "Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (3.0.52)\n", "Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (2.19.2)\n", "Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.6.3)\n", "Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (4.15.0)\n", "Requirement already satisfied: wcwidth in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel->jupyter) (0.2.14)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jedi>=0.18.1->ipython>=7.23.1->ipykernel->jupyter) (0.8.5)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-client>=8.0.0->ipykernel->jupyter) (2.9.0.post0)\n", "Requirement already satisfied: platformdirs>=2.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel->jupyter) (4.5.1)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->jupyter-client>=8.0.0->ipykernel->jupyter) (1.17.0)\n", "Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (2.2.1)\n", "Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (3.0.1)\n", "Requirement already satisfied: pure-eval in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (0.2.3)\n", "Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets->jupyter) (4.0.15)\n", "Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets->jupyter) (3.0.16)\n", "Requirement already satisfied: async-lru>=1.0.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (2.0.5)\n", "Requirement already satisfied: httpx<1,>=0.25.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (0.28.1)\n", "Requirement already satisfied: jinja2>=3.0.3 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (3.1.6)\n", "Requirement already satisfied: jupyter-lsp>=2.0.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (2.3.0)\n", "Requirement already satisfied: jupyter-server<3,>=2.4.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (2.17.0)\n", "Requirement already satisfied: jupyterlab-server<3,>=2.28.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (2.28.0)\n", "Requirement already satisfied: notebook-shim>=0.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (0.2.4)\n", "Requirement already satisfied: setuptools>=41.1.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab->jupyter) (65.5.0)\n", "Requirement already satisfied: anyio in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (4.12.0)\n", "Requirement already satisfied: certifi in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (2025.11.12)\n", "Requirement already satisfied: httpcore==1.* in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (1.0.9)\n", "Requirement already satisfied: idna in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (3.11)\n", "Requirement already satisfied: h11>=0.16 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from httpcore==1.*->httpx<1,>=0.25.0->jupyterlab->jupyter) (0.16.0)\n", "Requirement already satisfied: argon2-cffi>=21.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)\n", "Requirement already satisfied: jupyter-events>=0.11.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.12.0)\n", "Requirement already satisfied: jupyter-server-terminals>=0.4.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.5.3)\n", "Requirement already satisfied: nbformat>=5.3.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (5.10.4)\n", "Requirement already satisfied: overrides>=5.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (7.7.0)\n", "Requirement already satisfied: prometheus-client>=0.9 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.23.1)\n", "Requirement already satisfied: pywinpty>=2.0.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.2)\n", "Requirement already satisfied: send2trash>=1.8.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.8.3)\n", "Requirement already satisfied: terminado>=0.8.3 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.18.1)\n", "Requirement already satisfied: websocket-client>=1.7 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.9.0)\n", "Requirement already satisfied: babel>=2.10 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.17.0)\n", "Requirement already satisfied: json5>=0.9.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.12.1)\n", "Requirement already satisfied: jsonschema>=4.18.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (4.25.1)\n", "Requirement already satisfied: requests>=2.31 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.32.5)\n", "Requirement already satisfied: argon2-cffi-bindings in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jinja2>=3.0.3->jupyterlab->jupyter) (3.0.3)\n", "Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (25.4.0)\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2025.9.1)\n", "Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.37.0)\n", "Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.30.0)\n", "Requirement already satisfied: python-json-logger>=2.0.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (4.0.0)\n", "Requirement already satisfied: pyyaml>=5.3 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (6.0.2)\n", "Requirement already satisfied: rfc3339-validator in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.4)\n", "Requirement already satisfied: rfc3986-validator>=0.1.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.1)\n", "Requirement already satisfied: fqdn in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.5.1)\n", "Requirement already satisfied: isoduration in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (20.11.0)\n", "Requirement already satisfied: jsonpointer>1.13 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.0)\n", "Requirement already satisfied: rfc3987-syntax>=1.1.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.1.0)\n", "Requirement already satisfied: uri-template in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.0)\n", "Requirement already satisfied: webcolors>=24.6.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.10.0)\n", "Requirement already satisfied: beautifulsoup4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (4.14.3)\n", "Requirement already satisfied: bleach!=5.0.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (6.3.0)\n", "Requirement already satisfied: defusedxml in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (0.7.1)\n", "Requirement already satisfied: jupyterlab-pygments in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (0.3.0)\n", "Requirement already satisfied: mistune<4,>=2.0.3 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (3.1.4)\n", "Requirement already satisfied: nbclient>=0.5.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (0.10.2)\n", "Requirement already satisfied: pandocfilters>=1.4.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbconvert->jupyter) (1.5.1)\n", "Requirement already satisfied: webencodings in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert->jupyter) (0.5.1)\n", "Requirement already satisfied: tinycss2<1.5,>=1.1.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (1.4.0)\n", "Requirement already satisfied: fastjsonschema>=2.15 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.21.2)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (3.4.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.6.0)\n", "Requirement already satisfied: lark>=1.2.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from rfc3987-syntax>=1.1.0->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.1)\n", "Requirement already satisfied: cffi>=1.0.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.0.0)\n", "Requirement already satisfied: pycparser in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.23)\n", "Requirement already satisfied: soupsieve>=1.6.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from beautifulsoup4->nbconvert->jupyter) (2.8)\n", "Requirement already satisfied: arrow>=0.15.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.4.0)\n", "Requirement already satisfied: tzdata in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2025.2)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: ipywidgets in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (8.1.8)\n", "Requirement already satisfied: comm>=0.1.3 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (0.2.3)\n", "Requirement already satisfied: ipython>=6.1.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (9.8.0)\n", "Requirement already satisfied: traitlets>=4.3.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (5.14.3)\n", "Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (4.0.15)\n", "Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipywidgets) (3.0.16)\n", "Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)\n", "Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)\n", "Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (1.1.1)\n", "Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)\n", "Requirement already satisfied: matplotlib-inline>=0.1.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.2.1)\n", "Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.52)\n", "Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (2.19.2)\n", "Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n", "Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (4.15.0)\n", "Requirement already satisfied: wcwidth in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.14)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jedi>=0.18.1->ipython>=6.1.0->ipywidgets) (0.8.5)\n", "Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (2.2.1)\n", "Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (3.0.1)\n", "Requirement already satisfied: pure-eval in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (0.2.3)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: ipykernel in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (7.1.0)\n", "Requirement already satisfied: comm>=0.1.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (0.2.3)\n", "Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (1.8.17)\n", "Requirement already satisfied: ipython>=7.23.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (9.8.0)\n", "Requirement already satisfied: jupyter-client>=8.0.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (8.6.3)\n", "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (5.9.1)\n", "Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (0.2.1)\n", "Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (1.6.0)\n", "Requirement already satisfied: packaging>=22 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (25.0)\n", "Requirement already satisfied: psutil>=5.7 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (7.1.3)\n", "Requirement already satisfied: pyzmq>=25 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (27.1.0)\n", "Requirement already satisfied: tornado>=6.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (6.5.2)\n", "Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipykernel) (5.14.3)\n", "Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (0.4.6)\n", "Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (5.2.1)\n", "Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (1.1.1)\n", "Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (0.19.2)\n", "Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (3.0.52)\n", "Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (2.19.2)\n", "Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (0.6.3)\n", "Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel) (4.15.0)\n", "Requirement already satisfied: wcwidth in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel) (0.2.14)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jedi>=0.18.1->ipython>=7.23.1->ipykernel) (0.8.5)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-client>=8.0.0->ipykernel) (2.9.0.post0)\n", "Requirement already satisfied: platformdirs>=2.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel) (4.5.1)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->jupyter-client>=8.0.0->ipykernel) (1.17.0)\n", "Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (2.2.1)\n", "Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (3.0.1)\n", "Requirement already satisfied: pure-eval in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (0.2.3)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "%pip install --upgrade pip\n", "%pip install --upgrade jupyter\n", "%pip install --upgrade ipywidgets\n", "%pip install --upgrade ipykernel" ] }, { "cell_type": "code", "execution_count": 2, "id": "48724594", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting pdf2image\n", " Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n", "Requirement already satisfied: pillow in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (12.0.0)\n", "Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)\n", "Installing collected packages: pdf2image\n", "Successfully installed pdf2image-1.17.0\n", "Note: you may need to restart the kernel to use updated packages.\n", "Collecting PyMuPDF\n", " Using cached pymupdf-1.26.6-cp310-abi3-win_amd64.whl.metadata (3.4 kB)\n", "Using cached pymupdf-1.26.6-cp310-abi3-win_amd64.whl (18.4 MB)\n", "Installing collected packages: PyMuPDF\n", "Successfully installed PyMuPDF-1.26.6\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: pandas in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (2.3.3)\n", "Requirement already satisfied: numpy>=1.23.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas) (2.3.5)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas) (2025.2)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: matplotlib in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (3.10.7)\n", "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (1.3.3)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (4.61.0)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (1.4.9)\n", "Requirement already satisfied: numpy>=1.23 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (2.3.5)\n", "Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (25.0)\n", "Requirement already satisfied: pillow>=8 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (12.0.0)\n", "Requirement already satisfied: pyparsing>=3 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (3.2.5)\n", "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib) (2.9.0.post0)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n", "Note: you may need to restart the kernel to use updated packages.\n", "Requirement already satisfied: seaborn in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (0.13.2)\n", "Requirement already satisfied: numpy!=1.24.0,>=1.20 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from seaborn) (2.3.5)\n", "Requirement already satisfied: pandas>=1.2 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from seaborn) (2.3.3)\n", "Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from seaborn) (3.10.7)\n", "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.3)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.61.0)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.9)\n", "Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0)\n", "Requirement already satisfied: pillow>=8 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (12.0.0)\n", "Requirement already satisfied: pyparsing>=3 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.5)\n", "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\desktop\\mastersthesis\\.venv\\lib\\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.17.0)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# Install necessary packages\n", "%pip install pdf2image pillow \n", "# pdf reading\n", "%pip install PyMuPDF\n", "\n", "# Data analysis and visualization\n", "%pip install pandas\n", "%pip install matplotlib\n", "%pip install seaborn" ] }, { "cell_type": "code", "execution_count": 3, "id": "e1f793b6", "metadata": {}, "outputs": [], "source": [ "import os, json\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from pdf2image import convert_from_path\n", "from PIL import Image, ImageOps\n", "import fitz # PyMuPDF\n", "import re\n", "from datetime import datetime\n", "from typing import List\n", "import shutil" ] }, { "cell_type": "code", "execution_count": 4, "id": "1652a78e", "metadata": {}, "outputs": [], "source": [ "def pdf_to_images(pdf_path: str, output_dir: str, dpi: int = 300):\n", " \"\"\"\n", " Render a PDF into a list of PIL Images using PyMuPDF or pdf2image.\n", " 'pages' is 1-based (e.g., range(1, 10) -> pages 1–9).\n", " \"\"\"\n", " if fitz is not None:\n", " doc = fitz.open(pdf_path)\n", " total_pages = len(doc)\n", "\n", " # Adjust page indices (PyMuPDF uses 0-based indexing)\n", " page_indices = list(range(total_pages))\n", "\n", " for i in page_indices:\n", " page = doc.load_page(i)\n", " mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)\n", " pix = page.get_pixmap(matrix=mat, alpha=False)\n", " img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n", " # Build filename\n", " out_path = os.path.join(\n", " output_dir,\n", " f\"page_{i + 1:04d}.png\"\n", " )\n", "\n", " img.save(out_path, \"PNG\")\n", " doc.close()\n", " else:\n", " raise RuntimeError(\"Install PyMuPDF or pdf2image to convert PDFs.\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "f523dd58", "metadata": {}, "outputs": [], "source": [ "import fitz\n", "import re\n", "import os\n", "\n", "def _pdf_extract_text_structured(page, margin_threshold=50):\n", " \"\"\"\n", " Extract text using PyMuPDF's dict mode which preserves\n", " the actual line structure from the PDF.\n", " \"\"\"\n", " data = page.get_text(\"dict\")\n", " \n", " # Collect all lines with their Y position\n", " all_lines = []\n", " margin_text_parts = [] # Collect vertical/margin text\n", " margin_y_positions = []\n", " \n", " for block in data.get(\"blocks\", []):\n", " if block.get(\"type\") != 0: # Skip non-text blocks\n", " continue\n", " \n", " block_bbox = block.get(\"bbox\", (0, 0, 0, 0))\n", " block_width = block_bbox[2] - block_bbox[0]\n", " block_height = block_bbox[3] - block_bbox[1]\n", " \n", " # Detect vertical/margin text\n", " is_margin_text = (block_bbox[0] < margin_threshold or \n", " block_height > block_width * 2)\n", " \n", " for line in block.get(\"lines\", []):\n", " direction = line.get(\"dir\", (1, 0))\n", " bbox = line.get(\"bbox\", (0, 0, 0, 0))\n", " y_center = (bbox[1] + bbox[3]) / 2\n", " x_start = bbox[0]\n", " \n", " # Collect text from all spans\n", " line_text = \"\"\n", " for span in line.get(\"spans\", []):\n", " text = span.get(\"text\", \"\")\n", " line_text += text\n", " \n", " line_text = line_text.strip()\n", " line_text = re.sub(r\"[•▪◦●❖▶■\\uf000-\\uf0ff]\", \"\", line_text)\n", " \n", " if not line_text:\n", " continue\n", " \n", " # Check if this is margin/vertical text\n", " if is_margin_text or abs(direction[0]) < 0.9:\n", " margin_text_parts.append((y_center, line_text))\n", " margin_y_positions.append(y_center)\n", " else:\n", " all_lines.append((y_center, x_start, line_text))\n", " \n", " # Reconstruct margin text as single line at its vertical center\n", " if margin_text_parts:\n", " # Sort by Y position (top to bottom) and join\n", " margin_text_parts.sort(key=lambda x: x[0])\n", " full_margin_text = \" \".join(part[1] for part in margin_text_parts)\n", " # Calculate vertical center of the watermark\n", " avg_y = sum(margin_y_positions) / len(margin_y_positions)\n", " # Add as a single line\n", " all_lines.append((avg_y, -1, full_margin_text)) # x=-1 to sort first\n", " \n", " if not all_lines:\n", " return \"\"\n", " \n", " # Sort by Y first, then by X\n", " all_lines.sort(key=lambda x: (x[0], x[1]))\n", " \n", " # Group lines at same vertical position\n", " merged_rows = []\n", " current_row = [all_lines[0]]\n", " current_y = all_lines[0][0]\n", " \n", " for y_center, x_start, text in all_lines[1:]:\n", " if abs(y_center - current_y) <= 2:\n", " current_row.append((y_center, x_start, text))\n", " else:\n", " current_row.sort(key=lambda x: x[1])\n", " row_text = \" \".join(item[2] for item in current_row)\n", " merged_rows.append((current_y, row_text))\n", " current_row = [(y_center, x_start, text)]\n", " current_y = y_center\n", " \n", " if current_row:\n", " current_row.sort(key=lambda x: x[1])\n", " row_text = \" \".join(item[2] for item in current_row)\n", " merged_rows.append((current_y, row_text))\n", " \n", " # Sort rows by Y and extract text\n", " merged_rows.sort(key=lambda x: x[0])\n", " lines = [row[1] for row in merged_rows]\n", " \n", " # Join and clean up\n", " text = \"\\n\".join(lines)\n", " text = re.sub(r\" +\", \" \", text).strip()\n", " text = re.sub(r\"\\n{3,}\", \"\\n\\n\", text).strip()\n", " \n", " return text\n", "\n", "def pdf_extract_text(pdf_path, output_dir, margin_threshold=50):\n", " os.makedirs(output_dir, exist_ok=True)\n", " doc = fitz.open(pdf_path)\n", " \n", " for i, page in enumerate(doc):\n", " text = _pdf_extract_text_structured(page, margin_threshold)\n", " if not text.strip():\n", " continue\n", " out_path = os.path.join(output_dir, f\"page_{i + 1:04d}.txt\")\n", " with open(out_path, \"w\", encoding=\"utf-8\") as f:\n", " f.write(text)" ] }, { "cell_type": "code", "execution_count": 8, "id": "9f64a8c0", "metadata": {}, "outputs": [], "source": [ "PDF_FOLDER = './../instructions' # Folder containing PDF files\n", "OUTPUT_FOLDER = './dataset'\n", "\n", "os.makedirs(OUTPUT_FOLDER, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 9, "id": "41e4651d", "metadata": {}, "outputs": [], "source": [ "i = 0\n", "\n", "pdf_files = sorted([\n", " fname for fname in os.listdir(PDF_FOLDER)\n", " if fname.lower().endswith(\".pdf\")\n", "])\n", "\n", "\n", "for fname in pdf_files:\n", " # build output directories\n", " out_img_path = os.path.join(OUTPUT_FOLDER, str(i), \"img\")\n", " out_txt_path = os.path.join(OUTPUT_FOLDER, str(i), \"txt\")\n", "\n", " os.makedirs(out_img_path, exist_ok=True)\n", " os.makedirs(out_txt_path, exist_ok=True)\n", "\n", " # source and destination PDF paths\n", " src_pdf = os.path.join(PDF_FOLDER, fname)\n", " pdf_path = os.path.join(OUTPUT_FOLDER, str(i), fname)\n", "\n", " # copy PDF into numbered folder\n", " shutil.copy(src_pdf, pdf_path)\n", "\n", " # convert PDF → images\n", " pdf_to_images(\n", " pdf_path=pdf_path,\n", " output_dir=out_img_path,\n", " dpi=300\n", " )\n", " pdf_extract_text(\n", " pdf_path=pdf_path,\n", " output_dir=out_txt_path,\n", " margin_threshold=40\n", " )\n", "\n", " i += 1" ] } ], "metadata": { "kernelspec": { "display_name": ".venv (3.11.9)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }