ray tune process optimization

This commit is contained in:
2025-12-06 21:15:49 +01:00
parent 6d6bebfed9
commit 7503a23b4a
5 changed files with 1810 additions and 577 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,5 @@
~$*.docx ~$*.docx
results/ results/
__pycache__/* __pycache__/*
dataset
results

45
dataset_manager.py Normal file
View File

@@ -0,0 +1,45 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
    """Sequence-style dataset of (image, text) pairs discovered on disk.

    Every direct child of ``root`` that contains both an ``img`` and a ``txt``
    directory is scanned. An image (.png / .jpg / .jpeg, case-insensitive) is
    kept only when ``txt`` holds a file with the same stem and a ``.txt``
    extension. Matched path pairs are stored in ``self.samples``.
    """

    def __init__(self, root):
        """Walk ``root`` and collect the (image path, text path) pairs."""
        image_suffixes = (".png", ".jpg", ".jpeg")
        self.samples = []
        for group in sorted(os.listdir(root)):
            group_dir = os.path.join(root, group)
            images_dir = os.path.join(group_dir, "img")
            texts_dir = os.path.join(group_dir, "txt")
            if os.path.isdir(images_dir) and os.path.isdir(texts_dir):
                # Pair each image with the transcript that shares its stem.
                candidates = (
                    (
                        os.path.join(images_dir, name),
                        os.path.join(texts_dir, os.path.splitext(name)[0] + ".txt"),
                    )
                    for name in sorted(os.listdir(images_dir))
                    if name.lower().endswith(image_suffixes)
                )
                # Images without a transcript are dropped silently.
                self.samples.extend(
                    pair for pair in candidates if os.path.exists(pair[1])
                )

    def __len__(self):
        """Return the number of matched (image, text) pairs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load and return ``(image, text)`` for the sample at ``idx``.

        The image is opened with PIL and converted to RGB; the text file is
        read whole as UTF-8.
        """
        image_file, text_file = self.samples[idx]
        picture = Image.open(image_file).convert("RGB")
        with open(text_file, "r", encoding="utf-8") as handle:
            transcript = handle.read()
        return picture, transcript

File diff suppressed because one or more lines are too long

View File

@@ -1,95 +1,16 @@
# Imports # Imports
import argparse, json, os, sys, time import argparse, json, time, re
from typing import List
import numpy as np import numpy as np
from PIL import Image
import fitz # PyMuPDF
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
import re
from jiwer import wer, cer from jiwer import wer, cer
from dataset_manager import ImageTextDataset
from itertools import islice
def export_config(paddleocr_model): def export_config(paddleocr_model):
yaml_path = "paddleocr_pipeline_dump.yaml" yaml_path = "paddleocr_pipeline_dump.yaml"
paddleocr_model.export_paddlex_config_to_yaml(yaml_path) paddleocr_model.export_paddlex_config_to_yaml(yaml_path)
print("Exported:", yaml_path) print("Exported:", yaml_path)
def pdf_to_images(pdf_path: str, dpi: int = 300, pages: List[int] = None) -> List[Image.Image]:
    """
    Render pages of a PDF into a list of PIL Images using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Requested resolution; each page is rendered with a zoom factor
            of ``dpi / 72.0`` on both axes.
        pages: 1-based page numbers to render (e.g., range(1, 10) -> pages
            1-9). Numbers outside the document are skipped. ``None`` renders
            every page.

    Returns:
        One RGB image per rendered page, in the order requested.

    Raises:
        RuntimeError: If the ``fitz`` module is not available.
    """
    if fitz is None:
        raise RuntimeError("Install PyMuPDF or pdf2image to convert PDFs.")

    images = []
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        # Adjust page indices (PyMuPDF uses 0-based indexing)
        if pages is None:
            page_indices = list(range(total_pages))
        else:
            # Filter out invalid pages and convert to 0-based
            page_indices = [p - 1 for p in pages if 1 <= p <= total_pages]
        # The zoom matrix is the same for every page, so build it once.
        mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        for i in page_indices:
            page = doc.load_page(i)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)
    finally:
        # Release the document even when rendering a page raises.
        doc.close()
    return images
def pdf_extract_text(pdf_path, page_num, line_tolerance=15) -> str:
    """
    Extract the text of one PDF page, ordering blocks top-to-bottom then
    left-to-right.

    Bullet-like characters (•, ▪, etc.) are removed and whitespace is
    normalized.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: 1-based page number.
        line_tolerance: Vertical gap between the tops of consecutive blocks
            above which a blank separator line is inserted before
            normalization.

    Returns:
        The normalized page text, or "" when ``page_num`` is outside the
        document.
    """
    doc = fitz.open(pdf_path)
    try:
        if page_num < 1 or page_num > len(doc):
            return ""
        page = doc[page_num - 1]
        blocks = page.get_text("blocks")  # (x0, y0, x1, y1, text, block_no, block_type)
    finally:
        # Close on every path, including the out-of-range early return.
        doc.close()
    # Sort blocks: top-to-bottom, left-to-right
    blocks_sorted = sorted(blocks, key=lambda b: (b[1], b[0]))
    text_lines = []
    last_y = None
    for b in blocks_sorted:
        y0 = b[1]
        text_block = b[4].strip()
        # Remove bullet-like characters
        text_block = re.sub(r"[•▪◦●❖▶■]", "", text_block)
        # If new line (based on vertical gap)
        if last_y is not None and abs(y0 - last_y) > line_tolerance:
            text_lines.append("")  # blank line for spacing
        text_lines.append(text_block.strip())
        last_y = y0
    # Join all lines with real newlines
    text = "\n".join(text_lines)
    # Normalize spaces.
    # NOTE(review): the first substitution also swallows consecutive newlines
    # (\s matches "\n"), so the blank separator lines added above collapse to
    # a single newline and the {3,} rule below never matches. Behavior is kept
    # as-is; confirm whether blank lines were meant to survive.
    text = re.sub(r"\s*\n\s*", "\n", text).strip()  # remove spaces around newlines
    text = re.sub(r" +", " ", text).strip()  # collapse multiple spaces to one
    text = re.sub(r"\n{3,}", "\n\n", text).strip()  # avoid triple blank lines
    return text
def evaluate_text(reference, prediction): def evaluate_text(reference, prediction):
return {'WER': wer(reference, prediction), 'CER': cer(reference, prediction)} return {'WER': wer(reference, prediction), 'CER': cer(reference, prediction)}
@@ -189,18 +110,15 @@ def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_facto
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--pdf-folder", required=True) parser.add_argument("--pdf-folder", required=True)
parser.add_argument("--dpi", type=int, default=300)
parser.add_argument("--textline-orientation", type=lambda s: s.lower()=="true", default=True) parser.add_argument("--textline-orientation", type=lambda s: s.lower()=="true", default=True)
parser.add_argument("--text-det-box-thresh", type=float, default=0.6) parser.add_argument("--text-det-box-thresh", type=float, default=0.6)
parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5) parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)
parser.add_argument("--text-rec-score-thresh", type=float, default=0.0) parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)
parser.add_argument("--line-tolerance", type=float, default=0.6) parser.add_argument("--line-tolerance", type=float, default=0.6)
parser.add_argument("--min-box-score", type=float, default=0.0) parser.add_argument("--min-box-score", type=float, default=0.0)
parser.add_argument("--pages-per-pdf", type=int, default=2)
parser.add_argument("--lang", default="es") parser.add_argument("--lang", default="es")
args = parser.parse_args() args = parser.parse_args()
@@ -212,31 +130,26 @@ def main():
lang=args.lang, lang=args.lang,
) )
dataset = ImageTextDataset(args.pdf_folder)
cer_list, wer_list = [], [] cer_list, wer_list = [], []
time_per_page_list = [] time_per_page_list = []
t0 = time.time() t0 = time.time()
for fname in os.listdir(args.pdf_folder): for img, ref in islice(dataset, 5, 10):
if not fname.lower().endswith(".pdf"): arr = np.array(img)
continue tp0 = time.time()
pdf_path = os.path.join(args.pdf_folder, fname) out = ocr.predict(
images = pdf_to_images(pdf_path, dpi=args.dpi, pages=range(1, args.pages_per_pdf+1)) arr,
for i, img in enumerate(images): text_det_box_thresh=args.text_det_box_thresh,
ref = pdf_extract_text(pdf_path, i+1) text_det_unclip_ratio=args.text_det_unclip_ratio,
arr = np.array(img) text_rec_score_thresh=args.text_rec_score_thresh,
tp0 = time.time() use_textline_orientation=args.textline_orientation
out = ocr.predict( )
arr, pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
text_det_box_thresh=args.text_det_box_thresh, time_per_page_list.append(float(time.time() - tp0))
text_det_unclip_ratio=args.text_det_unclip_ratio, m = evaluate_text(ref, pred)
text_rec_score_thresh=args.text_rec_score_thresh, cer_list.append(m["CER"])
use_textline_orientation=args.textline_orientation wer_list.append(m["WER"])
)
pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
time_per_page_list.append(float(time.time() - tp0))
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
metrics = { metrics = {
"CER": float(np.mean(cer_list) if cer_list else 1.0), "CER": float(np.mean(cer_list) if cer_list else 1.0),

496
prepare_dataset.ipynb Normal file
View File

@@ -0,0 +1,496 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 91,
"id": "93809ffc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pip in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (25.3)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: jupyter in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (1.1.1)\n",
"Requirement already satisfied: notebook in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (7.5.0)\n",
"Requirement already satisfied: jupyter-console in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (6.6.3)\n",
"Requirement already satisfied: nbconvert in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (7.16.6)\n",
"Requirement already satisfied: ipykernel in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter) (7.1.0)\n",
"Requirement already satisfied: ipywidgets in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (8.1.8)\n",
"Requirement already satisfied: jupyterlab in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter) (4.5.0)\n",
"Requirement already satisfied: comm>=0.1.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (0.2.3)\n",
"Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (1.8.17)\n",
"Requirement already satisfied: ipython>=7.23.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (9.8.0)\n",
"Requirement already satisfied: jupyter-client>=8.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (8.6.3)\n",
"Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (5.9.1)\n",
"Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (0.2.1)\n",
"Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (1.6.0)\n",
"Requirement already satisfied: packaging>=22 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (25.0)\n",
"Requirement already satisfied: psutil>=5.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (7.1.3)\n",
"Requirement already satisfied: pyzmq>=25 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (27.1.0)\n",
"Requirement already satisfied: tornado>=6.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (6.5.2)\n",
"Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel->jupyter) (5.14.3)\n",
"Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.4.6)\n",
"Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (5.2.1)\n",
"Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (1.1.1)\n",
"Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.19.2)\n",
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (3.0.52)\n",
"Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (2.19.2)\n",
"Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.6.3)\n",
"Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel->jupyter) (4.15.0)\n",
"Requirement already satisfied: wcwidth in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel->jupyter) (0.2.14)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jedi>=0.18.1->ipython>=7.23.1->ipykernel->jupyter) (0.8.5)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter-client>=8.0.0->ipykernel->jupyter) (2.9.0.post0)\n",
"Requirement already satisfied: platformdirs>=2.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel->jupyter) (4.5.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->jupyter-client>=8.0.0->ipykernel->jupyter) (1.17.0)\n",
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (2.2.1)\n",
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (3.0.1)\n",
"Requirement already satisfied: pure-eval in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (0.2.3)\n",
"Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ipywidgets->jupyter) (4.0.15)\n",
"Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ipywidgets->jupyter) (3.0.16)\n",
"Requirement already satisfied: async-lru>=1.0.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (2.0.5)\n",
"Requirement already satisfied: httpx<1,>=0.25.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (0.28.1)\n",
"Requirement already satisfied: jinja2>=3.0.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (3.1.6)\n",
"Requirement already satisfied: jupyter-lsp>=2.0.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (2.3.0)\n",
"Requirement already satisfied: jupyter-server<3,>=2.4.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (2.17.0)\n",
"Requirement already satisfied: jupyterlab-server<3,>=2.28.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (2.28.0)\n",
"Requirement already satisfied: notebook-shim>=0.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (0.2.4)\n",
"Requirement already satisfied: setuptools>=41.1.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab->jupyter) (65.5.0)\n",
"Requirement already satisfied: anyio in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (4.12.0)\n",
"Requirement already satisfied: certifi in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (2025.11.12)\n",
"Requirement already satisfied: httpcore==1.* in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (1.0.9)\n",
"Requirement already satisfied: idna in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (3.11)\n",
"Requirement already satisfied: h11>=0.16 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from httpcore==1.*->httpx<1,>=0.25.0->jupyterlab->jupyter) (0.16.0)\n",
"Requirement already satisfied: argon2-cffi>=21.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)\n",
"Requirement already satisfied: jupyter-events>=0.11.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.12.0)\n",
"Requirement already satisfied: jupyter-server-terminals>=0.4.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.5.3)\n",
"Requirement already satisfied: nbformat>=5.3.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (5.10.4)\n",
"Requirement already satisfied: overrides>=5.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (7.7.0)\n",
"Requirement already satisfied: prometheus-client>=0.9 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.23.1)\n",
"Requirement already satisfied: pywinpty>=2.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.2)\n",
"Requirement already satisfied: send2trash>=1.8.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.8.3)\n",
"Requirement already satisfied: terminado>=0.8.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.18.1)\n",
"Requirement already satisfied: websocket-client>=1.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.9.0)\n",
"Requirement already satisfied: babel>=2.10 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.17.0)\n",
"Requirement already satisfied: json5>=0.9.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.12.1)\n",
"Requirement already satisfied: jsonschema>=4.18.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (4.25.1)\n",
"Requirement already satisfied: requests>=2.31 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.32.5)\n",
"Requirement already satisfied: argon2-cffi-bindings in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jinja2>=3.0.3->jupyterlab->jupyter) (3.0.3)\n",
"Requirement already satisfied: attrs>=22.2.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (25.4.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2025.9.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.37.0)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.30.0)\n",
"Requirement already satisfied: python-json-logger>=2.0.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (4.0.0)\n",
"Requirement already satisfied: pyyaml>=5.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (6.0.2)\n",
"Requirement already satisfied: rfc3339-validator in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.4)\n",
"Requirement already satisfied: rfc3986-validator>=0.1.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.1)\n",
"Requirement already satisfied: fqdn in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.5.1)\n",
"Requirement already satisfied: isoduration in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (20.11.0)\n",
"Requirement already satisfied: jsonpointer>1.13 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.0)\n",
"Requirement already satisfied: rfc3987-syntax>=1.1.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.1.0)\n",
"Requirement already satisfied: uri-template in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.0)\n",
"Requirement already satisfied: webcolors>=24.6.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.10.0)\n",
"Requirement already satisfied: beautifulsoup4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (4.14.3)\n",
"Requirement already satisfied: bleach!=5.0.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (6.3.0)\n",
"Requirement already satisfied: defusedxml in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (0.7.1)\n",
"Requirement already satisfied: jupyterlab-pygments in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (0.3.0)\n",
"Requirement already satisfied: mistune<4,>=2.0.3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (3.1.4)\n",
"Requirement already satisfied: nbclient>=0.5.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (0.10.2)\n",
"Requirement already satisfied: pandocfilters>=1.4.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbconvert->jupyter) (1.5.1)\n",
"Requirement already satisfied: webencodings in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert->jupyter) (0.5.1)\n",
"Requirement already satisfied: tinycss2<1.5,>=1.1.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (1.4.0)\n",
"Requirement already satisfied: fastjsonschema>=2.15 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.21.2)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (3.4.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.6.0)\n",
"Requirement already satisfied: lark>=1.2.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from rfc3987-syntax>=1.1.0->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.1)\n",
"Requirement already satisfied: cffi>=1.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.0.0)\n",
"Requirement already satisfied: pycparser in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.23)\n",
"Requirement already satisfied: soupsieve>=1.6.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from beautifulsoup4->nbconvert->jupyter) (2.8)\n",
"Requirement already satisfied: arrow>=0.15.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.4.0)\n",
"Requirement already satisfied: tzdata in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2025.2)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: ipywidgets in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (8.1.8)\n",
"Requirement already satisfied: comm>=0.1.3 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipywidgets) (0.2.3)\n",
"Requirement already satisfied: ipython>=6.1.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipywidgets) (9.8.0)\n",
"Requirement already satisfied: traitlets>=4.3.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipywidgets) (5.14.3)\n",
"Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ipywidgets) (4.0.15)\n",
"Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from ipywidgets) (3.0.16)\n",
"Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)\n",
"Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)\n",
"Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (1.1.1)\n",
"Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)\n",
"Requirement already satisfied: matplotlib-inline>=0.1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (0.2.1)\n",
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.52)\n",
"Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (2.19.2)\n",
"Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
"Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=6.1.0->ipywidgets) (4.15.0)\n",
"Requirement already satisfied: wcwidth in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.14)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jedi>=0.18.1->ipython>=6.1.0->ipywidgets) (0.8.5)\n",
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (2.2.1)\n",
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (3.0.1)\n",
"Requirement already satisfied: pure-eval in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (0.2.3)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: ipykernel in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (7.1.0)\n",
"Requirement already satisfied: comm>=0.1.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (0.2.3)\n",
"Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (1.8.17)\n",
"Requirement already satisfied: ipython>=7.23.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (9.8.0)\n",
"Requirement already satisfied: jupyter-client>=8.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (8.6.3)\n",
"Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (5.9.1)\n",
"Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (0.2.1)\n",
"Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (1.6.0)\n",
"Requirement already satisfied: packaging>=22 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (25.0)\n",
"Requirement already satisfied: psutil>=5.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (7.1.3)\n",
"Requirement already satisfied: pyzmq>=25 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (27.1.0)\n",
"Requirement already satisfied: tornado>=6.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (6.5.2)\n",
"Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipykernel) (5.14.3)\n",
"Requirement already satisfied: colorama>=0.4.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (0.4.6)\n",
"Requirement already satisfied: decorator>=4.3.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (5.2.1)\n",
"Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (1.1.1)\n",
"Requirement already satisfied: jedi>=0.18.1 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (0.19.2)\n",
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (3.0.52)\n",
"Requirement already satisfied: pygments>=2.11.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (2.19.2)\n",
"Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (0.6.3)\n",
"Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from ipython>=7.23.1->ipykernel) (4.15.0)\n",
"Requirement already satisfied: wcwidth in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel) (0.2.14)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jedi>=0.18.1->ipython>=7.23.1->ipykernel) (0.8.5)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter-client>=8.0.0->ipykernel) (2.9.0.post0)\n",
"Requirement already satisfied: platformdirs>=2.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel) (4.5.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->jupyter-client>=8.0.0->ipykernel) (1.17.0)\n",
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (2.2.1)\n",
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (3.0.1)\n",
"Requirement already satisfied: pure-eval in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel) (0.2.3)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install --upgrade pip\n",
"%pip install --upgrade jupyter\n",
"%pip install --upgrade ipywidgets\n",
"%pip install --upgrade ipykernel"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "48724594",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pdf2image in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (1.17.0)\n",
"Requirement already satisfied: pillow in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (12.0.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: PyMuPDF in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (1.26.6)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: pandas in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (2.3.3)\n",
"Requirement already satisfied: numpy>=1.23.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2.3.5)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: matplotlib in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (3.10.7)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (1.3.3)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (4.61.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (1.4.9)\n",
"Requirement already satisfied: numpy>=1.23 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (2.3.5)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib) (25.0)\n",
"Requirement already satisfied: pillow>=8 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (12.0.0)\n",
"Requirement already satisfied: pyparsing>=3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib) (3.2.5)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib) (2.9.0.post0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: seaborn in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (0.13.2)\n",
"Requirement already satisfied: numpy!=1.24.0,>=1.20 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (2.3.5)\n",
"Requirement already satisfied: pandas>=1.2 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (2.3.3)\n",
"Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from seaborn) (3.10.7)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.3)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.61.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.9)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0)\n",
"Requirement already satisfied: pillow>=8 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (12.0.0)\n",
"Requirement already satisfied: pyparsing>=3 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.5)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\sergio\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from pandas>=1.2->seaborn) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\sergio\\appdata\\roaming\\python\\python311\\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Install necessary packages\n",
"%pip install pdf2image pillow \n",
"# pdf reading\n",
"%pip install PyMuPDF\n",
"\n",
"# Data analysis and visualization\n",
"%pip install pandas\n",
"%pip install matplotlib\n",
"%pip install seaborn"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "e1f793b6",
"metadata": {},
"outputs": [],
"source": [
"import os, json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from pdf2image import convert_from_path\n",
"from PIL import Image, ImageOps\n",
"import fitz # PyMuPDF\n",
"import re\n",
"from datetime import datetime\n",
"from typing import List\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "1652a78e",
"metadata": {},
"outputs": [],
"source": [
"def pdf_to_images(pdf_path: str, output_dir: str, dpi: int = 300):\n",
" \"\"\"\n",
" Render a PDF into a list of PIL Images using PyMuPDF or pdf2image.\n",
" 'pages' is 1-based (e.g., range(1, 10) -> pages 19).\n",
" \"\"\"\n",
" if fitz is not None:\n",
" doc = fitz.open(pdf_path)\n",
" total_pages = len(doc)\n",
"\n",
" # Adjust page indices (PyMuPDF uses 0-based indexing)\n",
" page_indices = list(range(total_pages))\n",
"\n",
" for i in page_indices:\n",
" page = doc.load_page(i)\n",
" mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)\n",
" pix = page.get_pixmap(matrix=mat, alpha=False)\n",
" img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n",
" # Build filename\n",
" out_path = os.path.join(\n",
" output_dir,\n",
" f\"page_{i + 1:04d}.png\"\n",
" )\n",
"\n",
" img.save(out_path, \"PNG\")\n",
" doc.close()\n",
" else:\n",
" raise RuntimeError(\"Install PyMuPDF or pdf2image to convert PDFs.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f523dd58",
"metadata": {},
"outputs": [],
"source": [
"import fitz\n",
"import re\n",
"import os\n",
"\n",
"def _pdf_extract_text_structured(page, margin_threshold=50):\n",
" \"\"\"\n",
" Extract text using PyMuPDF's dict mode which preserves\n",
" the actual line structure from the PDF.\n",
" \"\"\"\n",
" data = page.get_text(\"dict\")\n",
" \n",
" # Collect all lines with their Y position\n",
" all_lines = []\n",
" margin_text_parts = [] # Collect vertical/margin text\n",
" margin_y_positions = []\n",
" \n",
" for block in data.get(\"blocks\", []):\n",
" if block.get(\"type\") != 0: # Skip non-text blocks\n",
" continue\n",
" \n",
" block_bbox = block.get(\"bbox\", (0, 0, 0, 0))\n",
" block_width = block_bbox[2] - block_bbox[0]\n",
" block_height = block_bbox[3] - block_bbox[1]\n",
" \n",
" # Detect vertical/margin text\n",
" is_margin_text = (block_bbox[0] < margin_threshold or \n",
" block_height > block_width * 2)\n",
" \n",
" for line in block.get(\"lines\", []):\n",
" direction = line.get(\"dir\", (1, 0))\n",
" bbox = line.get(\"bbox\", (0, 0, 0, 0))\n",
" y_center = (bbox[1] + bbox[3]) / 2\n",
" x_start = bbox[0]\n",
" \n",
" # Collect text from all spans\n",
" line_text = \"\"\n",
" for span in line.get(\"spans\", []):\n",
" text = span.get(\"text\", \"\")\n",
" line_text += text\n",
" \n",
" line_text = line_text.strip()\n",
" line_text = re.sub(r\"[•▪◦●❖▶■]\", \"\", line_text)\n",
" \n",
" if not line_text:\n",
" continue\n",
" \n",
" # Check if this is margin/vertical text\n",
" if is_margin_text or abs(direction[0]) < 0.9:\n",
" margin_text_parts.append((y_center, line_text))\n",
" margin_y_positions.append(y_center)\n",
" else:\n",
" all_lines.append((y_center, x_start, line_text))\n",
" \n",
" # Reconstruct margin text as single line at its vertical center\n",
" if margin_text_parts:\n",
" # Sort by Y position (top to bottom) and join\n",
" margin_text_parts.sort(key=lambda x: x[0])\n",
" full_margin_text = \" \".join(part[1] for part in margin_text_parts)\n",
" # Calculate vertical center of the watermark\n",
" avg_y = sum(margin_y_positions) / len(margin_y_positions)\n",
" # Add as a single line\n",
" all_lines.append((avg_y, -1, full_margin_text)) # x=-1 to sort first\n",
" \n",
" if not all_lines:\n",
" return \"\"\n",
" \n",
" # Sort by Y first, then by X\n",
" all_lines.sort(key=lambda x: (x[0], x[1]))\n",
" \n",
" # Group lines at same vertical position\n",
" merged_rows = []\n",
" current_row = [all_lines[0]]\n",
" current_y = all_lines[0][0]\n",
" \n",
" for y_center, x_start, text in all_lines[1:]:\n",
" if abs(y_center - current_y) <= 2:\n",
" current_row.append((y_center, x_start, text))\n",
" else:\n",
" current_row.sort(key=lambda x: x[1])\n",
" row_text = \" \".join(item[2] for item in current_row)\n",
" merged_rows.append((current_y, row_text))\n",
" current_row = [(y_center, x_start, text)]\n",
" current_y = y_center\n",
" \n",
" if current_row:\n",
" current_row.sort(key=lambda x: x[1])\n",
" row_text = \" \".join(item[2] for item in current_row)\n",
" merged_rows.append((current_y, row_text))\n",
" \n",
" # Sort rows by Y and extract text\n",
" merged_rows.sort(key=lambda x: x[0])\n",
" lines = [row[1] for row in merged_rows]\n",
" \n",
" # Join and clean up\n",
" text = \"\\n\".join(lines)\n",
" text = re.sub(r\" +\", \" \", text).strip()\n",
" text = re.sub(r\"\\n{3,}\", \"\\n\\n\", text).strip()\n",
" \n",
" return text\n",
"\n",
"def pdf_extract_text(pdf_path, output_dir, margin_threshold=50):\n",
" os.makedirs(output_dir, exist_ok=True)\n",
" doc = fitz.open(pdf_path)\n",
" \n",
" for i, page in enumerate(doc):\n",
" text = _pdf_extract_text_structured(page, margin_threshold)\n",
" if not text.strip():\n",
" continue\n",
" out_path = os.path.join(output_dir, f\"page_{i + 1:04d}.txt\")\n",
" with open(out_path, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(text)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "9f64a8c0",
"metadata": {},
"outputs": [],
"source": [
"PDF_FOLDER = './instructions' # Folder containing PDF files\n",
"OUTPUT_FOLDER = './dataset'\n",
"\n",
"os.makedirs(OUTPUT_FOLDER, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "41e4651d",
"metadata": {},
"outputs": [],
"source": [
"i = 0\n",
"\n",
"pdf_files = sorted([\n",
" fname for fname in os.listdir(PDF_FOLDER)\n",
" if fname.lower().endswith(\".pdf\")\n",
"])\n",
"\n",
"\n",
"for fname in pdf_files:\n",
" # build output directories\n",
" out_img_path = os.path.join(OUTPUT_FOLDER, str(i), \"img\")\n",
" out_txt_path = os.path.join(OUTPUT_FOLDER, str(i), \"txt\")\n",
"\n",
" os.makedirs(out_img_path, exist_ok=True)\n",
" os.makedirs(out_txt_path, exist_ok=True)\n",
"\n",
" # source and destination PDF paths\n",
" src_pdf = os.path.join(PDF_FOLDER, fname)\n",
" pdf_path = os.path.join(OUTPUT_FOLDER, str(i), fname)\n",
"\n",
" # copy PDF into numbered folder\n",
" shutil.copy(src_pdf, pdf_path)\n",
"\n",
" # convert PDF → images\n",
" pdf_to_images(\n",
" pdf_path=pdf_path,\n",
" output_dir=out_img_path,\n",
" dpi=300\n",
" )\n",
" pdf_extract_text(\n",
" pdf_path=pdf_path,\n",
" output_dir=out_txt_path,\n",
" margin_threshold=40\n",
" )\n",
"\n",
" i += 1"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}