deliverable_12_11_2025 (#3)
.gitignore (vendored, 1 line changed)
@@ -1,2 +1,3 @@
 ~$*.docx
 results/
+__pycache__/*
.vscode/launch.json (vendored, new file, 30 lines)
@@ -0,0 +1,30 @@
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Debug PaddleOCR CLI (.venv)",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false,
            "env": {
                "PYTHONPATH": "${workspaceFolder}",
                "PDF_FOLDER": "${workspaceFolder}/instructions",
                "PYTHONUNBUFFERED": "1",
                "RAY_MEMORY_MONITOR_ERROR_THRESHOLD": "0.95"
            },
            "python": "${workspaceFolder}/.venv/Scripts/python.exe",
            "args": [
                "--pdf-folder", "${workspaceFolder}/instructions",
                "--dpi", "300",
                "--text-det-box-thresh", "0.6",
                "--text-det-unclip-ratio", "1.5",
                "--text-rec-score-thresh", "0.0",
                "--pages-per-pdf", "2",
                "--min-box-score", "0",
                "--lang", "es"
            ]
        }
    ]
}
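The configuration above launches whichever file is currently open ("program": "${file}") under the workspace virtual environment, so in practice it is used with paddle_ocr_tuning.py, added further down in this commit. For reference, a minimal sketch of the equivalent invocation outside VS Code is shown below; it is illustrative only, not part of the commit, and assumes the current directory is the repository root and that the Windows-style .venv interpreter path from the config exists.

# Illustrative sketch (not part of the commit): reproduce the launch
# configuration outside VS Code. Assumes the file targeted by "${file}"
# is paddle_ocr_tuning.py and that the current directory is the repo root.
import os
import subprocess

workspace = os.getcwd()  # stands in for ${workspaceFolder}
python_exe = os.path.join(workspace, ".venv", "Scripts", "python.exe")  # venv interpreter from the config

env = dict(
    os.environ,
    PYTHONPATH=workspace,
    PDF_FOLDER=os.path.join(workspace, "instructions"),
    PYTHONUNBUFFERED="1",
    RAY_MEMORY_MONITOR_ERROR_THRESHOLD="0.95",
)

cmd = [
    python_exe, os.path.join(workspace, "paddle_ocr_tuning.py"),
    "--pdf-folder", os.path.join(workspace, "instructions"),
    "--dpi", "300",
    "--text-det-box-thresh", "0.6",
    "--text-det-unclip-ratio", "1.5",
    "--text-rec-score-thresh", "0.0",
    "--pages-per-pdf", "2",
    "--min-box-score", "0",
    "--lang", "es",
]

# The script prints one JSON line with CER/WER/timing metrics (see main() below).
result = subprocess.run(cmd, env=env, capture_output=True, text=True)
print(result.stdout.strip())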
paddle_ocr_fine_tune_unir.ipynb (new file, 1285 lines)
File diff suppressed because one or more lines are too long

paddle_ocr_fine_tune_unir_raytune.ipynb (new file, 1319 lines)
File diff suppressed because one or more lines are too long
paddle_ocr_tuning.py (new file, 251 lines)
@@ -0,0 +1,251 @@
# Imports
import argparse, json, os, sys, time
from typing import List
import numpy as np
from PIL import Image
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
import re
from jiwer import wer, cer


def export_config(paddleocr_model):
    yaml_path = "paddleocr_pipeline_dump.yaml"
    paddleocr_model.export_paddlex_config_to_yaml(yaml_path)
    print("Exported:", yaml_path)


def pdf_to_images(pdf_path: str, dpi: int = 300, pages: List[int] = None) -> List[Image.Image]:
    """
    Render a PDF into a list of PIL Images using PyMuPDF or pdf2image.
    'pages' is 1-based (e.g., range(1, 10) -> pages 1-9).
    """
    images = []

    if fitz is not None:
        doc = fitz.open(pdf_path)
        total_pages = len(doc)

        # Adjust page indices (PyMuPDF uses 0-based indexing)
        if pages is None:
            page_indices = list(range(total_pages))
        else:
            # Filter out invalid pages and convert to 0-based
            page_indices = [p - 1 for p in pages if 1 <= p <= total_pages]

        for i in page_indices:
            page = doc.load_page(i)
            mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            images.append(img)
        doc.close()
    else:
        raise RuntimeError("Install PyMuPDF or pdf2image to convert PDFs.")

    return images


def pdf_extract_text(pdf_path, page_num, line_tolerance=15) -> str:
    """
    Extracts text from a specific PDF page in proper reading order.
    Adds '\n' when blocks are vertically separated more than line_tolerance.
    Removes bullet-like characters (•, ▪, etc.).
    """
    doc = fitz.open(pdf_path)

    if page_num < 1 or page_num > len(doc):
        return ""

    page = doc[page_num - 1]
    blocks = page.get_text("blocks")  # (x0, y0, x1, y1, text, block_no, block_type)

    # Sort blocks: top-to-bottom, left-to-right
    blocks_sorted = sorted(blocks, key=lambda b: (b[1], b[0]))

    text_lines = []
    last_y = None

    for b in blocks_sorted:
        y0 = b[1]
        text_block = b[4].strip()

        # Remove bullet-like characters
        text_block = re.sub(r"[•▪◦●❖▶■]", "", text_block)

        # If new line (based on vertical gap)
        if last_y is not None and abs(y0 - last_y) > line_tolerance:
            text_lines.append("")  # blank line for spacing

        text_lines.append(text_block.strip())
        last_y = y0

    # Join all lines with real newlines
    text = "\n".join(text_lines)

    # Normalize spaces
    text = re.sub(r"\s*\n\s*", "\n", text).strip()  # remove spaces around newlines
    text = re.sub(r" +", " ", text).strip()          # collapse multiple spaces to one
    text = re.sub(r"\n{3,}", "\n\n", text).strip()   # avoid triple blank lines

    doc.close()
    return text


def evaluate_text(reference, prediction):
    return {'WER': wer(reference, prediction), 'CER': cer(reference, prediction)}


def _normalize_box_xyxy(box):
    """
    Accepts:
      - [[x,y],[x,y],[x,y],[x,y]] (quad)
      - [x0, y0, x1, y1] (flat)
      - [x0, y0, x1, y1, x2, y2, x3, y3] (flat quad)
    Returns (x0, y0, x1, y1)
    """
    # Quad as list of points?
    if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        return min(xs), min(ys), max(xs), max(ys)

    # Flat list
    if isinstance(box, (list, tuple)):
        if len(box) == 4:
            x0, y0, x1, y1 = box
            # ensure order
            return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
        if len(box) == 8:
            xs = box[0::2]
            ys = box[1::2]
            return min(xs), min(ys), max(xs), max(ys)

    # Fallback
    raise ValueError(f"Unrecognized box format: {box!r}")


def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """
    Robust line grouping for PaddleOCR outputs:
      - normalizes boxes to (x0,y0,x1,y1)
      - adaptive line tolerance based on median box height
      - optional confidence filter
      - inserts '\n' between lines and preserves left→right order
    """
    result = paddleocr_predict

    boxes_all = []  # (x0, y0, x1, y1, y_mid, text, score)
    for item in result:
        res = item.json.get("res", {})
        boxes = res.get("rec_boxes", []) or []  # be defensive
        texts = res.get("rec_texts", []) or []
        scores = res.get("rec_scores", None)

        for i, (box, text) in enumerate(zip(boxes, texts)):
            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(box)
            except Exception:
                # Skip weird boxes gracefully
                continue

            y_mid = 0.5 * (y0 + y1)
            score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0

            t = re.sub(r"\s+", " ", str(text)).strip()
            if not t:
                continue

            boxes_all.append((x0, y0, x1, y1, y_mid, t, score))

    if min_score > 0:
        boxes_all = [b for b in boxes_all if b[6] >= min_score]

    if not boxes_all:
        return ""

    # Adaptive line tolerance
    heights = [b[3] - b[1] for b in boxes_all]
    median_h = float(np.median(heights)) if heights else 20.0
    line_tol = max(8.0, line_tol_factor * median_h)

    # Sort by vertical mid, then x0
    boxes_all.sort(key=lambda b: (b[4], b[0]))

    # Group into lines
    lines, cur, last_y = [], [], None
    for x0, y0, x1, y1, y_mid, text, score in boxes_all:
        if last_y is None or abs(y_mid - last_y) <= line_tol:
            cur.append((x0, text))
        else:
            cur.sort(key=lambda t: t[0])
            lines.append(" ".join(t[1] for t in cur))
            cur = [(x0, text)]
        last_y = y_mid

    if cur:
        cur.sort(key=lambda t: t[0])
        lines.append(" ".join(t[1] for t in cur))

    res = "\n".join(lines)
    res = re.sub(r"\s+\n", "\n", res).strip()
    return res


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pdf-folder", required=True)
    parser.add_argument("--dpi", type=int, default=300)
    parser.add_argument("--textline-orientation", type=lambda s: s.lower() == "true", default=True)
    parser.add_argument("--text-det-box-thresh", type=float, default=0.6)
    parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)
    parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)
    parser.add_argument("--line-tolerance", type=float, default=0.6)
    parser.add_argument("--min-box-score", type=float, default=0.0)
    parser.add_argument("--pages-per-pdf", type=int, default=2)
    parser.add_argument("--lang", default="es")
    args = parser.parse_args()

    ocr = PaddleOCR(
        text_detection_model_name="PP-OCRv5_server_det",
        text_recognition_model_name="PP-OCRv5_server_rec",
        lang=args.lang,
    )

    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()

    for fname in os.listdir(args.pdf_folder):
        if not fname.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(args.pdf_folder, fname)
        images = pdf_to_images(pdf_path, dpi=args.dpi, pages=range(1, args.pages_per_pdf + 1))
        for i, img in enumerate(images):
            ref = pdf_extract_text(pdf_path, i + 1)
            arr = np.array(img)
            tp0 = time.time()
            out = ocr.predict(
                arr,
                text_det_box_thresh=args.text_det_box_thresh,
                text_det_unclip_ratio=args.text_det_unclip_ratio,
                text_rec_score_thresh=args.text_rec_score_thresh,
                use_textline_orientation=args.textline_orientation
            )
            pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
            time_per_page_list.append(float(time.time() - tp0))
            m = evaluate_text(ref, pred)
            cer_list.append(m["CER"])
            wer_list.append(m["WER"])

    metrics = {
        "CER": float(np.mean(cer_list) if cer_list else 1.0),
        "WER": float(np.mean(wer_list) if wer_list else 1.0),
        "TIME": float(time.time() - t0),
        "PAGES": int(len(cer_list)),
        "TIME_PER_PAGE": float(np.mean(time_per_page_list) if time_per_page_list else float(time.time() - t0)),
    }
    print(json.dumps(metrics))


if __name__ == "__main__":
    main()
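The line-grouping step is the part of the script that is easiest to get wrong, so a quick sanity check is useful. The sketch below is illustrative and not part of the commit: it fakes the minimal item.json payload that assemble_from_paddle_result reads (rec_boxes, rec_texts, rec_scores), with two text fragments sharing a baseline and a third sitting lower on the page, and checks that the first two are merged into one line while the third lands on its own line.

# Illustrative sketch (not part of the commit): exercise the line grouping
# in assemble_from_paddle_result with fake PaddleOCR results. Only the
# fields the function actually reads are populated.
from types import SimpleNamespace

from paddle_ocr_tuning import assemble_from_paddle_result

fake_item = SimpleNamespace(json={
    "res": {
        # Flat [x0, y0, x1, y1] boxes: two share a baseline, one sits lower.
        "rec_boxes": [
            [10, 100, 120, 130],   # "Hola"
            [140, 102, 300, 131],  # "mundo"
            [10, 200, 220, 230],   # "segunda línea"
        ],
        "rec_texts": ["Hola", "mundo", "segunda línea"],
        "rec_scores": [0.98, 0.95, 0.91],
    }
})

text = assemble_from_paddle_result([fake_item], min_score=0.5, line_tol_factor=0.6)
print(text)
# Expected output:
# Hola mundo
# segunda línea

The same pattern carries over to a real ocr.predict(...) result, since the function only touches the "res" dictionary of each returned item.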
Binary file not shown.
Binary file not shown.