ray rune process optimization

This commit is contained in:
2025-12-06 21:15:49 +01:00
parent 6d6bebfed9
commit 7503a23b4a
5 changed files with 1810 additions and 577 deletions

View File

@@ -1,95 +1,16 @@
# Imports
import argparse, json, os, sys, time
from typing import List
import argparse, json, time, re
import numpy as np
from PIL import Image
import fitz # PyMuPDF
from paddleocr import PaddleOCR
import re
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
from itertools import islice
def export_config(paddleocr_model):
    """Dump the PaddleOCR pipeline configuration to a YAML file and report the path."""
    dump_path = "paddleocr_pipeline_dump.yaml"
    paddleocr_model.export_paddlex_config_to_yaml(dump_path)
    print("Exported:", dump_path)
def pdf_to_images(pdf_path: str, dpi: int = 300, pages: List[int] = None) -> List[Image.Image]:
    """
    Render a PDF into a list of PIL Images using PyMuPDF.

    Args:
        pdf_path: Path to the PDF file to render.
        dpi: Render resolution; PDF's native unit is 72 dpi, so the zoom
            matrix scales by dpi / 72.
        pages: Optional 1-based page numbers (e.g. range(1, 10) -> pages 1-9).
            Out-of-range entries are silently dropped; None renders all pages.

    Returns:
        List of RGB PIL Images, one per rendered page, in request order.

    Raises:
        RuntimeError: if PyMuPDF (fitz) is not available.
    """
    # Guard clause instead of a large if/else body.
    if fitz is None:
        raise RuntimeError("Install PyMuPDF or pdf2image to convert PDFs.")
    images = []
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        # PyMuPDF uses 0-based page indices; callers pass 1-based numbers.
        if pages is None:
            page_indices = list(range(total_pages))
        else:
            # Drop invalid page numbers and convert to 0-based.
            page_indices = [p - 1 for p in pages if 1 <= p <= total_pages]
        # The zoom matrix is loop-invariant; build it once.
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        for idx in page_indices:
            pix = doc.load_page(idx).get_pixmap(matrix=mat, alpha=False)
            images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    finally:
        # Bug fix: previously the document was leaked if rendering raised.
        doc.close()
    return images
def pdf_extract_text(pdf_path, page_num, line_tolerance=15) -> str:
    """
    Extract text from one PDF page in proper reading order.

    Blocks are sorted top-to-bottom then left-to-right; a blank spacing line
    is inserted whenever the vertical gap between consecutive blocks exceeds
    ``line_tolerance``. Bullet-like characters (•, ▪, ◦, ●, ❖, ▶, ■) are
    stripped from the output.

    Args:
        pdf_path: Path to the PDF file.
        page_num: 1-based page number; out-of-range pages yield "".
        line_tolerance: Vertical gap (in PDF points) above which a blank
            line is inserted between blocks.

    Returns:
        Whitespace-normalized page text ("" for an invalid page number).
    """
    doc = fitz.open(pdf_path)
    try:
        if page_num < 1 or page_num > len(doc):
            # Bug fix: the early return previously leaked the open document.
            return ""
        page = doc[page_num - 1]
        # Each block tuple: (x0, y0, x1, y1, text, block_no, block_type)
        blocks = page.get_text("blocks")
        # Sort blocks: top-to-bottom, then left-to-right.
        blocks_sorted = sorted(blocks, key=lambda b: (b[1], b[0]))
        text_lines = []
        last_y = None
        for b in blocks_sorted:
            y0 = b[1]
            # Remove bullet-like characters.
            text_block = re.sub(r"[•▪◦●❖▶■]", "", b[4].strip())
            # Large vertical jump -> start a new visual paragraph.
            if last_y is not None and abs(y0 - last_y) > line_tolerance:
                text_lines.append("")
            text_lines.append(text_block.strip())
            last_y = y0
    finally:
        doc.close()
    text = "\n".join(text_lines)
    # Normalize whitespace: trim around newlines, collapse space runs,
    # and cap consecutive blank lines.
    text = re.sub(r"\s*\n\s*", "\n", text).strip()
    text = re.sub(r" +", " ", text).strip()
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text
def evaluate_text(reference, prediction):
    """Return the word and character error rates of *prediction* against *reference*."""
    word_error = wer(reference, prediction)
    char_error = cer(reference, prediction)
    return {"WER": word_error, "CER": char_error}
@@ -189,18 +110,15 @@ def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_facto
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--pdf-folder", required=True)
parser.add_argument("--dpi", type=int, default=300)
parser.add_argument("--textline-orientation", type=lambda s: s.lower()=="true", default=True)
parser.add_argument("--text-det-box-thresh", type=float, default=0.6)
parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)
parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)
parser.add_argument("--line-tolerance", type=float, default=0.6)
parser.add_argument("--min-box-score", type=float, default=0.0)
parser.add_argument("--pages-per-pdf", type=int, default=2)
parser.add_argument("--lang", default="es")
args = parser.parse_args()
@@ -211,32 +129,27 @@ def main():
text_recognition_model_name="PP-OCRv5_server_rec",
lang=args.lang,
)
dataset = ImageTextDataset(args.pdf_folder)
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
for fname in os.listdir(args.pdf_folder):
if not fname.lower().endswith(".pdf"):
continue
pdf_path = os.path.join(args.pdf_folder, fname)
images = pdf_to_images(pdf_path, dpi=args.dpi, pages=range(1, args.pages_per_pdf+1))
for i, img in enumerate(images):
ref = pdf_extract_text(pdf_path, i+1)
arr = np.array(img)
tp0 = time.time()
out = ocr.predict(
arr,
text_det_box_thresh=args.text_det_box_thresh,
text_det_unclip_ratio=args.text_det_unclip_ratio,
text_rec_score_thresh=args.text_rec_score_thresh,
use_textline_orientation=args.textline_orientation
)
pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
time_per_page_list.append(float(time.time() - tp0))
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
for img, ref in islice(dataset, 5, 10):
arr = np.array(img)
tp0 = time.time()
out = ocr.predict(
arr,
text_det_box_thresh=args.text_det_box_thresh,
text_det_unclip_ratio=args.text_det_unclip_ratio,
text_rec_score_thresh=args.text_rec_score_thresh,
use_textline_orientation=args.textline_orientation
)
pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
time_per_page_list.append(float(time.time() - tp0))
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
metrics = {
"CER": float(np.mean(cer_list) if cer_list else 1.0),