ray rune process optimization

This commit is contained in:
2025-12-06 21:15:49 +01:00
parent 6d6bebfed9
commit 7503a23b4a
5 changed files with 1810 additions and 577 deletions

View File

@@ -1,95 +1,16 @@
# Imports
import argparse, json, os, sys, time
from typing import List
import argparse, json, time, re
import numpy as np
from PIL import Image
import fitz # PyMuPDF
from paddleocr import PaddleOCR
import re
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
from itertools import islice
def export_config(paddleocr_model):
    """Dump the PaddleOCR pipeline configuration to a YAML file and report the path."""
    dump_path = "paddleocr_pipeline_dump.yaml"
    paddleocr_model.export_paddlex_config_to_yaml(dump_path)
    print("Exported:", dump_path)
def pdf_to_images(pdf_path: str, dpi: int = 300, pages: List[int] = None) -> List[Image.Image]:
    """
    Render a PDF into a list of PIL Images using PyMuPDF.

    Args:
        pdf_path: Path to the PDF file to render.
        dpi: Render resolution; PDF's native unit is 72 dpi, so the zoom
            matrix scales by dpi / 72.
        pages: Optional 1-based page numbers (e.g. range(1, 10) -> pages 1-9).
            Out-of-range entries are silently dropped; None renders all pages.

    Returns:
        List of RGB PIL Images, one per rendered page, in request order.

    Raises:
        RuntimeError: if PyMuPDF (fitz) is not available.
    """
    # Guard clause instead of a large if/else body.
    if fitz is None:
        raise RuntimeError("Install PyMuPDF or pdf2image to convert PDFs.")
    images = []
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        # PyMuPDF uses 0-based page indices; callers pass 1-based numbers.
        if pages is None:
            page_indices = list(range(total_pages))
        else:
            # Drop invalid page numbers and convert to 0-based.
            page_indices = [p - 1 for p in pages if 1 <= p <= total_pages]
        # The zoom matrix is loop-invariant; build it once.
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        for idx in page_indices:
            pix = doc.load_page(idx).get_pixmap(matrix=mat, alpha=False)
            images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    finally:
        # Bug fix: previously the document was leaked if rendering raised.
        doc.close()
    return images
def pdf_extract_text(pdf_path, page_num, line_tolerance=15) -> str:
    """
    Extract text from one PDF page in proper reading order.

    Blocks are sorted top-to-bottom then left-to-right; a blank spacing line
    is inserted whenever the vertical gap between consecutive blocks exceeds
    ``line_tolerance``. Bullet-like characters (•, ▪, ◦, ●, ❖, ▶, ■) are
    stripped from the output.

    Args:
        pdf_path: Path to the PDF file.
        page_num: 1-based page number; out-of-range pages yield "".
        line_tolerance: Vertical gap (in PDF points) above which a blank
            line is inserted between blocks.

    Returns:
        Whitespace-normalized page text ("" for an invalid page number).
    """
    doc = fitz.open(pdf_path)
    try:
        if page_num < 1 or page_num > len(doc):
            # Bug fix: the early return previously leaked the open document.
            return ""
        page = doc[page_num - 1]
        # Each block tuple: (x0, y0, x1, y1, text, block_no, block_type)
        blocks = page.get_text("blocks")
        # Sort blocks: top-to-bottom, then left-to-right.
        blocks_sorted = sorted(blocks, key=lambda b: (b[1], b[0]))
        text_lines = []
        last_y = None
        for b in blocks_sorted:
            y0 = b[1]
            # Remove bullet-like characters.
            text_block = re.sub(r"[•▪◦●❖▶■]", "", b[4].strip())
            # Large vertical jump -> start a new visual paragraph.
            if last_y is not None and abs(y0 - last_y) > line_tolerance:
                text_lines.append("")
            text_lines.append(text_block.strip())
            last_y = y0
    finally:
        doc.close()
    text = "\n".join(text_lines)
    # Normalize whitespace: trim around newlines, collapse space runs,
    # and cap consecutive blank lines.
    text = re.sub(r"\s*\n\s*", "\n", text).strip()
    text = re.sub(r" +", " ", text).strip()
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text
def evaluate_text(reference, prediction):
    """Return the word and character error rates of *prediction* against *reference*."""
    word_error = wer(reference, prediction)
    char_error = cer(reference, prediction)
    return {"WER": word_error, "CER": char_error}
@@ -189,18 +110,15 @@ def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_facto
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--pdf-folder", required=True)
parser.add_argument("--dpi", type=int, default=300)
parser.add_argument("--textline-orientation", type=lambda s: s.lower()=="true", default=True)
parser.add_argument("--text-det-box-thresh", type=float, default=0.6)
parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)
parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)
parser.add_argument("--line-tolerance", type=float, default=0.6)
parser.add_argument("--min-box-score", type=float, default=0.0)
parser.add_argument("--pages-per-pdf", type=int, default=2)
parser.add_argument("--lang", default="es")
args = parser.parse_args()
@@ -211,32 +129,27 @@ def main():
text_recognition_model_name="PP-OCRv5_server_rec",
lang=args.lang,
)
dataset = ImageTextDataset(args.pdf_folder)
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
for fname in os.listdir(args.pdf_folder):
if not fname.lower().endswith(".pdf"):
continue
pdf_path = os.path.join(args.pdf_folder, fname)
images = pdf_to_images(pdf_path, dpi=args.dpi, pages=range(1, args.pages_per_pdf+1))
for i, img in enumerate(images):
ref = pdf_extract_text(pdf_path, i+1)
arr = np.array(img)
tp0 = time.time()
out = ocr.predict(
arr,
text_det_box_thresh=args.text_det_box_thresh,
text_det_unclip_ratio=args.text_det_unclip_ratio,
text_rec_score_thresh=args.text_rec_score_thresh,
use_textline_orientation=args.textline_orientation
)
pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
time_per_page_list.append(float(time.time() - tp0))
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
for img, ref in islice(dataset, 5, 10):
arr = np.array(img)
tp0 = time.time()
out = ocr.predict(
arr,
text_det_box_thresh=args.text_det_box_thresh,
text_det_unclip_ratio=args.text_det_unclip_ratio,
text_rec_score_thresh=args.text_rec_score_thresh,
use_textline_orientation=args.textline_orientation
)
pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
time_per_page_list.append(float(time.time() - tp0))
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
metrics = {
"CER": float(np.mean(cer_list) if cer_list else 1.0),