deliverable_12_11_2025 (#3)
.gitignore (vendored, 1 line changed)
@@ -1,2 +1,3 @@
 ~$*.docx
 results/
+__pycache__/*
.vscode/launch.json (vendored, new file, 30 lines)
@@ -0,0 +1,30 @@
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Debug PaddleOCR CLI (.venv)",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false,
            "env": {
                "PYTHONPATH": "${workspaceFolder}",
                "PDF_FOLDER": "${workspaceFolder}/instructions",
                "PYTHONUNBUFFERED": "1",
                "RAY_MEMORY_MONITOR_ERROR_THRESHOLD": "0.95"
            },
            "python": "${workspaceFolder}/.venv/Scripts/python.exe",
            "args": [
                "--pdf-folder", "${workspaceFolder}/instructions",
                "--dpi", "300",
                "--text-det-box-thresh", "0.6",
                "--text-det-unclip-ratio", "1.5",
                "--text-rec-score-thresh", "0.0",
                "--pages-per-pdf", "2",
                "--min-box-score", "0",
                "--lang", "es"
            ]
        }
    ]
}
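The configuration above launches whichever file is currently open ("program": "${file}") under the workspace virtual environment, so in practice it is used with paddle_ocr_tuning.py, added further down in this commit. For reference, a minimal sketch of the equivalent invocation outside VS Code is shown below; it is illustrative only, not part of the commit, and assumes the current directory is the repository root and that the Windows-style .venv interpreter path from the config exists.

# Illustrative sketch (not part of the commit): reproduce the launch
# configuration outside VS Code. Assumes the file targeted by "${file}"
# is paddle_ocr_tuning.py and that the current directory is the repo root.
import os
import subprocess

workspace = os.getcwd()  # stands in for ${workspaceFolder}
python_exe = os.path.join(workspace, ".venv", "Scripts", "python.exe")  # venv interpreter from the config

env = dict(
    os.environ,
    PYTHONPATH=workspace,
    PDF_FOLDER=os.path.join(workspace, "instructions"),
    PYTHONUNBUFFERED="1",
    RAY_MEMORY_MONITOR_ERROR_THRESHOLD="0.95",
)

cmd = [
    python_exe, os.path.join(workspace, "paddle_ocr_tuning.py"),
    "--pdf-folder", os.path.join(workspace, "instructions"),
    "--dpi", "300",
    "--text-det-box-thresh", "0.6",
    "--text-det-unclip-ratio", "1.5",
    "--text-rec-score-thresh", "0.0",
    "--pages-per-pdf", "2",
    "--min-box-score", "0",
    "--lang", "es",
]

# The script prints one JSON line with CER/WER/timing metrics (see main() below).
result = subprocess.run(cmd, env=env, capture_output=True, text=True)
print(result.stdout.strip())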
paddle_ocr_fine_tune_unir.ipynb (new file, 1285 lines)
File diff suppressed because one or more lines are too long

paddle_ocr_fine_tune_unir_raytune.ipynb (new file, 1319 lines)
File diff suppressed because one or more lines are too long
paddle_ocr_tuning.py (new file, 251 lines)
@@ -0,0 +1,251 @@
# Imports
import argparse, json, os, sys, time
from typing import List
import numpy as np
from PIL import Image
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
import re
from jiwer import wer, cer


def export_config(paddleocr_model):
    yaml_path = "paddleocr_pipeline_dump.yaml"
    paddleocr_model.export_paddlex_config_to_yaml(yaml_path)
    print("Exported:", yaml_path)


def pdf_to_images(pdf_path: str, dpi: int = 300, pages: List[int] = None) -> List[Image.Image]:
    """
    Render a PDF into a list of PIL Images using PyMuPDF or pdf2image.
    'pages' is 1-based (e.g., range(1, 10) -> pages 1-9).
    """
    images = []

    if fitz is not None:
        doc = fitz.open(pdf_path)
        total_pages = len(doc)

        # Adjust page indices (PyMuPDF uses 0-based indexing)
        if pages is None:
            page_indices = list(range(total_pages))
        else:
            # Filter out invalid pages and convert to 0-based
            page_indices = [p - 1 for p in pages if 1 <= p <= total_pages]

        for i in page_indices:
            page = doc.load_page(i)
            mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            images.append(img)
        doc.close()
    else:
        raise RuntimeError("Install PyMuPDF or pdf2image to convert PDFs.")

    return images


def pdf_extract_text(pdf_path, page_num, line_tolerance=15) -> str:
    """
    Extracts text from a specific PDF page in proper reading order.
    Adds '\n' when blocks are vertically separated more than line_tolerance.
    Removes bullet-like characters (•, ▪, etc.).
    """
    doc = fitz.open(pdf_path)

    if page_num < 1 or page_num > len(doc):
        return ""

    page = doc[page_num - 1]
    blocks = page.get_text("blocks")  # (x0, y0, x1, y1, text, block_no, block_type)

    # Sort blocks: top-to-bottom, left-to-right
    blocks_sorted = sorted(blocks, key=lambda b: (b[1], b[0]))

    text_lines = []
    last_y = None

    for b in blocks_sorted:
        y0 = b[1]
        text_block = b[4].strip()

        # Remove bullet-like characters
        text_block = re.sub(r"[•▪◦●❖▶■]", "", text_block)

        # If new line (based on vertical gap)
        if last_y is not None and abs(y0 - last_y) > line_tolerance:
            text_lines.append("")  # blank line for spacing

        text_lines.append(text_block.strip())
        last_y = y0

    # Join all lines with real newlines
    text = "\n".join(text_lines)

    # Normalize spaces
    text = re.sub(r"\s*\n\s*", "\n", text).strip()  # remove spaces around newlines
    text = re.sub(r" +", " ", text).strip()          # collapse multiple spaces to one
    text = re.sub(r"\n{3,}", "\n\n", text).strip()   # avoid triple blank lines

    doc.close()
    return text


def evaluate_text(reference, prediction):
    return {'WER': wer(reference, prediction), 'CER': cer(reference, prediction)}


def _normalize_box_xyxy(box):
    """
    Accepts:
      - [[x,y],[x,y],[x,y],[x,y]] (quad)
      - [x0, y0, x1, y1] (flat)
      - [x0, y0, x1, y1, x2, y2, x3, y3] (flat quad)
    Returns (x0, y0, x1, y1)
    """
    # Quad as list of points?
    if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        return min(xs), min(ys), max(xs), max(ys)

    # Flat list
    if isinstance(box, (list, tuple)):
        if len(box) == 4:
            x0, y0, x1, y1 = box
            # ensure order
            return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
        if len(box) == 8:
            xs = box[0::2]
            ys = box[1::2]
            return min(xs), min(ys), max(xs), max(ys)

    # Fallback
    raise ValueError(f"Unrecognized box format: {box!r}")


def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """
    Robust line grouping for PaddleOCR outputs:
      - normalizes boxes to (x0,y0,x1,y1)
      - adaptive line tolerance based on median box height
      - optional confidence filter
      - inserts '\n' between lines and preserves left→right order
    """
    result = paddleocr_predict

    boxes_all = []  # (x0, y0, x1, y1, y_mid, text, score)
    for item in result:
        res = item.json.get("res", {})
        boxes = res.get("rec_boxes", []) or []  # be defensive
        texts = res.get("rec_texts", []) or []
        scores = res.get("rec_scores", None)

        for i, (box, text) in enumerate(zip(boxes, texts)):
            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(box)
            except Exception:
                # Skip weird boxes gracefully
                continue

            y_mid = 0.5 * (y0 + y1)
            score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0

            t = re.sub(r"\s+", " ", str(text)).strip()
            if not t:
                continue

            boxes_all.append((x0, y0, x1, y1, y_mid, t, score))

    if min_score > 0:
        boxes_all = [b for b in boxes_all if b[6] >= min_score]

    if not boxes_all:
        return ""

    # Adaptive line tolerance
    heights = [b[3] - b[1] for b in boxes_all]
    median_h = float(np.median(heights)) if heights else 20.0
    line_tol = max(8.0, line_tol_factor * median_h)

    # Sort by vertical mid, then x0
    boxes_all.sort(key=lambda b: (b[4], b[0]))

    # Group into lines
    lines, cur, last_y = [], [], None
    for x0, y0, x1, y1, y_mid, text, score in boxes_all:
        if last_y is None or abs(y_mid - last_y) <= line_tol:
            cur.append((x0, text))
        else:
            cur.sort(key=lambda t: t[0])
            lines.append(" ".join(t[1] for t in cur))
            cur = [(x0, text)]
        last_y = y_mid

    if cur:
        cur.sort(key=lambda t: t[0])
        lines.append(" ".join(t[1] for t in cur))

    res = "\n".join(lines)
    res = re.sub(r"\s+\n", "\n", res).strip()
    return res


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pdf-folder", required=True)
    parser.add_argument("--dpi", type=int, default=300)
    parser.add_argument("--textline-orientation", type=lambda s: s.lower() == "true", default=True)
    parser.add_argument("--text-det-box-thresh", type=float, default=0.6)
    parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)
    parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)
    parser.add_argument("--line-tolerance", type=float, default=0.6)
    parser.add_argument("--min-box-score", type=float, default=0.0)
    parser.add_argument("--pages-per-pdf", type=int, default=2)
    parser.add_argument("--lang", default="es")
    args = parser.parse_args()

    ocr = PaddleOCR(
        text_detection_model_name="PP-OCRv5_server_det",
        text_recognition_model_name="PP-OCRv5_server_rec",
        lang=args.lang,
    )

    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()

    for fname in os.listdir(args.pdf_folder):
        if not fname.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(args.pdf_folder, fname)
        images = pdf_to_images(pdf_path, dpi=args.dpi, pages=range(1, args.pages_per_pdf + 1))
        for i, img in enumerate(images):
            ref = pdf_extract_text(pdf_path, i + 1)
            arr = np.array(img)
            tp0 = time.time()
            out = ocr.predict(
                arr,
                text_det_box_thresh=args.text_det_box_thresh,
                text_det_unclip_ratio=args.text_det_unclip_ratio,
                text_rec_score_thresh=args.text_rec_score_thresh,
                use_textline_orientation=args.textline_orientation
            )
            pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
            time_per_page_list.append(float(time.time() - tp0))
            m = evaluate_text(ref, pred)
            cer_list.append(m["CER"])
            wer_list.append(m["WER"])

    metrics = {
        "CER": float(np.mean(cer_list) if cer_list else 1.0),
        "WER": float(np.mean(wer_list) if wer_list else 1.0),
        "TIME": float(time.time() - t0),
        "PAGES": int(len(cer_list)),
        "TIME_PER_PAGE": float(np.mean(time_per_page_list) if time_per_page_list else float(time.time() - t0)),
    }
    print(json.dumps(metrics))


if __name__ == "__main__":
    main()
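The line-grouping step is the part of the script that is easiest to get wrong, so a quick sanity check is useful. The sketch below is illustrative and not part of the commit: it fakes the minimal item.json payload that assemble_from_paddle_result reads (rec_boxes, rec_texts, rec_scores), with two text fragments sharing a baseline and a third sitting lower on the page, and checks that the first two are merged into one line while the third lands on its own line.

# Illustrative sketch (not part of the commit): exercise the line grouping
# in assemble_from_paddle_result with fake PaddleOCR results. Only the
# fields the function actually reads are populated.
from types import SimpleNamespace

from paddle_ocr_tuning import assemble_from_paddle_result

fake_item = SimpleNamespace(json={
    "res": {
        # Flat [x0, y0, x1, y1] boxes: two share a baseline, one sits lower.
        "rec_boxes": [
            [10, 100, 120, 130],   # "Hola"
            [140, 102, 300, 131],  # "mundo"
            [10, 200, 220, 230],   # "segunda línea"
        ],
        "rec_texts": ["Hola", "mundo", "segunda línea"],
        "rec_scores": [0.98, 0.95, 0.91],
    }
})

text = assemble_from_paddle_result([fake_item], min_score=0.5, line_tol_factor=0.6)
print(text)
# Expected output:
# Hola mundo
# segunda línea

The same pattern carries over to a real ocr.predict(...) result, since the function only touches the "res" dictionary of each returned item.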
Binary file not shown.
Binary file not shown.