# src/paddle_ocr_tuning.py

# Imports
import argparse, json, time, re
import numpy as np
from paddleocr import PaddleOCR
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
from itertools import islice

def export_config(paddleocr_model, yaml_path="paddleocr_pipeline_dump.yaml"):
    """Dump the pipeline configuration of a PaddleOCR model to a YAML file.

    Args:
        paddleocr_model: Object exposing
            ``export_paddlex_config_to_yaml(path)``; that method is called
            with ``yaml_path``.
        yaml_path: Destination file. Defaults to the file name that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The path that was handed to the export call.
    """
    paddleocr_model.export_paddlex_config_to_yaml(yaml_path)
    print("Exported:", yaml_path)
    return yaml_path

def evaluate_text(reference, prediction):
    """Score an OCR prediction against its reference transcription.

    Both arguments are forwarded unchanged to jiwer's ``wer`` and ``cer``.

    Returns:
        dict with the keys ``'WER'`` and ``'CER'`` holding the two results.
    """
    word_error = wer(reference, prediction)
    char_error = cer(reference, prediction)
    return {'WER': word_error, 'CER': char_error}

def _normalize_box_xyxy(box):
    """
    Convert a text box in any supported layout to an axis-aligned rectangle.

    Accepts:
      - [[x,y],[x,y],[x,y],[x,y]]   (quad; any non-empty sequence of points)
      - [x0, y0, x1, y1]            (flat)
      - [x0, y0, x1, y1, x2, y2, x3, y3] (flat quad)
    Each layout may be a list, a tuple, or any object exposing ``tolist()``
    (for example a numpy array); a list holding numpy rows or numpy scalars
    is accepted as well.

    Returns (x0, y0, x1, y1) with x0 <= x1 and y0 <= y1.

    Raises ValueError for every other input, including quads whose points
    have fewer than two coordinates.
    """
    # Array-like containers are converted up front so the layout checks
    # below only ever deal with plain lists/tuples.
    if hasattr(box, "tolist"):
        box = box.tolist()

    if isinstance(box, (list, tuple)) and box:
        # A plain list may still contain numpy rows or numpy scalars.
        coords = [c.tolist() if hasattr(c, "tolist") else c for c in box]
        try:
            # Quad as list of points
            if isinstance(coords[0], (list, tuple)):
                xs = [p[0] for p in coords]
                ys = [p[1] for p in coords]
                return min(xs), min(ys), max(xs), max(ys)

            # Flat rectangle; corners may come in either order
            if len(coords) == 4:
                x0, y0, x1, y1 = coords
                return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)

            # Flat quad: x values at even positions, y values at odd ones
            if len(coords) == 8:
                xs = coords[0::2]
                ys = coords[1::2]
                return min(xs), min(ys), max(xs), max(ys)
        except (IndexError, TypeError) as err:
            # Malformed points / non-comparable entries are reported with
            # the same exception type as every other unsupported layout.
            raise ValueError(f"Unrecognized box format: {box!r}") from err

    # Fallback
    raise ValueError(f"Unrecognized box format: {box!r}")

def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """
    Rebuild line-ordered text from PaddleOCR prediction results.

    Steps:
    - reads "rec_boxes", "rec_texts" and "rec_scores" from ``item.json["res"]``
      of every item in ``paddleocr_predict``
    - normalizes boxes to (x0,y0,x1,y1) via ``_normalize_box_xyxy``
    - optional confidence filter
    - adaptive line tolerance based on median box height
    - joins boxes of one line with spaces (left→right) and lines with newlines

    Args:
        paddleocr_predict: Iterable of result objects whose ``json`` attribute
            is a mapping with a "res" entry.
        min_score: When > 0, boxes scoring below this value are dropped.
            Boxes without a score count as 1.0.
        line_tol_factor: Fraction of the median box height used as the
            maximum vertical distance between consecutive boxes of the same
            line (never less than 8.0).

    Returns:
        The assembled text, or "" when no usable box remains.
    """
    entries = []  # (x0, y0, x1, y1, y_mid, text, score)
    for item in paddleocr_predict:
        payload = item.json.get("res")
        if payload is None:
            payload = {}

        # Explicit None checks instead of `value or []`: the truth value of
        # an array-valued field with more than one element is an error.
        boxes = payload.get("rec_boxes")
        texts = payload.get("rec_texts")
        scores = payload.get("rec_scores")
        if boxes is None:
            boxes = []
        if texts is None:
            texts = []

        for i, (box, text) in enumerate(zip(boxes, texts)):
            # Collapse whitespace first; blank texts need no box parsing.
            cleaned = re.sub(r"\s+", " ", str(text)).strip()
            if not cleaned:
                continue

            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(box)
            except (ValueError, TypeError, IndexError):
                # Skip malformed boxes, but let genuine programming errors
                # surface instead of silently yielding empty text.
                continue

            y_mid = 0.5 * (y0 + y1)
            has_score = scores is not None and i < len(scores)
            score = float(scores[i]) if has_score else 1.0

            entries.append((x0, y0, x1, y1, y_mid, cleaned, score))

    if min_score > 0:
        entries = [b for b in entries if b[6] >= min_score]

    if not entries:
        return ""

    # Adaptive line tolerance (entries is non-empty here, so the median
    # is always defined)
    median_h = float(np.median([b[3] - b[1] for b in entries]))
    line_tol = max(8.0, line_tol_factor * median_h)

    # Sort by vertical mid, then x0
    entries.sort(key=lambda b: (b[4], b[0]))

    # Group into lines: a box joins the current line while its vertical
    # centre stays within line_tol of the previous box's centre.
    lines, cur, last_y = [], [], None
    for x0, _y0, _x1, _y1, y_mid, text, _score in entries:
        if last_y is None or abs(y_mid - last_y) <= line_tol:
            cur.append((x0, text))
        else:
            cur.sort(key=lambda t: t[0])
            lines.append(" ".join(t[1] for t in cur))
            cur = [(x0, text)]
        last_y = y_mid

    if cur:
        cur.sort(key=lambda t: t[0])
        lines.append(" ".join(t[1] for t in cur))

    assembled = "\n".join(lines)
    assembled = re.sub(r"\s+\n", "\n", assembled).strip()
    return assembled


def _parse_bool(value):
    """Parse a command-line boolean; reject anything that is not clearly one.

    Accepts true/false, yes/no, y/n, on/off and 1/0 (case-insensitive).
    Raises argparse.ArgumentTypeError otherwise, so a typo such as "ture"
    is reported by argparse instead of silently becoming False.
    """
    normalized = str(value).strip().lower()
    if normalized in ("true", "1", "yes", "y", "on"):
        return True
    if normalized in ("false", "0", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean (true/false), got {value!r}"
    )


def main():
    """
    Evaluate PaddleOCR on a range of dataset items and print the metrics.

    Command-line arguments select the dataset folder, the PaddleOCR
    prediction options under test and the range of dataset items to score.
    Each selected (image, reference text) pair is run through
    ``ocr.predict``, assembled into text with ``assemble_from_paddle_result``
    and scored with ``evaluate_text``. One JSON object with the keys CER,
    WER, TIME, PAGES and TIME_PER_PAGE is printed to stdout.
    """
    parser = argparse.ArgumentParser()
    # Dataset root folder.
    parser.add_argument("--pdf-folder", required=True)
    # Whether to use document image orientation classification.
    parser.add_argument("--use-doc-orientation-classify", type=_parse_bool, default=False)
    # Whether to use text image unwarping.
    parser.add_argument("--use-doc-unwarping", type=_parse_bool, default=False)
    # Whether to use text line orientation classification.
    parser.add_argument("--textline-orientation", type=_parse_bool, default=True)
    # Detection pixel threshold for the text detection model. Pixels with
    # scores greater than this threshold in the output probability map are
    # considered text pixels.
    parser.add_argument("--text-det-thresh", type=float, default=0.0)
    # Detection box threshold for the text detection model. A detection
    # result is considered a text region if the average score of all pixels
    # within the border of the result is greater than this threshold.
    parser.add_argument("--text-det-box-thresh", type=float, default=0.0)
    # Text detection expansion coefficient. The larger the value, the larger
    # the expanded text region.
    parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)
    # Text recognition threshold. Text results with scores greater than this
    # threshold are retained.
    parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)
    # Language handed to PaddleOCR (`lang=`).
    parser.add_argument("--lang", default="es")
    # Half-open range [start, stop) of dataset items to evaluate. The
    # defaults reproduce the previously hard-coded islice(dataset, 5, 10);
    # both values are passed to islice and must therefore be non-negative.
    parser.add_argument("--page-start", type=int, default=5)
    parser.add_argument("--page-stop", type=int, default=10)
    args = parser.parse_args()

    ocr = PaddleOCR(
        text_detection_model_name="PP-OCRv5_server_det",
        text_recognition_model_name="PP-OCRv5_server_rec",
        lang=args.lang,
    )

    dataset = ImageTextDataset(args.pdf_folder)
    cer_list, wer_list = [], []
    time_per_page_list = []
    # perf_counter is monotonic, so durations cannot be distorted by
    # system clock adjustments.
    t0 = time.perf_counter()

    for img, ref in islice(dataset, args.page_start, args.page_stop):
        arr = np.array(img)
        tp0 = time.perf_counter()
        out = ocr.predict(
            arr,
            use_doc_orientation_classify=args.use_doc_orientation_classify,
            use_doc_unwarping=args.use_doc_unwarping,
            use_textline_orientation=args.textline_orientation,
            text_det_thresh=args.text_det_thresh,
            text_det_box_thresh=args.text_det_box_thresh,
            text_det_unclip_ratio=args.text_det_unclip_ratio,
            text_rec_score_thresh=args.text_rec_score_thresh,
        )
        pred = assemble_from_paddle_result(out)
        # Per-page time covers prediction plus text assembly, not scoring.
        time_per_page_list.append(float(time.perf_counter() - tp0))
        m = evaluate_text(ref, pred)
        cer_list.append(m["CER"])
        wer_list.append(m["WER"])

    total_time = float(time.perf_counter() - t0)
    metrics = {
        # With no evaluated item the error rates fall back to 1.0.
        "CER": float(np.mean(cer_list)) if cer_list else 1.0,
        "WER": float(np.mean(wer_list)) if wer_list else 1.0,
        "TIME": total_time,
        "PAGES": len(cer_list),
        "TIME_PER_PAGE": float(np.mean(time_per_page_list)) if time_per_page_list else total_time,
    }
    print(json.dumps(metrics))

# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00			`# Imports`
ray rune process optimization 2025-12-06 21:15:49 +01:00			`import argparse, json, time, re`
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00			`import numpy as np`
			`from paddleocr import PaddleOCR`
			`from jiwer import wer, cer`
ray rune process optimization 2025-12-06 21:15:49 +01:00			`from dataset_manager import ImageTextDataset`
			`from itertools import islice`
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00
			`def export_config(paddleocr_model):`
			`yaml_path = "paddleocr_pipeline_dump.yaml"`
			`paddleocr_model.export_paddlex_config_to_yaml(yaml_path)`
			`print("Exported:", yaml_path)`

			`def evaluate_text(reference, prediction):`
			`return {'WER': wer(reference, prediction), 'CER': cer(reference, prediction)}`

			`def _normalize_box_xyxy(box):`
			`"""`
			`Accepts:`
			`- [[x,y],[x,y],[x,y],[x,y]] (quad)`
			`- [x0, y0, x1, y1] (flat)`
			`- [x0, y0, x1, y1, x2, y2, x3, y3] (flat quad)`
			`Returns (x0, y0, x1, y1)`
			`"""`
			`# Quad as list of points?`
			`if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):`
			`xs = [p[0] for p in box]`
			`ys = [p[1] for p in box]`
			`return min(xs), min(ys), max(xs), max(ys)`

			`# Flat list`
			`if isinstance(box, (list, tuple)):`
			`if len(box) == 4:`
			`x0, y0, x1, y1 = box`
			`# ensure order`
			`return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)`
			`if len(box) == 8:`
			`xs = box[0::2]`
			`ys = box[1::2]`
			`return min(xs), min(ys), max(xs), max(ys)`

			`# Fallback`
			`raise ValueError(f"Unrecognized box format: {box!r}")`

			`def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):`
			`"""`
			`Robust line grouping for PaddleOCR outputs:`
			`- normalizes boxes to (x0,y0,x1,y1)`
			`- adaptive line tolerance based on median box height`
			`- optional confidence filter`
			`- inserts '\n' between lines and preserves left→right order`
			`"""`
			`result = paddleocr_predict`

			`boxes_all = [] # (x0, y0, x1, y1, y_mid, text, score)`
			`for item in result:`
			`res = item.json.get("res", {})`
			`boxes = res.get("rec_boxes", []) or [] # be defensive`
			`texts = res.get("rec_texts", []) or []`
			`scores = res.get("rec_scores", None)`

			`for i, (box, text) in enumerate(zip(boxes, texts)):`
			`try:`
			`x0, y0, x1, y1 = _normalize_box_xyxy(box)`
			`except Exception:`
			`# Skip weird boxes gracefully`
			`continue`

			`y_mid = 0.5 * (y0 + y1)`
			`score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0`

			`t = re.sub(r"\s+", " ", str(text)).strip()`
			`if not t:`
			`continue`

			`boxes_all.append((x0, y0, x1, y1, y_mid, t, score))`

			`if min_score > 0:`
			`boxes_all = [b for b in boxes_all if b[6] >= min_score]`

			`if not boxes_all:`
			`return ""`

			`# Adaptive line tolerance`
			`heights = [b[3] - b[1] for b in boxes_all]`
			`median_h = float(np.median(heights)) if heights else 20.0`
			`line_tol = max(8.0, line_tol_factor * median_h)`

			`# Sort by vertical mid, then x0`
			`boxes_all.sort(key=lambda b: (b[4], b[0]))`

			`# Group into lines`
			`lines, cur, last_y = [], [], None`
			`for x0, y0, x1, y1, y_mid, text, score in boxes_all:`
			`if last_y is None or abs(y_mid - last_y) <= line_tol:`
			`cur.append((x0, text))`
			`else:`
			`cur.sort(key=lambda t: t[0])`
			`lines.append(" ".join(t[1] for t in cur))`
			`cur = [(x0, text)]`
			`last_y = y_mid`

			`if cur:`
			`cur.sort(key=lambda t: t[0])`
			`lines.append(" ".join(t[1] for t in cur))`

			`res = "\n".join(lines)`
			`res = re.sub(r"\s+\n", "\n", res).strip()`
			`return res`



			`def main():`
			`parser = argparse.ArgumentParser()`
Hyper param serach results 2025-12-07 20:07:59 +01:00			`# dataset root folder`
			`parser.add_argument("--pdf-folder", required=True)`
			`#Whether to use document image orientation classification.`
			`parser.add_argument("--use-doc-orientation-classify", type=lambda s: s.lower()=="true", default=False)`
			`# Whether to use text image unwarping.`
			`parser.add_argument("--use-doc-unwarping", type=lambda s: s.lower()=="true", default=False)`
			`# Whether to use text line orientation classification.`
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00			`parser.add_argument("--textline-orientation", type=lambda s: s.lower()=="true", default=True)`
Hyper param serach results 2025-12-07 20:07:59 +01:00			`# Detection pixel threshold for the text detection model. Pixels with scores greater than this threshold in the output probability map are considered text pixels.`
			`parser.add_argument("--text-det-thresh", type=float, default=0.0)`
			`# Detection box threshold for the text detection model. A detection result is considered a text region if the average score of all pixels within the border of the result is greater than this threshold.`
			`parser.add_argument("--text-det-box-thresh", type=float, default=0.0)`
			`# Text detection expansion coefficient, which expands the text region using this method. The larger the value, the larger the expansion area.`
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00			`parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)`
Hyper param serach results 2025-12-07 20:07:59 +01:00			`# Text recognition threshold. Text results with scores greater than this threshold are retained.`
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00			`parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)`
Hyper param serach results 2025-12-07 20:07:59 +01:00			`# text location`
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00			`parser.add_argument("--lang", default="es")`
			`args = parser.parse_args()`



			`ocr = PaddleOCR(`
			`text_detection_model_name="PP-OCRv5_server_det",`
			`text_recognition_model_name="PP-OCRv5_server_rec",`
			`lang=args.lang,`
			`)`
ray rune process optimization 2025-12-06 21:15:49 +01:00
			`dataset = ImageTextDataset(args.pdf_folder)`
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00			`cer_list, wer_list = [], []`
			`time_per_page_list = []`
			`t0 = time.time()`

ray rune process optimization 2025-12-06 21:15:49 +01:00			`for img, ref in islice(dataset, 5, 10):`
			`arr = np.array(img)`
			`tp0 = time.time()`
			`out = ocr.predict(`
			`arr,`
Hyper param serach results 2025-12-07 20:07:59 +01:00			`use_doc_orientation_classify=args.use_doc_orientation_classify,`
			`use_doc_unwarping=args.use_doc_unwarping,`
			`use_textline_orientation=args.textline_orientation, #str2bool Whether to use text line orientation classification.`
			`text_det_thresh=args.text_det_thresh,`
ray rune process optimization 2025-12-06 21:15:49 +01:00			`text_det_box_thresh=args.text_det_box_thresh,`
			`text_det_unclip_ratio=args.text_det_unclip_ratio,`
Hyper param serach results 2025-12-07 20:07:59 +01:00			`text_rec_score_thresh=args.text_rec_score_thresh`
ray rune process optimization 2025-12-06 21:15:49 +01:00			`)`
Hyper param serach results 2025-12-07 20:07:59 +01:00			`pred = assemble_from_paddle_result(out)`
ray rune process optimization 2025-12-06 21:15:49 +01:00			`time_per_page_list.append(float(time.time() - tp0))`
			`m = evaluate_text(ref, pred)`
			`cer_list.append(m["CER"])`
			`wer_list.append(m["WER"])`
deliverable_12_11_2025 (#3) 2025-11-17 10:52:00 +00:00
			`metrics = {`
			`"CER": float(np.mean(cer_list) if cer_list else 1.0),`
			`"WER": float(np.mean(wer_list) if wer_list else 1.0),`
			`"TIME": float(time.time() - t0),`
			`"PAGES": int(len(cer_list)),`
			`"TIME_PER_PAGE": float(np.mean(time_per_page_list) if time_per_page_list else float(time.time() - t0)),`
			`}`
			`print(json.dumps(metrics))`

			`if __name__ == "__main__":`
			`main()`