2025-11-17 10:52:00 +00:00
# Imports
2025-12-06 21:15:49 +01:00
import argparse , json , time , re
2025-11-17 10:52:00 +00:00
import numpy as np
from paddleocr import PaddleOCR
from jiwer import wer , cer
2025-12-06 21:15:49 +01:00
from dataset_manager import ImageTextDataset
from itertools import islice
2025-11-17 10:52:00 +00:00
def export_config(paddleocr_model, yaml_path="paddleocr_pipeline_dump.yaml"):
    """Dump the pipeline configuration of *paddleocr_model* to a YAML file.

    Args:
        paddleocr_model: Object exposing ``export_paddlex_config_to_yaml(path)``.
        yaml_path: Destination file. Defaults to the path that used to be
            hard-coded, so existing callers behave as before.
    """
    paddleocr_model.export_paddlex_config_to_yaml(yaml_path)
    print("Exported:", yaml_path)
def evaluate_text(reference, prediction):
    """Score *prediction* against *reference*.

    Returns a dict with the results of ``wer`` and ``cer`` (imported from
    ``jiwer`` at the top of this file) under the keys ``"WER"`` and ``"CER"``.
    """
    word_error = wer(reference, prediction)
    char_error = cer(reference, prediction)
    return {"WER": word_error, "CER": char_error}
def _normalize_box_xyxy ( box ) :
"""
Accepts :
- [ [ x , y ] , [ x , y ] , [ x , y ] , [ x , y ] ] ( quad )
- [ x0 , y0 , x1 , y1 ] ( flat )
- [ x0 , y0 , x1 , y1 , x2 , y2 , x3 , y3 ] ( flat quad )
Returns ( x0 , y0 , x1 , y1 )
"""
# Quad as list of points?
if isinstance ( box , ( list , tuple ) ) and box and isinstance ( box [ 0 ] , ( list , tuple ) ) :
xs = [ p [ 0 ] for p in box ]
ys = [ p [ 1 ] for p in box ]
return min ( xs ) , min ( ys ) , max ( xs ) , max ( ys )
# Flat list
if isinstance ( box , ( list , tuple ) ) :
if len ( box ) == 4 :
x0 , y0 , x1 , y1 = box
# ensure order
return min ( x0 , x1 ) , min ( y0 , y1 ) , max ( x0 , x1 ) , max ( y0 , y1 )
if len ( box ) == 8 :
xs = box [ 0 : : 2 ]
ys = box [ 1 : : 2 ]
return min ( xs ) , min ( ys ) , max ( xs ) , max ( ys )
# Fallback
raise ValueError ( f " Unrecognized box format: { box !r} " )
def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """Rebuild reading-order text from PaddleOCR prediction results.

    Each item of *paddleocr_predict* must expose a ``json`` mapping whose
    ``"res"`` entry holds ``rec_boxes``, ``rec_texts`` and (optionally)
    ``rec_scores``.

    Steps:
      - normalize every box to (x0, y0, x1, y1); boxes that cannot be parsed
        are skipped
      - collapse internal whitespace of each text and drop empty texts
      - optionally drop fragments scoring below *min_score*
      - group fragments into lines using a tolerance derived from the median
        box height, order each line left to right, and join lines with a
        newline

    Args:
        paddleocr_predict: Iterable of prediction result objects.
        min_score: Fragments with a score below this are dropped; values
            ``<= 0`` disable the filter. A missing score counts as 1.0.
        line_tol_factor: Fraction of the median box height used as vertical
            tolerance when deciding whether a fragment continues the current
            line (the tolerance is never smaller than 8.0).

    Returns:
        The assembled text, or ``""`` when nothing was recognized.
    """
    collapse_ws = re.compile(r"\s+")

    fragments = []  # (x0, y0, x1, y1, y_mid, text, score)
    for item in paddleocr_predict:
        payload = item.json.get("res", {})
        boxes = payload.get("rec_boxes", []) or []  # tolerate explicit None
        texts = payload.get("rec_texts", []) or []
        scores = payload.get("rec_scores", None)
        for i, (box, text) in enumerate(zip(boxes, texts)):
            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(box)
            except (ValueError, TypeError, LookupError):
                # Malformed box: skip it, but let unrelated errors surface.
                continue
            y_mid = 0.5 * (y0 + y1)
            has_score = scores is not None and i < len(scores)
            score = float(scores[i]) if has_score else 1.0
            cleaned = collapse_ws.sub(" ", str(text)).strip()
            if not cleaned:
                continue
            fragments.append((x0, y0, x1, y1, y_mid, cleaned, score))

    if min_score > 0:
        fragments = [f for f in fragments if f[6] >= min_score]
    if not fragments:
        return ""

    # Adaptive line tolerance; `fragments` is non-empty here, so the median
    # is always defined.
    median_h = float(np.median([f[3] - f[1] for f in fragments]))
    line_tol = max(8.0, line_tol_factor * median_h)

    # Top-to-bottom by vertical midpoint, then left-to-right.
    fragments.sort(key=lambda f: (f[4], f[0]))

    def _flush(parts):
        """Join one line's (x0, text) parts in left-to-right order."""
        parts.sort(key=lambda part: part[0])
        return " ".join(part[1] for part in parts)

    lines, cur, last_y = [], [], None
    for x0, _y0, _x1, _y1, y_mid, text, _score in fragments:
        if last_y is None or abs(y_mid - last_y) <= line_tol:
            cur.append((x0, text))
        else:
            lines.append(_flush(cur))
            cur = [(x0, text)]
        # Compare each fragment against the previous one (not the line start),
        # so a gently slanted line stays in one group.
        last_y = y_mid
    if cur:
        lines.append(_flush(cur))

    assembled = "\n".join(lines)
    # Drop whitespace left dangling before line breaks.
    return re.sub(r"\s+\n", "\n", assembled).strip()
def _str2bool(value):
    """Parse a CLI flag value: only "true" (case-insensitive) yields True."""
    return value.lower() == "true"


def main():
    """Run PaddleOCR over a slice of the dataset and print metrics as JSON.

    Parses the command-line options, builds the OCR pipeline, runs it on the
    selected ``(image, reference_text)`` pairs of ``ImageTextDataset`` and
    prints one JSON object with the mean CER/WER, total time, number of
    evaluated pages and mean time per page.
    """
    parser = argparse.ArgumentParser()
    # Dataset root folder.
    parser.add_argument("--pdf-folder", required=True)
    # Whether to use document image orientation classification.
    parser.add_argument("--use-doc-orientation-classify", type=_str2bool, default=False)
    # Whether to use text image unwarping.
    parser.add_argument("--use-doc-unwarping", type=_str2bool, default=False)
    # Whether to use text line orientation classification.
    parser.add_argument("--textline-orientation", type=_str2bool, default=True)
    # Detection pixel threshold for the text detection model. Pixels with
    # scores greater than this threshold in the output probability map are
    # considered text pixels.
    parser.add_argument("--text-det-thresh", type=float, default=0.0)
    # Detection box threshold for the text detection model. A detection result
    # is considered a text region if the average score of all pixels within
    # the border of the result is greater than this threshold.
    parser.add_argument("--text-det-box-thresh", type=float, default=0.0)
    # Text detection expansion coefficient. The larger the value, the larger
    # the expanded text region.
    parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)
    # Text recognition threshold. Text results with scores greater than this
    # threshold are retained.
    parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)
    # Language passed to the PaddleOCR constructor.
    parser.add_argument("--lang", default="es")
    # Window of dataset items to evaluate, passed to islice as start/stop
    # (zero-based, stop exclusive). Defaults keep the previous fixed 5..10.
    parser.add_argument("--start-page", type=int, default=5)
    parser.add_argument("--end-page", type=int, default=10)
    args = parser.parse_args()

    ocr = PaddleOCR(
        text_detection_model_name="PP-OCRv5_server_det",
        text_recognition_model_name="PP-OCRv5_server_rec",
        lang=args.lang,
    )
    dataset = ImageTextDataset(args.pdf_folder)

    cer_list, wer_list = [], []
    time_per_page_list = []
    # perf_counter is monotonic, so durations are immune to wall-clock jumps.
    run_start = time.perf_counter()
    for img, ref in islice(dataset, args.start_page, args.end_page):
        arr = np.array(img)
        page_start = time.perf_counter()
        out = ocr.predict(
            arr,
            use_doc_orientation_classify=args.use_doc_orientation_classify,
            use_doc_unwarping=args.use_doc_unwarping,
            use_textline_orientation=args.textline_orientation,
            text_det_thresh=args.text_det_thresh,
            text_det_box_thresh=args.text_det_box_thresh,
            text_det_unclip_ratio=args.text_det_unclip_ratio,
            text_rec_score_thresh=args.text_rec_score_thresh,
        )
        pred = assemble_from_paddle_result(out)
        # Per-page time covers prediction plus text assembly, not scoring.
        time_per_page_list.append(float(time.perf_counter() - page_start))

        scores = evaluate_text(ref, pred)
        cer_list.append(scores["CER"])
        wer_list.append(scores["WER"])

    elapsed = float(time.perf_counter() - run_start)
    metrics = {
        # With no evaluated pages, report the worst-case error rate of 1.0.
        "CER": float(np.mean(cer_list) if cer_list else 1.0),
        "WER": float(np.mean(wer_list) if wer_list else 1.0),
        "TIME": elapsed,
        "PAGES": len(cer_list),
        # Falls back to the total elapsed time when no page was processed.
        "TIME_PER_PAGE": float(np.mean(time_per_page_list) if time_per_page_list else elapsed),
    }
    print(json.dumps(metrics))


if __name__ == "__main__":
    main()