deliverable_12_11_2025 #3

Merged
Seryusjj merged 3 commits from deliverable_12_11_2025 into main 2025-11-17 10:52:01 +00:00
2 changed files with 677 additions and 190 deletions
Showing only changes of commit d5ce54d6ae - Show all commits

File diff suppressed because one or more lines are too long

View File

@@ -199,7 +199,7 @@ def main():
parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5) parser.add_argument("--text-det-unclip-ratio", type=float, default=1.5)
parser.add_argument("--text-rec-score-thresh", type=float, default=0.0) parser.add_argument("--text-rec-score-thresh", type=float, default=0.0)
parser.add_argument("--line-tolerance", type=float, default=0.6) parser.add_argument("--line-tolerance", type=float, default=0.6)
parser.add_argument("--min-box-score", type=int, default=0) parser.add_argument("--min-box-score", type=float, default=0.0)
parser.add_argument("--pages-per-pdf", type=int, default=2) parser.add_argument("--pages-per-pdf", type=int, default=2)
parser.add_argument("--lang", default="es") parser.add_argument("--lang", default="es")
args = parser.parse_args() args = parser.parse_args()
@@ -213,6 +213,7 @@ def main():
) )
cer_list, wer_list = [], [] cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time() t0 = time.time()
for fname in os.listdir(args.pdf_folder): for fname in os.listdir(args.pdf_folder):
@@ -223,6 +224,7 @@ def main():
for i, img in enumerate(images): for i, img in enumerate(images):
ref = pdf_extract_text(pdf_path, i+1) ref = pdf_extract_text(pdf_path, i+1)
arr = np.array(img) arr = np.array(img)
tp0 = time.time()
out = ocr.predict( out = ocr.predict(
arr, arr,
text_det_box_thresh=args.text_det_box_thresh, text_det_box_thresh=args.text_det_box_thresh,
@@ -231,6 +233,7 @@ def main():
use_textline_orientation=args.textline_orientation use_textline_orientation=args.textline_orientation
) )
pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance) pred = assemble_from_paddle_result(out, args.min_box_score, args.line_tolerance)
time_per_page_list.append(float(time.time() - tp0))
m = evaluate_text(ref, pred) m = evaluate_text(ref, pred)
cer_list.append(m["CER"]) cer_list.append(m["CER"])
wer_list.append(m["WER"]) wer_list.append(m["WER"])
@@ -238,8 +241,9 @@ def main():
metrics = { metrics = {
"CER": float(np.mean(cer_list) if cer_list else 1.0), "CER": float(np.mean(cer_list) if cer_list else 1.0),
"WER": float(np.mean(wer_list) if wer_list else 1.0), "WER": float(np.mean(wer_list) if wer_list else 1.0),
"time": float(time.time() - t0), "TIME": float(time.time() - t0),
"pages": int(len(cer_list)), "PAGES": int(len(cer_list)),
"TIME_PER_PAGE": float(np.mean(time_per_page_list) if time_per_page_list else float(time.time() - t0)),
} }
print(json.dumps(metrics)) print(json.dumps(metrics))