correlations
All checks were successful
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 5m47s
build_docker / build_paddle_ocr_gpu (push) Successful in 22m8s
build_docker / build_easyocr (push) Successful in 18m3s
build_docker / build_easyocr_gpu (push) Successful in 20m9s
build_docker / build_doctr (push) Successful in 19m40s
build_docker / build_raytune (push) Successful in 3m24s
build_docker / build_doctr_gpu (push) Successful in 15m35s
All checks were successful
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 5m47s
build_docker / build_paddle_ocr_gpu (push) Successful in 22m8s
build_docker / build_easyocr (push) Successful in 18m3s
build_docker / build_easyocr_gpu (push) Successful in 20m9s
build_docker / build_doctr (push) Successful in 19m40s
build_docker / build_raytune (push) Successful in 3m24s
build_docker / build_doctr_gpu (push) Successful in 15m35s
This commit is contained in:
@@ -97,7 +97,7 @@ def extract_figure_title_from_mermaid(lines, current_index):
|
||||
|
||||
return None
|
||||
|
||||
def parse_md_to_html_blocks(md_content):
|
||||
def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
"""Convert markdown content to HTML blocks with template styles."""
|
||||
global table_counter, figure_counter
|
||||
|
||||
@@ -142,7 +142,8 @@ def parse_md_to_html_blocks(md_content):
|
||||
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
|
||||
# Word TOC looks for text with Caption style - anchor must be outside main caption text
|
||||
bookmark_id = f"_Ref_Fig{figure_counter}"
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||
|
||||
if os.path.exists(fig_path):
|
||||
# Read actual image dimensions and scale to fit page width
|
||||
@@ -162,10 +163,12 @@ def parse_md_to_html_blocks(md_content):
|
||||
w_pt = new_w * 0.75
|
||||
h_pt = new_h * 0.75
|
||||
|
||||
html_blocks.append(f'''<p class=MsoNormal style="text-align:center"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
|
||||
# mso-pagination:keep-with-next ensures image stays with source line
|
||||
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
|
||||
else:
|
||||
# Fallback to placeholder
|
||||
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
|
||||
# mso-pagination:keep-with-next ensures placeholder stays with source line
|
||||
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
|
||||
|
||||
# Check if next non-empty line has custom Fuente
|
||||
custom_source = None
|
||||
@@ -218,11 +221,21 @@ def parse_md_to_html_blocks(md_content):
|
||||
continue
|
||||
elif line.startswith('###'):
|
||||
text = line.lstrip('#').strip()
|
||||
# Disable auto-numbering for Anexo content or A.x headings
|
||||
if is_anexo or re.match(r'^A\.\d+', text):
|
||||
# mso-list:none explicitly disables inherited list numbering from template CSS
|
||||
html_blocks.append(f'<h3 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h3>')
|
||||
else:
|
||||
html_blocks.append(f'<h3 style="mso-list:l22 level3 lfo18"><span lang=ES style="text-transform:none">{text}</span></h3>')
|
||||
i += 1
|
||||
continue
|
||||
elif line.startswith('##'):
|
||||
text = line.lstrip('#').strip()
|
||||
# Disable auto-numbering for Anexo content or A.x headings
|
||||
if is_anexo or re.match(r'^A\.\d+', text):
|
||||
# mso-list:none explicitly disables inherited list numbering from template CSS
|
||||
html_blocks.append(f'<h2 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h2>')
|
||||
else:
|
||||
html_blocks.append(f'<h2 style="mso-list:l22 level2 lfo18"><span lang=ES style="text-transform:none">{text}</span></h2>')
|
||||
i += 1
|
||||
continue
|
||||
@@ -277,10 +290,10 @@ def parse_md_to_html_blocks(md_content):
|
||||
clean_title = alt_title
|
||||
else:
|
||||
clean_title = "Tabla de datos."
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||
|
||||
# Build table HTML with APA style (horizontal lines only, no vertical)
|
||||
# Wrap in centered div for Word compatibility
|
||||
table_html = '<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
|
||||
for j, tline in enumerate(table_lines):
|
||||
cells = [c.strip() for c in tline.split('|')[1:-1]]
|
||||
@@ -365,10 +378,10 @@ def parse_md_to_html_blocks(md_content):
|
||||
|
||||
return '\n\n'.join(html_blocks)
|
||||
|
||||
def extract_section_content(md_content):
|
||||
def extract_section_content(md_content, is_anexo=False):
|
||||
"""Extract content from markdown, skipping the first # header."""
|
||||
md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
|
||||
return parse_md_to_html_blocks(md_content)
|
||||
return parse_md_to_html_blocks(md_content, is_anexo=is_anexo)
|
||||
|
||||
def find_section_element(soup, keyword):
|
||||
"""Find element containing keyword (h1 or special paragraph classes)."""
|
||||
@@ -672,7 +685,7 @@ def main():
|
||||
current.extract()
|
||||
current = next_elem
|
||||
|
||||
anexo_content = extract_section_content(docs['anexo'])
|
||||
anexo_content = extract_section_content(docs['anexo'], is_anexo=True)
|
||||
anexo_soup = BeautifulSoup(anexo_content, 'html.parser')
|
||||
insert_point = anexo_elem
|
||||
for new_elem in reversed(list(anexo_soup.children)):
|
||||
|
||||
19
src/results/correlations/doctr_correlations.csv
Normal file
19
src/results/correlations/doctr_correlations.csv
Normal file
@@ -0,0 +1,19 @@
|
||||
parameter,metric,pearson
|
||||
straighten_pages,CER,0.9998131749398365
|
||||
symmetric_pad,CER,0.5261495908818205
|
||||
disable_page_orientation,CER,-0.49338806188002227
|
||||
disable_crop_orientation,CER,0.3470400052300961
|
||||
paragraph_break,CER,-0.28710190683729514
|
||||
resolve_blocks,CER,-0.2307325611083525
|
||||
preserve_aspect_ratio,CER,0.12408787013954682
|
||||
assume_straight_pages,CER,-0.11904725618816066
|
||||
resolve_lines,CER,0.060358529330885535
|
||||
straighten_pages,WER,0.9997423603542454
|
||||
symmetric_pad,WER,0.5288569133407047
|
||||
disable_page_orientation,WER,-0.49788185337361895
|
||||
disable_crop_orientation,WER,0.35150041517456027
|
||||
paragraph_break,WER,-0.2867204522845194
|
||||
resolve_blocks,WER,-0.23363320696739256
|
||||
assume_straight_pages,WER,-0.12980061688173894
|
||||
preserve_aspect_ratio,WER,0.1289984520450311
|
||||
resolve_lines,WER,0.06486504638919248
|
||||
|
92
src/results/correlations/doctr_correlations.json
Normal file
92
src/results/correlations/doctr_correlations.json
Normal file
@@ -0,0 +1,92 @@
|
||||
[
|
||||
{
|
||||
"parameter": "straighten_pages",
|
||||
"metric": "CER",
|
||||
"pearson": 0.9998131749398365
|
||||
},
|
||||
{
|
||||
"parameter": "symmetric_pad",
|
||||
"metric": "CER",
|
||||
"pearson": 0.5261495908818205
|
||||
},
|
||||
{
|
||||
"parameter": "disable_page_orientation",
|
||||
"metric": "CER",
|
||||
"pearson": -0.49338806188002227
|
||||
},
|
||||
{
|
||||
"parameter": "disable_crop_orientation",
|
||||
"metric": "CER",
|
||||
"pearson": 0.3470400052300961
|
||||
},
|
||||
{
|
||||
"parameter": "paragraph_break",
|
||||
"metric": "CER",
|
||||
"pearson": -0.28710190683729514
|
||||
},
|
||||
{
|
||||
"parameter": "resolve_blocks",
|
||||
"metric": "CER",
|
||||
"pearson": -0.2307325611083525
|
||||
},
|
||||
{
|
||||
"parameter": "preserve_aspect_ratio",
|
||||
"metric": "CER",
|
||||
"pearson": 0.12408787013954682
|
||||
},
|
||||
{
|
||||
"parameter": "assume_straight_pages",
|
||||
"metric": "CER",
|
||||
"pearson": -0.11904725618816066
|
||||
},
|
||||
{
|
||||
"parameter": "resolve_lines",
|
||||
"metric": "CER",
|
||||
"pearson": 0.060358529330885535
|
||||
},
|
||||
{
|
||||
"parameter": "straighten_pages",
|
||||
"metric": "WER",
|
||||
"pearson": 0.9997423603542454
|
||||
},
|
||||
{
|
||||
"parameter": "symmetric_pad",
|
||||
"metric": "WER",
|
||||
"pearson": 0.5288569133407047
|
||||
},
|
||||
{
|
||||
"parameter": "disable_page_orientation",
|
||||
"metric": "WER",
|
||||
"pearson": -0.49788185337361895
|
||||
},
|
||||
{
|
||||
"parameter": "disable_crop_orientation",
|
||||
"metric": "WER",
|
||||
"pearson": 0.35150041517456027
|
||||
},
|
||||
{
|
||||
"parameter": "paragraph_break",
|
||||
"metric": "WER",
|
||||
"pearson": -0.2867204522845194
|
||||
},
|
||||
{
|
||||
"parameter": "resolve_blocks",
|
||||
"metric": "WER",
|
||||
"pearson": -0.23363320696739256
|
||||
},
|
||||
{
|
||||
"parameter": "assume_straight_pages",
|
||||
"metric": "WER",
|
||||
"pearson": -0.12980061688173894
|
||||
},
|
||||
{
|
||||
"parameter": "preserve_aspect_ratio",
|
||||
"metric": "WER",
|
||||
"pearson": 0.1289984520450311
|
||||
},
|
||||
{
|
||||
"parameter": "resolve_lines",
|
||||
"metric": "WER",
|
||||
"pearson": 0.06486504638919248
|
||||
}
|
||||
]
|
||||
25
src/results/correlations/easyocr_correlations.csv
Normal file
25
src/results/correlations/easyocr_correlations.csv
Normal file
@@ -0,0 +1,25 @@
|
||||
parameter,metric,pearson
|
||||
contrast_ths,CER,0.40885606429688176
|
||||
ycenter_ths,CER,0.3052506223332593
|
||||
slope_ths,CER,-0.3007836023513022
|
||||
width_ths,CER,0.2820622927402215
|
||||
beamWidth,CER,0.24551397803020547
|
||||
add_margin,CER,0.23419721611930053
|
||||
height_ths,CER,-0.22762064209067434
|
||||
low_text,CER,-0.2127914870290739
|
||||
adjust_contrast,CER,-0.14938336246670267
|
||||
text_threshold,CER,-0.12089385052834749
|
||||
link_threshold,CER,-0.09553910777087017
|
||||
min_size,CER,0.04597421017746789
|
||||
contrast_ths,WER,0.38684315732406244
|
||||
slope_ths,WER,-0.32504213142289984
|
||||
ycenter_ths,WER,0.28738383141120705
|
||||
add_margin,WER,0.24148232685944232
|
||||
width_ths,WER,0.23869424685132606
|
||||
height_ths,WER,-0.23743923240967893
|
||||
beamWidth,WER,0.2286079838179018
|
||||
adjust_contrast,WER,-0.17842640649533945
|
||||
low_text,WER,-0.1772397092408802
|
||||
text_threshold,WER,-0.12733512825321042
|
||||
min_size,WER,0.10070615378426818
|
||||
link_threshold,WER,-0.04425190559911718
|
||||
|
122
src/results/correlations/easyocr_correlations.json
Normal file
122
src/results/correlations/easyocr_correlations.json
Normal file
@@ -0,0 +1,122 @@
|
||||
[
|
||||
{
|
||||
"parameter": "contrast_ths",
|
||||
"metric": "CER",
|
||||
"pearson": 0.40885606429688176
|
||||
},
|
||||
{
|
||||
"parameter": "ycenter_ths",
|
||||
"metric": "CER",
|
||||
"pearson": 0.3052506223332593
|
||||
},
|
||||
{
|
||||
"parameter": "slope_ths",
|
||||
"metric": "CER",
|
||||
"pearson": -0.3007836023513022
|
||||
},
|
||||
{
|
||||
"parameter": "width_ths",
|
||||
"metric": "CER",
|
||||
"pearson": 0.2820622927402215
|
||||
},
|
||||
{
|
||||
"parameter": "beamWidth",
|
||||
"metric": "CER",
|
||||
"pearson": 0.24551397803020547
|
||||
},
|
||||
{
|
||||
"parameter": "add_margin",
|
||||
"metric": "CER",
|
||||
"pearson": 0.23419721611930053
|
||||
},
|
||||
{
|
||||
"parameter": "height_ths",
|
||||
"metric": "CER",
|
||||
"pearson": -0.22762064209067434
|
||||
},
|
||||
{
|
||||
"parameter": "low_text",
|
||||
"metric": "CER",
|
||||
"pearson": -0.2127914870290739
|
||||
},
|
||||
{
|
||||
"parameter": "adjust_contrast",
|
||||
"metric": "CER",
|
||||
"pearson": -0.14938336246670267
|
||||
},
|
||||
{
|
||||
"parameter": "text_threshold",
|
||||
"metric": "CER",
|
||||
"pearson": -0.12089385052834749
|
||||
},
|
||||
{
|
||||
"parameter": "link_threshold",
|
||||
"metric": "CER",
|
||||
"pearson": -0.09553910777087017
|
||||
},
|
||||
{
|
||||
"parameter": "min_size",
|
||||
"metric": "CER",
|
||||
"pearson": 0.04597421017746789
|
||||
},
|
||||
{
|
||||
"parameter": "contrast_ths",
|
||||
"metric": "WER",
|
||||
"pearson": 0.38684315732406244
|
||||
},
|
||||
{
|
||||
"parameter": "slope_ths",
|
||||
"metric": "WER",
|
||||
"pearson": -0.32504213142289984
|
||||
},
|
||||
{
|
||||
"parameter": "ycenter_ths",
|
||||
"metric": "WER",
|
||||
"pearson": 0.28738383141120705
|
||||
},
|
||||
{
|
||||
"parameter": "add_margin",
|
||||
"metric": "WER",
|
||||
"pearson": 0.24148232685944232
|
||||
},
|
||||
{
|
||||
"parameter": "width_ths",
|
||||
"metric": "WER",
|
||||
"pearson": 0.23869424685132606
|
||||
},
|
||||
{
|
||||
"parameter": "height_ths",
|
||||
"metric": "WER",
|
||||
"pearson": -0.23743923240967893
|
||||
},
|
||||
{
|
||||
"parameter": "beamWidth",
|
||||
"metric": "WER",
|
||||
"pearson": 0.2286079838179018
|
||||
},
|
||||
{
|
||||
"parameter": "adjust_contrast",
|
||||
"metric": "WER",
|
||||
"pearson": -0.17842640649533945
|
||||
},
|
||||
{
|
||||
"parameter": "low_text",
|
||||
"metric": "WER",
|
||||
"pearson": -0.1772397092408802
|
||||
},
|
||||
{
|
||||
"parameter": "text_threshold",
|
||||
"metric": "WER",
|
||||
"pearson": -0.12733512825321042
|
||||
},
|
||||
{
|
||||
"parameter": "min_size",
|
||||
"metric": "WER",
|
||||
"pearson": 0.10070615378426818
|
||||
},
|
||||
{
|
||||
"parameter": "link_threshold",
|
||||
"metric": "WER",
|
||||
"pearson": -0.04425190559911718
|
||||
}
|
||||
]
|
||||
15
src/results/correlations/paddle_correlations.csv
Normal file
15
src/results/correlations/paddle_correlations.csv
Normal file
@@ -0,0 +1,15 @@
|
||||
parameter,metric,pearson
|
||||
use_doc_unwarping,CER,0.8791236551817551
|
||||
use_doc_orientation_classify,CER,-0.7119850615039771
|
||||
textline_orientation,CER,-0.5347452891182014
|
||||
text_det_thresh,CER,0.4280438958428758
|
||||
text_det_box_thresh,CER,0.3113152196833144
|
||||
text_rec_score_thresh,CER,-0.2681957118190106
|
||||
text_det_unclip_ratio,CER,
|
||||
use_doc_unwarping,WER,0.743651897463081
|
||||
use_doc_orientation_classify,WER,-0.6018981292243886
|
||||
textline_orientation,WER,-0.5906753653336065
|
||||
text_det_thresh,WER,0.39917807081409956
|
||||
text_det_box_thresh,WER,0.2555315418488065
|
||||
text_rec_score_thresh,WER,-0.08030912963602418
|
||||
text_det_unclip_ratio,WER,
|
||||
|
72
src/results/correlations/paddle_correlations.json
Normal file
72
src/results/correlations/paddle_correlations.json
Normal file
@@ -0,0 +1,72 @@
|
||||
[
|
||||
{
|
||||
"parameter": "use_doc_unwarping",
|
||||
"metric": "CER",
|
||||
"pearson": 0.8791236551817551
|
||||
},
|
||||
{
|
||||
"parameter": "use_doc_orientation_classify",
|
||||
"metric": "CER",
|
||||
"pearson": -0.7119850615039771
|
||||
},
|
||||
{
|
||||
"parameter": "textline_orientation",
|
||||
"metric": "CER",
|
||||
"pearson": -0.5347452891182014
|
||||
},
|
||||
{
|
||||
"parameter": "text_det_thresh",
|
||||
"metric": "CER",
|
||||
"pearson": 0.4280438958428758
|
||||
},
|
||||
{
|
||||
"parameter": "text_det_box_thresh",
|
||||
"metric": "CER",
|
||||
"pearson": 0.3113152196833144
|
||||
},
|
||||
{
|
||||
"parameter": "text_rec_score_thresh",
|
||||
"metric": "CER",
|
||||
"pearson": -0.2681957118190106
|
||||
},
|
||||
{
|
||||
"parameter": "text_det_unclip_ratio",
|
||||
"metric": "CER",
|
||||
"pearson": NaN
|
||||
},
|
||||
{
|
||||
"parameter": "use_doc_unwarping",
|
||||
"metric": "WER",
|
||||
"pearson": 0.743651897463081
|
||||
},
|
||||
{
|
||||
"parameter": "use_doc_orientation_classify",
|
||||
"metric": "WER",
|
||||
"pearson": -0.6018981292243886
|
||||
},
|
||||
{
|
||||
"parameter": "textline_orientation",
|
||||
"metric": "WER",
|
||||
"pearson": -0.5906753653336065
|
||||
},
|
||||
{
|
||||
"parameter": "text_det_thresh",
|
||||
"metric": "WER",
|
||||
"pearson": 0.39917807081409956
|
||||
},
|
||||
{
|
||||
"parameter": "text_det_box_thresh",
|
||||
"metric": "WER",
|
||||
"pearson": 0.2555315418488065
|
||||
},
|
||||
{
|
||||
"parameter": "text_rec_score_thresh",
|
||||
"metric": "WER",
|
||||
"pearson": -0.08030912963602418
|
||||
},
|
||||
{
|
||||
"parameter": "text_det_unclip_ratio",
|
||||
"metric": "WER",
|
||||
"pearson": NaN
|
||||
}
|
||||
]
|
||||
Reference in New Issue
Block a user