correlations
All checks were successful
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 5m47s
build_docker / build_paddle_ocr_gpu (push) Successful in 22m8s
build_docker / build_easyocr (push) Successful in 18m3s
build_docker / build_easyocr_gpu (push) Successful in 20m9s
build_docker / build_doctr (push) Successful in 19m40s
build_docker / build_raytune (push) Successful in 3m24s
build_docker / build_doctr_gpu (push) Successful in 15m35s

This commit is contained in:
2026-01-24 16:48:47 +01:00
parent 4c299cc00f
commit d384f1e4d3
7 changed files with 369 additions and 11 deletions

View File

@@ -97,7 +97,7 @@ def extract_figure_title_from_mermaid(lines, current_index):
return None
def parse_md_to_html_blocks(md_content):
def parse_md_to_html_blocks(md_content, is_anexo=False):
"""Convert markdown content to HTML blocks with template styles."""
global table_counter, figure_counter
@@ -142,7 +142,8 @@ def parse_md_to_html_blocks(md_content):
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
# Word TOC looks for text with Caption style - anchor must be outside main caption text
bookmark_id = f"_Ref_Fig{figure_counter}"
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
if os.path.exists(fig_path):
# Read actual image dimensions and scale to fit page width
@@ -162,10 +163,12 @@ def parse_md_to_html_blocks(md_content):
w_pt = new_w * 0.75
h_pt = new_h * 0.75
html_blocks.append(f'''<p class=MsoNormal style="text-align:center"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
# mso-pagination:keep-with-next ensures image stays with source line
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
else:
# Fallback to placeholder
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
# mso-pagination:keep-with-next ensures placeholder stays with source line
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
# Check if next non-empty line has custom Fuente
custom_source = None
@@ -218,11 +221,21 @@ def parse_md_to_html_blocks(md_content):
continue
elif line.startswith('###'):
text = line.lstrip('#').strip()
# Disable auto-numbering for Anexo content or A.x headings
if is_anexo or re.match(r'^A\.\d+', text):
# mso-list:none explicitly disables inherited list numbering from template CSS
html_blocks.append(f'<h3 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h3>')
else:
html_blocks.append(f'<h3 style="mso-list:l22 level3 lfo18"><span lang=ES style="text-transform:none">{text}</span></h3>')
i += 1
continue
elif line.startswith('##'):
text = line.lstrip('#').strip()
# Disable auto-numbering for Anexo content or A.x headings
if is_anexo or re.match(r'^A\.\d+', text):
# mso-list:none explicitly disables inherited list numbering from template CSS
html_blocks.append(f'<h2 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h2>')
else:
html_blocks.append(f'<h2 style="mso-list:l22 level2 lfo18"><span lang=ES style="text-transform:none">{text}</span></h2>')
i += 1
continue
@@ -277,10 +290,10 @@ def parse_md_to_html_blocks(md_content):
clean_title = alt_title
else:
clean_title = "Tabla de datos."
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
# Build table HTML with APA style (horizontal lines only, no vertical)
# Wrap in centered div for Word compatibility
table_html = '<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
for j, tline in enumerate(table_lines):
cells = [c.strip() for c in tline.split('|')[1:-1]]
@@ -365,10 +378,10 @@ def parse_md_to_html_blocks(md_content):
return '\n\n'.join(html_blocks)
def extract_section_content(md_content):
def extract_section_content(md_content, is_anexo=False):
"""Extract content from markdown, skipping the first # header."""
md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
return parse_md_to_html_blocks(md_content)
return parse_md_to_html_blocks(md_content, is_anexo=is_anexo)
def find_section_element(soup, keyword):
"""Find element containing keyword (h1 or special paragraph classes)."""
@@ -672,7 +685,7 @@ def main():
current.extract()
current = next_elem
anexo_content = extract_section_content(docs['anexo'])
anexo_content = extract_section_content(docs['anexo'], is_anexo=True)
anexo_soup = BeautifulSoup(anexo_content, 'html.parser')
insert_point = anexo_elem
for new_elem in reversed(list(anexo_soup.children)):

View File

@@ -0,0 +1,19 @@
parameter,metric,pearson
straighten_pages,CER,0.9998131749398365
symmetric_pad,CER,0.5261495908818205
disable_page_orientation,CER,-0.49338806188002227
disable_crop_orientation,CER,0.3470400052300961
paragraph_break,CER,-0.28710190683729514
resolve_blocks,CER,-0.2307325611083525
preserve_aspect_ratio,CER,0.12408787013954682
assume_straight_pages,CER,-0.11904725618816066
resolve_lines,CER,0.060358529330885535
straighten_pages,WER,0.9997423603542454
symmetric_pad,WER,0.5288569133407047
disable_page_orientation,WER,-0.49788185337361895
disable_crop_orientation,WER,0.35150041517456027
paragraph_break,WER,-0.2867204522845194
resolve_blocks,WER,-0.23363320696739256
assume_straight_pages,WER,-0.12980061688173894
preserve_aspect_ratio,WER,0.1289984520450311
resolve_lines,WER,0.06486504638919248
1 parameter metric pearson
2 straighten_pages CER 0.9998131749398365
3 symmetric_pad CER 0.5261495908818205
4 disable_page_orientation CER -0.49338806188002227
5 disable_crop_orientation CER 0.3470400052300961
6 paragraph_break CER -0.28710190683729514
7 resolve_blocks CER -0.2307325611083525
8 preserve_aspect_ratio CER 0.12408787013954682
9 assume_straight_pages CER -0.11904725618816066
10 resolve_lines CER 0.060358529330885535
11 straighten_pages WER 0.9997423603542454
12 symmetric_pad WER 0.5288569133407047
13 disable_page_orientation WER -0.49788185337361895
14 disable_crop_orientation WER 0.35150041517456027
15 paragraph_break WER -0.2867204522845194
16 resolve_blocks WER -0.23363320696739256
17 assume_straight_pages WER -0.12980061688173894
18 preserve_aspect_ratio WER 0.1289984520450311
19 resolve_lines WER 0.06486504638919248

View File

@@ -0,0 +1,92 @@
[
{
"parameter": "straighten_pages",
"metric": "CER",
"pearson": 0.9998131749398365
},
{
"parameter": "symmetric_pad",
"metric": "CER",
"pearson": 0.5261495908818205
},
{
"parameter": "disable_page_orientation",
"metric": "CER",
"pearson": -0.49338806188002227
},
{
"parameter": "disable_crop_orientation",
"metric": "CER",
"pearson": 0.3470400052300961
},
{
"parameter": "paragraph_break",
"metric": "CER",
"pearson": -0.28710190683729514
},
{
"parameter": "resolve_blocks",
"metric": "CER",
"pearson": -0.2307325611083525
},
{
"parameter": "preserve_aspect_ratio",
"metric": "CER",
"pearson": 0.12408787013954682
},
{
"parameter": "assume_straight_pages",
"metric": "CER",
"pearson": -0.11904725618816066
},
{
"parameter": "resolve_lines",
"metric": "CER",
"pearson": 0.060358529330885535
},
{
"parameter": "straighten_pages",
"metric": "WER",
"pearson": 0.9997423603542454
},
{
"parameter": "symmetric_pad",
"metric": "WER",
"pearson": 0.5288569133407047
},
{
"parameter": "disable_page_orientation",
"metric": "WER",
"pearson": -0.49788185337361895
},
{
"parameter": "disable_crop_orientation",
"metric": "WER",
"pearson": 0.35150041517456027
},
{
"parameter": "paragraph_break",
"metric": "WER",
"pearson": -0.2867204522845194
},
{
"parameter": "resolve_blocks",
"metric": "WER",
"pearson": -0.23363320696739256
},
{
"parameter": "assume_straight_pages",
"metric": "WER",
"pearson": -0.12980061688173894
},
{
"parameter": "preserve_aspect_ratio",
"metric": "WER",
"pearson": 0.1289984520450311
},
{
"parameter": "resolve_lines",
"metric": "WER",
"pearson": 0.06486504638919248
}
]

View File

@@ -0,0 +1,25 @@
parameter,metric,pearson
contrast_ths,CER,0.40885606429688176
ycenter_ths,CER,0.3052506223332593
slope_ths,CER,-0.3007836023513022
width_ths,CER,0.2820622927402215
beamWidth,CER,0.24551397803020547
add_margin,CER,0.23419721611930053
height_ths,CER,-0.22762064209067434
low_text,CER,-0.2127914870290739
adjust_contrast,CER,-0.14938336246670267
text_threshold,CER,-0.12089385052834749
link_threshold,CER,-0.09553910777087017
min_size,CER,0.04597421017746789
contrast_ths,WER,0.38684315732406244
slope_ths,WER,-0.32504213142289984
ycenter_ths,WER,0.28738383141120705
add_margin,WER,0.24148232685944232
width_ths,WER,0.23869424685132606
height_ths,WER,-0.23743923240967893
beamWidth,WER,0.2286079838179018
adjust_contrast,WER,-0.17842640649533945
low_text,WER,-0.1772397092408802
text_threshold,WER,-0.12733512825321042
min_size,WER,0.10070615378426818
link_threshold,WER,-0.04425190559911718
1 parameter metric pearson
2 contrast_ths CER 0.40885606429688176
3 ycenter_ths CER 0.3052506223332593
4 slope_ths CER -0.3007836023513022
5 width_ths CER 0.2820622927402215
6 beamWidth CER 0.24551397803020547
7 add_margin CER 0.23419721611930053
8 height_ths CER -0.22762064209067434
9 low_text CER -0.2127914870290739
10 adjust_contrast CER -0.14938336246670267
11 text_threshold CER -0.12089385052834749
12 link_threshold CER -0.09553910777087017
13 min_size CER 0.04597421017746789
14 contrast_ths WER 0.38684315732406244
15 slope_ths WER -0.32504213142289984
16 ycenter_ths WER 0.28738383141120705
17 add_margin WER 0.24148232685944232
18 width_ths WER 0.23869424685132606
19 height_ths WER -0.23743923240967893
20 beamWidth WER 0.2286079838179018
21 adjust_contrast WER -0.17842640649533945
22 low_text WER -0.1772397092408802
23 text_threshold WER -0.12733512825321042
24 min_size WER 0.10070615378426818
25 link_threshold WER -0.04425190559911718

View File

@@ -0,0 +1,122 @@
[
{
"parameter": "contrast_ths",
"metric": "CER",
"pearson": 0.40885606429688176
},
{
"parameter": "ycenter_ths",
"metric": "CER",
"pearson": 0.3052506223332593
},
{
"parameter": "slope_ths",
"metric": "CER",
"pearson": -0.3007836023513022
},
{
"parameter": "width_ths",
"metric": "CER",
"pearson": 0.2820622927402215
},
{
"parameter": "beamWidth",
"metric": "CER",
"pearson": 0.24551397803020547
},
{
"parameter": "add_margin",
"metric": "CER",
"pearson": 0.23419721611930053
},
{
"parameter": "height_ths",
"metric": "CER",
"pearson": -0.22762064209067434
},
{
"parameter": "low_text",
"metric": "CER",
"pearson": -0.2127914870290739
},
{
"parameter": "adjust_contrast",
"metric": "CER",
"pearson": -0.14938336246670267
},
{
"parameter": "text_threshold",
"metric": "CER",
"pearson": -0.12089385052834749
},
{
"parameter": "link_threshold",
"metric": "CER",
"pearson": -0.09553910777087017
},
{
"parameter": "min_size",
"metric": "CER",
"pearson": 0.04597421017746789
},
{
"parameter": "contrast_ths",
"metric": "WER",
"pearson": 0.38684315732406244
},
{
"parameter": "slope_ths",
"metric": "WER",
"pearson": -0.32504213142289984
},
{
"parameter": "ycenter_ths",
"metric": "WER",
"pearson": 0.28738383141120705
},
{
"parameter": "add_margin",
"metric": "WER",
"pearson": 0.24148232685944232
},
{
"parameter": "width_ths",
"metric": "WER",
"pearson": 0.23869424685132606
},
{
"parameter": "height_ths",
"metric": "WER",
"pearson": -0.23743923240967893
},
{
"parameter": "beamWidth",
"metric": "WER",
"pearson": 0.2286079838179018
},
{
"parameter": "adjust_contrast",
"metric": "WER",
"pearson": -0.17842640649533945
},
{
"parameter": "low_text",
"metric": "WER",
"pearson": -0.1772397092408802
},
{
"parameter": "text_threshold",
"metric": "WER",
"pearson": -0.12733512825321042
},
{
"parameter": "min_size",
"metric": "WER",
"pearson": 0.10070615378426818
},
{
"parameter": "link_threshold",
"metric": "WER",
"pearson": -0.04425190559911718
}
]

View File

@@ -0,0 +1,15 @@
parameter,metric,pearson
use_doc_unwarping,CER,0.8791236551817551
use_doc_orientation_classify,CER,-0.7119850615039771
textline_orientation,CER,-0.5347452891182014
text_det_thresh,CER,0.4280438958428758
text_det_box_thresh,CER,0.3113152196833144
text_rec_score_thresh,CER,-0.2681957118190106
text_det_unclip_ratio,CER,
use_doc_unwarping,WER,0.743651897463081
use_doc_orientation_classify,WER,-0.6018981292243886
textline_orientation,WER,-0.5906753653336065
text_det_thresh,WER,0.39917807081409956
text_det_box_thresh,WER,0.2555315418488065
text_rec_score_thresh,WER,-0.08030912963602418
text_det_unclip_ratio,WER,
1 parameter metric pearson
2 use_doc_unwarping CER 0.8791236551817551
3 use_doc_orientation_classify CER -0.7119850615039771
4 textline_orientation CER -0.5347452891182014
5 text_det_thresh CER 0.4280438958428758
6 text_det_box_thresh CER 0.3113152196833144
7 text_rec_score_thresh CER -0.2681957118190106
8 text_det_unclip_ratio CER
9 use_doc_unwarping WER 0.743651897463081
10 use_doc_orientation_classify WER -0.6018981292243886
11 textline_orientation WER -0.5906753653336065
12 text_det_thresh WER 0.39917807081409956
13 text_det_box_thresh WER 0.2555315418488065
14 text_rec_score_thresh WER -0.08030912963602418
15 text_det_unclip_ratio WER

View File

@@ -0,0 +1,72 @@
[
{
"parameter": "use_doc_unwarping",
"metric": "CER",
"pearson": 0.8791236551817551
},
{
"parameter": "use_doc_orientation_classify",
"metric": "CER",
"pearson": -0.7119850615039771
},
{
"parameter": "textline_orientation",
"metric": "CER",
"pearson": -0.5347452891182014
},
{
"parameter": "text_det_thresh",
"metric": "CER",
"pearson": 0.4280438958428758
},
{
"parameter": "text_det_box_thresh",
"metric": "CER",
"pearson": 0.3113152196833144
},
{
"parameter": "text_rec_score_thresh",
"metric": "CER",
"pearson": -0.2681957118190106
},
{
"parameter": "text_det_unclip_ratio",
"metric": "CER",
"pearson": NaN
},
{
"parameter": "use_doc_unwarping",
"metric": "WER",
"pearson": 0.743651897463081
},
{
"parameter": "use_doc_orientation_classify",
"metric": "WER",
"pearson": -0.6018981292243886
},
{
"parameter": "textline_orientation",
"metric": "WER",
"pearson": -0.5906753653336065
},
{
"parameter": "text_det_thresh",
"metric": "WER",
"pearson": 0.39917807081409956
},
{
"parameter": "text_det_box_thresh",
"metric": "WER",
"pearson": 0.2555315418488065
},
{
"parameter": "text_rec_score_thresh",
"metric": "WER",
"pearson": -0.08030912963602418
},
{
"parameter": "text_det_unclip_ratio",
"metric": "WER",
"pearson": NaN
}
]