diff --git a/apply_content.py b/apply_content.py
index f2b7ed3..38f279c 100644
--- a/apply_content.py
+++ b/apply_content.py
@@ -97,7 +97,7 @@ def extract_figure_title_from_mermaid(lines, current_index):
return None
-def parse_md_to_html_blocks(md_content):
+def parse_md_to_html_blocks(md_content, is_anexo=False):
"""Convert markdown content to HTML blocks with template styles."""
global table_counter, figure_counter
@@ -142,7 +142,8 @@ def parse_md_to_html_blocks(md_content):
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
# Word TOC looks for text with Caption style - anchor must be outside main caption text
bookmark_id = f"_Ref_Fig{figure_counter}"
- html_blocks.append(f'''
Figura {figure_counter}. {fig_title}
''')
+ # mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
+ html_blocks.append(f'''Figura {figure_counter}. {fig_title}
''')
if os.path.exists(fig_path):
# Read actual image dimensions and scale to fit page width
@@ -162,10 +163,12 @@ def parse_md_to_html_blocks(md_content):
w_pt = new_w * 0.75
h_pt = new_h * 0.75
- html_blocks.append(f'''
''')
+ # mso-pagination:keep-with-next ensures image stays with source line
+ html_blocks.append(f'''
''')
else:
# Fallback to placeholder
- html_blocks.append(f'''[Insertar diagrama Mermaid aquí]
''')
+ # mso-pagination:keep-with-next ensures placeholder stays with source line
+ html_blocks.append(f'''[Insertar diagrama Mermaid aquí]
''')
# Check if next non-empty line has custom Fuente
custom_source = None
@@ -218,12 +221,22 @@ def parse_md_to_html_blocks(md_content):
continue
elif line.startswith('###'):
text = line.lstrip('#').strip()
- html_blocks.append(f'{text}
')
+ # Disable auto-numbering for Anexo content or A.x headings
+ if is_anexo or re.match(r'^A\.\d+', text):
+ # mso-list:none explicitly disables inherited list numbering from template CSS
+ html_blocks.append(f'{text}
')
+ else:
+ html_blocks.append(f'{text}
')
i += 1
continue
elif line.startswith('##'):
text = line.lstrip('#').strip()
- html_blocks.append(f'{text}
')
+ # Disable auto-numbering for Anexo content or A.x headings
+ if is_anexo or re.match(r'^A\.\d+', text):
+ # mso-list:none explicitly disables inherited list numbering from template CSS
+ html_blocks.append(f'{text}
')
+ else:
+ html_blocks.append(f'{text}
')
i += 1
continue
elif line.startswith('#'):
@@ -277,10 +290,10 @@ def parse_md_to_html_blocks(md_content):
clean_title = alt_title
else:
clean_title = "Tabla de datos."
- html_blocks.append(f'''Tabla {table_counter}. {clean_title}
''')
+ # mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
+ html_blocks.append(f'''Tabla {table_counter}. {clean_title}
''')
# Build table HTML with APA style (horizontal lines only, no vertical)
- # Wrap in centered div for Word compatibility
table_html = ''
for j, tline in enumerate(table_lines):
cells = [c.strip() for c in tline.split('|')[1:-1]]
@@ -365,10 +378,10 @@ def parse_md_to_html_blocks(md_content):
return '\n\n'.join(html_blocks)
-def extract_section_content(md_content):
+def extract_section_content(md_content, is_anexo=False):
"""Extract content from markdown, skipping the first # header."""
md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
- return parse_md_to_html_blocks(md_content)
+ return parse_md_to_html_blocks(md_content, is_anexo=is_anexo)
def find_section_element(soup, keyword):
"""Find element containing keyword (h1 or special paragraph classes)."""
@@ -672,7 +685,7 @@ def main():
current.extract()
current = next_elem
- anexo_content = extract_section_content(docs['anexo'])
+ anexo_content = extract_section_content(docs['anexo'], is_anexo=True)
anexo_soup = BeautifulSoup(anexo_content, 'html.parser')
insert_point = anexo_elem
for new_elem in reversed(list(anexo_soup.children)):
diff --git a/src/results/correlations/doctr_correlations.csv b/src/results/correlations/doctr_correlations.csv
new file mode 100644
index 0000000..54ae5fd
--- /dev/null
+++ b/src/results/correlations/doctr_correlations.csv
@@ -0,0 +1,19 @@
+parameter,metric,pearson
+straighten_pages,CER,0.9998131749398365
+symmetric_pad,CER,0.5261495908818205
+disable_page_orientation,CER,-0.49338806188002227
+disable_crop_orientation,CER,0.3470400052300961
+paragraph_break,CER,-0.28710190683729514
+resolve_blocks,CER,-0.2307325611083525
+preserve_aspect_ratio,CER,0.12408787013954682
+assume_straight_pages,CER,-0.11904725618816066
+resolve_lines,CER,0.060358529330885535
+straighten_pages,WER,0.9997423603542454
+symmetric_pad,WER,0.5288569133407047
+disable_page_orientation,WER,-0.49788185337361895
+disable_crop_orientation,WER,0.35150041517456027
+paragraph_break,WER,-0.2867204522845194
+resolve_blocks,WER,-0.23363320696739256
+assume_straight_pages,WER,-0.12980061688173894
+preserve_aspect_ratio,WER,0.1289984520450311
+resolve_lines,WER,0.06486504638919248
diff --git a/src/results/correlations/doctr_correlations.json b/src/results/correlations/doctr_correlations.json
new file mode 100644
index 0000000..8413366
--- /dev/null
+++ b/src/results/correlations/doctr_correlations.json
@@ -0,0 +1,92 @@
+[
+ {
+ "parameter": "straighten_pages",
+ "metric": "CER",
+ "pearson": 0.9998131749398365
+ },
+ {
+ "parameter": "symmetric_pad",
+ "metric": "CER",
+ "pearson": 0.5261495908818205
+ },
+ {
+ "parameter": "disable_page_orientation",
+ "metric": "CER",
+ "pearson": -0.49338806188002227
+ },
+ {
+ "parameter": "disable_crop_orientation",
+ "metric": "CER",
+ "pearson": 0.3470400052300961
+ },
+ {
+ "parameter": "paragraph_break",
+ "metric": "CER",
+ "pearson": -0.28710190683729514
+ },
+ {
+ "parameter": "resolve_blocks",
+ "metric": "CER",
+ "pearson": -0.2307325611083525
+ },
+ {
+ "parameter": "preserve_aspect_ratio",
+ "metric": "CER",
+ "pearson": 0.12408787013954682
+ },
+ {
+ "parameter": "assume_straight_pages",
+ "metric": "CER",
+ "pearson": -0.11904725618816066
+ },
+ {
+ "parameter": "resolve_lines",
+ "metric": "CER",
+ "pearson": 0.060358529330885535
+ },
+ {
+ "parameter": "straighten_pages",
+ "metric": "WER",
+ "pearson": 0.9997423603542454
+ },
+ {
+ "parameter": "symmetric_pad",
+ "metric": "WER",
+ "pearson": 0.5288569133407047
+ },
+ {
+ "parameter": "disable_page_orientation",
+ "metric": "WER",
+ "pearson": -0.49788185337361895
+ },
+ {
+ "parameter": "disable_crop_orientation",
+ "metric": "WER",
+ "pearson": 0.35150041517456027
+ },
+ {
+ "parameter": "paragraph_break",
+ "metric": "WER",
+ "pearson": -0.2867204522845194
+ },
+ {
+ "parameter": "resolve_blocks",
+ "metric": "WER",
+ "pearson": -0.23363320696739256
+ },
+ {
+ "parameter": "assume_straight_pages",
+ "metric": "WER",
+ "pearson": -0.12980061688173894
+ },
+ {
+ "parameter": "preserve_aspect_ratio",
+ "metric": "WER",
+ "pearson": 0.1289984520450311
+ },
+ {
+ "parameter": "resolve_lines",
+ "metric": "WER",
+ "pearson": 0.06486504638919248
+ }
+]
\ No newline at end of file
diff --git a/src/results/correlations/easyocr_correlations.csv b/src/results/correlations/easyocr_correlations.csv
new file mode 100644
index 0000000..0a59fee
--- /dev/null
+++ b/src/results/correlations/easyocr_correlations.csv
@@ -0,0 +1,25 @@
+parameter,metric,pearson
+contrast_ths,CER,0.40885606429688176
+ycenter_ths,CER,0.3052506223332593
+slope_ths,CER,-0.3007836023513022
+width_ths,CER,0.2820622927402215
+beamWidth,CER,0.24551397803020547
+add_margin,CER,0.23419721611930053
+height_ths,CER,-0.22762064209067434
+low_text,CER,-0.2127914870290739
+adjust_contrast,CER,-0.14938336246670267
+text_threshold,CER,-0.12089385052834749
+link_threshold,CER,-0.09553910777087017
+min_size,CER,0.04597421017746789
+contrast_ths,WER,0.38684315732406244
+slope_ths,WER,-0.32504213142289984
+ycenter_ths,WER,0.28738383141120705
+add_margin,WER,0.24148232685944232
+width_ths,WER,0.23869424685132606
+height_ths,WER,-0.23743923240967893
+beamWidth,WER,0.2286079838179018
+adjust_contrast,WER,-0.17842640649533945
+low_text,WER,-0.1772397092408802
+text_threshold,WER,-0.12733512825321042
+min_size,WER,0.10070615378426818
+link_threshold,WER,-0.04425190559911718
diff --git a/src/results/correlations/easyocr_correlations.json b/src/results/correlations/easyocr_correlations.json
new file mode 100644
index 0000000..f9a442b
--- /dev/null
+++ b/src/results/correlations/easyocr_correlations.json
@@ -0,0 +1,122 @@
+[
+ {
+ "parameter": "contrast_ths",
+ "metric": "CER",
+ "pearson": 0.40885606429688176
+ },
+ {
+ "parameter": "ycenter_ths",
+ "metric": "CER",
+ "pearson": 0.3052506223332593
+ },
+ {
+ "parameter": "slope_ths",
+ "metric": "CER",
+ "pearson": -0.3007836023513022
+ },
+ {
+ "parameter": "width_ths",
+ "metric": "CER",
+ "pearson": 0.2820622927402215
+ },
+ {
+ "parameter": "beamWidth",
+ "metric": "CER",
+ "pearson": 0.24551397803020547
+ },
+ {
+ "parameter": "add_margin",
+ "metric": "CER",
+ "pearson": 0.23419721611930053
+ },
+ {
+ "parameter": "height_ths",
+ "metric": "CER",
+ "pearson": -0.22762064209067434
+ },
+ {
+ "parameter": "low_text",
+ "metric": "CER",
+ "pearson": -0.2127914870290739
+ },
+ {
+ "parameter": "adjust_contrast",
+ "metric": "CER",
+ "pearson": -0.14938336246670267
+ },
+ {
+ "parameter": "text_threshold",
+ "metric": "CER",
+ "pearson": -0.12089385052834749
+ },
+ {
+ "parameter": "link_threshold",
+ "metric": "CER",
+ "pearson": -0.09553910777087017
+ },
+ {
+ "parameter": "min_size",
+ "metric": "CER",
+ "pearson": 0.04597421017746789
+ },
+ {
+ "parameter": "contrast_ths",
+ "metric": "WER",
+ "pearson": 0.38684315732406244
+ },
+ {
+ "parameter": "slope_ths",
+ "metric": "WER",
+ "pearson": -0.32504213142289984
+ },
+ {
+ "parameter": "ycenter_ths",
+ "metric": "WER",
+ "pearson": 0.28738383141120705
+ },
+ {
+ "parameter": "add_margin",
+ "metric": "WER",
+ "pearson": 0.24148232685944232
+ },
+ {
+ "parameter": "width_ths",
+ "metric": "WER",
+ "pearson": 0.23869424685132606
+ },
+ {
+ "parameter": "height_ths",
+ "metric": "WER",
+ "pearson": -0.23743923240967893
+ },
+ {
+ "parameter": "beamWidth",
+ "metric": "WER",
+ "pearson": 0.2286079838179018
+ },
+ {
+ "parameter": "adjust_contrast",
+ "metric": "WER",
+ "pearson": -0.17842640649533945
+ },
+ {
+ "parameter": "low_text",
+ "metric": "WER",
+ "pearson": -0.1772397092408802
+ },
+ {
+ "parameter": "text_threshold",
+ "metric": "WER",
+ "pearson": -0.12733512825321042
+ },
+ {
+ "parameter": "min_size",
+ "metric": "WER",
+ "pearson": 0.10070615378426818
+ },
+ {
+ "parameter": "link_threshold",
+ "metric": "WER",
+ "pearson": -0.04425190559911718
+ }
+]
\ No newline at end of file
diff --git a/src/results/correlations/paddle_correlations.csv b/src/results/correlations/paddle_correlations.csv
new file mode 100644
index 0000000..e0a754d
--- /dev/null
+++ b/src/results/correlations/paddle_correlations.csv
@@ -0,0 +1,15 @@
+parameter,metric,pearson
+use_doc_unwarping,CER,0.8791236551817551
+use_doc_orientation_classify,CER,-0.7119850615039771
+textline_orientation,CER,-0.5347452891182014
+text_det_thresh,CER,0.4280438958428758
+text_det_box_thresh,CER,0.3113152196833144
+text_rec_score_thresh,CER,-0.2681957118190106
+text_det_unclip_ratio,CER,
+use_doc_unwarping,WER,0.743651897463081
+use_doc_orientation_classify,WER,-0.6018981292243886
+textline_orientation,WER,-0.5906753653336065
+text_det_thresh,WER,0.39917807081409956
+text_det_box_thresh,WER,0.2555315418488065
+text_rec_score_thresh,WER,-0.08030912963602418
+text_det_unclip_ratio,WER,
diff --git a/src/results/correlations/paddle_correlations.json b/src/results/correlations/paddle_correlations.json
new file mode 100644
index 0000000..78223df
--- /dev/null
+++ b/src/results/correlations/paddle_correlations.json
@@ -0,0 +1,72 @@
+[
+ {
+ "parameter": "use_doc_unwarping",
+ "metric": "CER",
+ "pearson": 0.8791236551817551
+ },
+ {
+ "parameter": "use_doc_orientation_classify",
+ "metric": "CER",
+ "pearson": -0.7119850615039771
+ },
+ {
+ "parameter": "textline_orientation",
+ "metric": "CER",
+ "pearson": -0.5347452891182014
+ },
+ {
+ "parameter": "text_det_thresh",
+ "metric": "CER",
+ "pearson": 0.4280438958428758
+ },
+ {
+ "parameter": "text_det_box_thresh",
+ "metric": "CER",
+ "pearson": 0.3113152196833144
+ },
+ {
+ "parameter": "text_rec_score_thresh",
+ "metric": "CER",
+ "pearson": -0.2681957118190106
+ },
+ {
+ "parameter": "text_det_unclip_ratio",
+ "metric": "CER",
+ "pearson": null
+ },
+ {
+ "parameter": "use_doc_unwarping",
+ "metric": "WER",
+ "pearson": 0.743651897463081
+ },
+ {
+ "parameter": "use_doc_orientation_classify",
+ "metric": "WER",
+ "pearson": -0.6018981292243886
+ },
+ {
+ "parameter": "textline_orientation",
+ "metric": "WER",
+ "pearson": -0.5906753653336065
+ },
+ {
+ "parameter": "text_det_thresh",
+ "metric": "WER",
+ "pearson": 0.39917807081409956
+ },
+ {
+ "parameter": "text_det_box_thresh",
+ "metric": "WER",
+ "pearson": 0.2555315418488065
+ },
+ {
+ "parameter": "text_rec_score_thresh",
+ "metric": "WER",
+ "pearson": -0.08030912963602418
+ },
+ {
+ "parameter": "text_det_unclip_ratio",
+ "metric": "WER",
+ "pearson": null
+ }
+]
\ No newline at end of file