diff --git a/apply_content.py b/apply_content.py index f2b7ed3..38f279c 100644 --- a/apply_content.py +++ b/apply_content.py @@ -97,7 +97,7 @@ def extract_figure_title_from_mermaid(lines, current_index): return None -def parse_md_to_html_blocks(md_content): +def parse_md_to_html_blocks(md_content, is_anexo=False): """Convert markdown content to HTML blocks with template styles.""" global table_counter, figure_counter @@ -142,7 +142,8 @@ def parse_md_to_html_blocks(md_content): # Format: "Figura X." in bold, title in italic (per UNIR guidelines) # Word TOC looks for text with Caption style - anchor must be outside main caption text bookmark_id = f"_Ref_Fig{figure_counter}" - html_blocks.append(f'''

Figura {figure_counter}. {fig_title}

''') + # mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property) + html_blocks.append(f'''

Figura {figure_counter}. {fig_title}

''') if os.path.exists(fig_path): # Read actual image dimensions and scale to fit page width @@ -162,10 +163,12 @@ def parse_md_to_html_blocks(md_content): w_pt = new_w * 0.75 h_pt = new_h * 0.75 - html_blocks.append(f'''

{fig_title}

''') + # mso-pagination:keep-with-next ensures image stays with source line + html_blocks.append(f'''

{fig_title}

''') else: # Fallback to placeholder - html_blocks.append(f'''

[Insertar diagrama Mermaid aquí]

''') + # mso-pagination:keep-with-next ensures placeholder stays with source line + html_blocks.append(f'''

[Insertar diagrama Mermaid aquí]

''') # Check if next non-empty line has custom Fuente custom_source = None @@ -218,12 +221,22 @@ def parse_md_to_html_blocks(md_content): continue elif line.startswith('###'): text = line.lstrip('#').strip() - html_blocks.append(f'

{text}

') + # Disable auto-numbering for Anexo content or A.x headings + if is_anexo or re.match(r'^A\.\d+', text): + # mso-list:none explicitly disables inherited list numbering from template CSS + html_blocks.append(f'

{text}

') + else: + html_blocks.append(f'

{text}

') i += 1 continue elif line.startswith('##'): text = line.lstrip('#').strip() - html_blocks.append(f'

{text}

') + # Disable auto-numbering for Anexo content or A.x headings + if is_anexo or re.match(r'^A\.\d+', text): + # mso-list:none explicitly disables inherited list numbering from template CSS + html_blocks.append(f'

{text}

') + else: + html_blocks.append(f'

{text}

') i += 1 continue elif line.startswith('#'): @@ -277,10 +290,10 @@ def parse_md_to_html_blocks(md_content): clean_title = alt_title else: clean_title = "Tabla de datos." - html_blocks.append(f'''

Tabla {table_counter}. {clean_title}

''') + # mso-pagination:keep-with-next ensures caption stays with table (correct MSO property) + html_blocks.append(f'''

Tabla {table_counter}. {clean_title}

''') # Build table HTML with APA style (horizontal lines only, no vertical) - # Wrap in centered div for Word compatibility table_html = '
' for j, tline in enumerate(table_lines): cells = [c.strip() for c in tline.split('|')[1:-1]] @@ -365,10 +378,10 @@ def parse_md_to_html_blocks(md_content): return '\n\n'.join(html_blocks) -def extract_section_content(md_content): +def extract_section_content(md_content, is_anexo=False): """Extract content from markdown, skipping the first # header.""" md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1) - return parse_md_to_html_blocks(md_content) + return parse_md_to_html_blocks(md_content, is_anexo=is_anexo) def find_section_element(soup, keyword): """Find element containing keyword (h1 or special paragraph classes).""" @@ -672,7 +685,7 @@ def main(): current.extract() current = next_elem - anexo_content = extract_section_content(docs['anexo']) + anexo_content = extract_section_content(docs['anexo'], is_anexo=True) anexo_soup = BeautifulSoup(anexo_content, 'html.parser') insert_point = anexo_elem for new_elem in reversed(list(anexo_soup.children)): diff --git a/src/results/correlations/doctr_correlations.csv b/src/results/correlations/doctr_correlations.csv new file mode 100644 index 0000000..54ae5fd --- /dev/null +++ b/src/results/correlations/doctr_correlations.csv @@ -0,0 +1,19 @@ +parameter,metric,pearson +straighten_pages,CER,0.9998131749398365 +symmetric_pad,CER,0.5261495908818205 +disable_page_orientation,CER,-0.49338806188002227 +disable_crop_orientation,CER,0.3470400052300961 +paragraph_break,CER,-0.28710190683729514 +resolve_blocks,CER,-0.2307325611083525 +preserve_aspect_ratio,CER,0.12408787013954682 +assume_straight_pages,CER,-0.11904725618816066 +resolve_lines,CER,0.060358529330885535 +straighten_pages,WER,0.9997423603542454 +symmetric_pad,WER,0.5288569133407047 +disable_page_orientation,WER,-0.49788185337361895 +disable_crop_orientation,WER,0.35150041517456027 +paragraph_break,WER,-0.2867204522845194 +resolve_blocks,WER,-0.23363320696739256 +assume_straight_pages,WER,-0.12980061688173894 +preserve_aspect_ratio,WER,0.1289984520450311 +resolve_lines,WER,0.06486504638919248 diff --git a/src/results/correlations/doctr_correlations.json b/src/results/correlations/doctr_correlations.json new file mode 100644 index 0000000..8413366 --- /dev/null +++ b/src/results/correlations/doctr_correlations.json @@ -0,0 +1,92 @@ +[ + { + "parameter": "straighten_pages", + "metric": "CER", + "pearson": 0.9998131749398365 + }, + { + "parameter": "symmetric_pad", + "metric": "CER", + "pearson": 0.5261495908818205 + }, + { + "parameter": "disable_page_orientation", + "metric": "CER", + "pearson": -0.49338806188002227 + }, + { + "parameter": "disable_crop_orientation", + "metric": "CER", + "pearson": 0.3470400052300961 + }, + { + "parameter": "paragraph_break", + "metric": "CER", + "pearson": -0.28710190683729514 + }, + { + "parameter": "resolve_blocks", + "metric": "CER", + "pearson": -0.2307325611083525 + }, + { + "parameter": "preserve_aspect_ratio", + "metric": "CER", + "pearson": 0.12408787013954682 + }, + { + "parameter": "assume_straight_pages", + "metric": "CER", + "pearson": -0.11904725618816066 + }, + { + "parameter": "resolve_lines", + "metric": "CER", + "pearson": 0.060358529330885535 + }, + { + "parameter": "straighten_pages", + "metric": "WER", + "pearson": 0.9997423603542454 + }, + { + "parameter": "symmetric_pad", + "metric": "WER", + "pearson": 0.5288569133407047 + }, + { + "parameter": "disable_page_orientation", + "metric": "WER", + "pearson": -0.49788185337361895 + }, + { + "parameter": "disable_crop_orientation", + "metric": "WER", + "pearson": 0.35150041517456027 + }, + { + "parameter": "paragraph_break", + "metric": "WER", + "pearson": -0.2867204522845194 + }, + { + "parameter": "resolve_blocks", + "metric": "WER", + "pearson": -0.23363320696739256 + }, + { + "parameter": "assume_straight_pages", + "metric": "WER", + "pearson": -0.12980061688173894 + }, + { + "parameter": "preserve_aspect_ratio", + "metric": "WER", + "pearson": 0.1289984520450311 + }, + { + "parameter": "resolve_lines", + "metric": "WER", + "pearson": 0.06486504638919248 + } +] \ No newline at end of file diff --git a/src/results/correlations/easyocr_correlations.csv b/src/results/correlations/easyocr_correlations.csv new file mode 100644 index 0000000..0a59fee --- /dev/null +++ b/src/results/correlations/easyocr_correlations.csv @@ -0,0 +1,25 @@ +parameter,metric,pearson +contrast_ths,CER,0.40885606429688176 +ycenter_ths,CER,0.3052506223332593 +slope_ths,CER,-0.3007836023513022 +width_ths,CER,0.2820622927402215 +beamWidth,CER,0.24551397803020547 +add_margin,CER,0.23419721611930053 +height_ths,CER,-0.22762064209067434 +low_text,CER,-0.2127914870290739 +adjust_contrast,CER,-0.14938336246670267 +text_threshold,CER,-0.12089385052834749 +link_threshold,CER,-0.09553910777087017 +min_size,CER,0.04597421017746789 +contrast_ths,WER,0.38684315732406244 +slope_ths,WER,-0.32504213142289984 +ycenter_ths,WER,0.28738383141120705 +add_margin,WER,0.24148232685944232 +width_ths,WER,0.23869424685132606 +height_ths,WER,-0.23743923240967893 +beamWidth,WER,0.2286079838179018 +adjust_contrast,WER,-0.17842640649533945 +low_text,WER,-0.1772397092408802 +text_threshold,WER,-0.12733512825321042 +min_size,WER,0.10070615378426818 +link_threshold,WER,-0.04425190559911718 diff --git a/src/results/correlations/easyocr_correlations.json b/src/results/correlations/easyocr_correlations.json new file mode 100644 index 0000000..f9a442b --- /dev/null +++ b/src/results/correlations/easyocr_correlations.json @@ -0,0 +1,122 @@ +[ + { + "parameter": "contrast_ths", + "metric": "CER", + "pearson": 0.40885606429688176 + }, + { + "parameter": "ycenter_ths", + "metric": "CER", + "pearson": 0.3052506223332593 + }, + { + "parameter": "slope_ths", + "metric": "CER", + "pearson": -0.3007836023513022 + }, + { + "parameter": "width_ths", + "metric": "CER", + "pearson": 0.2820622927402215 + }, + { + "parameter": "beamWidth", + "metric": "CER", + "pearson": 0.24551397803020547 + }, + { + "parameter": "add_margin", + "metric": "CER", + "pearson": 0.23419721611930053 + }, + { + "parameter": "height_ths", + "metric": "CER", + "pearson": -0.22762064209067434 + }, + { + "parameter": "low_text", + "metric": "CER", + "pearson": -0.2127914870290739 + }, + { + "parameter": "adjust_contrast", + "metric": "CER", + "pearson": -0.14938336246670267 + }, + { + "parameter": "text_threshold", + "metric": "CER", + "pearson": -0.12089385052834749 + }, + { + "parameter": "link_threshold", + "metric": "CER", + "pearson": -0.09553910777087017 + }, + { + "parameter": "min_size", + "metric": "CER", + "pearson": 0.04597421017746789 + }, + { + "parameter": "contrast_ths", + "metric": "WER", + "pearson": 0.38684315732406244 + }, + { + "parameter": "slope_ths", + "metric": "WER", + "pearson": -0.32504213142289984 + }, + { + "parameter": "ycenter_ths", + "metric": "WER", + "pearson": 0.28738383141120705 + }, + { + "parameter": "add_margin", + "metric": "WER", + "pearson": 0.24148232685944232 + }, + { + "parameter": "width_ths", + "metric": "WER", + "pearson": 0.23869424685132606 + }, + { + "parameter": "height_ths", + "metric": "WER", + "pearson": -0.23743923240967893 + }, + { + "parameter": "beamWidth", + "metric": "WER", + "pearson": 0.2286079838179018 + }, + { + "parameter": "adjust_contrast", + "metric": "WER", + "pearson": -0.17842640649533945 + }, + { + "parameter": "low_text", + "metric": "WER", + "pearson": -0.1772397092408802 + }, + { + "parameter": "text_threshold", + "metric": "WER", + "pearson": -0.12733512825321042 + }, + { + "parameter": "min_size", + "metric": "WER", + "pearson": 0.10070615378426818 + }, + { + "parameter": "link_threshold", + "metric": "WER", + "pearson": -0.04425190559911718 + } +] \ No newline at end of file diff --git a/src/results/correlations/paddle_correlations.csv b/src/results/correlations/paddle_correlations.csv new file mode 100644 index 0000000..e0a754d --- /dev/null +++ b/src/results/correlations/paddle_correlations.csv @@ -0,0 +1,15 @@ +parameter,metric,pearson +use_doc_unwarping,CER,0.8791236551817551 +use_doc_orientation_classify,CER,-0.7119850615039771 +textline_orientation,CER,-0.5347452891182014 +text_det_thresh,CER,0.4280438958428758 +text_det_box_thresh,CER,0.3113152196833144 +text_rec_score_thresh,CER,-0.2681957118190106 +text_det_unclip_ratio,CER, +use_doc_unwarping,WER,0.743651897463081 +use_doc_orientation_classify,WER,-0.6018981292243886 +textline_orientation,WER,-0.5906753653336065 +text_det_thresh,WER,0.39917807081409956 +text_det_box_thresh,WER,0.2555315418488065 +text_rec_score_thresh,WER,-0.08030912963602418 +text_det_unclip_ratio,WER, diff --git a/src/results/correlations/paddle_correlations.json b/src/results/correlations/paddle_correlations.json new file mode 100644 index 0000000..78223df --- /dev/null +++ b/src/results/correlations/paddle_correlations.json @@ -0,0 +1,72 @@ +[ + { + "parameter": "use_doc_unwarping", + "metric": "CER", + "pearson": 0.8791236551817551 + }, + { + "parameter": "use_doc_orientation_classify", + "metric": "CER", + "pearson": -0.7119850615039771 + }, + { + "parameter": "textline_orientation", + "metric": "CER", + "pearson": -0.5347452891182014 + }, + { + "parameter": "text_det_thresh", + "metric": "CER", + "pearson": 0.4280438958428758 + }, + { + "parameter": "text_det_box_thresh", + "metric": "CER", + "pearson": 0.3113152196833144 + }, + { + "parameter": "text_rec_score_thresh", + "metric": "CER", + "pearson": -0.2681957118190106 + }, + { + "parameter": "text_det_unclip_ratio", + "metric": "CER", + "pearson": NaN + }, + { + "parameter": "use_doc_unwarping", + "metric": "WER", + "pearson": 0.743651897463081 + }, + { + "parameter": "use_doc_orientation_classify", + "metric": "WER", + "pearson": -0.6018981292243886 + }, + { + "parameter": "textline_orientation", + "metric": "WER", + "pearson": -0.5906753653336065 + }, + { + "parameter": "text_det_thresh", + "metric": "WER", + "pearson": 0.39917807081409956 + }, + { + "parameter": "text_det_box_thresh", + "metric": "WER", + "pearson": 0.2555315418488065 + }, + { + "parameter": "text_rec_score_thresh", + "metric": "WER", + "pearson": -0.08030912963602418 + }, + { + "parameter": "text_det_unclip_ratio", + "metric": "WER", + "pearson": NaN + } +] \ No newline at end of file