correlations
All checks were successful
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 5m47s
build_docker / build_paddle_ocr_gpu (push) Successful in 22m8s
build_docker / build_easyocr (push) Successful in 18m3s
build_docker / build_easyocr_gpu (push) Successful in 20m9s
build_docker / build_doctr (push) Successful in 19m40s
build_docker / build_raytune (push) Successful in 3m24s
build_docker / build_doctr_gpu (push) Successful in 15m35s

This commit is contained in:
2026-01-24 16:48:47 +01:00
parent 4c299cc00f
commit d384f1e4d3
7 changed files with 369 additions and 11 deletions

View File

@@ -97,7 +97,7 @@ def extract_figure_title_from_mermaid(lines, current_index):
return None
def parse_md_to_html_blocks(md_content):
def parse_md_to_html_blocks(md_content, is_anexo=False):
"""Convert markdown content to HTML blocks with template styles."""
global table_counter, figure_counter
@@ -142,7 +142,8 @@ def parse_md_to_html_blocks(md_content):
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
# Word TOC looks for text with Caption style - anchor must be outside main caption text
bookmark_id = f"_Ref_Fig{figure_counter}"
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
if os.path.exists(fig_path):
# Read actual image dimensions and scale to fit page width
@@ -162,10 +163,12 @@ def parse_md_to_html_blocks(md_content):
w_pt = new_w * 0.75
h_pt = new_h * 0.75
html_blocks.append(f'''<p class=MsoNormal style="text-align:center"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
# mso-pagination:keep-with-next keeps the image paragraph together with the "Fuente" (source attribution) line that follows it
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
else:
# Fallback to placeholder
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
# mso-pagination:keep-with-next keeps the placeholder paragraph together with the "Fuente" (source attribution) line that follows it
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
# Check if next non-empty line has custom Fuente
custom_source = None
@@ -218,12 +221,22 @@ def parse_md_to_html_blocks(md_content):
continue
elif line.startswith('###'):
text = line.lstrip('#').strip()
html_blocks.append(f'<h3 style="mso-list:l22 level3 lfo18"><span lang=ES style="text-transform:none">{text}</span></h3>')
# Disable auto-numbering for Anexo content or A.x headings
if is_anexo or re.match(r'^A\.\d+', text):
# mso-list:none explicitly disables inherited list numbering from template CSS
html_blocks.append(f'<h3 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h3>')
else:
html_blocks.append(f'<h3 style="mso-list:l22 level3 lfo18"><span lang=ES style="text-transform:none">{text}</span></h3>')
i += 1
continue
elif line.startswith('##'):
text = line.lstrip('#').strip()
html_blocks.append(f'<h2 style="mso-list:l22 level2 lfo18"><span lang=ES style="text-transform:none">{text}</span></h2>')
# Disable auto-numbering for Anexo content or A.x headings
if is_anexo or re.match(r'^A\.\d+', text):
# mso-list:none explicitly disables inherited list numbering from template CSS
html_blocks.append(f'<h2 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h2>')
else:
html_blocks.append(f'<h2 style="mso-list:l22 level2 lfo18"><span lang=ES style="text-transform:none">{text}</span></h2>')
i += 1
continue
elif line.startswith('#'):
@@ -277,10 +290,10 @@ def parse_md_to_html_blocks(md_content):
clean_title = alt_title
else:
clean_title = "Tabla de datos."
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
# Build table HTML with APA style (horizontal lines only, no vertical)
# Wrap in centered div for Word compatibility
table_html = '<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
for j, tline in enumerate(table_lines):
cells = [c.strip() for c in tline.split('|')[1:-1]]
@@ -365,10 +378,10 @@ def parse_md_to_html_blocks(md_content):
return '\n\n'.join(html_blocks)
def extract_section_content(md_content):
def extract_section_content(md_content, is_anexo=False):
"""Extract content from markdown, skipping the first # header."""
md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
return parse_md_to_html_blocks(md_content)
return parse_md_to_html_blocks(md_content, is_anexo=is_anexo)
def find_section_element(soup, keyword):
"""Find element containing keyword (h1 or special paragraph classes)."""
@@ -672,7 +685,7 @@ def main():
current.extract()
current = next_elem
anexo_content = extract_section_content(docs['anexo'])
anexo_content = extract_section_content(docs['anexo'], is_anexo=True)
anexo_soup = BeautifulSoup(anexo_content, 'html.parser')
insert_point = anexo_elem
for new_elem in reversed(list(anexo_soup.children)):