links regeneration
Some checks failed
build_docker / essential (push) Successful in 0s
build_docker / build_paddle_ocr (push) Successful in 5m30s
build_docker / build_paddle_ocr_gpu (push) Successful in 22m0s
build_docker / build_easyocr (push) Successful in 18m14s
build_docker / build_doctr_gpu (push) Has been cancelled
build_docker / build_raytune (push) Has been cancelled
build_docker / build_easyocr_gpu (push) Has started running
build_docker / build_doctr (push) Has been cancelled

This commit is contained in:
2026-02-04 17:52:58 +01:00
parent d384f1e4d3
commit 38ba85d834
11 changed files with 227 additions and 240 deletions

View File

@@ -16,6 +16,10 @@ DOCS_DIR = os.path.join(BASE_DIR, 'docs')
# Global counters for tables and figures
# Main-body captions are numbered 1, 2, 3...; Anexo captions use their own
# counters and are displayed with an "A" prefix (A1, A2, ...).
table_counter = 0  # tables parsed with is_anexo=False
figure_counter = 0  # figures parsed with is_anexo=False
anexo_table_counter = 0  # tables parsed with is_anexo=True
anexo_figure_counter = 0  # figures parsed with is_anexo=True
# Global sequential counter for figure filenames (figura_1.png, figura_2.png, etc.)
# Incremented for every Mermaid figure, Anexo or not, so filenames stay sequential.
global_figure_index = 0
def read_file(path):
try:
@@ -99,7 +103,7 @@ def extract_figure_title_from_mermaid(lines, current_index):
def parse_md_to_html_blocks(md_content, is_anexo=False):
"""Convert markdown content to HTML blocks with template styles."""
global table_counter, figure_counter
global table_counter, figure_counter, anexo_table_counter, anexo_figure_counter, global_figure_index
html_blocks = []
lines = md_content.split('\n')
@@ -115,7 +119,17 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
# Mermaid diagram - convert to figure with actual image
if line.strip().startswith('```mermaid'):
figure_counter += 1
# Always increment global index for sequential filenames
global_figure_index += 1
# Use Anexo-specific counter with "A" prefix for display, or global counter
if is_anexo:
anexo_figure_counter += 1
fig_num = f"A{anexo_figure_counter}" # Display number: A1, A2, A3...
else:
figure_counter += 1
fig_num = str(figure_counter) # Display number: 1, 2, 3...
mermaid_lines = []
i += 1
while i < len(lines) and not lines[i].strip() == '```':
@@ -132,18 +146,22 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
if title_match:
fig_title = title_match.group(1).strip()
else:
fig_title = f"Diagrama {figure_counter}"
fig_title = f"Diagrama {fig_num}"
# Check if the generated PNG exists
fig_file = f'figures/figura_{figure_counter}.png'
# Use global sequential index for filename (figura_1.png, figura_2.png, etc.)
fig_file = f'figures/figura_{global_figure_index}.png'
fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
# Create figure with MsoCaption class and proper Word SEQ field for cross-reference
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
# Word TOC looks for text with Caption style - anchor must be outside main caption text
bookmark_id = f"_Ref_Fig{figure_counter}"
bookmark_id = f"_Ref_Fig{fig_num}"
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
# For Anexo figures, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
if is_anexo:
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
else:
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
if os.path.exists(fig_path):
# Read actual image dimensions and scale to fit page width
@@ -216,7 +234,8 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
# Headers - ## becomes h2, ### becomes h3, #### becomes h4
if line.startswith('####'):
text = line.lstrip('#').strip()
html_blocks.append(f'<h4><span lang=ES>{text}</span></h4>')
# Apply consistent styling like h2/h3, disable numbering for h4
html_blocks.append(f'<h4 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h4>')
i += 1
continue
elif line.startswith('###'):
@@ -246,7 +265,13 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
# Table - check for table title pattern first
if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
table_counter += 1
# Use Anexo-specific counter with "A" prefix, or global counter
if is_anexo:
anexo_table_counter += 1
table_num = f"A{anexo_table_counter}"
else:
table_counter += 1
table_num = str(table_counter)
# Check if previous line has table title (e.g., **Tabla 1.** *Title*)
table_title = None
@@ -281,7 +306,7 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
# Word TOC looks for text with Caption style - anchor must be outside main caption text
bookmark_id = f"_Ref_Tab{table_counter}"
bookmark_id = f"_Ref_Tab{table_num}"
if table_title:
# Remove any "Tabla X." or "Tabla AX." pattern from the title
clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
@@ -291,7 +316,11 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
else:
clean_title = "Tabla de datos."
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
# For Anexo tables, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
if is_anexo:
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
else:
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
# Build table HTML with APA style (horizontal lines only, no vertical)
table_html = '<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
@@ -445,25 +474,25 @@ def extract_resumen_parts(resumen_content):
spanish_keywords = ''
if '**Palabras clave:**' in spanish_part:
text_part, kw_part = spanish_part.split('**Palabras clave:**')
spanish_text = text_part.replace('# Resumen', '').strip()
spanish_keywords = kw_part.strip()
spanish_text = md_to_html_para(text_part.replace('# Resumen', '').strip())
spanish_keywords = md_to_html_para(kw_part.strip())
else:
spanish_text = spanish_part.replace('# Resumen', '').strip()
spanish_text = md_to_html_para(spanish_part.replace('# Resumen', '').strip())
# Extract English content
english_text = ''
english_keywords = ''
if '**Keywords:**' in english_part:
text_part, kw_part = english_part.split('**Keywords:**')
english_text = text_part.replace('# Abstract', '').strip()
english_keywords = kw_part.strip()
english_text = md_to_html_para(text_part.replace('# Abstract', '').strip())
english_keywords = md_to_html_para(kw_part.strip())
else:
english_text = english_part.replace('# Abstract', '').strip()
english_text = md_to_html_para(english_part.replace('# Abstract', '').strip())
return spanish_text, spanish_keywords, english_text, english_keywords
def main():
global table_counter, figure_counter
global table_counter, figure_counter, anexo_table_counter, anexo_figure_counter
print("Reading template...")
html_content = read_file(TEMPLATE_INPUT)
@@ -692,7 +721,7 @@ def main():
insert_point.insert_after(new_elem)
print(f" ✓ Replaced content")
print(f"\nSummary: {table_counter} tables, {figure_counter} figures processed")
print(f"\nSummary: {table_counter} tables + {anexo_table_counter} Anexo tables, {figure_counter} figures + {anexo_figure_counter} Anexo figures processed")
print("Saving modified template...")
output_html = str(soup)