#!/usr/bin/env python3
"""Replace template content with thesis content from docs/ folder using BeautifulSoup.

This module orchestrates the conversion of markdown documentation to UNIR's
Word template format. Content handling is delegated to:
- markdown_utils.py: Utility functions for markdown parsing
- content_handlers.py: Block-level content handlers (tables, figures, lists, etc.)
"""

import re
import os
import shutil
from bs4 import BeautifulSoup, NavigableString

from markdown_utils import (
    read_file,
    write_file,
    md_to_html_para,
    convert_latex_formulas,
    is_source_line,
    is_leyenda_line,
    split_into_paragraphs,
    SOURCE_LINE_RE,
)
from content_handlers import (
    handle_mermaid_diagram,
    handle_code_block,
    handle_header,
    handle_table,
    handle_blockquote,
    handle_bullet_list,
    handle_numbered_list,
)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
DOCS_DIR = os.path.join(BASE_DIR, 'docs')

def parse_md_to_html_blocks(md_content, is_anexo=False, counters=None):
    """Convert markdown content to HTML blocks with template styles.

    Args:
        md_content: Markdown content string
        is_anexo: Boolean indicating if processing Anexo section
        counters: Dict with table/figure counters. If None, creates new one.

    Returns:
        Tuple of (html_string, counters) where counters is the updated dict
    """
    if counters is None:
        counters = {
            'table': 0,
            'figure': 0,
            'anexo_table': 0,
            'anexo_figure': 0,
            'global_figure': 0,
        }

    html_blocks = []
    lines = md_content.split('\n')
    i = 0

    while i < len(lines):
        line = lines[i]

        # Skip empty lines
        if not line.strip():
            i += 1
            continue

        # Mermaid diagram - convert to figure with actual image
        if line.strip().startswith('```mermaid'):
            blocks, i = handle_mermaid_diagram(lines, i, counters, is_anexo)
            html_blocks.extend(blocks)
            continue

        # Code block (non-mermaid)
        if line.strip().startswith('```'):
            blocks, i = handle_code_block(lines, i)
            html_blocks.extend(blocks)
            continue

        # Headers
        if line.startswith('#'):
            header_html = handle_header(line, is_anexo)
            if header_html is not None:
                html_blocks.append(header_html)
            i += 1
            continue

        # Table
        if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
            blocks, i = handle_table(lines, i, counters, is_anexo)
            html_blocks.extend(blocks)
            continue

        # Blockquote
        if line.startswith('>'):
            blocks, i = handle_blockquote(lines, i)
            html_blocks.extend(blocks)
            continue

        # Bullet list
        if re.match(r'^[\-\*\+]\s', line):
            blocks, i = handle_bullet_list(lines, i)
            html_blocks.extend(blocks)
            continue

        # Numbered list
        if re.match(r'^\d+\.\s', line):
            blocks, i = handle_numbered_list(lines, i)
            html_blocks.extend(blocks)
            continue

        # Skip lines that are just table/figure titles
        if line.strip().startswith('**Tabla') or line.strip().startswith('*Tabla'):
            i += 1
            continue
        if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
            i += 1
            continue
        if is_source_line(line):
            i += 1
            continue
        if is_leyenda_line(line):
            i += 1
            continue

        # Regular paragraph
        para_lines = [line]
        i += 1
        while (
            i < len(lines)
            and lines[i].strip()
            and not lines[i].startswith('#')
            and not lines[i].startswith('```')
            and not lines[i].startswith('>')
            and not re.match(r'^[\-\*\+]\s', lines[i])
            and not re.match(r'^\d+\.\s', lines[i])
            and '|' not in lines[i]
        ):
            para_lines.append(lines[i])
            i += 1

        para_text = ' '.join(para_lines)
        para_text = convert_latex_formulas(para_text)
        # Check if paragraph contains MathML (already wrapped)
        if '<math' in para_text:
            html_blocks.append(para_text)
        else:
            html_blocks.append(f'<p class=MsoNormal><span lang=ES>{md_to_html_para(para_text)}</span></p>')

    return '\n\n'.join(html_blocks), counters
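
# The multi-line block handlers share one contract, inferred from the call
# sites above: they take `lines` and the current index, consume one block, and
# return (html_blocks, next_index), so the dispatch loop can `continue`
# without advancing `i` itself. A minimal sketch of what a conforming handler
# could look like (illustrative only; the real implementations live in
# content_handlers.py):
#
#     def handle_blockquote(lines, i):
#         quoted = []
#         while i < len(lines) and lines[i].startswith('>'):
#             quoted.append(lines[i].lstrip('> '))
#             i += 1
#         html = f'<p class=MsoNormal><span lang=ES>{" ".join(quoted)}</span></p>'
#         return [html], i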


def extract_section_content(md_content, is_anexo=False, counters=None):
    """Extract content from markdown, skipping the first # header.

    Args:
        md_content: Markdown content string
        is_anexo: Boolean indicating if processing Anexo section
        counters: Dict with table/figure counters

    Returns:
        Tuple of (html_string, counters)
    """
    md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
    return parse_md_to_html_blocks(md_content, is_anexo=is_anexo, counters=counters)
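
# Worked example of the header strip above (illustrative input): given
#   "# 1. Introducción\n\nEl problema..."
# the re.sub removes only the "# 1. Introducción" line, because the chapter
# heading already exists in the template and is located via
# find_section_element() rather than regenerated from the markdown.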


def find_section_element(soup, keyword):
    """Find element containing keyword (h1 or special paragraph classes)."""
    # First try h1
    for h1 in soup.find_all('h1'):
        text = h1.get_text()
        if keyword.lower() in text.lower():
            return h1

    # Try special paragraph classes for unnumbered sections
    for p in soup.find_all('p', class_=['Ttulo1sinnumerar', 'Anexo', 'MsoNormal']):
        text = p.get_text()
        if keyword.lower() in text.lower():
            classes = p.get('class', [])
            if 'Ttulo1sinnumerar' in classes or 'Anexo' in classes:
                return p
            if re.match(r'^\d+\.?\s', text.strip()):
                return p
    return None
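
# Note on the MsoNormal branch above: a plain MsoNormal paragraph only counts
# as a section heading when its text starts with a chapter number ("1. ",
# "2 ", etc., per the regex), so body text that merely mentions the keyword
# is not mistaken for a section boundary.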


def remove_elements_between(start_elem, end_elem):
    """Remove all elements between start and end (exclusive)."""
    current = start_elem.next_sibling
    elements_to_remove = []
    while current and current != end_elem:
        elements_to_remove.append(current)
        current = current.next_sibling
    for elem in elements_to_remove:
        if hasattr(elem, 'decompose'):
            elem.decompose()
        elif isinstance(elem, NavigableString):
            elem.extract()


def format_references(refs_content):
    """Format references with proper MsoBibliography style."""
    refs_content = refs_content.replace('# Referencias bibliográficas {.unnumbered}', '').strip()
    refs_html = ''

    for line in refs_content.split('\n\n'):
        line = line.strip()
        if not line:
            continue

        # Apply markdown formatting
        formatted = md_to_html_para(line)

        # Use MsoBibliography style with hanging indent
        refs_html += f'''<p class=MsoBibliography style="margin-left:36.0pt;text-indent:-36.0pt"><span lang=ES>{formatted}</span></p>\n'''

    return refs_html
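
# Illustrative input/output (hypothetical reference text): an entry such as
#   García, J. (2023). *Título del libro*. Editorial.
# becomes one <p class=MsoBibliography> paragraph with a 36pt hanging indent
# (Word's bibliography layout); md_to_html_para() presumably converts the
# markdown emphasis markers to inline HTML.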


def extract_resumen_parts(resumen_content):
    """Extract Spanish resumen and English abstract from 00_resumen.md"""
    parts = resumen_content.split('---')

    spanish_part = parts[0] if len(parts) > 0 else ''
    english_part = parts[1] if len(parts) > 1 else ''

    # Extract Spanish content
    spanish_text = ''
    spanish_keywords = ''
    if '**Palabras clave:**' in spanish_part:
        text_part, kw_part = spanish_part.split('**Palabras clave:**')
        spanish_text = split_into_paragraphs(text_part.replace('# Resumen', '').strip(), 'ES')
        spanish_keywords = md_to_html_para(kw_part.strip())
    else:
        spanish_text = split_into_paragraphs(spanish_part.replace('# Resumen', '').strip(), 'ES')

    # Extract English content
    english_text = ''
    english_keywords = ''
    if '**Keywords:**' in english_part:
        text_part, kw_part = english_part.split('**Keywords:**')
        english_text = split_into_paragraphs(text_part.replace('# Abstract', '').strip(), 'EN-US')
        english_keywords = md_to_html_para(kw_part.strip())
    else:
        english_text = split_into_paragraphs(english_part.replace('# Abstract', '').strip(), 'EN-US')

    return spanish_text, spanish_keywords, english_text, english_keywords
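
# Expected shape of docs/00_resumen.md, inferred from the split markers above:
#
#   # Resumen
#   ...párrafos en español...
#   **Palabras clave:** ...
#   ---
#   # Abstract
#   ...English paragraphs...
#   **Keywords:** ...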


def main():
    # Initialize counters dict (replaces global counters)
    counters = {
        'table': 0,
        'figure': 0,
        'anexo_table': 0,
        'anexo_figure': 0,
        'global_figure': 0,
    }
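
    # The anexo_* keys keep annex numbering separate from the main chapters;
    # 'global_figure' presumably tracks a document-wide figure id (an
    # inference; the counters are consumed inside content_handlers.py).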

    print("Reading template...")
    html_content = read_file(TEMPLATE_INPUT)

    # Modify the Table of Tables TOC field to include TC entries with \f t identifier
    html_content = re.sub(
        r'(TOC\s+)(\\h\s+\\z\s+\\t\s*\n?\s*"Tablas;1")',
        r'\1\\f t \2',
        html_content
    )
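
    # Word field background for the edit above (general Word behavior, not
    # specific to this repo): the TOC \f switch collects TC entries, and
    # "\f t" restricts the index to TC fields tagged with identifier "t",
    # which is how generated table captions can feed the table index.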

    soup = BeautifulSoup(html_content, 'html.parser')

    print("Reading docs content...")
    docs = {
        'resumen': read_file(os.path.join(DOCS_DIR, '00_resumen.md')),
        'intro': read_file(os.path.join(DOCS_DIR, '01_introduccion.md')),
        'contexto': read_file(os.path.join(DOCS_DIR, '02_contexto_estado_arte.md')),
        'objetivos': read_file(os.path.join(DOCS_DIR, '03_objetivos_metodologia.md')),
        'desarrollo': read_file(os.path.join(DOCS_DIR, '04_desarrollo_especifico.md')),
        'conclusiones': read_file(os.path.join(DOCS_DIR, '05_conclusiones_trabajo_futuro.md')),
        'referencias': read_file(os.path.join(DOCS_DIR, '06_referencias_bibliograficas.md')),
        'anexo': read_file(os.path.join(DOCS_DIR, '07_anexo_a.md')),
    }

    # Extract resumen and abstract
    spanish_text, spanish_kw, english_text, english_kw = extract_resumen_parts(docs['resumen'])

    # Replace title
    print("Replacing title...")
    for elem in soup.find_all(string=re.compile(r'Título del TFE', re.IGNORECASE)):
        elem.replace_with(elem.replace('Título del TFE', 'Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español'))

    # Replace Resumen section
    print("Replacing Resumen...")
    resumen_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Resumen'))
    if resumen_title:
        current = resumen_title.find_next_sibling()
        elements_to_remove = []
        while current:
            text = current.get_text() if hasattr(current, 'get_text') else str(current)
            if 'Abstract' in text and current.name == 'p' and 'Ttulondices' in str(current.get('class', [])):
                break
            elements_to_remove.append(current)
            current = current.find_next_sibling()

        for elem in elements_to_remove:
            if hasattr(elem, 'decompose'):
                elem.decompose()

        resumen_html = f'''{spanish_text}
<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>
<p class=MsoNormal><b><span lang=ES>Palabras clave:</span></b><span lang=ES> {spanish_kw}</span></p>
<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>'''
        resumen_soup = BeautifulSoup(resumen_html, 'html.parser')
        insert_point = resumen_title
        for new_elem in reversed(list(resumen_soup.children)):
            insert_point.insert_after(new_elem)
        print(" ✓ Replaced Resumen")

    # Replace Abstract section
    print("Replacing Abstract...")
    abstract_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Abstract'))
    if abstract_title:
        current = abstract_title.find_next_sibling()
        elements_to_remove = []
        while current:
            if current.name == 'span' and 'page-break' in str(current):
                break
            if current.name == 'p' and ('Ttulondices' in str(current.get('class', [])) or 'MsoToc' in str(current.get('class', []))):
                break
            elements_to_remove.append(current)
            current = current.find_next_sibling()

        for elem in elements_to_remove:
            if hasattr(elem, 'decompose'):
                elem.decompose()

        abstract_html = f'''{english_text}
<p class=MsoNormal><span lang=EN-US><o:p> </o:p></span></p>
<p class=MsoNormal><b><span lang=EN-US>Keywords:</span></b><span lang=EN-US> {english_kw}</span></p>
<p class=MsoNormal><span lang=EN-US><o:p> </o:p></span></p>'''
        abstract_soup = BeautifulSoup(abstract_html, 'html.parser')
        insert_point = abstract_title
        for new_elem in reversed(list(abstract_soup.children)):
            insert_point.insert_after(new_elem)
        print(" ✓ Replaced Abstract")

    # Remove "Importante" callout boxes (template instructions)
    print("Removing template instructions...")
    for div in soup.find_all('div'):
        text = div.get_text()
        if 'Importante:' in text and 'extensión mínima' in text:
            div.decompose()
            print(" ✓ Removed 'Importante' box")

    # Remove "Ejemplo de nota al pie" footnote
    for elem in soup.find_all(string=re.compile(r'Ejemplo de nota al pie')):
        parent = elem.parent
        if parent:
            while parent and parent.name != 'p':
                parent = parent.parent
            if parent:
                parent.decompose()
                print(" ✓ Removed footnote example")

    # Clear old figure/table index entries
    print("Clearing old index entries...")

    for p in soup.find_all('p', class_='MsoTof'):
        text = p.get_text()
        if 'Figura' in text and 'Ejemplo' in text:
            for a in p.find_all('a'):
                a.decompose()
            for span in p.find_all('span', style=lambda x: x and 'mso-no-proof' in str(x)):
                if 'Ejemplo' in span.get_text():
                    span.decompose()
            print(" ✓ Cleared figure index example entry")
        if 'Tabla' in text and 'Ejemplo' in text:
            for a in p.find_all('a'):
                a.decompose()
            for span in p.find_all('span', style=lambda x: x and 'mso-no-proof' in str(x)):
                if 'Ejemplo' in span.get_text():
                    span.decompose()
            print(" ✓ Cleared table index example entry")

    for p in soup.find_all('p', class_='MsoToc3'):
        text = p.get_text()
        if 'Figura 1. Ejemplo' in text or 'Tabla 1. Ejemplo' in text:
            p.decompose()
            print(" ✓ Removed template index entry")

    for p in soup.find_all('p', class_='Imagencentrada'):
        p.decompose()
        print(" ✓ Removed template figure placeholder")

    # Remove template table example
    for table in soup.find_all('table', class_='MsoTableGrid'):
        text = table.get_text()
        if 'Celda 1' in text or 'Encabezado 1' in text:
            prev_sib = table.find_previous_sibling()
            next_sib = table.find_next_sibling()
            if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
                prev_sib.decompose()
            if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
                next_sib.decompose()
            table.decompose()
            print(" ✓ Removed template table example")
            break

    # Define chapters
    chapters = [
        ('Introducción', 'intro', 'Contexto'),
        ('Contexto', 'contexto', 'Objetivos'),
        ('Objetivos', 'objetivos', 'Desarrollo'),
        ('Desarrollo', 'desarrollo', 'Conclusiones'),
        ('Conclusiones', 'conclusiones', 'Referencias'),
    ]
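
    # Each tuple reads (keyword of this chapter's heading, docs key, keyword
    # of the next heading); the next heading serves only as the deletion
    # boundary when clearing the template's placeholder content below.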

    print("Replacing chapter contents...")
    for chapter_keyword, doc_key, next_keyword in chapters:
        print(f" Processing: {chapter_keyword}")

        start_elem = find_section_element(soup, chapter_keyword)
        end_elem = find_section_element(soup, next_keyword)

        if start_elem and end_elem:
            remove_elements_between(start_elem, end_elem)
            new_content_html, counters = extract_section_content(docs[doc_key], counters=counters)
            new_soup = BeautifulSoup(new_content_html, 'html.parser')
            insert_point = start_elem
            for new_elem in reversed(list(new_soup.children)):
                insert_point.insert_after(new_elem)
            print(" ✓ Replaced content")
        else:
            if not start_elem:
                print(f" Warning: Could not find start element for {chapter_keyword}")
            if not end_elem:
                print(f" Warning: Could not find end element for {next_keyword}")

    # Handle Referencias
    print(" Processing: Referencias bibliográficas")
    refs_start = find_section_element(soup, 'Referencias')
    anexo_elem = find_section_element(soup, 'Anexo')

    if refs_start and anexo_elem:
        remove_elements_between(refs_start, anexo_elem)
        refs_html = format_references(docs['referencias'])
        refs_soup = BeautifulSoup(refs_html, 'html.parser')
        insert_point = refs_start
        for new_elem in reversed(list(refs_soup.children)):
            insert_point.insert_after(new_elem)
        print(" ✓ Replaced content")

    # Handle Anexo (last section)
    print(" Processing: Anexo")
    if anexo_elem:
        body = soup.find('body')
        if body:
            current = anexo_elem.next_sibling
            while current:
                next_elem = current.next_sibling
                if hasattr(current, 'decompose'):
                    current.decompose()
                elif isinstance(current, NavigableString):
                    current.extract()
                current = next_elem

        anexo_content, counters = extract_section_content(docs['anexo'], is_anexo=True, counters=counters)
        anexo_soup = BeautifulSoup(anexo_content, 'html.parser')
        insert_point = anexo_elem
        for new_elem in reversed(list(anexo_soup.children)):
            insert_point.insert_after(new_elem)
        print(" ✓ Replaced content")

    print(f"\nSummary: {counters['table']} tables + {counters['anexo_table']} Anexo tables, {counters['figure']} figures + {counters['anexo_figure']} Anexo figures processed")

    print("Saving modified template...")
    output_html = str(soup)
    write_file(TEMPLATE_OUTPUT, output_html)

    # Copy template support files
    support_files_src = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')
    support_files_dst = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual_files')
    if os.path.exists(support_files_src):
        if os.path.exists(support_files_dst):
            shutil.rmtree(support_files_dst)
        shutil.copytree(support_files_src, support_files_dst)
        print("✓ Copied template support files")

    print(f"✓ Done! Modified: {TEMPLATE_OUTPUT}")
    print("\nTo convert to DOCX:")
    print("1. Open the .htm file in Microsoft Word")
    print("2. Replace [Insertar diagrama Mermaid aquí] placeholders with actual diagrams")
    print("3. Update indices: Select all (Ctrl+A) then press F9 to update fields")
    print(" - This will regenerate: Índice de contenidos, Índice de figuras, Índice de tablas")
    print("4. Save as .docx")


if __name__ == '__main__':
    main()