#!/usr/bin/env python3
"""Replace template content with thesis content from docs/ folder using BeautifulSoup.

This module orchestrates the conversion of markdown documentation to UNIR's
Word template format.

Content handling is delegated to:
- markdown_utils.py: Utility functions for markdown parsing
- content_handlers.py: Block-level content handlers (tables, figures, lists, etc.)
"""

import os
import re
import shutil

from bs4 import BeautifulSoup, NavigableString

from markdown_utils import (
    read_file,
    write_file,
    md_to_html_para,
    convert_latex_formulas,
    is_source_line,
    is_leyenda_line,
    split_into_paragraphs,
    SOURCE_LINE_RE,
)
from content_handlers import (
    handle_mermaid_diagram,
    handle_code_block,
    handle_header,
    handle_table,
    handle_blockquote,
    handle_bullet_list,
    handle_numbered_list,
)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
DOCS_DIR = os.path.join(BASE_DIR, 'docs')

# Text that replaces the "Título del TFE" placeholder in the template.
THESIS_TITLE = (
    'Optimización de Hiperparámetros OCR con Ray Tune para Documentos '
    'Académicos en Español'
)

# Line-start patterns shared by block dispatch and paragraph continuation.
_BULLET_RE = re.compile(r'^[\-\*\+]\s')
_NUMBERED_RE = re.compile(r'^\d+\.\s')

# Caption lines written in the markdown next to tables/figures; they are
# dropped by the paragraph parser.
_CAPTION_PREFIXES = ('**Tabla', '*Tabla', '**Figura', '*Figura')


def _new_counters():
    """Return a fresh table/figure counter dict with every count at zero."""
    return {
        'table': 0,
        'figure': 0,
        'anexo_table': 0,
        'anexo_figure': 0,
        'global_figure': 0,
    }


def _discard(node):
    """Remove *node* from its tree: decompose tags, extract bare strings."""
    if hasattr(node, 'decompose'):
        node.decompose()
    elif isinstance(node, NavigableString):
        node.extract()


def _insert_html_after(anchor, html):
    """Parse *html* and insert its top-level nodes right after *anchor*.

    Nodes are inserted in reverse so they end up in document order. The
    children are copied into a list first because ``insert_after`` moves each
    node out of the parsed fragment while we iterate.
    """
    fragment = BeautifulSoup(html, 'html.parser')
    for node in reversed(list(fragment.children)):
        anchor.insert_after(node)


def _siblings_until(start, should_stop):
    """Return the tag siblings after *start*, up to the first one for which
    ``should_stop(tag)`` is true (that tag is not included)."""
    collected = []
    node = start.find_next_sibling()
    while node is not None and not should_stop(node):
        collected.append(node)
        node = node.find_next_sibling()
    return collected


def _continues_paragraph(line):
    """Tell whether *line* extends the current plain paragraph.

    A paragraph ends at a blank line or at any line that looks like the start
    of another block (header, fence, blockquote, list item, table row).
    """
    return bool(
        line.strip()
        and not line.startswith(('#', '```', '>'))
        and not _BULLET_RE.match(line)
        and not _NUMBERED_RE.match(line)
        and '|' not in line
    )


def parse_md_to_html_blocks(md_content, is_anexo=False, counters=None):
    """Convert markdown content to HTML blocks with template styles.

    Args:
        md_content: Markdown content string
        is_anexo: Boolean indicating if processing Anexo section
        counters: Dict with table/figure counters. If None, creates new one.

    Returns:
        Tuple of (html_string, counters) where counters is the updated dict
    """
    if counters is None:
        counters = _new_counters()

    html_blocks = []
    lines = md_content.split('\n')
    i = 0

    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Mermaid diagram: must be tested before the generic fence below.
        if stripped.startswith('```mermaid'):
            blocks, i = handle_mermaid_diagram(lines, i, counters, is_anexo)
            html_blocks.extend(blocks)
            continue

        # Code block (non-mermaid)
        if stripped.startswith('```'):
            blocks, i = handle_code_block(lines, i)
            html_blocks.extend(blocks)
            continue

        # Headers (the handler may return None, in which case nothing is emitted)
        if line.startswith('#'):
            header_html = handle_header(line, is_anexo)
            if header_html is not None:
                html_blocks.append(header_html)
            i += 1
            continue

        # Table: a row containing '|' followed by a separator line
        if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
            blocks, i = handle_table(lines, i, counters, is_anexo)
            html_blocks.extend(blocks)
            continue

        # Blockquote
        if line.startswith('>'):
            blocks, i = handle_blockquote(lines, i)
            html_blocks.extend(blocks)
            continue

        # Bullet list
        if _BULLET_RE.match(line):
            blocks, i = handle_bullet_list(lines, i)
            html_blocks.extend(blocks)
            continue

        # Numbered list
        if _NUMBERED_RE.match(line):
            blocks, i = handle_numbered_list(lines, i)
            html_blocks.extend(blocks)
            continue

        # Skip lines that are just table/figure titles, source or legend lines
        if (
            stripped.startswith(_CAPTION_PREFIXES)
            or is_source_line(line)
            or is_leyenda_line(line)
        ):
            i += 1
            continue

        # Regular paragraph: join consecutive plain lines with spaces
        para_lines = [line]
        i += 1
        while i < len(lines) and _continues_paragraph(lines[i]):
            para_lines.append(lines[i])
            i += 1

        para_text = convert_latex_formulas(' '.join(para_lines))

        # Check if paragraph contains MathML (already wrapped)
        # NOTE(review): the markup in this branch was stripped from the copy
        # under review. The '<math' probe and the MsoNormal/ES wrapper are a
        # reconstruction - confirm against version control before merging.
        if '<math' in para_text:
            html_blocks.append(para_text)
        else:
            html_blocks.append(
                f'<p class=MsoNormal><span lang=ES>'
                f'{md_to_html_para(para_text)}</span></p>'
            )

    return '\n\n'.join(html_blocks), counters


def extract_section_content(md_content, is_anexo=False, counters=None):
    """Extract content from markdown, skipping the first # header.

    Args:
        md_content: Markdown content string
        is_anexo: Boolean indicating if processing Anexo section
        counters: Dict with table/figure counters

    Returns:
        Tuple of (html_string, counters)
    """
    # Only a level-1 header at the very start of the text is removed.
    md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
    return parse_md_to_html_blocks(md_content, is_anexo=is_anexo, counters=counters)


def find_section_element(soup, keyword):
    """Find element containing keyword (h1 or special paragraph classes).

    The match is case-insensitive. ``h1`` elements are searched first; then
    paragraphs with class ``Ttulo1sinnumerar``, ``Anexo`` or ``MsoNormal``. A
    ``MsoNormal`` paragraph only counts when its text starts with a number.

    Returns:
        The first matching element, or None when nothing matches.
    """
    needle = keyword.lower()

    # First try h1
    for h1 in soup.find_all('h1'):
        if needle in h1.get_text().lower():
            return h1

    # Try special paragraph classes for unnumbered sections
    for p in soup.find_all('p', class_=['Ttulo1sinnumerar', 'Anexo', 'MsoNormal']):
        text = p.get_text()
        if needle not in text.lower():
            continue
        classes = p.get('class', [])
        if 'Ttulo1sinnumerar' in classes or 'Anexo' in classes:
            return p
        if re.match(r'^\d+\.?\s', text.strip()):
            return p

    return None


def remove_elements_between(start_elem, end_elem):
    """Remove all sibling nodes between start and end (exclusive).

    Nodes are compared by identity. BeautifulSoup's ``==`` on tags compares
    name, attributes and contents, so an equality test could stop early at a
    different element that merely looks like ``end_elem``.
    """
    doomed = []
    current = start_elem.next_sibling
    # 'is not None' rather than truthiness: an empty string node is falsy.
    while current is not None and current is not end_elem:
        doomed.append(current)
        current = current.next_sibling

    # Collected first so removal does not disturb the sibling walk.
    for node in doomed:
        _discard(node)


def format_references(refs_content):
    """Format references with proper MsoBibliography style.

    The section heading line is dropped, the remaining text is split on blank
    lines, and each non-empty entry becomes one bibliography paragraph.

    Returns:
        The concatenated HTML for all entries.
    """
    refs_content = refs_content.replace(
        '# Referencias bibliográficas {.unnumbered}', ''
    ).strip()

    entries = []
    for entry in refs_content.split('\n\n'):
        entry = entry.strip()
        if not entry:
            continue

        # Apply markdown formatting
        formatted = md_to_html_para(entry)

        # Use MsoBibliography style with hanging indent
        # NOTE(review): the attributes of this paragraph were stripped from
        # the copy under review; the class name comes from the comment above,
        # the indent values are a reconstruction - confirm before merging.
        entries.append(
            '<p class=MsoBibliography '
            'style="margin-left:36.0pt;text-indent:-36.0pt">'
            f'<span lang=ES>{formatted}</span></p>\n'
        )

    return ''.join(entries)


def _split_summary(part, heading, marker, lang):
    """Split one summary half into (paragraph_html, keywords_html).

    ``str.partition`` splits on the first *marker* only, so a marker repeated
    later in the text cannot raise. ``keywords_html`` is '' when the marker
    is absent.
    """
    body, found, keywords = part.partition(marker)
    body_html = split_into_paragraphs(body.replace(heading, '').strip(), lang)
    keywords_html = md_to_html_para(keywords.strip()) if found else ''
    return body_html, keywords_html


def extract_resumen_parts(resumen_content):
    """Extract Spanish resumen and English abstract from 00_resumen.md

    The content is split on '---': the first piece is treated as the Spanish
    summary, the second (if any) as the English abstract.

    Returns:
        Tuple of (spanish_text, spanish_keywords, english_text, english_keywords)
    """
    parts = resumen_content.split('---')
    spanish_part = parts[0]
    english_part = parts[1] if len(parts) > 1 else ''

    spanish_text, spanish_keywords = _split_summary(
        spanish_part, '# Resumen', '**Palabras clave:**', 'ES'
    )
    english_text, english_keywords = _split_summary(
        english_part, '# Abstract', '**Keywords:**', 'EN-US'
    )
    return spanish_text, spanish_keywords, english_text, english_keywords


def _keywords_block(body_html, label, keywords_html, lang):
    """Build the HTML that follows a Resumen/Abstract title.

    NOTE(review): the markup of this literal was stripped from the copy under
    review; only the text (body, blank spacer, label + keywords, blank
    spacer) survived. The tags and classes here are a reconstruction -
    confirm against version control before merging.
    """
    spacer = f'<p class=MsoNormal><span lang={lang}><o:p>&nbsp;</o:p></span></p>'
    return (
        f'{body_html}\n'
        f'{spacer}\n'
        f'<p class=MsoNormal><b><span lang={lang}>{label}</span></b>'
        f'<span lang={lang}> {keywords_html}</span></p>\n'
        f'{spacer}\n'
    )


def _is_index_title(tag):
    """Tell whether *tag* is a <p> whose class list mentions 'Ttulondices'."""
    return tag.name == 'p' and 'Ttulondices' in str(tag.get('class', []))


def main():
    """Fill the Word HTML template with the thesis content and save it.

    Reads the template and the markdown files under DOCS_DIR, replaces the
    title, Resumen, Abstract, chapters, references and Anexo, removes the
    template's example content, writes the result to TEMPLATE_OUTPUT and
    copies the template's support-files folder next to it.
    """
    # Shared across all sections so table/figure numbering keeps counting.
    counters = _new_counters()

    print("Reading template...")
    html_content = read_file(TEMPLATE_INPUT)

    # Modify the Table of Tables TOC field to include TC entries with \f t identifier
    html_content = re.sub(
        r'(TOC\s+)(\\h\s+\\z\s+\\t\s*\n?\s*"Tablas;1")',
        r'\1\\f t \2',
        html_content
    )

    soup = BeautifulSoup(html_content, 'html.parser')

    print("Reading docs content...")
    doc_files = {
        'resumen': '00_resumen.md',
        'intro': '01_introduccion.md',
        'contexto': '02_contexto_estado_arte.md',
        'objetivos': '03_objetivos_metodologia.md',
        'desarrollo': '04_desarrollo_especifico.md',
        'conclusiones': '05_conclusiones_trabajo_futuro.md',
        'referencias': '06_referencias_bibliograficas.md',
        'anexo': '07_anexo_a.md',
    }
    docs = {
        key: read_file(os.path.join(DOCS_DIR, filename))
        for key, filename in doc_files.items()
    }

    # Extract resumen and abstract
    spanish_text, spanish_kw, english_text, english_kw = extract_resumen_parts(
        docs['resumen']
    )

    # Replace title
    print("Replacing title...")
    for elem in soup.find_all(string=re.compile(r'Título del TFE', re.IGNORECASE)):
        elem.replace_with(elem.replace('Título del TFE', THESIS_TITLE))

    # Replace Resumen section: drop everything up to the Abstract title
    print("Replacing Resumen...")
    resumen_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Resumen'))
    if resumen_title:
        stale = _siblings_until(
            resumen_title,
            lambda tag: 'Abstract' in tag.get_text() and _is_index_title(tag),
        )
        for node in stale:
            node.decompose()

        _insert_html_after(
            resumen_title,
            _keywords_block(spanish_text, 'Palabras clave:', spanish_kw, 'ES'),
        )
        print(" ✓ Replaced Resumen")

    # Replace Abstract section: drop everything up to the page break or the
    # next index/TOC paragraph
    print("Replacing Abstract...")
    abstract_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Abstract'))
    if abstract_title:
        def ends_abstract(tag):
            if tag.name == 'span' and 'page-break' in str(tag):
                return True
            return tag.name == 'p' and (
                'Ttulondices' in str(tag.get('class', []))
                or 'MsoToc' in str(tag.get('class', []))
            )

        for node in _siblings_until(abstract_title, ends_abstract):
            node.decompose()

        _insert_html_after(
            abstract_title,
            _keywords_block(english_text, 'Keywords:', english_kw, 'EN-US'),
        )
        print(" ✓ Replaced Abstract")

    # Remove "Importante" callout boxes (template instructions)
    print("Removing template instructions...")
    for div in soup.find_all('div'):
        text = div.get_text()
        if 'Importante:' in text and 'extensión mínima' in text:
            div.decompose()
            print(" ✓ Removed 'Importante' box")

    # Remove "Ejemplo de nota al pie" footnote: delete the enclosing <p>
    for elem in soup.find_all(string=re.compile(r'Ejemplo de nota al pie')):
        parent = elem.parent
        while parent and parent.name != 'p':
            parent = parent.parent
        if parent:
            parent.decompose()
            print(" ✓ Removed footnote example")

    # Clear old figure/table index entries
    print("Clearing old index entries...")
    for p in soup.find_all('p', class_='MsoTof'):
        # Text is captured once, before any removal, for both checks.
        text = p.get_text()
        for label, kind in (('Figura', 'figure'), ('Tabla', 'table')):
            if label in text and 'Ejemplo' in text:
                for a in p.find_all('a'):
                    a.decompose()
                for span in p.find_all(
                    'span', style=lambda x: x and 'mso-no-proof' in str(x)
                ):
                    if 'Ejemplo' in span.get_text():
                        span.decompose()
                print(f" ✓ Cleared {kind} index example entry")

    for p in soup.find_all('p', class_='MsoToc3'):
        text = p.get_text()
        if 'Figura 1. Ejemplo' in text or 'Tabla 1. Ejemplo' in text:
            p.decompose()
            print(" ✓ Removed template index entry")

    for p in soup.find_all('p', class_='Imagencentrada'):
        p.decompose()
        print(" ✓ Removed template figure placeholder")

    # Remove template table example (first match only), together with its
    # caption before it and its source line after it when present
    for table in soup.find_all('table', class_='MsoTableGrid'):
        text = table.get_text()
        if 'Celda 1' in text or 'Encabezado 1' in text:
            prev_sib = table.find_previous_sibling()
            next_sib = table.find_next_sibling()
            if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
                prev_sib.decompose()
            if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
                next_sib.decompose()
            table.decompose()
            print(" ✓ Removed template table example")
            break

    # Define chapters: (heading keyword, docs key, keyword of the next heading)
    chapters = [
        ('Introducción', 'intro', 'Contexto'),
        ('Contexto', 'contexto', 'Objetivos'),
        ('Objetivos', 'objetivos', 'Desarrollo'),
        ('Desarrollo', 'desarrollo', 'Conclusiones'),
        ('Conclusiones', 'conclusiones', 'Referencias'),
    ]

    print("Replacing chapter contents...")
    for chapter_keyword, doc_key, next_keyword in chapters:
        print(f" Processing: {chapter_keyword}")

        start_elem = find_section_element(soup, chapter_keyword)
        end_elem = find_section_element(soup, next_keyword)

        if start_elem and end_elem:
            remove_elements_between(start_elem, end_elem)
            new_content_html, counters = extract_section_content(
                docs[doc_key], counters=counters
            )
            _insert_html_after(start_elem, new_content_html)
            print(" ✓ Replaced content")
        else:
            if not start_elem:
                print(f" Warning: Could not find start element for {chapter_keyword}")
            if not end_elem:
                print(f" Warning: Could not find end element for {next_keyword}")

    # Handle Referencias
    print(" Processing: Referencias bibliográficas")
    refs_start = find_section_element(soup, 'Referencias')
    anexo_elem = find_section_element(soup, 'Anexo')

    if refs_start and anexo_elem:
        remove_elements_between(refs_start, anexo_elem)
        _insert_html_after(refs_start, format_references(docs['referencias']))
        print(" ✓ Replaced content")

    # Handle Anexo (last section): everything after its heading is replaced
    print(" Processing: Anexo")
    if anexo_elem and soup.find('body') is not None:
        current = anexo_elem.next_sibling
        # 'is not None' rather than truthiness: an empty string node is falsy
        # and must not end the walk early.
        while current is not None:
            # Grab the successor first; removal detaches `current`.
            following = current.next_sibling
            _discard(current)
            current = following

        anexo_content, counters = extract_section_content(
            docs['anexo'], is_anexo=True, counters=counters
        )
        _insert_html_after(anexo_elem, anexo_content)
        print(" ✓ Replaced content")

    print(f"\nSummary: {counters['table']} tables + {counters['anexo_table']} Anexo tables, {counters['figure']} figures + {counters['anexo_figure']} Anexo figures processed")

    print("Saving modified template...")
    # Make sure the output folder exists before writing into it.
    os.makedirs(os.path.dirname(TEMPLATE_OUTPUT), exist_ok=True)
    write_file(TEMPLATE_OUTPUT, str(soup))

    # Copy template support files (replacing any previous copy)
    support_files_src = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')
    support_files_dst = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual_files')
    if os.path.exists(support_files_src):
        if os.path.exists(support_files_dst):
            shutil.rmtree(support_files_dst)
        shutil.copytree(support_files_src, support_files_dst)
        print("✓ Copied template support files")

    print(f"✓ Done! Modified: {TEMPLATE_OUTPUT}")
    print("\nTo convert to DOCX:")
    print("1. Open the .htm file in Microsoft Word")
    print("2. Replace [Insertar diagrama Mermaid aquí] placeholders with actual diagrams")
    print("3. Update indices: Select all (Ctrl+A) then press F9 to update fields")
    print(" - This will regenerate: Índice de contenidos, Índice de figuras, Índice de tablas")
    print("4. Save as .docx")


if __name__ == '__main__':
    main()