#!/usr/bin/env python3
"""Replace template content with thesis content from docs/ folder using BeautifulSoup.

This module orchestrates the conversion of markdown documentation to UNIR's Word
template format. Content handling is delegated to:
- markdown_utils.py: Utility functions for markdown parsing
- content_handlers.py: Block-level content handlers (tables, figures, lists, etc.)
"""
import re
import os
import shutil
from bs4 import BeautifulSoup, NavigableString
from markdown_utils import (
    read_file,
    write_file,
    md_to_html_para,
    convert_latex_formulas,
    is_source_line,
    is_leyenda_line,
    split_into_paragraphs,
    SOURCE_LINE_RE,
)
from content_handlers import (
    handle_mermaid_diagram,
    handle_code_block,
    handle_header,
    handle_table,
    handle_blockquote,
    handle_bullet_list,
    handle_numbered_list,
)

# All paths are resolved relative to this script's own directory, so the
# script works regardless of the caller's CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
DOCS_DIR = os.path.join(BASE_DIR, 'docs')


def parse_md_to_html_blocks(md_content, is_anexo=False, counters=None):
    """Convert markdown content to HTML blocks with template styles.

    Walks the markdown line by line and dispatches each structural element
    (mermaid diagram, fenced code, header, table, blockquote, list) to its
    handler in content_handlers; remaining runs of non-empty lines are
    joined into regular paragraphs.

    Args:
        md_content: Markdown content string
        is_anexo: Boolean indicating if processing Anexo section
        counters: Dict with table/figure counters. If None, creates new one.

    Returns:
        Tuple of (html_string, counters) where counters is the updated dict
    """
    if counters is None:
        counters = {
            'table': 0,
            'figure': 0,
            'anexo_table': 0,
            'anexo_figure': 0,
            'global_figure': 0,
        }
    html_blocks = []
    lines = md_content.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i]
        # Skip empty lines
        if not line.strip():
            i += 1
            continue
        # Mermaid diagram - convert to figure with actual image
        if line.strip().startswith('```mermaid'):
            blocks, i = handle_mermaid_diagram(lines, i, counters, is_anexo)
            html_blocks.extend(blocks)
            continue
        # Code block (non-mermaid).  Must be checked AFTER the mermaid case
        # because both fences start with ``` .
        if line.strip().startswith('```'):
            blocks, i = handle_code_block(lines, i)
            html_blocks.extend(blocks)
            continue
        # Headers
        if line.startswith('#'):
            header_html = handle_header(line, is_anexo)
            if header_html is not None:
                html_blocks.append(header_html)
            i += 1
            continue
        # Table: a pipe row followed by a --- separator row.
        if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
            blocks, i = handle_table(lines, i, counters, is_anexo)
            html_blocks.extend(blocks)
            continue
        # Blockquote
        if line.startswith('>'):
            blocks, i = handle_blockquote(lines, i)
            html_blocks.extend(blocks)
            continue
        # Bullet list
        if re.match(r'^[\-\*\+]\s', line):
            blocks, i = handle_bullet_list(lines, i)
            html_blocks.extend(blocks)
            continue
        # Numbered list
        if re.match(r'^\d+\.\s', line):
            blocks, i = handle_numbered_list(lines, i)
            html_blocks.extend(blocks)
            continue
        # Skip lines that are just table/figure titles (the handlers emit
        # their own captions, so these would duplicate them).
        if line.strip().startswith('**Tabla') or line.strip().startswith('*Tabla'):
            i += 1
            continue
        if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
            i += 1
            continue
        if is_source_line(line):
            i += 1
            continue
        if is_leyenda_line(line):
            i += 1
            continue
        # Regular paragraph: greedily absorb following lines until the next
        # blank line or structural element.
        para_lines = [line]
        i += 1
        while i < len(lines) and lines[i].strip() and not lines[i].startswith('#') and not lines[i].startswith('```') and not lines[i].startswith('>') and not re.match(r'^[\-\*\+]\s', lines[i]) and not re.match(r'^\d+\.\s', lines[i]) and '|' not in lines[i]:
            para_lines.append(lines[i])
            i += 1
        para_text = ' '.join(para_lines)
        para_text = convert_latex_formulas(para_text)
        # Check if paragraph contains MathML (already wrapped)
        # NOTE(review): content lost in extraction — the condition string and
        # the paragraph-emitting branch between the two fragments below were
        # stripped (they contained HTML/MathML markup).  Restore this region
        # from version control; do not guess at the lost literals.
        if '
')
    return '\n\n'.join(html_blocks), counters


def extract_section_content(md_content, is_anexo=False, counters=None):
    """Extract content from markdown, skipping the first # header.

    Args:
        md_content: Markdown content string
        is_anexo: Boolean indicating if processing Anexo section
        counters: Dict with table/figure counters

    Returns:
        Tuple of (html_string, counters)
    """
    # Drop only the first level-1 heading; the template supplies the section
    # title itself.
    md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
    return parse_md_to_html_blocks(md_content, is_anexo=is_anexo, counters=counters)


def find_section_element(soup, keyword):
    """Find element containing keyword (h1 or special paragraph classes).

    Returns the first matching element, or None if the keyword is not found.
    Matching is case-insensitive substring containment.
    """
    # First try h1
    for h1 in soup.find_all('h1'):
        text = h1.get_text()
        if keyword.lower() in text.lower():
            return h1
    # Try special paragraph classes for unnumbered sections
    for p in soup.find_all('p', class_=['Ttulo1sinnumerar', 'Anexo', 'MsoNormal']):
        text = p.get_text()
        if keyword.lower() in text.lower():
            classes = p.get('class', [])
            if 'Ttulo1sinnumerar' in classes or 'Anexo' in classes:
                return p
            # MsoNormal paragraphs only count when they look like a numbered
            # section heading (e.g. "3. ..."), to avoid matching body text.
            if re.match(r'^\d+\.?\s', text.strip()):
                return p
    return None


def remove_elements_between(start_elem, end_elem):
    """Remove all elements between start and end (exclusive).

    Collects siblings first, then removes, so the sibling chain is not
    mutated while it is being traversed.  Tag elements are decompose()d;
    bare text nodes are extract()ed.
    """
    current = start_elem.next_sibling
    elements_to_remove = []
    while current and current != end_elem:
        elements_to_remove.append(current)
        current = current.next_sibling
    for elem in elements_to_remove:
        if hasattr(elem, 'decompose'):
            elem.decompose()
        elif isinstance(elem, NavigableString):
            elem.extract()


def format_references(refs_content):
    """Format references with proper MsoBibliography style."""
    refs_content = refs_content.replace('# Referencias bibliográficas {.unnumbered}', '').strip()
    refs_html = ''
    # Each reference entry is a blank-line-separated paragraph.
    for line in refs_content.split('\n\n'):
        line = line.strip()
        if not line:
            continue
        # Apply markdown formatting
        formatted = md_to_html_para(line)
        # Use MsoBibliography style with hanging indent
        # NOTE(review): the HTML wrapper inside this f-string was lost in
        # extraction (markup-like content stripped); the surviving fragments
        # are reproduced verbatim — restore from version control.
        refs_html += f'''{formatted}
\n'''
    return refs_html


def extract_resumen_parts(resumen_content):
    """Extract Spanish resumen and English abstract from 00_resumen.md.

    The file is expected to hold the Spanish part and the English part
    separated by a '---' divider; keyword lines are split off when their
    bold markers are present.

    Returns:
        Tuple of (spanish_text, spanish_keywords, english_text,
        english_keywords), each already converted to HTML.
    """
    parts = resumen_content.split('---')
    spanish_part = parts[0] if len(parts) > 0 else ''
    english_part = parts[1] if len(parts) > 1 else ''
    # Extract Spanish content
    spanish_text = ''
    spanish_keywords = ''
    if '**Palabras clave:**' in spanish_part:
        text_part, kw_part = spanish_part.split('**Palabras clave:**')
        spanish_text = split_into_paragraphs(text_part.replace('# Resumen', '').strip(), 'ES')
        spanish_keywords = md_to_html_para(kw_part.strip())
    else:
        spanish_text = split_into_paragraphs(spanish_part.replace('# Resumen', '').strip(), 'ES')
    # Extract English content
    english_text = ''
    english_keywords = ''
    if '**Keywords:**' in english_part:
        text_part, kw_part = english_part.split('**Keywords:**')
        english_text = split_into_paragraphs(text_part.replace('# Abstract', '').strip(), 'EN-US')
        english_keywords = md_to_html_para(kw_part.strip())
    else:
        english_text = split_into_paragraphs(english_part.replace('# Abstract', '').strip(), 'EN-US')
    return spanish_text, spanish_keywords, english_text, english_keywords


def main():
    # Initialize counters dict (replaces global counters)
    counters = {
        'table': 0,
        'figure': 0,
        'anexo_table': 0,
        'anexo_figure': 0,
        'global_figure': 0,
    }
    print("Reading template...")
    html_content = read_file(TEMPLATE_INPUT)
    # Modify the Table of Tables TOC field to include TC entries with \f t identifier
    html_content = re.sub(
        r'(TOC\s+)(\\h\s+\\z\s+\\t\s*\n?\s*"Tablas;1")',
        r'\1\\f t \2',
        html_content
    )
    soup = BeautifulSoup(html_content, 'html.parser')
    print("Reading docs content...")
    docs = {
        'resumen': read_file(os.path.join(DOCS_DIR, '00_resumen.md')),
        'intro': read_file(os.path.join(DOCS_DIR, '01_introduccion.md')),
        'contexto': read_file(os.path.join(DOCS_DIR, '02_contexto_estado_arte.md')),
        'objetivos': read_file(os.path.join(DOCS_DIR, '03_objetivos_metodologia.md')),
        'desarrollo': read_file(os.path.join(DOCS_DIR, '04_desarrollo_especifico.md')),
        'conclusiones': read_file(os.path.join(DOCS_DIR, '05_conclusiones_trabajo_futuro.md')),
        'referencias': read_file(os.path.join(DOCS_DIR, '06_referencias_bibliograficas.md')),
        'anexo': read_file(os.path.join(DOCS_DIR, '07_anexo_a.md')),
    }
    # Extract resumen and abstract
    spanish_text, spanish_kw, english_text, english_kw = extract_resumen_parts(docs['resumen'])
    # Replace title
    print("Replacing title...")
    for elem in soup.find_all(string=re.compile(r'Título del TFE', re.IGNORECASE)):
        elem.replace_with(elem.replace('Título del TFE', 'Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español'))
    # Replace Resumen section
    print("Replacing Resumen...")
    resumen_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Resumen'))
    if resumen_title:
        # Delete everything between the Resumen heading and the Abstract
        # heading, then rebuild the section content.
        current = resumen_title.find_next_sibling()
        elements_to_remove = []
        while current:
            text = current.get_text() if hasattr(current, 'get_text') else str(current)
            if 'Abstract' in text and current.name == 'p' and 'Ttulondices' in str(current.get('class', [])):
                break
            elements_to_remove.append(current)
            current = current.find_next_sibling()
        for elem in elements_to_remove:
            if hasattr(elem, 'decompose'):
                elem.decompose()
        # NOTE(review): the HTML markup inside this f-string was lost in
        # extraction; the surviving fragments are reproduced verbatim.  The
        # remainder of main() lies beyond this chunk — restore both from
        # version control before relying on this section.
        resumen_html = f'''{spanish_text}Palabras clave: {spanish_kw}
Keywords: {english_kw}