#!/usr/bin/env python3
"""Replace template content with thesis content from docs/ folder using BeautifulSoup.

This module orchestrates the conversion of markdown documentation to UNIR's
Word template format. Content handling is delegated to:
- markdown_utils.py: Utility functions for markdown parsing
- content_handlers.py: Block-level content handlers (tables, figures, lists, etc.)
"""

import re
import os
import shutil
from bs4 import BeautifulSoup, NavigableString

from markdown_utils import (
    read_file,
    write_file,
    md_to_html_para,
    convert_latex_formulas,
    is_source_line,
    is_leyenda_line,
    split_into_paragraphs,
    SOURCE_LINE_RE,
)
from content_handlers import (
    handle_mermaid_diagram,
    handle_code_block,
    handle_header,
    handle_table,
    handle_blockquote,
    handle_bullet_list,
    handle_numbered_list,
)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
DOCS_DIR = os.path.join(BASE_DIR, 'docs')

def parse_md_to_html_blocks(md_content, is_anexo=False, counters=None):
    """Convert markdown content to HTML blocks with template styles.

    Args:
        md_content: Markdown content string
        is_anexo: Boolean indicating if processing Anexo section
        counters: Dict with table/figure counters. If None, creates new one.

    Returns:
        Tuple of (html_string, counters) where counters is the updated dict
    """
    if counters is None:
        counters = {
            'table': 0,
            'figure': 0,
            'anexo_table': 0,
            'anexo_figure': 0,
            'global_figure': 0,
        }

    html_blocks = []
    lines = md_content.split('\n')
    i = 0

    while i < len(lines):
        line = lines[i]

        # Skip empty lines
        if not line.strip():
            i += 1
            continue

        # Mermaid diagram - convert to figure with actual image
        if line.strip().startswith('```mermaid'):
            blocks, i = handle_mermaid_diagram(lines, i, counters, is_anexo)
            html_blocks.extend(blocks)
            continue

        # Code block (non-mermaid)
        if line.strip().startswith('```'):
            blocks, i = handle_code_block(lines, i)
            html_blocks.extend(blocks)
            continue

        # Headers
        if line.startswith('#'):
            header_html = handle_header(line, is_anexo)
            if header_html is not None:
                html_blocks.append(header_html)
            i += 1
            continue

        # Table
        if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
            blocks, i = handle_table(lines, i, counters, is_anexo)
            html_blocks.extend(blocks)
            continue

        # Blockquote
        if line.startswith('>'):
            blocks, i = handle_blockquote(lines, i)
            html_blocks.extend(blocks)
            continue

        # Bullet list
        if re.match(r'^[\-\*\+]\s', line):
            blocks, i = handle_bullet_list(lines, i)
            html_blocks.extend(blocks)
            continue

        # Numbered list
        if re.match(r'^\d+\.\s', line):
            blocks, i = handle_numbered_list(lines, i)
            html_blocks.extend(blocks)
            continue

        # Skip lines that are just table/figure titles
        if line.strip().startswith('**Tabla') or line.strip().startswith('*Tabla'):
            i += 1
            continue
        if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
            i += 1
            continue
        if is_source_line(line):
            i += 1
            continue
        if is_leyenda_line(line):
            i += 1
            continue

        # Regular paragraph
        para_lines = [line]
        i += 1
        while (
            i < len(lines)
            and lines[i].strip()
            and not lines[i].startswith('#')
            and not lines[i].startswith('```')
            and not lines[i].startswith('>')
            and not re.match(r'^[\-\*\+]\s', lines[i])
            and not re.match(r'^\d+\.\s', lines[i])
            and '|' not in lines[i]
        ):
            para_lines.append(lines[i])
            i += 1

        para_text = ' '.join(para_lines)
        para_text = convert_latex_formulas(para_text)
        # Check if paragraph contains MathML (already wrapped)
        if '<math' in para_text:
            html_blocks.append(para_text)
        else:
            html_blocks.append(f'<p class=MsoNormal><span lang=ES>{md_to_html_para(para_text)}</span></p>')

    return '\n\n'.join(html_blocks), counters
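
# The multi-line block handlers share one contract, inferred from the call
# sites above: they take `lines` and the current index, consume one block, and
# return (html_blocks, next_index), so the dispatch loop can `continue`
# without advancing `i` itself. A minimal sketch of what a conforming handler
# could look like (illustrative only; the real implementations live in
# content_handlers.py):
#
#     def handle_blockquote(lines, i):
#         quoted = []
#         while i < len(lines) and lines[i].startswith('>'):
#             quoted.append(lines[i].lstrip('> '))
#             i += 1
#         html = f'<p class=MsoNormal><span lang=ES>{" ".join(quoted)}</span></p>'
#         return [html], i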


def extract_section_content(md_content, is_anexo=False, counters=None):
    """Extract content from markdown, skipping the first # header.

    Args:
        md_content: Markdown content string
        is_anexo: Boolean indicating if processing Anexo section
        counters: Dict with table/figure counters

    Returns:
        Tuple of (html_string, counters)
    """
    md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
    return parse_md_to_html_blocks(md_content, is_anexo=is_anexo, counters=counters)
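
# Worked example of the header strip above (illustrative input): given
#   "# 1. Introducción\n\nEl problema..."
# the re.sub removes only the "# 1. Introducción" line, because the chapter
# heading already exists in the template and is located via
# find_section_element() rather than regenerated from the markdown.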


def find_section_element(soup, keyword):
    """Find element containing keyword (h1 or special paragraph classes)."""
    # First try h1
    for h1 in soup.find_all('h1'):
        text = h1.get_text()
        if keyword.lower() in text.lower():
            return h1

    # Try special paragraph classes for unnumbered sections
    for p in soup.find_all('p', class_=['Ttulo1sinnumerar', 'Anexo', 'MsoNormal']):
        text = p.get_text()
        if keyword.lower() in text.lower():
            classes = p.get('class', [])
            if 'Ttulo1sinnumerar' in classes or 'Anexo' in classes:
                return p
            if re.match(r'^\d+\.?\s', text.strip()):
                return p
    return None
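
# Note on the MsoNormal branch above: a plain MsoNormal paragraph only counts
# as a section heading when its text starts with a chapter number ("1. ",
# "2 ", etc., per the regex), so body text that merely mentions the keyword
# is not mistaken for a section boundary.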


def remove_elements_between(start_elem, end_elem):
    """Remove all elements between start and end (exclusive)."""
    current = start_elem.next_sibling
    elements_to_remove = []
    while current and current != end_elem:
        elements_to_remove.append(current)
        current = current.next_sibling
    for elem in elements_to_remove:
        if hasattr(elem, 'decompose'):
            elem.decompose()
        elif isinstance(elem, NavigableString):
            elem.extract()


def format_references(refs_content):
    """Format references with proper MsoBibliography style."""
    refs_content = refs_content.replace('# Referencias bibliográficas {.unnumbered}', '').strip()
    refs_html = ''

    for line in refs_content.split('\n\n'):
        line = line.strip()
        if not line:
            continue

        # Apply markdown formatting
        formatted = md_to_html_para(line)

        # Use MsoBibliography style with hanging indent
        refs_html += f'''<p class=MsoBibliography style="margin-left:36.0pt;text-indent:-36.0pt"><span lang=ES>{formatted}</span></p>\n'''

    return refs_html
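
# Illustrative input/output (hypothetical reference text): an entry such as
#   García, J. (2023). *Título del libro*. Editorial.
# becomes one <p class=MsoBibliography> paragraph with a 36pt hanging indent
# (Word's bibliography layout); md_to_html_para() presumably converts the
# markdown emphasis markers to inline HTML.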


def extract_resumen_parts(resumen_content):
    """Extract Spanish resumen and English abstract from 00_resumen.md"""
    parts = resumen_content.split('---')

    spanish_part = parts[0] if len(parts) > 0 else ''
    english_part = parts[1] if len(parts) > 1 else ''

    # Extract Spanish content
    spanish_text = ''
    spanish_keywords = ''
    if '**Palabras clave:**' in spanish_part:
        text_part, kw_part = spanish_part.split('**Palabras clave:**')
        spanish_text = split_into_paragraphs(text_part.replace('# Resumen', '').strip(), 'ES')
        spanish_keywords = md_to_html_para(kw_part.strip())
    else:
        spanish_text = split_into_paragraphs(spanish_part.replace('# Resumen', '').strip(), 'ES')

    # Extract English content
    english_text = ''
    english_keywords = ''
    if '**Keywords:**' in english_part:
        text_part, kw_part = english_part.split('**Keywords:**')
        english_text = split_into_paragraphs(text_part.replace('# Abstract', '').strip(), 'EN-US')
        english_keywords = md_to_html_para(kw_part.strip())
    else:
        english_text = split_into_paragraphs(english_part.replace('# Abstract', '').strip(), 'EN-US')

    return spanish_text, spanish_keywords, english_text, english_keywords
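
# Expected shape of docs/00_resumen.md, inferred from the split markers above:
#
#   # Resumen
#   ...párrafos en español...
#   **Palabras clave:** ...
#   ---
#   # Abstract
#   ...English paragraphs...
#   **Keywords:** ...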


def main():
    # Initialize counters dict (replaces global counters)
    counters = {
        'table': 0,
        'figure': 0,
        'anexo_table': 0,
        'anexo_figure': 0,
        'global_figure': 0,
    }
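
    # The anexo_* keys keep annex numbering separate from the main chapters;
    # 'global_figure' presumably tracks a document-wide figure id (an
    # inference; the counters are consumed inside content_handlers.py).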

    print("Reading template...")
    html_content = read_file(TEMPLATE_INPUT)

    # Modify the Table of Tables TOC field to include TC entries with \f t identifier
    html_content = re.sub(
        r'(TOC\s+)(\\h\s+\\z\s+\\t\s*\n?\s*"Tablas;1")',
        r'\1\\f t \2',
        html_content
    )
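
    # Word field background for the edit above (general Word behavior, not
    # specific to this repo): the TOC \f switch collects TC entries, and
    # "\f t" restricts the index to TC fields tagged with identifier "t",
    # which is how generated table captions can feed the table index.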

    soup = BeautifulSoup(html_content, 'html.parser')

    print("Reading docs content...")
    docs = {
        'resumen': read_file(os.path.join(DOCS_DIR, '00_resumen.md')),
        'intro': read_file(os.path.join(DOCS_DIR, '01_introduccion.md')),
        'contexto': read_file(os.path.join(DOCS_DIR, '02_contexto_estado_arte.md')),
        'objetivos': read_file(os.path.join(DOCS_DIR, '03_objetivos_metodologia.md')),
        'desarrollo': read_file(os.path.join(DOCS_DIR, '04_desarrollo_especifico.md')),
        'conclusiones': read_file(os.path.join(DOCS_DIR, '05_conclusiones_trabajo_futuro.md')),
        'referencias': read_file(os.path.join(DOCS_DIR, '06_referencias_bibliograficas.md')),
        'anexo': read_file(os.path.join(DOCS_DIR, '07_anexo_a.md')),
    }

    # Extract resumen and abstract
    spanish_text, spanish_kw, english_text, english_kw = extract_resumen_parts(docs['resumen'])

    # Replace title
    print("Replacing title...")
    for elem in soup.find_all(string=re.compile(r'Título del TFE', re.IGNORECASE)):
        elem.replace_with(elem.replace('Título del TFE', 'Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español'))

    # Replace Resumen section
    print("Replacing Resumen...")
    resumen_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Resumen'))
    if resumen_title:
        current = resumen_title.find_next_sibling()
        elements_to_remove = []
        while current:
            text = current.get_text() if hasattr(current, 'get_text') else str(current)
            if 'Abstract' in text and current.name == 'p' and 'Ttulondices' in str(current.get('class', [])):
                break
            elements_to_remove.append(current)
            current = current.find_next_sibling()

        for elem in elements_to_remove:
            if hasattr(elem, 'decompose'):
                elem.decompose()

        resumen_html = f'''{spanish_text}
<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>
<p class=MsoNormal><b><span lang=ES>Palabras clave:</span></b><span lang=ES> {spanish_kw}</span></p>
<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>'''
        resumen_soup = BeautifulSoup(resumen_html, 'html.parser')
        insert_point = resumen_title
        for new_elem in reversed(list(resumen_soup.children)):
            insert_point.insert_after(new_elem)
        print(" ✓ Replaced Resumen")

    # Replace Abstract section
    print("Replacing Abstract...")
    abstract_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Abstract'))
    if abstract_title:
        current = abstract_title.find_next_sibling()
        elements_to_remove = []
        while current:
            if current.name == 'span' and 'page-break' in str(current):
                break
            if current.name == 'p' and ('Ttulondices' in str(current.get('class', [])) or 'MsoToc' in str(current.get('class', []))):
                break
            elements_to_remove.append(current)
            current = current.find_next_sibling()

        for elem in elements_to_remove:
            if hasattr(elem, 'decompose'):
                elem.decompose()

        abstract_html = f'''{english_text}
<p class=MsoNormal><span lang=EN-US><o:p> </o:p></span></p>
<p class=MsoNormal><b><span lang=EN-US>Keywords:</span></b><span lang=EN-US> {english_kw}</span></p>
<p class=MsoNormal><span lang=EN-US><o:p> </o:p></span></p>'''
        abstract_soup = BeautifulSoup(abstract_html, 'html.parser')
        insert_point = abstract_title
        for new_elem in reversed(list(abstract_soup.children)):
            insert_point.insert_after(new_elem)
        print(" ✓ Replaced Abstract")

    # Remove "Importante" callout boxes (template instructions)
    print("Removing template instructions...")
    for div in soup.find_all('div'):
        text = div.get_text()
        if 'Importante:' in text and 'extensión mínima' in text:
            div.decompose()
            print(" ✓ Removed 'Importante' box")

    # Remove "Ejemplo de nota al pie" footnote
    for elem in soup.find_all(string=re.compile(r'Ejemplo de nota al pie')):
        parent = elem.parent
        if parent:
            while parent and parent.name != 'p':
                parent = parent.parent
            if parent:
                parent.decompose()
                print(" ✓ Removed footnote example")

    # Clear old figure/table index entries
    print("Clearing old index entries...")

    for p in soup.find_all('p', class_='MsoTof'):
        text = p.get_text()
        if 'Figura' in text and 'Ejemplo' in text:
            for a in p.find_all('a'):
                a.decompose()
            for span in p.find_all('span', style=lambda x: x and 'mso-no-proof' in str(x)):
                if 'Ejemplo' in span.get_text():
                    span.decompose()
            print(" ✓ Cleared figure index example entry")
        if 'Tabla' in text and 'Ejemplo' in text:
            for a in p.find_all('a'):
                a.decompose()
            for span in p.find_all('span', style=lambda x: x and 'mso-no-proof' in str(x)):
                if 'Ejemplo' in span.get_text():
                    span.decompose()
            print(" ✓ Cleared table index example entry")

    for p in soup.find_all('p', class_='MsoToc3'):
        text = p.get_text()
        if 'Figura 1. Ejemplo' in text or 'Tabla 1. Ejemplo' in text:
            p.decompose()
            print(" ✓ Removed template index entry")

    for p in soup.find_all('p', class_='Imagencentrada'):
        p.decompose()
        print(" ✓ Removed template figure placeholder")

    # Remove template table example
    for table in soup.find_all('table', class_='MsoTableGrid'):
        text = table.get_text()
        if 'Celda 1' in text or 'Encabezado 1' in text:
            prev_sib = table.find_previous_sibling()
            next_sib = table.find_next_sibling()
            if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
                prev_sib.decompose()
            if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
                next_sib.decompose()
            table.decompose()
            print(" ✓ Removed template table example")
            break

    # Define chapters
    chapters = [
        ('Introducción', 'intro', 'Contexto'),
        ('Contexto', 'contexto', 'Objetivos'),
        ('Objetivos', 'objetivos', 'Desarrollo'),
        ('Desarrollo', 'desarrollo', 'Conclusiones'),
        ('Conclusiones', 'conclusiones', 'Referencias'),
    ]
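
    # Each tuple reads (keyword of this chapter's heading, docs key, keyword
    # of the next heading); the next heading serves only as the deletion
    # boundary when clearing the template's placeholder content below.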

    print("Replacing chapter contents...")
    for chapter_keyword, doc_key, next_keyword in chapters:
        print(f" Processing: {chapter_keyword}")

        start_elem = find_section_element(soup, chapter_keyword)
        end_elem = find_section_element(soup, next_keyword)

        if start_elem and end_elem:
            remove_elements_between(start_elem, end_elem)
            new_content_html, counters = extract_section_content(docs[doc_key], counters=counters)
            new_soup = BeautifulSoup(new_content_html, 'html.parser')
            insert_point = start_elem
            for new_elem in reversed(list(new_soup.children)):
                insert_point.insert_after(new_elem)
            print(" ✓ Replaced content")
        else:
            if not start_elem:
                print(f" Warning: Could not find start element for {chapter_keyword}")
            if not end_elem:
                print(f" Warning: Could not find end element for {next_keyword}")

    # Handle Referencias
    print(" Processing: Referencias bibliográficas")
    refs_start = find_section_element(soup, 'Referencias')
    anexo_elem = find_section_element(soup, 'Anexo')

    if refs_start and anexo_elem:
        remove_elements_between(refs_start, anexo_elem)
        refs_html = format_references(docs['referencias'])
        refs_soup = BeautifulSoup(refs_html, 'html.parser')
        insert_point = refs_start
        for new_elem in reversed(list(refs_soup.children)):
            insert_point.insert_after(new_elem)
        print(" ✓ Replaced content")

    # Handle Anexo (last section)
    print(" Processing: Anexo")
    if anexo_elem:
        body = soup.find('body')
        if body:
            current = anexo_elem.next_sibling
            while current:
                next_elem = current.next_sibling
                if hasattr(current, 'decompose'):
                    current.decompose()
                elif isinstance(current, NavigableString):
                    current.extract()
                current = next_elem

        anexo_content, counters = extract_section_content(docs['anexo'], is_anexo=True, counters=counters)
        anexo_soup = BeautifulSoup(anexo_content, 'html.parser')
        insert_point = anexo_elem
        for new_elem in reversed(list(anexo_soup.children)):
            insert_point.insert_after(new_elem)
        print(" ✓ Replaced content")

    print(f"\nSummary: {counters['table']} tables + {counters['anexo_table']} Anexo tables, {counters['figure']} figures + {counters['anexo_figure']} Anexo figures processed")

    print("Saving modified template...")
    output_html = str(soup)
    write_file(TEMPLATE_OUTPUT, output_html)

    # Copy template support files
    support_files_src = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')
    support_files_dst = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual_files')
    if os.path.exists(support_files_src):
        if os.path.exists(support_files_dst):
            shutil.rmtree(support_files_dst)
        shutil.copytree(support_files_src, support_files_dst)
        print("✓ Copied template support files")

    print(f"✓ Done! Modified: {TEMPLATE_OUTPUT}")
    print("\nTo convert to DOCX:")
    print("1. Open the .htm file in Microsoft Word")
    print("2. Replace [Insertar diagrama Mermaid aquí] placeholders with actual diagrams")
    print("3. Update indices: Select all (Ctrl+A) then press F9 to update fields")
    print(" - This will regenerate: Índice de contenidos, Índice de figuras, Índice de tablas")
    print("4. Save as .docx")


if __name__ == '__main__':
    main()