MastersThesis/generate_thesis_docx.py

#!/usr/bin/env python3
"""
Generate thesis.docx from markdown files using UNIR template.
"""

import re
import os
from docx import Document
from docx.shared import Pt, Cm, RGBColor, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn
from docx.oxml import OxmlElement

# File locations. All of them are relative paths, so they are resolved
# against the current working directory at run time.
TEMPLATE_PATH = "instructions/plantilla_individual.docx"  # template opened with Document()
OUTPUT_PATH = "TFM_Sergio_Jimenez_OCR_Optimization.docx"  # file the result is saved to
DOCS_PATH = "docs"  # directory holding the numbered markdown chapters

# Thesis metadata. Only THESIS_TITLE is referenced by the code visible in
# this file; the remaining values are plain module constants.
THESIS_TITLE = (
    "Optimización de Hiperparámetros OCR con Ray Tune "
    "para Documentos Académicos en Español"
)
AUTHOR = "Sergio Jiménez Jiménez"
DIRECTOR = "[Nombre del Director]"
DATE = "2025"


def read_markdown_file(filepath):
    """Return the full text of the UTF-8 encoded file at *filepath*.

    Args:
        filepath: Path of the file to read.

    Returns:
        The decoded file content as a single string.
    """
    with open(filepath, encoding='utf-8') as handle:
        text = handle.read()
    return text


_BULLET_ITEM_RE = re.compile(r'^[\-\*\+]\s')
_NUMBERED_ITEM_RE = re.compile(r'^\d+\.\s')
# Thematic break: three or more of the same marker (-, * or _), optionally
# separated by whitespace, with nothing else on the (stripped) line.
_HORIZONTAL_RULE_RE = re.compile(r'^([-*_])(?:\s*\1){2,}$')


def parse_markdown_blocks(md_content):
    """Split markdown text into a flat list of block dictionaries.

    The parser is line based and recognises, in this order of precedence:
    fenced code blocks, horizontal rules, headers, tables, blockquotes,
    lists, figure captions and plain paragraphs.

    Args:
        md_content: Markdown source as a single string.

    Returns:
        A list of dicts, each with a ``'type'`` key:

        * ``code``: ``lang`` (text after the opening fence) and ``content``.
        * ``header``: ``level`` (number of leading ``#``) and ``content``
          (with a trailing ``{.unnumbered}`` marker removed).
        * ``table``: ``content`` is the list of raw table lines.
        * ``quote``: ``content`` is the quote text joined with spaces.
        * ``list``: ``list_type`` (``'bullet'`` or ``'numbered'``, decided by
          the first item) and ``items`` (marker stripped).
        * ``caption``: ``content`` with surrounding ``*`` removed.
        * ``paragraph``: ``content`` is the lines joined with spaces.

        Standalone horizontal rules (``---``, ``***``, ``___``) produce no
        block at all.
    """
    blocks = []
    lines = md_content.split('\n')
    total = len(lines)
    i = 0

    while i < total:
        line = lines[i]
        stripped = line.strip()

        # Blank lines only separate blocks.
        if not stripped:
            i += 1
            continue

        # Fenced code block: everything up to the next fence is kept verbatim.
        if stripped.startswith('```'):
            lang = stripped[3:]
            code_lines = []
            i += 1
            while i < total and not lines[i].strip().startswith('```'):
                code_lines.append(lines[i])
                i += 1
            blocks.append({'type': 'code', 'lang': lang, 'content': '\n'.join(code_lines)})
            # Step over the closing fence (harmless when the fence is missing).
            i += 1
            continue

        # Horizontal rule: purely visual in markdown, so it must not leak into
        # the output as a literal "---" paragraph or a bogus "* *" list item.
        if _HORIZONTAL_RULE_RE.match(stripped):
            i += 1
            continue

        # Headers
        if line.startswith('#'):
            text = line.lstrip('#')
            level = len(line) - len(text)
            text = text.strip()
            # Remove {.unnumbered} suffix
            text = re.sub(r'\s*\{\.unnumbered\}\s*$', '', text)
            blocks.append({'type': 'header', 'level': level, 'content': text})
            i += 1
            continue

        # Table: a line with a pipe followed by a separator-looking line.
        if '|' in line and i + 1 < total and '---' in lines[i + 1]:
            table_lines = [line]
            i += 1
            while i < total and '|' in lines[i]:
                table_lines.append(lines[i])
                i += 1
            blocks.append({'type': 'table', 'content': table_lines})
            continue

        # Blockquote: consecutive '>' lines are merged into one text.
        if line.startswith('>'):
            quote_parts = [line[1:].strip()]
            i += 1
            while i < total and lines[i].startswith('>'):
                quote_parts.append(lines[i][1:].strip())
                i += 1
            blocks.append({'type': 'quote', 'content': ' '.join(quote_parts)})
            continue

        # List (bullet or numbered); the first item decides the list type.
        starts_numbered = bool(_NUMBERED_ITEM_RE.match(line))
        if starts_numbered or _BULLET_ITEM_RE.match(line):
            list_items = []
            while i < total:
                current = lines[i]
                if _BULLET_ITEM_RE.match(current):
                    list_items.append(current[2:].strip())
                elif _NUMBERED_ITEM_RE.match(current):
                    list_items.append(re.sub(r'^\d+\.\s*', '', current).strip())
                else:
                    # A blank line or any non-item line ends the list.
                    break
                i += 1
            blocks.append({
                'type': 'list',
                'list_type': 'numbered' if starts_numbered else 'bullet',
                'items': list_items,
            })
            continue

        # Figure caption (text starting with *Figura or Figura)
        if stripped.startswith('*Figura') or stripped.startswith('Figura'):
            blocks.append({'type': 'caption', 'content': stripped.strip('*')})
            i += 1
            continue

        # Regular paragraph: absorb following lines until another block starts.
        para_lines = [line]
        i += 1
        while i < total:
            nxt = lines[i]
            if (
                not nxt.strip()
                or nxt.startswith('#')
                # Same (indent-tolerant) fence test as the block detection
                # above, so an indented fence is not swallowed as prose.
                or nxt.strip().startswith('```')
                or nxt.startswith('>')
                or _BULLET_ITEM_RE.match(nxt)
                or _NUMBERED_ITEM_RE.match(nxt)
                or '|' in nxt
            ):
                break
            para_lines.append(nxt)
            i += 1

        blocks.append({'type': 'paragraph', 'content': ' '.join(para_lines)})

    return blocks


def add_formatted_text(paragraph, text):
    """Append *text* to *paragraph*, translating inline markdown into runs.

    ``**bold**`` becomes a bold run, ``*italic*`` an italic run and
    `` `code` `` a run in 10 pt Consolas; everything else is added as plain
    runs. Markers are removed from the emitted text.

    Args:
        paragraph: Object exposing ``add_run(text)`` (a python-docx paragraph).
        text: Source text that may contain inline markdown markers.
    """
    inline_token = r'(\*\*[^*]+\*\*|\*[^*]+\*|`[^`]+`)'

    # re.split with a capture group keeps the matched tokens in the result;
    # empty strings between adjacent tokens are dropped.
    for chunk in filter(None, re.split(inline_token, text)):
        if chunk.startswith('**') and chunk.endswith('**'):
            paragraph.add_run(chunk[2:-2]).bold = True
            continue

        if chunk.startswith('*') and chunk.endswith('*'):
            paragraph.add_run(chunk[1:-1]).italic = True
            continue

        if chunk.startswith('`') and chunk.endswith('`'):
            code_run = paragraph.add_run(chunk[1:-1])
            code_run.font.name = 'Consolas'
            code_run.font.size = Pt(10)
            continue

        paragraph.add_run(chunk)


_TABLE_SEPARATOR_CELL_RE = re.compile(r'^:?-+:?$')


def _split_table_row(line):
    """Split one markdown table line into stripped cell strings.

    Leading and trailing pipes are optional; a line with no cell content at
    all yields an empty list.
    """
    inner = line.strip()
    if inner.startswith('|'):
        inner = inner[1:]
    if inner.endswith('|'):
        inner = inner[:-1]
    if not inner:
        return []
    return [cell.strip() for cell in inner.split('|')]


def add_table_to_doc(doc, table_lines):
    """Append a markdown table to *doc* as a 'Table Grid' table.

    The first parsed row is treated as the header and rendered in bold. The
    column count is taken from that first row; extra cells in later rows are
    ignored. An empty paragraph is added after the table as spacing. Nothing
    is added when no data rows can be parsed.

    Args:
        doc: python-docx document (needs ``add_table`` and ``add_paragraph``).
        table_lines: Raw markdown lines of the table, including the
            ``|---|---|`` separator line.
    """
    rows = []
    for line in table_lines:
        cells = _split_table_row(line)
        if not cells:
            continue
        # Only a genuine separator row (every cell made of dashes/colons) is
        # skipped. A data row that merely contains "---" in one of its cells
        # is real content and must be kept.
        if '---' in line and all(_TABLE_SEPARATOR_CELL_RE.match(c) for c in cells):
            continue
        rows.append(cells)

    if not rows:
        return

    # Create table
    num_cols = len(rows[0])
    table = doc.add_table(rows=len(rows), cols=num_cols)
    table.style = 'Table Grid'

    for row_index, (row, row_data) in enumerate(zip(table.rows, rows)):
        # row.cells is rebuilt on every access in python-docx; fetch it once.
        row_cells = row.cells
        # zip() stops at the shorter side, so surplus markdown cells are dropped.
        for cell, cell_text in zip(row_cells, row_data):
            cell.text = ''
            para = cell.paragraphs[0]
            add_formatted_text(para, cell_text)
            if row_index == 0:  # Header row
                for run in para.runs:
                    run.bold = True

    # Add spacing after table
    doc.add_paragraph()


def add_code_block(doc, code, lang=''):
    """Append *code* to *doc* as a shaded, monospaced paragraph.

    Args:
        doc: python-docx document (needs ``add_paragraph``).
        code: Code text; it is placed in a single run.
        lang: Language tag of the fenced block. Accepted for interface
            compatibility; it is not used for rendering.
    """
    para = doc.add_paragraph()

    # Add background shading. The w:shd element is inserted while the
    # paragraph properties are still empty: the schema requires w:shd to come
    # before w:spacing and w:ind, and the spacing/indent set below are placed
    # after it by python-docx. w:val is a required attribute of w:shd;
    # "clear" means a plain fill with no pattern.
    shading = OxmlElement('w:shd')
    shading.set(qn('w:val'), 'clear')
    shading.set(qn('w:color'), 'auto')
    shading.set(qn('w:fill'), 'F5F5F5')
    para._p.get_or_add_pPr().append(shading)

    para.paragraph_format.left_indent = Cm(0.5)
    para.paragraph_format.space_before = Pt(6)
    para.paragraph_format.space_after = Pt(6)

    run = para.add_run(code)
    run.font.name = 'Consolas'
    run.font.size = Pt(9)


def get_header_style(level, is_numbered=True):
    """Return the paragraph style name used for a header of depth *level*.

    Args:
        level: Header depth; 1 to 4 map to 'Heading 1' to 'Heading 4'.
        is_numbered: Accepted but not used by this function.

    Returns:
        The matching 'Heading N' style name, or 'Normal' for any other level.
    """
    for depth in (1, 2, 3, 4):
        if level == depth:
            return f'Heading {depth}'
    return 'Normal'


def add_section_content(doc, md_content, start_numbered=True):
    """Render markdown content into *doc*, one block at a time.

    The markdown is split with ``parse_markdown_blocks`` and every block is
    appended to the end of the document:

    * headers of level 2 to 4 use the 'Heading 2' to 'Heading 4' styles;
      level 1 headers are skipped (chapter titles are added by the caller)
      and deeper levels become a bold plain paragraph;
    * paragraphs and quotes go through ``add_formatted_text`` (quotes are
      indented on both sides and fully italic);
    * list items use the 'List Paragraph' style with an explicit marker:
      ``•`` for bullets and ``1.``, ``2.``, ... for numbered lists;
    * captions are centred, italic, 10 pt.

    Args:
        doc: python-docx document to append to.
        md_content: Markdown source of the section.
        start_numbered: Accepted for interface compatibility; not used.
    """
    heading_styles = {2: 'Heading 2', 3: 'Heading 3', 4: 'Heading 4'}

    for block in parse_markdown_blocks(md_content):
        kind = block['type']

        if kind == 'header':
            level = block['level']
            text = block['content']

            if level == 1:
                # Skip level 1 headers - they're added separately as chapter titles
                continue

            style_name = heading_styles.get(level)
            if style_name is not None:
                doc.add_paragraph(text, style=style_name)
            else:
                # Deeper than Heading 4: plain paragraph with bold text.
                para = doc.add_paragraph(text)
                if para.runs:
                    para.runs[0].bold = True

        elif kind == 'paragraph':
            para = doc.add_paragraph()
            add_formatted_text(para, block['content'])

        elif kind == 'code':
            add_code_block(doc, block['content'], block.get('lang', ''))

        elif kind == 'table':
            add_table_to_doc(doc, block['content'])

        elif kind == 'quote':
            para = doc.add_paragraph()
            para.paragraph_format.left_indent = Cm(1)
            para.paragraph_format.right_indent = Cm(1)
            add_formatted_text(para, block['content'])
            for run in para.runs:
                run.italic = True

        elif kind == 'list':
            is_bullet = block['list_type'] == 'bullet'
            for position, item in enumerate(block['items'], start=1):
                # Markers are written as literal text. The parser strips the
                # original numerals, so numbered items are renumbered from 1;
                # without this they would carry no marker at all.
                marker = '• ' if is_bullet else f'{position}. '
                para = doc.add_paragraph(style='List Paragraph')
                para.paragraph_format.left_indent = Cm(1)
                add_formatted_text(para, marker + item)

        elif kind == 'caption':
            para = doc.add_paragraph()
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = para.add_run(block['content'])
            run.italic = True
            run.font.size = Pt(10)


def _fill_summary_section(paragraphs, heading_index, texts):
    """Write up to three *texts* into the paragraphs that follow a heading.

    Slot ``k`` (1-based) after ``heading_index`` receives ``texts[k - 1]``,
    but only when that paragraph's style is named 'Normal'; otherwise the
    slot is left untouched and the text is not written anywhere.
    """
    for offset, text in enumerate(texts[:3], start=1):  # Limit to first 3 paragraphs
        position = heading_index + offset
        if position >= len(paragraphs):
            break
        target = paragraphs[position]
        if target.style and target.style.name == 'Normal':
            target.clear()
            add_formatted_text(target, text)


def create_thesis_document():
    """Build the thesis .docx from the template and the markdown chapters.

    Steps:

    1. Load ``TEMPLATE_PATH`` and replace the cover-page title placeholder
       (searched in the first 20 paragraphs) with ``THESIS_TITLE``.
    2. Read the eight markdown files under ``DOCS_PATH``.
    3. Remove every template paragraph from the 'Introducción' Heading 1
       onwards, then append chapters 1 to 5, the references and Annex A.
    4. Fill the Resumen/Abstract sections kept from the template with the
       first paragraphs of ``00_resumen.md``.
    5. Save the result to ``OUTPUT_PATH``.

    All paths are relative to the current working directory. Progress (and a
    warning for each template anchor that is not found) is printed to stdout.
    """
    print("Loading template...")
    # A single document instance is used for everything, so the cover-title
    # edit below actually ends up in the saved file.
    new_doc = Document(TEMPLATE_PATH)

    # Find and update title on cover page. The comparison is done on the
    # lower-cased text, with and without the accent on "título".
    for para in new_doc.paragraphs[:20]:
        lowered = para.text.lower()
        if 'título del tfe' in lowered or 'titulo del tfe' in lowered:
            para.clear()
            run = para.add_run(THESIS_TITLE)
            run.bold = True

    # Read all markdown files
    print("Reading markdown files...")
    sources = {
        'resumen': '00_resumen.md',
        'introduccion': '01_introduccion.md',
        'contexto': '02_contexto_estado_arte.md',
        'objetivos': '03_objetivos_metodologia.md',
        'desarrollo': '04_desarrollo_especifico.md',
        'conclusiones': '05_conclusiones_trabajo_futuro.md',
        'referencias': '06_referencias_bibliograficas.md',
        'anexo': '07_anexo_a.md',
    }
    md_files = {
        key: read_markdown_file(os.path.join(DOCS_PATH, filename))
        for key, filename in sources.items()
    }

    print("Creating new document with thesis content...")

    # Drop the template body: everything from the 'Introducción' Heading 1
    # onwards. Cover, Resumen/Abstract structure and indices come before it
    # and are kept.
    paras_to_remove = []
    found_intro = False
    for para in new_doc.paragraphs:
        if 'Introducción' in para.text and para.style and 'Heading 1' in para.style.name:
            found_intro = True
        if found_intro:
            paras_to_remove.append(para)

    if not found_intro:
        print("Warning: 'Introducción' heading not found in template; "
              "template body was not removed.")

    for para in paras_to_remove:
        element = para._element
        element.getparent().remove(element)

    # Now add our content
    print("Adding thesis content...")

    chapters = [
        ('introduccion', '1. Introducción'),
        ('contexto', '2. Contexto y estado del arte'),
        ('objetivos', '3. Objetivos concretos y metodología de trabajo'),
        ('desarrollo', '4. Desarrollo específico de la contribución'),
        ('conclusiones', '5. Conclusiones y trabajo futuro'),
    ]

    for key, title in chapters:
        print(f"  Adding chapter: {title}")
        # Add chapter heading with Heading 1 style
        new_doc.add_paragraph(title, style='Heading 1')

        # Strip a leading "# N. Title" line; the chapter heading was just
        # added with the proper style.
        content = re.sub(r'^#\s+\d+\.\s+[^\n]+\n+', '', md_files[key])
        add_section_content(new_doc, content)
        new_doc.add_page_break()

    # Add Referencias
    print("  Adding Referencias bibliográficas")
    new_doc.add_paragraph('Referencias bibliográficas', style='Título 1 sin numerar')
    refs_content = re.sub(r'^#[^\n]+\n+', '', md_files['referencias'])  # Remove header

    # Each blank-line separated chunk is one reference, set with a hanging
    # indent (left indent cancelled on the first line).
    for entry in refs_content.split('\n\n'):
        entry = entry.strip()
        if entry:
            para = new_doc.add_paragraph()
            para.paragraph_format.left_indent = Cm(1.27)
            para.paragraph_format.first_line_indent = Cm(-1.27)
            add_formatted_text(para, entry)

    new_doc.add_page_break()

    # Add Anexo
    print("  Adding Anexo A")
    new_doc.add_paragraph('Anexo A. Código fuente y datos analizados', style='Título 1 sin numerar')
    anexo_content = re.sub(r'^#[^\n]+\n+', '', md_files['anexo'])
    add_section_content(new_doc, anexo_content)

    # Update Resumen/Abstract sections (find them in the document and update)
    print("Updating Resumen and Abstract...")

    # Split the resumen file into its Spanish and English paragraphs, using
    # the headers to track which language is current.
    spanish_paragraphs = []
    english_paragraphs = []
    current_section = None

    for block in parse_markdown_blocks(md_files['resumen']):
        if block['type'] == 'header':
            if 'Resumen' in block['content']:
                current_section = 'es'
            elif 'Abstract' in block['content']:
                current_section = 'en'
        elif block['type'] == 'paragraph':
            text = block['content']
            if 'Palabras clave:' in text or 'Keywords:' in text:
                # Keyword lines are kept out of the summary body; they are
                # not written into the template by this function.
                continue
            if not text.strip():
                continue
            if current_section == 'es':
                spanish_paragraphs.append(text)
            elif current_section == 'en':
                english_paragraphs.append(text)

    # Only paragraph contents change below (none are added or removed), so a
    # single snapshot of the paragraph list stays valid for the whole loop.
    paragraphs = new_doc.paragraphs
    found_resumen = False
    found_abstract = False
    for index, para in enumerate(paragraphs):
        text = para.text.strip()

        if 'Resumen' in text and para.style and 'Título' in para.style.name:
            found_resumen = True
            _fill_summary_section(paragraphs, index, spanish_paragraphs)
        elif 'Abstract' in text and para.style and 'Título' in para.style.name:
            found_abstract = True
            _fill_summary_section(paragraphs, index, english_paragraphs)

    if not found_resumen:
        print("Warning: 'Resumen' heading not found in template; Spanish summary not written.")
    if not found_abstract:
        print("Warning: 'Abstract' heading not found in template; English summary not written.")

    # Save document
    print(f"Saving document to {OUTPUT_PATH}...")
    new_doc.save(OUTPUT_PATH)
    print(f"Done! Document saved as {OUTPUT_PATH}")


if __name__ == '__main__':
    # The template, docs and output paths are relative, so run from the
    # directory that contains this script instead of a machine-specific
    # absolute path.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    create_thesis_document()