generate_thesis.py

#!/usr/bin/env python3
"""Generate thesis DOCX from HTML template and markdown content."""

import os
import re
import shutil
import subprocess
from bs4 import BeautifulSoup

# Root folder of the thesis project; every other path below is derived from it.
BASE_DIR = '/Users/sergio/Desktop/MastersThesis'


def _in_base(relative_path):
    """Return *relative_path* joined onto BASE_DIR."""
    return os.path.join(BASE_DIR, relative_path)


# Word-exported HTML template and its companion resource folder.
TEMPLATE_HTM = _in_base('instructions/plantilla_individual.htm')
TEMPLATE_FILES = _in_base('instructions/plantilla_individual_files')
# Generated artefacts: intermediate HTML, its resource folder and the final DOCX.
OUTPUT_HTM = _in_base('thesis_output.htm')
OUTPUT_FILES = _in_base('thesis_output_files')
OUTPUT_DOCX = _in_base('TFM_Sergio_Jimenez_OCR_Optimization.docx')
# Folder holding the markdown chapters.
DOCS_DIR = _in_base('docs')

def read_md(filename):
    """Return the full text of *filename*, read as UTF-8 from DOCS_DIR."""
    md_path = os.path.join(DOCS_DIR, filename)
    with open(md_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def _escape_code(code):
    """Return *code* with ``&``, ``<`` and ``>`` HTML-escaped so it renders literally."""
    # Imported locally: the file does not import the ``html`` module at top level.
    from html import escape
    return escape(code, quote=False)


def _render_table(match):
    """Render one matched pipe table as an HTML table; the first row is the header."""
    rows = []
    for line in match.group(0).strip().split('\n'):
        cells = [cell.strip() for cell in line.strip().split('|')[1:-1]]
        # Skip only the alignment row (cells such as ---, :--- or ---:), not
        # every row that happens to contain three dashes.
        if cells and all(re.fullmatch(r':?-+:?', cell) for cell in cells):
            continue
        rows.append(cells)

    parts = ['<table border="1" style="border-collapse:collapse;margin:10px 0">']
    for i, row in enumerate(rows):
        tag = 'th' if i == 0 else 'td'
        parts.append('<tr>')
        parts.extend(
            f'<{tag} style="padding:5px;border:1px solid #ccc">{cell}</{tag}>'
            for cell in row
        )
        parts.append('</tr>')
    parts.append('</table>')
    # Trailing newline keeps whatever follows the table on its own line.
    return ''.join(parts) + '\n'


def _render_list(match, tag, marker_pattern):
    """Render matched list lines as ``<tag>`` with one ``<li>`` per line.

    *marker_pattern* is the regex that strips the bullet or number from a line.
    """
    items = match.group(0).strip().split('\n')
    body = ''.join(f'<li>{re.sub(marker_pattern, "", item)}</li>' for item in items)
    # Trailing newline keeps whatever follows the list on its own line.
    return f'<{tag}>{body}</{tag}>\n'


def _convert_text(text):
    """Convert markdown that contains no fenced code blocks to HTML."""
    inline_code = []

    def stash_inline_code(match):
        # Park inline code behind a NUL-delimited placeholder so the emphasis,
        # table and list rules below cannot rewrite its contents.
        inline_code.append(f'<code>{_escape_code(match.group(1))}</code>')
        return f'\x00{len(inline_code) - 1}\x00'

    text = re.sub(r'`([^`\n]+)`', stash_inline_code, text)

    # Headers (# .. ######)
    def header(match):
        level = len(match.group(1))
        return f'<h{level}>{match.group(2)}</h{level}>'

    text = re.sub(r'^(#{1,6}) (.+)$', header, text, flags=re.MULTILINE)

    # Bold and italic. Both stay on one line, and an opening "*" must be
    # followed by a non-space, so "* item" bullet lines are not read as emphasis.
    text = re.sub(r'\*\*([^*\n]+)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'\*(?=\S)([^*\n]+)\*', r'<i>\1</i>', text)

    # Blockquotes (one element per quoted line)
    text = re.sub(
        r'^>[ \t]*(.+)$',
        r'<blockquote style="margin-left:2cm;font-style:italic">\1</blockquote>',
        text,
        flags=re.MULTILINE,
    )

    # Tables: consecutive lines that start and end with "|"; the last row may
    # end at end-of-input instead of a newline.
    text = re.sub(
        r'(?:^[ \t]*\|[^\n]+\|[ \t]*(?:\n|\Z))+',
        _render_table,
        text,
        flags=re.MULTILINE,
    )

    # Bullet and numbered lists
    text = re.sub(
        r'(?:^[-*+][ \t]+.+\n?)+',
        lambda m: _render_list(m, 'ul', r'^[-*+]\s*'),
        text,
        flags=re.MULTILINE,
    )
    text = re.sub(
        r'(?:^\d+\.[ \t]+.+\n?)+',
        lambda m: _render_list(m, 'ol', r'^\d+\.\s*'),
        text,
        flags=re.MULTILINE,
    )

    # Paragraphs: wrap every remaining non-empty line that is not already markup.
    # Lines opening with inline <b>/<i> are still paragraphs and get wrapped.
    result = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        is_markup = line.startswith(('<', '{')) and not line.startswith(('<b>', '<i>'))
        result.append(line if is_markup else f'<p class="MsoNormal">{line}</p>')

    converted = '\n'.join(result)
    # Put the stashed inline code back in place of its placeholders.
    return re.sub(r'\x00(\d+)\x00', lambda m: inline_code[int(m.group(1))], converted)


def md_to_html(md_text):
    """Convert markdown to simple HTML.

    Supports headers, bold/italic, inline code, fenced code blocks,
    blockquotes, pipe tables, bullet/numbered lists and paragraphs (every
    remaining non-empty line becomes one ``<p class="MsoNormal">``).

    Fenced code blocks are split off first and emitted as ``<pre>`` with their
    content HTML-escaped and otherwise untouched, so no other rule can rewrite,
    strip or re-wrap code lines.

    Args:
        md_text: Markdown source text.

    Returns:
        The HTML fragment as a string, one block element per line (a ``<pre>``
        keeps its internal newlines).
    """
    # Splitting with one capture group yields text at even indexes and the
    # body of each fenced block at odd indexes.
    segments = re.split(
        r'^```[^\n]*\n(.*?)^```[ \t]*$',
        md_text,
        flags=re.MULTILINE | re.DOTALL,
    )

    blocks = []
    for index, segment in enumerate(segments):
        if index % 2:
            code = _escape_code(segment.rstrip('\n'))
            blocks.append(
                '<pre style="background:#f5f5f5;padding:10px;'
                f'font-family:Consolas;font-size:9pt">{code}</pre>'
            )
        else:
            converted = _convert_text(segment)
            if converted:
                blocks.append(converted)
    return '\n'.join(blocks)

def _build_document(html_content):
    """Assemble the complete HTML document from the converted sections.

    *html_content* maps the section keys ('resumen', 'intro', 'contexto',
    'objetivos', 'desarrollo', 'conclusiones', 'referencias', 'anexo') to
    HTML fragments.
    """
    parts = ['''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>TFM - Optimización de Hiperparámetros OCR</title>
<style>
body { font-family: Calibri, sans-serif; font-size: 12pt; line-height: 1.5; margin: 2.5cm 2cm 2.5cm 3cm; }
h1 { font-family: "Calibri Light", sans-serif; font-size: 18pt; color: #0098CD; margin-top: 24pt; }
h2 { font-family: "Calibri Light", sans-serif; font-size: 14pt; color: #0098CD; margin-top: 18pt; }
h3 { font-family: "Calibri Light", sans-serif; font-size: 12pt; margin-top: 12pt; }
h4 { font-family: "Calibri Light", sans-serif; font-size: 11pt; margin-top: 10pt; }
p { text-align: justify; margin: 6pt 0; }
code { font-family: Consolas, monospace; font-size: 10pt; background: #f5f5f5; padding: 2px 4px; }
pre { font-family: Consolas, monospace; font-size: 9pt; background: #f5f5f5; padding: 10px; overflow-x: auto; }
table { border-collapse: collapse; margin: 12pt 0; width: 100%; }
th, td { border: 1px solid #ccc; padding: 6pt; text-align: left; }
th { background: #f0f0f0; font-weight: bold; }
blockquote { margin-left: 2cm; font-style: italic; border-left: 3px solid #0098CD; padding-left: 10px; }
ul, ol { margin: 6pt 0 6pt 1cm; }
li { margin: 3pt 0; }
.title-page { text-align: center; page-break-after: always; }
.title-page h1 { font-size: 24pt; color: #0098CD; }
.chapter { page-break-before: always; }
.referencias p { text-indent: -1.27cm; padding-left: 1.27cm; }
</style>
</head>
<body>
''']

    # Title page
    parts.append('''
<div class="title-page">
<p style="margin-top:3cm"><img src="thesis_output_files/unir_logo.png" width="200" alt="UNIR Logo"></p>
<p style="margin-top:1cm;font-size:14pt">Universidad Internacional de La Rioja<br>Escuela Superior de Ingeniería y Tecnología</p>
<p style="margin-top:2cm;font-size:12pt">Máster Universitario en Inteligencia Artificial</p>
<h1 style="margin-top:2cm">Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español</h1>
<p style="margin-top:3cm">Trabajo Fin de Estudio presentado por: <b>Sergio Jiménez Jiménez</b></p>
<p>Tipo de trabajo: Comparativa de soluciones / Piloto experimental</p>
<p>Director: [Nombre del Director]</p>
<p style="margin-top:2cm">Fecha: 2025</p>
</div>
''')

    def add_section(key, css_class='chapter'):
        parts.append(f'<div class="{css_class}">\n')
        parts.append(html_content[key])
        parts.append('</div>\n')

    # Resumen
    add_section('resumen')

    # Table of contents placeholder
    parts.append('''
<div class="chapter">
<h1 style="color:black">Índice de contenidos</h1>
<p><i>[El índice se generará automáticamente en Word]</i></p>
</div>
''')

    # Chapters, in document order
    for key in ('intro', 'contexto', 'objetivos', 'desarrollo', 'conclusiones'):
        add_section(key)

    # Referencias (extra class enables the hanging-indent CSS rule)
    add_section('referencias', 'chapter referencias')

    # Anexo
    add_section('anexo')

    parts.append('</body></html>')
    return ''.join(parts)


def _run_pandoc(reference_doc=None):
    """Convert OUTPUT_HTM to OUTPUT_DOCX with pandoc and return the CompletedProcess.

    *reference_doc*, when given, is passed to pandoc as ``--reference-doc``.
    """
    command = ['pandoc', OUTPUT_HTM, '-o', OUTPUT_DOCX]
    if reference_doc is not None:
        command += ['--reference-doc', reference_doc]
    # The HTML references its image relative to BASE_DIR; without an explicit
    # resource path pandoc searches the caller's working directory instead.
    command += ['--resource-path', BASE_DIR, '--toc', '--toc-depth=3']
    return subprocess.run(command, capture_output=True, text=True)


def main():
    """Build the thesis HTML from the markdown chapters and convert it to DOCX.

    Reads the eight markdown files from DOCS_DIR, converts each with
    md_to_html, writes the assembled document to OUTPUT_HTM, refreshes
    OUTPUT_FILES from TEMPLATE_FILES and runs pandoc (first with the reference
    DOCX, then without it if that fails).

    The HTML template itself is not read: the document is generated from
    scratch, so parsing the template was dead work and has been removed.

    Returns:
        0 if the DOCX was produced, 1 if pandoc is missing or failed.
    """
    # Read markdown files
    print("Reading markdown content...")
    md_files = {
        'resumen': read_md('00_resumen.md'),
        'intro': read_md('01_introduccion.md'),
        'contexto': read_md('02_contexto_estado_arte.md'),
        'objetivos': read_md('03_objetivos_metodologia.md'),
        'desarrollo': read_md('04_desarrollo_especifico.md'),
        'conclusiones': read_md('05_conclusiones_trabajo_futuro.md'),
        'referencias': read_md('06_referencias_bibliograficas.md'),
        'anexo': read_md('07_anexo_a.md'),
    }

    # Convert markdown to HTML
    print("Converting markdown to HTML...")
    html_content = {key: md_to_html(md) for key, md in md_files.items()}

    # Assemble a new HTML document around the converted content
    print("Replacing template content...")
    new_html = _build_document(html_content)

    # Save HTML
    print(f"Saving HTML to {OUTPUT_HTM}...")
    with open(OUTPUT_HTM, 'w', encoding='utf-8') as f:
        f.write(new_html)

    # Replace the output resource folder with a fresh copy of the template's
    if os.path.exists(OUTPUT_FILES):
        shutil.rmtree(OUTPUT_FILES)
    if os.path.exists(TEMPLATE_FILES):
        shutil.copytree(TEMPLATE_FILES, OUTPUT_FILES)

    # Make sure the resource folder exists even when there was nothing to copy
    os.makedirs(OUTPUT_FILES, exist_ok=True)

    # Convert to DOCX using pandoc
    print("Converting to DOCX with pandoc...")
    reference_doc = os.path.join(BASE_DIR, 'instructions/plantilla_individual.docx')
    try:
        result = _run_pandoc(reference_doc)
        if result.returncode != 0:
            print(f"Pandoc error: {result.stderr}")
            # Try without reference doc
            print("Retrying without reference doc...")
            result = _run_pandoc()
    except FileNotFoundError:
        # subprocess raises this when the pandoc executable cannot be found.
        print("Error: pandoc executable not found; install pandoc and make sure it is on PATH")
        return 1

    if result.returncode == 0:
        print(f"✓ Document saved to {OUTPUT_DOCX}")
        print(f"✓ HTML version saved to {OUTPUT_HTM}")
        return 0

    print(f"Error: {result.stderr}")
    return 1

if __name__ == '__main__':
    # Use main()'s return value as the process exit status; a None return
    # still exits with status 0.
    raise SystemExit(main())
autogen attempt 2025-12-15 23:28:31 +01:00			`#!/usr/bin/env python3`
			`"""Generate thesis DOCX from HTML template and markdown content."""`

			`import os`
			`import re`
			`import shutil`
			`import subprocess`
			`from bs4 import BeautifulSoup`

			`BASE_DIR = '/Users/sergio/Desktop/MastersThesis'`
			`TEMPLATE_HTM = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')`
			`TEMPLATE_FILES = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')`
			`OUTPUT_HTM = os.path.join(BASE_DIR, 'thesis_output.htm')`
			`OUTPUT_FILES = os.path.join(BASE_DIR, 'thesis_output_files')`
			`OUTPUT_DOCX = os.path.join(BASE_DIR, 'TFM_Sergio_Jimenez_OCR_Optimization.docx')`
			`DOCS_DIR = os.path.join(BASE_DIR, 'docs')`

			`def read_md(filename):`
			`with open(os.path.join(DOCS_DIR, filename), 'r', encoding='utf-8') as f:`
			`return f.read()`

			`def md_to_html(md_text):`
			`"""Convert markdown to simple HTML."""`
			`html = md_text`

			`# Headers`
			`html = re.sub(r'^#### (.+)$', r'<h4>\1</h4>', html, flags=re.MULTILINE)`
			`html = re.sub(r'^### (.+)$', r'<h3>\1</h3>', html, flags=re.MULTILINE)`
			`html = re.sub(r'^## (.+)$', r'<h2>\1</h2>', html, flags=re.MULTILINE)`
			`html = re.sub(r'^# (.+)$', r'<h1>\1</h1>', html, flags=re.MULTILINE)`

			`# Bold and italic`
			`html = re.sub(r'\*\*([^*]+)\*\*', r'<b>\1</b>', html)`
			`html = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', html)`

			`# Inline code`
			html = re.sub(r'`([^`]+)`', r'<code>\1</code>', html)

			`# Code blocks`
			`def code_block_replace(match):`
			`lang = match.group(1)`
			`code = match.group(2)`
			`return f'<pre style="background:#f5f5f5;padding:10px;font-family:Consolas;font-size:9pt">{code}</pre>'`
			html = re.sub(r'```(\w*)\n(.*?)```', code_block_replace, html, flags=re.DOTALL)

			`# Blockquotes`
			`html = re.sub(r'^>\s*(.+)$', r'<blockquote style="margin-left:2cm;font-style:italic">\1</blockquote>', html, flags=re.MULTILINE)`

			`# Tables`
			`def table_replace(match):`
			`lines = match.group(0).strip().split('\n')`
			`rows = []`
			`for line in lines:`
			`if '---' in line:`
			`continue`
			`cells = [c.strip() for c in line.split('|')[1:-1]]`
			`rows.append(cells)`

			`table_html = '<table border="1" style="border-collapse:collapse;margin:10px 0">'`
			`for i, row in enumerate(rows):`
			`table_html += '<tr>'`
			`tag = 'th' if i == 0 else 'td'`
			`for cell in row:`
			`table_html += f'<{tag} style="padding:5px;border:1px solid #ccc">{cell}</{tag}>'`
			`table_html += '</tr>'`
			`table_html += '</table>'`
			`return table_html`

			`html = re.sub(r'(\|[^\n]+\|\n)+', table_replace, html)`

			`# Bullet lists`
			`def bullet_list_replace(match):`
			`items = match.group(0).strip().split('\n')`
			`list_html = '<ul>'`
			`for item in items:`
			`item_text = re.sub(r'^[\-\*\+]\s*', '', item)`
			`list_html += f'<li>{item_text}</li>'`
			`list_html += '</ul>'`
			`return list_html`
			`html = re.sub(r'(^[\-\*\+]\s+.+\n?)+', bullet_list_replace, html, flags=re.MULTILINE)`

			`# Numbered lists`
			`def num_list_replace(match):`
			`items = match.group(0).strip().split('\n')`
			`list_html = '<ol>'`
			`for item in items:`
			`item_text = re.sub(r'^\d+\.\s*', '', item)`
			`list_html += f'<li>{item_text}</li>'`
			`list_html += '</ol>'`
			`return list_html`
			`html = re.sub(r'(^\d+\.\s+.+\n?)+', num_list_replace, html, flags=re.MULTILINE)`

			`# Paragraphs (lines not already in tags)`
			`lines = html.split('\n')`
			`result = []`
			`for line in lines:`
			`line = line.strip()`
			`if not line:`
			`continue`
			`if line.startswith('<') or line.startswith('{'):`
			`result.append(line)`
			`else:`
			`result.append(f'<p class="MsoNormal">{line}</p>')`

			`return '\n'.join(result)`

			`def main():`
			`print("Reading template...")`
			`with open(TEMPLATE_HTM, 'r', encoding='utf-8', errors='ignore') as f:`
			`html = f.read()`

			`soup = BeautifulSoup(html, 'html.parser')`

			`# Read markdown files`
			`print("Reading markdown content...")`
			`md_files = {`
			`'resumen': read_md('00_resumen.md'),`
			`'intro': read_md('01_introduccion.md'),`
			`'contexto': read_md('02_contexto_estado_arte.md'),`
			`'objetivos': read_md('03_objetivos_metodologia.md'),`
			`'desarrollo': read_md('04_desarrollo_especifico.md'),`
			`'conclusiones': read_md('05_conclusiones_trabajo_futuro.md'),`
			`'referencias': read_md('06_referencias_bibliograficas.md'),`
			`'anexo': read_md('07_anexo_a.md'),`
			`}`

			`# Convert markdown to HTML`
			`print("Converting markdown to HTML...")`
			`html_content = {}`
			`for key, md in md_files.items():`
			`html_content[key] = md_to_html(md)`

			`# Find and replace content sections`
			`print("Replacing template content...")`

			`# Find all WordSection divs and main content areas`
			`sections = soup.find_all('div', class_=lambda x: x and 'WordSection' in x)`

			`# Strategy: Find chapter headings and replace following content`
			`# The template has placeholders we need to replace`

			`# Simple approach: Create new HTML with template structure but our content`
			`new_html = '''<!DOCTYPE html>`
			`<html>`
			`<head>`
			`<meta charset="UTF-8">`
			`<title>TFM - Optimización de Hiperparámetros OCR</title>`
			`<style>`
			`body { font-family: Calibri, sans-serif; font-size: 12pt; line-height: 1.5; margin: 2.5cm 2cm 2.5cm 3cm; }`
			`h1 { font-family: "Calibri Light", sans-serif; font-size: 18pt; color: #0098CD; margin-top: 24pt; }`
			`h2 { font-family: "Calibri Light", sans-serif; font-size: 14pt; color: #0098CD; margin-top: 18pt; }`
			`h3 { font-family: "Calibri Light", sans-serif; font-size: 12pt; margin-top: 12pt; }`
			`h4 { font-family: "Calibri Light", sans-serif; font-size: 11pt; margin-top: 10pt; }`
			`p { text-align: justify; margin: 6pt 0; }`
			`code { font-family: Consolas, monospace; font-size: 10pt; background: #f5f5f5; padding: 2px 4px; }`
			`pre { font-family: Consolas, monospace; font-size: 9pt; background: #f5f5f5; padding: 10px; overflow-x: auto; }`
			`table { border-collapse: collapse; margin: 12pt 0; width: 100%; }`
			`th, td { border: 1px solid #ccc; padding: 6pt; text-align: left; }`
			`th { background: #f0f0f0; font-weight: bold; }`
			`blockquote { margin-left: 2cm; font-style: italic; border-left: 3px solid #0098CD; padding-left: 10px; }`
			`ul, ol { margin: 6pt 0 6pt 1cm; }`
			`li { margin: 3pt 0; }`
			`.title-page { text-align: center; page-break-after: always; }`
			`.title-page h1 { font-size: 24pt; color: #0098CD; }`
			`.chapter { page-break-before: always; }`
			`.referencias p { text-indent: -1.27cm; padding-left: 1.27cm; }`
			`</style>`
			`</head>`
			`<body>`
			`'''`

			`# Title page`
			`new_html += '''`
			`<div class="title-page">`
			`<p style="margin-top:3cm"><img src="thesis_output_files/unir_logo.png" width="200" alt="UNIR Logo"></p>`
			`<p style="margin-top:1cm;font-size:14pt">Universidad Internacional de La Rioja<br>Escuela Superior de Ingeniería y Tecnología</p>`
			`<p style="margin-top:2cm;font-size:12pt">Máster Universitario en Inteligencia Artificial</p>`
			`<h1 style="margin-top:2cm">Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español</h1>`
			`<p style="margin-top:3cm">Trabajo Fin de Estudio presentado por: <b>Sergio Jiménez Jiménez</b></p>`
			`<p>Tipo de trabajo: Comparativa de soluciones / Piloto experimental</p>`
			`<p>Director: [Nombre del Director]</p>`
			`<p style="margin-top:2cm">Fecha: 2025</p>`
			`</div>`
			`'''`

			`# Resumen`
			`new_html += '<div class="chapter">\n'`
			`new_html += html_content['resumen']`
			`new_html += '</div>\n'`

			`# Table of contents placeholder`
			`new_html += '''`
			`<div class="chapter">`
			`<h1 style="color:black">Índice de contenidos</h1>`
			`<p><i>[El índice se generará automáticamente en Word]</i></p>`
			`</div>`
			`'''`

			`# Chapters`
			`chapters = [`
			`('intro', 'introduccion'),`
			`('contexto', 'contexto'),`
			`('objetivos', 'objetivos'),`
			`('desarrollo', 'desarrollo'),`
			`('conclusiones', 'conclusiones'),`
			`]`

			`for key, _ in chapters:`
			`new_html += '<div class="chapter">\n'`
			`new_html += html_content[key]`
			`new_html += '</div>\n'`

			`# Referencias`
			`new_html += '<div class="chapter referencias">\n'`
			`new_html += html_content['referencias']`
			`new_html += '</div>\n'`

			`# Anexo`
			`new_html += '<div class="chapter">\n'`
			`new_html += html_content['anexo']`
			`new_html += '</div>\n'`

			`new_html += '</body></html>'`

			`# Save HTML`
			`print(f"Saving HTML to {OUTPUT_HTM}...")`
			`with open(OUTPUT_HTM, 'w', encoding='utf-8') as f:`
			`f.write(new_html)`

			`# Copy template files folder`
			`if os.path.exists(OUTPUT_FILES):`
			`shutil.rmtree(OUTPUT_FILES)`
			`if os.path.exists(TEMPLATE_FILES):`
			`shutil.copytree(TEMPLATE_FILES, OUTPUT_FILES)`

			`# Create UNIR logo placeholder if not exists`
			`os.makedirs(OUTPUT_FILES, exist_ok=True)`

			`# Convert to DOCX using pandoc`
			`print(f"Converting to DOCX with pandoc...")`
			`result = subprocess.run([`
			`'pandoc',`
			`OUTPUT_HTM,`
			`'-o', OUTPUT_DOCX,`
			`'--reference-doc', os.path.join(BASE_DIR, 'instructions/plantilla_individual.docx'),`
			`'--toc',`
			`'--toc-depth=3'`
			`], capture_output=True, text=True)`

			`if result.returncode != 0:`
			`print(f"Pandoc error: {result.stderr}")`
			`# Try without reference doc`
			`print("Retrying without reference doc...")`
			`result = subprocess.run([`
			`'pandoc',`
			`OUTPUT_HTM,`
			`'-o', OUTPUT_DOCX,`
			`'--toc',`
			`'--toc-depth=3'`
			`], capture_output=True, text=True)`

			`if result.returncode == 0:`
			`print(f"✓ Document saved to {OUTPUT_DOCX}")`
			`print(f"✓ HTML version saved to {OUTPUT_HTM}")`
			`else:`
			`print(f"Error: {result.stderr}")`

			`if __name__ == '__main__':`
			`main()`