#!/usr/bin/env python3 """Generate thesis DOCX from HTML template and markdown content.""" import os import re import shutil import subprocess from bs4 import BeautifulSoup BASE_DIR = '/Users/sergio/Desktop/MastersThesis' TEMPLATE_HTM = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm') TEMPLATE_FILES = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files') OUTPUT_HTM = os.path.join(BASE_DIR, 'thesis_output.htm') OUTPUT_FILES = os.path.join(BASE_DIR, 'thesis_output_files') OUTPUT_DOCX = os.path.join(BASE_DIR, 'TFM_Sergio_Jimenez_OCR_Optimization.docx') DOCS_DIR = os.path.join(BASE_DIR, 'docs') def read_md(filename): with open(os.path.join(DOCS_DIR, filename), 'r', encoding='utf-8') as f: return f.read() def md_to_html(md_text): """Convert markdown to simple HTML.""" html = md_text # Headers html = re.sub(r'^#### (.+)$', r'
\1', html)
# Code blocks
def code_block_replace(match):
    """Return the replacement text for one fenced code block match.

    ``match`` is a regex match object with at least two groups: group 1
    is read as the block's language tag and group 2 as the block's code
    text. The returned string contains the code text only.
    """
    # Both groups are fetched in a single call. The language tag is
    # extracted but does not appear in the returned string.
    # NOTE(review): confirm whether the tag was meant to be emitted.
    lang, code = match.group(1, 2)
    return f'{code}'
html = re.sub(r'```(\w*)\n(.*?)```', code_block_replace, html, flags=re.DOTALL)
# Blockquotes
html = re.sub(r'^>\s*(.+)$', r'\1', html, flags=re.MULTILINE) # Tables def table_replace(match): lines = match.group(0).strip().split('\n') rows = [] for line in lines: if '---' in line: continue cells = [c.strip() for c in line.split('|')[1:-1]] rows.append(cells) table_html = '
{line}
') return '\n'.join(result) def main(): print("Reading template...") with open(TEMPLATE_HTM, 'r', encoding='utf-8', errors='ignore') as f: html = f.read() soup = BeautifulSoup(html, 'html.parser') # Read markdown files print("Reading markdown content...") md_files = { 'resumen': read_md('00_resumen.md'), 'intro': read_md('01_introduccion.md'), 'contexto': read_md('02_contexto_estado_arte.md'), 'objetivos': read_md('03_objetivos_metodologia.md'), 'desarrollo': read_md('04_desarrollo_especifico.md'), 'conclusiones': read_md('05_conclusiones_trabajo_futuro.md'), 'referencias': read_md('06_referencias_bibliograficas.md'), 'anexo': read_md('07_anexo_a.md'), } # Convert markdown to HTML print("Converting markdown to HTML...") html_content = {} for key, md in md_files.items(): html_content[key] = md_to_html(md) # Find and replace content sections print("Replacing template content...") # Find all WordSection divs and main content areas sections = soup.find_all('div', class_=lambda x: x and 'WordSection' in x) # Strategy: Find chapter headings and replace following content # The template has placeholders we need to replace # Simple approach: Create new HTML with template structure but our content new_html = '''
Universidad Internacional de La Rioja
Escuela Superior de Ingeniería y Tecnología
Máster Universitario en Inteligencia Artificial
Trabajo Fin de Estudio presentado por: Sergio Jiménez Jiménez
Tipo de trabajo: Comparativa de soluciones / Piloto experimental
Director: [Nombre del Director]
Fecha: 2025
[El índice se generará automáticamente en Word]