#!/usr/bin/env python3
"""Generate thesis DOCX from HTML template and markdown content.

Pipeline: read the chapter markdown files, convert them to simple HTML with
``md_to_html``, assemble one standalone HTML document, and hand it to pandoc
to produce the final DOCX.

The project root defaults to the original hard-coded location and can be
overridden with the ``THESIS_BASE_DIR`` environment variable.

NOTE(review): the copy of this file that was reviewed had every HTML tag
stripped out of its string literals.  The tag names used below (h1-h4,
strong, em, code, pre, blockquote, table/tr/th/td, ul/ol/li, p, div and the
document skeleton) are a reconstruction -- confirm tags and attributes
against the original before relying on the exact markup.
"""

import os
import re
import shutil
import subprocess
import sys
from html import escape

BASE_DIR = os.environ.get('THESIS_BASE_DIR', '/Users/sergio/Desktop/MastersThesis')
TEMPLATE_HTM = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_FILES = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')
OUTPUT_HTM = os.path.join(BASE_DIR, 'thesis_output.htm')
OUTPUT_FILES = os.path.join(BASE_DIR, 'thesis_output_files')
OUTPUT_DOCX = os.path.join(BASE_DIR, 'TFM_Sergio_Jimenez_OCR_Optimization.docx')
DOCS_DIR = os.path.join(BASE_DIR, 'docs')

# Fenced code block: optional language word after the opening fence.
_FENCE_RE = re.compile(r'```[ \t]*([\w+#.-]*)[^\n]*\n(.*?)```', re.DOTALL)
_INLINE_CODE_RE = re.compile(r'`([^`]+)`')
# A run of consecutive lines that start and end with a pipe.  Anchored at the
# line start; the last row may end at end-of-text instead of a newline.
_TABLE_RE = re.compile(r'(?:^\|[^\n]+\|[ \t]*(?:\n|\Z))+', re.MULTILINE)
# Header separator row such as |---|:---:|
_TABLE_SEP_RE = re.compile(r'\|(?:\s*:?-+:?\s*\|)+')
_BULLET_RE = re.compile(r'(?:^[-*+][ \t]+.+\n?)+', re.MULTILINE)
_BULLET_MARKER_RE = re.compile(r'^[-*+][ \t]+')
_NUMBERED_RE = re.compile(r'(?:^\d+\.[ \t]+.+\n?)+', re.MULTILINE)
_NUMBERED_MARKER_RE = re.compile(r'^\d+\.[ \t]+')
# Placeholders for protected code: \x00B<n>\x00 (block) / \x00I<n>\x00 (inline).
_BLOCK_TOKEN_RE = re.compile(r'\x00B\d+\x00')
_TOKEN_RE = re.compile(r'\x00[BI](\d+)\x00')

_CELL_STYLE = 'padding:5px;border:1px solid #ccc'


def read_md(filename):
    """Return the UTF-8 text of *filename* inside ``DOCS_DIR``."""
    with open(os.path.join(DOCS_DIR, filename), 'r', encoding='utf-8') as f:
        return f.read()


def _render_table(match):
    """Render one matched pipe-table as a single-line HTML table."""
    rows = []
    for line in match.group(0).strip().split('\n'):
        line = line.strip()
        if _TABLE_SEP_RE.fullmatch(line):
            continue
        rows.append([cell.strip() for cell in line.split('|')[1:-1]])
    if not rows:
        # Only separator rows: leave the text untouched.
        return match.group(0)
    parts = ['<table>']
    for i, row in enumerate(rows):
        tag = 'th' if i == 0 else 'td'
        cells = ''.join(
            f'<{tag} style="{_CELL_STYLE}">{cell}</{tag}>' for cell in row
        )
        parts.append(f'<tr>{cells}</tr>')
    parts.append('</table>')
    # The match consumed the trailing newline; put one back so the following
    # text starts on its own line.
    return ''.join(parts) + '\n'


def _render_list(block, tag, marker_re):
    """Render the matched list lines in *block* as a <ul>/<ol> element."""
    items = [
        marker_re.sub('', line, count=1).strip()
        for line in block.strip().split('\n')
    ]
    body = '\n'.join(f'<li>{item}</li>' for item in items)
    return f'<{tag}>\n{body}\n</{tag}>\n'


def md_to_html(md_text):
    """Convert markdown to simple HTML.

    Supports #-style headings (levels 1-4), bold/italic with asterisks,
    inline code, fenced code blocks, single-line blockquotes, pipe tables,
    flat bullet and numbered lists, and paragraphs.  Lines that already start
    with ``<`` or ``{`` are passed through unwrapped.

    Code spans and fenced blocks are HTML-escaped and set aside before any
    other rule runs, so their contents are never treated as markdown.

    Args:
        md_text: Markdown source text.

    Returns:
        The HTML fragment, one block element per line (code blocks may span
        several lines).
    """
    stash = []  # rendered HTML of protected code, indexed by token number

    def _protect(kind, rendered):
        stash.append(rendered)
        return f'\x00{kind}{len(stash) - 1}\x00'

    def _fence(match):
        lang, code = match.group(1), match.group(2)
        cls = f' class="language-{lang}"' if lang else ''
        rendered = f'<pre><code{cls}>{escape(code, quote=False)}</code></pre>'
        # Newlines keep the placeholder on a line of its own.
        return '\n' + _protect('B', rendered) + '\n'

    def _inline(match):
        code = escape(match.group(1), quote=False)
        return _protect('I', f'<code>{code}</code>')

    text = md_text.replace('\r\n', '\n')

    # Code first: fences before inline spans, otherwise the inline pattern
    # would pair up the backticks of the fences themselves.
    text = _FENCE_RE.sub(_fence, text)
    text = _INLINE_CODE_RE.sub(_inline, text)

    # Headers, longest marker first so '####' is not read as '#'.
    for level in (4, 3, 2, 1):
        hashes = '#' * level
        text = re.sub(
            rf'^{hashes} (.+)$', rf'<h{level}>\1</h{level}>',
            text, flags=re.MULTILINE,
        )

    # Blockquotes (each quoted line becomes its own element).
    text = re.sub(
        r'^>[ \t]*(.+)$', r'<blockquote>\1</blockquote>',
        text, flags=re.MULTILINE,
    )

    # Tables, then lists.  Lists are handled before emphasis so that '* item'
    # bullets are not mistaken for italic markers.
    text = _TABLE_RE.sub(_render_table, text)
    text = _BULLET_RE.sub(
        lambda m: _render_list(m.group(0), 'ul', _BULLET_MARKER_RE), text
    )
    text = _NUMBERED_RE.sub(
        lambda m: _render_list(m.group(0), 'ol', _NUMBERED_MARKER_RE), text
    )

    # Paragraphs (lines not already in tags).  Done before emphasis so a line
    # that begins with bold/italic text is still wrapped in <p>.
    result = []
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
        if line.startswith(('<', '{')) or _BLOCK_TOKEN_RE.fullmatch(line):
            result.append(line)
        else:
            result.append(f'<p>{line}</p>')
    text = '\n'.join(result)

    # Bold and italic: single line only, markers must hug the text.
    text = re.sub(
        r'\*\*(?=\S)([^*\n]+?)(?<=\S)\*\*', r'<strong>\1</strong>', text
    )
    text = re.sub(
        r'(?<!\*)\*(?=\S)([^*\n]+?)(?<=\S)\*(?!\*)', r'<em>\1</em>', text
    )

    # Put the protected code back.
    return _TOKEN_RE.sub(lambda m: stash[int(m.group(1))], text)


# NOTE(review): markup reconstructed; only the visible text was recoverable.
_DOCUMENT_HEAD = '''<html>
<head>
<meta charset="utf-8">
<title>TFM - Optimización de Hiperparámetros OCR</title>
</head>
<body>
'''

# NOTE(review): "UNIR Logo" was most likely the alt text of an <img> whose
# src could not be recovered; it is emitted as plain text until confirmed.
_TITLE_PAGE = '''<div>
<p>UNIR Logo</p>
<p>Universidad Internacional de La Rioja<br>
Escuela Superior de Ingeniería y Tecnología</p>
<p>Máster Universitario en Inteligencia Artificial</p>
<p>Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español</p>
<p>Trabajo Fin de Estudio presentado por: Sergio Jiménez Jiménez</p>
<p>Tipo de trabajo: Comparativa de soluciones / Piloto experimental</p>
<p>Director: [Nombre del Director]</p>
<p>Fecha: 2025</p>
</div>
'''

_TOC_PLACEHOLDER = '''<div>
<h1>Índice de contenidos</h1>
<p>[El índice se generará automáticamente en Word]</p>
</div>
'''

_DOCUMENT_TAIL = '</body></html>'

# Source file for each section, in document order.
_MD_FILES = {
    'resumen': '00_resumen.md',
    'intro': '01_introduccion.md',
    'contexto': '02_contexto_estado_arte.md',
    'objetivos': '03_objetivos_metodologia.md',
    'desarrollo': '04_desarrollo_especifico.md',
    'conclusiones': '05_conclusiones_trabajo_futuro.md',
    'referencias': '06_referencias_bibliograficas.md',
    'anexo': '07_anexo_a.md',
}
_CHAPTERS = ('intro', 'contexto', 'objetivos', 'desarrollo', 'conclusiones')


def _section(body):
    """Wrap one converted section in its own container element."""
    return f'<div>\n{body}</div>\n'


def _build_document(html_content):
    """Assemble the standalone HTML document from the converted sections."""
    parts = [_DOCUMENT_HEAD, _TITLE_PAGE, _section(html_content['resumen']),
             _TOC_PLACEHOLDER]
    parts.extend(_section(html_content[key]) for key in _CHAPTERS)
    parts.append(_section(html_content['referencias']))
    parts.append(_section(html_content['anexo']))
    parts.append(_DOCUMENT_TAIL)
    return ''.join(parts)


def _run_pandoc(extra_args):
    """Run pandoc on the generated HTML; *extra_args* go before the TOC flags."""
    command = ['pandoc', OUTPUT_HTM, '-o', OUTPUT_DOCX, *extra_args,
               '--toc', '--toc-depth=3']
    return subprocess.run(command, capture_output=True, text=True)


def main():
    """Build the thesis HTML and convert it to DOCX.

    Returns:
        0 on success, 1 if pandoc is missing or the conversion failed.
    """
    # Imported here because only the template inspection needs bs4; the
    # markdown converter above stays importable without it.
    from bs4 import BeautifulSoup

    print("Reading template...")
    with open(TEMPLATE_HTM, 'r', encoding='utf-8', errors='ignore') as f:
        template_html = f.read()
    soup = BeautifulSoup(template_html, 'html.parser')
    # The template's WordSection divs are located but not used to build the
    # output yet: the document is assembled from scratch below.
    sections = soup.find_all('div', class_=lambda x: x and 'WordSection' in x)
    print(f"Template has {len(sections)} WordSection block(s) (not used yet).")

    # Read markdown files
    print("Reading markdown content...")
    md_files = {key: read_md(name) for key, name in _MD_FILES.items()}

    # Convert markdown to HTML
    print("Converting markdown to HTML...")
    html_content = {key: md_to_html(md) for key, md in md_files.items()}

    print("Replacing template content...")
    new_html = _build_document(html_content)

    # Save HTML
    print(f"Saving HTML to {OUTPUT_HTM}...")
    with open(OUTPUT_HTM, 'w', encoding='utf-8') as f:
        f.write(new_html)

    # Copy template files folder (replacing any previous copy)
    if os.path.exists(OUTPUT_FILES):
        shutil.rmtree(OUTPUT_FILES)
    if os.path.exists(TEMPLATE_FILES):
        shutil.copytree(TEMPLATE_FILES, OUTPUT_FILES)
    # Make sure the folder exists even when the template has no files folder
    os.makedirs(OUTPUT_FILES, exist_ok=True)

    # Convert to DOCX using pandoc
    print("Converting to DOCX with pandoc...")
    reference_doc = os.path.join(BASE_DIR, 'instructions/plantilla_individual.docx')
    try:
        result = _run_pandoc(['--reference-doc', reference_doc])
        if result.returncode != 0:
            print(f"Pandoc error: {result.stderr}")
            # Try without reference doc
            print("Retrying without reference doc...")
            result = _run_pandoc([])
    except FileNotFoundError:
        # subprocess raises this when the pandoc executable is not on PATH.
        print("Error: pandoc executable not found")
        return 1

    if result.returncode == 0:
        print(f"✓ Document saved to {OUTPUT_DOCX}")
        print(f"✓ HTML version saved to {OUTPUT_HTM}")
        return 0
    print(f"Error: {result.stderr}")
    return 1


if __name__ == '__main__':
    sys.exit(main())