#!/usr/bin/env python3
"""Generate the thesis .docx from markdown files using the UNIR template.

The script loads the Word template at ``TEMPLATE_PATH``, removes everything
from the "Introducción" Heading 1 paragraph onwards, appends the chapters,
references and annex converted from the markdown files in ``DOCS_PATH``,
fills the Resumen/Abstract placeholders and saves to ``OUTPUT_PATH``.
"""

import re
import os

from docx import Document
from docx.shared import Pt, Cm, RGBColor, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn
from docx.oxml import OxmlElement

# Paths (resolved against the current working directory)
TEMPLATE_PATH = 'instructions/plantilla_individual.docx'
OUTPUT_PATH = 'TFM_Sergio_Jimenez_OCR_Optimization.docx'
DOCS_PATH = 'docs'

# Thesis metadata
THESIS_TITLE = "Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español"
AUTHOR = "Sergio Jiménez Jiménez"
DIRECTOR = "[Nombre del Director]"
DATE = "2025"

# Patterns shared by the markdown helpers, compiled once.
_BULLET_RE = re.compile(r'^[\-\*\+]\s')
_NUMBERED_RE = re.compile(r'^\d+\.\s')
_NUMBER_PREFIX_RE = re.compile(r'^\d+\.\s*')
_UNNUMBERED_SUFFIX_RE = re.compile(r'\s*\{\.unnumbered\}\s*$')
_INLINE_RE = re.compile(r'(\*\*[^*]+\*\*|\*[^*]+\*|`[^`]+`)')
_TABLE_SEPARATOR_RE = re.compile(r'[\s|:\-]+')

# Markdown heading level -> Word style name.
_HEADING_STYLES = {
    1: 'Heading 1',
    2: 'Heading 2',
    3: 'Heading 3',
    4: 'Heading 4',
}

# Key used inside the script -> markdown file name under DOCS_PATH.
_MARKDOWN_FILES = {
    'resumen': '00_resumen.md',
    'introduccion': '01_introduccion.md',
    'contexto': '02_contexto_estado_arte.md',
    'objetivos': '03_objetivos_metodologia.md',
    'desarrollo': '04_desarrollo_especifico.md',
    'conclusiones': '05_conclusiones_trabajo_futuro.md',
    'referencias': '06_referencias_bibliograficas.md',
    'anexo': '07_anexo_a.md',
}


def read_markdown_file(filepath):
    """Return the full text of the UTF-8 encoded file at *filepath*."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()


def _starts_new_block(line):
    """Return True if *line* cannot continue a running paragraph."""
    return bool(
        not line.strip()
        or line.startswith(('#', '```', '>'))
        or _BULLET_RE.match(line)
        or _NUMBERED_RE.match(line)
        or '|' in line
    )


def parse_markdown_blocks(md_content):
    """Split markdown text into a list of block dictionaries.

    Every block has a ``'type'`` key. Depending on the type it carries:

    * ``'code'``: ``'lang'`` (text after the opening fence) and ``'content'``
    * ``'header'``: ``'level'`` (number of leading ``#``) and ``'content'``
    * ``'table'``: ``'content'`` is the list of raw table lines
    * ``'quote'``, ``'caption'``, ``'paragraph'``: ``'content'`` is a string
    * ``'list'``: ``'list_type'`` (``'bullet'`` or ``'numbered'``, decided by
      the first item) and ``'items'`` (marker already stripped)
    """
    blocks = []
    lines = md_content.split('\n')
    total = len(lines)
    i = 0

    while i < total:
        line = lines[i]
        stripped = line.strip()

        # Blank lines only separate blocks.
        if not stripped:
            i += 1
            continue

        # Fenced code block; an unterminated fence runs to the end of input.
        if stripped.startswith('```'):
            lang = stripped[3:]
            i += 1
            code_lines = []
            while i < total and not lines[i].strip().startswith('```'):
                code_lines.append(lines[i])
                i += 1
            blocks.append({'type': 'code', 'lang': lang, 'content': '\n'.join(code_lines)})
            i += 1  # skip the closing fence
            continue

        # Header: the level is the number of leading '#' characters.
        if line.startswith('#'):
            without_hashes = line.lstrip('#')
            level = len(line) - len(without_hashes)
            text = _UNNUMBERED_SUFFIX_RE.sub('', without_hashes.strip())
            blocks.append({'type': 'header', 'level': level, 'content': text})
            i += 1
            continue

        # Table: a line with a pipe followed by a '---' separator line.
        if '|' in line and i + 1 < total and '---' in lines[i + 1]:
            start = i
            i += 1
            while i < total and '|' in lines[i]:
                i += 1
            blocks.append({'type': 'table', 'content': lines[start:i]})
            continue

        # Blockquote: consecutive '>' lines are joined with single spaces.
        if line.startswith('>'):
            quote_parts = []
            while i < total and lines[i].startswith('>'):
                quote_parts.append(lines[i][1:].strip())
                i += 1
            blocks.append({'type': 'quote', 'content': ' '.join(quote_parts)})
            continue

        # List (bullet or numbered); the first item decides the list type.
        if _BULLET_RE.match(line) or _NUMBERED_RE.match(line):
            list_type = 'numbered' if _NUMBERED_RE.match(line) else 'bullet'
            list_items = []
            while i < total:
                current = lines[i]
                if _BULLET_RE.match(current):
                    list_items.append(current[2:].strip())
                elif _NUMBERED_RE.match(current):
                    list_items.append(_NUMBER_PREFIX_RE.sub('', current).strip())
                else:
                    break
                i += 1
            blocks.append({'type': 'list', 'list_type': list_type, 'items': list_items})
            continue

        # Figure caption (text starting with *Figura or Figura).
        if stripped.startswith(('*Figura', 'Figura')):
            blocks.append({'type': 'caption', 'content': stripped.strip('*')})
            i += 1
            continue

        # Regular paragraph: gather lines until another block type begins.
        para_lines = [line]
        i += 1
        while i < total and not _starts_new_block(lines[i]):
            para_lines.append(lines[i])
            i += 1
        blocks.append({'type': 'paragraph', 'content': ' '.join(para_lines)})

    return blocks


def add_formatted_text(paragraph, text):
    """Append *text* to *paragraph*, honouring inline markdown formatting.

    ``**bold**``, ``*italic*`` and `` `code` `` spans become separate runs
    with the matching formatting; everything else is added as plain runs.
    """
    # re.split with one capturing group alternates plain text (even indices)
    # and matched spans (odd indices). Relying on the index keeps plain
    # fragments such as a lone '*' from being mistaken for emphasis.
    for index, part in enumerate(_INLINE_RE.split(text)):
        if not part:
            continue
        if index % 2 == 0:
            paragraph.add_run(part)
        elif part.startswith('**'):
            run = paragraph.add_run(part[2:-2])
            run.bold = True
        elif part.startswith('*'):
            run = paragraph.add_run(part[1:-1])
            run.italic = True
        else:
            run = paragraph.add_run(part[1:-1])
            run.font.name = 'Consolas'
            run.font.size = Pt(10)


def _is_table_separator(line):
    """Return True for a markdown header separator row such as ``|---|:--:|``."""
    return '---' in line and _TABLE_SEPARATOR_RE.fullmatch(line) is not None


def _split_table_row(line):
    """Split one markdown table row into stripped cell strings."""
    row = line.strip()
    # Outer pipes are optional in markdown; drop them only when present so
    # rows written without them keep their first and last cells.
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|'):
        row = row[:-1]
    return [cell.strip() for cell in row.split('|')]


def add_table_to_doc(doc, table_lines):
    """Add a markdown table to the document.

    *table_lines* are the raw markdown lines of the table. Separator rows are
    skipped, the first remaining row is rendered bold as the header, and an
    empty paragraph is added after the table for spacing. Nothing is added
    when no rows remain.
    """
    rows = [
        _split_table_row(line)
        for line in table_lines
        if line.strip() and not _is_table_separator(line)
    ]
    if not rows:
        return

    # Size the table for the widest row so no cell is dropped.
    num_cols = max(len(row) for row in rows)
    table = doc.add_table(rows=len(rows), cols=num_cols)
    table.style = 'Table Grid'

    for row_index, row_data in enumerate(rows):
        cells = table.rows[row_index].cells
        for cell, cell_text in zip(cells, row_data):
            cell.text = ''
            para = cell.paragraphs[0]
            add_formatted_text(para, cell_text)
            if row_index == 0:  # Header row
                for run in para.runs:
                    run.bold = True

    # Add spacing after table
    doc.add_paragraph()


def add_code_block(doc, code, lang=''):
    """Add *code* as a shaded, indented monospace paragraph.

    *lang* is accepted for interface compatibility and is not used.
    """
    para = doc.add_paragraph()

    # Add the background shading before any other paragraph property so the
    # spacing/indent elements set below are placed after it. 'w:val' is
    # included because the shading element requires a pattern value.
    shading = OxmlElement('w:shd')
    shading.set(qn('w:val'), 'clear')
    shading.set(qn('w:color'), 'auto')
    shading.set(qn('w:fill'), 'F5F5F5')
    para._p.get_or_add_pPr().append(shading)

    para.paragraph_format.left_indent = Cm(0.5)
    para.paragraph_format.space_before = Pt(6)
    para.paragraph_format.space_after = Pt(6)

    run = para.add_run(code)
    run.font.name = 'Consolas'
    run.font.size = Pt(9)


def get_header_style(level, is_numbered=True):
    """Return the style name for a header *level* (``'Normal'`` beyond 4).

    *is_numbered* is accepted for interface compatibility and is not used.
    """
    return _HEADING_STYLES.get(level, 'Normal')


def add_section_content(doc, md_content, start_numbered=True):
    """Convert *md_content* to Word content appended to *doc*.

    Level 1 headers are skipped because chapter titles are added separately
    by the caller. *start_numbered* is accepted for interface compatibility
    and is not used.
    """
    for block in parse_markdown_blocks(md_content):
        kind = block['type']

        if kind == 'header':
            level = block['level']
            text = block['content']
            # ## (level 2)   -> Heading 2 (subsection like 1.1. Motivación)
            # ### (level 3)  -> Heading 3 (sub-subsection like 1.1.1. xxx)
            # #### (level 4) -> Heading 4
            if level == 1:
                continue
            if level in (2, 3, 4):
                doc.add_paragraph(text, style=get_header_style(level))
            else:
                # Deeper levels have no heading style here: bold body text.
                para = doc.add_paragraph(text)
                if para.runs:
                    para.runs[0].bold = True

        elif kind == 'paragraph':
            para = doc.add_paragraph()
            add_formatted_text(para, block['content'])

        elif kind == 'code':
            add_code_block(doc, block['content'], block.get('lang', ''))

        elif kind == 'table':
            add_table_to_doc(doc, block['content'])

        elif kind == 'quote':
            para = doc.add_paragraph()
            para.paragraph_format.left_indent = Cm(1)
            para.paragraph_format.right_indent = Cm(1)
            add_formatted_text(para, block['content'])
            for run in para.runs:
                run.italic = True

        elif kind == 'list':
            is_bullet = block['list_type'] == 'bullet'
            for number, item in enumerate(block['items'], start=1):
                # Markers are written as text. The parser strips the original
                # numbers, so numbered items are renumbered from 1.
                marker = '• ' if is_bullet else f'{number}. '
                para = doc.add_paragraph(style='List Paragraph')
                para.paragraph_format.left_indent = Cm(1)
                add_formatted_text(para, marker + item)

        elif kind == 'caption':
            para = doc.add_paragraph()
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = para.add_run(block['content'])
            run.italic = True
            run.font.size = Pt(10)


def _set_cover_title(doc):
    """Replace the title placeholder found in the first 20 paragraphs."""
    for para in doc.paragraphs[:20]:
        lowered = para.text.lower()
        # Compare in lower case, with and without the accent.
        if 'título del tfe' in lowered or 'titulo del tfe' in lowered:
            para.clear()
            run = para.add_run(THESIS_TITLE)
            run.bold = True


def _remove_template_body(doc):
    """Remove every paragraph from the "Introducción" Heading 1 onwards.

    Returns True if the heading was found. Only body paragraphs are removed;
    other elements of the template are left untouched.
    """
    found_intro = False
    paras_to_remove = []
    for para in doc.paragraphs:
        if 'Introducción' in para.text and para.style and 'Heading 1' in para.style.name:
            found_intro = True
        if found_intro:
            paras_to_remove.append(para)

    for para in paras_to_remove:
        element = para._element
        element.getparent().remove(element)
    return found_intro


def _split_resumen(resumen_content):
    """Return ``(spanish_paragraphs, english_paragraphs)`` from the resumen file.

    Paragraphs are assigned by the most recent "Resumen" / "Abstract" header.
    Paragraphs containing "Palabras clave:" or "Keywords:" are left out.
    """
    by_language = {'es': [], 'en': []}
    current_section = None
    for block in parse_markdown_blocks(resumen_content):
        if block['type'] == 'header':
            if 'Resumen' in block['content']:
                current_section = 'es'
            elif 'Abstract' in block['content']:
                current_section = 'en'
        elif block['type'] == 'paragraph':
            text = block['content']
            if 'Palabras clave:' in text or 'Keywords:' in text:
                continue
            if current_section in by_language and text.strip():
                by_language[current_section].append(text)
    return by_language['es'], by_language['en']


def _fill_summaries(doc, spanish_paragraphs, english_paragraphs):
    """Overwrite the paragraphs that follow the Resumen/Abstract titles.

    At most three paragraphs per language are written, and only into
    paragraphs whose style is exactly 'Normal'.
    """
    # Take the paragraph list once: the property rebuilds it on every access
    # and no paragraph is added or removed below.
    paragraphs = doc.paragraphs
    for i, para in enumerate(paragraphs):
        if not (para.style and 'Título' in para.style.name):
            continue
        text = para.text.strip()
        if 'Resumen' in text:
            replacements = spanish_paragraphs
        elif 'Abstract' in text:
            replacements = english_paragraphs
        else:
            continue

        for offset, new_text in enumerate(replacements[:3], start=1):
            if i + offset >= len(paragraphs):
                break
            target_para = paragraphs[i + offset]
            if target_para.style and target_para.style.name == 'Normal':
                target_para.clear()
                add_formatted_text(target_para, new_text)


def create_thesis_document():
    """Create the thesis document from template and markdown files.

    Reads ``TEMPLATE_PATH`` and the markdown files under ``DOCS_PATH``
    (relative to the current working directory) and writes ``OUTPUT_PATH``.
    """
    print("Loading template...")
    new_doc = Document(TEMPLATE_PATH)

    # Update the title on the same document that is saved below, so the
    # change is part of the output.
    _set_cover_title(new_doc)

    print("Reading markdown files...")
    md_files = {
        key: read_markdown_file(os.path.join(DOCS_PATH, filename))
        for key, filename in _MARKDOWN_FILES.items()
    }

    print("Creating new document with thesis content...")
    # Keep the front part of the template and drop the sample body.
    if not _remove_template_body(new_doc):
        print("Warning: 'Introducción' heading not found; template content was not removed.")

    print("Adding thesis content...")
    chapters = [
        ('introduccion', '1. Introducción'),
        ('contexto', '2. Contexto y estado del arte'),
        ('objetivos', '3. Objetivos concretos y metodología de trabajo'),
        ('desarrollo', '4. Desarrollo específico de la contribución'),
        ('conclusiones', '5. Conclusiones y trabajo futuro'),
    ]

    for key, title in chapters:
        print(f"  Adding chapter: {title}")
        new_doc.add_paragraph(title, style='Heading 1')

        # Drop the numbered top-level header; it was added above with the
        # proper style.
        content = re.sub(r'^#\s+\d+\.\s+[^\n]+\n+', '', md_files[key])
        add_section_content(new_doc, content)
        new_doc.add_page_break()

    # Add Referencias
    print("  Adding Referencias bibliográficas")
    new_doc.add_paragraph('Referencias bibliográficas', style='Título 1 sin numerar')
    refs_content = re.sub(r'^#[^\n]+\n+', '', md_files['referencias'])  # Remove header

    # Each blank-line separated chunk is one reference with a hanging indent.
    for reference in refs_content.split('\n\n'):
        if reference.strip():
            para = new_doc.add_paragraph()
            para.paragraph_format.left_indent = Cm(1.27)
            para.paragraph_format.first_line_indent = Cm(-1.27)
            add_formatted_text(para, reference.strip())

    new_doc.add_page_break()

    # Add Anexo
    print("  Adding Anexo A")
    new_doc.add_paragraph('Anexo A. Código fuente y datos analizados', style='Título 1 sin numerar')
    anexo_content = re.sub(r'^#[^\n]+\n+', '', md_files['anexo'])
    add_section_content(new_doc, anexo_content)

    # Update Resumen/Abstract sections already present in the template.
    print("Updating Resumen and Abstract...")
    spanish_paragraphs, english_paragraphs = _split_resumen(md_files['resumen'])
    _fill_summaries(new_doc, spanish_paragraphs, english_paragraphs)

    # Save document
    print(f"Saving document to {OUTPUT_PATH}...")
    new_doc.save(OUTPUT_PATH)
    print(f"Done! Document saved as {OUTPUT_PATH}")


if __name__ == '__main__':
    # Switch to this checkout when it exists; otherwise the relative paths
    # above are resolved against the current working directory.
    project_root = '/Users/sergio/Desktop/MastersThesis'
    if os.path.isdir(project_root):
        os.chdir(project_root)
    create_thesis_document()