Files
MastersThesis/generate_thesis_docx.py
2025-12-15 23:28:31 +01:00

439 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Generate thesis.docx from markdown files using UNIR template.
"""
import re
import os
from docx import Document
from docx.shared import Pt, Cm, RGBColor, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
# Paths. All three are relative, so they resolve against the current working
# directory at the time they are used.
TEMPLATE_PATH = 'instructions/plantilla_individual.docx'  # .docx template opened with Document()
OUTPUT_PATH = 'TFM_Sergio_Jimenez_OCR_Optimization.docx'  # where the generated document is saved
DOCS_PATH = 'docs'  # directory the numbered markdown chapter files are read from
# Thesis metadata
THESIS_TITLE = "Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español"  # written over the cover-page title placeholder
# NOTE(review): AUTHOR, DIRECTOR and DATE are not referenced by the code visible
# in this file -- confirm whether they are meant to be written to the cover page.
AUTHOR = "Sergio Jiménez Jiménez"
DIRECTOR = "[Nombre del Director]"  # bracketed placeholder text, not a real name
DATE = "2025"
def read_markdown_file(filepath):
    """Return the entire text of the file at *filepath*, decoded as UTF-8."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return text
# Line classifiers shared by the block parser below (compiled once).
_BULLET_ITEM_RE = re.compile(r'^[\-\*\+]\s')
_NUMBERED_ITEM_RE = re.compile(r'^\d+\.\s')
_NUMBERED_PREFIX_RE = re.compile(r'^\d+\.\s*')
_UNNUMBERED_SUFFIX_RE = re.compile(r'\s*\{\.unnumbered\}\s*$')
# A line made only of three or more identical '-', '*' or '_' characters.
_THEMATIC_BREAK_RE = re.compile(r'^\s*([-*_])\1{2,}\s*$')


def parse_markdown_blocks(md_content):
    """Split markdown text into a flat list of block dictionaries.

    Every block has a ``'type'`` key; the remaining keys depend on the type:

    * ``'code'``      -- ``'lang'`` (text after the opening fence) and
      ``'content'`` (the fenced lines joined with newlines).
    * ``'header'``    -- ``'level'`` (number of leading ``#``) and
      ``'content'`` (title with any trailing ``{.unnumbered}`` removed).
    * ``'table'``     -- ``'content'`` is the list of raw table lines,
      separator row included.
    * ``'quote'``     -- ``'content'`` is the ``>`` lines joined with spaces.
    * ``'list'``      -- ``'list_type'`` (``'numbered'`` or ``'bullet'``,
      decided by the first item) and ``'items'`` (marker-free item texts).
    * ``'caption'``   -- ``'content'`` is a single line starting with
      ``Figura`` / ``*Figura``, with surrounding ``*`` stripped.
    * ``'paragraph'`` -- ``'content'`` is consecutive lines joined with spaces.

    Blank lines separate blocks and are dropped.  Thematic breaks (``---``,
    ``***``, ``___``) are dropped too, and they end a running paragraph, so
    they never show up as literal text in the output.

    Args:
        md_content: Markdown source as one string.

    Returns:
        The list of block dictionaries, in document order.
    """
    blocks = []
    lines = md_content.split('\n')
    total = len(lines)
    i = 0
    while i < total:
        line = lines[i]
        stripped = line.strip()

        # Blank line: block separator only.
        if not stripped:
            i += 1
            continue

        # Fenced code block: collect everything up to the closing fence
        # (or the end of input when the fence is never closed).
        if stripped.startswith('```'):
            lang = stripped[3:]
            i += 1
            first_code_line = i
            while i < total and not lines[i].strip().startswith('```'):
                i += 1
            blocks.append({
                'type': 'code',
                'lang': lang,
                'content': '\n'.join(lines[first_code_line:i]),
            })
            i += 1  # step over the closing fence
            continue

        # Header: the level is the count of leading '#'.
        if line.startswith('#'):
            title = line.lstrip('#')
            level = len(line) - len(title)
            title = _UNNUMBERED_SUFFIX_RE.sub('', title.strip())
            blocks.append({'type': 'header', 'level': level, 'content': title})
            i += 1
            continue

        # Table: a line with '|' immediately followed by a '---' separator.
        if '|' in line and i + 1 < total and '---' in lines[i + 1]:
            table_lines = [line]
            i += 1
            while i < total and '|' in lines[i]:
                table_lines.append(lines[i])
                i += 1
            blocks.append({'type': 'table', 'content': table_lines})
            continue

        # Thematic break: carries no text, so it is skipped instead of being
        # emitted as a paragraph containing the literal dashes.
        if _THEMATIC_BREAK_RE.match(line):
            i += 1
            continue

        # Blockquote: consecutive '>' lines merged into one string.
        if line.startswith('>'):
            quote_parts = [line[1:].strip()]
            i += 1
            while i < total and lines[i].startswith('>'):
                quote_parts.append(lines[i][1:].strip())
                i += 1
            blocks.append({'type': 'quote', 'content': ' '.join(quote_parts)})
            continue

        # List: consecutive bullet / numbered lines. The first line decides
        # the list type; the loop ends at the first line that is not an item.
        starts_numbered = bool(_NUMBERED_ITEM_RE.match(line))
        if starts_numbered or _BULLET_ITEM_RE.match(line):
            items = []
            while i < total:
                current = lines[i]
                if _BULLET_ITEM_RE.match(current):
                    items.append(current[2:].strip())
                elif _NUMBERED_ITEM_RE.match(current):
                    items.append(_NUMBERED_PREFIX_RE.sub('', current).strip())
                else:
                    break
                i += 1
            blocks.append({
                'type': 'list',
                'list_type': 'numbered' if starts_numbered else 'bullet',
                'items': items,
            })
            continue

        # Figure caption: one line starting with "Figura" or "*Figura".
        if stripped.startswith(('*Figura', 'Figura')):
            blocks.append({'type': 'caption', 'content': stripped.strip('*')})
            i += 1
            continue

        # Regular paragraph: absorb following lines until something that
        # starts another kind of block (or a blank line) shows up.
        para_lines = [line]
        i += 1
        while i < total:
            nxt = lines[i]
            if (
                not nxt.strip()
                or nxt.startswith(('#', '```', '>'))
                or _BULLET_ITEM_RE.match(nxt)
                or _NUMBERED_ITEM_RE.match(nxt)
                or '|' in nxt
                or _THEMATIC_BREAK_RE.match(nxt)
            ):
                break
            para_lines.append(nxt)
            i += 1
        blocks.append({'type': 'paragraph', 'content': ' '.join(para_lines)})
    return blocks
def add_formatted_text(paragraph, text):
    """Append *text* to *paragraph*, turning inline markdown into styled runs.

    Recognised spans are ``**bold**``, ``*italic*`` and `` `code` `` (code is
    set in 10 pt Consolas).  Everything else is added as plain runs.

    Args:
        paragraph: Object exposing ``add_run(text)`` that returns a run.
        text: Markdown-flavoured inline text.
    """
    # With a single capturing group, re.split alternates: even positions hold
    # the text between matches, odd positions hold the matched markup spans.
    # Classifying by position (rather than by what a piece starts/ends with)
    # keeps literal text such as a lone "*" or "**" from being mistaken for
    # markup and emitted as an empty styled run.
    pieces = re.split(r'(\*\*[^*]+\*\*|\*[^*]+\*|`[^`]+`)', text)
    for position, piece in enumerate(pieces):
        if not piece:
            continue
        if position % 2 == 0:
            paragraph.add_run(piece)
        elif piece.startswith('**'):
            paragraph.add_run(piece[2:-2]).bold = True
        elif piece.startswith('`'):
            run = paragraph.add_run(piece[1:-1])
            run.font.name = 'Consolas'
            run.font.size = Pt(10)
        else:
            # Only the single-asterisk alternative is left.
            paragraph.add_run(piece[1:-1]).italic = True
def _split_table_row(line):
    """Split one markdown table line into stripped cell strings."""
    row = line.strip()
    # Outer pipes are optional in markdown; drop them only when present so
    # rows written without them keep their first and last cells.
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|'):
        row = row[:-1]
    return [cell.strip() for cell in row.split('|')]


def _is_separator_row(line, cells):
    """Tell whether a table line is the header/body delimiter row."""
    # Every cell must be dashes (with optional alignment colons); a data row
    # that merely contains '---' in one cell is therefore kept.
    return '---' in line and all(re.fullmatch(r':?-+:?', cell) for cell in cells)


def add_table_to_doc(doc, table_lines):
    """Append a markdown table to *doc* as a 'Table Grid' table.

    The first parsed row is treated as the header and rendered bold.  Cell
    text goes through ``add_formatted_text`` so inline markdown is honoured.
    An empty paragraph is appended after the table as spacing.  Nothing is
    added when *table_lines* yields no rows.

    Args:
        doc: Document object exposing ``add_table`` and ``add_paragraph``.
        table_lines: Raw markdown lines of the table, delimiter row included.
    """
    rows = []
    for line in table_lines:
        cells = _split_table_row(line)
        if not cells or _is_separator_row(line, cells):
            continue
        rows.append(cells)
    if not rows:
        return

    # Size the table for the widest row so no cell is silently discarded
    # when a body row has more cells than the header.
    num_cols = max(len(cells) for cells in rows)
    table = doc.add_table(rows=len(rows), cols=num_cols)
    table.style = 'Table Grid'

    for row_index, row_data in enumerate(rows):
        doc_cells = table.rows[row_index].cells
        for col_index, cell_text in enumerate(row_data):
            cell = doc_cells[col_index]
            cell.text = ''
            para = cell.paragraphs[0]
            add_formatted_text(para, cell_text)
            if row_index == 0:  # header row
                for run in para.runs:
                    run.bold = True

    # Add spacing after table
    doc.add_paragraph()
def add_code_block(doc, code, lang=''):
    """Append *code* to *doc* as a shaded, monospaced paragraph.

    The paragraph is indented 0.5 cm, has 6 pt spacing before and after, a
    light grey (F5F5F5) background, and its text is set in 9 pt Consolas.

    Args:
        doc: Document object exposing ``add_paragraph``.
        code: Source text of the block.
        lang: Language tag from the markdown fence; accepted for interface
            compatibility but not used.
    """
    para = doc.add_paragraph()

    # Add background shading. This is done while the paragraph properties
    # are still empty: the WordprocessingML schema orders w:shd before
    # w:spacing and w:ind, and appending it after those produces an
    # out-of-order w:pPr. w:val is a required attribute of w:shd.
    shading = OxmlElement('w:shd')
    shading.set(qn('w:val'), 'clear')
    shading.set(qn('w:color'), 'auto')
    shading.set(qn('w:fill'), 'F5F5F5')
    para._p.get_or_add_pPr().append(shading)

    para.paragraph_format.left_indent = Cm(0.5)
    para.paragraph_format.space_before = Pt(6)
    para.paragraph_format.space_after = Pt(6)

    run = para.add_run(code)
    run.font.name = 'Consolas'
    run.font.size = Pt(9)
def get_header_style(level, is_numbered=True):
    """Translate a markdown header depth into a paragraph style name.

    Depths 1 to 4 give 'Heading 1' to 'Heading 4'; anything else gives
    'Normal'.  ``is_numbered`` is accepted but does not affect the result.
    """
    known_levels = (
        (1, 'Heading 1'),
        (2, 'Heading 2'),
        (3, 'Heading 3'),
        (4, 'Heading 4'),
    )
    for depth, style_name in known_levels:
        if level == depth:
            return style_name
    return 'Normal'
def add_section_content(doc, md_content, start_numbered=True):
    """Render markdown text into *doc*, one paragraph or table per block.

    The text is split with ``parse_markdown_blocks`` and each block is
    rendered according to its type:

    * header     -- level 1 is skipped (chapter titles are added by the
      caller); levels 2-4 use 'Heading 2'..'Heading 4'; deeper levels become
      a default-style paragraph with its first run in bold.
    * paragraph  -- plain paragraph with inline formatting.
    * code       -- delegated to ``add_code_block``.
    * table      -- delegated to ``add_table_to_doc``.
    * quote      -- paragraph indented 1 cm on both sides, all runs italic.
    * list       -- one 'List Paragraph' per item, indented 1 cm; items of a
      numbered list are prefixed with their ordinal ("1. ", "2. ", ...).
    * caption    -- centred, italic, 10 pt paragraph.

    Args:
        doc: Document object to append to.
        md_content: Markdown source of the section.
        start_numbered: Accepted for interface compatibility; not used.
    """
    # Map markdown header levels to document styles
    # ## (level 2)   -> Heading 2 (subsection like 1.1. Motivación)
    # ### (level 3)  -> Heading 3 (sub-subsection like 1.1.1. xxx)
    # #### (level 4) -> Heading 4
    heading_styles = {2: 'Heading 2', 3: 'Heading 3', 4: 'Heading 4'}

    for block in parse_markdown_blocks(md_content):
        kind = block['type']

        if kind == 'header':
            level = block['level']
            if level == 1:
                # Skip level 1 headers - they're added separately as chapter titles
                continue
            style = heading_styles.get(level)
            if style is not None:
                doc.add_paragraph(block['content'], style=style)
            else:
                para = doc.add_paragraph(block['content'])
                if para.runs:
                    para.runs[0].bold = True

        elif kind == 'paragraph':
            add_formatted_text(doc.add_paragraph(), block['content'])

        elif kind == 'code':
            add_code_block(doc, block['content'], block.get('lang', ''))

        elif kind == 'table':
            add_table_to_doc(doc, block['content'])

        elif kind == 'quote':
            para = doc.add_paragraph()
            para.paragraph_format.left_indent = Cm(1)
            para.paragraph_format.right_indent = Cm(1)
            add_formatted_text(para, block['content'])
            for run in para.runs:
                run.italic = True

        elif kind == 'list':
            numbered = block['list_type'] == 'numbered'
            for ordinal, item in enumerate(block['items'], start=1):
                para = doc.add_paragraph(style='List Paragraph')
                para.paragraph_format.left_indent = Cm(1)
                if numbered:
                    # The parser strips the "N." markers, so put the ordinal
                    # back; otherwise an ordered list is indistinguishable
                    # from a bullet list in the output.
                    add_formatted_text(para, f'{ordinal}. {item}')
                else:
                    # NOTE(review): bullet items carry no marker of their own
                    # (the original prefixed them with an empty string) --
                    # confirm whether a bullet glyph was intended here.
                    add_formatted_text(para, item)

        elif kind == 'caption':
            para = doc.add_paragraph()
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = para.add_run(block['content'])
            run.italic = True
            run.font.size = Pt(10)
def _replace_cover_title(doc):
    """Overwrite the title placeholder among the first 20 paragraphs of *doc*."""
    for para in doc.paragraphs[:20]:
        # Compare in lower case on both sides so the accent-less spelling of
        # the placeholder can actually match.
        lowered = para.text.lower()
        if 'título del tfe' in lowered or 'titulo del tfe' in lowered:
            para.clear()
            para.add_run(THESIS_TITLE).bold = True


def _remove_template_body(doc):
    """Delete all paragraphs from the 'Introducción' Heading 1 onward.

    Returns True when that heading was found, False when nothing was removed.
    """
    stale = []
    found_intro = False
    for para in doc.paragraphs:
        if 'Introducción' in para.text and para.style and 'Heading 1' in para.style.name:
            found_intro = True
        if found_intro:
            stale.append(para)
    for para in stale:
        element = para._element
        element.getparent().remove(element)
    return found_intro


def _split_resumen(resumen_md):
    """Return (spanish_paragraphs, english_paragraphs) parsed from the resumen file."""
    spanish, english = [], []
    section = None
    for block in parse_markdown_blocks(resumen_md):
        if block['type'] == 'header':
            if 'Resumen' in block['content']:
                section = 'es'
            elif 'Abstract' in block['content']:
                section = 'en'
        elif block['type'] == 'paragraph':
            text = block['content']
            # Keyword lines are kept out of the body paragraphs.
            # NOTE(review): they are not written anywhere in the document;
            # confirm whether the template has a place for them.
            if 'Palabras clave:' in text or 'Keywords:' in text:
                continue
            if not text.strip():
                continue
            if section == 'es':
                spanish.append(text)
            elif section == 'en':
                english.append(text)
    return spanish, english


def _fill_summary_section(paragraphs, heading_index, texts):
    """Write up to three *texts* into the 'Normal' paragraphs after a heading."""
    for offset, text in enumerate(texts[:3], start=1):  # Limit to first 3 paragraphs
        position = heading_index + offset
        if position >= len(paragraphs):
            break
        target = paragraphs[position]
        if target.style and target.style.name == 'Normal':
            target.clear()
            add_formatted_text(target, text)


def create_thesis_document():
    """Build the thesis .docx from the template and the markdown chapters.

    Steps, in order:

    1. Load the template and replace the cover-page title placeholder.
    2. Read the eight markdown files under ``DOCS_PATH``.
    3. Remove the template paragraphs from the 'Introducción' Heading 1 on.
    4. Append chapters 1-5 (each followed by a page break), the references
       (hanging-indent paragraphs) and Anexo A.
    5. Fill the paragraphs that follow the template's Resumen / Abstract
       headings with the first three Spanish / English paragraphs.
    6. Save to ``OUTPUT_PATH``.

    All edits are made on the single document that is saved, so the cover
    title replacement ends up in the output file.

    NOTE(review): step 3 walks ``doc.paragraphs`` only; confirm whether the
    template has tables or other non-paragraph content after the
    introduction that should be removed as well.
    """
    print("Loading template...")
    new_doc = Document(TEMPLATE_PATH)

    # Find and update title on cover page
    _replace_cover_title(new_doc)

    # Read all markdown files
    print("Reading markdown files...")
    md_names = {
        'resumen': '00_resumen.md',
        'introduccion': '01_introduccion.md',
        'contexto': '02_contexto_estado_arte.md',
        'objetivos': '03_objetivos_metodologia.md',
        'desarrollo': '04_desarrollo_especifico.md',
        'conclusiones': '05_conclusiones_trabajo_futuro.md',
        'referencias': '06_referencias_bibliograficas.md',
        'anexo': '07_anexo_a.md',
    }
    md_files = {
        key: read_markdown_file(os.path.join(DOCS_PATH, name))
        for key, name in md_names.items()
    }

    # Keep the template front matter (cover, resumen structure, indices) and
    # drop its placeholder chapters.
    print("Creating new document with thesis content...")
    if not _remove_template_body(new_doc):
        print("Warning: 'Introducción' Heading 1 not found; template content was kept.")

    # Now add our content
    print("Adding thesis content...")
    chapters = [
        ('introduccion', '1. Introducción'),
        ('contexto', '2. Contexto y estado del arte'),
        ('objetivos', '3. Objetivos concretos y metodología de trabajo'),
        ('desarrollo', '4. Desarrollo específico de la contribución'),
        ('conclusiones', '5. Conclusiones y trabajo futuro'),
    ]
    for key, title in chapters:
        print(f"  Adding chapter: {title}")
        # Add chapter heading with Heading 1 style
        new_doc.add_paragraph(title, style='Heading 1')
        # Drop the leading "# N. Title" line and the blank lines after it;
        # the chapter heading was just added with the proper style.
        content = re.sub(r'^#\s+\d+\.\s+[^\n]+\n+', '', md_files[key])
        add_section_content(new_doc, content)
        new_doc.add_page_break()

    # Add Referencias
    print("  Adding Referencias bibliográficas")
    new_doc.add_paragraph('Referencias bibliográficas', style='Título 1 sin numerar')
    refs_content = re.sub(r'^#[^\n]+\n+', '', md_files['referencias'])  # Remove header
    # Each blank-line-separated chunk is one reference, set with a hanging indent.
    for entry in refs_content.split('\n\n'):
        if entry.strip():
            para = new_doc.add_paragraph()
            para.paragraph_format.left_indent = Cm(1.27)
            para.paragraph_format.first_line_indent = Cm(-1.27)
            add_formatted_text(para, entry.strip())
    new_doc.add_page_break()

    # Add Anexo
    print("  Adding Anexo A")
    new_doc.add_paragraph('Anexo A. Código fuente y datos analizados', style='Título 1 sin numerar')
    anexo_content = re.sub(r'^#[^\n]+\n+', '', md_files['anexo'])
    add_section_content(new_doc, anexo_content)

    # Update Resumen/Abstract sections (find them in the document and update)
    print("Updating Resumen and Abstract...")
    spanish_paragraphs, english_paragraphs = _split_resumen(md_files['resumen'])

    # Clearing and refilling paragraphs does not change their count or order,
    # so one snapshot of the paragraph list is enough.
    paragraphs = new_doc.paragraphs
    for index, para in enumerate(paragraphs):
        if not (para.style and 'Título' in para.style.name):
            continue
        heading_text = para.text.strip()
        if 'Resumen' in heading_text:
            _fill_summary_section(paragraphs, index, spanish_paragraphs)
        elif 'Abstract' in heading_text:
            _fill_summary_section(paragraphs, index, english_paragraphs)

    # Save document
    print(f"Saving document to {OUTPUT_PATH}...")
    new_doc.save(OUTPUT_PATH)
    print(f"Done! Document saved as {OUTPUT_PATH}")
if __name__ == '__main__':
    # TEMPLATE_PATH, DOCS_PATH and OUTPUT_PATH are relative paths. Resolve
    # them against the directory containing this script instead of a
    # machine-specific absolute path, so the script runs from any checkout.
    # NOTE(review): assumes the script sits in the project root next to
    # 'instructions/' and 'docs/' -- confirm.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    create_thesis_document()