439 lines
16 KiB
Python
439 lines
16 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Generate thesis.docx from markdown files using UNIR template.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import re
|
||
|
|
import os
|
||
|
|
from docx import Document
|
||
|
|
from docx.shared import Pt, Cm, RGBColor, Inches
|
||
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
|
|
from docx.enum.style import WD_STYLE_TYPE
|
||
|
|
from docx.oxml.ns import qn
|
||
|
|
from docx.oxml import OxmlElement
|
||
|
|
|
||
|
|
# Paths (all relative, so they resolve against the current working directory)
TEMPLATE_PATH = 'instructions/plantilla_individual.docx'  # Word template opened with Document()
OUTPUT_PATH = 'TFM_Sergio_Jimenez_OCR_Optimization.docx'  # file the generated document is saved to
DOCS_PATH = 'docs'  # directory that holds the numbered chapter markdown files

# Thesis metadata
THESIS_TITLE = "Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español"  # text used to replace the cover-page title placeholder
# NOTE(review): AUTHOR, DIRECTOR and DATE are not referenced in the visible
# part of this file - confirm whether they are still needed.
AUTHOR = "Sergio Jiménez Jiménez"
DIRECTOR = "[Nombre del Director]"  # still a placeholder value
DATE = "2025"
|
||
|
|
|
||
|
|
|
||
|
|
def read_markdown_file(filepath):
    """Return the complete text of the file at *filepath*, decoded as UTF-8."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
|
||
|
|
|
||
|
|
|
||
|
|
def parse_markdown_blocks(md_content):
    """Split markdown text into an ordered list of block dictionaries.

    Every block has a ``'type'`` key; the remaining keys depend on the type:

    * ``code``      -> ``lang`` (text after the opening fence), ``content``
    * ``header``    -> ``level`` (number of leading ``#``), ``content``
    * ``table``     -> ``content`` (list of the raw table lines)
    * ``quote``     -> ``content`` (the ``>`` lines joined with spaces)
    * ``list``      -> ``list_type`` (``'bullet'`` / ``'numbered'``), ``items``
    * ``caption``   -> ``content`` (line with surrounding ``*`` removed)
    * ``paragraph`` -> ``content`` (consecutive lines joined with spaces)

    Blank lines only separate blocks and never produce output.
    """
    bullet_re = re.compile(r'^[\-\*\+]\s')
    numbered_re = re.compile(r'^\d+\.\s')

    def continues_paragraph(candidate):
        # A paragraph keeps absorbing lines until a blank line or a line
        # that looks like the start of another block type.
        if not candidate.strip():
            return False
        if candidate.startswith(('#', '```', '>')):
            return False
        if bullet_re.match(candidate) or numbered_re.match(candidate):
            return False
        return '|' not in candidate

    source = md_content.split('\n')
    total = len(source)
    parsed = []
    pos = 0

    while pos < total:
        raw = source[pos]
        trimmed = raw.strip()

        # Blank line: separator only.
        if not trimmed:
            pos += 1
            continue

        # Fenced code block: collect everything up to the closing fence
        # (or the end of the input when the fence is never closed).
        if trimmed.startswith('```'):
            fence_end = pos + 1
            while fence_end < total and not source[fence_end].strip().startswith('```'):
                fence_end += 1
            parsed.append({
                'type': 'code',
                'lang': trimmed[3:],
                'content': '\n'.join(source[pos + 1:fence_end]),
            })
            pos = fence_end + 1
            continue

        # Header: the level is the count of leading '#' characters.
        if raw.startswith('#'):
            remainder = raw.lstrip('#')
            depth = len(raw) - len(remainder)
            # Drop a trailing {.unnumbered} attribute from the title.
            title = re.sub(r'\s*\{\.unnumbered\}\s*$', '', remainder.strip())
            parsed.append({'type': 'header', 'level': depth, 'content': title})
            pos += 1
            continue

        # Table: a line with a pipe followed by a line containing '---'.
        if '|' in raw and pos + 1 < total and '---' in source[pos + 1]:
            stop = pos + 1
            while stop < total and '|' in source[stop]:
                stop += 1
            parsed.append({'type': 'table', 'content': source[pos:stop]})
            pos = stop
            continue

        # Blockquote: consecutive '>' lines merged into one string.
        if raw.startswith('>'):
            stop = pos + 1
            while stop < total and source[stop].startswith('>'):
                stop += 1
            pieces = [entry[1:].strip() for entry in source[pos:stop]]
            parsed.append({'type': 'quote', 'content': ' '.join(pieces)})
            pos = stop
            continue

        # List: the first item decides the list type; following bullet and
        # numbered lines are both absorbed until any other line appears.
        if bullet_re.match(raw) or numbered_re.match(raw):
            kind = 'bullet' if bullet_re.match(raw) else 'numbered'
            entries = []
            while pos < total:
                candidate = source[pos]
                if bullet_re.match(candidate):
                    entries.append(candidate[2:].strip())
                elif numbered_re.match(candidate):
                    entries.append(re.sub(r'^\d+\.\s*', '', candidate).strip())
                else:
                    break
                pos += 1
            parsed.append({'type': 'list', 'list_type': kind, 'items': entries})
            continue

        # Figure caption: a line starting with 'Figura' or '*Figura'.
        if trimmed.startswith(('*Figura', 'Figura')):
            parsed.append({'type': 'caption', 'content': trimmed.strip('*')})
            pos += 1
            continue

        # Anything else is a paragraph.
        stop = pos + 1
        while stop < total and continues_paragraph(source[stop]):
            stop += 1
        parsed.append({'type': 'paragraph', 'content': ' '.join(source[pos:stop])})
        pos = stop

    return parsed
|
||
|
|
|
||
|
|
|
||
|
|
def add_formatted_text(paragraph, text):
    """Append *text* to *paragraph*, rendering simple inline markdown.

    Recognised markup (no nesting):

    * ``**bold**``  -> bold run
    * ``*italic*``  -> italic run
    * `` `code` ``  -> run in Consolas, 10 pt

    Everything else is added verbatim as plain runs.

    Args:
        paragraph: Object exposing ``add_run(text)``; the callers in this
            file pass python-docx paragraphs.
        text: Markdown-flavoured string to render.

    Returns:
        None. The paragraph is modified in place.
    """
    # re.split with a single capturing group returns plain text at even
    # indexes and the matched markup tokens at odd indexes. Classifying by
    # position (rather than by how a fragment starts and ends) keeps literal
    # fragments such as "*", "**" or "`" from being mistaken for markup and
    # sliced down to an empty run.
    segments = re.split(r'(\*\*[^*]+\*\*|\*[^*]+\*|`[^`]+`)', text)

    for index, segment in enumerate(segments):
        if not segment:
            continue
        if index % 2 == 0:
            # Plain text between markup tokens.
            paragraph.add_run(segment)
        elif segment.startswith('**'):
            run = paragraph.add_run(segment[2:-2])
            run.bold = True
        elif segment.startswith('*'):
            run = paragraph.add_run(segment[1:-1])
            run.italic = True
        else:
            # Only the backtick alternative is left.
            run = paragraph.add_run(segment[1:-1])
            run.font.name = 'Consolas'
            run.font.size = Pt(10)
|
||
|
|
|
||
|
|
|
||
|
|
def add_table_to_doc(doc, table_lines):
    """Render a markdown pipe table into *doc* as a 'Table Grid' table.

    Args:
        doc: Document object providing ``add_table`` and ``add_paragraph``.
        table_lines: Raw markdown lines of the table (header row, separator
            row, data rows), as collected by ``parse_markdown_blocks``.

    Returns:
        None. Nothing is added when no usable row is found.

    The first parsed row fixes the column count and is rendered in bold;
    cells beyond that count in later rows are ignored. An empty paragraph is
    appended after the table as spacing.
    """
    rows = []
    for line in table_lines:
        # Skip only the real separator row (pipes, dashes, colons and
        # whitespace). A data row that merely contains '---' inside a cell
        # must be kept.
        if '---' in line and not line.strip().strip('|:- \t'):
            continue

        parts = line.split('|')
        # Outer pipes produce an empty fragment at each end; drop those, but
        # keep the first/last cell of rows written without outer pipes.
        if parts and not parts[0].strip():
            parts = parts[1:]
        if parts and not parts[-1].strip():
            parts = parts[:-1]

        cells = [part.strip() for part in parts]
        if cells:
            rows.append(cells)

    if not rows:
        return

    num_cols = len(rows[0])
    table = doc.add_table(rows=len(rows), cols=num_cols)
    table.style = 'Table Grid'

    for row_index, (row, row_data) in enumerate(zip(table.rows, rows)):
        # Read row.cells once per row instead of once per cell; zip() also
        # drops any surplus cells a ragged row may carry.
        for cell, cell_text in zip(row.cells, row_data):
            cell.text = ''
            para = cell.paragraphs[0]
            add_formatted_text(para, cell_text)
            if row_index == 0:  # Header row
                for run in para.runs:
                    run.bold = True

    # Add spacing after table
    doc.add_paragraph()
|
||
|
|
|
||
|
|
|
||
|
|
def add_code_block(doc, code, lang=''):
    """Append *code* to *doc* as a shaded, monospaced paragraph.

    Args:
        doc: Document object providing ``add_paragraph``.
        code: Source text to insert as a single run.
        lang: Language tag of the fenced block. Accepted for interface
            compatibility; it is not used for rendering.

    Returns:
        None.
    """
    para = doc.add_paragraph()
    fmt = para.paragraph_format
    fmt.left_indent = Cm(0.5)
    fmt.space_before = Pt(6)
    fmt.space_after = Pt(6)

    run = para.add_run(code)
    run.font.name = 'Consolas'
    run.font.size = Pt(9)

    # Light grey background. w:val is a required attribute of w:shd, so it
    # is set explicitly ("clear" = solid fill with the w:fill colour).
    shading = OxmlElement('w:shd')
    shading.set(qn('w:val'), 'clear')
    shading.set(qn('w:color'), 'auto')
    shading.set(qn('w:fill'), 'F5F5F5')

    # The children of w:pPr have a fixed order and w:shd must come before
    # w:spacing / w:ind (both created above), so insert it ahead of every
    # element that may legally follow it instead of appending at the end.
    para._p.get_or_add_pPr().insert_element_before(
        shading,
        'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
        'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
        'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
        'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
        'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
        'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
        'w:pPrChange',
    )
|
||
|
|
|
||
|
|
|
||
|
|
def get_header_style(level, is_numbered=True):
    """Return the paragraph style name for a markdown header *level*.

    Levels 1-4 map to 'Heading 1' .. 'Heading 4'; any other level maps to
    'Normal'. *is_numbered* is accepted but does not influence the result.
    """
    heading_styles = (
        (1, 'Heading 1'),
        (2, 'Heading 2'),
        (3, 'Heading 3'),
        (4, 'Heading 4'),
    )
    for depth, style_name in heading_styles:
        if level == depth:
            return style_name
    return 'Normal'
|
||
|
|
|
||
|
|
|
||
|
|
def add_section_content(doc, md_content, start_numbered=True):
    """Parse *md_content* and append its blocks to *doc* with formatting.

    Args:
        doc: Document object the content is appended to.
        md_content: Markdown text of one section.
        start_numbered: Accepted for interface compatibility; not used.

    Returns:
        None.

    Block handling:
        * header: level 1 is skipped (chapter titles are added by the
          caller); levels 2-4 use 'Heading 2'..'Heading 4'; deeper levels
          become a plain paragraph with a bold first run.
        * paragraph / quote: inline markdown is rendered; quotes are
          indented 1 cm on both sides and set in italics.
        * list: one 'List Paragraph' per item, prefixed with a bullet or
          with its 1-based position for numbered lists.
        * code / table: delegated to add_code_block / add_table_to_doc.
        * caption: centred, italic, 10 pt.
    """
    for block in parse_markdown_blocks(md_content):
        kind = block['type']

        if kind == 'header':
            level = block['level']
            text = block['content']

            if level == 1:
                # Skip level 1 headers - they're added separately as chapter titles
                continue

            # Reuse the shared level -> style mapping so both stay in sync.
            style_name = get_header_style(level)
            if style_name != 'Normal':
                doc.add_paragraph(text, style=style_name)
            else:
                para = doc.add_paragraph(text)
                if para.runs:
                    para.runs[0].bold = True

        elif kind == 'paragraph':
            para = doc.add_paragraph()
            add_formatted_text(para, block['content'])

        elif kind == 'code':
            add_code_block(doc, block['content'], block.get('lang', ''))

        elif kind == 'table':
            add_table_to_doc(doc, block['content'])

        elif kind == 'quote':
            para = doc.add_paragraph()
            para.paragraph_format.left_indent = Cm(1)
            para.paragraph_format.right_indent = Cm(1)
            add_formatted_text(para, block['content'])
            for run in para.runs:
                run.italic = True

        elif kind == 'list':
            is_bullet = block['list_type'] == 'bullet'
            for number, item in enumerate(block['items'], start=1):
                # The parser strips the original markers, so the visible
                # marker has to be written back here; without it a numbered
                # list would render as unmarked indented paragraphs.
                marker = '• ' if is_bullet else f'{number}. '
                para = doc.add_paragraph(style='List Paragraph')
                para.paragraph_format.left_indent = Cm(1)
                add_formatted_text(para, marker + item)

        elif kind == 'caption':
            para = doc.add_paragraph()
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = para.add_run(block['content'])
            run.italic = True
            run.font.size = Pt(10)
|
||
|
|
|
||
|
|
|
||
|
|
def _replace_cover_title(doc):
    """Replace the template's title placeholder on the cover with THESIS_TITLE."""
    # Only the first paragraphs are inspected, as the placeholder sits on
    # the cover page.
    for para in doc.paragraphs[:20]:
        lowered = para.text.lower()
        # Compare in lower case on both sides (with and without the accent);
        # a needle containing upper-case letters can never match lowered text.
        if 'título del tfe' in lowered or 'titulo del tfe' in lowered:
            para.clear()
            run = para.add_run(THESIS_TITLE)
            run.bold = True


def _load_markdown_sources():
    """Read every chapter markdown file under DOCS_PATH, keyed by section."""
    filenames = {
        'resumen': '00_resumen.md',
        'introduccion': '01_introduccion.md',
        'contexto': '02_contexto_estado_arte.md',
        'objetivos': '03_objetivos_metodologia.md',
        'desarrollo': '04_desarrollo_especifico.md',
        'conclusiones': '05_conclusiones_trabajo_futuro.md',
        'referencias': '06_referencias_bibliograficas.md',
        'anexo': '07_anexo_a.md',
    }
    return {
        key: read_markdown_file(os.path.join(DOCS_PATH, name))
        for key, name in filenames.items()
    }


def _strip_template_body(doc):
    """Remove all paragraphs from the 'Introducción' Heading 1 to the end."""
    found_intro = False
    doomed = []
    for para in doc.paragraphs:
        if 'Introducción' in para.text and para.style and 'Heading 1' in para.style.name:
            found_intro = True
        if found_intro:
            doomed.append(para)

    # Detach after the scan so the paragraph list is not modified mid-loop.
    for para in doomed:
        element = para._element
        element.getparent().remove(element)


def _split_abstracts(resumen_md):
    """Return (spanish_paragraphs, english_paragraphs) from the resumen file."""
    spanish_paragraphs = []
    english_paragraphs = []
    current_section = None

    for block in parse_markdown_blocks(resumen_md):
        if block['type'] == 'header':
            if 'Resumen' in block['content']:
                current_section = 'es'
            elif 'Abstract' in block['content']:
                current_section = 'en'
        elif block['type'] == 'paragraph':
            text = block['content']
            # Keyword lines are not part of the abstract body and are not
            # written to the document anywhere, so they are only filtered out.
            if 'Palabras clave:' in text or 'Keywords:' in text:
                continue
            if current_section == 'es' and text.strip():
                spanish_paragraphs.append(text)
            elif current_section == 'en' and text.strip():
                english_paragraphs.append(text)

    return spanish_paragraphs, english_paragraphs


def _fill_summary_sections(doc, spanish_paragraphs, english_paragraphs):
    """Overwrite the 'Normal' paragraphs after the Resumen/Abstract titles."""
    # Snapshot once: the paragraphs are only edited in place below, never
    # added or removed, so the list stays valid and is not rebuilt per lookup.
    paragraphs = doc.paragraphs

    for index, para in enumerate(paragraphs):
        if not (para.style and 'Título' in para.style.name):
            continue

        text = para.text.strip()
        if 'Resumen' in text:
            replacement = spanish_paragraphs
        elif 'Abstract' in text:
            replacement = english_paragraphs
        else:
            continue

        # Limit to the first 3 paragraphs of each summary.
        for offset, content in enumerate(replacement[:3], start=1):
            if index + offset >= len(paragraphs):
                continue
            target = paragraphs[index + offset]
            if target.style and target.style.name == 'Normal':
                target.clear()
                add_formatted_text(target, content)


def create_thesis_document():
    """Build the thesis .docx from the template and the markdown chapters.

    Steps:
        1. Load the template and replace the cover-page title placeholder.
        2. Read the markdown sources from DOCS_PATH.
        3. Remove the template body from its 'Introducción' heading onward.
        4. Append the five chapters, the references and Anexo A.
        5. Fill the Resumen / Abstract placeholders kept from the template.
        6. Save the result to OUTPUT_PATH.

    Returns:
        None. Progress messages are printed to stdout.
    """
    print("Loading template...")
    # One document is used throughout, so the cover-title replacement ends
    # up in the file that is saved rather than in a discarded copy.
    new_doc = Document(TEMPLATE_PATH)
    _replace_cover_title(new_doc)

    print("Reading markdown files...")
    md_files = _load_markdown_sources()

    print("Creating new document with thesis content...")
    _strip_template_body(new_doc)

    print("Adding thesis content...")
    chapters = [
        ('introduccion', '1. Introducción'),
        ('contexto', '2. Contexto y estado del arte'),
        ('objetivos', '3. Objetivos concretos y metodología de trabajo'),
        ('desarrollo', '4. Desarrollo específico de la contribución'),
        ('conclusiones', '5. Conclusiones y trabajo futuro'),
    ]

    for key, title in chapters:
        print(f"  Adding chapter: {title}")
        # The chapter title is added here with the Heading 1 style, so the
        # numbered top-level header is removed from the markdown first.
        new_doc.add_paragraph(title, style='Heading 1')
        content = re.sub(r'^#\s+\d+\.\s+[^\n]+\n+', '', md_files[key])
        add_section_content(new_doc, content)
        new_doc.add_page_break()

    # Add Referencias
    print("  Adding Referencias bibliográficas")
    new_doc.add_paragraph('Referencias bibliográficas', style='Título 1 sin numerar')
    refs_content = re.sub(r'^#[^\n]+\n+', '', md_files['referencias'])  # Remove header

    # Each blank-line-separated chunk is one reference, set with a hanging indent.
    for entry in refs_content.split('\n\n'):
        if entry.strip():
            para = new_doc.add_paragraph()
            para.paragraph_format.left_indent = Cm(1.27)
            para.paragraph_format.first_line_indent = Cm(-1.27)
            add_formatted_text(para, entry.strip())

    new_doc.add_page_break()

    # Add Anexo
    print("  Adding Anexo A")
    new_doc.add_paragraph('Anexo A. Código fuente y datos analizados', style='Título 1 sin numerar')
    anexo_content = re.sub(r'^#[^\n]+\n+', '', md_files['anexo'])
    add_section_content(new_doc, anexo_content)

    # Update Resumen/Abstract sections kept from the template
    print("Updating Resumen and Abstract...")
    spanish_paragraphs, english_paragraphs = _split_abstracts(md_files['resumen'])
    _fill_summary_sections(new_doc, spanish_paragraphs, english_paragraphs)

    # Save document
    print(f"Saving document to {OUTPUT_PATH}...")
    new_doc.save(OUTPUT_PATH)
    print(f"Done! Document saved as {OUTPUT_PATH}")
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # The relative paths above are resolved from the project root. This
    # location only exists on the author's machine, so switch to it when it
    # is present and otherwise run from the current working directory
    # instead of failing before any work is done.
    project_root = '/Users/sergio/Desktop/MastersThesis'
    if os.path.isdir(project_root):
        os.chdir(project_root)
    create_thesis_document()
|