autogen attempt
This commit is contained in:
269
generate_thesis.py
Normal file
269
generate_thesis.py
Normal file
@@ -0,0 +1,269 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate thesis DOCX from HTML template and markdown content."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Root of the thesis working tree. Defaults to the original author's checkout;
# set the THESIS_BASE_DIR environment variable to run from another location.
BASE_DIR = os.environ.get('THESIS_BASE_DIR', '/Users/sergio/Desktop/MastersThesis')

# HTML template and its companion resources folder.
TEMPLATE_HTM = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_FILES = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')

# Generated artifacts: intermediate HTML, its resources folder, final DOCX.
OUTPUT_HTM = os.path.join(BASE_DIR, 'thesis_output.htm')
OUTPUT_FILES = os.path.join(BASE_DIR, 'thesis_output_files')
OUTPUT_DOCX = os.path.join(BASE_DIR, 'TFM_Sergio_Jimenez_OCR_Optimization.docx')

# Directory holding the markdown chapter sources.
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
|
||||
|
||||
def read_md(filename):
    """Return the full text of *filename*, read as UTF-8 from DOCS_DIR."""
    md_path = os.path.join(DOCS_DIR, filename)
    with open(md_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
|
||||
|
||||
def md_to_html(md_text):
    """Convert a markdown string to simple HTML.

    Supported constructs: fenced code blocks, inline code, ATX headers
    (``#`` to ``######``), single-line blockquotes, pipe tables, bullet and
    numbered lists, ``**bold**`` / ``*italic*``, and paragraphs (one per
    non-blank line).

    Code is extracted first and replaced by opaque placeholder tokens so that
    none of the later passes (headers, emphasis, paragraph wrapping) can
    rewrite its contents; the tokens are swapped back at the very end. Code
    text is HTML-escaped; all other text is passed through as-is, so raw HTML
    written in the markdown is preserved.

    Args:
        md_text: Markdown source text.

    Returns:
        The HTML fragment as a string, one block element per line (``<pre>``
        blocks keep their internal newlines).
    """
    # Local import: only this function needs it.
    from html import escape

    protected = []

    def protect(fragment, kind):
        # kind is 'B' (block-level) or 'I' (inline); NUL delimiters cannot be
        # produced by any of the markdown patterns below.
        protected.append(fragment)
        return f'\x00{kind}{len(protected) - 1}\x00'

    # A guaranteed trailing newline lets the line-oriented patterns below
    # also match a table row that ends the input.
    text = md_text if md_text.endswith('\n') else md_text + '\n'

    # Fenced code blocks. Must run before inline code: otherwise the
    # single-backtick pattern consumes part of the ``` fences.
    def code_block_replace(match):
        code = escape(match.group(2), quote=False)
        pre = ('<pre style="background:#f5f5f5;padding:10px;'
               'font-family:Consolas;font-size:9pt">' + code + '</pre>')
        # Surround with newlines so the token sits on a line of its own.
        return '\n' + protect(pre, 'B') + '\n'

    text = re.sub(r'```([^\n`]*)\n(.*?)```', code_block_replace, text,
                  flags=re.DOTALL)

    # Inline code
    text = re.sub(
        r'`([^`]+)`',
        lambda m: protect('<code>' + escape(m.group(1), quote=False) + '</code>', 'I'),
        text,
    )

    # Headers
    def header_replace(match):
        level = len(match.group(1))
        return f'<h{level}>{match.group(2)}</h{level}>'

    text = re.sub(r'^(#{1,6}) (.+)$', header_replace, text, flags=re.MULTILINE)

    # Blockquotes ([ \t]* rather than \s* so an empty ">" line cannot
    # swallow the following line).
    text = re.sub(
        r'^>[ \t]*(.+)$',
        r'<blockquote style="margin-left:2cm;font-style:italic">\1</blockquote>',
        text,
        flags=re.MULTILINE,
    )

    # Tables
    separator_row = re.compile(r'\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?')

    def table_replace(match):
        rows = []
        for line in match.group(0).strip().split('\n'):
            line = line.strip()
            # Skip only true separator rows (|---|:--:|), not any data row
            # that happens to contain dashes.
            if separator_row.fullmatch(line):
                continue
            rows.append([c.strip() for c in line.split('|')[1:-1]])

        parts = ['<table border="1" style="border-collapse:collapse;margin:10px 0">']
        for i, row in enumerate(rows):
            tag = 'th' if i == 0 else 'td'  # first row is the header row
            cells = ''.join(
                f'<{tag} style="padding:5px;border:1px solid #ccc">{cell}</{tag}>'
                for cell in row
            )
            parts.append(f'<tr>{cells}</tr>')
        parts.append('</table>')
        # The match consumed the row's newline; put one back so the next
        # source line is not glued onto the table's line.
        return ''.join(parts) + '\n'

    text = re.sub(r'(?:^[ \t]*\|[^\n]+\|[ \t]*\n)+', table_replace, text,
                  flags=re.MULTILINE)

    # Bullet lists. Handled before emphasis so "* item" markers are gone
    # by the time the *italic* pattern runs.
    def bullet_list_replace(match):
        items = match.group(0).strip().split('\n')
        body = ''.join(
            '<li>' + re.sub(r'^[\-\*\+]\s*', '', item) + '</li>' for item in items
        )
        return '<ul>' + body + '</ul>\n'

    text = re.sub(r'(?:^[\-\*\+][ \t]+.+\n?)+', bullet_list_replace, text,
                  flags=re.MULTILINE)

    # Numbered lists
    def num_list_replace(match):
        items = match.group(0).strip().split('\n')
        body = ''.join(
            '<li>' + re.sub(r'^\d+\.\s*', '', item) + '</li>' for item in items
        )
        return '<ol>' + body + '</ol>\n'

    text = re.sub(r'(?:^\d+\.[ \t]+.+\n?)+', num_list_replace, text,
                  flags=re.MULTILINE)

    # Bold and italic, confined to a single line because paragraphs are
    # built per line; an opening '*' followed by whitespace is not emphasis.
    text = re.sub(r'\*\*([^*\n]+)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'\*(?!\s)([^*\n]+)\*', r'<i>\1</i>', text)

    # Paragraphs: wrap every line that is not already block-level markup.
    # Lines that start with an inline tag (<b>, <i>, <code>, ...) are still
    # ordinary paragraph text and get wrapped.
    inline_start = re.compile(r'<(?:b|i|em|strong|code|a|span|sub|sup|u)\b',
                              re.IGNORECASE)
    result = []
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
        # '{' lines are passed through unwrapped, as before
        # (presumably template placeholders -- TODO confirm).
        is_markup = line.startswith(('<', '{', '\x00B'))
        if is_markup and not inline_start.match(line):
            result.append(line)
        else:
            result.append(f'<p class="MsoNormal">{line}</p>')

    # Swap the protected code fragments back in.
    return re.sub(
        r'\x00[BI](\d+)\x00',
        lambda m: protected[int(m.group(1))],
        '\n'.join(result),
    )
|
||||
|
||||
# Markdown source file for each logical section, in document order.
_MARKDOWN_SOURCES = {
    'resumen': '00_resumen.md',
    'intro': '01_introduccion.md',
    'contexto': '02_contexto_estado_arte.md',
    'objetivos': '03_objetivos_metodologia.md',
    'desarrollo': '04_desarrollo_especifico.md',
    'conclusiones': '05_conclusiones_trabajo_futuro.md',
    'referencias': '06_referencias_bibliograficas.md',
    'anexo': '07_anexo_a.md',
}

# Sections emitted as plain chapters between the table of contents and the
# references.
_CHAPTER_KEYS = ('intro', 'contexto', 'objetivos', 'desarrollo', 'conclusiones')

_HTML_HEAD = '''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>TFM - Optimización de Hiperparámetros OCR</title>
<style>
body { font-family: Calibri, sans-serif; font-size: 12pt; line-height: 1.5; margin: 2.5cm 2cm 2.5cm 3cm; }
h1 { font-family: "Calibri Light", sans-serif; font-size: 18pt; color: #0098CD; margin-top: 24pt; }
h2 { font-family: "Calibri Light", sans-serif; font-size: 14pt; color: #0098CD; margin-top: 18pt; }
h3 { font-family: "Calibri Light", sans-serif; font-size: 12pt; margin-top: 12pt; }
h4 { font-family: "Calibri Light", sans-serif; font-size: 11pt; margin-top: 10pt; }
p { text-align: justify; margin: 6pt 0; }
code { font-family: Consolas, monospace; font-size: 10pt; background: #f5f5f5; padding: 2px 4px; }
pre { font-family: Consolas, monospace; font-size: 9pt; background: #f5f5f5; padding: 10px; overflow-x: auto; }
table { border-collapse: collapse; margin: 12pt 0; width: 100%; }
th, td { border: 1px solid #ccc; padding: 6pt; text-align: left; }
th { background: #f0f0f0; font-weight: bold; }
blockquote { margin-left: 2cm; font-style: italic; border-left: 3px solid #0098CD; padding-left: 10px; }
ul, ol { margin: 6pt 0 6pt 1cm; }
li { margin: 3pt 0; }
.title-page { text-align: center; page-break-after: always; }
.title-page h1 { font-size: 24pt; color: #0098CD; }
.chapter { page-break-before: always; }
.referencias p { text-indent: -1.27cm; padding-left: 1.27cm; }
</style>
</head>
<body>
'''

_TITLE_PAGE = '''
<div class="title-page">
<p style="margin-top:3cm"><img src="thesis_output_files/unir_logo.png" width="200" alt="UNIR Logo"></p>
<p style="margin-top:1cm;font-size:14pt">Universidad Internacional de La Rioja<br>Escuela Superior de Ingeniería y Tecnología</p>
<p style="margin-top:2cm;font-size:12pt">Máster Universitario en Inteligencia Artificial</p>
<h1 style="margin-top:2cm">Optimización de Hiperparámetros OCR con Ray Tune para Documentos Académicos en Español</h1>
<p style="margin-top:3cm">Trabajo Fin de Estudio presentado por: <b>Sergio Jiménez Jiménez</b></p>
<p>Tipo de trabajo: Comparativa de soluciones / Piloto experimental</p>
<p>Director: [Nombre del Director]</p>
<p style="margin-top:2cm">Fecha: 2025</p>
</div>
'''

_TOC_PLACEHOLDER = '''
<div class="chapter">
<h1 style="color:black">Índice de contenidos</h1>
<p><i>[El índice se generará automáticamente en Word]</i></p>
</div>
'''


def _chapter_div(body, css_class='chapter'):
    """Wrap an HTML fragment in a <div> carrying the given CSS class(es)."""
    return f'<div class="{css_class}">\n{body}</div>\n'


def _build_thesis_html(html_content):
    """Assemble the standalone HTML document from the converted sections."""
    parts = [
        _HTML_HEAD,
        _TITLE_PAGE,
        _chapter_div(html_content['resumen']),
        _TOC_PLACEHOLDER,
    ]
    parts.extend(_chapter_div(html_content[key]) for key in _CHAPTER_KEYS)
    parts.append(_chapter_div(html_content['referencias'], 'chapter referencias'))
    parts.append(_chapter_div(html_content['anexo']))
    parts.append('</body></html>')
    return ''.join(parts)


def _run_pandoc(*extra_args):
    """Run pandoc on OUTPUT_HTM -> OUTPUT_DOCX and return the CompletedProcess."""
    command = ['pandoc', OUTPUT_HTM, '-o', OUTPUT_DOCX, *extra_args,
               '--toc', '--toc-depth=3']
    return subprocess.run(command, capture_output=True, text=True)


def main():
    """Generate the thesis HTML and convert it to DOCX with pandoc.

    Steps: read the markdown sections from DOCS_DIR, convert each with
    md_to_html, write the assembled document to OUTPUT_HTM, refresh
    OUTPUT_FILES from TEMPLATE_FILES (when that folder exists), then run
    pandoc, first with the template DOCX as ``--reference-doc`` and, if that
    fails, once more without it.

    The HTML template itself is no longer read: the previous version parsed
    it but never used the result.

    Returns:
        0 when the DOCX was written, 1 when pandoc is missing or failed.
    """
    # Read markdown files
    print("Reading markdown content...")
    md_files = {key: read_md(name) for key, name in _MARKDOWN_SOURCES.items()}

    # Convert markdown to HTML
    print("Converting markdown to HTML...")
    html_content = {key: md_to_html(md) for key, md in md_files.items()}

    print("Replacing template content...")
    new_html = _build_thesis_html(html_content)

    # Save HTML
    print(f"Saving HTML to {OUTPUT_HTM}...")
    with open(OUTPUT_HTM, 'w', encoding='utf-8') as f:
        f.write(new_html)

    # Replace the output resources folder with a fresh copy of the template's
    if os.path.exists(OUTPUT_FILES):
        shutil.rmtree(OUTPUT_FILES)
    if os.path.exists(TEMPLATE_FILES):
        shutil.copytree(TEMPLATE_FILES, OUTPUT_FILES)

    # Make sure the folder exists even without template resources. Note that
    # the unir_logo.png referenced by the title page is NOT created here.
    os.makedirs(OUTPUT_FILES, exist_ok=True)

    # Fail with a clear message instead of a FileNotFoundError traceback
    if shutil.which('pandoc') is None:
        print("Error: pandoc executable not found on PATH")
        return 1

    # Convert to DOCX using pandoc
    print("Converting to DOCX with pandoc...")
    reference_doc = os.path.join(BASE_DIR, 'instructions/plantilla_individual.docx')
    result = _run_pandoc('--reference-doc', reference_doc)

    if result.returncode != 0:
        print(f"Pandoc error: {result.stderr}")
        # Try without reference doc
        print("Retrying without reference doc...")
        result = _run_pandoc()

    if result.returncode != 0:
        print(f"Error: {result.stderr}")
        return 1

    print(f"✓ Document saved to {OUTPUT_DOCX}")
    print(f"✓ HTML version saved to {OUTPUT_HTM}")
    return 0


if __name__ == '__main__':
    # Propagate main()'s status so shell callers can detect failure.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user