#!/usr/bin/env python3
"""Replace template content with thesis content from docs/ folder using BeautifulSoup."""
import re
import os
import shutil
from bs4 import BeautifulSoup, NavigableString
from latex2mathml.converter import convert as latex_to_mathml
from PIL import Image
# Every project path below is resolved against the directory holding this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')

# Running table and figure numbers; parse_md_to_html_blocks bumps them
# through a ``global`` declaration as it emits captions.
figure_counter = 0
table_counter = 0
def read_file(path):
    """Return the full text of the file at *path*.

    The file is decoded as UTF-8 first. If that raises
    ``UnicodeDecodeError`` the file is reopened and decoded as Latin-1.
    """
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: retry with Latin-1.
        with open(path, 'r', encoding='latin-1') as handle:
            contents = handle.read()
    return contents
def write_file(path, content):
    """Write *content* to *path* as UTF-8 text, truncating any existing file."""
    with open(path, mode='w', encoding='utf-8') as out:
        out.write(content)
def md_to_html_para(text):
    """Reduce Markdown inline markup in *text* to its inner content.

    Bold (``**x**``), italic (``*x*``), inline code (`` `x` ``) and link
    (``[label](url)``) constructs are each replaced by their first captured
    group; for links that is the label, so the URL is dropped.
    """
    # NOTE(review): every replacement is the bare captured group, so no HTML
    # tags are emitted despite the function name - confirm this is intended.
    #
    # Order matters: bold runs before italic so a ``**`` pair is not consumed
    # as two single ``*`` markers.
    substitutions = (
        (r'\*\*([^*]+)\*\*', r'\1'),           # bold
        (r'\*([^*]+)\*', r'\1'),               # italic
        (r'`([^`]+)`', r'\1'),                 # inline code
        (r'\[([^\]]+)\]\(([^)]+)\)', r'\1'),   # link -> label
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def convert_latex_formulas(text):
    """Convert LaTeX formulas in *text* to MathML for Word compatibility.

    Block formulas (``$$...$$``) are converted first with
    ``display="block"`` and surrounded by newlines; inline formulas
    (``$...$``) are converted afterwards with ``display="inline"``.
    A formula whose conversion raises is left in the text unchanged.

    Args:
        text: String that may contain LaTeX delimited by ``$`` or ``$$``.

    Returns:
        The text with every convertible formula replaced by its MathML.
    """
    def convert_block(match):
        latex = match.group(1)
        try:
            mathml = latex_to_mathml(latex, display="block")
        except Exception:
            # Deliberate best effort: keep the original formula text when the
            # converter fails. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            return match.group(0)
        # A single-quoted literal cannot span physical lines, so the line
        # breaks around the MathML are written as explicit escapes.
        # NOTE(review): the wrapper is only a leading and trailing newline -
        # confirm no enclosing markup is expected here.
        return f'\n{mathml}\n'

    def convert_inline(match):
        latex = match.group(1)
        try:
            return latex_to_mathml(latex, display="inline")
        except Exception:
            # Same best-effort policy as for block formulas.
            return match.group(0)

    # Block formulas must be handled before inline ones, otherwise the inline
    # pattern would match inside the ``$$`` delimiters.
    text = re.sub(r'\$\$([^$]+)\$\$', convert_block, text)
    text = re.sub(r'\$([^$]+)\$', convert_inline, text)
    return text
def extract_table_title(lines, current_index):
    """Return the table-title line that precedes ``lines[current_index]``.

    Scans backwards over at most the four lines before *current_index*
    looking for a stripped line that starts with ``**Tabla`` or ``*Tabla``.
    Empty lines and lines starting with ``|`` are skipped; any other
    non-empty line ends the search.

    Args:
        lines: The document split into lines.
        current_index: Index of the line the title is sought for.

    Returns:
        The stripped title line, or ``None`` when no title is found.
    """
    # ``range`` excludes its stop value, so the floor must be -1 rather than
    # 0; with 0 the first line of the document would never be examined.
    stop = max(-1, current_index - 5)
    for i in range(current_index - 1, stop, -1):
        line = lines[i].strip()
        if line.startswith(('**Tabla', '*Tabla')):
            return line
        if line and not line.startswith('|'):
            # Unrelated content sits between the title position and the table.
            break
    return None
def extract_figure_title_from_mermaid(lines, current_index):
    """Return a figure title taken from a mermaid block or the text above it.

    The lines after *current_index* (up to 19 of them, stopping at the first
    line that starts with a code fence) are searched for a quoted
    ``title "..."`` directive. If none is found, the two lines before
    *current_index* are searched for one containing ``Figura``.

    Args:
        lines: The document split into lines.
        current_index: Index of the line that opens the mermaid block.

    Returns:
        The quoted title text, the stripped preceding line that mentions
        ``Figura``, or ``None`` when neither is found.
    """
    # Pass 1: look inside the diagram body for: title "Some Title"
    for i in range(current_index + 1, min(len(lines), current_index + 20)):
        line = lines[i].strip()
        if line.startswith('```'):
            break
        if 'title' in line.lower():
            match = re.search(r'title\s+["\']([^"\']+)["\']', line)
            if match:
                return match.group(1)

    # Pass 2: look just above the block for a figure reference.
    # ``range`` excludes its stop value, so the floor must be -1 rather than
    # 0; with 0 the first line of the document would never be examined.
    for i in range(current_index - 1, max(-1, current_index - 3), -1):
        line = lines[i].strip()
        # Containment also covers lines that start with '**Figura'.
        if 'Figura' in line:
            return line
    return None
def parse_md_to_html_blocks(md_content):
"""Convert markdown content to HTML blocks with template styles."""
global table_counter, figure_counter
html_blocks = []
lines = md_content.split('\n')
i = 0
while i < len(lines):
line = lines[i]
# Skip empty lines
if not line.strip():
i += 1
continue
# Mermaid diagram - convert to figure with actual image
if line.strip().startswith('```mermaid'):
figure_counter += 1
mermaid_lines = []
i += 1
while i < len(lines) and not lines[i].strip() == '```':
mermaid_lines.append(lines[i])
i += 1
# Try to extract title from mermaid content (YAML format)
mermaid_content = '\n'.join(mermaid_lines)
# Match title with quotes: title: "Something" or title: 'Something'
title_match = re.search(r'title:\s*["\']([^"\']+)["\']', mermaid_content)
if not title_match:
# Match title without quotes: title: Something
title_match = re.search(r'title:\s*([^"\'\n]+)', mermaid_content)
if title_match:
fig_title = title_match.group(1).strip()
else:
fig_title = f"Diagrama {figure_counter}"
# Check if the generated PNG exists
fig_file = f'figures/figura_{figure_counter}.png'
fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
# Create figure with MsoCaption class and proper Word SEQ field for cross-reference
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
# Word TOC looks for text with Caption style - anchor must be outside main caption text
bookmark_id = f"_Ref_Fig{figure_counter}"
html_blocks.append(f'''Figura {figure_counter}. {fig_title}
''')
if os.path.exists(fig_path):
# Read actual image dimensions and scale to fit page width
img = Image.open(fig_path)
orig_w, orig_h = img.size
# Scale to fit max width of 566px (15cm at 96dpi) while preserving aspect ratio
max_width = 566
if orig_w > max_width:
scale = max_width / orig_w
new_w = max_width
new_h = int(orig_h * scale)
else:
new_w, new_h = orig_w, orig_h
# Convert to pt (1px at 96dpi = 0.75pt)
w_pt = new_w * 0.75
h_pt = new_h * 0.75
html_blocks.append(f'''
''')
else:
# Fallback to placeholder
html_blocks.append(f'''[Insertar diagrama Mermaid aquí]
''')
html_blocks.append(f'''Fuente: Elaboración propia.
''')
html_blocks.append('
')
i += 1
continue
# Code block (non-mermaid)
if line.strip().startswith('```'):
code_lang = line.strip()[3:]
code_lines = []
i += 1
while i < len(lines) and not lines[i].strip().startswith('```'):
code_lines.append(lines[i])
i += 1
code = '\n'.join(code_lines)
# Escape HTML entities in code
code = code.replace('&', '&').replace('<', '<').replace('>', '>')
html_blocks.append(f'{code}
')
i += 1
continue
# Headers - ## becomes h2, ### becomes h3
if line.startswith('####'):
text = line.lstrip('#').strip()
html_blocks.append(f'{text}
')
i += 1
continue
elif line.startswith('###'):
text = line.lstrip('#').strip()
html_blocks.append(f'{text}
')
i += 1
continue
elif line.startswith('##'):
text = line.lstrip('#').strip()
html_blocks.append(f'{text}
')
i += 1
continue
elif line.startswith('#'):
# Skip h1 - we keep the original
i += 1
continue
# Table - check for table title pattern first
if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
table_counter += 1
# Check if previous line has table title (e.g., **Tabla 1.** *Title*)
table_title = None
alt_title = None # Alternative title from **bold text:** pattern
table_source = "Elaboración propia"
# Look back for table title
for j in range(i - 1, max(0, i - 5), -1):
prev_line = lines[j].strip()
if prev_line.startswith('**Tabla') or prev_line.startswith('*Tabla'):
# Extract title text
table_title = re.sub(r'\*+', '', prev_line).strip()
break
elif prev_line.startswith('**') and prev_line.endswith(':**'):
# Alternative: **Bold title:** pattern (for informal tables)
alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
elif prev_line and not prev_line.startswith('|'):
break
# Parse table
table_lines = []
while i < len(lines) and '|' in lines[i]:
if '---' not in lines[i]:
table_lines.append(lines[i])
i += 1
# Look ahead for source
if i < len(lines) and 'Fuente:' in lines[i]:
table_source = lines[i].replace('*', '').replace('Fuente:', '').strip()
i += 1
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
# Word TOC looks for text with Caption style - anchor must be outside main caption text
bookmark_id = f"_Ref_Tab{table_counter}"
if table_title:
# Remove any "Tabla X." or "Tabla AX." pattern from the title
clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
elif alt_title:
# Use alternative title from **bold text:** pattern
clean_title = alt_title
else:
clean_title = "Tabla de datos."
html_blocks.append(f'''Tabla {table_counter}. {clean_title}
''')
# Build table HTML with APA style (horizontal lines only, no vertical)
# Wrap in centered div for Word compatibility
table_html = ''
for j, tline in enumerate(table_lines):
cells = [c.strip() for c in tline.split('|')[1:-1]]
table_html += ''
for cell in cells:
if j == 0:
# Header row: top and bottom border, bold text
table_html += f'{md_to_html_para(cell)} | '
elif j == len(table_lines) - 1:
# Last row: bottom border only
table_html += f'{md_to_html_para(cell)} | '
else:
# Middle rows: no borders
table_html += f'{md_to_html_para(cell)} | '
table_html += '
'
table_html += '
'
html_blocks.append(table_html)
# Add source with proper template format
html_blocks.append(f'Fuente: {table_source}.
')
html_blocks.append('
')
continue
# Blockquote
if line.startswith('>'):
quote_text = line[1:].strip()
i += 1
while i < len(lines) and lines[i].startswith('>'):
quote_text += ' ' + lines[i][1:].strip()
i += 1
html_blocks.append(f'{md_to_html_para(quote_text)}
')
continue
# Bullet list
if re.match(r'^[\-\*\+]\s', line):
while i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
item_text = lines[i][2:].strip()
item_text = convert_latex_formulas(item_text)
html_blocks.append(f'· {md_to_html_para(item_text)}
')
i += 1
continue
# Numbered list
if re.match(r'^\d+\.\s', line):
num = 1
while i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
item_text = convert_latex_formulas(item_text)
html_blocks.append(f'{num}. {md_to_html_para(item_text)}
')
num += 1
i += 1
continue
# Skip lines that are just table/figure titles (they'll be handled with the table/figure)
if line.strip().startswith('**Tabla') or line.strip().startswith('*Tabla'):
i += 1
continue
if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
i += 1
continue
if line.strip().startswith('*Fuente:') or line.strip().startswith('Fuente:'):
i += 1
continue
# Regular paragraph
para_lines = [line]
i += 1
while i < len(lines) and lines[i].strip() and not lines[i].startswith('#') and not lines[i].startswith('```') and not lines[i].startswith('>') and not re.match(r'^[\-\*\+]\s', lines[i]) and not re.match(r'^\d+\.\s', lines[i]) and '|' not in lines[i]:
para_lines.append(lines[i])
i += 1
para_text = ' '.join(para_lines)
para_text = convert_latex_formulas(para_text)
# Check if paragraph contains MathML (already wrapped)
if '