#!/usr/bin/env python3
"""Replace template content with thesis content from docs/ folder using BeautifulSoup."""

# NOTE(review): the copy of this file that was reviewed had the HTML tags and
# entities inside its string literals stripped by an extraction step. Every
# HTML snippet emitted below (tag names, classes, inline styles, Word field
# markup) is a reconstruction based on the surviving comments and class names.
# Confirm each snippet against the real template before relying on the output.

import html
import os
import re
import shutil

from bs4 import BeautifulSoup, NavigableString
from latex2mathml.converter import convert as latex_to_mathml
from PIL import Image

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
DOCS_DIR = os.path.join(BASE_DIR, 'docs')

# Title written in place of the template's "Título del TFE" placeholder.
THESIS_TITLE = ('Optimización de Hiperparámetros OCR con Ray Tune para '
                'Documentos Académicos en Español')

# Maximum rendered figure width: 566px (15cm at 96dpi).
MAX_FIGURE_WIDTH_PX = 566

# Border used for the horizontal rules of APA-style tables.
_TABLE_RULE = 'solid windowtext 1.0pt'

# Global counters for tables and figures
table_counter = 0
figure_counter = 0


def read_file(path):
    """Return the text of *path*, decoding as UTF-8 and falling back to Latin-1."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, 'r', encoding='latin-1') as f:
            return f.read()


def write_file(path, content):
    """Write *content* to *path* as UTF-8, creating the parent directory if needed."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)


def md_to_html_para(text):
    """Convert markdown inline formatting to HTML."""
    # NOTE(review): the replacement markup for bold/italic/code was stripped
    # from the reviewed source; <b>, <i> and a monospace <span> are assumed.
    # Bold
    text = re.sub(r'\*\*([^*]+)\*\*', r'<b>\1</b>', text)
    # Italic
    text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
    # Inline code
    text = re.sub(r'`([^`]+)`',
                  r'<span style="font-family:Consolas;font-size:10pt">\1</span>',
                  text)
    # Links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text)
    return text


def convert_latex_formulas(text):
    """Convert LaTeX formulas to MathML for Word compatibility.

    ``$$...$$`` spans become block MathML wrapped in a centered paragraph and
    ``$...$`` spans become inline MathML. A span that fails to convert is left
    untouched.
    """
    # Block formulas $$...$$
    def convert_block(match):
        latex = match.group(1)
        try:
            mathml = latex_to_mathml(latex, display="block")
        except Exception:
            return match.group(0)  # Keep original if conversion fails
        # NOTE(review): wrapper markup reconstructed (centered paragraph assumed).
        return f'<p class="MsoNormal" style="text-align:center">{mathml}</p>'

    text = re.sub(r'\$\$([^$]+)\$\$', convert_block, text)

    # Inline formulas $...$
    def convert_inline(match):
        latex = match.group(1)
        try:
            return latex_to_mathml(latex, display="inline")
        except Exception:
            return match.group(0)

    text = re.sub(r'\$([^$]+)\$', convert_inline, text)
    return text


def extract_table_title(lines, current_index):
    """Look for table title in preceding lines (e.g., **Tabla 1.** *Title*).

    Inspects up to four lines above *current_index* (including line 0) and
    returns the stripped title line, or None when a non-table, non-empty line
    is reached first.
    """
    for i in range(current_index - 1, max(-1, current_index - 5), -1):
        line = lines[i].strip()
        if line.startswith(('**Tabla', '*Tabla')):
            return line
        if line and not line.startswith('|'):
            break
    return None


def extract_figure_title_from_mermaid(lines, current_index):
    """Extract title from mermaid diagram or preceding text.

    Scans up to 19 lines after the opening fence for ``title "..."`` and then
    up to two lines before it for a line mentioning "Figura". Returns None
    when neither is found.
    """
    # Look for title in mermaid content
    for i in range(current_index + 1, min(len(lines), current_index + 20)):
        line = lines[i].strip()
        if line.startswith('```'):
            break
        if 'title' in line.lower():
            # Extract title from: title "Some Title"
            match = re.search(r'title\s+["\']([^"\']+)["\']', line)
            if match:
                return match.group(1)

    # Check preceding lines for figure reference
    for i in range(current_index - 1, max(-1, current_index - 3), -1):
        line = lines[i].strip()
        if 'Figura' in line:
            return line
    return None


def _caption_html(label, number, title, bookmark_id):
    """Return a Word caption paragraph: "<label> N." in bold, title in italic.

    The number is wrapped in a Word SEQ field so the indices can be
    regenerated, and the bookmark anchor is emitted outside the caption text.
    """
    # NOTE(review): field markup reconstructed from the original comments
    # ("MsoCaption class", "SEQ field", "anchor must be outside main caption
    # text"); confirm that Word picks it up for the figure/table indices.
    return (
        f'<a name="{bookmark_id}"></a>'
        f'<p class="MsoCaption"><b>{label} '
        '<!--[if supportFields]><span style="mso-element:field-begin"></span>'
        f' SEQ {label} \\* ARABIC '
        '<span style="mso-element:field-separator"></span><![endif]-->'
        f'{number}'
        '<!--[if supportFields]><span style="mso-element:field-end"></span>'
        '<![endif]-->'
        f'.</b> <i>{title}</i></p>'
    )


def _figure_blocks(number, mermaid_content):
    """Return the HTML blocks (caption, image or placeholder, source, spacer) for one figure."""
    # Match title with quotes: title: "Something" or title: 'Something'
    title_match = re.search(r'title:\s*["\']([^"\']+)["\']', mermaid_content)
    if not title_match:
        # Match title without quotes: title: Something
        title_match = re.search(r'title:\s*([^"\'\n]+)', mermaid_content)
    fig_title = title_match.group(1).strip() if title_match else f"Diagrama {number}"

    # Check if the generated PNG exists
    fig_file = f'figures/figura_{number}.png'
    fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)

    blocks = [_caption_html('Figura', number, fig_title, f"_Ref_Fig{number}")]

    if os.path.exists(fig_path):
        # Read actual image dimensions; the context manager closes the file.
        with Image.open(fig_path) as img:
            orig_w, orig_h = img.size
        # Scale to fit the maximum width while preserving aspect ratio
        if orig_w > MAX_FIGURE_WIDTH_PX:
            scale = MAX_FIGURE_WIDTH_PX / orig_w
            new_w = MAX_FIGURE_WIDTH_PX
            new_h = int(orig_h * scale)
        else:
            new_w, new_h = orig_w, orig_h
        # Convert to pt (1px at 96dpi = 0.75pt)
        w_pt = new_w * 0.75
        h_pt = new_h * 0.75
        alt_text = html.escape(fig_title, quote=True)
        # NOTE(review): <img> markup reconstructed; only the alt text survived.
        blocks.append(
            '<p class="MsoNormal" style="text-align:center">'
            f'<img src="{fig_file}" width="{new_w}" height="{new_h}" '
            f'style="width:{w_pt}pt;height:{h_pt}pt" alt="{alt_text}"></p>'
        )
    else:
        # Fallback to placeholder
        blocks.append(
            '<p class="MsoNormal" style="text-align:center">'
            '[Insertar diagrama Mermaid aquí]</p>'
        )

    blocks.append('<p class="MsoNormal">Fuente: Elaboración propia.</p>')
    blocks.append('<p class="MsoNormal">&nbsp;</p>')
    return blocks


def _find_table_titles(lines, index):
    """Look back from a table's first row for its title.

    Returns ``(table_title, alt_title)``: the text of a ``**Tabla ...`` line,
    and/or the text of an informal ``**Bold title:**`` line. Either may be None.
    """
    table_title = None
    alt_title = None
    # max(-1, ...) so that line 0 is inspected too.
    for j in range(index - 1, max(-1, index - 5), -1):
        prev_line = lines[j].strip()
        if prev_line.startswith(('**Tabla', '*Tabla')):
            table_title = re.sub(r'\*+', '', prev_line).strip()
            break
        if prev_line.startswith('**') and prev_line.endswith(':**'):
            # Alternative: **Bold title:** pattern (for informal tables)
            alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
        elif prev_line and not prev_line.startswith('|'):
            break
    return table_title, alt_title


def _table_html(table_lines):
    """Render markdown table rows as an APA-style table (horizontal rules only)."""
    # NOTE(review): table markup reconstructed from the original comments
    # (centered wrapper; header row with top and bottom border and bold text;
    # last row with bottom border; middle rows without borders).
    last_row = len(table_lines) - 1
    rows = []
    for j, tline in enumerate(table_lines):
        # Rows are expected to start and end with a pipe.
        cells = [c.strip() for c in tline.split('|')[1:-1]]
        if j == 0:
            style = (f'border:none;border-top:{_TABLE_RULE};'
                     f'border-bottom:{_TABLE_RULE};padding:2pt 5pt')
        elif j == last_row:
            style = f'border:none;border-bottom:{_TABLE_RULE};padding:2pt 5pt'
        else:
            style = 'border:none;padding:2pt 5pt'
        rendered = []
        for cell in cells:
            content = md_to_html_para(cell)
            if j == 0:
                content = f'<b>{content}</b>'
            rendered.append(
                f'<td style="{style}"><p class="MsoNormal">{content}</p></td>'
            )
        rows.append('<tr>' + ''.join(rendered) + '</tr>')
    return (
        '<div align="center">'
        '<table class="MsoNormalTable" border="0" cellspacing="0" cellpadding="0" '
        'style="border-collapse:collapse">'
        + ''.join(rows) +
        '</table></div>'
    )


def _is_list_item(line):
    """Return True when *line* starts a bullet or numbered list item."""
    return bool(re.match(r'^[\-\*\+]\s', line) or re.match(r'^\d+\.\s', line))


def parse_md_to_html_blocks(md_content):
    """Convert markdown content to HTML blocks with template styles.

    Handles mermaid diagrams (as numbered figures), fenced code, headers
    (``#`` lines are skipped, ``##``..``####`` map to h2..h4), pipe tables,
    blockquotes, bullet and numbered lists, and plain paragraphs. Increments
    the module-level ``table_counter`` / ``figure_counter``. Returns the
    blocks joined by blank lines.
    """
    global table_counter, figure_counter

    html_blocks = []
    lines = md_content.split('\n')
    i = 0

    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Mermaid diagram - convert to figure with actual image
        if stripped.startswith('```mermaid'):
            figure_counter += 1
            mermaid_lines = []
            i += 1
            while i < len(lines) and lines[i].strip() != '```':
                mermaid_lines.append(lines[i])
                i += 1
            i += 1  # skip the closing fence
            html_blocks.extend(
                _figure_blocks(figure_counter, '\n'.join(mermaid_lines))
            )
            continue

        # Code block (non-mermaid)
        if stripped.startswith('```'):
            code_lines = []
            i += 1
            while i < len(lines) and not lines[i].strip().startswith('```'):
                code_lines.append(lines[i])
                i += 1
            i += 1  # skip the closing fence
            # Escape HTML entities in code
            code = html.escape('\n'.join(code_lines), quote=False)
            # NOTE(review): code-block wrapper markup reconstructed.
            html_blocks.append(
                '<pre style="font-family:Consolas;font-size:9pt">'
                f'{code}</pre>'
            )
            continue

        # Headers - ## becomes h2, ### becomes h3, #### (or deeper) becomes h4
        if line.startswith('#'):
            level = len(line) - len(line.lstrip('#'))
            if level >= 2:
                tag = f'h{min(level, 4)}'
                text = line.lstrip('#').strip()
                html_blocks.append(f'<{tag}>{text}</{tag}>')
            # level 1: skip h1 - we keep the original
            i += 1
            continue

        # Table - check for table title pattern first
        if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
            table_counter += 1
            table_title, alt_title = _find_table_titles(lines, i)

            # Parse table (separator rows are dropped)
            table_lines = []
            while i < len(lines) and '|' in lines[i]:
                if '---' not in lines[i]:
                    table_lines.append(lines[i])
                i += 1

            # Look ahead for source
            table_source = "Elaboración propia"
            if i < len(lines) and 'Fuente:' in lines[i]:
                table_source = (
                    lines[i].replace('*', '').replace('Fuente:', '').strip()
                )
                i += 1

            if table_title:
                # Remove any "Tabla X." or "Tabla AX." pattern from the title
                clean_title = re.sub(
                    r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title
                ).strip()
            elif alt_title:
                # Use alternative title from **bold text:** pattern
                clean_title = alt_title
            else:
                clean_title = "Tabla de datos."

            html_blocks.append(
                _caption_html('Tabla', table_counter, clean_title,
                              f"_Ref_Tab{table_counter}")
            )
            html_blocks.append(_table_html(table_lines))
            # A trailing period in the markdown source would otherwise be doubled.
            source_text = table_source.rstrip('.')
            html_blocks.append(f'<p class="MsoNormal">Fuente: {source_text}.</p>')
            html_blocks.append('<p class="MsoNormal">&nbsp;</p>')
            continue

        # Blockquote
        if line.startswith('>'):
            quote_text = line[1:].strip()
            i += 1
            while i < len(lines) and lines[i].startswith('>'):
                quote_text += ' ' + lines[i][1:].strip()
                i += 1
            # NOTE(review): blockquote markup reconstructed.
            html_blocks.append(
                '<p class="MsoNormal" style="margin-left:36pt">'
                f'<i>{md_to_html_para(quote_text)}</i></p>'
            )
            continue

        # Bullet list
        if re.match(r'^[\-\*\+]\s', line):
            while i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
                item_text = convert_latex_formulas(lines[i][2:].strip())
                # NOTE(review): list paragraph markup reconstructed.
                html_blocks.append(
                    '<p class="MsoListParagraph" '
                    'style="margin-left:36pt;text-indent:-18pt">'
                    f'· {md_to_html_para(item_text)}</p>'
                )
                i += 1
            continue

        # Numbered list (renumbered from 1 regardless of the markdown numbers)
        if re.match(r'^\d+\.\s', line):
            num = 1
            while i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
                item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
                item_text = convert_latex_formulas(item_text)
                html_blocks.append(
                    '<p class="MsoListParagraph" '
                    'style="margin-left:36pt;text-indent:-18pt">'
                    f'{num}. {md_to_html_para(item_text)}</p>'
                )
                num += 1
                i += 1
            continue

        # Skip lines that are just table/figure titles or sources
        # (they are handled together with the table/figure)
        if stripped.startswith(('**Tabla', '*Tabla', '**Figura', '*Figura',
                                '*Fuente:', 'Fuente:')):
            i += 1
            continue

        # Regular paragraph: gather lines until a blank line or another construct
        para_lines = [line]
        i += 1
        while (i < len(lines)
               and lines[i].strip()
               and not lines[i].startswith(('#', '```', '>'))
               and not _is_list_item(lines[i])
               and '|' not in lines[i]):
            para_lines.append(lines[i])
            i += 1

        para_text = convert_latex_formulas(' '.join(para_lines))
        rendered = md_to_html_para(para_text)
        # NOTE(review): this branch was truncated in the reviewed source (only
        # the comment "contains MathML (already wrapped)" survived). Assumed
        # intent: a block formula already carries its own <p> wrapper, so it
        # is emitted as-is; everything else gets a normal paragraph.
        if rendered.lstrip().startswith('<p'):
            html_blocks.append(rendered)
        else:
            html_blocks.append(f'<p class="MsoNormal">{rendered}</p>')

    return '\n\n'.join(html_blocks)


def extract_section_content(md_content):
    """Extract content from markdown, skipping the first # header."""
    md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
    return parse_md_to_html_blocks(md_content)


def find_section_element(soup, keyword):
    """Find element containing keyword (h1 or special paragraph classes).

    Returns the first ``h1`` whose text contains *keyword* (case-insensitive).
    Otherwise returns the first matching ``p`` that either has the class
    ``Ttulo1sinnumerar`` / ``Anexo`` or whose text starts with a number.
    Returns None when nothing matches.
    """
    needle = keyword.lower()

    # First try h1
    for h1 in soup.find_all('h1'):
        if needle in h1.get_text().lower():
            return h1

    # Try special paragraph classes for unnumbered sections
    for p in soup.find_all('p', class_=['Ttulo1sinnumerar', 'Anexo', 'MsoNormal']):
        text = p.get_text()
        if needle not in text.lower():
            continue
        classes = p.get('class', [])
        if 'Ttulo1sinnumerar' in classes or 'Anexo' in classes:
            return p
        if re.match(r'^\d+\.?\s', text.strip()):
            return p
    return None


def _remove_node(node):
    """Remove *node* from the tree (decompose tags, extract bare strings)."""
    if hasattr(node, 'decompose'):
        node.decompose()
    elif isinstance(node, NavigableString):
        node.extract()


def remove_elements_between(start_elem, end_elem):
    """Remove all elements between start and end (exclusive).

    Walks the siblings following *start_elem*. If *end_elem* is not one of
    them, every following sibling is removed.
    """
    # Identity checks: an empty text node is falsy, and bs4 "==" compares
    # tags by content rather than by identity.
    current = start_elem.next_sibling
    elements_to_remove = []
    while current is not None and current is not end_elem:
        elements_to_remove.append(current)
        current = current.next_sibling
    for elem in elements_to_remove:
        _remove_node(elem)


def format_references(refs_content):
    """Format references with proper MsoBibliography style.

    Each blank-line-separated entry becomes one paragraph with a hanging
    indent (36pt indent, -36pt text-indent).
    """
    refs_content = refs_content.replace(
        '# Referencias bibliográficas {.unnumbered}', ''
    ).strip()
    refs_html = ''
    for entry in refs_content.split('\n\n'):
        entry = entry.strip()
        if not entry:
            continue
        # Apply markdown formatting
        formatted = md_to_html_para(entry)
        # NOTE(review): paragraph markup reconstructed from the original comment.
        refs_html += (
            '<p class="MsoBibliography" '
            'style="margin-left:36.0pt;text-indent:-36.0pt">'
            f'{formatted}</p>\n'
        )
    return refs_html


def _split_keywords(part, heading, marker):
    """Split *part* at the first *marker* into (body without heading, keywords)."""
    # partition() never raises, unlike two-value unpacking of split() when the
    # marker appears more than once.
    body, _, keywords = part.partition(marker)
    return body.replace(heading, '').strip(), keywords.strip()


def extract_resumen_parts(resumen_content):
    """Extract Spanish resumen and English abstract from 00_resumen.md.

    The file is split on ``---``; the first part is the Spanish summary and
    the second the English abstract. Returns
    ``(spanish_text, spanish_keywords, english_text, english_keywords)``;
    missing pieces are empty strings.
    """
    parts = resumen_content.split('---')
    spanish_part = parts[0] if len(parts) > 0 else ''
    english_part = parts[1] if len(parts) > 1 else ''

    spanish_text, spanish_keywords = _split_keywords(
        spanish_part, '# Resumen', '**Palabras clave:**'
    )
    english_text, english_keywords = _split_keywords(
        english_part, '# Abstract', '**Keywords:**'
    )
    return spanish_text, spanish_keywords, english_text, english_keywords


def _insert_html_after(anchor, markup):
    """Parse *markup* and insert its top-level nodes right after *anchor*, in order."""
    fragment = BeautifulSoup(markup, 'html.parser')
    # Inserting in reverse keeps the original order after the anchor.
    for node in reversed(list(fragment.children)):
        anchor.insert_after(node)


def _has_class(tag, needle):
    """Return True when *needle* occurs in the string form of the tag's class list."""
    return needle in str(tag.get('class', []))


def _replace_front_matter(title_elem, should_stop, markup):
    """Replace the tag siblings after *title_elem* (until *should_stop*) with *markup*."""
    current = title_elem.find_next_sibling()
    doomed = []
    while current is not None and not should_stop(current):
        doomed.append(current)
        current = current.find_next_sibling()
    for elem in doomed:
        elem.decompose()
    _insert_html_after(title_elem, markup)


def _is_importante_callout(div):
    """Return True when *div* contains the template's "Importante" instruction text."""
    text = div.get_text()
    return 'Importante:' in text and 'extensión mínima' in text


def _remove_template_instructions(soup):
    """Remove the "Importante" callout boxes and the example footnote."""
    # Only the innermost matching divs are removed: an enclosing section div
    # also "contains" the callout text and must not be deleted with it.
    matching = [d for d in soup.find_all('div') if _is_importante_callout(d)]
    innermost = [
        d for d in matching
        if not any(_is_importante_callout(c) for c in d.find_all('div'))
    ]
    for div in innermost:
        div.decompose()
        print(" ✓ Removed 'Importante' box")

    # Remove "Ejemplo de nota al pie" footnote
    for elem in soup.find_all(string=re.compile(r'Ejemplo de nota al pie')):
        parent = elem.parent
        # Find the footnote container (enclosing paragraph) and remove it
        while parent is not None and parent.name != 'p':
            parent = parent.parent
        if parent is not None:
            parent.decompose()
            print(" ✓ Removed footnote example")


def _clear_template_examples(soup):
    """Remove the template's example index entries, figure and table."""
    # The indices will be regenerated when the user opens the file in Word
    # and presses Ctrl+A, F9.
    def is_no_proof(style):
        return style and 'mso-no-proof' in str(style)

    for p in soup.find_all('p', class_='MsoTof'):
        text = p.get_text()
        for label, kind in (('Figura', 'figure'), ('Tabla', 'table')):
            if label in text and 'Ejemplo' in text:
                # Remove the index entry links
                for a in p.find_all('a'):
                    a.decompose()
                # Also remove any remaining text that shows the example
                for span in p.find_all('span', style=is_no_proof):
                    if 'Ejemplo' in span.get_text():
                        span.decompose()
                print(f" ✓ Cleared {kind} index example entry")

    # Remove old index entries that reference template examples
    for p in soup.find_all('p', class_='MsoToc3'):
        text = p.get_text()
        if 'Figura 1. Ejemplo' in text or 'Tabla 1. Ejemplo' in text:
            p.decompose()
            print(" ✓ Removed template index entry")

    # Also clear the specific figure from the template
    for p in soup.find_all('p', class_='Imagencentrada'):
        p.decompose()
        print(" ✓ Removed template figure placeholder")

    # Remove template table example (first match only)
    for table in soup.find_all('table', class_='MsoTableGrid'):
        text = table.get_text()
        if 'Celda 1' in text or 'Encabezado 1' in text:
            # Also remove surrounding caption and source
            prev_sib = table.find_previous_sibling()
            next_sib = table.find_next_sibling()
            if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
                prev_sib.decompose()
            if next_sib and 'Fuente:' in next_sib.get_text():
                next_sib.decompose()
            table.decompose()
            print(" ✓ Removed template table example")
            break


def main():
    """Fill the Word HTML template with the thesis content and save the result."""
    print("Reading template...")
    html_content = read_file(TEMPLATE_INPUT)
    soup = BeautifulSoup(html_content, 'html.parser')

    print("Reading docs content...")
    doc_files = {
        'resumen': '00_resumen.md',
        'intro': '01_introduccion.md',
        'contexto': '02_contexto_estado_arte.md',
        'objetivos': '03_objetivos_metodologia.md',
        'desarrollo': '04_desarrollo_especifico.md',
        'conclusiones': '05_conclusiones_trabajo_futuro.md',
        'referencias': '06_referencias_bibliograficas.md',
        'anexo': '07_anexo_a.md',
    }
    docs = {
        key: read_file(os.path.join(DOCS_DIR, filename))
        for key, filename in doc_files.items()
    }

    # Extract resumen and abstract
    spanish_text, spanish_kw, english_text, english_kw = extract_resumen_parts(
        docs['resumen']
    )

    # Replace title
    print("Replacing title...")
    title_re = re.compile(r'Título del TFE', re.IGNORECASE)
    for elem in soup.find_all(string=title_re):
        # Substitute with the same case-insensitive pattern used for the search.
        elem.replace_with(title_re.sub(lambda _m: THESIS_TITLE, elem))

    # Replace Resumen section
    print("Replacing Resumen...")
    resumen_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Resumen'))
    if resumen_title:
        def at_abstract_title(tag):
            # Stop at the "Abstract" index-style title paragraph.
            return ('Abstract' in tag.get_text()
                    and tag.name == 'p'
                    and _has_class(tag, 'Ttulondices'))

        # NOTE(review): paragraph markup reconstructed.
        _replace_front_matter(
            resumen_title,
            at_abstract_title,
            f'<p class="MsoNormal">{spanish_text}</p>\n'
            f'<p class="MsoNormal"><b>Palabras clave:</b> {spanish_kw}</p>\n',
        )
        print(" ✓ Replaced Resumen")

    # Replace Abstract section
    print("Replacing Abstract...")
    abstract_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Abstract'))
    if abstract_title:
        def at_next_section(tag):
            # Stop at page break or next title
            if tag.name == 'span' and 'page-break' in str(tag):
                return True
            return tag.name == 'p' and (
                _has_class(tag, 'Ttulondices') or _has_class(tag, 'MsoToc')
            )

        _replace_front_matter(
            abstract_title,
            at_next_section,
            f'<p class="MsoNormal">{english_text}</p>\n'
            f'<p class="MsoNormal"><b>Keywords:</b> {english_kw}</p>\n',
        )
        print(" ✓ Replaced Abstract")

    # Remove "Importante" callout boxes (template instructions)
    print("Removing template instructions...")
    _remove_template_instructions(soup)

    # Clear old figure/table index entries (they need to be regenerated in Word)
    print("Clearing old index entries...")
    _clear_template_examples(soup)

    # Chapters as (heading keyword, docs key, next chapter's heading keyword)
    chapters = [
        ('Introducción', 'intro', 'Contexto'),
        ('Contexto', 'contexto', 'Objetivos'),
        ('Objetivos', 'objetivos', 'Desarrollo'),
        ('Desarrollo', 'desarrollo', 'Conclusiones'),
        ('Conclusiones', 'conclusiones', 'Referencias'),
    ]

    print("Replacing chapter contents...")
    # Table and figure numbering is global across chapters.
    for chapter_keyword, doc_key, next_keyword in chapters:
        print(f" Processing: {chapter_keyword}")

        start_elem = find_section_element(soup, chapter_keyword)
        end_elem = find_section_element(soup, next_keyword)

        if start_elem and end_elem:
            remove_elements_between(start_elem, end_elem)
            _insert_html_after(start_elem, extract_section_content(docs[doc_key]))
            print(" ✓ Replaced content")
        else:
            if not start_elem:
                print(f" Warning: Could not find start element for {chapter_keyword}")
            if not end_elem:
                print(f" Warning: Could not find end element for {next_keyword}")

    # Handle Referencias
    print(" Processing: Referencias bibliográficas")
    refs_start = find_section_element(soup, 'Referencias')
    anexo_elem = find_section_element(soup, 'Anexo')

    if refs_start and anexo_elem:
        remove_elements_between(refs_start, anexo_elem)
        _insert_html_after(refs_start, format_references(docs['referencias']))
        print(" ✓ Replaced content")

    # Handle Anexo (last section): drop everything after its heading
    print(" Processing: Anexo")
    if anexo_elem and soup.find('body'):
        current = anexo_elem.next_sibling
        while current is not None:
            following = current.next_sibling
            _remove_node(current)
            current = following

        _insert_html_after(anexo_elem, extract_section_content(docs['anexo']))
        print(" ✓ Replaced content")

    print(f"\nSummary: {table_counter} tables, {figure_counter} figures processed")

    print("Saving modified template...")
    write_file(TEMPLATE_OUTPUT, str(soup))

    # Copy template support files (header.htm, images, etc.)
    support_files_src = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')
    support_files_dst = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual_files')
    if os.path.exists(support_files_src):
        if os.path.exists(support_files_dst):
            shutil.rmtree(support_files_dst)
        shutil.copytree(support_files_src, support_files_dst)
        print("✓ Copied template support files")

    print(f"✓ Done! Modified: {TEMPLATE_OUTPUT}")
    print("\nTo convert to DOCX:")
    print("1. Open the .htm file in Microsoft Word")
    print("2. Replace [Insertar diagrama Mermaid aquí] placeholders with actual diagrams")
    print("3. Update indices: Select all (Ctrl+A) then press F9 to update fields")
    print(" - This will regenerate: Índice de contenidos, Índice de figuras, Índice de tablas")
    print("4. Save as .docx")


if __name__ == '__main__':
    main()