#!/usr/bin/env python3
"""Replace template content with thesis content from docs/ folder using BeautifulSoup."""

import re
import os

from bs4 import BeautifulSoup, NavigableString

BASE_DIR = '/Users/sergio/Desktop/MastersThesis'
TEMPLATE = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
DOCS_DIR = os.path.join(BASE_DIR, 'docs')

# Global counters for tables and figures
table_counter = 0
figure_counter = 0


def read_file(path):
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, 'r', encoding='latin-1') as f:
            return f.read()


def write_file(path, content):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)


def md_to_html_para(text):
    """Convert markdown inline formatting to HTML.

    The exact replacement markup (<b>, <i>, monospaced <span>) is an
    assumption; substitute the template's own inline styles if they differ.
    """
    # Bold: **text** -> <b>text</b>
    text = re.sub(r'\*\*([^*]+)\*\*', r'<b>\1</b>', text)
    # Italic: *text* -> <i>text</i>
    text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
    # Inline code: `text` -> monospaced span
    text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas">\1</span>', text)
    return text


def extract_table_title(lines, current_index):
    """Look for a table title in the preceding lines (e.g., **Tabla 1.** *Title*)."""
    # Walk backwards over the lines just above the table
    for i in range(current_index - 1, max(0, current_index - 5), -1):
        line = lines[i].strip()
        if line.startswith('**Tabla') or line.startswith('*Tabla'):
            return line
        if line and not line.startswith('|'):
            break
    return None


def extract_figure_title_from_mermaid(lines, current_index):
    """Extract a title from a mermaid diagram or the preceding text."""
    # Look for a title declaration inside the mermaid block
    for i in range(current_index + 1, min(len(lines), current_index + 20)):
        line = lines[i].strip()
        if line.startswith('```'):
            break
        if 'title' in line.lower():
            # Extract title from: title "Some Title"
            match = re.search(r'title\s+["\']([^"\']+)["\']', line)
            if match:
                return match.group(1)
    # Check the preceding lines for a figure reference
    for i in range(current_index - 1, max(0, current_index - 3), -1):
        line = lines[i].strip()
        if line.startswith('**Figura') or 'Figura' in line:
            return line
    return None


def parse_md_to_html_blocks(md_content):
    """Convert markdown content to HTML blocks with template styles."""
    global table_counter, figure_counter
    html_blocks = []
    lines = md_content.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i]

        # Skip empty lines
        if not line.strip():
            i += 1
            continue

        # Mermaid diagram - convert to a figure with the generated image
        if line.strip().startswith('```mermaid'):
            figure_counter += 1
            mermaid_lines = []
            i += 1
            while i < len(lines) and lines[i].strip() != '```':
                mermaid_lines.append(lines[i])
                i += 1
            i += 1  # skip the closing fence

            # Try to extract a title from the mermaid content
            mermaid_content = '\n'.join(mermaid_lines)
            title_match = re.search(r'title\s+["\']?([^"\'\n]+)["\']?', mermaid_content)
            if title_match:
                fig_title = title_match.group(1).strip()
            else:
                fig_title = f"Diagrama {figure_counter}"

            # Check whether the generated PNG exists
            fig_file = f'figures/figura_{figure_counter}.png'
            fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)

            # Create the figure caption with the template format (Piedefoto-tabla class)
            html_blocks.append(
                f'<p class="Piedefoto-tabla"><b>Figura {figure_counter}.</b> {fig_title}</p>')
            if os.path.exists(fig_path):
                # Use the actual image with a Word-compatible format
                # (the exact <img> markup is an assumption)
                html_blocks.append(
                    f'<p class="MsoNormal" align="center">'
                    f'<img src="{fig_file}" alt="Figura {figure_counter}"></p>')
            else:
                html_blocks.append('<p class="MsoNormal">[Insertar diagrama Mermaid aquí]</p>')
            html_blocks.append('<p class="Piedefoto-tabla">Fuente: Elaboración propia.</p>')
            continue

        # Fenced code block (non-mermaid) - render as a monospaced paragraph
        if line.strip().startswith('```'):
            code_lines = []
            i += 1
            while i < len(lines) and lines[i].strip() != '```':
                code_lines.append(lines[i])
                i += 1
            i += 1  # skip the closing fence
            code = '\n'.join(code_lines)
            html_blocks.append(
                f'<p class="MsoNormal"><span style="font-family:Consolas">{code}</span></p>')
            continue
        # Markdown table - convert to an HTML table with caption and source note
        if line.strip().startswith('|'):
            table_counter += 1
            table_start = i

            # Collect the contiguous table rows, dropping the |---| separator
            table_rows = []
            while i < len(lines) and lines[i].strip().startswith('|'):
                row = lines[i].strip()
                if not re.match(r'^[|\s:-]+$', row):
                    table_rows.append([c.strip() for c in row.strip('|').split('|')])
                i += 1

            # Caption: reuse the preceding **Tabla N.** line when present
            title_line = extract_table_title(lines, table_start)
            if title_line:
                # Keep only the descriptive part of the title
                clean_title = md_to_html_para(
                    re.sub(r'^\*{1,2}Tabla[^.]*\.\*{0,2}\s*', '', title_line))
                html_blocks.append(
                    f'<p class="Piedefoto-tabla"><b>Tabla {table_counter}.</b> {clean_title}</p>')
            else:
                html_blocks.append(
                    f'<p class="Piedefoto-tabla"><b>Tabla {table_counter}.</b> Tabla de datos.</p>')

            # Build table HTML (the <table> attributes are an assumption; swap in
            # the template's own table markup if it differs)
            table_html = '<table border="1" cellspacing="0" cellpadding="4">'
            for row_idx, cells in enumerate(table_rows):
                table_html += '<tr>'
                for cell in cells:
                    if row_idx == 0:
                        # Header row in bold
                        table_html += f'<td><b>{md_to_html_para(cell)}</b></td>'
                    else:
                        table_html += f'<td>{md_to_html_para(cell)}</td>'
                table_html += '</tr>'
            table_html += '</table>'
            html_blocks.append(table_html)

            # Source note: use a trailing *Fuente:* line when present,
            # otherwise default to "Elaboración propia"
            table_source = 'Elaboración propia'
            if i < len(lines) and lines[i].strip().strip('*').startswith('Fuente:'):
                table_source = lines[i].strip().strip('* ').replace('Fuente:', '').strip(' .')
                i += 1
            html_blocks.append(f'<p class="Piedefoto-tabla">Fuente: {table_source}.</p>')
            continue

        # Blockquote - render as an indented paragraph
        if line.strip().startswith('>'):
            quote_lines = []
            while i < len(lines) and lines[i].strip().startswith('>'):
                quote_lines.append(lines[i].strip().lstrip('> ').strip())
                i += 1
            quote_text = ' '.join(quote_lines)
            html_blocks.append(
                f'<p class="MsoNormal" style="margin-left:36.0pt">'
                f'<i>{md_to_html_para(quote_text)}</i></p>')
            continue

        # Bullet list
        if re.match(r'^[\-\*\+]\s', line):
            while i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
                item_text = lines[i][2:].strip()
                # The list markup is an assumption (middle-dot list paragraphs)
                html_blocks.append(
                    f'<p class="MsoListParagraph">· {md_to_html_para(item_text)}</p>')
                i += 1
            continue

        # Numbered list
        if re.match(r'^\d+\.\s', line):
            num = 1
            while i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
                item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
                html_blocks.append(
                    f'<p class="MsoListParagraph">{num}. {md_to_html_para(item_text)}</p>')
                num += 1
                i += 1
            continue

        # Skip lines that are just table/figure titles or source notes
        # (they are emitted together with the table/figure they belong to)
        if line.strip().startswith('**Tabla') or line.strip().startswith('*Tabla'):
            i += 1
            continue
        if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
            i += 1
            continue
        if line.strip().startswith('*Fuente:') or line.strip().startswith('Fuente:'):
            i += 1
            continue

        # Regular paragraph: join consecutive plain lines into one block
        para_lines = [line]
        i += 1
        while (i < len(lines) and lines[i].strip()
               and not lines[i].startswith('#')
               and not lines[i].startswith('```')
               and not lines[i].startswith('>')
               and not re.match(r'^[\-\*\+]\s', lines[i])
               and not re.match(r'^\d+\.\s', lines[i])
               and '|' not in lines[i]):
            para_lines.append(lines[i])
            i += 1
        para_text = ' '.join(para_lines)
        html_blocks.append(f'<p class="MsoNormal">{md_to_html_para(para_text)}</p>')
    return '\n\n'.join(html_blocks)


def extract_section_content(md_content):
    """Extract content from markdown, skipping the first # header."""
    md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
    return parse_md_to_html_blocks(md_content)


def find_section_element(soup, keyword):
    """Find the element containing keyword (h1 or special paragraph classes)."""
    # First try h1 headings
    for h1 in soup.find_all('h1'):
        if keyword.lower() in h1.get_text().lower():
            return h1
    # Then try the special paragraph classes used for unnumbered sections
    for p in soup.find_all('p', class_=['Ttulo1sinnumerar', 'Anexo', 'MsoNormal']):
        text = p.get_text()
        if keyword.lower() in text.lower():
            classes = p.get('class', [])
            if 'Ttulo1sinnumerar' in classes or 'Anexo' in classes:
                return p
            # An MsoNormal paragraph only counts if it looks like a numbered heading
            if re.match(r'^\d+\.?\s', text.strip()):
                return p
    return None


def remove_elements_between(start_elem, end_elem):
    """Remove all elements between start and end (exclusive)."""
    current = start_elem.next_sibling
    elements_to_remove = []
    while current and current != end_elem:
        elements_to_remove.append(current)
        current = current.next_sibling
    for elem in elements_to_remove:
        if hasattr(elem, 'decompose'):
            elem.decompose()
        elif isinstance(elem, NavigableString):
            elem.extract()


def format_references(refs_content):
    """Format references with the MsoBibliography style."""
    refs_content = refs_content.replace('# Referencias bibliográficas {.unnumbered}', '').strip()
    refs_html = ''
    for line in refs_content.split('\n\n'):
        line = line.strip()
        if not line:
            continue
        # Apply markdown inline formatting
        formatted = md_to_html_para(line)
        # MsoBibliography style with hanging indent (36pt indent, -36pt text-indent)
        refs_html += (f'<p class="MsoBibliography" '
                      f'style="margin-left:36.0pt;text-indent:-36.0pt">{formatted}</p>\n')
    return refs_html


def extract_resumen_parts(resumen_content):
    """Extract the Spanish resumen and the English abstract from 00_resumen.md."""
    parts = resumen_content.split('---')
    spanish_part = parts[0] if len(parts) > 0 else ''
    english_part = parts[1] if len(parts) > 1 else ''

    # Spanish text and keywords
    spanish_text = ''
    spanish_keywords = ''
    if '**Palabras clave:**' in spanish_part:
        text_part, kw_part = spanish_part.split('**Palabras clave:**')
        spanish_text = text_part.replace('# Resumen', '').strip()
        spanish_keywords = kw_part.strip()
    else:
        spanish_text = spanish_part.replace('# Resumen', '').strip()

    # English text and keywords
    english_text = ''
    english_keywords = ''
    if '**Keywords:**' in english_part:
        text_part, kw_part = english_part.split('**Keywords:**')
        english_text = text_part.replace('# Abstract', '').strip()
        english_keywords = kw_part.strip()
    else:
        english_text = english_part.replace('# Abstract', '').strip()

    return spanish_text, spanish_keywords, english_text, english_keywords


def main():
    global table_counter, figure_counter

    print("Reading template...")
    html_content = read_file(TEMPLATE)
    soup = BeautifulSoup(html_content, 'html.parser')

    print("Reading docs content...")
    docs = {
        'resumen': read_file(os.path.join(DOCS_DIR, '00_resumen.md')),
        'intro': read_file(os.path.join(DOCS_DIR, '01_introduccion.md')),
        'contexto': read_file(os.path.join(DOCS_DIR, '02_contexto_estado_arte.md')),
        'objetivos': read_file(os.path.join(DOCS_DIR, '03_objetivos_metodologia.md')),
        'desarrollo': read_file(os.path.join(DOCS_DIR, '04_desarrollo_especifico.md')),
        'conclusiones': read_file(os.path.join(DOCS_DIR, '05_conclusiones_trabajo_futuro.md')),
        'referencias': read_file(os.path.join(DOCS_DIR, '06_referencias_bibliograficas.md')),
        'anexo': read_file(os.path.join(DOCS_DIR, '07_anexo_a.md')),
    }

    # Extract resumen and abstract
    spanish_text, spanish_kw, english_text, english_kw = extract_resumen_parts(docs['resumen'])

    # Replace the title placeholder
    print("Replacing title...")
    for elem in soup.find_all(string=re.compile(r'Título del TFE', re.IGNORECASE)):
        elem.replace_with(elem.replace(
            'Título del TFE',
            'Optimización de Hiperparámetros OCR con Ray Tune '
            'para Documentos Académicos en Español'))

    # Replace the Resumen section
    print("Replacing Resumen...")
    resumen_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Resumen'))
    if resumen_title:
        # Remove the content between the Resumen title and the Abstract title
        current = resumen_title.find_next_sibling()
        elements_to_remove = []
        while current:
            # getattr() guards against NavigableString siblings, which have
            # neither .name nor .get()
            if (getattr(current, 'name', None) == 'p'
                    and 'Abstract' in current.get_text()
                    and 'Ttulondices' in (current.get('class') or [])):
                break
            elements_to_remove.append(current)
            current = current.find_next_sibling()
        for elem in elements_to_remove:
            if hasattr(elem, 'decompose'):
                elem.decompose()
            else:
                elem.extract()
        # New resumen content. The paragraph markup below is an assumption
        # (MsoNormal paragraphs matching the template's body style).
        resumen_html = f'''<p class="MsoNormal">{spanish_text}</p>
<p class="MsoNormal"><b>Palabras clave:</b> {spanish_kw}</p>
<p class="MsoNormal">{english_text}</p>
<p class="MsoNormal"><b>Keywords:</b> {english_kw}</p>'''
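        # What follows is a minimal sketch of the remaining steps, assuming the
        # helpers above are used as intended; the insertion order, section
        # keywords, and output filename are assumptions, not confirmed code.
        fragment = BeautifulSoup(resumen_html, 'html.parser')
        for block in reversed(fragment.find_all('p', recursive=False)):
            resumen_title.insert_after(block)

    # Replace each body chapter with its converted markdown: find the heading,
    # clear the old body up to the next h1, insert the new blocks.
    for keyword, key in [('Introducción', 'intro'), ('Contexto', 'contexto'),
                         ('Objetivos', 'objetivos'), ('Desarrollo', 'desarrollo'),
                         ('Conclusiones', 'conclusiones'), ('Anexo', 'anexo')]:
        heading = find_section_element(soup, keyword)
        if heading is None:
            continue
        remove_elements_between(heading, heading.find_next('h1'))
        fragment = BeautifulSoup(extract_section_content(docs[key]), 'html.parser')
        for block in reversed(fragment.find_all(recursive=False)):
            heading.insert_after(block)

    # References get the MsoBibliography treatment
    refs_heading = find_section_element(soup, 'Referencias')
    if refs_heading is not None:
        remove_elements_between(refs_heading, refs_heading.find_next('h1'))
        fragment = BeautifulSoup(format_references(docs['referencias']), 'html.parser')
        for block in reversed(fragment.find_all(recursive=False)):
            refs_heading.insert_after(block)

    # Write the filled-in template back out (output path is an assumption)
    output_path = os.path.join(BASE_DIR, 'thesis_output', 'tesis_final.htm')
    write_file(output_path, str(soup))
    print(f"Done: {output_path}")


if __name__ == '__main__':
    main()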