#!/usr/bin/env python3
"""Utility functions for markdown processing and conversion."""

import re
from typing import Optional

# NOTE(review): not referenced by any function in this module; kept because
# other modules may import it from here -- confirm before removing.
from latex2mathml.converter import convert as latex_to_mathml  # noqa: F401

# Accept Fuente/Source lines with or without markdown bold
SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)

# Accept Leyenda lines with or without markdown bold
LEYENDA_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?Leyenda:(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)

# Cross-reference patterns using markdown links:
#   [Figura 15](#figura-15) or [Tabla 20](#tabla-20) -> Word REF fields
# Also supports Anexo: [Figura A1](#figura-a1), [Tabla A2](#tabla-a2)
CROSS_REF_LINK_RE = re.compile(r'\[(Figura|Tabla)\s+([A-Za-z]?\d+)\]\(#(figura|tabla)-([a-z]?\d+)\)', re.IGNORECASE)

# Section/chapter cross-reference patterns:
#   [Sección 4.1](#seccion-4-1) or [Capítulo 2](#capitulo-2)
SECTION_REF_LINK_RE = re.compile(r'\[(Sección|Seccion|Capítulo|Capitulo)\s+([\d\.]+)\]\(#(seccion|capitulo)-([0-9-]+)\)', re.IGNORECASE)


def read_file(path: str) -> str:
    """Return the text of *path*, decoded as UTF-8 with a latin-1 fallback.

    The file is first read as UTF-8; if that raises ``UnicodeDecodeError``
    it is reopened and read as latin-1 instead.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, 'r', encoding='latin-1') as f:
            return f.read()


def write_file(path: str, content: str) -> None:
    """Write *content* to *path* as UTF-8 text, replacing existing content."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)


def convert_cross_references(text: str) -> str:
    """Rewrite markdown cross-reference links found in *text*.

    Recognised syntax (renders normally in markdown viewers):

    - [Figura 15](#figura-15)   -> reference to Figure 15
    - [Tabla 20](#tabla-20)     -> reference to Table 20
    - [Figura A1](#figura-a1)   -> reference to an Anexo figure
    - [Sección 4.1](#seccion-4-1) -> reference to Section 4.1
    - [Capítulo 2](#capitulo-2)   -> reference to Chapter 2

    Figure/table links are replaced first, then section/chapter links.

    NOTE(review): the comments below describe Word REF fields, but the
    strings returned by the helpers contain only the display text and the
    computed bookmark names are not used. The field markup may have been
    lost from this file -- confirm against the intended output.

    Returns:
        *text* with every matching link replaced by the helper's result.
    """
    def replace_fig_tab_ref(match):
        display_type = match.group(1)  # "Figura" or "Tabla"
        display_num = match.group(2)   # "15" or "A1"

        if display_type.lower() == 'figura':
            bookmark = f"_Ref_Fig{display_num}"
        else:  # Tabla
            bookmark = f"_Ref_Tab{display_num}"

        display_text = f"{display_type} {display_num}"
        # Word REF field with \h for hyperlink
        # NOTE(review): `bookmark` is not part of the returned string.
        return f'''{display_text}'''

    def replace_section_ref(match):
        display_type = match.group(1)  # "Sección" or "Capítulo"
        display_num = match.group(2)   # "4.1" or "2"
        anchor_num = match.group(4)    # "4-1" or "2"

        # Create bookmark name from anchor (e.g., 4-1 -> _Ref_Sec4_1)
        bookmark = f"_Ref_Sec{anchor_num.replace('-', '_')}"

        display_text = f"{display_type} {display_num}"
        # Word REF field with \h for hyperlink
        # NOTE(review): `bookmark` is not part of the returned string.
        return f'''{display_text}'''

    # Apply cross-reference conversions
    text = CROSS_REF_LINK_RE.sub(replace_fig_tab_ref, text)
    text = SECTION_REF_LINK_RE.sub(replace_section_ref, text)
    return text


def md_to_html_para(text: str) -> str:
    """Convert markdown inline formatting in *text*.

    Cross-references are rewritten first, then the bold, italic,
    inline-code and link markers are each replaced by their captured inner
    text (for links, the link text; the URL is dropped).

    NOTE(review): the name suggests HTML output, but every replacement
    below is the bare backreference, so the markers are removed without
    adding tags -- confirm whether wrapping tags are expected here.
    """
    # Cross-references (must be done before other conversions)
    text = convert_cross_references(text)
    # Bold
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    # Italic
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    # Inline code
    text = re.sub(r'`([^`]+)`', r'\1', text)
    # Links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text)
    return text


def convert_latex_formulas(text: str) -> str:
    """Replace ``$$...$$`` and ``$...$`` formula delimiters in *text*.

    The LaTeX source itself is preserved (Word's equation editor accepts
    LaTeX directly, so users can copy and paste it). Block formulas are
    processed before inline ones so that ``$$`` is not consumed as two
    inline delimiters.

    NOTE(review): the comments mention centered/monospace styling, but the
    visible replacement strings carry only the stripped LaTeX (surrounded
    by newlines for block formulas) -- confirm the intended markup.
    """
    # Block formulas $$...$$ - center and style as equation placeholder
    def convert_block(match):
        latex = match.group(1).strip()
        # Style as centered, monospace text that's easy to identify and copy
        return f'\n{latex}\n'

    text = re.sub(r'\$\$([^$]+)\$\$', convert_block, text)

    # Inline formulas $...$ - style as inline code
    def convert_inline(match):
        latex = match.group(1).strip()
        return f'{latex}'

    text = re.sub(r'\$([^$]+)\$', convert_inline, text)
    return text


def extract_source_from_line(line: str) -> Optional[str]:
    """Return the source text if *line* is a Fuente/Source line, else None."""
    match = SOURCE_LINE_RE.match(line.strip())
    if not match:
        return None
    # Group 1 is the keyword (Fuente/Source); group 2 is the text after it.
    return match.group(2).strip()


def is_source_line(line: str) -> bool:
    """Check whether *line* starts with Fuente:/Source: (optionally bold)."""
    return SOURCE_LINE_RE.match(line.strip()) is not None


def extract_leyenda_from_line(line: str) -> Optional[str]:
    """Return the leyenda text if *line* is a Leyenda line, else None."""
    match = LEYENDA_LINE_RE.match(line.strip())
    if not match:
        return None
    return match.group(1).strip()


def is_leyenda_line(line: str) -> bool:
    """Check whether *line* starts with Leyenda: (optionally bold)."""
    return LEYENDA_LINE_RE.match(line.strip()) is not None


def split_into_paragraphs(text: str, lang: str = 'ES') -> str:
    """Split *text* on blank lines and format each paragraph.

    The text is split on double newlines; each non-empty, stripped chunk is
    passed through ``md_to_html_para`` and wrapped, and the wrapped
    paragraphs are joined with a single newline.

    Args:
        text: Markdown text containing paragraphs separated by blank lines.
        lang: Not used by the visible code; kept for interface compatibility.

    NOTE(review): the visible wrapper around each paragraph is only a
    leading and trailing newline; a paragraph tag may have been lost from
    this literal -- confirm the intended wrapper.
    """
    paragraphs = []
    for para in text.split('\n\n'):
        para = para.strip()
        if para:
            formatted = md_to_html_para(para)
            paragraphs.append(f'\n{formatted}\n')
    return '\n'.join(paragraphs)