# MastersThesis/markdown_utils.py

#!/usr/bin/env python3
"""Utility functions for markdown processing and conversion."""

import re
from latex2mathml.converter import convert as latex_to_mathml

# Attribution lines: "Fuente:" or "Source:" (any letter case), where the label
# may be wrapped in one or two asterisks of markdown emphasis.
# Group 1 is the label word, group 2 is the text after the colon.
SOURCE_LINE_RE = re.compile(
    r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$',
    flags=re.I,
)
# Legend lines: "Leyenda:" (any letter case), with the same optional emphasis.
# Group 1 is the text after the colon.
LEYENDA_LINE_RE = re.compile(
    r'^\s*(?:\*{1,2})?Leyenda:(?:\*{1,2})?\s*(.*)$',
    flags=re.I,
)

# Figure/table cross-references written as markdown links, which are later
# turned into Word REF fields:
#   [Figura 15](#figura-15), [Tabla 20](#tabla-20)
# Numbers may carry a single-letter prefix (used for Anexo items):
#   [Figura A1](#figura-a1), [Tabla A2](#tabla-a2)
# Groups: 1 = displayed type, 2 = displayed number, 3 = anchor type,
# 4 = anchor number.
CROSS_REF_LINK_RE = re.compile(
    r'\[(Figura|Tabla)\s+([A-Za-z]?\d+)\]\(#(figura|tabla)-([a-z]?\d+)\)',
    flags=re.I,
)
# Section/chapter cross-references written as markdown links:
#   [Sección 4.1](#seccion-4-1), [Capítulo 2](#capitulo-2)
# The accented and unaccented spellings are both accepted in the link text.
# Groups: 1 = displayed type, 2 = dotted number, 3 = anchor type,
# 4 = dash-separated anchor number.
SECTION_REF_LINK_RE = re.compile(
    r'\[(Sección|Seccion|Capítulo|Capitulo)\s+([\d\.]+)\]\(#(seccion|capitulo)-([0-9-]+)\)',
    flags=re.I,
)


def read_file(path):
    """Return the full text of *path*.

    The file is decoded as UTF-8 first; if that raises UnicodeDecodeError the
    file is reopened and decoded as latin-1 instead.
    """
    def slurp(codec):
        # Open in text mode with the given codec and read everything at once.
        with open(path, 'r', encoding=codec) as handle:
            return handle.read()

    try:
        return slurp('utf-8')
    except UnicodeDecodeError:
        return slurp('latin-1')


def write_file(path, content):
    """Create or overwrite *path* with *content*, encoded as UTF-8."""
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.write(content)


def convert_cross_references(text):
    """Rewrite markdown cross-reference links as Word REF field markup.

    The link syntax still renders as an ordinary link in markdown viewers:
    - [Figura 15](#figura-15) -> REF field for bookmark _Ref_Fig15
    - [Tabla 20](#tabla-20) -> REF field for bookmark _Ref_Tab20
    - [Figura A1](#figura-a1) -> REF field for bookmark _Ref_FigA1 (Anexo)
    - [Sección 4.1](#seccion-4-1) -> REF field for bookmark _Ref_Sec4_1
    - [Capítulo 2](#capitulo-2) -> REF field for bookmark _Ref_Sec2

    Figure/table bookmarks are built from the number shown in the link text;
    section/chapter bookmarks are built from the anchor, with dashes turned
    into underscores. The visible text is always the original link text.
    """
    def word_ref_field(bookmark, display_text):
        # Word REF field with \h for hyperlink; the plain <a> in the middle is
        # what shows when the conditional field markup is not interpreted.
        return f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> REF {bookmark} \\h <span style='mso-element:field-separator'></span><![endif]--><a href="#{bookmark}">{display_text}</a><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->'''

    def figure_or_table(match):
        kind, number = match.group(1), match.group(2)  # e.g. "Figura", "A1"
        # Anything that is not a figure can only be a table for this pattern.
        prefix = 'Fig' if kind.lower() == 'figura' else 'Tab'
        return word_ref_field(f"_Ref_{prefix}{number}", f"{kind} {number}")

    def section_or_chapter(match):
        kind, number, anchor = match.group(1, 2, 4)  # e.g. "Sección", "4.1", "4-1"
        bookmark = f"_Ref_Sec{anchor.replace('-', '_')}"
        return word_ref_field(bookmark, f"{kind} {number}")

    # Figures/tables first, then sections/chapters.
    converted = CROSS_REF_LINK_RE.sub(figure_or_table, text)
    return SECTION_REF_LINK_RE.sub(section_or_chapter, converted)


def md_to_html_para(text):
    """Translate inline markdown (refs, bold, italic, code, links) to HTML."""
    # Cross-references are handled first so the generic link rule at the end
    # of the table does not consume them.
    html = convert_cross_references(text)

    # Applied strictly in this order: bold must run before italic because the
    # italic pattern would otherwise match inside the double asterisks.
    inline_rules = (
        (r'\*\*([^*]+)\*\*', r'<b>\1</b>'),                      # **bold**
        (r'\*([^*]+)\*', r'<i>\1</i>'),                          # *italic*
        (r'`([^`]+)`',
         r'<span style="font-family:Consolas;font-size:10pt">\1</span>'),  # `code`
        (r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>'),    # [text](url)
    )
    for pattern, replacement in inline_rules:
        html = re.sub(pattern, replacement, html)
    return html


def convert_latex_formulas(text):
    """Convert LaTeX formulas to MathML for Word compatibility.

    Block formulas ($$...$$) become a centered MsoNormal paragraph holding
    the MathML; inline formulas ($...$) are replaced by the MathML alone.
    A formula whose conversion raises is left in the text exactly as written.

    Args:
        text: Text that may contain LaTeX formulas.

    Returns:
        The text with every convertible formula replaced by MathML.
    """
    def render_formula(match):
        block_latex, inline_latex = match.groups()
        try:
            if block_latex is not None:
                mathml = latex_to_mathml(block_latex, display="block")
                return f'<p class=MsoNormal style="text-align:center">{mathml}</p>'
            return latex_to_mathml(inline_latex, display="inline")
        except Exception:
            # Best effort: keep the original formula text when the converter
            # rejects it. Exception (not a bare except) so KeyboardInterrupt
            # and SystemExit still propagate.
            return match.group(0)

    # Both forms are handled in ONE pass, with the block alternative tried
    # first. A separate inline pass would re-scan a block formula that failed
    # to convert and rewrite its inner $...$, leaving stray dollar signs.
    return re.sub(r'\$\$([^$]+)\$\$|\$([^$]+)\$', render_formula, text)


def extract_source_from_line(line):
    """Extract the text following a Fuente:/Source: label.

    Returns the stripped text after the label (possibly an empty string), or
    None when the stripped line does not match SOURCE_LINE_RE.
    """
    found = SOURCE_LINE_RE.match(line.strip())
    return found.group(2).strip() if found else None


def is_source_line(line):
    """Tell whether the stripped line is a Fuente:/Source: line (bold or not)."""
    # A match object is always truthy, so bool() is True exactly on a match.
    return bool(SOURCE_LINE_RE.match(line.strip()))


def extract_leyenda_from_line(line):
    """Extract the text following a Leyenda: label.

    Returns the stripped text after the label (possibly an empty string), or
    None when the stripped line does not match LEYENDA_LINE_RE.
    """
    found = LEYENDA_LINE_RE.match(line.strip())
    return found.group(1).strip() if found else None


def is_leyenda_line(line):
    """Tell whether the stripped line is a Leyenda: line (bold or not)."""
    # A match object is always truthy, so bool() is True exactly on a match.
    return bool(LEYENDA_LINE_RE.match(line.strip()))


def split_into_paragraphs(text, lang='ES'):
    """Turn blank-line-separated text into newline-joined Word <p> elements.

    The text is cut at every double newline. Each piece is stripped, pieces
    that end up empty are dropped, and the rest go through md_to_html_para
    before being wrapped in an MsoNormal paragraph whose span carries *lang*.
    """
    pieces = (piece.strip() for piece in text.split('\n\n'))
    return '\n'.join(
        f'<p class=MsoNormal><span lang={lang}>{md_to_html_para(piece)}</span></p>'
        for piece in pieces
        if piece
    )