MastersThesis/content_handlers.py

#!/usr/bin/env python3
"""Content block handlers for markdown to HTML conversion."""

import os
import re
from PIL import Image

from markdown_utils import (
    md_to_html_para,
    convert_latex_formulas,
    is_source_line,
    extract_source_from_line,
    is_leyenda_line,
    extract_leyenda_from_line,
)

# Base directory for resolving paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def handle_mermaid_diagram(lines, i, counters, is_anexo):
    """Handle mermaid diagram block, converting to figure with image.

    Emits, in order: a Word caption paragraph (with bookmark), either an
    <img> for the pre-rendered PNG or a dashed placeholder when the PNG is
    missing, a "Fuente" paragraph, an optional "Leyenda" paragraph and an
    empty spacer paragraph.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to ```mermaid)
        counters: Dict with 'table', 'figure', 'anexo_table', 'anexo_figure',
            'global_figure'. Mutated in place: 'global_figure' is always
            incremented, plus 'anexo_figure' or 'figure' depending on is_anexo.
        is_anexo: Boolean indicating if processing Anexo section

    Returns:
        Tuple of (html_blocks, new_index), where new_index is one past the
        last line consumed (closing fence, Fuente line or Leyenda line).
    """
    html_blocks = []

    # Always increment global index for sequential filenames
    counters['global_figure'] += 1

    # Use Anexo-specific counter with "A" prefix for display, or global counter
    if is_anexo:
        counters['anexo_figure'] += 1
        fig_num = f"A{counters['anexo_figure']}"
    else:
        counters['figure'] += 1
        fig_num = str(counters['figure'])

    # Collect the diagram body up to (not including) the closing fence.
    mermaid_lines = []
    i += 1
    while i < len(lines) and not lines[i].strip() == '```':
        mermaid_lines.append(lines[i])
        i += 1

    # Try to extract title from mermaid content (YAML format)
    mermaid_content = '\n'.join(mermaid_lines)
    # Match title with quotes: title: "Something" or title: 'Something'
    title_match = re.search(r'title:\s*["\']([^"\']+)["\']', mermaid_content)
    if not title_match:
        # Match title without quotes: title: Something
        title_match = re.search(r'title:\s*([^"\'\n]+)', mermaid_content)
    if title_match:
        fig_title = title_match.group(1).strip()
    else:
        fig_title = f"Diagrama {fig_num}"

    # Use global sequential index for filename
    fig_file = f'figures/figura_{counters["global_figure"]}.png'
    fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)

    # Create figure with MsoCaption class and proper Word SEQ field
    bookmark_id = f"_Ref_Fig{fig_num}"

    if is_anexo:
        # Anexo captions carry a literal number plus a TC field instead of a SEQ field.
        tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Figura {fig_num}. {fig_title}" \\f c \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
        html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
    else:
        html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')

    if os.path.exists(fig_path):
        # Read actual image dimensions; the context manager closes the
        # underlying file handle as soon as the size has been read.
        with Image.open(fig_path) as img:
            orig_w, orig_h = img.size

        # Scale to fit max width of 566px (15cm at 96dpi) while preserving aspect ratio
        max_width = 566
        if orig_w > max_width:
            scale = max_width / orig_w
            new_w = max_width
            new_h = int(orig_h * scale)
        else:
            new_w, new_h = orig_w, orig_h

        # Convert to pt (1px at 96dpi = 0.75pt)
        w_pt = new_w * 0.75
        h_pt = new_h * 0.75

        html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
    else:
        # Fallback to placeholder
        html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')

    # Check if next non-empty line has custom Fuente
    custom_source = None
    fig_leyenda = None
    lookahead = i + 1
    while lookahead < len(lines) and not lines[lookahead].strip():
        lookahead += 1
    if lookahead < len(lines):
        next_line = lines[lookahead].strip()
        if is_source_line(next_line):
            custom_source = extract_source_from_line(next_line)
            if custom_source and not custom_source.endswith('.'):
                custom_source += '.'
            # The Fuente line is consumed even if its extracted text is empty.
            i = lookahead
            # Check for Leyenda after source (only looked for when a Fuente line exists)
            leyenda_idx = i + 1
            while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
                leyenda_idx += 1
            if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
                fig_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
                i = leyenda_idx

    if custom_source:
        source_html = md_to_html_para(custom_source)
        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: {source_html}</span></p>''')
    else:
        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')

    if fig_leyenda:
        leyenda_html = md_to_html_para(fig_leyenda)
        if not fig_leyenda.endswith('.'):
            leyenda_html += '.'
        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Leyenda: {leyenda_html}</span></p>''')

    # Empty spacer paragraph after the figure.
    html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')
    # Step past the last consumed line (closing fence, Fuente or Leyenda).
    i += 1

    return html_blocks, i


def handle_code_block(lines, i):
    """Handle non-mermaid code block.

    Collects every line between the opening fence at ``lines[i]`` and the
    next line whose stripped text starts with a fence, HTML-escapes
    ``&``, ``<`` and ``>``, and wraps the result in a styled <div>/<pre>.
    Any language tag on the opening fence is ignored; the output does not
    depend on it.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to ```)

    Returns:
        Tuple of (html_blocks, new_index), where new_index is one past the
        closing fence (or one past the end of input if the fence is never
        closed).
    """
    html_blocks = []
    code_lines = []
    i += 1
    while i < len(lines) and not lines[i].strip().startswith('```'):
        code_lines.append(lines[i])
        i += 1
    code = '\n'.join(code_lines)
    # Escape HTML entities in code ('&' first so later entities are not double-escaped)
    code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    html_blocks.append(f'''<div style="background:#E6F4F9;border-top:solid #0098CD .5pt;border-bottom:solid #0098CD .5pt;padding:8pt 12pt;margin:6pt 0">
<pre style="font-family:Consolas,monospace;font-size:9pt;color:#333333;margin:0;white-space:pre-wrap;word-wrap:break-word">{code}</pre>
</div>''')
    # Skip the closing fence.
    i += 1
    return html_blocks, i


def handle_header(line, is_anexo):
    """Render a markdown heading line (##, ###, ####) as Word-flavoured HTML.

    Level-2 and level-3 headings get a ``_Ref_Sec…`` bookmark anchor when the
    title starts with a dotted section number, and keep Word's automatic list
    numbering unless the heading belongs to an Anexo or starts with ``A.<n>``.
    Four or more ``#`` produce an un-numbered bold <h4>.

    Args:
        line: The header line
        is_anexo: Boolean indicating if processing Anexo section

    Returns:
        HTML string for the header, or None for h1 lines (and for lines that
        do not start with '#').
    """
    # A single '#' (h1) or no '#' at all: nothing to emit.
    if not line.startswith('##'):
        return None

    title = line.lstrip('#').strip()

    if line.startswith('####'):
        return f'<h4 style="mso-list:none"><b><span lang=ES style="text-transform:none">{title}</span></b></h4>'

    level = 3 if line.startswith('###') else 2

    # Bookmark anchor derived from a leading section number such as "3.1.2".
    anchor = ''
    numbered = re.match(r'^([\d\.]+)\s+', title)
    if numbered:
        section = numbered.group(1).rstrip('.')
        ref_name = f"_Ref_Sec{section.replace('.', '_')}"
        anchor = f'<a name="{ref_name}"></a>'

    # Anexo content and "A.x" headings must not take part in auto-numbering.
    if is_anexo or re.match(r'^A\.\d+', title):
        list_style = 'mso-list:none'
    else:
        list_style = f'mso-list:l22 level{level} lfo18'

    return f'{anchor}<h{level} style="{list_style}"><span lang=ES style="text-transform:none">{title}</span></h{level}>'


def handle_table(lines, i, counters, is_anexo):
    """Handle markdown table.

    Emits a Word caption paragraph (with bookmark), the table itself in an
    APA-like style (horizontal rules above/below the header row and below the
    last row), a "Fuente" paragraph, an optional "Leyenda" paragraph and an
    empty spacer paragraph.

    The caption text comes from a ``**Tabla …`` / ``*Tabla …`` line found in
    the (up to) four lines preceding the table, else from a ``**…:**`` line
    in that window, else a generic default.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to first table row)
        counters: Dict with table/figure counters. Mutated in place:
            'anexo_table' or 'table' is incremented depending on is_anexo.
        is_anexo: Boolean indicating if processing Anexo section

    Returns:
        Tuple of (html_blocks, new_index), where new_index is one past the
        last consumed line (last table row, Fuente line or Leyenda line).
    """
    html_blocks = []

    # Use Anexo-specific counter with "A" prefix, or global counter
    if is_anexo:
        counters['anexo_table'] += 1
        table_num = f"A{counters['anexo_table']}"
    else:
        counters['table'] += 1
        table_num = str(counters['table'])

    # Check if previous line has table title
    table_title = None
    alt_title = None
    table_source = "Elaboración propia"

    # Look back (at most 4 lines) for table title. The stop value is
    # exclusive, so it must be -1 rather than 0 for line 0 to be inspected
    # when the table starts near the top of the document.
    for j in range(i - 1, max(-1, i - 5), -1):
        prev_line = lines[j].strip()
        if prev_line.startswith('**Tabla') or prev_line.startswith('*Tabla'):
            table_title = re.sub(r'\*+', '', prev_line).strip()
            break
        elif prev_line.startswith('**') and prev_line.endswith(':**'):
            # Remember as fallback but keep searching for an explicit Tabla title.
            alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
        elif prev_line and not prev_line.startswith('|'):
            # Any other non-empty, non-table line ends the search.
            break

    # Parse table: consume every consecutive line containing '|', dropping
    # separator rows (any line containing '---').
    table_lines = []
    while i < len(lines) and '|' in lines[i]:
        if '---' not in lines[i]:
            table_lines.append(lines[i])
        i += 1

    # Look ahead for source
    source_idx = i
    table_leyenda = None
    while source_idx < len(lines) and not lines[source_idx].strip():
        source_idx += 1
    if source_idx < len(lines) and is_source_line(lines[source_idx]):
        extracted_source = extract_source_from_line(lines[source_idx])
        # Keep the default when nothing usable was extracted, mirroring the
        # falsy-check handle_mermaid_diagram applies to the same helper.
        if extracted_source:
            table_source = extracted_source
        i = source_idx + 1
        # Check for Leyenda after source (only looked for when a Fuente line exists)
        leyenda_idx = i
        while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
            leyenda_idx += 1
        if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
            table_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
            i = leyenda_idx + 1

    # Add table title with MsoCaption class
    bookmark_id = f"_Ref_Tab{table_num}"
    if table_title:
        # Drop the author's own "Tabla N." prefix; the number is regenerated.
        clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
    elif alt_title:
        clean_title = alt_title
    else:
        clean_title = "Tabla de datos."

    if is_anexo:
        # Anexo captions carry a literal number plus a TC field instead of a SEQ field.
        tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Tabla {table_num}. {clean_title}" \\f t \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
        html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
    else:
        html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')

    # Build table HTML with APA style; fragments are joined once at the end.
    table_parts = ['<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">']
    last_row = len(table_lines) - 1
    for j, tline in enumerate(table_lines):
        # Drop the empty fragments before the first and after the last pipe.
        cells = [c.strip() for c in tline.split('|')[1:-1]]
        table_parts.append('<tr>')
        for cell in cells:
            cell_html = md_to_html_para(cell)
            if j == 0:
                # Header row
                table_parts.append(f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><b><span lang=ES>{cell_html}</span></b></p></td>')
            elif j == last_row:
                # Last row
                table_parts.append(f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{cell_html}</span></p></td>')
            else:
                # Middle rows
                table_parts.append(f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{cell_html}</span></p></td>')
        table_parts.append('</tr>')
    table_parts.append('</table></div>')
    html_blocks.append(''.join(table_parts))

    # Add source
    source_html = md_to_html_para(table_source)
    if not table_source.endswith('.'):
        source_html += '.'
    html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')

    # Add leyenda if present
    if table_leyenda:
        leyenda_html = md_to_html_para(table_leyenda)
        if not table_leyenda.endswith('.'):
            leyenda_html += '.'
        html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Leyenda: {leyenda_html}</span></p>')

    # Empty spacer paragraph after the table.
    html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')

    return html_blocks, i


def handle_blockquote(lines, i):
    """Render a markdown blockquote, or a Nota/Note callout box.

    Consecutive lines starting with '>' are merged into one space-separated
    text. If that text begins with ``**Nota:**`` or ``**Note:**`` it becomes a
    shaded callout box with a bold label; otherwise an italic MsoQuote
    paragraph.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to > line)

    Returns:
        Tuple of (html_blocks, new_index), where new_index is the first line
        after the blockquote.
    """
    # Strip the leading '>' from every quoted line and merge them.
    fragments = [lines[i][1:].strip()]
    i += 1
    while i < len(lines) and lines[i].startswith('>'):
        fragments.append(lines[i][1:].strip())
        i += 1
    quote_text = ' '.join(fragments)

    # Detect a callout by its bold marker; the marker itself is cut off.
    callout_label = None
    content = None
    for marker, label in (('**Nota:**', 'Nota:'), ('**Note:**', 'Note:')):
        if quote_text.startswith(marker):
            callout_label = label
            content = quote_text[len(marker):].strip()
            break

    if callout_label is None:
        # Regular blockquote
        return [f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>'], i

    # UNIR callout box style
    callout_html = f'''<div style='mso-element:para-border-div;border-top:solid #0098CD 1.0pt;border-left:none;border-bottom:solid #0098CD 1.0pt;border-right:none;mso-border-top-alt:solid #0098CD .5pt;mso-border-bottom-alt:solid #0098CD .5pt;padding:4.0pt 0cm 4.0pt 0cm;background:#E6F4F9'>
<p class=MsoNormal style='background:#E6F4F9;border:none;padding:0cm;margin:0cm'><b><span lang=ES>{callout_label}</span></b><span lang=ES> {md_to_html_para(content)}</span></p>
</div>'''
    return [callout_html], i


def handle_bullet_list(lines, i):
    """Render a run of markdown bullet items (-, *, +) as Word list paragraphs.

    Blank lines between items are skipped, so a loosely spaced list is
    treated as one list. Blank lines following the last item are consumed
    as well.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to first bullet)

    Returns:
        Tuple of (html_blocks, new_index), where new_index points at the
        first non-blank line that is not a bullet (or the end of input).
    """
    total = len(lines)
    rendered_items = []

    while True:
        # Skip blank lines
        while i < total and not lines[i].strip():
            i += 1
        # Stop at end of input or at the first non-bullet line.
        if i >= total or not re.match(r'^[\-\*\+]\s', lines[i]):
            break
        # Drop the marker and the following whitespace character.
        raw_item = lines[i][2:].strip()
        rendered_items.append(md_to_html_para(convert_latex_formulas(raw_item)))
        i += 1

    # Word expects First/Middle/Last classes, or the plain class for a lone item.
    count = len(rendered_items)
    html_blocks = []
    for position, item in enumerate(rendered_items):
        if count == 1:
            cls = 'MsoListParagraph'
        elif position == 0:
            cls = 'MsoListParagraphCxSpFirst'
        elif position == count - 1:
            cls = 'MsoListParagraphCxSpLast'
        else:
            cls = 'MsoListParagraphCxSpMiddle'
        html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{item}</span></p>')

    return html_blocks, i


def handle_numbered_list(lines, i):
    """Render a numbered list (1., 2., …) with nested bullet sub-lists.

    Items are renumbered sequentially from 1 regardless of the numbers
    written in the markdown. Lines following an item are scanned for
    ``- `` bullets, which are emitted as a further-indented sub-list.

    NOTE(review): continuation lines that are indented (or start with '-')
    but are not ``- `` bullets are consumed without producing output —
    confirm this is intended.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to first numbered item)

    Returns:
        Tuple of (html_blocks, new_index), where new_index points at the
        first line that does not belong to the list (or the end of input).
    """
    total = len(lines)
    # Lines starting with any of these prefixes end an item's nested content.
    block_starts = ('#', '```', '**Tabla', '**Figura')
    # Each entry: (rendered item text, raw nested bullet strings)
    entries = []

    while True:
        # Skip blank lines
        while i < total and not lines[i].strip():
            i += 1
        # Stop at end of input or at the first non-numbered line.
        if i >= total or not re.match(r'^\d+\.\s', lines[i]):
            break

        raw_item = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
        i += 1

        # Gather the "- " bullets that belong to this numbered item.
        sub_bullets = []
        while i < total:
            current = lines[i]
            if re.match(r'^\d+\.\s', current) or current.startswith(block_starts):
                break
            stripped = current.strip()
            # Non-blank text that is neither indented nor dash-led ends the item.
            belongs_to_item = current.startswith((' ', '\t')) or stripped.startswith('-')
            if stripped and not belongs_to_item:
                break
            if stripped.startswith('- '):
                sub_bullets.append(stripped[2:].strip())
            i += 1

        entries.append((md_to_html_para(convert_latex_formulas(raw_item)), sub_bullets))

    html_blocks = []
    entry_count = len(entries)
    for position, (item_html, sub_bullets) in enumerate(entries):
        # The final item only gets the "Last" class when no sub-list follows it.
        if entry_count == 1:
            cls = 'MsoListParagraph'
        elif position == 0:
            cls = 'MsoListParagraphCxSpFirst'
        elif position == entry_count - 1 and not sub_bullets:
            cls = 'MsoListParagraphCxSpLast'
        else:
            cls = 'MsoListParagraphCxSpMiddle'

        # Main numbered item
        html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{position + 1}.<span style="font-size:7pt">&nbsp;&nbsp;&nbsp;</span>{item_html}</span></p>')

        # Nested bullet sub-list at 54pt margin (36pt + 18pt)
        final_bullet = len(sub_bullets) - 1
        for bullet_pos, raw_bullet in enumerate(sub_bullets):
            bullet_html = md_to_html_para(convert_latex_formulas(raw_bullet))
            # The first bullet is always "First", even when it is the only one.
            if bullet_pos == 0:
                sub_cls = 'MsoListParagraphCxSpFirst'
            elif bullet_pos == final_bullet:
                sub_cls = 'MsoListParagraphCxSpLast'
            else:
                sub_cls = 'MsoListParagraphCxSpMiddle'
            html_blocks.append(f'<p class={sub_cls} style="margin-left:54pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{bullet_html}</span></p>')

    return html_blocks, i