Cross references

2026-02-04 20:43:50 +01:00
parent 868f748a8d
commit e9c937a042
18 changed files with 1118 additions and 820 deletions
--- a/content_handlers.py
+++ b/content_handlers.py
@@ -0,0 +1,469 @@
+#!/usr/bin/env python3
+"""Content block handlers for markdown to HTML conversion."""
+
+import os
+import re
+from PIL import Image
+
+from markdown_utils import (
+    md_to_html_para,
+    convert_latex_formulas,
+    is_source_line,
+    extract_source_from_line,
+    is_leyenda_line,
+    extract_leyenda_from_line,
+)
+
+# Absolute path of the directory that contains this module. Filesystem paths
+# built by the handlers below (e.g. the rendered figure PNGs) are resolved
+# relative to it.
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def handle_mermaid_diagram(lines, i, counters, is_anexo):
+    """Convert a fenced mermaid block into a captioned figure in Word-style HTML.
+
+    The diagram is not rendered here. The function looks for an already
+    generated PNG at ``thesis_output/figures/figura_<n>.png`` under
+    ``BASE_DIR`` (``<n>`` is the global figure counter) and embeds it scaled
+    to the page width, or emits a dashed placeholder paragraph when the file
+    does not exist.
+
+    The caption title comes from a ``title:`` entry inside the mermaid source
+    (quoted or unquoted); without one it defaults to ``Diagrama <fig_num>``.
+    If the next non-blank line after the block is accepted by
+    ``is_source_line`` it is consumed as the figure source, and a following
+    line accepted by ``is_leyenda_line`` is consumed as the legend. Without a
+    custom source the figure is attributed to "Elaboración propia.".
+
+    Args:
+        lines: List of markdown lines.
+        i: Index of the opening ```mermaid fence.
+        counters: Dict holding the integer counters 'figure', 'anexo_figure'
+            and 'global_figure'; it is mutated in place.
+        is_anexo: True while processing the Anexo section. Anexo figures are
+            numbered "A1", "A2", ... and carry a TC field instead of a SEQ
+            field in their caption.
+
+    Returns:
+        Tuple of (html_blocks, new_index), where new_index is the index of
+        the first line after everything this handler consumed.
+    """
+    html_blocks = []
+
+    # Always increment global index for sequential filenames
+    counters['global_figure'] += 1
+
+    # Use Anexo-specific counter with "A" prefix for display, or global counter
+    if is_anexo:
+        counters['anexo_figure'] += 1
+        fig_num = f"A{counters['anexo_figure']}"
+    else:
+        counters['figure'] += 1
+        fig_num = str(counters['figure'])
+
+    # Collect the diagram source up to (not including) the closing fence.
+    mermaid_lines = []
+    i += 1
+    while i < len(lines) and lines[i].strip() != '```':
+        mermaid_lines.append(lines[i])
+        i += 1
+
+    # Try to extract title from mermaid content (YAML format)
+    mermaid_content = '\n'.join(mermaid_lines)
+    # Match title with quotes: title: "Something" or title: 'Something'
+    title_match = re.search(r'title:\s*["\']([^"\']+)["\']', mermaid_content)
+    if not title_match:
+        # Match title without quotes: title: Something
+        title_match = re.search(r'title:\s*([^"\'\n]+)', mermaid_content)
+    if title_match:
+        fig_title = title_match.group(1).strip()
+    else:
+        fig_title = f"Diagrama {fig_num}"
+
+    # Use global sequential index for filename
+    fig_file = f'figures/figura_{counters["global_figure"]}.png'
+    fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
+
+    # Create figure with MsoCaption class and proper Word SEQ field
+    bookmark_id = f"_Ref_Fig{fig_num}"
+
+    if is_anexo:
+        tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Figura {fig_num}. {fig_title}" \\f c \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
+        html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
+    else:
+        html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
+
+    if os.path.exists(fig_path):
+        # Read the actual image dimensions. The context manager closes the
+        # underlying file handle, which a bare Image.open() would leave open.
+        with Image.open(fig_path) as img:
+            orig_w, orig_h = img.size
+
+        # Scale to fit max width of 566px (15cm at 96dpi) while preserving aspect ratio
+        max_width = 566
+        if orig_w > max_width:
+            scale = max_width / orig_w
+            new_w = max_width
+            new_h = int(orig_h * scale)
+        else:
+            new_w, new_h = orig_w, orig_h
+
+        # Convert to pt (1px at 96dpi = 0.75pt)
+        w_pt = new_w * 0.75
+        h_pt = new_h * 0.75
+
+        html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
+    else:
+        # Fallback to placeholder
+        html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
+
+    # Check if next non-empty line has custom Fuente. At this point i is the
+    # index of the closing fence (or len(lines) if the block is unterminated).
+    custom_source = None
+    fig_leyenda = None
+    lookahead = i + 1
+    while lookahead < len(lines) and not lines[lookahead].strip():
+        lookahead += 1
+    if lookahead < len(lines):
+        next_line = lines[lookahead].strip()
+        if is_source_line(next_line):
+            custom_source = extract_source_from_line(next_line)
+            if custom_source and not custom_source.endswith('.'):
+                custom_source += '.'
+            # Consume the source line.
+            i = lookahead
+            # Check for Leyenda after source
+            leyenda_idx = i + 1
+            while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
+                leyenda_idx += 1
+            if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
+                fig_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
+                # Consume the legend line as well.
+                i = leyenda_idx
+
+    if custom_source:
+        source_html = md_to_html_para(custom_source)
+        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: {source_html}</span></p>''')
+    else:
+        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')
+
+    if fig_leyenda:
+        leyenda_html = md_to_html_para(fig_leyenda)
+        if not fig_leyenda.endswith('.'):
+            leyenda_html += '.'
+        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Leyenda: {leyenda_html}</span></p>''')
+
+    # Empty spacer paragraph after the figure.
+    html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')
+    # Step past the last consumed line (closing fence, source or legend).
+    i += 1
+
+    return html_blocks, i
+
+
+def handle_code_block(lines, i):
+    """Render a non-mermaid fenced code block as a shaded ``<pre>`` box.
+
+    The language tag on the opening fence is ignored; the code is emitted
+    verbatim with ``&``, ``<`` and ``>`` escaped.
+
+    Args:
+        lines: List of markdown lines.
+        i: Index of the opening ``` fence.
+
+    Returns:
+        Tuple of (html_blocks, new_index), where new_index is the index just
+        past the closing fence.
+    """
+    html_blocks = []
+    code_lines = []
+    i += 1
+    # Any line whose stripped text starts with ``` closes the block.
+    while i < len(lines) and not lines[i].strip().startswith('```'):
+        code_lines.append(lines[i])
+        i += 1
+    code = '\n'.join(code_lines)
+    # Escape HTML entities in code ('&' first so the other entities are not
+    # escaped twice).
+    code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+    html_blocks.append(f'''<div style="background:#E6F4F9;border-top:solid #0098CD .5pt;border-bottom:solid #0098CD .5pt;padding:8pt 12pt;margin:6pt 0">
+<pre style="font-family:Consolas,monospace;font-size:9pt;color:#333333;margin:0;white-space:pre-wrap;word-wrap:break-word">{code}</pre>
+</div>''')
+    # Step past the closing fence.
+    i += 1
+    return html_blocks, i
+
+
+def handle_header(line, is_anexo):
+    """Translate a markdown heading line into Word-style heading HTML.
+
+    Headings with four or more leading ``#`` become an ``<h4>`` without list
+    numbering. Levels two and three become ``<h2>``/``<h3>``; when the heading
+    text starts with a dotted section number (e.g. ``3.1``) a ``_Ref_Sec...``
+    bookmark anchor is emitted in front of the heading. Word auto-numbering
+    is switched off for Anexo content and for headings starting with ``A.<n>``.
+
+    Args:
+        line: The heading line.
+        is_anexo: True while processing the Anexo section.
+
+    Returns:
+        The heading HTML, or None for level-one headings (the original h1 is
+        kept) and for lines that do not start with ``#``.
+    """
+    text = line.lstrip('#')
+    depth = len(line) - len(text)
+    text = text.strip()
+
+    if depth < 2:
+        # Level-one headings are skipped; anything else is not a heading.
+        return None
+
+    if depth >= 4:
+        return f'<h4 style="mso-list:none"><b><span lang=ES style="text-transform:none">{text}</span></b></h4>'
+
+    # depth is 2 or 3 from here on; both levels share the same layout.
+    tag = f'h{depth}'
+
+    # Extract section number if present and turn it into a bookmark anchor.
+    anchor = ''
+    numbered = re.match(r'^([\d\.]+)\s+', text)
+    if numbered:
+        sec_num = numbered.group(1).rstrip('.')
+        bookmark_id = f"_Ref_Sec{sec_num.replace('.', '_')}"
+        anchor = f'<a name="{bookmark_id}"></a>'
+
+    # Disable auto-numbering for Anexo content or A.x headings
+    if is_anexo or re.match(r'^A\.\d+', text):
+        list_style = 'mso-list:none'
+    else:
+        list_style = f'mso-list:l22 level{depth} lfo18'
+
+    return f'{anchor}<{tag} style="{list_style}"><span lang=ES style="text-transform:none">{text}</span></{tag}>'
+
+
+def handle_table(lines, i, counters, is_anexo):
+    """Convert a markdown pipe table into a captioned, APA-style Word-HTML table.
+
+    The caption title is searched in the (up to four) lines preceding the
+    table: a line starting with ``**Tabla`` / ``*Tabla`` wins; otherwise a
+    bold line ending in ``:**`` is used; otherwise the title defaults to
+    "Tabla de datos.". If the next non-blank line after the table is accepted
+    by ``is_source_line`` it is consumed as the table source (default
+    "Elaboración propia"), and a following line accepted by
+    ``is_leyenda_line`` is consumed as the legend.
+
+    Args:
+        lines: List of markdown lines.
+        i: Index of the first table row.
+        counters: Dict holding the integer counters 'table' and
+            'anexo_table'; it is mutated in place.
+        is_anexo: True while processing the Anexo section. Anexo tables are
+            numbered "A1", "A2", ... and carry a TC field instead of a SEQ
+            field in their caption.
+
+    Returns:
+        Tuple of (html_blocks, new_index), where new_index is the index of
+        the first line after the table and any consumed source/legend line.
+    """
+    html_blocks = []
+
+    # Use Anexo-specific counter with "A" prefix, or global counter
+    if is_anexo:
+        counters['anexo_table'] += 1
+        table_num = f"A{counters['anexo_table']}"
+    else:
+        counters['table'] += 1
+        table_num = str(counters['table'])
+
+    table_title = None
+    alt_title = None
+    table_source = "Elaboración propia"
+
+    # Look back (at most four lines) for the table title. The exclusive stop
+    # is clamped to -1, not 0, so that line 0 is inspected as well.
+    for j in range(i - 1, max(-1, i - 5), -1):
+        prev_line = lines[j].strip()
+        if prev_line.startswith('**Tabla') or prev_line.startswith('*Tabla'):
+            table_title = re.sub(r'\*+', '', prev_line).strip()
+            break
+        elif prev_line.startswith('**') and prev_line.endswith(':**'):
+            # Remember it, but keep looking for an explicit "Tabla" title.
+            alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
+        elif prev_line and not prev_line.startswith('|'):
+            break
+
+    # Parse table: collect every consecutive line containing a pipe, dropping
+    # the header delimiter row.
+    table_lines = []
+    while i < len(lines) and '|' in lines[i]:
+        row = lines[i]
+        # A delimiter row holds nothing but pipes, dashes, colons and
+        # whitespace. Checking only for '---' would also discard data rows
+        # that merely contain that text in a cell.
+        is_delimiter = '---' in row and not row.strip().strip('|-: \t')
+        if not is_delimiter:
+            table_lines.append(row)
+        i += 1
+
+    # Look ahead for source
+    source_idx = i
+    table_leyenda = None
+    while source_idx < len(lines) and not lines[source_idx].strip():
+        source_idx += 1
+    if source_idx < len(lines) and is_source_line(lines[source_idx]):
+        # Keep the default when the extracted source is empty, as the mermaid
+        # handler does.
+        table_source = extract_source_from_line(lines[source_idx]) or table_source
+        i = source_idx + 1
+        # Check for Leyenda after source
+        leyenda_idx = i
+        while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
+            leyenda_idx += 1
+        if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
+            table_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
+            i = leyenda_idx + 1
+
+    # Add table title with MsoCaption class
+    bookmark_id = f"_Ref_Tab{table_num}"
+    if table_title:
+        # Drop the hand-written "Tabla N." prefix; the number is regenerated.
+        clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
+    elif alt_title:
+        clean_title = alt_title
+    else:
+        clean_title = "Tabla de datos."
+
+    if is_anexo:
+        tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Tabla {table_num}. {clean_title}" \\f t \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
+        html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
+    else:
+        html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
+
+    # Build table HTML with APA style: rules above and below the header row
+    # and below the last row, no vertical borders.
+    last_row = len(table_lines) - 1
+    parts = ['<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">']
+    for j, tline in enumerate(table_lines):
+        # Rows are expected to start and end with a pipe; the empty pieces
+        # outside the outer pipes are discarded.
+        cells = [c.strip() for c in tline.split('|')[1:-1]]
+        parts.append('<tr>')
+        for cell in cells:
+            cell_html = md_to_html_para(cell)
+            if j == 0:
+                # Header row
+                parts.append(f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><b><span lang=ES>{cell_html}</span></b></p></td>')
+            elif j == last_row:
+                # Last row
+                parts.append(f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{cell_html}</span></p></td>')
+            else:
+                # Middle rows
+                parts.append(f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{cell_html}</span></p></td>')
+        parts.append('</tr>')
+    parts.append('</table></div>')
+    html_blocks.append(''.join(parts))
+
+    # Add source
+    source_html = md_to_html_para(table_source)
+    if not table_source.endswith('.'):
+        source_html += '.'
+    html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')
+
+    # Add leyenda if present
+    if table_leyenda:
+        leyenda_html = md_to_html_para(table_leyenda)
+        if not table_leyenda.endswith('.'):
+            leyenda_html += '.'
+        html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Leyenda: {leyenda_html}</span></p>')
+
+    # Empty spacer paragraph after the table.
+    html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')
+
+    return html_blocks, i
+
+
+def handle_blockquote(lines, i):
+    """Render a markdown blockquote, or a Nota/Note callout box.
+
+    Consecutive lines starting with ``>`` are merged into one text, joined by
+    single spaces. If the merged text starts with ``**Nota:**`` or
+    ``**Note:**`` it is rendered as a shaded callout with a bold label;
+    otherwise as an italic ``MsoQuote`` paragraph.
+
+    Args:
+        lines: List of markdown lines.
+        i: Index of the first ``>`` line.
+
+    Returns:
+        Tuple of (html_blocks, new_index), where new_index is the index of
+        the first line that is not part of the quote.
+    """
+    # The first line is taken unconditionally; the rest while they start with '>'.
+    pieces = [lines[i][1:].strip()]
+    i += 1
+    while i < len(lines) and lines[i].startswith('>'):
+        pieces.append(lines[i][1:].strip())
+        i += 1
+    quote_text = ' '.join(pieces)
+
+    # Check if this is a Nota/Note callout
+    label = None
+    content = None
+    for marker in ('**Nota:**', '**Note:**'):
+        if quote_text.startswith(marker):
+            label = marker.strip('*')
+            content = quote_text[len(marker):].strip()
+            break
+
+    if label is None:
+        # Regular blockquote
+        return [f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>'], i
+
+    # UNIR callout box style
+    callout = f'''<div style='mso-element:para-border-div;border-top:solid #0098CD 1.0pt;border-left:none;border-bottom:solid #0098CD 1.0pt;border-right:none;mso-border-top-alt:solid #0098CD .5pt;mso-border-bottom-alt:solid #0098CD .5pt;padding:4.0pt 0cm 4.0pt 0cm;background:#E6F4F9'>
+<p class=MsoNormal style='background:#E6F4F9;border:none;padding:0cm;margin:0cm'><b><span lang=ES>{label}</span></b><span lang=ES> {md_to_html_para(content)}</span></p>
+</div>'''
+    return [callout], i
+
+
+def handle_bullet_list(lines, i):
+    """Render a run of markdown bullet items (-, *, +) as Word list paragraphs.
+
+    Blank lines between items are skipped. The run ends at the first
+    non-blank line that is not a bullet item, or at the end of the input.
+
+    Args:
+        lines: List of markdown lines.
+        i: Index of the first bullet line.
+
+    Returns:
+        Tuple of (html_blocks, new_index), where new_index is the index of
+        the first non-blank line after the list (or len(lines)).
+    """
+    total_lines = len(lines)
+    rendered = []
+
+    while True:
+        # Skip blank lines
+        while i < total_lines and not lines[i].strip():
+            i += 1
+        if i >= total_lines or not re.match(r'^[\-\*\+]\s', lines[i]):
+            break
+        # Drop the marker and the character after it, then convert the text.
+        body = convert_latex_formulas(lines[i][2:].strip())
+        rendered.append(md_to_html_para(body))
+        i += 1
+
+    # Word expects First/Middle/Last paragraph classes on multi-item lists
+    # and the plain class on a single-item list.
+    if len(rendered) == 1:
+        classes = ['MsoListParagraph']
+    else:
+        classes = (
+            ['MsoListParagraphCxSpFirst']
+            + ['MsoListParagraphCxSpMiddle'] * (len(rendered) - 2)
+            + ['MsoListParagraphCxSpLast']
+        )
+
+    # zip stops at the shorter sequence, so an empty list emits nothing.
+    html_blocks = [
+        f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{item}</span></p>'
+        for cls, item in zip(classes, rendered)
+    ]
+
+    return html_blocks, i
+
+
+def handle_numbered_list(lines, i):
+    """Render a run of markdown numbered items (1., 2., ...) as Word list paragraphs.
+
+    Blank lines between items are skipped. Lines following an item are folded
+    into that item, separated by ``<br/>``, until a stop condition is met
+    (see the inline comments). Items are renumbered sequentially from 1 in
+    the output; the numbers written in the markdown are discarded.
+
+    Args:
+        lines: List of markdown lines
+        i: Current line index (pointing to first numbered item)
+
+    Returns:
+        Tuple of (html_blocks, new_index), where new_index is the index of
+        the first line that was not consumed by the list
+    """
+    html_blocks = []
+    numbered_items = []
+
+    while i < len(lines):
+        # Skip blank lines
+        while i < len(lines) and not lines[i].strip():
+            i += 1
+        # Check if next non-blank line is a numbered item
+        if i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
+            # Strip the "N." marker; only the item text is kept.
+            item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
+            i += 1
+            # Collect any nested/indented content
+            nested_lines = []
+            while i < len(lines):
+                current = lines[i]
+                # Stop conditions: the next numbered item, a heading, a code
+                # fence, or a bold table/figure caption ends this item.
+                if re.match(r'^\d+\.\s', current):
+                    break
+                if current.startswith('#'):
+                    break
+                if current.startswith('```'):
+                    break
+                if current.startswith('**Tabla') or current.startswith('**Figura'):
+                    break
+                # A non-blank line that is neither indented nor a '-' bullet
+                # ends the item only once nested content has been collected.
+                # NOTE(review): `not current.strip()` below can never be true
+                # here because the outer condition requires a non-blank line,
+                # so the first such line right after the item is absorbed as
+                # nested content (even across blank lines) - confirm intent.
+                if current.strip() and not current.startswith(' ') and not current.startswith('\t') and not current.startswith('-'):
+                    if nested_lines or not current.strip():
+                        break
+                if current.strip():
+                    cleaned = current.strip()
+                    # Drop the bullet marker of nested '- ' sub-items.
+                    if cleaned.startswith('- '):
+                        cleaned = cleaned[2:]
+                    nested_lines.append(cleaned)
+                # Blank lines inside an item are skipped without ending it.
+                i += 1
+            # Combine item with nested content
+            if nested_lines:
+                item_text = item_text + '<br/>' + '<br/>'.join(nested_lines)
+            item_text = convert_latex_formulas(item_text)
+            numbered_items.append(md_to_html_para(item_text))
+        else:
+            break
+
+    # Output with proper First/Middle/Last classes
+    for idx, item in enumerate(numbered_items):
+        # Numbering restarts at 1 regardless of the numbers in the markdown.
+        num = idx + 1
+        if len(numbered_items) == 1:
+            cls = 'MsoListParagraph'
+        elif idx == 0:
+            cls = 'MsoListParagraphCxSpFirst'
+        elif idx == len(numbered_items) - 1:
+            cls = 'MsoListParagraphCxSpLast'
+        else:
+            cls = 'MsoListParagraphCxSpMiddle'
+        html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt">&nbsp;&nbsp;&nbsp;</span>{item}</span></p>')
+
+    return html_blocks, i