Cross references

2026-02-04 20:43:50 +01:00
parent 868f748a8d
commit e9c937a042
18 changed files with 1118 additions and 820 deletions
--- a/apply_content.py
+++ b/apply_content.py
@@ -1,136 +1,62 @@
 #!/usr/bin/env python3
-"""Replace template content with thesis content from docs/ folder using BeautifulSoup."""
+"""Replace template content with thesis content from docs/ folder using BeautifulSoup.
+
+This module orchestrates the conversion of markdown documentation to UNIR's
+Word template format. Content handling is delegated to:
+- markdown_utils.py: Utility functions for markdown parsing
+- content_handlers.py: Block-level content handlers (tables, figures, lists, etc.)
+"""

 import re
 import os
 import shutil
 from bs4 import BeautifulSoup, NavigableString
-from latex2mathml.converter import convert as latex_to_mathml
-from PIL import Image
+
+from markdown_utils import (
+    read_file,
+    write_file,
+    md_to_html_para,
+    convert_latex_formulas,
+    is_source_line,
+    is_leyenda_line,
+    split_into_paragraphs,
+    SOURCE_LINE_RE,
+)
+from content_handlers import (
+    handle_mermaid_diagram,
+    handle_code_block,
+    handle_header,
+    handle_table,
+    handle_blockquote,
+    handle_bullet_list,
+    handle_numbered_list,
+)

 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
 TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
 DOCS_DIR = os.path.join(BASE_DIR, 'docs')

-# Accept Fuente/Source lines with or without markdown bold
-SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
-# Accept Leyenda lines with or without markdown bold
-LEYENDA_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?Leyenda:(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)

-# Global counters for tables and figures
-table_counter = 0
-figure_counter = 0
-anexo_table_counter = 0
-anexo_figure_counter = 0
-# Global sequential counter for figure filenames (figura_1.png, figura_2.png, etc.)
-global_figure_index = 0
+def parse_md_to_html_blocks(md_content, is_anexo=False, counters=None):
+    """Convert markdown content to HTML blocks with template styles.

-def read_file(path):
-    try:
-        with open(path, 'r', encoding='utf-8') as f:
-            return f.read()
-    except UnicodeDecodeError:
-        with open(path, 'r', encoding='latin-1') as f:
-            return f.read()
+    Args:
+        md_content: Markdown content string
+        is_anexo: Boolean indicating if processing Anexo section
+        counters: Dict with table/figure counters. If None, a new one is created.

-def write_file(path, content):
-    with open(path, 'w', encoding='utf-8') as f:
-        f.write(content)
-
-def md_to_html_para(text):
-    """Convert markdown inline formatting to HTML."""
-    # Bold
-    text = re.sub(r'\*\*([^*]+)\*\*', r'<b>\1</b>', text)
-    # Italic
-    text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
-    # Inline code
-    text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt">\1</span>', text)
-    # Links [text](url) -> <a href="url">text</a>
-    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
-    return text
-
-def convert_latex_formulas(text):
-    """Convert LaTeX formulas to MathML for Word compatibility."""
-    # Block formulas $$...$$
-    def convert_block(match):
-        latex = match.group(1)
-        try:
-            mathml = latex_to_mathml(latex, display="block")
-            return f'<p class=MsoNormal style="text-align:center">{mathml}</p>'
-        except:
-            return match.group(0)  # Keep original if conversion fails
-
-    text = re.sub(r'\$\$([^$]+)\$\$', convert_block, text)
-
-    # Inline formulas $...$
-    def convert_inline(match):
-        latex = match.group(1)
-        try:
-            return latex_to_mathml(latex, display="inline")
-        except:
-            return match.group(0)
-
-    text = re.sub(r'\$([^$]+)\$', convert_inline, text)
-    return text
-
-def extract_source_from_line(line):
-    """Return source text if line is a Fuente/Source line, otherwise None."""
-    match = SOURCE_LINE_RE.match(line.strip())
-    if not match:
-        return None
-    return match.group(2).strip()
-
-def is_source_line(line):
-    """Check whether a line starts with Fuente:/Source: (optionally bold)."""
-    return SOURCE_LINE_RE.match(line.strip()) is not None
-
-def extract_leyenda_from_line(line):
-    """Return leyenda text if line is a Leyenda line, otherwise None."""
-    match = LEYENDA_LINE_RE.match(line.strip())
-    if not match:
-        return None
-    return match.group(1).strip()
-
-def is_leyenda_line(line):
-    """Check whether a line starts with Leyenda: (optionally bold)."""
-    return LEYENDA_LINE_RE.match(line.strip()) is not None
-
-def extract_table_title(lines, current_index):
-    """Look for table title in preceding lines (e.g., **Tabla 1.** *Title*)."""
-    # Check previous non-empty lines for table title
-    for i in range(current_index - 1, max(0, current_index - 5), -1):
-        line = lines[i].strip()
-        if line.startswith('**Tabla') or line.startswith('*Tabla'):
-            return line
-        if line and not line.startswith('|'):
-            break
-    return None
-
-def extract_figure_title_from_mermaid(lines, current_index):
-    """Extract title from mermaid diagram or preceding text."""
-    # Look for title in mermaid content
-    for i in range(current_index + 1, min(len(lines), current_index + 20)):
-        line = lines[i].strip()
-        if line.startswith('```'):
-            break
-        if 'title' in line.lower():
-            # Extract title from: title "Some Title"
-            match = re.search(r'title\s+["\']([^"\']+)["\']', line)
-            if match:
-                return match.group(1)
-
-    # Check preceding lines for figure reference
-    for i in range(current_index - 1, max(0, current_index - 3), -1):
-        line = lines[i].strip()
-        if line.startswith('**Figura') or 'Figura' in line:
-            return line
-
-    return None
-
-def parse_md_to_html_blocks(md_content, is_anexo=False):
-    """Convert markdown content to HTML blocks with template styles."""
-    global table_counter, figure_counter, anexo_table_counter, anexo_figure_counter, global_figure_index
+    Returns:
+        Tuple of (html_string, counters) where counters is the updated dict
+    """
+    if counters is None:
+        counters = {
+            'table': 0,
+            'figure': 0,
+            'anexo_table': 0,
+            'anexo_figure': 0,
+            'global_figure': 0,
+        }

    html_blocks = []
    lines = md_content.split('\n')
@@ -146,346 +72,49 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):

        # Mermaid diagram - convert to figure with actual image
        if line.strip().startswith('```mermaid'):
-            # Always increment global index for sequential filenames
-            global_figure_index += 1
-
-            # Use Anexo-specific counter with "A" prefix for display, or global counter
-            if is_anexo:
-                anexo_figure_counter += 1
-                fig_num = f"A{anexo_figure_counter}"  # Display number: A1, A2, A3...
-            else:
-                figure_counter += 1
-                fig_num = str(figure_counter)  # Display number: 1, 2, 3...
-
-            mermaid_lines = []
-            i += 1
-            while i < len(lines) and not lines[i].strip() == '```':
-                mermaid_lines.append(lines[i])
-                i += 1
-
-            # Try to extract title from mermaid content (YAML format)
-            mermaid_content = '\n'.join(mermaid_lines)
-            # Match title with quotes: title: "Something" or title: 'Something'
-            title_match = re.search(r'title:\s*["\']([^"\']+)["\']', mermaid_content)
-            if not title_match:
-                # Match title without quotes: title: Something
-                title_match = re.search(r'title:\s*([^"\'\n]+)', mermaid_content)
-            if title_match:
-                fig_title = title_match.group(1).strip()
-            else:
-                fig_title = f"Diagrama {fig_num}"
-
-            # Use global sequential index for filename (figura_1.png, figura_2.png, etc.)
-            fig_file = f'figures/figura_{global_figure_index}.png'
-            fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
-
-            # Create figure with MsoCaption class and proper Word SEQ field for cross-reference
-            # Format: "Figura X." in bold, title in italic (per UNIR guidelines)
-            # Word TOC looks for text with Caption style - anchor must be outside main caption text
-            bookmark_id = f"_Ref_Fig{fig_num}"
-            # mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
-            # For Anexo figures, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
-            # Add TC field so Anexo figures appear in Table of Figures index
-            # Use \f c to match the TOC field identifier in the template
-            if is_anexo:
-                tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Figura {fig_num}. {fig_title}" \\f c \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
-                html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
-            else:
-                html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
-
-            if os.path.exists(fig_path):
-                # Read actual image dimensions and scale to fit page width
-                img = Image.open(fig_path)
-                orig_w, orig_h = img.size
-
-                # Scale to fit max width of 566px (15cm at 96dpi) while preserving aspect ratio
-                max_width = 566
-                if orig_w > max_width:
-                    scale = max_width / orig_w
-                    new_w = max_width
-                    new_h = int(orig_h * scale)
-                else:
-                    new_w, new_h = orig_w, orig_h
-
-                # Convert to pt (1px at 96dpi = 0.75pt)
-                w_pt = new_w * 0.75
-                h_pt = new_h * 0.75
-
-                # mso-pagination:keep-with-next ensures image stays with source line
-                html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
-            else:
-                # Fallback to placeholder
-                # mso-pagination:keep-with-next ensures placeholder stays with source line
-                html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
-
-            # Check if next non-empty line has custom Fuente
-            custom_source = None
-            fig_leyenda = None
-            lookahead = i + 1
-            while lookahead < len(lines) and not lines[lookahead].strip():
-                lookahead += 1
-            if lookahead < len(lines):
-                next_line = lines[lookahead].strip()
-                if is_source_line(next_line):
-                    # Extract custom source, removing markdown formatting
-                    custom_source = extract_source_from_line(next_line)
-                    # Ensure it ends with a period
-                    if custom_source and not custom_source.endswith('.'):
-                        custom_source += '.'
-                    # Skip this line by advancing i past it
-                    i = lookahead
-                    # Check for Leyenda after source
-                    leyenda_idx = i + 1
-                    while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
-                        leyenda_idx += 1
-                    if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
-                        fig_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
-                        i = leyenda_idx
-
-            if custom_source:
-                source_html = md_to_html_para(custom_source)
-                html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: {source_html}</span></p>''')
-            else:
-                html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')
-
-            # Add leyenda if present (same style as Fuente, new line)
-            if fig_leyenda:
-                leyenda_html = md_to_html_para(fig_leyenda)
-                if not fig_leyenda.endswith('.'):
-                    leyenda_html += '.'
-                html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Leyenda: {leyenda_html}</span></p>''')
-
-            html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')
-            i += 1
+            blocks, i = handle_mermaid_diagram(lines, i, counters, is_anexo)
+            html_blocks.extend(blocks)
            continue

        # Code block (non-mermaid)
        if line.strip().startswith('```'):
-            code_lang = line.strip()[3:]
-            code_lines = []
-            i += 1
-            while i < len(lines) and not lines[i].strip().startswith('```'):
-                code_lines.append(lines[i])
-                i += 1
-            code = '\n'.join(code_lines)
-            # Escape HTML entities in code
-            code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
-            html_blocks.append(f'''<div style="background:#E6F4F9;border-top:solid #0098CD .5pt;border-bottom:solid #0098CD .5pt;padding:8pt 12pt;margin:6pt 0">
-<pre style="font-family:Consolas,monospace;font-size:9pt;color:#333333;margin:0;white-space:pre-wrap;word-wrap:break-word">{code}</pre>
-</div>''')
+            blocks, i = handle_code_block(lines, i)
+            html_blocks.extend(blocks)
+            continue
+
+        # Headers
+        if line.startswith('#'):
+            header_html = handle_header(line, is_anexo)
+            if header_html is not None:
+                html_blocks.append(header_html)
            i += 1
            continue

-        # Headers - ## becomes h2, ### becomes h3
-        if line.startswith('####'):
-            text = line.lstrip('#').strip()
-            # Apply consistent styling like h2/h3, disable numbering for h4
-            html_blocks.append(f'<h4 style="mso-list:none"><b><span lang=ES style="text-transform:none">{text}</span></b></h4>')
-            i += 1
-            continue
-        elif line.startswith('###'):
-            text = line.lstrip('#').strip()
-            # Disable auto-numbering for Anexo content or A.x headings
-            if is_anexo or re.match(r'^A\.\d+', text):
-                # mso-list:none explicitly disables inherited list numbering from template CSS
-                html_blocks.append(f'<h3 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h3>')
-            else:
-                html_blocks.append(f'<h3 style="mso-list:l22 level3 lfo18"><span lang=ES style="text-transform:none">{text}</span></h3>')
-            i += 1
-            continue
-        elif line.startswith('##'):
-            text = line.lstrip('#').strip()
-            # Disable auto-numbering for Anexo content or A.x headings
-            if is_anexo or re.match(r'^A\.\d+', text):
-                # mso-list:none explicitly disables inherited list numbering from template CSS
-                html_blocks.append(f'<h2 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h2>')
-            else:
-                html_blocks.append(f'<h2 style="mso-list:l22 level2 lfo18"><span lang=ES style="text-transform:none">{text}</span></h2>')
-            i += 1
-            continue
-        elif line.startswith('#'):
-            # Skip h1 - we keep the original
-            i += 1
-            continue
-
-        # Table - check for table title pattern first
+        # Table
        if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
-            # Use Anexo-specific counter with "A" prefix, or global counter
-            if is_anexo:
-                anexo_table_counter += 1
-                table_num = f"A{anexo_table_counter}"
-            else:
-                table_counter += 1
-                table_num = str(table_counter)
-
-            # Check if previous line has table title (e.g., **Tabla 1.** *Title*)
-            table_title = None
-            alt_title = None  # Alternative title from **bold text:** pattern
-            table_source = "Elaboración propia"
-
-            # Look back for table title
-            for j in range(i - 1, max(0, i - 5), -1):
-                prev_line = lines[j].strip()
-                if prev_line.startswith('**Tabla') or prev_line.startswith('*Tabla'):
-                    # Extract title text
-                    table_title = re.sub(r'\*+', '', prev_line).strip()
-                    break
-                elif prev_line.startswith('**') and prev_line.endswith(':**'):
-                    # Alternative: **Bold title:** pattern (for informal tables)
-                    alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
-                elif prev_line and not prev_line.startswith('|'):
-                    break
-
-            # Parse table
-            table_lines = []
-            while i < len(lines) and '|' in lines[i]:
-                if '---' not in lines[i]:
-                    table_lines.append(lines[i])
-                i += 1
-
-            # Look ahead for source (skip blank lines first)
-            source_idx = i
-            table_leyenda = None
-            while source_idx < len(lines) and not lines[source_idx].strip():
-                source_idx += 1
-            if source_idx < len(lines) and is_source_line(lines[source_idx]):
-                table_source = extract_source_from_line(lines[source_idx])
-                i = source_idx + 1
-                # Check for Leyenda after source (skip blank lines)
-                leyenda_idx = i
-                while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
-                    leyenda_idx += 1
-                if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
-                    table_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
-                    i = leyenda_idx + 1
-
-            # Add table title with MsoCaption class and proper Word SEQ field for cross-reference
-            # Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
-            # Word TOC looks for text with Caption style - anchor must be outside main caption text
-            bookmark_id = f"_Ref_Tab{table_num}"
-            if table_title:
-                # Remove any "Tabla X." or "Tabla AX." pattern from the title
-                clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
-            elif alt_title:
-                # Use alternative title from **bold text:** pattern
-                clean_title = alt_title
-            else:
-                clean_title = "Tabla de datos."
-            # mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
-            # For Anexo tables, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
-            # Add TC field so Anexo tables appear in Table of Tables index
-            # Use \f t identifier - template TOC field will be modified to include this
-            if is_anexo:
-                tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Tabla {table_num}. {clean_title}" \\f t \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
-                html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
-            else:
-                html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
-
-            # Build table HTML with APA style (horizontal lines only, no vertical)
-            table_html = '<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
-            for j, tline in enumerate(table_lines):
-                cells = [c.strip() for c in tline.split('|')[1:-1]]
-                table_html += '<tr>'
-                for cell in cells:
-                    if j == 0:
-                        # Header row: top and bottom border, bold text
-                        table_html += f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
-                    elif j == len(table_lines) - 1:
-                        # Last row: bottom border only
-                        table_html += f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
-                    else:
-                        # Middle rows: no borders
-                        table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
-                table_html += '</tr>'
-            table_html += '</table></div>'
-            html_blocks.append(table_html)
-
-            # Add source with proper template format (convert markdown links to HTML)
-            source_html = md_to_html_para(table_source)
-            if not table_source.endswith('.'):
-                source_html += '.'
-            html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')
-
-            # Add leyenda if present (same style as Fuente, new line)
-            if table_leyenda:
-                leyenda_html = md_to_html_para(table_leyenda)
-                if not table_leyenda.endswith('.'):
-                    leyenda_html += '.'
-                html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Leyenda: {leyenda_html}</span></p>')
-
-            html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')
+            blocks, i = handle_table(lines, i, counters, is_anexo)
+            html_blocks.extend(blocks)
            continue

        # Blockquote
        if line.startswith('>'):
-            quote_text = line[1:].strip()
-            i += 1
-            while i < len(lines) and lines[i].startswith('>'):
-                quote_text += ' ' + lines[i][1:].strip()
-                i += 1
-            html_blocks.append(f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>')
+            blocks, i = handle_blockquote(lines, i)
+            html_blocks.extend(blocks)
            continue

-        # Bullet list (handle blank lines between items)
+        # Bullet list
        if re.match(r'^[\-\*\+]\s', line):
-            # Collect all bullet items first
-            bullet_items = []
-            while i < len(lines):
-                # Skip blank lines
-                while i < len(lines) and not lines[i].strip():
-                    i += 1
-                # Check if next non-blank line is a bullet item
-                if i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
-                    item_text = lines[i][2:].strip()
-                    item_text = convert_latex_formulas(item_text)
-                    bullet_items.append(md_to_html_para(item_text))
-                    i += 1
-                else:
-                    break
-            # Output with proper First/Middle/Last classes
-            for idx, item in enumerate(bullet_items):
-                if len(bullet_items) == 1:
-                    cls = 'MsoListParagraph'
-                elif idx == 0:
-                    cls = 'MsoListParagraphCxSpFirst'
-                elif idx == len(bullet_items) - 1:
-                    cls = 'MsoListParagraphCxSpLast'
-                else:
-                    cls = 'MsoListParagraphCxSpMiddle'
-                html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{item}</span></p>')
+            blocks, i = handle_bullet_list(lines, i)
+            html_blocks.extend(blocks)
            continue

-        # Numbered list (handle blank lines between items)
+        # Numbered list
        if re.match(r'^\d+\.\s', line):
-            # Collect all numbered items first
-            numbered_items = []
-            while i < len(lines):
-                # Skip blank lines
-                while i < len(lines) and not lines[i].strip():
-                    i += 1
-                # Check if next non-blank line is a numbered item
-                if i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
-                    item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
-                    item_text = convert_latex_formulas(item_text)
-                    numbered_items.append(md_to_html_para(item_text))
-                    i += 1
-                else:
-                    break
-            # Output with proper First/Middle/Last classes
-            for idx, item in enumerate(numbered_items):
-                num = idx + 1
-                if len(numbered_items) == 1:
-                    cls = 'MsoListParagraph'
-                elif idx == 0:
-                    cls = 'MsoListParagraphCxSpFirst'
-                elif idx == len(numbered_items) - 1:
-                    cls = 'MsoListParagraphCxSpLast'
-                else:
-                    cls = 'MsoListParagraphCxSpMiddle'
-                html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt">&nbsp;&nbsp;&nbsp;</span>{item}</span></p>')
+            blocks, i = handle_numbered_list(lines, i)
+            html_blocks.extend(blocks)
            continue

-        # Skip lines that are just table/figure titles (they'll be handled with the table/figure)
+        # Skip lines that are just table/figure titles
        if line.strip().startswith('**Tabla') or line.strip().startswith('*Tabla'):
            i += 1
            continue
@@ -514,12 +143,23 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
        else:
            html_blocks.append(f'<p class=MsoNormal><span lang=ES>{md_to_html_para(para_text)}</span></p>')

-    return '\n\n'.join(html_blocks)
+    return '\n\n'.join(html_blocks), counters

-def extract_section_content(md_content, is_anexo=False):
-    """Extract content from markdown, skipping the first # header."""
+
+def extract_section_content(md_content, is_anexo=False, counters=None):
+    """Extract content from markdown, skipping the first # header.
+
+    Args:
+        md_content: Markdown content string
+        is_anexo: Boolean indicating if processing Anexo section
+        counters: Dict with table/figure counters. If None, a new one is created.
+
+    Returns:
+        Tuple of (html_string, counters)
+    """
    md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
-    return parse_md_to_html_blocks(md_content, is_anexo=is_anexo)
+    return parse_md_to_html_blocks(md_content, is_anexo=is_anexo, counters=counters)
+

 def find_section_element(soup, keyword):
    """Find element containing keyword (h1 or special paragraph classes)."""
@@ -540,6 +180,7 @@ def find_section_element(soup, keyword):
                return p
    return None

+
 def remove_elements_between(start_elem, end_elem):
    """Remove all elements between start and end (exclusive)."""
    current = start_elem.next_sibling
@@ -553,6 +194,7 @@ def remove_elements_between(start_elem, end_elem):
        elif isinstance(elem, NavigableString):
            elem.extract()

+
 def format_references(refs_content):
    """Format references with proper MsoBibliography style."""
    refs_content = refs_content.replace('# Referencias bibliográficas {.unnumbered}', '').strip()
@@ -566,20 +208,11 @@ def format_references(refs_content):
        # Apply markdown formatting
        formatted = md_to_html_para(line)

-        # Use MsoBibliography style with hanging indent (36pt indent, -36pt text-indent)
+        # Use MsoBibliography style with hanging indent
        refs_html += f'''<p class=MsoBibliography style="margin-left:36.0pt;text-indent:-36.0pt"><span lang=ES>{formatted}</span></p>\n'''

    return refs_html

-def split_into_paragraphs(text, lang='ES'):
-    """Split text by double newlines and wrap each paragraph in <p> tags."""
-    paragraphs = []
-    for para in text.split('\n\n'):
-        para = para.strip()
-        if para:
-            formatted = md_to_html_para(para)
-            paragraphs.append(f'<p class=MsoNormal><span lang={lang}>{formatted}</span></p>')
-    return '\n'.join(paragraphs)

 def extract_resumen_parts(resumen_content):
    """Extract Spanish resumen and English abstract from 00_resumen.md"""
@@ -610,16 +243,21 @@ def extract_resumen_parts(resumen_content):

    return spanish_text, spanish_keywords, english_text, english_keywords

+
 def main():
-    global table_counter, figure_counter, anexo_table_counter, anexo_figure_counter
+    # Initialize counters dict (replaces global counters)
+    counters = {
+        'table': 0,
+        'figure': 0,
+        'anexo_table': 0,
+        'anexo_figure': 0,
+        'global_figure': 0,
+    }

    print("Reading template...")
    html_content = read_file(TEMPLATE_INPUT)

    # Modify the Table of Tables TOC field to include TC entries with \f t identifier
-    # Original: TOC \h \z \t "Tablas;1" \c "Tabla"
-    # Modified: TOC \f t \h \z \t "Tablas;1" \c "Tabla"
-    # Use regex to handle whitespace/HTML variations in the TOC field
    html_content = re.sub(
        r'(TOC\s+)(\\h\s+\\z\s+\\t\s*\n?\s*&quot;Tablas;1&quot;)',
        r'\1\\f t \2',
@@ -652,7 +290,6 @@ def main():
    print("Replacing Resumen...")
    resumen_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Resumen'))
    if resumen_title:
-        # Find and replace content after Resumen title until Abstract
        current = resumen_title.find_next_sibling()
        elements_to_remove = []
        while current:
@@ -666,7 +303,6 @@ def main():
            if hasattr(elem, 'decompose'):
                elem.decompose()

-        # Insert new resumen content (spanish_text already contains <p> tags)
        resumen_html = f'''{spanish_text}
 <p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>
 <p class=MsoNormal><b><span lang=ES>Palabras clave:</span></b><span lang=ES> {spanish_kw}</span></p>
@@ -681,11 +317,9 @@ def main():
    print("Replacing Abstract...")
    abstract_title = soup.find('p', class_='Ttulondices', string=re.compile(r'Abstract'))
    if abstract_title:
-        # Find and replace content after Abstract title until next major section
        current = abstract_title.find_next_sibling()
        elements_to_remove = []
        while current:
-            # Stop at page break or next title
            if current.name == 'span' and 'page-break' in str(current):
                break
            text = current.get_text() if hasattr(current, 'get_text') else str(current)
@@ -698,7 +332,6 @@ def main():
            if hasattr(elem, 'decompose'):
                elem.decompose()

-        # Insert new abstract content (english_text already contains <p> tags)
        abstract_html = f'''{english_text}
 <p class=MsoNormal><span lang=EN-US><o:p>&nbsp;</o:p></span></p>
 <p class=MsoNormal><b><span lang=EN-US>Keywords:</span></b><span lang=EN-US> {english_kw}</span></p>
@@ -721,31 +354,24 @@ def main():
    for elem in soup.find_all(string=re.compile(r'Ejemplo de nota al pie')):
        parent = elem.parent
        if parent:
-            # Find the footnote container and remove it
            while parent and parent.name != 'p':
                parent = parent.parent
            if parent:
                parent.decompose()
                print("    ✓ Removed footnote example")

-    # Clear old figure/table index entries (they need to be regenerated in Word)
+    # Clear the template's example figure/table index entries (the indices are regenerated in Word)
    print("Clearing old index entries...")

-    # Remove ALL content from MsoTof paragraphs that reference template examples
-    # The indices will be regenerated when user opens in Word and presses Ctrl+A, F9
    for p in soup.find_all('p', class_='MsoTof'):
        text = p.get_text()
-        # Check for figure index entries with template examples
        if 'Figura' in text and 'Ejemplo' in text:
-            # Remove all <a> tags (the actual index entry links)
            for a in p.find_all('a'):
                a.decompose()
-            # Also remove any remaining text content that shows the example
            for span in p.find_all('span', style=lambda x: x and 'mso-no-proof' in str(x)):
                if 'Ejemplo' in span.get_text():
                    span.decompose()
            print("    ✓ Cleared figure index example entry")
-        # Check for table index entries with template examples
        if 'Tabla' in text and 'Ejemplo' in text:
            for a in p.find_all('a'):
                a.decompose()
@@ -754,24 +380,20 @@ def main():
                    span.decompose()
            print("    ✓ Cleared table index example entry")

-    # Remove old figure index entries that reference template examples
    for p in soup.find_all('p', class_='MsoToc3'):
        text = p.get_text()
        if 'Figura 1. Ejemplo' in text or 'Tabla 1. Ejemplo' in text:
            p.decompose()
            print("    ✓ Removed template index entry")

-    # Also clear the specific figure/table from template
    for p in soup.find_all('p', class_='Imagencentrada'):
        p.decompose()
        print("    ✓ Removed template figure placeholder")

    # Remove template table example
    for table in soup.find_all('table', class_='MsoTableGrid'):
-        # Check if this is the template example table
        text = table.get_text()
        if 'Celda 1' in text or 'Encabezado 1' in text:
-            # Also remove surrounding caption and source
            prev_sib = table.find_previous_sibling()
            next_sib = table.find_next_sibling()
            if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
@@ -782,7 +404,7 @@ def main():
            print("    ✓ Removed template table example")
            break

-    # Define chapters with their keywords and next chapter keywords
+    # Define chapters as (heading keyword, docs key, next chapter's heading keyword)
    chapters = [
        ('Introducción', 'intro', 'Contexto'),
        ('Contexto', 'contexto', 'Objetivos'),
@@ -795,16 +417,12 @@ def main():
    for chapter_keyword, doc_key, next_keyword in chapters:
        print(f"  Processing: {chapter_keyword}")

-        # Reset counters for consistent numbering per chapter (optional - remove if you want global numbering)
-        # table_counter = 0
-        # figure_counter = 0
-
        start_elem = find_section_element(soup, chapter_keyword)
        end_elem = find_section_element(soup, next_keyword)

        if start_elem and end_elem:
            remove_elements_between(start_elem, end_elem)
-            new_content_html = extract_section_content(docs[doc_key])
+            new_content_html, counters = extract_section_content(docs[doc_key], counters=counters)
            new_soup = BeautifulSoup(new_content_html, 'html.parser')
            insert_point = start_elem
            for new_elem in reversed(list(new_soup.children)):
@@ -844,20 +462,20 @@ def main():
                    current.extract()
                current = next_elem

-            anexo_content = extract_section_content(docs['anexo'], is_anexo=True)
+            anexo_content, counters = extract_section_content(docs['anexo'], is_anexo=True, counters=counters)
            anexo_soup = BeautifulSoup(anexo_content, 'html.parser')
            insert_point = anexo_elem
            for new_elem in reversed(list(anexo_soup.children)):
                insert_point.insert_after(new_elem)
            print(f"    ✓ Replaced content")

-    print(f"\nSummary: {table_counter} tables + {anexo_table_counter} Anexo tables, {figure_counter} figures + {anexo_figure_counter} Anexo figures processed")
+    print(f"\nSummary: {counters['table']} tables + {counters['anexo_table']} Anexo tables, {counters['figure']} figures + {counters['anexo_figure']} Anexo figures processed")

    print("Saving modified template...")
    output_html = str(soup)
    write_file(TEMPLATE_OUTPUT, output_html)

-    # Copy template support files (header.htm, images, etc.)
+    # Copy template support files
    support_files_src = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')
    support_files_dst = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual_files')
    if os.path.exists(support_files_src):
@@ -874,5 +492,6 @@ def main():
    print("   - This will regenerate: Índice de contenidos, Índice de figuras, Índice de tablas")
    print("4. Save as .docx")

+
 if __name__ == '__main__':
    main()