#!/usr/bin/env python3
"""Content block handlers for markdown to HTML conversion.

NOTE(review): the string templates in this module contain only text and
blank lines -- no HTML tags -- while several computed values
(``bookmark_id``, ``w_pt``/``h_pt``, ``cls``, ``code_lang``, ``tc_field``)
are never interpolated. This looks like markup was stripped from the
literals at some point; confirm against the upstream copy and restore the
tags there. The templates are kept text-for-text here, except where that
made the module unparseable or silently dropped data.
"""

import os
import re

from PIL import Image

from markdown_utils import (
    md_to_html_para,
    convert_latex_formulas,
    is_source_line,
    extract_source_from_line,
    is_leyenda_line,
    extract_leyenda_from_line,
)

# Base directory for resolving paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def _list_paragraph_class(idx, total):
    """Return the list paragraph class name for item ``idx`` of ``total``."""
    if total == 1:
        return 'MsoListParagraph'
    if idx == 0:
        return 'MsoListParagraphCxSpFirst'
    if idx == total - 1:
        return 'MsoListParagraphCxSpLast'
    return 'MsoListParagraphCxSpMiddle'


def handle_mermaid_diagram(lines, i, counters, is_anexo):
    """Handle mermaid diagram block, converting to figure with image.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to ```mermaid)
        counters: Dict with 'table', 'figure', 'anexo_table',
            'anexo_figure', 'global_figure'; mutated in place
        is_anexo: Boolean indicating if processing Anexo section

    Returns:
        Tuple of (html_blocks, new_index)
    """
    html_blocks = []

    # Always increment global index for sequential filenames
    counters['global_figure'] += 1

    # Use Anexo-specific counter with "A" prefix for display, or global counter
    if is_anexo:
        counters['anexo_figure'] += 1
        fig_num = f"A{counters['anexo_figure']}"
    else:
        counters['figure'] += 1
        fig_num = str(counters['figure'])

    # Collect the diagram body up to the closing fence (or end of input)
    mermaid_lines = []
    i += 1
    while i < len(lines) and lines[i].strip() != '```':
        mermaid_lines.append(lines[i])
        i += 1

    # Try to extract title from mermaid content (YAML format)
    mermaid_content = '\n'.join(mermaid_lines)
    # Match title with quotes: title: "Something" or title: 'Something'
    title_match = re.search(r'title:\s*["\']([^"\']+)["\']', mermaid_content)
    if not title_match:
        # Match title without quotes: title: Something
        title_match = re.search(r'title:\s*([^"\'\n]+)', mermaid_content)
    if title_match:
        fig_title = title_match.group(1).strip()
    else:
        fig_title = f"Diagrama {fig_num}"

    # Use global sequential index for filename
    fig_file = f'figures/figura_{counters["global_figure"]}.png'
    fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)

    # Create figure with MsoCaption class and proper Word SEQ field
    # NOTE(review): bookmark_id and tc_field are not used by the templates
    # below; presumably the caption markup that consumed them was lost.
    bookmark_id = f"_Ref_Fig{fig_num}"
    if is_anexo:
        tc_field = ''
        html_blocks.append(f'{tc_field}\n\nFigura {fig_num}. {fig_title}\n\n')
    else:
        html_blocks.append(f'\n\nFigura {fig_num}. {fig_title}\n\n')

    if os.path.exists(fig_path):
        # Read actual image dimensions; the context manager closes the file
        # handle as soon as the size has been read.
        with Image.open(fig_path) as img:
            orig_w, orig_h = img.size
        # Scale to fit max width of 566px (15cm at 96dpi) while preserving
        # aspect ratio
        max_width = 566
        if orig_w > max_width:
            scale = max_width / orig_w
            new_w = max_width
            new_h = int(orig_h * scale)
        else:
            new_w, new_h = orig_w, orig_h
        # Convert to pt (1px at 96dpi = 0.75pt)
        # NOTE(review): w_pt/h_pt are not interpolated below; presumably
        # they belonged to the image markup that is missing here.
        w_pt = new_w * 0.75
        h_pt = new_h * 0.75
        html_blocks.append(f'\n\n${fig_title}$\n\n')
    else:
        # Fallback to placeholder
        html_blocks.append('\n\n[Insertar diagrama Mermaid aquí]\n\n')

    # Check if next non-empty line has custom Fuente
    custom_source = None
    fig_leyenda = None
    lookahead = i + 1
    while lookahead < len(lines) and not lines[lookahead].strip():
        lookahead += 1
    if lookahead < len(lines):
        next_line = lines[lookahead].strip()
        if is_source_line(next_line):
            custom_source = extract_source_from_line(next_line)
            if custom_source and not custom_source.endswith('.'):
                custom_source += '.'
            i = lookahead
            # Check for Leyenda after source
            # NOTE(review): nesting inferred from the "after source" comment;
            # confirm a Leyenda is only consumed when a Fuente line precedes it.
            leyenda_idx = i + 1
            while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
                leyenda_idx += 1
            if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
                fig_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
                i = leyenda_idx

    if custom_source:
        source_html = md_to_html_para(custom_source)
        html_blocks.append(f'\n\nFuente: {source_html}\n\n')
    else:
        html_blocks.append('\n\nFuente: Elaboración propia.\n\n')

    if fig_leyenda:
        leyenda_html = md_to_html_para(fig_leyenda)
        if not fig_leyenda.endswith('.'):
            leyenda_html += '.'
        html_blocks.append(f'\n\nLeyenda: {leyenda_html}\n\n')

    html_blocks.append('\n\n')
    # Step past the last consumed line (closing fence, Fuente or Leyenda)
    i += 1
    return html_blocks, i


def handle_code_block(lines, i):
    """Handle non-mermaid code block.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to ```)

    Returns:
        Tuple of (html_blocks, new_index)
    """
    html_blocks = []
    # NOTE(review): code_lang is read from the fence but not used in the
    # template below; kept in case the lost markup referenced it.
    code_lang = lines[i].strip()[3:]
    code_lines = []
    i += 1
    while i < len(lines) and not lines[i].strip().startswith('```'):
        code_lines.append(lines[i])
        i += 1
    code = '\n'.join(code_lines)
    # Escape HTML entities in code; '&' must go first so the entities
    # produced for '<' and '>' are not escaped a second time.
    code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    html_blocks.append(f'\n\n{code}\n\n')
    # Step past the closing fence
    i += 1
    return html_blocks, i


def handle_header(line, is_anexo):
    """Handle header lines (##, ###, ####).

    Args:
        line: The header line
        is_anexo: Boolean indicating if processing Anexo section

    Returns:
        HTML string for the header, or None if h1 (skip)
    """
    # NOTE(review): every branch below yields the same text template and
    # bookmark_html is always empty; the per-level markup (and the bookmark
    # anchor that used bookmark_id) appears to be missing. The branch
    # structure is kept so the markup can be restored in place.
    if line.startswith('####'):
        text = line.lstrip('#').strip()
        return f'\n\n{text}\n\n'
    elif line.startswith('###'):
        text = line.lstrip('#').strip()
        # Extract section number if present
        sec_match = re.match(r'^([\d\.]+)\s+', text)
        bookmark_html = ''
        if sec_match:
            sec_num = sec_match.group(1).rstrip('.')
            bookmark_id = f"_Ref_Sec{sec_num.replace('.', '_')}"
            bookmark_html = ''
        # Disable auto-numbering for Anexo content or A.x headings
        if is_anexo or re.match(r'^A\.\d+', text):
            return f'{bookmark_html}\n\n{text}\n\n'
        else:
            return f'{bookmark_html}\n\n{text}\n\n'
    elif line.startswith('##'):
        text = line.lstrip('#').strip()
        # Extract section number if present
        sec_match = re.match(r'^([\d\.]+)\s+', text)
        bookmark_html = ''
        if sec_match:
            sec_num = sec_match.group(1).rstrip('.')
            bookmark_id = f"_Ref_Sec{sec_num.replace('.', '_')}"
            bookmark_html = ''
        # Disable auto-numbering for Anexo content or A.x headings
        if is_anexo or re.match(r'^A\.\d+', text):
            return f'{bookmark_html}\n\n{text}\n\n'
        else:
            return f'{bookmark_html}\n\n{text}\n\n'
    elif line.startswith('#'):
        # Skip h1 - we keep the original
        return None
    return None


def handle_table(lines, i, counters, is_anexo):
    """Handle markdown table.

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to first table row)
        counters: Dict with table/figure counters; mutated in place
        is_anexo: Boolean indicating if processing Anexo section

    Returns:
        Tuple of (html_blocks, new_index)
    """
    html_blocks = []

    # Use Anexo-specific counter with "A" prefix, or global counter
    if is_anexo:
        counters['anexo_table'] += 1
        table_num = f"A{counters['anexo_table']}"
    else:
        counters['table'] += 1
        table_num = str(counters['table'])

    # Check if previous line has table title
    table_title = None
    alt_title = None
    table_source = "Elaboración propia"

    # Look back up to four lines for a table title. The stop value is
    # exclusive, so it must be -1 (not 0) for line 0 to be inspected.
    for j in range(i - 1, max(-1, i - 5), -1):
        prev_line = lines[j].strip()
        if prev_line.startswith('**Tabla') or prev_line.startswith('*Tabla'):
            table_title = re.sub(r'\*+', '', prev_line).strip()
            break
        elif prev_line.startswith('**') and prev_line.endswith(':**'):
            alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
        elif prev_line and not prev_line.startswith('|'):
            break

    # Parse table: keep every row except separator lines
    table_lines = []
    while i < len(lines) and '|' in lines[i]:
        if '---' not in lines[i]:
            table_lines.append(lines[i])
        i += 1

    # Look ahead for source
    source_idx = i
    table_leyenda = None
    while source_idx < len(lines) and not lines[source_idx].strip():
        source_idx += 1
    if source_idx < len(lines) and is_source_line(lines[source_idx]):
        # Fall back to the default when the extractor yields nothing, the
        # same way the mermaid handler treats an empty source.
        table_source = extract_source_from_line(lines[source_idx]) or table_source
        i = source_idx + 1
        # Check for Leyenda after source
        # NOTE(review): nesting inferred from the "after source" comment;
        # confirm a Leyenda is only consumed when a Fuente line precedes it.
        leyenda_idx = i
        while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
            leyenda_idx += 1
        if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
            table_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
            i = leyenda_idx + 1

    # Add table title with MsoCaption class
    # NOTE(review): bookmark_id and tc_field are not used by the templates
    # below; presumably the caption markup that consumed them was lost.
    bookmark_id = f"_Ref_Tab{table_num}"
    if table_title:
        clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
    elif alt_title:
        clean_title = alt_title
    else:
        clean_title = "Tabla de datos."
    if is_anexo:
        tc_field = ''
        html_blocks.append(f'{tc_field}\n\nTabla {table_num}. {clean_title}\n\n')
    else:
        html_blocks.append(f'\n\nTabla {table_num}. {clean_title}\n\n')

    # Build table HTML with APA style
    # NOTE(review): the per-cell templates were empty, so no cell content
    # reached the output. The markup below is a minimal reconstruction:
    # <th> for the header row, <td> elsewhere. The original also treated
    # the last row separately (probably a closing border) -- restore the
    # exact styling from the upstream copy.
    rows_html = []
    for j, tline in enumerate(table_lines):
        cells = [c.strip() for c in tline.split('|')[1:-1]]
        tag = 'th' if j == 0 else 'td'
        cells_html = ''.join(
            f'<{tag}>{md_to_html_para(cell)}</{tag}>' for cell in cells
        )
        rows_html.append(f'<tr>{cells_html}</tr>')
    table_html = '<table>\n' + '\n'.join(rows_html) + '\n</table>'
    html_blocks.append(table_html)

    # Add source
    source_html = md_to_html_para(table_source)
    if not table_source.endswith('.'):
        source_html += '.'
    html_blocks.append(f'\n\nFuente: {source_html}\n\n')

    # Add leyenda if present
    if table_leyenda:
        leyenda_html = md_to_html_para(table_leyenda)
        if not table_leyenda.endswith('.'):
            leyenda_html += '.'
        html_blocks.append(f'\n\nLeyenda: {leyenda_html}\n\n')

    html_blocks.append('\n\n')
    return html_blocks, i


def handle_blockquote(lines, i):
    """Handle blockquote (regular or Nota callout).

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to > line)

    Returns:
        Tuple of (html_blocks, new_index)
    """
    html_blocks = []
    # Join consecutive '>' lines into one space-separated string
    line = lines[i]
    quote_text = line[1:].strip()
    i += 1
    while i < len(lines) and lines[i].startswith('>'):
        quote_text += ' ' + lines[i][1:].strip()
        i += 1

    # Check if this is a Nota/Note callout
    if quote_text.startswith('**Nota:**') or quote_text.startswith('**Note:**'):
        # Both markers are 9 characters long, so the same slice drops either
        if quote_text.startswith('**Nota:**'):
            label = 'Nota:'
            content = quote_text[9:].strip()
        else:
            label = 'Note:'
            content = quote_text[9:].strip()
        # UNIR callout box style
        html_blocks.append(f'\n\n{label} {md_to_html_para(content)}\n\n')
    else:
        # Regular blockquote
        html_blocks.append(f'\n\n{md_to_html_para(quote_text)}\n\n')
    return html_blocks, i


def handle_bullet_list(lines, i):
    """Handle bullet list (-, *, +).

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to first bullet)

    Returns:
        Tuple of (html_blocks, new_index)
    """
    html_blocks = []
    bullet_items = []
    while i < len(lines):
        # Skip blank lines
        while i < len(lines) and not lines[i].strip():
            i += 1
        # Check if next non-blank line is a bullet item
        if i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
            item_text = lines[i][2:].strip()
            item_text = convert_latex_formulas(item_text)
            bullet_items.append(md_to_html_para(item_text))
            i += 1
        else:
            break

    # Output with proper First/Middle/Last classes
    # NOTE(review): cls is selected but not interpolated into the template;
    # presumably it belonged to the paragraph markup that is missing here.
    for idx, item in enumerate(bullet_items):
        cls = _list_paragraph_class(idx, len(bullet_items))
        html_blocks.append(f'\n\n· {item}\n\n')
    return html_blocks, i


def handle_numbered_list(lines, i):
    """Handle numbered list (1., 2., etc).

    Args:
        lines: List of markdown lines
        i: Current line index (pointing to first numbered item)

    Returns:
        Tuple of (html_blocks, new_index)
    """
    html_blocks = []
    numbered_items = []
    while i < len(lines):
        # Skip blank lines
        while i < len(lines) and not lines[i].strip():
            i += 1
        # Check if next non-blank line is a numbered item
        if i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
            item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
            i += 1
            # Collect any nested/indented content
            nested_lines = []
            while i < len(lines):
                current = lines[i]
                # Stop conditions
                if re.match(r'^\d+\.\s', current):
                    break
                if current.startswith('#'):
                    break
                if current.startswith('```'):
                    break
                if current.startswith(('**Tabla', '**Figura')):
                    break
                is_flush_text = (
                    current.strip()
                    and not current.startswith((' ', '\t', '-'))
                )
                # A flush-left text line ends the item once nested content
                # has been collected; before that it is taken as a
                # continuation of the item.
                if is_flush_text and nested_lines:
                    break
                if current.strip():
                    cleaned = current.strip()
                    if cleaned.startswith('- '):
                        cleaned = cleaned[2:]
                    nested_lines.append(cleaned)
                i += 1
            # Combine item with nested content
            if nested_lines:
                item_text = item_text + '\n' + '\n'.join(nested_lines)
            item_text = convert_latex_formulas(item_text)
            numbered_items.append(md_to_html_para(item_text))
        else:
            break

    # Output with proper First/Middle/Last classes
    # NOTE(review): cls is selected but not interpolated into the template;
    # presumably it belonged to the paragraph markup that is missing here.
    for idx, item in enumerate(numbered_items):
        num = idx + 1
        cls = _list_paragraph_class(idx, len(numbered_items))
        html_blocks.append(f'\n\n{num}. {item}\n\n')
    return html_blocks, i