Files
MastersThesis/content_handlers.py
sergio b91e31e173
Some checks failed
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Has started running
build_docker / build_doctr_gpu (push) Has been cancelled
build_docker / build_easyocr (push) Has been cancelled
build_docker / build_easyocr_gpu (push) Has been cancelled
build_docker / build_doctr (push) Has been cancelled
build_docker / build_raytune (push) Has been cancelled
build_docker / build_paddle_ocr_gpu (push) Has been cancelled
LaTeX equations
2026-02-04 21:07:27 +01:00

491 lines
22 KiB
Python

#!/usr/bin/env python3
"""Content block handlers for markdown to HTML conversion."""
import os
import re
from PIL import Image
from markdown_utils import (
md_to_html_para,
convert_latex_formulas,
is_source_line,
extract_source_from_line,
is_leyenda_line,
extract_leyenda_from_line,
)
# Directory containing this module; output paths such as
# thesis_output/figures/... are resolved relative to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
def handle_mermaid_diagram(lines, i, counters, is_anexo):
    """Convert a ```mermaid fenced block into a captioned Word-HTML figure.

    The diagram is not rendered here. The image is looked up as
    ``thesis_output/figures/figura_<n>.png`` under ``BASE_DIR``, where ``<n>``
    is the running ``global_figure`` counter; when that file does not exist a
    dashed placeholder paragraph is emitted instead of an ``<img>``.

    Args:
        lines: List of markdown lines.
        i: Current line index (pointing to the ```mermaid fence).
        counters: Dict of running counters. This function increments
            'global_figure' and either 'anexo_figure' or 'figure' in place.
        is_anexo: Boolean indicating if processing the Anexo section.

    Returns:
        Tuple of (html_blocks, new_index). new_index is one past the last
        line consumed (the closing fence, or a trailing Fuente/Leyenda line).
    """
    html_blocks = []

    # The global index always advances so PNG filenames stay sequential
    # across main-body and Anexo figures.
    counters['global_figure'] += 1

    # Displayed number: "A<n>" from the Anexo counter, else the body counter.
    if is_anexo:
        counters['anexo_figure'] += 1
        fig_num = f"A{counters['anexo_figure']}"
    else:
        counters['figure'] += 1
        fig_num = str(counters['figure'])

    # Collect the diagram source up to (not including) the closing fence.
    mermaid_lines = []
    i += 1
    while i < len(lines) and lines[i].strip() != '```':
        mermaid_lines.append(lines[i])
        i += 1

    # Try to extract the title from the mermaid content (YAML-style "title:").
    mermaid_content = '\n'.join(mermaid_lines)
    # Quoted form first: title: "Something" or title: 'Something'
    title_match = re.search(r'title:\s*["\']([^"\']+)["\']', mermaid_content)
    if not title_match:
        # Unquoted form: title: Something
        title_match = re.search(r'title:\s*([^"\'\n]+)', mermaid_content)
    if title_match:
        fig_title = title_match.group(1).strip()
    else:
        fig_title = f"Diagrama {fig_num}"

    # The filename uses the global sequential index, not the displayed number.
    fig_file = f'figures/figura_{counters["global_figure"]}.png'
    fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)

    # Caption paragraph (MsoCaption). Anexo figures get a literal number plus
    # a TC field; body figures get a Word SEQ field around the number.
    bookmark_id = f"_Ref_Fig{fig_num}"
    if is_anexo:
        tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Figura {fig_num}. {fig_title}" \\f c \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
        html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
    else:
        html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')

    if os.path.exists(fig_path):
        # Only the dimensions are needed. The context manager closes the file
        # handle that a bare Image.open() would leave open.
        with Image.open(fig_path) as img:
            orig_w, orig_h = img.size

        # Scale down to a max width of 566px (15cm at 96dpi), keeping the
        # aspect ratio; smaller images keep their original size.
        max_width = 566
        if orig_w > max_width:
            scale = max_width / orig_w
            new_w = max_width
            new_h = int(orig_h * scale)
        else:
            new_w, new_h = orig_w, orig_h

        # Convert to pt (1px at 96dpi = 0.75pt)
        w_pt = new_w * 0.75
        h_pt = new_h * 0.75
        html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
    else:
        # Fallback to placeholder
        html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')

    # Check whether the next non-empty line carries a custom Fuente.
    custom_source = None
    fig_leyenda = None
    lookahead = i + 1
    while lookahead < len(lines) and not lines[lookahead].strip():
        lookahead += 1
    if lookahead < len(lines):
        next_line = lines[lookahead].strip()
        if is_source_line(next_line):
            custom_source = extract_source_from_line(next_line)
            if custom_source and not custom_source.endswith('.'):
                custom_source += '.'
            i = lookahead

            # Check for Leyenda after source.
            # NOTE(review): the Leyenda lookup is assumed to apply only when
            # a Fuente line was found - confirm against the intended markdown
            # conventions.
            leyenda_idx = i + 1
            while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
                leyenda_idx += 1
            if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
                fig_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
                i = leyenda_idx

    if custom_source:
        source_html = md_to_html_para(custom_source)
        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: {source_html}</span></p>''')
    else:
        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')

    if fig_leyenda:
        leyenda_html = md_to_html_para(fig_leyenda)
        if not fig_leyenda.endswith('.'):
            leyenda_html += '.'
        html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Leyenda: {leyenda_html}</span></p>''')

    # Empty spacer paragraph after the figure.
    html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')

    # Step past the last consumed line (closing fence, Fuente or Leyenda).
    i += 1
    return html_blocks, i
def handle_code_block(lines, i):
    """Render a fenced, non-mermaid code block as a styled ``<pre>`` box.

    The info string after the opening fence (e.g. ``python``) is ignored;
    the output has no per-language styling.

    Args:
        lines: List of markdown lines.
        i: Current line index (pointing to the opening ``` fence).

    Returns:
        Tuple of (html_blocks, new_index). new_index is one past the closing
        fence (or one past the end of ``lines`` if the fence is never closed).
    """
    html_blocks = []

    # Gather everything between the opening fence and the next fence line.
    code_lines = []
    i += 1
    while i < len(lines) and not lines[i].strip().startswith('```'):
        code_lines.append(lines[i])
        i += 1
    code = '\n'.join(code_lines)

    # Escape HTML entities in code ('&' first so the entities produced for
    # '<' and '>' are not escaped a second time).
    code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    html_blocks.append(f'''<div style="background:#E6F4F9;border-top:solid #0098CD .5pt;border-bottom:solid #0098CD .5pt;padding:8pt 12pt;margin:6pt 0">
<pre style="font-family:Consolas,monospace;font-size:9pt;color:#333333;margin:0;white-space:pre-wrap;word-wrap:break-word">{code}</pre>
</div>''')

    # Skip the closing fence.
    i += 1
    return html_blocks, i
def handle_header(line, is_anexo):
    """Convert a markdown header line (##, ###, ####) to Word-style HTML.

    Args:
        line: The header line, starting with one or more '#'.
        is_anexo: Boolean indicating if processing the Anexo section.

    Returns:
        HTML string for the header, or None for h1 lines (the original h1 is
        kept by the caller) and for lines that are not headers.
    """
    # Longest prefix first: '####' also starts with '###' and '##'.
    if line.startswith('####'):
        text = line.lstrip('#').strip()
        return f'<h4 style="mso-list:none"><b><span lang=ES style="text-transform:none">{text}</span></b></h4>'
    if line.startswith('###'):
        return _numbered_heading(line, 3, is_anexo)
    if line.startswith('##'):
        return _numbered_heading(line, 2, is_anexo)
    # h1 or not a header: nothing to emit.
    return None


def _numbered_heading(line, level, is_anexo):
    """Build the h2/h3 markup, with a section bookmark when numbered."""
    text = line.lstrip('#').strip()

    # A leading section number such as "3.1" or "3.1." becomes a bookmark
    # named _Ref_Sec3_1.
    bookmark_html = ''
    sec_match = re.match(r'^([\d\.]+)\s+', text)
    if sec_match:
        sec_num = sec_match.group(1).rstrip('.')
        # The pattern also accepts a dots-only prefix ("..."); skip it rather
        # than emit an anchor with the empty id "_Ref_Sec".
        if sec_num:
            bookmark_id = f"_Ref_Sec{sec_num.replace('.', '_')}"
            bookmark_html = f'<a name="{bookmark_id}"></a>'

    # Disable Word auto-numbering for Anexo content or A.x headings.
    if is_anexo or re.match(r'^A\.\d+', text):
        list_style = 'mso-list:none'
    else:
        list_style = f'mso-list:l22 level{level} lfo18'

    return f'{bookmark_html}<h{level} style="{list_style}"><span lang=ES style="text-transform:none">{text}</span></h{level}>'
def handle_table(lines, i, counters, is_anexo):
    """Convert a markdown pipe table into a captioned, APA-styled HTML table.

    Emits, in order: a caption paragraph (title taken from a preceding
    ``**Tabla ...`` or ``**...:**`` line when present), the table, a
    "Fuente:" paragraph, an optional "Leyenda:" paragraph and an empty
    spacer paragraph.

    Args:
        lines: List of markdown lines.
        i: Current line index (pointing to the first table row).
        counters: Dict of running counters. This function increments either
            'anexo_table' or 'table' in place.
        is_anexo: Boolean indicating if processing the Anexo section.

    Returns:
        Tuple of (html_blocks, new_index). new_index is the first line after
        the table, or after its trailing Fuente/Leyenda line when present.
    """
    html_blocks = []

    # Displayed number: "A<n>" from the Anexo counter, else the body counter.
    if is_anexo:
        counters['anexo_table'] += 1
        table_num = f"A{counters['anexo_table']}"
    else:
        counters['table'] += 1
        table_num = str(counters['table'])

    table_title = None
    alt_title = None
    table_source = "Elaboración propia"

    # Look back up to four lines for a title. The stop bound is -1 (not 0)
    # so that line 0 is inspected when the table sits near the top of `lines`.
    for j in range(i - 1, max(-1, i - 5), -1):
        prev_line = lines[j].strip()
        if prev_line.startswith(('**Tabla', '*Tabla')):
            table_title = re.sub(r'\*+', '', prev_line).strip()
            break
        elif prev_line.startswith('**') and prev_line.endswith(':**'):
            # Bold "Label:" line: remember it, but keep looking for a real
            # "Tabla" title further up.
            alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
        elif prev_line and not prev_line.startswith('|'):
            # Any other non-blank content ends the search.
            break

    # Collect the table rows, skipping only the header delimiter row. Data
    # rows that merely contain a '---' placeholder cell are kept.
    table_lines = []
    while i < len(lines) and '|' in lines[i]:
        if not _is_table_separator_row(lines[i]):
            table_lines.append(lines[i])
        i += 1

    # Look ahead (past blank lines) for a Fuente line.
    source_idx = i
    table_leyenda = None
    while source_idx < len(lines) and not lines[source_idx].strip():
        source_idx += 1
    if source_idx < len(lines) and is_source_line(lines[source_idx]):
        # Keep the default if nothing usable is extracted, as the mermaid
        # figure handler does for an empty source.
        table_source = extract_source_from_line(lines[source_idx]) or table_source
        i = source_idx + 1

        # Check for Leyenda after source.
        # NOTE(review): the Leyenda lookup is assumed to apply only when a
        # Fuente line was found - confirm against the intended markdown
        # conventions.
        leyenda_idx = i
        while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
            leyenda_idx += 1
        if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
            table_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
            i = leyenda_idx + 1

    # Caption text: drop a leading "Tabla N." / "Tabla AN." from the markdown
    # title, since the number is re-generated below.
    bookmark_id = f"_Ref_Tab{table_num}"
    if table_title:
        clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
    elif alt_title:
        clean_title = alt_title
    else:
        clean_title = "Tabla de datos."

    # Caption paragraph (MsoCaption). Anexo tables get a literal number plus
    # a TC field; body tables get a Word SEQ field around the number.
    if is_anexo:
        tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Tabla {table_num}. {clean_title}" \\f t \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
        html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
    else:
        html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')

    # Build table HTML with APA style: rules above and below the header row
    # and below the last row, no vertical or inner borders.
    last_row = len(table_lines) - 1
    parts = ['<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">']
    for j, tline in enumerate(table_lines):
        # Rows are expected to have leading and trailing pipes; the empty
        # pieces outside them are discarded by the [1:-1] slice.
        cells = [c.strip() for c in tline.split('|')[1:-1]]
        parts.append('<tr>')
        for cell in cells:
            cell_html = md_to_html_para(cell)
            if j == 0:
                # Header row
                parts.append(f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><b><span lang=ES>{cell_html}</span></b></p></td>')
            elif j == last_row:
                # Last row
                parts.append(f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{cell_html}</span></p></td>')
            else:
                # Middle rows
                parts.append(f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{cell_html}</span></p></td>')
        parts.append('</tr>')
    parts.append('</table></div>')
    html_blocks.append(''.join(parts))

    # Add source (always terminated with a period).
    source_html = md_to_html_para(table_source)
    if not table_source.endswith('.'):
        source_html += '.'
    html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')

    # Add leyenda if present
    if table_leyenda:
        leyenda_html = md_to_html_para(table_leyenda)
        if not table_leyenda.endswith('.'):
            leyenda_html += '.'
        html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Leyenda: {leyenda_html}</span></p>')

    # Empty spacer paragraph after the table.
    html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')
    return html_blocks, i


def _is_table_separator_row(row):
    """Return True for a header delimiter row such as ``|---|:---:|``."""
    # Require a '---' run somewhere in the row, and additionally that every
    # cell consists only of dashes with optional alignment colons.
    if '---' not in row:
        return False
    cells = [c.strip() for c in row.strip().strip('|').split('|')]
    return all(re.fullmatch(r':?-+:?', c) for c in cells)
def handle_blockquote(lines, i):
    """Render a run of '>' lines as a quote paragraph or a Nota/Note callout.

    Consecutive lines starting with '>' are merged into one text, joined by
    single spaces. If the merged text starts with ``**Nota:**`` or
    ``**Note:**`` it becomes a boxed callout with the label in bold;
    otherwise it becomes an italic MsoQuote paragraph.

    Args:
        lines: List of markdown lines.
        i: Current line index (pointing to the first '>' line).

    Returns:
        Tuple of (html_blocks, new_index). new_index is the first line that
        is not part of the blockquote.
    """
    # The first line is taken unconditionally; following lines only while
    # they still start with '>'.
    fragments = [lines[i][1:].strip()]
    i += 1
    while i < len(lines) and lines[i].startswith('>'):
        fragments.append(lines[i][1:].strip())
        i += 1
    quote_text = ' '.join(fragments)

    # Detect a callout marker at the very start of the merged text.
    marker = None
    for candidate in ('**Nota:**', '**Note:**'):
        if quote_text.startswith(candidate):
            marker = candidate
            break

    if marker is None:
        # Regular blockquote
        quote_html = md_to_html_para(quote_text)
        return [f'<p class=MsoQuote><i><span lang=ES>{quote_html}</span></i></p>'], i

    # Label is the marker without its bold asterisks; content is the rest.
    label = marker.strip('*')
    content = quote_text[len(marker):].strip()
    content_html = md_to_html_para(content)

    # UNIR callout box style
    callout = f'''<div style='mso-element:para-border-div;border-top:solid #0098CD 1.0pt;border-left:none;border-bottom:solid #0098CD 1.0pt;border-right:none;mso-border-top-alt:solid #0098CD .5pt;mso-border-bottom-alt:solid #0098CD .5pt;padding:4.0pt 0cm 4.0pt 0cm;background:#E6F4F9'>
<p class=MsoNormal style='background:#E6F4F9;border:none;padding:0cm;margin:0cm'><b><span lang=ES>{label}</span></b><span lang=ES> {content_html}</span></p>
</div>'''
    return [callout], i
def handle_bullet_list(lines, i):
    """Render a run of top-level bullet items (-, *, +) as list paragraphs.

    Blank lines between items are skipped, so a loosely spaced list is still
    treated as one list. Scanning stops at the first non-blank line that is
    not a bullet item.

    Args:
        lines: List of markdown lines.
        i: Current line index (pointing to the first bullet).

    Returns:
        Tuple of (html_blocks, new_index). new_index points at the line that
        ended the list (blank lines before it are consumed).
    """
    total = len(lines)
    rendered_items = []
    while True:
        # Skip blank lines
        while i < total and not lines[i].strip():
            i += 1
        if i >= total or not re.match(r'^[\-\*\+]\s', lines[i]):
            break
        # Drop the two-character marker ("- ", "* ", "+ ") before converting.
        raw_item = lines[i][2:].strip()
        rendered_items.append(md_to_html_para(convert_latex_formulas(raw_item)))
        i += 1

    # Word list classes by position: a lone item uses the plain class,
    # otherwise First / Middle... / Last.
    count = len(rendered_items)
    if count == 1:
        classes = ['MsoListParagraph']
    else:
        classes = (
            ['MsoListParagraphCxSpFirst']
            + ['MsoListParagraphCxSpMiddle'] * (count - 2)
            + ['MsoListParagraphCxSpLast']
        )

    html_blocks = []
    for cls, item in zip(classes, rendered_items):
        html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{item}</span></p>')
    return html_blocks, i
def handle_numbered_list(lines, i):
    """Render a numbered list (1., 2., ...) with nested bullet sub-lists.

    Items are renumbered sequentially from 1 in the output, regardless of
    the numbers written in the markdown. Under each item, lines of the form
    ``- text`` are collected as nested bullets; blank lines and other
    indented lines below an item are consumed without producing output.

    Args:
        lines: List of markdown lines.
        i: Current line index (pointing to the first numbered item).

    Returns:
        Tuple of (html_blocks, new_index). new_index points at the first
        line that is not part of the list.
    """
    html_blocks = []

    # Each entry is (main_html, nested_bullets), where nested_bullets is a
    # list of raw bullet strings (converted to HTML at output time).
    numbered_items = []
    while i < len(lines):
        # Skip blank lines
        while i < len(lines) and not lines[i].strip():
            i += 1

        # Stop unless the next non-blank line is a numbered item.
        if not (i < len(lines) and re.match(r'^\d+\.\s', lines[i])):
            break

        item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
        i += 1

        # Collect nested bullet points until something ends this item.
        nested_bullets = []
        while i < len(lines):
            current = lines[i]

            # Stop conditions: next numbered item, heading, code fence, or a
            # table/figure caption line.
            if re.match(r'^\d+\.\s', current):
                break
            if current.startswith('#'):
                break
            if current.startswith('```'):
                break
            if current.startswith(('**Tabla', '**Figura')):
                break

            stripped = current.strip()
            is_bullet = stripped.startswith('- ')
            # Un-indented content that is not a "- " bullet ends the item.
            # Testing for "- " rather than any leading '-' hands lines such
            # as a '---' rule back to the caller instead of discarding them.
            if stripped and not current.startswith((' ', '\t')) and not is_bullet:
                break

            if is_bullet:
                nested_bullets.append(stripped[2:].strip())
            i += 1

        item_text = convert_latex_formulas(item_text)
        numbered_items.append((md_to_html_para(item_text), nested_bullets))

    # Output numbered items, each followed by its nested bullet list.
    last_item = len(numbered_items) - 1
    for idx, (item_text, nested_bullets) in enumerate(numbered_items):
        num = idx + 1
        if len(numbered_items) == 1:
            cls = 'MsoListParagraph'
        elif idx == 0:
            cls = 'MsoListParagraphCxSpFirst'
        elif idx == last_item and not nested_bullets:
            # A final item followed by bullets is not the last paragraph of
            # the list, so it falls through to Middle.
            cls = 'MsoListParagraphCxSpLast'
        else:
            cls = 'MsoListParagraphCxSpMiddle'

        # Main numbered item
        html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt">&nbsp;&nbsp;&nbsp;</span>{item_text}</span></p>')

        # Nested bullet sub-list (indented further)
        last_bullet = len(nested_bullets) - 1
        for bullet_idx, bullet_text in enumerate(nested_bullets):
            bullet_html = md_to_html_para(convert_latex_formulas(bullet_text))

            # The last bullet of every sub-list is marked Last, whether or
            # not further numbered items follow.
            if bullet_idx == 0:
                sub_cls = 'MsoListParagraphCxSpFirst'
            elif bullet_idx == last_bullet:
                sub_cls = 'MsoListParagraphCxSpLast'
            else:
                sub_cls = 'MsoListParagraphCxSpMiddle'

            # Nested bullets at 54pt margin (36pt + 18pt)
            html_blocks.append(f'<p class={sub_cls} style="margin-left:54pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{bullet_html}</span></p>')

    return html_blocks, i