Files
MastersThesis/markdown_utils.py
sergio 5ac0f486b8
All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_paddle_ocr (push) Successful in 7m24s
build_docker / build_paddle_ocr_gpu (push) Successful in 27m31s
build_docker / build_easyocr (push) Successful in 23m46s
build_docker / build_easyocr_gpu (push) Successful in 26m37s
build_docker / build_doctr (push) Successful in 24m23s
build_docker / build_raytune (push) Successful in 5m46s
build_docker / build_doctr_gpu (push) Successful in 18m4s
MathML
2026-02-04 21:14:45 +01:00

154 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""Utility functions for markdown processing and conversion."""
import re
from latex2mathml.converter import convert as latex_to_mathml
# Matches a "Fuente:" / "Source:" line, case-insensitively.
# The label may be wrapped in markdown emphasis: one or two asterisks are
# allowed before the label and again right after the colon.
# Group 1 is the label as written; group 2 is the rest of the line.
SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
# Matches a "Leyenda:" line with the same optional asterisk markers.
# Group 1 is the text that follows the label.
LEYENDA_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?Leyenda:(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
# Figure/table cross-references written as markdown links:
# [Figura 15](#figura-15) or [Tabla 20](#tabla-20) -> Word REF fields
# A single optional letter before the digits covers appendix (Anexo)
# numbering: [Figura A1](#figura-a1), [Tabla A2](#tabla-a2)
# Groups: 1 = displayed type, 2 = displayed number,
#         3 = anchor type, 4 = anchor number.
CROSS_REF_LINK_RE = re.compile(r'\[(Figura|Tabla)\s+([A-Za-z]?\d+)\]\(#(figura|tabla)-([a-z]?\d+)\)', re.IGNORECASE)
# Section/chapter cross-references written as markdown links:
# [Sección 4.1](#seccion-4-1) or [Capítulo 2](#capitulo-2)
# The label is accepted with or without the accent.
# Groups: 1 = displayed type, 2 = displayed number (digits and dots),
#         3 = anchor type, 4 = anchor number (digits and hyphens).
SECTION_REF_LINK_RE = re.compile(r'\[(Sección|Seccion|Capítulo|Capitulo)\s+([\d\.]+)\]\(#(seccion|capitulo)-([0-9-]+)\)', re.IGNORECASE)
def read_file(path):
    """Return the text stored at ``path``.

    The file is decoded as UTF-8 first; if its bytes are not valid UTF-8,
    it is read again as latin-1.
    """
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            content = handle.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: re-read the whole file with the fallback encoding.
        with open(path, 'r', encoding='latin-1') as handle:
            content = handle.read()
    return content
def write_file(path, content):
    """Write ``content`` to ``path`` as UTF-8 text, replacing any existing file."""
    with open(path, mode='w', encoding='utf-8') as out:
        out.write(content)
def convert_cross_references(text):
    """Rewrite markdown cross-reference links as Word REF fields.

    The source syntax still renders as an ordinary link in markdown viewers:
    - [Figura 15](#figura-15) -> REF field for bookmark _Ref_Fig15
    - [Tabla 20](#tabla-20) -> REF field for bookmark _Ref_Tab20
    - [Figura A1](#figura-a1) -> REF field for bookmark _Ref_FigA1
    - [Sección 4.1](#seccion-4-1) -> REF field for bookmark _Ref_Sec4_1
    - [Capítulo 2](#capitulo-2) -> REF field for bookmark _Ref_Sec2

    Figure/table bookmarks are built from the number shown in the link
    text; section/chapter bookmarks are built from the anchor, with
    hyphens turned into underscores. Text that matches neither pattern is
    returned unchanged.
    """
    def ref_field(bookmark, display_text):
        # Word REF field (\h makes it a hyperlink) wrapped around a plain
        # HTML anchor to the same bookmark.
        return f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> REF {bookmark} \\h <span style='mso-element:field-separator'></span><![endif]--><a href="#{bookmark}">{display_text}</a><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->'''

    def figure_or_table(match):
        kind, number = match.group(1), match.group(2)  # e.g. "Figura", "A1"
        prefix = 'Fig' if kind.lower() == 'figura' else 'Tab'
        return ref_field(f"_Ref_{prefix}{number}", f"{kind} {number}")

    def section_or_chapter(match):
        kind, number, anchor = match.group(1, 2, 4)  # e.g. "Sección", "4.1", "4-1"
        return ref_field(f"_Ref_Sec{anchor.replace('-', '_')}", f"{kind} {number}")

    # Figures/tables first, then sections/chapters, as two independent passes.
    with_figures = CROSS_REF_LINK_RE.sub(figure_or_table, text)
    return SECTION_REF_LINK_RE.sub(section_or_chapter, with_figures)
def md_to_html_para(text):
    """Render inline markdown in ``text`` as HTML.

    Cross-reference links are converted first, then bold, italic, inline
    code and generic links are substituted in that order.
    """
    # Cross-references go first so the generic link rule below does not
    # claim their markdown links.
    html = convert_cross_references(text)

    # (pattern, replacement) pairs, applied top to bottom. Bold must come
    # before italic because both are delimited by asterisks.
    inline_rules = (
        (r'\*\*([^*]+)\*\*', r'<b>\1</b>'),
        (r'\*([^*]+)\*', r'<i>\1</i>'),
        (r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt">\1</span>'),
        (r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>'),
    )
    for pattern, replacement in inline_rules:
        html = re.sub(pattern, replacement, html)
    return html
def convert_latex_formulas(text):
    """Convert LaTeX formulas in ``text`` to MathML for Word compatibility.

    Two delimiters are recognised:
    - ``$$...$$`` block formulas, converted with ``display="block"`` and
      wrapped in a centred ``<p class=MsoNormal>`` paragraph;
    - ``$...$`` inline formulas, converted with ``display="inline"``.

    Conversion is best-effort: when the converter fails on a formula, that
    formula is left in the text exactly as written, delimiters included.

    Both forms are handled in a single left-to-right pass. A block formula
    whose conversion failed is therefore never re-scanned as an inline
    formula, which previously could convert the inner ``$...$`` and leave
    stray ``$`` characters around the result.

    Args:
        text: Markdown/HTML text that may contain LaTeX formulas.

    Returns:
        The text with every successfully converted formula replaced by
        MathML markup.
    """
    def convert_formula(match):
        block_latex = match.group(1)  # set only for $$...$$
        try:
            if block_latex is not None:
                mathml = latex_to_mathml(block_latex, display="block")
                return f'<p class=MsoNormal style="text-align:center">{mathml}</p>'
            return latex_to_mathml(match.group(2), display="inline")
        except Exception:
            # The converter's failure types are not visible here, so catch
            # broadly, but let KeyboardInterrupt/SystemExit propagate.
            return match.group(0)  # Keep original if conversion fails

    # The block alternative is listed first so "$$" is never read as the
    # start of an inline formula when a full block formula is present.
    return re.sub(r'\$\$([^$]+)\$\$|\$([^$]+)\$', convert_formula, text)
def extract_source_from_line(line):
    """Return the text after a Fuente:/Source: label, or None.

    None is returned when the line does not match SOURCE_LINE_RE; otherwise
    the captured remainder of the line is returned with surrounding
    whitespace removed.
    """
    found = SOURCE_LINE_RE.match(line.strip())
    return found.group(2).strip() if found else None
def is_source_line(line):
    """Return True when the stripped line matches SOURCE_LINE_RE.

    That is, when it begins with Fuente: or Source:, optionally wrapped in
    markdown asterisks.
    """
    stripped = line.strip()
    return bool(SOURCE_LINE_RE.match(stripped))
def extract_leyenda_from_line(line):
    """Return the text after a Leyenda: label, or None.

    None is returned when the line does not match LEYENDA_LINE_RE; otherwise
    the captured remainder of the line is returned with surrounding
    whitespace removed.
    """
    found = LEYENDA_LINE_RE.match(line.strip())
    return found.group(1).strip() if found else None
def is_leyenda_line(line):
    """Return True when the stripped line matches LEYENDA_LINE_RE.

    That is, when it begins with Leyenda:, optionally wrapped in markdown
    asterisks.
    """
    stripped = line.strip()
    return bool(LEYENDA_LINE_RE.match(stripped))
def split_into_paragraphs(text, lang='ES'):
    """Turn blank-line-separated text into Word-style HTML paragraphs.

    The text is split on double newlines. Each non-empty chunk is stripped,
    passed through md_to_html_para, and wrapped in
    ``<p class=MsoNormal><span lang=...>``. The paragraphs are joined with
    single newlines; chunks that are empty after stripping are dropped.
    """
    stripped_chunks = (chunk.strip() for chunk in text.split('\n\n'))
    return '\n'.join(
        f'<p class=MsoNormal><span lang={lang}>{md_to_html_para(chunk)}</span></p>'
        for chunk in stripped_chunks
        if chunk
    )