All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_paddle_ocr (push) Successful in 7m24s
build_docker / build_paddle_ocr_gpu (push) Successful in 27m31s
build_docker / build_easyocr (push) Successful in 23m46s
build_docker / build_easyocr_gpu (push) Successful in 26m37s
build_docker / build_doctr (push) Successful in 24m23s
build_docker / build_raytune (push) Successful in 5m46s
build_docker / build_doctr_gpu (push) Successful in 18m4s
154 lines
6.2 KiB
Python
154 lines
6.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Utility functions for markdown processing and conversion."""
|
|
|
|
import re
|
|
from latex2mathml.converter import convert as latex_to_mathml
|
|
|
|
# Caption "source" lines: "Fuente: ..." or "Source: ..." (case-insensitive).
# One or two asterisks may appear before the keyword and after the colon.
# Group 1 is the keyword, group 2 the remaining text of the line.
SOURCE_LINE_RE = re.compile(
    r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$',
    re.IGNORECASE,
)

# Caption "legend" lines: "Leyenda: ..." with the same optional asterisks.
# Group 1 is the text that follows the colon.
LEYENDA_LINE_RE = re.compile(
    r'^\s*(?:\*{1,2})?Leyenda:(?:\*{1,2})?\s*(.*)$',
    re.IGNORECASE,
)

# Figure/table cross-references written as markdown links, e.g.
#   [Figura 15](#figura-15)   [Tabla 20](#tabla-20)
# Annex numbering with a single letter prefix is accepted as well:
#   [Figura A1](#figura-a1)   [Tabla A2](#tabla-a2)
# Groups: 1 = displayed kind, 2 = displayed number,
#         3 = anchor kind,    4 = anchor number.
CROSS_REF_LINK_RE = re.compile(
    r'\[(Figura|Tabla)\s+([A-Za-z]?\d+)\]\(#(figura|tabla)-([a-z]?\d+)\)',
    re.IGNORECASE,
)

# Section/chapter cross-references written as markdown links, e.g.
#   [Sección 4.1](#seccion-4-1)   [Capítulo 2](#capitulo-2)
# Groups: 1 = displayed kind, 2 = displayed number (dotted),
#         3 = anchor kind,    4 = anchor number (dash-separated).
SECTION_REF_LINK_RE = re.compile(
    r'\[(Sección|Seccion|Capítulo|Capitulo)\s+([\d\.]+)\]\(#(seccion|capitulo)-([0-9-]+)\)',
    re.IGNORECASE,
)
|
|
|
|
|
|
def read_file(path):
    """Read a text file, preferring UTF-8 and falling back to latin-1.

    Args:
        path: Location of the file to read (anything ``open`` accepts).

    Returns:
        The decoded file content as a string. A leading UTF-8 byte order
        mark, when present, is not included in the result.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 as well, but additionally drops a
        # leading BOM. With plain 'utf-8' the BOM would survive as U+FEFF at
        # the start of the text and defeat line-anchored matching on the
        # first line.
        with open(path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except UnicodeDecodeError:
        # latin-1 assigns a character to every byte value, so this second
        # attempt cannot raise a decode error.
        with open(path, 'r', encoding='latin-1') as f:
            return f.read()
|
|
|
|
|
|
def write_file(path, content):
    """Store *content* at *path* as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file at *path* is
    overwritten.
    """
    with open(path, mode='w', encoding='utf-8') as out:
        out.write(content)
|
|
|
|
|
|
def convert_cross_references(text):
    """Rewrite markdown cross-reference links as Word REF fields.

    Recognised link forms (they still render as ordinary links in markdown
    viewers):
    - [Figura 15](#figura-15)     -> reference to bookmark _Ref_Fig15
    - [Tabla 20](#tabla-20)       -> reference to bookmark _Ref_Tab20
    - [Figura A1](#figura-a1)     -> reference to bookmark _Ref_FigA1 (annex)
    - [Sección 4.1](#seccion-4-1) -> reference to bookmark _Ref_Sec4_1
    - [Capítulo 2](#capitulo-2)   -> reference to bookmark _Ref_Sec2

    Figure/table bookmarks are built from the number shown in the link text;
    section/chapter bookmarks are built from the anchor, with dashes turned
    into underscores. Text that matches neither pattern is returned as is.
    """

    def build_ref_field(bookmark, label):
        # Word REF field with \h for hyperlink, wrapped around a plain anchor.
        return f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> REF {bookmark} \\h <span style='mso-element:field-separator'></span><![endif]--><a href="#{bookmark}">{label}</a><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->'''

    def figure_or_table_ref(match):
        kind = match.group(1)    # "Figura" or "Tabla", as written in the link
        number = match.group(2)  # "15" or "A1"
        prefix = "_Ref_Fig" if kind.lower() == 'figura' else "_Ref_Tab"
        return build_ref_field(f"{prefix}{number}", f"{kind} {number}")

    def section_ref(match):
        kind = match.group(1)    # "Sección" or "Capítulo", as written
        number = match.group(2)  # "4.1" or "2"
        anchor = match.group(4)  # "4-1" or "2"
        # Bookmark name comes from the anchor (e.g. 4-1 -> _Ref_Sec4_1).
        bookmark = f"_Ref_Sec{anchor.replace('-', '_')}"
        return build_ref_field(bookmark, f"{kind} {number}")

    # Figures/tables first, then sections/chapters.
    with_figures = CROSS_REF_LINK_RE.sub(figure_or_table_ref, text)
    return SECTION_REF_LINK_RE.sub(section_ref, with_figures)
|
|
|
|
|
|
def md_to_html_para(text):
    """Turn markdown inline markup in *text* into HTML.

    Handles, in this order: cross-reference links, bold (``**x**``), italic
    (``*x*``), inline code (backticks) and generic ``[text](url)`` links.
    """
    # Cross-references go first: the generic link rule at the end of the
    # table would otherwise consume them as ordinary links.
    html = convert_cross_references(text)

    # (pattern, replacement) pairs, applied top to bottom. Bold has to come
    # before italic so that "**" is not read as two italic markers.
    inline_rules = (
        (r'\*\*([^*]+)\*\*', r'<b>\1</b>'),
        (r'\*([^*]+)\*', r'<i>\1</i>'),
        (r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt">\1</span>'),
        (r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>'),
    )
    for pattern, replacement in inline_rules:
        html = re.sub(pattern, replacement, html)
    return html
|
|
|
|
|
|
def convert_latex_formulas(text):
    """Convert LaTeX formulas to MathML for Word compatibility.

    Block formulas (``$$...$$``) become a centered ``MsoNormal`` paragraph
    containing the MathML; inline formulas (``$...$``) are replaced by the
    MathML alone. Conversion is best-effort: a formula that the converter
    rejects is left in the text exactly as written.

    Args:
        text: Text that may contain LaTeX formulas.

    Returns:
        The text with every convertible formula replaced by MathML.
    """

    def convert_block(match):
        latex = match.group(1)
        try:
            mathml = latex_to_mathml(latex, display="block")
        except Exception:
            # Keep the original formula if conversion fails. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return match.group(0)
        return f'<p class=MsoNormal style="text-align:center">{mathml}</p>'

    def convert_inline(match):
        latex = match.group(1)
        try:
            return latex_to_mathml(latex, display="inline")
        except Exception:
            return match.group(0)

    # Block formulas must be handled first: the inline pattern would
    # otherwise match the inner "$...$" of a "$$...$$" formula.
    text = re.sub(r'\$\$([^$]+)\$\$', convert_block, text)
    text = re.sub(r'\$([^$]+)\$', convert_inline, text)
    return text
|
|
|
|
|
|
def extract_source_from_line(line):
    """Extract the text of a Fuente/Source line.

    Returns the stripped text that follows the "Fuente:"/"Source:" prefix,
    or None when *line* is not such a line.
    """
    found = SOURCE_LINE_RE.match(line.strip())
    return found.group(2).strip() if found else None
|
|
|
|
|
|
def is_source_line(line):
    """Tell whether *line* begins with Fuente:/Source: (optionally bold)."""
    candidate = line.strip()
    return bool(SOURCE_LINE_RE.match(candidate))
|
|
|
|
|
|
def extract_leyenda_from_line(line):
    """Extract the text of a Leyenda line.

    Returns the stripped text that follows the "Leyenda:" prefix, or None
    when *line* is not such a line.
    """
    found = LEYENDA_LINE_RE.match(line.strip())
    if found is not None:
        return found.group(1).strip()
    return None
|
|
|
|
|
|
def is_leyenda_line(line):
    """Tell whether *line* begins with Leyenda: (optionally bold)."""
    candidate = line.strip()
    return bool(LEYENDA_LINE_RE.match(candidate))
|
|
|
|
|
|
def split_into_paragraphs(text, lang='ES'):
    """Break *text* at blank lines and emit one <p> element per paragraph.

    Each non-empty chunk (after stripping) has its inline markdown converted
    and is wrapped in an MsoNormal paragraph whose span carries *lang*.
    The resulting elements are joined with newlines.
    """
    chunks = (chunk.strip() for chunk in text.split('\n\n'))
    return '\n'.join(
        f'<p class=MsoNormal><span lang={lang}>{md_to_html_para(chunk)}</span></p>'
        for chunk in chunks
        if chunk
    )
|