Leyenda
Some checks failed
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 4m0s
build_docker / build_paddle_ocr_gpu (push) Successful in 18m53s
build_docker / build_easyocr (push) Successful in 16m12s
build_docker / build_easyocr_gpu (push) Successful in 22m37s
build_docker / build_doctr (push) Successful in 21m22s
build_docker / build_raytune (push) Successful in 2m50s
build_docker / build_doctr_gpu (push) Has been cancelled
Some checks failed
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 4m0s
build_docker / build_paddle_ocr_gpu (push) Successful in 18m53s
build_docker / build_easyocr (push) Successful in 16m12s
build_docker / build_easyocr_gpu (push) Successful in 22m37s
build_docker / build_doctr (push) Successful in 21m22s
build_docker / build_raytune (push) Successful in 2m50s
build_docker / build_doctr_gpu (push) Has been cancelled
This commit is contained in:
249
apply_content.py
249
apply_content.py
@@ -4,23 +4,25 @@
|
||||
import re
|
||||
import os
|
||||
import shutil
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from latex2mathml.converter import convert as latex_to_mathml
|
||||
from PIL import Image
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
|
||||
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
|
||||
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
|
||||
|
||||
# Accept Fuente/Source lines with or without markdown bold
|
||||
SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
|
||||
|
||||
# Global counters for tables and figures
|
||||
table_counter = 0
|
||||
figure_counter = 0
|
||||
anexo_table_counter = 0
|
||||
anexo_figure_counter = 0
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from latex2mathml.converter import convert as latex_to_mathml
|
||||
from PIL import Image
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
|
||||
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
|
||||
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
|
||||
|
||||
# Accept Fuente/Source lines with or without markdown bold
|
||||
SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
|
||||
# Accept Leyenda lines with or without markdown bold
|
||||
LEYENDA_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?Leyenda:(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
|
||||
|
||||
# Global counters for tables and figures
|
||||
table_counter = 0
|
||||
figure_counter = 0
|
||||
anexo_table_counter = 0
|
||||
anexo_figure_counter = 0
|
||||
# Global sequential counter for figure filenames (figura_1.png, figura_2.png, etc.)
|
||||
global_figure_index = 0
|
||||
|
||||
@@ -48,7 +50,7 @@ def md_to_html_para(text):
|
||||
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
|
||||
return text
|
||||
|
||||
def convert_latex_formulas(text):
|
||||
def convert_latex_formulas(text):
|
||||
"""Convert LaTeX formulas to MathML for Word compatibility."""
|
||||
# Block formulas $$...$$
|
||||
def convert_block(match):
|
||||
@@ -69,22 +71,33 @@ def convert_latex_formulas(text):
|
||||
except:
|
||||
return match.group(0)
|
||||
|
||||
text = re.sub(r'\$([^$]+)\$', convert_inline, text)
|
||||
return text
|
||||
|
||||
def extract_source_from_line(line):
    """Pull the attribution text out of a ``Fuente:``/``Source:`` line.

    The line is stripped and matched against ``SOURCE_LINE_RE``. On a match,
    the text following the label (capture group 2) is returned with
    surrounding whitespace removed; otherwise ``None`` is returned.
    """
    found = SOURCE_LINE_RE.match(line.strip())
    return found.group(2).strip() if found else None
|
||||
|
||||
def is_source_line(line):
    """Report whether the stripped line matches ``SOURCE_LINE_RE``.

    Returns ``True`` for lines beginning with ``Fuente:`` or ``Source:``
    (optionally wrapped in markdown emphasis), ``False`` otherwise.
    """
    stripped = line.strip()
    return bool(SOURCE_LINE_RE.match(stripped))
|
||||
|
||||
def extract_table_title(lines, current_index):
|
||||
"""Look for table title in preceding lines (e.g., **Tabla 1.** *Title*)."""
|
||||
text = re.sub(r'\$([^$]+)\$', convert_inline, text)
|
||||
return text
|
||||
|
||||
def extract_source_from_line(line):
    """Pull the attribution text out of a ``Fuente:``/``Source:`` line.

    The line is stripped and matched against ``SOURCE_LINE_RE``. On a match,
    the text following the label (capture group 2) is returned with
    surrounding whitespace removed; otherwise ``None`` is returned.
    """
    found = SOURCE_LINE_RE.match(line.strip())
    return found.group(2).strip() if found else None
|
||||
|
||||
def is_source_line(line):
    """Report whether the stripped line matches ``SOURCE_LINE_RE``.

    Returns ``True`` for lines beginning with ``Fuente:`` or ``Source:``
    (optionally wrapped in markdown emphasis), ``False`` otherwise.
    """
    stripped = line.strip()
    return bool(SOURCE_LINE_RE.match(stripped))
|
||||
|
||||
def extract_leyenda_from_line(line):
    """Pull the legend text out of a ``Leyenda:`` line.

    The line is stripped and matched against ``LEYENDA_LINE_RE``. On a match,
    the text following the label (capture group 1) is returned with
    surrounding whitespace removed; otherwise ``None`` is returned.
    """
    found = LEYENDA_LINE_RE.match(line.strip())
    return found.group(1).strip() if found else None
|
||||
|
||||
def is_leyenda_line(line):
    """Report whether the stripped line matches ``LEYENDA_LINE_RE``.

    Returns ``True`` for lines beginning with ``Leyenda:`` (optionally
    wrapped in markdown emphasis), ``False`` otherwise.
    """
    stripped = line.strip()
    return bool(LEYENDA_LINE_RE.match(stripped))
|
||||
|
||||
def extract_table_title(lines, current_index):
|
||||
"""Look for table title in preceding lines (e.g., **Tabla 1.** *Title*)."""
|
||||
# Check previous non-empty lines for table title
|
||||
for i in range(current_index - 1, max(0, current_index - 5), -1):
|
||||
line = lines[i].strip()
|
||||
@@ -172,8 +185,11 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
bookmark_id = f"_Ref_Fig{fig_num}"
|
||||
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
|
||||
# For Anexo figures, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
|
||||
# Add TC field so Anexo figures appear in Table of Figures index
|
||||
# Use \f c to match the TOC field identifier in the template
|
||||
if is_anexo:
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||
tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Figura {fig_num}. {fig_title}" \\f c \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||
else:
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||
|
||||
@@ -204,19 +220,27 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
|
||||
# Check if next non-empty line has custom Fuente
|
||||
custom_source = None
|
||||
fig_leyenda = None
|
||||
lookahead = i + 1
|
||||
while lookahead < len(lines) and not lines[lookahead].strip():
|
||||
lookahead += 1
|
||||
if lookahead < len(lines):
|
||||
next_line = lines[lookahead].strip()
|
||||
if is_source_line(next_line):
|
||||
# Extract custom source, removing markdown formatting
|
||||
custom_source = extract_source_from_line(next_line)
|
||||
# Ensure it ends with a period
|
||||
if custom_source and not custom_source.endswith('.'):
|
||||
custom_source += '.'
|
||||
# Skip this line by advancing i past it
|
||||
i = lookahead
|
||||
if lookahead < len(lines):
|
||||
next_line = lines[lookahead].strip()
|
||||
if is_source_line(next_line):
|
||||
# Extract custom source, removing markdown formatting
|
||||
custom_source = extract_source_from_line(next_line)
|
||||
# Ensure it ends with a period
|
||||
if custom_source and not custom_source.endswith('.'):
|
||||
custom_source += '.'
|
||||
# Skip this line by advancing i past it
|
||||
i = lookahead
|
||||
# Check for Leyenda after source
|
||||
leyenda_idx = i + 1
|
||||
while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
|
||||
leyenda_idx += 1
|
||||
if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
|
||||
fig_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
|
||||
i = leyenda_idx
|
||||
|
||||
if custom_source:
|
||||
source_html = md_to_html_para(custom_source)
|
||||
@@ -224,6 +248,13 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
else:
|
||||
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')
|
||||
|
||||
# Add leyenda if present (same style as Fuente, new line)
|
||||
if fig_leyenda:
|
||||
leyenda_html = md_to_html_para(fig_leyenda)
|
||||
if not fig_leyenda.endswith('.'):
|
||||
leyenda_html += '.'
|
||||
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Leyenda: {leyenda_html}</span></p>''')
|
||||
|
||||
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>')
|
||||
i += 1
|
||||
continue
|
||||
@@ -249,7 +280,7 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
if line.startswith('####'):
|
||||
text = line.lstrip('#').strip()
|
||||
# Apply consistent styling like h2/h3, disable numbering for h4
|
||||
html_blocks.append(f'<h4 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h4>')
|
||||
html_blocks.append(f'<h4 style="mso-list:none"><b><span lang=ES style="text-transform:none">{text}</span></b></h4>')
|
||||
i += 1
|
||||
continue
|
||||
elif line.startswith('###'):
|
||||
@@ -314,11 +345,19 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
|
||||
# Look ahead for source (skip blank lines first)
|
||||
source_idx = i
|
||||
while source_idx < len(lines) and not lines[source_idx].strip():
|
||||
source_idx += 1
|
||||
if source_idx < len(lines) and is_source_line(lines[source_idx]):
|
||||
table_source = extract_source_from_line(lines[source_idx])
|
||||
i = source_idx + 1
|
||||
table_leyenda = None
|
||||
while source_idx < len(lines) and not lines[source_idx].strip():
|
||||
source_idx += 1
|
||||
if source_idx < len(lines) and is_source_line(lines[source_idx]):
|
||||
table_source = extract_source_from_line(lines[source_idx])
|
||||
i = source_idx + 1
|
||||
# Check for Leyenda after source (skip blank lines)
|
||||
leyenda_idx = i
|
||||
while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
|
||||
leyenda_idx += 1
|
||||
if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
|
||||
table_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
|
||||
i = leyenda_idx + 1
|
||||
|
||||
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference
|
||||
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
|
||||
@@ -334,8 +373,11 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
clean_title = "Tabla de datos."
|
||||
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
|
||||
# For Anexo tables, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
|
||||
# Add TC field so Anexo tables appear in Table of Tables index
|
||||
# Use \f t identifier - template TOC field will be modified to include this
|
||||
if is_anexo:
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||
tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Tabla {table_num}. {clean_title}" \\f t \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||
else:
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||
|
||||
@@ -363,6 +405,14 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
if not table_source.endswith('.'):
|
||||
source_html += '.'
|
||||
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')
|
||||
|
||||
# Add leyenda if present (same style as Fuente, new line)
|
||||
if table_leyenda:
|
||||
leyenda_html = md_to_html_para(table_leyenda)
|
||||
if not table_leyenda.endswith('.'):
|
||||
leyenda_html += '.'
|
||||
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Leyenda: {leyenda_html}</span></p>')
|
||||
|
||||
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>')
|
||||
continue
|
||||
|
||||
@@ -376,24 +426,63 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
html_blocks.append(f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>')
|
||||
continue
|
||||
|
||||
# Bullet list
|
||||
# Bullet list (handle blank lines between items)
|
||||
if re.match(r'^[\-\*\+]\s', line):
|
||||
while i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
|
||||
item_text = lines[i][2:].strip()
|
||||
item_text = convert_latex_formulas(item_text)
|
||||
html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt"> </span><span lang=ES>{md_to_html_para(item_text)}</span></p>')
|
||||
i += 1
|
||||
# Collect all bullet items first
|
||||
bullet_items = []
|
||||
while i < len(lines):
|
||||
# Skip blank lines
|
||||
while i < len(lines) and not lines[i].strip():
|
||||
i += 1
|
||||
# Check if next non-blank line is a bullet item
|
||||
if i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
|
||||
item_text = lines[i][2:].strip()
|
||||
item_text = convert_latex_formulas(item_text)
|
||||
bullet_items.append(md_to_html_para(item_text))
|
||||
i += 1
|
||||
else:
|
||||
break
|
||||
# Output with proper First/Middle/Last classes
|
||||
for idx, item in enumerate(bullet_items):
|
||||
if len(bullet_items) == 1:
|
||||
cls = 'MsoListParagraph'
|
||||
elif idx == 0:
|
||||
cls = 'MsoListParagraphCxSpFirst'
|
||||
elif idx == len(bullet_items) - 1:
|
||||
cls = 'MsoListParagraphCxSpLast'
|
||||
else:
|
||||
cls = 'MsoListParagraphCxSpMiddle'
|
||||
html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt"> </span><span lang=ES>{item}</span></p>')
|
||||
continue
|
||||
|
||||
# Numbered list
|
||||
# Numbered list (handle blank lines between items)
|
||||
if re.match(r'^\d+\.\s', line):
|
||||
num = 1
|
||||
while i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
|
||||
item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
|
||||
item_text = convert_latex_formulas(item_text)
|
||||
html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt"> </span>{md_to_html_para(item_text)}</span></p>')
|
||||
num += 1
|
||||
i += 1
|
||||
# Collect all numbered items first
|
||||
numbered_items = []
|
||||
while i < len(lines):
|
||||
# Skip blank lines
|
||||
while i < len(lines) and not lines[i].strip():
|
||||
i += 1
|
||||
# Check if next non-blank line is a numbered item
|
||||
if i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
|
||||
item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
|
||||
item_text = convert_latex_formulas(item_text)
|
||||
numbered_items.append(md_to_html_para(item_text))
|
||||
i += 1
|
||||
else:
|
||||
break
|
||||
# Output with proper First/Middle/Last classes
|
||||
for idx, item in enumerate(numbered_items):
|
||||
num = idx + 1
|
||||
if len(numbered_items) == 1:
|
||||
cls = 'MsoListParagraph'
|
||||
elif idx == 0:
|
||||
cls = 'MsoListParagraphCxSpFirst'
|
||||
elif idx == len(numbered_items) - 1:
|
||||
cls = 'MsoListParagraphCxSpLast'
|
||||
else:
|
||||
cls = 'MsoListParagraphCxSpMiddle'
|
||||
html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt"> </span>{item}</span></p>')
|
||||
continue
|
||||
|
||||
# Skip lines that are just table/figure titles (they'll be handled with the table/figure)
|
||||
@@ -403,9 +492,12 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
|
||||
i += 1
|
||||
continue
|
||||
if is_source_line(line):
|
||||
i += 1
|
||||
continue
|
||||
if is_source_line(line):
|
||||
i += 1
|
||||
continue
|
||||
if is_leyenda_line(line):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Regular paragraph
|
||||
para_lines = [line]
|
||||
@@ -523,6 +615,17 @@ def main():
|
||||
|
||||
print("Reading template...")
|
||||
html_content = read_file(TEMPLATE_INPUT)
|
||||
|
||||
# Modify the Table of Tables TOC field to include TC entries with \f t identifier
|
||||
# Original: TOC \h \z \t "Tablas;1" \c "Tabla"
|
||||
# Modified: TOC \f t \h \z \t "Tablas;1" \c "Tabla"
|
||||
# Use regex to handle whitespace/HTML variations in the TOC field
|
||||
html_content = re.sub(
|
||||
r'(TOC\s+)(\\h\s+\\z\s+\\t\s*\n?\s*"Tablas;1")',
|
||||
r'\1\\f t \2',
|
||||
html_content
|
||||
)
|
||||
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
print("Reading docs content...")
|
||||
@@ -671,10 +774,10 @@ def main():
|
||||
# Also remove surrounding caption and source
|
||||
prev_sib = table.find_previous_sibling()
|
||||
next_sib = table.find_next_sibling()
|
||||
if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
|
||||
prev_sib.decompose()
|
||||
if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
|
||||
next_sib.decompose()
|
||||
if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
|
||||
prev_sib.decompose()
|
||||
if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
|
||||
next_sib.decompose()
|
||||
table.decompose()
|
||||
print(" ✓ Removed template table example")
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user