deliberable_04_01_2026
All checks were successful
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 5m12s
build_docker / build_paddle_ocr_gpu (push) Successful in 20m54s
build_docker / build_easyocr (push) Successful in 18m19s
build_docker / build_doctr (push) Successful in 19m49s
build_docker / build_easyocr_gpu (push) Successful in 24m6s
build_docker / build_raytune (push) Successful in 4m10s
build_docker / build_doctr_gpu (push) Successful in 16m26s
All checks were successful
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 5m12s
build_docker / build_paddle_ocr_gpu (push) Successful in 20m54s
build_docker / build_easyocr (push) Successful in 18m19s
build_docker / build_doctr (push) Successful in 19m49s
build_docker / build_easyocr_gpu (push) Successful in 24m6s
build_docker / build_raytune (push) Successful in 4m10s
build_docker / build_doctr_gpu (push) Successful in 16m26s
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
221
apply_content.py
221
apply_content.py
@@ -4,18 +4,25 @@
|
||||
import re
|
||||
import os
|
||||
import shutil
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from latex2mathml.converter import convert as latex_to_mathml
|
||||
from PIL import Image
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
|
||||
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
|
||||
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
|
||||
|
||||
# Global counters for tables and figures
|
||||
table_counter = 0
|
||||
figure_counter = 0
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from latex2mathml.converter import convert as latex_to_mathml
|
||||
from PIL import Image
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
|
||||
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
|
||||
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
|
||||
|
||||
# Accept Fuente/Source lines with or without markdown bold
|
||||
SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
|
||||
|
||||
# Global counters for tables and figures
|
||||
table_counter = 0
|
||||
figure_counter = 0
|
||||
anexo_table_counter = 0
|
||||
anexo_figure_counter = 0
|
||||
# Global sequential counter for figure filenames (figura_1.png, figura_2.png, etc.)
|
||||
global_figure_index = 0
|
||||
|
||||
def read_file(path):
|
||||
try:
|
||||
@@ -41,7 +48,7 @@ def md_to_html_para(text):
|
||||
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
|
||||
return text
|
||||
|
||||
def convert_latex_formulas(text):
|
||||
def convert_latex_formulas(text):
|
||||
"""Convert LaTeX formulas to MathML for Word compatibility."""
|
||||
# Block formulas $$...$$
|
||||
def convert_block(match):
|
||||
@@ -62,11 +69,22 @@ def convert_latex_formulas(text):
|
||||
except:
|
||||
return match.group(0)
|
||||
|
||||
text = re.sub(r'\$([^$]+)\$', convert_inline, text)
|
||||
return text
|
||||
|
||||
def extract_table_title(lines, current_index):
|
||||
"""Look for table title in preceding lines (e.g., **Tabla 1.** *Title*)."""
|
||||
text = re.sub(r'\$([^$]+)\$', convert_inline, text)
|
||||
return text
|
||||
|
||||
def extract_source_from_line(line):
    """Return the source text of a Fuente/Source line, or None.

    The line is stripped and matched against SOURCE_LINE_RE. When the whole
    line is wrapped in markdown emphasis (``*Fuente: text*`` or
    ``**Fuente: text**``), the closing marker is removed from the returned
    text so callers do not receive a dangling asterisk.

    Args:
        line: A single markdown line.

    Returns:
        The text that follows the ``Fuente:``/``Source:`` label with
        surrounding whitespace removed, or None when the line does not
        match SOURCE_LINE_RE.
    """
    stripped = line.strip()
    match = SOURCE_LINE_RE.match(stripped)
    if not match:
        return None

    source = match.group(2).strip()

    # Emphasis marker in front of the label ("*" or "**"); empty when the
    # label is not formatted.
    opening = stripped[:match.start(1)].strip()
    # match.end(1) is the index of the colon. In "**Fuente:** text" the
    # emphasis is closed right after it, so trailing asterisks in the text
    # belong to the text itself and must be kept.
    closed_after_label = stripped.startswith('*', match.end(1) + 1)
    if opening and not closed_after_label and source.endswith(opening):
        # "*Fuente: text*" -> drop the marker that closes the line-wide emphasis.
        source = source[:-len(opening)].rstrip()
    return source
|
||||
|
||||
def is_source_line(line):
    """Tell whether ``line`` is a Fuente:/Source: line.

    The line is stripped and tested against SOURCE_LINE_RE, so the label may
    be wrapped in markdown emphasis markers.
    """
    candidate = line.strip()
    # A match object is always truthy, so bool() is True exactly on a match.
    return bool(SOURCE_LINE_RE.match(candidate))
|
||||
|
||||
def extract_table_title(lines, current_index):
|
||||
"""Look for table title in preceding lines (e.g., **Tabla 1.** *Title*)."""
|
||||
# Check previous non-empty lines for table title
|
||||
for i in range(current_index - 1, max(0, current_index - 5), -1):
|
||||
line = lines[i].strip()
|
||||
@@ -97,9 +115,9 @@ def extract_figure_title_from_mermaid(lines, current_index):
|
||||
|
||||
return None
|
||||
|
||||
def parse_md_to_html_blocks(md_content):
|
||||
def parse_md_to_html_blocks(md_content, is_anexo=False):
|
||||
"""Convert markdown content to HTML blocks with template styles."""
|
||||
global table_counter, figure_counter
|
||||
global table_counter, figure_counter, anexo_table_counter, anexo_figure_counter, global_figure_index
|
||||
|
||||
html_blocks = []
|
||||
lines = md_content.split('\n')
|
||||
@@ -115,7 +133,17 @@ def parse_md_to_html_blocks(md_content):
|
||||
|
||||
# Mermaid diagram - convert to figure with actual image
|
||||
if line.strip().startswith('```mermaid'):
|
||||
figure_counter += 1
|
||||
# Always increment global index for sequential filenames
|
||||
global_figure_index += 1
|
||||
|
||||
# Use Anexo-specific counter with "A" prefix for display, or global counter
|
||||
if is_anexo:
|
||||
anexo_figure_counter += 1
|
||||
fig_num = f"A{anexo_figure_counter}" # Display number: A1, A2, A3...
|
||||
else:
|
||||
figure_counter += 1
|
||||
fig_num = str(figure_counter) # Display number: 1, 2, 3...
|
||||
|
||||
mermaid_lines = []
|
||||
i += 1
|
||||
while i < len(lines) and not lines[i].strip() == '```':
|
||||
@@ -132,17 +160,22 @@ def parse_md_to_html_blocks(md_content):
|
||||
if title_match:
|
||||
fig_title = title_match.group(1).strip()
|
||||
else:
|
||||
fig_title = f"Diagrama {figure_counter}"
|
||||
fig_title = f"Diagrama {fig_num}"
|
||||
|
||||
# Check if the generated PNG exists
|
||||
fig_file = f'figures/figura_{figure_counter}.png'
|
||||
# Use global sequential index for filename (figura_1.png, figura_2.png, etc.)
|
||||
fig_file = f'figures/figura_{global_figure_index}.png'
|
||||
fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
|
||||
|
||||
# Create figure with MsoCaption class and proper Word SEQ field for cross-reference
|
||||
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
|
||||
# Word TOC looks for text with Caption style - anchor must be outside main caption text
|
||||
bookmark_id = f"_Ref_Fig{figure_counter}"
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||
bookmark_id = f"_Ref_Fig{fig_num}"
|
||||
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
|
||||
# For Anexo figures, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
|
||||
if is_anexo:
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||
else:
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||
|
||||
if os.path.exists(fig_path):
|
||||
# Read actual image dimensions and scale to fit page width
|
||||
@@ -162,12 +195,35 @@ def parse_md_to_html_blocks(md_content):
|
||||
w_pt = new_w * 0.75
|
||||
h_pt = new_h * 0.75
|
||||
|
||||
html_blocks.append(f'''<p class=MsoNormal style="text-align:center"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
|
||||
# mso-pagination:keep-with-next ensures image stays with source line
|
||||
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
|
||||
else:
|
||||
# Fallback to placeholder
|
||||
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
|
||||
# mso-pagination:keep-with-next ensures placeholder stays with source line
|
||||
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;mso-pagination:keep-with-next;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
|
||||
|
||||
# Check if next non-empty line has custom Fuente
|
||||
custom_source = None
|
||||
lookahead = i + 1
|
||||
while lookahead < len(lines) and not lines[lookahead].strip():
|
||||
lookahead += 1
|
||||
if lookahead < len(lines):
|
||||
next_line = lines[lookahead].strip()
|
||||
if is_source_line(next_line):
|
||||
# Extract custom source, removing markdown formatting
|
||||
custom_source = extract_source_from_line(next_line)
|
||||
# Ensure it ends with a period
|
||||
if custom_source and not custom_source.endswith('.'):
|
||||
custom_source += '.'
|
||||
# Skip this line by advancing i past it
|
||||
i = lookahead
|
||||
|
||||
if custom_source:
|
||||
source_html = md_to_html_para(custom_source)
|
||||
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: {source_html}</span></p>''')
|
||||
else:
|
||||
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')
|
||||
|
||||
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')
|
||||
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>')
|
||||
i += 1
|
||||
continue
|
||||
@@ -192,17 +248,28 @@ def parse_md_to_html_blocks(md_content):
|
||||
# Headers - ## becomes h2, ### becomes h3
|
||||
if line.startswith('####'):
|
||||
text = line.lstrip('#').strip()
|
||||
html_blocks.append(f'<h4><span lang=ES>{text}</span></h4>')
|
||||
# Apply consistent styling like h2/h3, disable numbering for h4
|
||||
html_blocks.append(f'<h4 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h4>')
|
||||
i += 1
|
||||
continue
|
||||
elif line.startswith('###'):
|
||||
text = line.lstrip('#').strip()
|
||||
html_blocks.append(f'<h3 style="mso-list:l22 level3 lfo18"><span lang=ES style="text-transform:none">{text}</span></h3>')
|
||||
# Disable auto-numbering for Anexo content or A.x headings
|
||||
if is_anexo or re.match(r'^A\.\d+', text):
|
||||
# mso-list:none explicitly disables inherited list numbering from template CSS
|
||||
html_blocks.append(f'<h3 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h3>')
|
||||
else:
|
||||
html_blocks.append(f'<h3 style="mso-list:l22 level3 lfo18"><span lang=ES style="text-transform:none">{text}</span></h3>')
|
||||
i += 1
|
||||
continue
|
||||
elif line.startswith('##'):
|
||||
text = line.lstrip('#').strip()
|
||||
html_blocks.append(f'<h2 style="mso-list:l22 level2 lfo18"><span lang=ES style="text-transform:none">{text}</span></h2>')
|
||||
# Disable auto-numbering for Anexo content or A.x headings
|
||||
if is_anexo or re.match(r'^A\.\d+', text):
|
||||
# mso-list:none explicitly disables inherited list numbering from template CSS
|
||||
html_blocks.append(f'<h2 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h2>')
|
||||
else:
|
||||
html_blocks.append(f'<h2 style="mso-list:l22 level2 lfo18"><span lang=ES style="text-transform:none">{text}</span></h2>')
|
||||
i += 1
|
||||
continue
|
||||
elif line.startswith('#'):
|
||||
@@ -212,7 +279,13 @@ def parse_md_to_html_blocks(md_content):
|
||||
|
||||
# Table - check for table title pattern first
|
||||
if '|' in line and i + 1 < len(lines) and '---' in lines[i + 1]:
|
||||
table_counter += 1
|
||||
# Use Anexo-specific counter with "A" prefix, or global counter
|
||||
if is_anexo:
|
||||
anexo_table_counter += 1
|
||||
table_num = f"A{anexo_table_counter}"
|
||||
else:
|
||||
table_counter += 1
|
||||
table_num = str(table_counter)
|
||||
|
||||
# Check if previous line has table title (e.g., **Tabla 1.** *Title*)
|
||||
table_title = None
|
||||
@@ -239,15 +312,18 @@ def parse_md_to_html_blocks(md_content):
|
||||
table_lines.append(lines[i])
|
||||
i += 1
|
||||
|
||||
# Look ahead for source
|
||||
if i < len(lines) and 'Fuente:' in lines[i]:
|
||||
table_source = lines[i].replace('*', '').replace('Fuente:', '').strip()
|
||||
i += 1
|
||||
# Look ahead for source (skip blank lines first)
|
||||
source_idx = i
|
||||
while source_idx < len(lines) and not lines[source_idx].strip():
|
||||
source_idx += 1
|
||||
if source_idx < len(lines) and is_source_line(lines[source_idx]):
|
||||
table_source = extract_source_from_line(lines[source_idx])
|
||||
i = source_idx + 1
|
||||
|
||||
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference
|
||||
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
|
||||
# Word TOC looks for text with Caption style - anchor must be outside main caption text
|
||||
bookmark_id = f"_Ref_Tab{table_counter}"
|
||||
bookmark_id = f"_Ref_Tab{table_num}"
|
||||
if table_title:
|
||||
# Remove any "Tabla X." or "Tabla AX." pattern from the title
|
||||
clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
|
||||
@@ -256,10 +332,14 @@ def parse_md_to_html_blocks(md_content):
|
||||
clean_title = alt_title
|
||||
else:
|
||||
clean_title = "Tabla de datos."
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
|
||||
# For Anexo tables, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
|
||||
if is_anexo:
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||
else:
|
||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||
|
||||
# Build table HTML with APA style (horizontal lines only, no vertical)
|
||||
# Wrap in centered div for Word compatibility
|
||||
table_html = '<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
|
||||
for j, tline in enumerate(table_lines):
|
||||
cells = [c.strip() for c in tline.split('|')[1:-1]]
|
||||
@@ -278,8 +358,11 @@ def parse_md_to_html_blocks(md_content):
|
||||
table_html += '</table></div>'
|
||||
html_blocks.append(table_html)
|
||||
|
||||
# Add source with proper template format
|
||||
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {table_source}.</span></p>')
|
||||
# Add source with proper template format (convert markdown links to HTML)
|
||||
source_html = md_to_html_para(table_source)
|
||||
if not table_source.endswith('.'):
|
||||
source_html += '.'
|
||||
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')
|
||||
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>')
|
||||
continue
|
||||
|
||||
@@ -320,9 +403,9 @@ def parse_md_to_html_blocks(md_content):
|
||||
if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
|
||||
i += 1
|
||||
continue
|
||||
if line.strip().startswith('*Fuente:') or line.strip().startswith('Fuente:'):
|
||||
i += 1
|
||||
continue
|
||||
if is_source_line(line):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Regular paragraph
|
||||
para_lines = [line]
|
||||
@@ -341,10 +424,10 @@ def parse_md_to_html_blocks(md_content):
|
||||
|
||||
return '\n\n'.join(html_blocks)
|
||||
|
||||
def extract_section_content(md_content):
|
||||
def extract_section_content(md_content, is_anexo=False):
|
||||
"""Extract content from markdown, skipping the first # header."""
|
||||
md_content = re.sub(r'^#\s+[^\n]+\n+', '', md_content, count=1)
|
||||
return parse_md_to_html_blocks(md_content)
|
||||
return parse_md_to_html_blocks(md_content, is_anexo=is_anexo)
|
||||
|
||||
def find_section_element(soup, keyword):
|
||||
"""Find element containing keyword (h1 or special paragraph classes)."""
|
||||
@@ -396,6 +479,16 @@ def format_references(refs_content):
|
||||
|
||||
return refs_html
|
||||
|
||||
def split_into_paragraphs(text, lang='ES'):
    """Split text on blank lines and wrap each paragraph in <p> tags.

    Args:
        text: Markdown text whose paragraphs are separated by blank lines.
        lang: Value written into the ``lang`` attribute of each span.

    Returns:
        The paragraphs, each passed through md_to_html_para and wrapped in a
        ``<p class=MsoNormal>`` element, joined with newlines. Empty
        paragraphs are dropped, so empty input yields an empty string.
    """
    paragraphs = []
    # A blank line may still contain spaces/tabs or a CR from CRLF endings;
    # a literal '\n\n' split would glue such paragraphs together.
    for para in re.split(r'\n\s*\n', text):
        para = para.strip()
        if para:
            formatted = md_to_html_para(para)
            paragraphs.append(f'<p class=MsoNormal><span lang={lang}>{formatted}</span></p>')
    return '\n'.join(paragraphs)
|
||||
|
||||
def extract_resumen_parts(resumen_content):
|
||||
"""Extract Spanish resumen and English abstract from 00_resumen.md"""
|
||||
parts = resumen_content.split('---')
|
||||
@@ -408,25 +501,25 @@ def extract_resumen_parts(resumen_content):
|
||||
spanish_keywords = ''
|
||||
if '**Palabras clave:**' in spanish_part:
|
||||
text_part, kw_part = spanish_part.split('**Palabras clave:**')
|
||||
spanish_text = text_part.replace('# Resumen', '').strip()
|
||||
spanish_keywords = kw_part.strip()
|
||||
spanish_text = split_into_paragraphs(text_part.replace('# Resumen', '').strip(), 'ES')
|
||||
spanish_keywords = md_to_html_para(kw_part.strip())
|
||||
else:
|
||||
spanish_text = spanish_part.replace('# Resumen', '').strip()
|
||||
spanish_text = split_into_paragraphs(spanish_part.replace('# Resumen', '').strip(), 'ES')
|
||||
|
||||
# Extract English content
|
||||
english_text = ''
|
||||
english_keywords = ''
|
||||
if '**Keywords:**' in english_part:
|
||||
text_part, kw_part = english_part.split('**Keywords:**')
|
||||
english_text = text_part.replace('# Abstract', '').strip()
|
||||
english_keywords = kw_part.strip()
|
||||
english_text = split_into_paragraphs(text_part.replace('# Abstract', '').strip(), 'EN-US')
|
||||
english_keywords = md_to_html_para(kw_part.strip())
|
||||
else:
|
||||
english_text = english_part.replace('# Abstract', '').strip()
|
||||
english_text = split_into_paragraphs(english_part.replace('# Abstract', '').strip(), 'EN-US')
|
||||
|
||||
return spanish_text, spanish_keywords, english_text, english_keywords
|
||||
|
||||
def main():
|
||||
global table_counter, figure_counter
|
||||
global table_counter, figure_counter, anexo_table_counter, anexo_figure_counter
|
||||
|
||||
print("Reading template...")
|
||||
html_content = read_file(TEMPLATE_INPUT)
|
||||
@@ -470,8 +563,8 @@ def main():
|
||||
if hasattr(elem, 'decompose'):
|
||||
elem.decompose()
|
||||
|
||||
# Insert new resumen content
|
||||
resumen_html = f'''<p class=MsoNormal><span lang=ES>{spanish_text}</span></p>
|
||||
# Insert new resumen content (spanish_text already contains <p> tags)
|
||||
resumen_html = f'''{spanish_text}
|
||||
<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>
|
||||
<p class=MsoNormal><b><span lang=ES>Palabras clave:</span></b><span lang=ES> {spanish_kw}</span></p>
|
||||
<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>'''
|
||||
@@ -502,8 +595,8 @@ def main():
|
||||
if hasattr(elem, 'decompose'):
|
||||
elem.decompose()
|
||||
|
||||
# Insert new abstract content
|
||||
abstract_html = f'''<p class=MsoNormal><span lang=EN-US>{english_text}</span></p>
|
||||
# Insert new abstract content (english_text already contains <p> tags)
|
||||
abstract_html = f'''{english_text}
|
||||
<p class=MsoNormal><span lang=EN-US><o:p> </o:p></span></p>
|
||||
<p class=MsoNormal><b><span lang=EN-US>Keywords:</span></b><span lang=EN-US> {english_kw}</span></p>
|
||||
<p class=MsoNormal><span lang=EN-US><o:p> </o:p></span></p>'''
|
||||
@@ -578,10 +671,10 @@ def main():
|
||||
# Also remove surrounding caption and source
|
||||
prev_sib = table.find_previous_sibling()
|
||||
next_sib = table.find_next_sibling()
|
||||
if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
|
||||
prev_sib.decompose()
|
||||
if next_sib and 'Fuente:' in next_sib.get_text():
|
||||
next_sib.decompose()
|
||||
if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
|
||||
prev_sib.decompose()
|
||||
if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
|
||||
next_sib.decompose()
|
||||
table.decompose()
|
||||
print(" ✓ Removed template table example")
|
||||
break
|
||||
@@ -648,14 +741,14 @@ def main():
|
||||
current.extract()
|
||||
current = next_elem
|
||||
|
||||
anexo_content = extract_section_content(docs['anexo'])
|
||||
anexo_content = extract_section_content(docs['anexo'], is_anexo=True)
|
||||
anexo_soup = BeautifulSoup(anexo_content, 'html.parser')
|
||||
insert_point = anexo_elem
|
||||
for new_elem in reversed(list(anexo_soup.children)):
|
||||
insert_point.insert_after(new_elem)
|
||||
print(f" ✓ Replaced content")
|
||||
|
||||
print(f"\nSummary: {table_counter} tables, {figure_counter} figures processed")
|
||||
print(f"\nSummary: {table_counter} tables + {anexo_table_counter} Anexo tables, {figure_counter} figures + {anexo_figure_counter} Anexo figures processed")
|
||||
|
||||
print("Saving modified template...")
|
||||
output_html = str(soup)
|
||||
|
||||
Reference in New Issue
Block a user