Documentation review. (#5)
All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_paddle_ocr (push) Successful in 5m28s
build_docker / build_paddle_ocr_gpu (push) Successful in 21m16s
build_docker / build_easyocr (push) Successful in 15m52s
build_docker / build_easyocr_gpu (push) Successful in 18m22s
build_docker / build_doctr (push) Successful in 19m3s
build_docker / build_raytune (push) Successful in 3m34s
build_docker / build_doctr_gpu (push) Successful in 13m56s

This commit was merged in pull request #5.
This commit is contained in:
2026-01-20 14:33:46 +00:00
committed by Sergio Jimenez Jimenez
parent c7ed7b2b9c
commit 9ee2490097
56 changed files with 2182 additions and 945 deletions

View File

@@ -3,8 +3,10 @@
import re
import os
import shutil
from bs4 import BeautifulSoup, NavigableString
from latex2mathml.converter import convert as latex_to_mathml
from PIL import Image
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
@@ -120,13 +122,13 @@ def parse_md_to_html_blocks(md_content):
mermaid_lines.append(lines[i])
i += 1
# Try to extract title from mermaid content (YAML format: title: "...")
# Try to extract title from mermaid content (YAML format)
mermaid_content = '\n'.join(mermaid_lines)
# Match YAML format: title: "Title" or title: 'Title'
# Match title with quotes: title: "Something" or title: 'Something'
title_match = re.search(r'title:\s*["\']([^"\']+)["\']', mermaid_content)
if not title_match:
# Fallback to non-YAML format: title "Title"
title_match = re.search(r'title\s+["\']?([^"\'"\n]+)["\']?', mermaid_content)
# Match title without quotes: title: Something
title_match = re.search(r'title:\s*([^"\'\n]+)', mermaid_content)
if title_match:
fig_title = title_match.group(1).strip()
else:
@@ -143,8 +145,24 @@ def parse_md_to_html_blocks(md_content):
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{figure_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
if os.path.exists(fig_path):
# Use Word-compatible width in cm (A4 text area is ~16cm wide, use ~12cm max)
html_blocks.append(f'''<p class=MsoNormal style="text-align:center"><span lang=ES><img style="width:12cm;max-width:100%" src="{fig_file}" alt="{fig_title}"/></span></p>''')
# Read actual image dimensions and scale to fit page width
img = Image.open(fig_path)
orig_w, orig_h = img.size
# Scale to fit max width of 566px (15cm at 96dpi) while preserving aspect ratio
max_width = 566
if orig_w > max_width:
scale = max_width / orig_w
new_w = max_width
new_h = int(orig_h * scale)
else:
new_w, new_h = orig_w, orig_h
# Convert to pt (1px at 96dpi = 0.75pt)
w_pt = new_w * 0.75
h_pt = new_h * 0.75
html_blocks.append(f'''<p class=MsoNormal style="text-align:center"><span lang=ES><img width="{new_w}" height="{new_h}" style="width:{w_pt}pt;height:{h_pt}pt;display:block;margin:0 auto" src="{fig_file}" alt="{fig_title}"/></span></p>''')
else:
# Fallback to placeholder
html_blocks.append(f'''<p class=MsoNormal style="text-align:center;border:1px dashed #999;padding:20px;margin:10px 40px;background:#f9f9f9"><span lang=ES style="color:#666">[Insertar diagrama Mermaid aquí]</span></p>''')
@@ -165,7 +183,9 @@ def parse_md_to_html_blocks(md_content):
code = '\n'.join(code_lines)
# Escape HTML entities in code
code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
html_blocks.append(f'<p class=MsoNormal style="margin-left:1cm"><span style="font-family:Consolas;font-size:9pt"><pre>{code}</pre></span></p>')
html_blocks.append(f'''<div style="background:#E6F4F9;border-top:solid #0098CD .5pt;border-bottom:solid #0098CD .5pt;padding:8pt 12pt;margin:6pt 0">
<pre style="font-family:Consolas,monospace;font-size:9pt;color:#333333;margin:0;white-space:pre-wrap;word-wrap:break-word">{code}</pre>
</div>''')
i += 1
continue
@@ -239,7 +259,8 @@ def parse_md_to_html_blocks(md_content):
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
# Build table HTML with APA style (horizontal lines only, no vertical)
table_html = '<table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
# Wrap in centered div for Word compatibility
table_html = '<div align="center"><table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 align="center" style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
for j, tline in enumerate(table_lines):
cells = [c.strip() for c in tline.split('|')[1:-1]]
table_html += '<tr>'
@@ -254,7 +275,7 @@ def parse_md_to_html_blocks(md_content):
# Middle rows: no borders
table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
table_html += '</tr>'
table_html += '</table>'
table_html += '</table></div>'
html_blocks.append(table_html)
# Add source with proper template format
@@ -269,7 +290,7 @@ def parse_md_to_html_blocks(md_content):
while i < len(lines) and lines[i].startswith('>'):
quote_text += ' ' + lines[i][1:].strip()
i += 1
html_blocks.append(f'<p class=MsoNormal style="margin-left:2cm;margin-right:1cm"><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>')
html_blocks.append(f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>')
continue
# Bullet list
@@ -640,6 +661,15 @@ def main():
output_html = str(soup)
write_file(TEMPLATE_OUTPUT, output_html)
# Copy template support files (header.htm, images, etc.)
support_files_src = os.path.join(BASE_DIR, 'instructions/plantilla_individual_files')
support_files_dst = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual_files')
if os.path.exists(support_files_src):
if os.path.exists(support_files_dst):
shutil.rmtree(support_files_dst)
shutil.copytree(support_files_src, support_files_dst)
print(f"✓ Copied template support files")
print(f"✓ Done! Modified: {TEMPLATE_OUTPUT}")
print("\nTo convert to DOCX:")
print("1. Open the .htm file in Microsoft Word")