Paddle ocr, easyicr and doctr gpu support. (#4)
All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_cpu (push) Successful in 5m0s
build_docker / build_gpu (push) Successful in 22m55s
build_docker / build_easyocr (push) Successful in 18m47s
build_docker / build_easyocr_gpu (push) Successful in 19m0s
build_docker / build_raytune (push) Successful in 3m27s
build_docker / build_doctr (push) Successful in 19m42s
build_docker / build_doctr_gpu (push) Successful in 14m49s

This commit was merged in pull request #4.
This commit is contained in:
2026-01-19 17:35:24 +00:00
committed by Sergio Jimenez Jimenez
parent 8e2b7a5096
commit c7ed7b2b9c
105 changed files with 8170 additions and 1263 deletions

View File

@@ -4,9 +4,11 @@
import re
import os
from bs4 import BeautifulSoup, NavigableString
from latex2mathml.converter import convert as latex_to_mathml
BASE_DIR = '/Users/sergio/Desktop/MastersThesis'
TEMPLATE = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
# Global counters for tables and figures
@@ -33,6 +35,32 @@ def md_to_html_para(text):
text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
# Inline code
text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt">\1</span>', text)
# Links [text](url) -> <a href="url">text</a>
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
return text
def convert_latex_formulas(text):
"""Convert LaTeX formulas to MathML for Word compatibility."""
# Block formulas $$...$$
def convert_block(match):
latex = match.group(1)
try:
mathml = latex_to_mathml(latex, display="block")
return f'<p class=MsoNormal style="text-align:center">{mathml}</p>'
except:
return match.group(0) # Keep original if conversion fails
text = re.sub(r'\$\$([^$]+)\$\$', convert_block, text)
# Inline formulas $...$
def convert_inline(match):
latex = match.group(1)
try:
return latex_to_mathml(latex, display="inline")
except:
return match.group(0)
text = re.sub(r'\$([^$]+)\$', convert_inline, text)
return text
def extract_table_title(lines, current_index):
@@ -168,6 +196,7 @@ def parse_md_to_html_blocks(md_content):
# Check if previous line has table title (e.g., **Tabla 1.** *Title*)
table_title = None
alt_title = None # Alternative title from **bold text:** pattern
table_source = "Elaboración propia"
# Look back for table title
@@ -177,6 +206,9 @@ def parse_md_to_html_blocks(md_content):
# Extract title text
table_title = re.sub(r'\*+', '', prev_line).strip()
break
elif prev_line.startswith('**') and prev_line.endswith(':**'):
# Alternative: **Bold title:** pattern (for informal tables)
alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
elif prev_line and not prev_line.startswith('|'):
break
@@ -197,26 +229,30 @@ def parse_md_to_html_blocks(md_content):
# Word TOC looks for text with Caption style - anchor must be outside main caption text
bookmark_id = f"_Ref_Tab{table_counter}"
if table_title:
clean_title = table_title.replace(f"Tabla {table_counter}.", "").strip()
# Remove any "Tabla X." or "Tabla AX." pattern from the title
clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
elif alt_title:
# Use alternative title from **bold text:** pattern
clean_title = alt_title
else:
clean_title = "Tabla de datos."
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
# Build table HTML with APA style (horizontal lines only, no vertical)
table_html = '<table class=MsoTableGrid border=0 cellspacing=0 cellpadding=0 style="border-collapse:collapse;border:none">'
table_html = '<table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
for j, tline in enumerate(table_lines):
cells = [c.strip() for c in tline.split('|')[1:-1]]
table_html += '<tr>'
for cell in cells:
if j == 0:
# Header row: top and bottom border, bold text
table_html += f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
table_html += f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
elif j == len(table_lines) - 1:
# Last row: bottom border only
table_html += f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
table_html += f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
else:
# Middle rows: no borders
table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
table_html += '</tr>'
table_html += '</table>'
html_blocks.append(table_html)
@@ -240,6 +276,7 @@ def parse_md_to_html_blocks(md_content):
if re.match(r'^[\-\*\+]\s', line):
while i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
item_text = lines[i][2:].strip()
item_text = convert_latex_formulas(item_text)
html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{md_to_html_para(item_text)}</span></p>')
i += 1
continue
@@ -249,6 +286,7 @@ def parse_md_to_html_blocks(md_content):
num = 1
while i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
item_text = convert_latex_formulas(item_text)
html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt">&nbsp;&nbsp;&nbsp;</span>{md_to_html_para(item_text)}</span></p>')
num += 1
i += 1
@@ -273,7 +311,12 @@ def parse_md_to_html_blocks(md_content):
i += 1
para_text = ' '.join(para_lines)
html_blocks.append(f'<p class=MsoNormal><span lang=ES>{md_to_html_para(para_text)}</span></p>')
para_text = convert_latex_formulas(para_text)
# Check if paragraph contains MathML (already wrapped)
if '<math' in para_text:
html_blocks.append(para_text)
else:
html_blocks.append(f'<p class=MsoNormal><span lang=ES>{md_to_html_para(para_text)}</span></p>')
return '\n\n'.join(html_blocks)
@@ -365,7 +408,7 @@ def main():
global table_counter, figure_counter
print("Reading template...")
html_content = read_file(TEMPLATE)
html_content = read_file(TEMPLATE_INPUT)
soup = BeautifulSoup(html_content, 'html.parser')
print("Reading docs content...")
@@ -595,9 +638,9 @@ def main():
print("Saving modified template...")
output_html = str(soup)
write_file(TEMPLATE, output_html)
write_file(TEMPLATE_OUTPUT, output_html)
print(f"✓ Done! Modified: {TEMPLATE}")
print(f"✓ Done! Modified: {TEMPLATE_OUTPUT}")
print("\nTo convert to DOCX:")
print("1. Open the .htm file in Microsoft Word")
print("2. Replace [Insertar diagrama Mermaid aquí] placeholders with actual diagrams")