PaddleOCR, EasyOCR and docTR GPU support. (#4)

2026-01-19 17:35:24 +00:00
parent 8e2b7a5096
commit c7ed7b2b9c
105 changed files with 8170 additions and 1263 deletions
--- a/apply_content.py
+++ b/apply_content.py
@@ -4,9 +4,11 @@
 import re
 import os
 from bs4 import BeautifulSoup, NavigableString
+from latex2mathml.converter import convert as latex_to_mathml

-BASE_DIR = '/Users/sergio/Desktop/MastersThesis'
-TEMPLATE = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
+TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
 DOCS_DIR = os.path.join(BASE_DIR, 'docs')

 # Global counters for tables and figures
@@ -33,6 +35,32 @@ def md_to_html_para(text):
    text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
    # Inline code
    text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt">\1</span>', text)
+    # Links [text](url) -> <a href="url">text</a>
+    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
+    return text
+
+def convert_latex_formulas(text):
+    """Convert LaTeX formulas to MathML for Word compatibility."""
+    # Block formulas $$...$$ (this pass runs before the inline pass below)
+    def convert_block(match):
+        latex = match.group(1)  # formula body between the $$ delimiters
+        try:
+            mathml = latex_to_mathml(latex, display="block")
+            return f'<p class=MsoNormal style="text-align:center">{mathml}</p>'
+        except:  # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit
+            return match.group(0)  # Keep original if conversion fails
+
+    text = re.sub(r'\$\$([^$]+)\$\$', convert_block, text)  # body must not contain '$'
+
+    # Inline formulas $...$ (NOTE(review): pairs any two '$' with no '$' between, e.g. prices)
+    def convert_inline(match):
+        latex = match.group(1)  # formula body between the single $ delimiters
+        try:
+            return latex_to_mathml(latex, display="inline")
+        except:  # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit
+            return match.group(0)  # Keep original if conversion fails
+
+    text = re.sub(r'\$([^$]+)\$', convert_inline, text)  # body must not contain '$'
    return text

 def extract_table_title(lines, current_index):
@@ -168,6 +196,7 @@ def parse_md_to_html_blocks(md_content):

            # Check if previous line has table title (e.g., **Tabla 1.** *Title*)
            table_title = None
+            alt_title = None  # Alternative title from **bold text:** pattern
            table_source = "Elaboración propia"

            # Look back for table title
@@ -177,6 +206,9 @@ def parse_md_to_html_blocks(md_content):
                    # Extract title text
                    table_title = re.sub(r'\*+', '', prev_line).strip()
                    break
+                elif prev_line.startswith('**') and prev_line.endswith(':**'):
+                    # Alternative: **Bold title:** pattern (for informal tables)
+                    alt_title = re.sub(r'\*+', '', prev_line).rstrip(':').strip()
                elif prev_line and not prev_line.startswith('|'):
                    break

@@ -197,26 +229,30 @@ def parse_md_to_html_blocks(md_content):
            # Word TOC looks for text with Caption style - anchor must be outside main caption text
            bookmark_id = f"_Ref_Tab{table_counter}"
            if table_title:
-                clean_title = table_title.replace(f"Tabla {table_counter}.", "").strip()
+                # Remove any "Tabla X." or "Tabla AX." pattern from the title
+                clean_title = re.sub(r'^Tabla\s+[A-Z]?\d+\.\s*', '', table_title).strip()
+            elif alt_title:
+                # Use alternative title from **bold text:** pattern
+                clean_title = alt_title
            else:
                clean_title = "Tabla de datos."
            html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_counter}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')

            # Build table HTML with APA style (horizontal lines only, no vertical)
-            table_html = '<table class=MsoTableGrid border=0 cellspacing=0 cellpadding=0 style="border-collapse:collapse;border:none">'
+            table_html = '<table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 style="border-collapse:collapse;margin-left:auto;margin-right:auto;mso-table-style-name:\'Plain Table 1\'">'
            for j, tline in enumerate(table_lines):
                cells = [c.strip() for c in tline.split('|')[1:-1]]
                table_html += '<tr>'
                for cell in cells:
                    if j == 0:
                        # Header row: top and bottom border, bold text
-                        table_html += f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
+                        table_html += f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
                    elif j == len(table_lines) - 1:
                        # Last row: bottom border only
-                        table_html += f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
+                        table_html += f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
                    else:
                        # Middle rows: no borders
-                        table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
+                        table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0;text-align:center"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
                table_html += '</tr>'
            table_html += '</table>'
            html_blocks.append(table_html)
@@ -240,6 +276,7 @@ def parse_md_to_html_blocks(md_content):
        if re.match(r'^[\-\*\+]\s', line):
            while i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
                item_text = lines[i][2:].strip()
+                item_text = convert_latex_formulas(item_text)
                html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{md_to_html_para(item_text)}</span></p>')
                i += 1
            continue
@@ -249,6 +286,7 @@ def parse_md_to_html_blocks(md_content):
            num = 1
            while i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
                item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
+                item_text = convert_latex_formulas(item_text)
                html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt">&nbsp;&nbsp;&nbsp;</span>{md_to_html_para(item_text)}</span></p>')
                num += 1
                i += 1
@@ -273,7 +311,12 @@ def parse_md_to_html_blocks(md_content):
            i += 1

        para_text = ' '.join(para_lines)
-        html_blocks.append(f'<p class=MsoNormal><span lang=ES>{md_to_html_para(para_text)}</span></p>')
+        para_text = convert_latex_formulas(para_text)
+        # Check if paragraph contains MathML (already wrapped)
+        if '<math' in para_text:
+            html_blocks.append(para_text)
+        else:
+            html_blocks.append(f'<p class=MsoNormal><span lang=ES>{md_to_html_para(para_text)}</span></p>')

    return '\n\n'.join(html_blocks)

@@ -365,7 +408,7 @@ def main():
    global table_counter, figure_counter

    print("Reading template...")
-    html_content = read_file(TEMPLATE)
+    html_content = read_file(TEMPLATE_INPUT)
    soup = BeautifulSoup(html_content, 'html.parser')

    print("Reading docs content...")
@@ -595,9 +638,9 @@ def main():

    print("Saving modified template...")
    output_html = str(soup)
-    write_file(TEMPLATE, output_html)
+    write_file(TEMPLATE_OUTPUT, output_html)

-    print(f"✓ Done! Modified: {TEMPLATE}")
+    print(f"✓ Done! Modified: {TEMPLATE_OUTPUT}")
    print("\nTo convert to DOCX:")
    print("1. Open the .htm file in Microsoft Word")
    print("2. Replace [Insertar diagrama Mermaid aquí] placeholders with actual diagrams")