Leyenda
Some checks failed
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 4m0s
build_docker / build_paddle_ocr_gpu (push) Successful in 18m53s
build_docker / build_easyocr (push) Successful in 16m12s
build_docker / build_easyocr_gpu (push) Successful in 22m37s
build_docker / build_doctr (push) Successful in 21m22s
build_docker / build_raytune (push) Successful in 2m50s
build_docker / build_doctr_gpu (push) Has been cancelled

This commit is contained in:
2026-02-04 19:56:30 +01:00
parent d746a3c73f
commit 868f748a8d
4 changed files with 617 additions and 357 deletions

View File

@@ -4,23 +4,25 @@
import re import re
import os import os
import shutil import shutil
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
from latex2mathml.converter import convert as latex_to_mathml from latex2mathml.converter import convert as latex_to_mathml
from PIL import Image from PIL import Image
BASE_DIR = os.path.dirname(os.path.abspath(__file__)) BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm') TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm') TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
DOCS_DIR = os.path.join(BASE_DIR, 'docs') DOCS_DIR = os.path.join(BASE_DIR, 'docs')
# Accept Fuente/Source lines with or without markdown bold # Accept Fuente/Source lines with or without markdown bold
SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE) SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
# Accept Leyenda lines with or without markdown bold
# Global counters for tables and figures LEYENDA_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?Leyenda:(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
table_counter = 0
figure_counter = 0 # Global counters for tables and figures
anexo_table_counter = 0 table_counter = 0
anexo_figure_counter = 0 figure_counter = 0
anexo_table_counter = 0
anexo_figure_counter = 0
# Global sequential counter for figure filenames (figura_1.png, figura_2.png, etc.) # Global sequential counter for figure filenames (figura_1.png, figura_2.png, etc.)
global_figure_index = 0 global_figure_index = 0
@@ -48,7 +50,7 @@ def md_to_html_para(text):
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text) text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
return text return text
def convert_latex_formulas(text): def convert_latex_formulas(text):
"""Convert LaTeX formulas to MathML for Word compatibility.""" """Convert LaTeX formulas to MathML for Word compatibility."""
# Block formulas $$...$$ # Block formulas $$...$$
def convert_block(match): def convert_block(match):
@@ -69,22 +71,33 @@ def convert_latex_formulas(text):
except: except:
return match.group(0) return match.group(0)
text = re.sub(r'\$([^$]+)\$', convert_inline, text) text = re.sub(r'\$([^$]+)\$', convert_inline, text)
return text return text
def extract_source_from_line(line): def extract_source_from_line(line):
"""Return source text if line is a Fuente/Source line, otherwise None.""" """Return source text if line is a Fuente/Source line, otherwise None."""
match = SOURCE_LINE_RE.match(line.strip()) match = SOURCE_LINE_RE.match(line.strip())
if not match: if not match:
return None return None
return match.group(2).strip() return match.group(2).strip()
def is_source_line(line): def is_source_line(line):
"""Check whether a line starts with Fuente:/Source: (optionally bold).""" """Check whether a line starts with Fuente:/Source: (optionally bold)."""
return SOURCE_LINE_RE.match(line.strip()) is not None return SOURCE_LINE_RE.match(line.strip()) is not None
def extract_table_title(lines, current_index): def extract_leyenda_from_line(line):
"""Look for table title in preceding lines (e.g., **Tabla 1.** *Title*).""" """Return leyenda text if line is a Leyenda line, otherwise None."""
match = LEYENDA_LINE_RE.match(line.strip())
if not match:
return None
return match.group(1).strip()
def is_leyenda_line(line):
"""Check whether a line starts with Leyenda: (optionally bold)."""
return LEYENDA_LINE_RE.match(line.strip()) is not None
def extract_table_title(lines, current_index):
"""Look for table title in preceding lines (e.g., **Tabla 1.** *Title*)."""
# Check previous non-empty lines for table title # Check previous non-empty lines for table title
for i in range(current_index - 1, max(0, current_index - 5), -1): for i in range(current_index - 1, max(0, current_index - 5), -1):
line = lines[i].strip() line = lines[i].strip()
@@ -172,8 +185,11 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
bookmark_id = f"_Ref_Fig{fig_num}" bookmark_id = f"_Ref_Fig{fig_num}"
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property) # mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
# For Anexo figures, use static text (no SEQ field) to prevent Word from overwriting A1, A2... # For Anexo figures, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
# Add TC field so Anexo figures appear in Table of Figures index
# Use \f c to match the TOC field identifier in the template
if is_anexo: if is_anexo:
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''') tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Figura {fig_num}. {fig_title}" \\f c \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
else: else:
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''') html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
@@ -204,19 +220,27 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
# Check if next non-empty line has custom Fuente # Check if next non-empty line has custom Fuente
custom_source = None custom_source = None
fig_leyenda = None
lookahead = i + 1 lookahead = i + 1
while lookahead < len(lines) and not lines[lookahead].strip(): while lookahead < len(lines) and not lines[lookahead].strip():
lookahead += 1 lookahead += 1
if lookahead < len(lines): if lookahead < len(lines):
next_line = lines[lookahead].strip() next_line = lines[lookahead].strip()
if is_source_line(next_line): if is_source_line(next_line):
# Extract custom source, removing markdown formatting # Extract custom source, removing markdown formatting
custom_source = extract_source_from_line(next_line) custom_source = extract_source_from_line(next_line)
# Ensure it ends with a period # Ensure it ends with a period
if custom_source and not custom_source.endswith('.'): if custom_source and not custom_source.endswith('.'):
custom_source += '.' custom_source += '.'
# Skip this line by advancing i past it # Skip this line by advancing i past it
i = lookahead i = lookahead
# Check for Leyenda after source
leyenda_idx = i + 1
while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
leyenda_idx += 1
if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
fig_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
i = leyenda_idx
if custom_source: if custom_source:
source_html = md_to_html_para(custom_source) source_html = md_to_html_para(custom_source)
@@ -224,6 +248,13 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
else: else:
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''') html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')
# Add leyenda if present (same style as Fuente, new line)
if fig_leyenda:
leyenda_html = md_to_html_para(fig_leyenda)
if not fig_leyenda.endswith('.'):
leyenda_html += '.'
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Leyenda: {leyenda_html}</span></p>''')
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>') html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')
i += 1 i += 1
continue continue
@@ -249,7 +280,7 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
if line.startswith('####'): if line.startswith('####'):
text = line.lstrip('#').strip() text = line.lstrip('#').strip()
# Apply consistent styling like h2/h3, disable numbering for h4 # Apply consistent styling like h2/h3, disable numbering for h4
html_blocks.append(f'<h4 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h4>') html_blocks.append(f'<h4 style="mso-list:none"><b><span lang=ES style="text-transform:none">{text}</span></b></h4>')
i += 1 i += 1
continue continue
elif line.startswith('###'): elif line.startswith('###'):
@@ -314,11 +345,19 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
# Look ahead for source (skip blank lines first) # Look ahead for source (skip blank lines first)
source_idx = i source_idx = i
while source_idx < len(lines) and not lines[source_idx].strip(): table_leyenda = None
source_idx += 1 while source_idx < len(lines) and not lines[source_idx].strip():
if source_idx < len(lines) and is_source_line(lines[source_idx]): source_idx += 1
table_source = extract_source_from_line(lines[source_idx]) if source_idx < len(lines) and is_source_line(lines[source_idx]):
i = source_idx + 1 table_source = extract_source_from_line(lines[source_idx])
i = source_idx + 1
# Check for Leyenda after source (skip blank lines)
leyenda_idx = i
while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
leyenda_idx += 1
if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
table_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
i = leyenda_idx + 1
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference # Add table title with MsoCaption class and proper Word SEQ field for cross-reference
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines) # Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
@@ -334,8 +373,11 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
clean_title = "Tabla de datos." clean_title = "Tabla de datos."
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property) # mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
# For Anexo tables, use static text (no SEQ field) to prevent Word from overwriting A1, A2... # For Anexo tables, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
# Add TC field so Anexo tables appear in Table of Tables index
# Use \f t identifier - template TOC field will be modified to include this
if is_anexo: if is_anexo:
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''') tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Tabla {table_num}. {clean_title}" \\f t \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
else: else:
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''') html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
@@ -363,6 +405,14 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
if not table_source.endswith('.'): if not table_source.endswith('.'):
source_html += '.' source_html += '.'
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>') html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')
# Add leyenda if present (same style as Fuente, new line)
if table_leyenda:
leyenda_html = md_to_html_para(table_leyenda)
if not table_leyenda.endswith('.'):
leyenda_html += '.'
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Leyenda: {leyenda_html}</span></p>')
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>') html_blocks.append('<p class=MsoNormal><span lang=ES><o:p>&nbsp;</o:p></span></p>')
continue continue
@@ -376,24 +426,63 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
html_blocks.append(f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>') html_blocks.append(f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>')
continue continue
# Bullet list # Bullet list (handle blank lines between items)
if re.match(r'^[\-\*\+]\s', line): if re.match(r'^[\-\*\+]\s', line):
while i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]): # Collect all bullet items first
item_text = lines[i][2:].strip() bullet_items = []
item_text = convert_latex_formulas(item_text) while i < len(lines):
html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{md_to_html_para(item_text)}</span></p>') # Skip blank lines
i += 1 while i < len(lines) and not lines[i].strip():
i += 1
# Check if next non-blank line is a bullet item
if i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
item_text = lines[i][2:].strip()
item_text = convert_latex_formulas(item_text)
bullet_items.append(md_to_html_para(item_text))
i += 1
else:
break
# Output with proper First/Middle/Last classes
for idx, item in enumerate(bullet_items):
if len(bullet_items) == 1:
cls = 'MsoListParagraph'
elif idx == 0:
cls = 'MsoListParagraphCxSpFirst'
elif idx == len(bullet_items) - 1:
cls = 'MsoListParagraphCxSpLast'
else:
cls = 'MsoListParagraphCxSpMiddle'
html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span><span lang=ES>{item}</span></p>')
continue continue
# Numbered list # Numbered list (handle blank lines between items)
if re.match(r'^\d+\.\s', line): if re.match(r'^\d+\.\s', line):
num = 1 # Collect all numbered items first
while i < len(lines) and re.match(r'^\d+\.\s', lines[i]): numbered_items = []
item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip() while i < len(lines):
item_text = convert_latex_formulas(item_text) # Skip blank lines
html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt">&nbsp;&nbsp;&nbsp;</span>{md_to_html_para(item_text)}</span></p>') while i < len(lines) and not lines[i].strip():
num += 1 i += 1
i += 1 # Check if next non-blank line is a numbered item
if i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
item_text = convert_latex_formulas(item_text)
numbered_items.append(md_to_html_para(item_text))
i += 1
else:
break
# Output with proper First/Middle/Last classes
for idx, item in enumerate(numbered_items):
num = idx + 1
if len(numbered_items) == 1:
cls = 'MsoListParagraph'
elif idx == 0:
cls = 'MsoListParagraphCxSpFirst'
elif idx == len(numbered_items) - 1:
cls = 'MsoListParagraphCxSpLast'
else:
cls = 'MsoListParagraphCxSpMiddle'
html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt">&nbsp;&nbsp;&nbsp;</span>{item}</span></p>')
continue continue
# Skip lines that are just table/figure titles (they'll be handled with the table/figure) # Skip lines that are just table/figure titles (they'll be handled with the table/figure)
@@ -403,9 +492,12 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'): if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
i += 1 i += 1
continue continue
if is_source_line(line): if is_source_line(line):
i += 1 i += 1
continue continue
if is_leyenda_line(line):
i += 1
continue
# Regular paragraph # Regular paragraph
para_lines = [line] para_lines = [line]
@@ -523,6 +615,17 @@ def main():
print("Reading template...") print("Reading template...")
html_content = read_file(TEMPLATE_INPUT) html_content = read_file(TEMPLATE_INPUT)
# Modify the Table of Tables TOC field to include TC entries with \f t identifier
# Original: TOC \h \z \t "Tablas;1" \c "Tabla"
# Modified: TOC \f t \h \z \t "Tablas;1" \c "Tabla"
# Use regex to handle whitespace/HTML variations in the TOC field
html_content = re.sub(
r'(TOC\s+)(\\h\s+\\z\s+\\t\s*\n?\s*&quot;Tablas;1&quot;)',
r'\1\\f t \2',
html_content
)
soup = BeautifulSoup(html_content, 'html.parser') soup = BeautifulSoup(html_content, 'html.parser')
print("Reading docs content...") print("Reading docs content...")
@@ -671,10 +774,10 @@ def main():
# Also remove surrounding caption and source # Also remove surrounding caption and source
prev_sib = table.find_previous_sibling() prev_sib = table.find_previous_sibling()
next_sib = table.find_next_sibling() next_sib = table.find_next_sibling()
if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text(): if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
prev_sib.decompose() prev_sib.decompose()
if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()): if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
next_sib.decompose() next_sib.decompose()
table.decompose() table.decompose()
print(" ✓ Removed template table example") print(" ✓ Removed template table example")
break break

View File

@@ -6,7 +6,7 @@ Se realizó un estudio comparativo de tres soluciones OCR de código abierto: Ea
Los resultados demuestran que la optimización de hiperparámetros logró mejoras significativas: el mejor trial individual alcanzó un CER de 0.79% (precisión del 99.21%), cumpliendo el objetivo de CER < 2%. Al validar la configuración optimizada sobre el dataset completo de 45 páginas, se obtuvo una mejora del 12.8% en CER (de 8.85% a 7.72%). El hallazgo más relevante fue que el parámetro `textline_orientation` (clasificación de orientación de línea de texto) tiene un impacto crítico en el rendimiento. Adicionalmente, se identificó que el umbral de detección (`text_det_thresh`) presenta una correlación positiva moderada (0.43) con el error, lo que indica que valores más bajos tienden a mejorar el rendimiento. Los resultados demuestran que la optimización de hiperparámetros logró mejoras significativas: el mejor trial individual alcanzó un CER de 0.79% (precisión del 99.21%), cumpliendo el objetivo de CER < 2%. Al validar la configuración optimizada sobre el dataset completo de 45 páginas, se obtuvo una mejora del 12.8% en CER (de 8.85% a 7.72%). El hallazgo más relevante fue que el parámetro `textline_orientation` (clasificación de orientación de línea de texto) tiene un impacto crítico en el rendimiento. Adicionalmente, se identificó que el umbral de detección (`text_det_thresh`) presenta una correlación positiva moderada (0.43) con el error, lo que indica que valores más bajos tienden a mejorar el rendimiento.
**Fuente:** [`docs/metrics/metrics_paddle.md`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/docs/metrics/metrics_paddle.md), [`src/results/correlations/paddle_correlations.csv`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/src/results/correlations/paddle_correlations.csv). **Fuente:** [`metrics_paddle.md`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/docs/metrics/metrics_paddle.md), [`paddle_correlations.csv`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/src/results/correlations/paddle_correlations.csv).
Este trabajo demuestra que la optimización de hiperparámetros es una alternativa viable al fine-tuning, especialmente útil cuando se dispone de modelos preentrenados para el idioma objetivo. La infraestructura dockerizada desarrollada permite reproducir los experimentos y facilita la evaluación sistemática de configuraciones OCR. Este trabajo demuestra que la optimización de hiperparámetros es una alternativa viable al fine-tuning, especialmente útil cuando se dispone de modelos preentrenados para el idioma objetivo. La infraestructura dockerizada desarrollada permite reproducir los experimentos y facilita la evaluación sistemática de configuraciones OCR.
@@ -22,7 +22,7 @@ A comparative study of three open-source OCR solutions was conducted with EasyOC
Results demonstrate that hyperparameter optimization achieved significant improvements. The best individual trial reached a CER of 0.79% (99.21% accuracy), meeting the CER < 2% objective. When validating the optimized configuration on the full 45-page dataset, a 12.8% CER improvement was obtained (from 8.85% to 7.72%). The most relevant finding was that the `textline_orientation` parameter (text line orientation classification) has a critical impact on performance. Additionally, the detection threshold (`text_det_thresh`) showed a moderate positive correlation (0.43) with error, indicating that lower values tend to improve performance. Results demonstrate that hyperparameter optimization achieved significant improvements. The best individual trial reached a CER of 0.79% (99.21% accuracy), meeting the CER < 2% objective. When validating the optimized configuration on the full 45-page dataset, a 12.8% CER improvement was obtained (from 8.85% to 7.72%). The most relevant finding was that the `textline_orientation` parameter (text line orientation classification) has a critical impact on performance. Additionally, the detection threshold (`text_det_thresh`) showed a moderate positive correlation (0.43) with error, indicating that lower values tend to improve performance.
Sources: [`docs/metrics/metrics_paddle.md`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/docs/metrics/metrics_paddle.md), [`src/results/correlations/paddle_correlations.csv`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/src/results/correlations/paddle_correlations.csv). Sources: [`metrics_paddle.md`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/docs/metrics/metrics_paddle.md), [`paddle_correlations.csv`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/src/results/correlations/paddle_correlations.csv).
This work demonstrates that hyperparameter optimization is a viable alternative to fine-tuning, especially useful when pre-trained models for the target language are available. The dockerized infrastructure developed enables experiment reproducibility and facilitates systematic evaluation of OCR configurations. This work demonstrates that hyperparameter optimization is a viable alternative to fine-tuning, especially useful when pre-trained models for the target language are available. The dockerized infrastructure developed enables experiment reproducibility and facilitates systematic evaluation of OCR configurations.

157
docs/compliance.md Normal file
View File

@@ -0,0 +1,157 @@
# UNIR Style Compliance Checklist
This document lists the UNIR TFE style requirements to verify before final submission.
## Page Layout
| Requirement | Specification | Check |
|-------------|---------------|-------|
| Page size | A4 | ☐ |
| Left margin | 3.0 cm | ☐ |
| Right margin | 2.0 cm | ☐ |
| Top margin | 2.5 cm | ☐ |
| Bottom margin | 2.5 cm | ☐ |
| Header | Student name + TFE title | ☐ |
| Footer | Page number | ☐ |
## Typography
| Element | Specification | Check |
|---------|---------------|-------|
| Body text | Calibri 12pt, justified, 1.5 line spacing | ☐ |
| Título 1 (H1) | Calibri Light 18pt, blue, numbered (1., 2., ...) | ☐ |
| Título 2 (H2) | Calibri Light 14pt, blue, numbered (1.1, 1.2, ...) | ☐ |
| Título 3 (H3) | Calibri Light 12pt, numbered (1.1.1, 1.1.2, ...) | ☐ |
| Título 4 (H4) | Calibri 12pt, bold, unnumbered | ☐ |
| Footnotes | Calibri 10pt, justified, single spacing | ☐ |
| Code blocks | Consolas 10pt | ☐ |
## Document Structure
| Section | Requirements | Check |
|---------|--------------|-------|
| Portada | Title, Author, Type, Director, Date | ☐ |
| Resumen | 150-300 words in Spanish + Palabras clave (3-5) | ☐ |
| Abstract | 150-300 words in English + Keywords (3-5) | ☐ |
| Índice de contenidos | Auto-generated, new page | ☐ |
| Índice de figuras | Auto-generated, new page | ☐ |
| Índice de tablas | Auto-generated, new page | ☐ |
| Cap. 1 Introducción | 1.1 Motivación, 1.2 Planteamiento, 1.3 Estructura | ☐ |
| Cap. 2 Contexto | 2.1 Contexto, 2.2 Estado del arte, 2.3 Conclusiones | ☐ |
| Cap. 3 Objetivos | 3.1 Objetivo general, 3.2 Específicos, 3.3 Metodología | ☐ |
| Cap. 4 Desarrollo | Structure depends on work type | ☐ |
| Cap. 5 Conclusiones | 5.1 Conclusiones, 5.2 Trabajo futuro | ☐ |
| Referencias | APA format, alphabetical order | ☐ |
| Anexos | Code repository URL, supplementary data | ☐ |
## Tables
| Requirement | Specification | Check |
|-------------|---------------|-------|
| Title position | Above the table | ☐ |
| Title format | **Tabla N.** *Descriptive title in italics.* | ☐ |
| Numbering | Sequential (1, 2, 3...), Anexo uses A1, A2... | ☐ |
| Border style | APA: horizontal lines only (top, header bottom, table bottom) | ☐ |
| Source position | Below the table, centered | ☐ |
| Source format | Fuente: Author, Year. or Fuente: Elaboración propia. | ☐ |
| Leyenda (if needed) | Below Fuente, same style (Piedefoto-tabla) | ☐ |
| In TOT index | All tables appear in Índice de tablas | ☐ |
## Figures
| Requirement | Specification | Check |
|-------------|---------------|-------|
| Title position | Above the figure | ☐ |
| Title format | **Figura N.** *Descriptive title in italics.* | ☐ |
| Numbering | Sequential (1, 2, 3...), Anexo uses A1, A2... | ☐ |
| Alignment | Centered | ☐ |
| Source position | Below the figure, centered | ☐ |
| Source format | Fuente: Author, Year. or Fuente: Elaboración propia. | ☐ |
| Leyenda (if needed) | Below Fuente, same style (Piedefoto-tabla) | ☐ |
| In TOF index | All figures appear in Índice de figuras | ☐ |
## Lists
| Requirement | Specification | Check |
|-------------|---------------|-------|
| Bullet lists | Indented 36pt, bullet symbol (·) | ☐ |
| Numbered lists | Indented 36pt, sequential numbers (1, 2, 3...) | ☐ |
| Spacing | Word list-paragraph classes applied for correct spacing (MsoListParagraphCxSpFirst/Middle/Last; plain MsoListParagraph for single-item lists) | ☐ |
## Citations and References
| Requirement | Specification | Check |
|-------------|---------------|-------|
| Citation format | APA 7th edition | ☐ |
| Single author | (Author, Year) or Author (Year) | ☐ |
| Two authors | (Author1 & Author2, Year) | ☐ |
| Three+ authors | (Author1 et al., Year) | ☐ |
| Reference list | Alphabetical by first author surname | ☐ |
| Hanging indent | 36pt left margin, -36pt text indent | ☐ |
| DOI/URL | Include when available | ☐ |
| No Wikipedia | Wikipedia citations not allowed | ☐ |
| Source variety | Books, journals, conferences (not just URLs) | ☐ |
## SMART Objectives
All objectives must be SMART:
| Criterion | Requirement | Check |
|-----------|-------------|-------|
| **S**pecific | Clearly defined, unambiguous | ☐ |
| **M**easurable | Quantifiable success metric (e.g., CER < 2%) | ☐ |
| **A**ttainable | Feasible with available resources | ☐ |
| **R**elevant | Demonstrable impact | ☐ |
| **T**ime-bound | Achievable within timeframe | ☐ |
## Writing Style
| Requirement | Check |
|-------------|-------|
| Each chapter starts with introductory paragraph | ☐ |
| Each paragraph has at least 3 sentences | ☐ |
| No two consecutive headings without text between them | ☐ |
| No superfluous phrases or repetition | ☐ |
| All concepts defined with pertinent citations | ☐ |
| Spelling checked (Word spell checker) | ☐ |
| Logical flow between paragraphs | ☐ |
## Final Checks
| Requirement | Check |
|-------------|-------|
| All cited references appear in reference list | ☐ |
| All references in list are cited in text | ☐ |
| All figures/tables have numbers and titles | ☐ |
| Update all indices (Ctrl+A, F9 in Word) | ☐ |
| Page count: 50-90 pages (excl. cover, indices, annexes) | ☐ |
| Final format: PDF for deposit | ☐ |
## Automated Checks (apply_content.py)
The following are automatically handled by the generation scripts:
- ✓ Table/Figure sequential numbering
- ✓ Anexo items use A1, A2... prefix
- ✓ TC fields for Anexo items (appear in indices)
- ✓ Piedefoto-tabla style for Fuente/Leyenda
- ✓ MsoCaption style for titles
- ✓ APA table borders (horizontal only)
- ✓ MsoBibliography style for references
- ✓ MsoQuote style for blockquotes
- ✓ List paragraph classes (First/Middle/Last)
- ✓ Bold H4 headings (unnumbered)
## Color Palette (UNIR Theme)
| Color | Hex | Usage |
|-------|-----|-------|
| Primary Blue | `#0098CD` | Headings, diagram borders |
| Light Blue BG | `#E6F4F9` | Diagram backgrounds |
| Dark Gray | `#404040` | Body text |
| Accent Blue | `#5B9BD5` | Table headers |
| Light Accent | `#9CC2E5` | Table borders |
---
**Reference:** UNIR TFE Guidelines (`instructions/instrucciones.pdf`, `instructions/plantilla_individual.pdf`)

File diff suppressed because one or more lines are too long