Leyenda
Some checks failed
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 4m0s
build_docker / build_paddle_ocr_gpu (push) Successful in 18m53s
build_docker / build_easyocr (push) Successful in 16m12s
build_docker / build_easyocr_gpu (push) Successful in 22m37s
build_docker / build_doctr (push) Successful in 21m22s
build_docker / build_raytune (push) Successful in 2m50s
build_docker / build_doctr_gpu (push) Has been cancelled
Some checks failed
build_docker / essential (push) Successful in 1s
build_docker / build_paddle_ocr (push) Successful in 4m0s
build_docker / build_paddle_ocr_gpu (push) Successful in 18m53s
build_docker / build_easyocr (push) Successful in 16m12s
build_docker / build_easyocr_gpu (push) Successful in 22m37s
build_docker / build_doctr (push) Successful in 21m22s
build_docker / build_raytune (push) Successful in 2m50s
build_docker / build_doctr_gpu (push) Has been cancelled
This commit is contained in:
249
apply_content.py
249
apply_content.py
@@ -4,23 +4,25 @@
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
from bs4 import BeautifulSoup, NavigableString
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
from latex2mathml.converter import convert as latex_to_mathml
|
from latex2mathml.converter import convert as latex_to_mathml
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
|
TEMPLATE_INPUT = os.path.join(BASE_DIR, 'instructions/plantilla_individual.htm')
|
||||||
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
|
TEMPLATE_OUTPUT = os.path.join(BASE_DIR, 'thesis_output/plantilla_individual.htm')
|
||||||
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
|
DOCS_DIR = os.path.join(BASE_DIR, 'docs')
|
||||||
|
|
||||||
# Accept Fuente/Source lines with or without markdown bold
|
# Accept Fuente/Source lines with or without markdown bold
|
||||||
SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
|
SOURCE_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?(Fuente|Source):(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
|
||||||
|
# Accept Leyenda lines with or without markdown bold
|
||||||
# Global counters for tables and figures
|
LEYENDA_LINE_RE = re.compile(r'^\s*(?:\*{1,2})?Leyenda:(?:\*{1,2})?\s*(.*)$', re.IGNORECASE)
|
||||||
table_counter = 0
|
|
||||||
figure_counter = 0
|
# Global counters for tables and figures
|
||||||
anexo_table_counter = 0
|
table_counter = 0
|
||||||
anexo_figure_counter = 0
|
figure_counter = 0
|
||||||
|
anexo_table_counter = 0
|
||||||
|
anexo_figure_counter = 0
|
||||||
# Global sequential counter for figure filenames (figura_1.png, figura_2.png, etc.)
|
# Global sequential counter for figure filenames (figura_1.png, figura_2.png, etc.)
|
||||||
global_figure_index = 0
|
global_figure_index = 0
|
||||||
|
|
||||||
@@ -48,7 +50,7 @@ def md_to_html_para(text):
|
|||||||
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
|
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def convert_latex_formulas(text):
|
def convert_latex_formulas(text):
|
||||||
"""Convert LaTeX formulas to MathML for Word compatibility."""
|
"""Convert LaTeX formulas to MathML for Word compatibility."""
|
||||||
# Block formulas $$...$$
|
# Block formulas $$...$$
|
||||||
def convert_block(match):
|
def convert_block(match):
|
||||||
@@ -69,22 +71,33 @@ def convert_latex_formulas(text):
|
|||||||
except:
|
except:
|
||||||
return match.group(0)
|
return match.group(0)
|
||||||
|
|
||||||
text = re.sub(r'\$([^$]+)\$', convert_inline, text)
|
text = re.sub(r'\$([^$]+)\$', convert_inline, text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def extract_source_from_line(line):
|
def extract_source_from_line(line):
|
||||||
"""Return source text if line is a Fuente/Source line, otherwise None."""
|
"""Return source text if line is a Fuente/Source line, otherwise None."""
|
||||||
match = SOURCE_LINE_RE.match(line.strip())
|
match = SOURCE_LINE_RE.match(line.strip())
|
||||||
if not match:
|
if not match:
|
||||||
return None
|
return None
|
||||||
return match.group(2).strip()
|
return match.group(2).strip()
|
||||||
|
|
||||||
def is_source_line(line):
|
def is_source_line(line):
|
||||||
"""Check whether a line starts with Fuente:/Source: (optionally bold)."""
|
"""Check whether a line starts with Fuente:/Source: (optionally bold)."""
|
||||||
return SOURCE_LINE_RE.match(line.strip()) is not None
|
return SOURCE_LINE_RE.match(line.strip()) is not None
|
||||||
|
|
||||||
def extract_table_title(lines, current_index):
|
def extract_leyenda_from_line(line):
|
||||||
"""Look for table title in preceding lines (e.g., **Tabla 1.** *Title*)."""
|
"""Return leyenda text if line is a Leyenda line, otherwise None."""
|
||||||
|
match = LEYENDA_LINE_RE.match(line.strip())
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
return match.group(1).strip()
|
||||||
|
|
||||||
|
def is_leyenda_line(line):
|
||||||
|
"""Check whether a line starts with Leyenda: (optionally bold)."""
|
||||||
|
return LEYENDA_LINE_RE.match(line.strip()) is not None
|
||||||
|
|
||||||
|
def extract_table_title(lines, current_index):
|
||||||
|
"""Look for table title in preceding lines (e.g., **Tabla 1.** *Title*)."""
|
||||||
# Check previous non-empty lines for table title
|
# Check previous non-empty lines for table title
|
||||||
for i in range(current_index - 1, max(0, current_index - 5), -1):
|
for i in range(current_index - 1, max(0, current_index - 5), -1):
|
||||||
line = lines[i].strip()
|
line = lines[i].strip()
|
||||||
@@ -172,8 +185,11 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
bookmark_id = f"_Ref_Fig{fig_num}"
|
bookmark_id = f"_Ref_Fig{fig_num}"
|
||||||
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
|
# mso-pagination:keep-with-next ensures caption stays with figure image (correct MSO property)
|
||||||
# For Anexo figures, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
|
# For Anexo figures, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
|
||||||
|
# Add TC field so Anexo figures appear in Table of Figures index
|
||||||
|
# Use \f c to match the TOC field identifier in the template
|
||||||
if is_anexo:
|
if is_anexo:
|
||||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Figura {fig_num}. {fig_title}" \\f c \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
|
||||||
|
html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {fig_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||||
else:
|
else:
|
||||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="text-align:center;mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{fig_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{fig_title}</span></i></p>''')
|
||||||
|
|
||||||
@@ -204,19 +220,27 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
|
|
||||||
# Check if next non-empty line has custom Fuente
|
# Check if next non-empty line has custom Fuente
|
||||||
custom_source = None
|
custom_source = None
|
||||||
|
fig_leyenda = None
|
||||||
lookahead = i + 1
|
lookahead = i + 1
|
||||||
while lookahead < len(lines) and not lines[lookahead].strip():
|
while lookahead < len(lines) and not lines[lookahead].strip():
|
||||||
lookahead += 1
|
lookahead += 1
|
||||||
if lookahead < len(lines):
|
if lookahead < len(lines):
|
||||||
next_line = lines[lookahead].strip()
|
next_line = lines[lookahead].strip()
|
||||||
if is_source_line(next_line):
|
if is_source_line(next_line):
|
||||||
# Extract custom source, removing markdown formatting
|
# Extract custom source, removing markdown formatting
|
||||||
custom_source = extract_source_from_line(next_line)
|
custom_source = extract_source_from_line(next_line)
|
||||||
# Ensure it ends with a period
|
# Ensure it ends with a period
|
||||||
if custom_source and not custom_source.endswith('.'):
|
if custom_source and not custom_source.endswith('.'):
|
||||||
custom_source += '.'
|
custom_source += '.'
|
||||||
# Skip this line by advancing i past it
|
# Skip this line by advancing i past it
|
||||||
i = lookahead
|
i = lookahead
|
||||||
|
# Check for Leyenda after source
|
||||||
|
leyenda_idx = i + 1
|
||||||
|
while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
|
||||||
|
leyenda_idx += 1
|
||||||
|
if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
|
||||||
|
fig_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
|
||||||
|
i = leyenda_idx
|
||||||
|
|
||||||
if custom_source:
|
if custom_source:
|
||||||
source_html = md_to_html_para(custom_source)
|
source_html = md_to_html_para(custom_source)
|
||||||
@@ -224,6 +248,13 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
else:
|
else:
|
||||||
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')
|
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Fuente: Elaboración propia.</span></p>''')
|
||||||
|
|
||||||
|
# Add leyenda if present (same style as Fuente, new line)
|
||||||
|
if fig_leyenda:
|
||||||
|
leyenda_html = md_to_html_para(fig_leyenda)
|
||||||
|
if not fig_leyenda.endswith('.'):
|
||||||
|
leyenda_html += '.'
|
||||||
|
html_blocks.append(f'''<p class=Piedefoto-tabla style="margin-left:0cm;text-align:center"><span lang=ES>Leyenda: {leyenda_html}</span></p>''')
|
||||||
|
|
||||||
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>')
|
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>')
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
@@ -249,7 +280,7 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
if line.startswith('####'):
|
if line.startswith('####'):
|
||||||
text = line.lstrip('#').strip()
|
text = line.lstrip('#').strip()
|
||||||
# Apply consistent styling like h2/h3, disable numbering for h4
|
# Apply consistent styling like h2/h3, disable numbering for h4
|
||||||
html_blocks.append(f'<h4 style="mso-list:none"><span lang=ES style="text-transform:none">{text}</span></h4>')
|
html_blocks.append(f'<h4 style="mso-list:none"><b><span lang=ES style="text-transform:none">{text}</span></b></h4>')
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
elif line.startswith('###'):
|
elif line.startswith('###'):
|
||||||
@@ -314,11 +345,19 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
|
|
||||||
# Look ahead for source (skip blank lines first)
|
# Look ahead for source (skip blank lines first)
|
||||||
source_idx = i
|
source_idx = i
|
||||||
while source_idx < len(lines) and not lines[source_idx].strip():
|
table_leyenda = None
|
||||||
source_idx += 1
|
while source_idx < len(lines) and not lines[source_idx].strip():
|
||||||
if source_idx < len(lines) and is_source_line(lines[source_idx]):
|
source_idx += 1
|
||||||
table_source = extract_source_from_line(lines[source_idx])
|
if source_idx < len(lines) and is_source_line(lines[source_idx]):
|
||||||
i = source_idx + 1
|
table_source = extract_source_from_line(lines[source_idx])
|
||||||
|
i = source_idx + 1
|
||||||
|
# Check for Leyenda after source (skip blank lines)
|
||||||
|
leyenda_idx = i
|
||||||
|
while leyenda_idx < len(lines) and not lines[leyenda_idx].strip():
|
||||||
|
leyenda_idx += 1
|
||||||
|
if leyenda_idx < len(lines) and is_leyenda_line(lines[leyenda_idx]):
|
||||||
|
table_leyenda = extract_leyenda_from_line(lines[leyenda_idx])
|
||||||
|
i = leyenda_idx + 1
|
||||||
|
|
||||||
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference
|
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference
|
||||||
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
|
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
|
||||||
@@ -334,8 +373,11 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
clean_title = "Tabla de datos."
|
clean_title = "Tabla de datos."
|
||||||
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
|
# mso-pagination:keep-with-next ensures caption stays with table (correct MSO property)
|
||||||
# For Anexo tables, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
|
# For Anexo tables, use static text (no SEQ field) to prevent Word from overwriting A1, A2...
|
||||||
|
# Add TC field so Anexo tables appear in Table of Tables index
|
||||||
|
# Use \f t identifier - template TOC field will be modified to include this
|
||||||
if is_anexo:
|
if is_anexo:
|
||||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
tc_field = f'''<!--[if supportFields]><span style='mso-element:field-begin'></span> TC "Tabla {table_num}. {clean_title}" \\f t \\l 1 <span style='mso-element:field-end'></span><![endif]-->'''
|
||||||
|
html_blocks.append(f'''<a name="{bookmark_id}"></a>{tc_field}<p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_num}.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||||
else:
|
else:
|
||||||
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
html_blocks.append(f'''<a name="{bookmark_id}"></a><p class=MsoCaption style="mso-pagination:keep-with-next"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla <!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]-->{table_num}<!--[if supportFields]><span style='mso-element:field-end'></span><![endif]-->.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%">{clean_title}</span></i></p>''')
|
||||||
|
|
||||||
@@ -363,6 +405,14 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
if not table_source.endswith('.'):
|
if not table_source.endswith('.'):
|
||||||
source_html += '.'
|
source_html += '.'
|
||||||
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')
|
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Fuente: {source_html}</span></p>')
|
||||||
|
|
||||||
|
# Add leyenda if present (same style as Fuente, new line)
|
||||||
|
if table_leyenda:
|
||||||
|
leyenda_html = md_to_html_para(table_leyenda)
|
||||||
|
if not table_leyenda.endswith('.'):
|
||||||
|
leyenda_html += '.'
|
||||||
|
html_blocks.append(f'<p class=Piedefoto-tabla style="margin-left:0cm"><span lang=ES>Leyenda: {leyenda_html}</span></p>')
|
||||||
|
|
||||||
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>')
|
html_blocks.append('<p class=MsoNormal><span lang=ES><o:p> </o:p></span></p>')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -376,24 +426,63 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
html_blocks.append(f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>')
|
html_blocks.append(f'<p class=MsoQuote><i><span lang=ES>{md_to_html_para(quote_text)}</span></i></p>')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Bullet list
|
# Bullet list (handle blank lines between items)
|
||||||
if re.match(r'^[\-\*\+]\s', line):
|
if re.match(r'^[\-\*\+]\s', line):
|
||||||
while i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
|
# Collect all bullet items first
|
||||||
item_text = lines[i][2:].strip()
|
bullet_items = []
|
||||||
item_text = convert_latex_formulas(item_text)
|
while i < len(lines):
|
||||||
html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt"> </span><span lang=ES>{md_to_html_para(item_text)}</span></p>')
|
# Skip blank lines
|
||||||
i += 1
|
while i < len(lines) and not lines[i].strip():
|
||||||
|
i += 1
|
||||||
|
# Check if next non-blank line is a bullet item
|
||||||
|
if i < len(lines) and re.match(r'^[\-\*\+]\s', lines[i]):
|
||||||
|
item_text = lines[i][2:].strip()
|
||||||
|
item_text = convert_latex_formulas(item_text)
|
||||||
|
bullet_items.append(md_to_html_para(item_text))
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
# Output with proper First/Middle/Last classes
|
||||||
|
for idx, item in enumerate(bullet_items):
|
||||||
|
if len(bullet_items) == 1:
|
||||||
|
cls = 'MsoListParagraph'
|
||||||
|
elif idx == 0:
|
||||||
|
cls = 'MsoListParagraphCxSpFirst'
|
||||||
|
elif idx == len(bullet_items) - 1:
|
||||||
|
cls = 'MsoListParagraphCxSpLast'
|
||||||
|
else:
|
||||||
|
cls = 'MsoListParagraphCxSpMiddle'
|
||||||
|
html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES style="font-family:Symbol">·</span><span lang=ES style="font-size:7pt"> </span><span lang=ES>{item}</span></p>')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Numbered list
|
# Numbered list (handle blank lines between items)
|
||||||
if re.match(r'^\d+\.\s', line):
|
if re.match(r'^\d+\.\s', line):
|
||||||
num = 1
|
# Collect all numbered items first
|
||||||
while i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
|
numbered_items = []
|
||||||
item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
|
while i < len(lines):
|
||||||
item_text = convert_latex_formulas(item_text)
|
# Skip blank lines
|
||||||
html_blocks.append(f'<p class=MsoListParagraphCxSpMiddle style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt"> </span>{md_to_html_para(item_text)}</span></p>')
|
while i < len(lines) and not lines[i].strip():
|
||||||
num += 1
|
i += 1
|
||||||
i += 1
|
# Check if next non-blank line is a numbered item
|
||||||
|
if i < len(lines) and re.match(r'^\d+\.\s', lines[i]):
|
||||||
|
item_text = re.sub(r'^\d+\.\s*', '', lines[i]).strip()
|
||||||
|
item_text = convert_latex_formulas(item_text)
|
||||||
|
numbered_items.append(md_to_html_para(item_text))
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
# Output with proper First/Middle/Last classes
|
||||||
|
for idx, item in enumerate(numbered_items):
|
||||||
|
num = idx + 1
|
||||||
|
if len(numbered_items) == 1:
|
||||||
|
cls = 'MsoListParagraph'
|
||||||
|
elif idx == 0:
|
||||||
|
cls = 'MsoListParagraphCxSpFirst'
|
||||||
|
elif idx == len(numbered_items) - 1:
|
||||||
|
cls = 'MsoListParagraphCxSpLast'
|
||||||
|
else:
|
||||||
|
cls = 'MsoListParagraphCxSpMiddle'
|
||||||
|
html_blocks.append(f'<p class={cls} style="margin-left:36pt;text-indent:-18pt"><span lang=ES>{num}.<span style="font-size:7pt"> </span>{item}</span></p>')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip lines that are just table/figure titles (they'll be handled with the table/figure)
|
# Skip lines that are just table/figure titles (they'll be handled with the table/figure)
|
||||||
@@ -403,9 +492,12 @@ def parse_md_to_html_blocks(md_content, is_anexo=False):
|
|||||||
if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
|
if line.strip().startswith('**Figura') or line.strip().startswith('*Figura'):
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
if is_source_line(line):
|
if is_source_line(line):
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
|
if is_leyenda_line(line):
|
||||||
|
i += 1
|
||||||
|
continue
|
||||||
|
|
||||||
# Regular paragraph
|
# Regular paragraph
|
||||||
para_lines = [line]
|
para_lines = [line]
|
||||||
@@ -523,6 +615,17 @@ def main():
|
|||||||
|
|
||||||
print("Reading template...")
|
print("Reading template...")
|
||||||
html_content = read_file(TEMPLATE_INPUT)
|
html_content = read_file(TEMPLATE_INPUT)
|
||||||
|
|
||||||
|
# Modify the Table of Tables TOC field to include TC entries with \f t identifier
|
||||||
|
# Original: TOC \h \z \t "Tablas;1" \c "Tabla"
|
||||||
|
# Modified: TOC \f t \h \z \t "Tablas;1" \c "Tabla"
|
||||||
|
# Use regex to handle whitespace/HTML variations in the TOC field
|
||||||
|
html_content = re.sub(
|
||||||
|
r'(TOC\s+)(\\h\s+\\z\s+\\t\s*\n?\s*"Tablas;1")',
|
||||||
|
r'\1\\f t \2',
|
||||||
|
html_content
|
||||||
|
)
|
||||||
|
|
||||||
soup = BeautifulSoup(html_content, 'html.parser')
|
soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
|
|
||||||
print("Reading docs content...")
|
print("Reading docs content...")
|
||||||
@@ -671,10 +774,10 @@ def main():
|
|||||||
# Also remove surrounding caption and source
|
# Also remove surrounding caption and source
|
||||||
prev_sib = table.find_previous_sibling()
|
prev_sib = table.find_previous_sibling()
|
||||||
next_sib = table.find_next_sibling()
|
next_sib = table.find_next_sibling()
|
||||||
if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
|
if prev_sib and 'Tabla 1. Ejemplo' in prev_sib.get_text():
|
||||||
prev_sib.decompose()
|
prev_sib.decompose()
|
||||||
if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
|
if next_sib and SOURCE_LINE_RE.search(next_sib.get_text().strip()):
|
||||||
next_sib.decompose()
|
next_sib.decompose()
|
||||||
table.decompose()
|
table.decompose()
|
||||||
print(" ✓ Removed template table example")
|
print(" ✓ Removed template table example")
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ Se realizó un estudio comparativo de tres soluciones OCR de código abierto: Ea
|
|||||||
|
|
||||||
Los resultados demuestran que la optimización de hiperparámetros logró mejoras significativas: el mejor trial individual alcanzó un CER de 0.79% (precisión del 99.21%), cumpliendo el objetivo de CER < 2%. Al validar la configuración optimizada sobre el dataset completo de 45 páginas, se obtuvo una mejora del 12.8% en CER (de 8.85% a 7.72%). El hallazgo más relevante fue que el parámetro `textline_orientation` (clasificación de orientación de línea de texto) tiene un impacto crítico en el rendimiento. Adicionalmente, se identificó que el umbral de detección (`text_det_thresh`) presenta una correlación positiva moderada (0.43) con el error, lo que indica que valores más bajos tienden a mejorar el rendimiento.
|
Los resultados demuestran que la optimización de hiperparámetros logró mejoras significativas: el mejor trial individual alcanzó un CER de 0.79% (precisión del 99.21%), cumpliendo el objetivo de CER < 2%. Al validar la configuración optimizada sobre el dataset completo de 45 páginas, se obtuvo una mejora del 12.8% en CER (de 8.85% a 7.72%). El hallazgo más relevante fue que el parámetro `textline_orientation` (clasificación de orientación de línea de texto) tiene un impacto crítico en el rendimiento. Adicionalmente, se identificó que el umbral de detección (`text_det_thresh`) presenta una correlación positiva moderada (0.43) con el error, lo que indica que valores más bajos tienden a mejorar el rendimiento.
|
||||||
|
|
||||||
**Fuente:** [`docs/metrics/metrics_paddle.md`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/docs/metrics/metrics_paddle.md), [`src/results/correlations/paddle_correlations.csv`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/src/results/correlations/paddle_correlations.csv).
|
**Fuente:** [`metrics_paddle.md`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/docs/metrics/metrics_paddle.md), [`paddle_correlations.csv`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/src/results/correlations/paddle_correlations.csv).
|
||||||
|
|
||||||
Este trabajo demuestra que la optimización de hiperparámetros es una alternativa viable al fine-tuning, especialmente útil cuando se dispone de modelos preentrenados para el idioma objetivo. La infraestructura dockerizada desarrollada permite reproducir los experimentos y facilita la evaluación sistemática de configuraciones OCR.
|
Este trabajo demuestra que la optimización de hiperparámetros es una alternativa viable al fine-tuning, especialmente útil cuando se dispone de modelos preentrenados para el idioma objetivo. La infraestructura dockerizada desarrollada permite reproducir los experimentos y facilita la evaluación sistemática de configuraciones OCR.
|
||||||
|
|
||||||
@@ -22,7 +22,7 @@ A comparative study of three open-source OCR solutions was conducted with EasyOC
|
|||||||
|
|
||||||
Results demonstrate that hyperparameter optimization achieved significant improvements. The best individual trial reached a CER of 0.79% (99.21% accuracy), meeting the CER < 2% objective. When validating the optimized configuration on the full 45-page dataset, a 12.8% CER improvement was obtained (from 8.85% to 7.72%). The most relevant finding was that the `textline_orientation` parameter (text line orientation classification) has a critical impact on performance. Additionally, the detection threshold (`text_det_thresh`) showed a moderate positive correlation (0.43) with error, indicating that lower values tend to improve performance.
|
Results demonstrate that hyperparameter optimization achieved significant improvements. The best individual trial reached a CER of 0.79% (99.21% accuracy), meeting the CER < 2% objective. When validating the optimized configuration on the full 45-page dataset, a 12.8% CER improvement was obtained (from 8.85% to 7.72%). The most relevant finding was that the `textline_orientation` parameter (text line orientation classification) has a critical impact on performance. Additionally, the detection threshold (`text_det_thresh`) showed a moderate positive correlation (0.43) with error, indicating that lower values tend to improve performance.
|
||||||
|
|
||||||
Sources: [`docs/metrics/metrics_paddle.md`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/docs/metrics/metrics_paddle.md), [`src/results/correlations/paddle_correlations.csv`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/src/results/correlations/paddle_correlations.csv).
|
Sources: [`metrics_paddle.md`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/docs/metrics/metrics_paddle.md), [`paddle_correlations.csv`](https://seryus.ddns.net/unir/MastersThesis/src/branch/main/src/results/correlations/paddle_correlations.csv).
|
||||||
|
|
||||||
This work demonstrates that hyperparameter optimization is a viable alternative to fine-tuning, especially useful when pre-trained models for the target language are available. The dockerized infrastructure developed enables experiment reproducibility and facilitates systematic evaluation of OCR configurations.
|
This work demonstrates that hyperparameter optimization is a viable alternative to fine-tuning, especially useful when pre-trained models for the target language are available. The dockerized infrastructure developed enables experiment reproducibility and facilitates systematic evaluation of OCR configurations.
|
||||||
|
|
||||||
|
|||||||
157
docs/compliance.md
Normal file
157
docs/compliance.md
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
# UNIR Style Compliance Checklist
|
||||||
|
|
||||||
|
This document lists the UNIR TFE style requirements to verify before final submission.
|
||||||
|
|
||||||
|
## Page Layout
|
||||||
|
|
||||||
|
| Requirement | Specification | Check |
|
||||||
|
|-------------|---------------|-------|
|
||||||
|
| Page size | A4 | ☐ |
|
||||||
|
| Left margin | 3.0 cm | ☐ |
|
||||||
|
| Right margin | 2.0 cm | ☐ |
|
||||||
|
| Top margin | 2.5 cm | ☐ |
|
||||||
|
| Bottom margin | 2.5 cm | ☐ |
|
||||||
|
| Header | Student name + TFE title | ☐ |
|
||||||
|
| Footer | Page number | ☐ |
|
||||||
|
|
||||||
|
## Typography
|
||||||
|
|
||||||
|
| Element | Specification | Check |
|
||||||
|
|---------|---------------|-------|
|
||||||
|
| Body text | Calibri 12pt, justified, 1.5 line spacing | ☐ |
|
||||||
|
| Título 1 (H1) | Calibri Light 18pt, blue, numbered (1., 2., ...) | ☐ |
|
||||||
|
| Título 2 (H2) | Calibri Light 14pt, blue, numbered (1.1, 1.2, ...) | ☐ |
|
||||||
|
| Título 3 (H3) | Calibri Light 12pt, numbered (1.1.1, 1.1.2, ...) | ☐ |
|
||||||
|
| Título 4 (H4) | Calibri 12pt, bold, unnumbered | ☐ |
|
||||||
|
| Footnotes | Calibri 10pt, justified, single spacing | ☐ |
|
||||||
|
| Code blocks | Consolas 10pt | ☐ |
|
||||||
|
|
||||||
|
## Document Structure
|
||||||
|
|
||||||
|
| Section | Requirements | Check |
|
||||||
|
|---------|--------------|-------|
|
||||||
|
| Portada | Title, Author, Type, Director, Date | ☐ |
|
||||||
|
| Resumen | 150-300 words in Spanish + Palabras clave (3-5) | ☐ |
|
||||||
|
| Abstract | 150-300 words in English + Keywords (3-5) | ☐ |
|
||||||
|
| Índice de contenidos | Auto-generated, new page | ☐ |
|
||||||
|
| Índice de figuras | Auto-generated, new page | ☐ |
|
||||||
|
| Índice de tablas | Auto-generated, new page | ☐ |
|
||||||
|
| Cap. 1 Introducción | 1.1 Motivación, 1.2 Planteamiento, 1.3 Estructura | ☐ |
|
||||||
|
| Cap. 2 Contexto | 2.1 Contexto, 2.2 Estado del arte, 2.3 Conclusiones | ☐ |
|
||||||
|
| Cap. 3 Objetivos | 3.1 Objetivo general, 3.2 Específicos, 3.3 Metodología | ☐ |
|
||||||
|
| Cap. 4 Desarrollo | Structure depends on work type | ☐ |
|
||||||
|
| Cap. 5 Conclusiones | 5.1 Conclusiones, 5.2 Trabajo futuro | ☐ |
|
||||||
|
| Referencias | APA format, alphabetical order | ☐ |
|
||||||
|
| Anexos | Code repository URL, supplementary data | ☐ |
|
||||||
|
|
||||||
|
## Tables
|
||||||
|
|
||||||
|
| Requirement | Specification | Check |
|
||||||
|
|-------------|---------------|-------|
|
||||||
|
| Title position | Above the table | ☐ |
|
||||||
|
| Title format | **Tabla N.** *Descriptive title in italics.* | ☐ |
|
||||||
|
| Numbering | Sequential (1, 2, 3...), Anexo uses A1, A2... | ☐ |
|
||||||
|
| Border style | APA: horizontal lines only (top, header bottom, table bottom) | ☐ |
|
||||||
|
| Source position | Below the table, centered | ☐ |
|
||||||
|
| Source format | `Fuente: Author, Year.` or `Fuente: Elaboración propia.` | ☐ |
|
||||||
|
| Leyenda (if needed) | Below Fuente, same style (Piedefoto-tabla) | ☐ |
|
||||||
|
| In TOT index | All tables appear in Índice de tablas | ☐ |
|
||||||
|
|
||||||
|
## Figures
|
||||||
|
|
||||||
|
| Requirement | Specification | Check |
|
||||||
|
|-------------|---------------|-------|
|
||||||
|
| Title position | Above the figure | ☐ |
|
||||||
|
| Title format | **Figura N.** *Descriptive title in italics.* | ☐ |
|
||||||
|
| Numbering | Sequential (1, 2, 3...), Anexo uses A1, A2... | ☐ |
|
||||||
|
| Alignment | Centered | ☐ |
|
||||||
|
| Source position | Below the figure, centered | ☐ |
|
||||||
|
| Source format | `Fuente: Author, Year.` or `Fuente: Elaboración propia.` | ☐ |
|
||||||
|
| Leyenda (if needed) | Below Fuente, same style (Piedefoto-tabla) | ☐ |
|
||||||
|
| In TOF index | All figures appear in Índice de figuras | ☐ |
|
||||||
|
|
||||||
|
## Lists
|
||||||
|
|
||||||
|
| Requirement | Specification | Check |
|
||||||
|
|-------------|---------------|-------|
|
||||||
|
| Bullet lists | Indented 36pt, bullet symbol (·) | ☐ |
|
||||||
|
| Numbered lists | Indented 36pt, sequential numbers (1, 2, 3...) | ☐ |
|
||||||
|
| Spacing | Proper First/Middle/Last paragraph spacing | ☐ |
|
||||||
|
|
||||||
|
## Citations and References
|
||||||
|
|
||||||
|
| Requirement | Specification | Check |
|
||||||
|
|-------------|---------------|-------|
|
||||||
|
| Citation format | APA 7th edition | ☐ |
|
||||||
|
| Single author | (Author, Year) or Author (Year) | ☐ |
|
||||||
|
| Two authors | (Author1 & Author2, Year) | ☐ |
|
||||||
|
| Three+ authors | (Author1 et al., Year) | ☐ |
|
||||||
|
| Reference list | Alphabetical by first author surname | ☐ |
|
||||||
|
| Hanging indent | 36pt left margin, -36pt text indent | ☐ |
|
||||||
|
| DOI/URL | Include when available | ☐ |
|
||||||
|
| No Wikipedia | Wikipedia citations not allowed | ☐ |
|
||||||
|
| Source variety | Books, journals, conferences (not just URLs) | ☐ |
|
||||||
|
|
||||||
|
## SMART Objectives
|
||||||
|
|
||||||
|
All objectives must be SMART:
|
||||||
|
|
||||||
|
| Criterion | Requirement | Check |
|
||||||
|
|-----------|-------------|-------|
|
||||||
|
| **S**pecific | Clearly defined, unambiguous | ☐ |
|
||||||
|
| **M**easurable | Quantifiable success metric (e.g., CER < 2%) | ☐ |
|
||||||
|
| **A**ttainable | Feasible with available resources | ☐ |
|
||||||
|
| **R**elevant | Demonstrable impact | ☐ |
|
||||||
|
| **T**ime-bound | Achievable within timeframe | ☐ |
|
||||||
|
|
||||||
|
## Writing Style
|
||||||
|
|
||||||
|
| Requirement | Check |
|
||||||
|
|-------------|-------|
|
||||||
|
| Each chapter starts with introductory paragraph | ☐ |
|
||||||
|
| Each paragraph has at least 3 sentences | ☐ |
|
||||||
|
| No two consecutive headings without text between them | ☐ |
|
||||||
|
| No superfluous phrases or repetition | ☐ |
|
||||||
|
| All concepts defined with pertinent citations | ☐ |
|
||||||
|
| Spelling checked (Word spell checker) | ☐ |
|
||||||
|
| Logical flow between paragraphs | ☐ |
|
||||||
|
|
||||||
|
## Final Checks
|
||||||
|
|
||||||
|
| Requirement | Check |
|
||||||
|
|-------------|-------|
|
||||||
|
| All cited references appear in reference list | ☐ |
|
||||||
|
| All references in list are cited in text | ☐ |
|
||||||
|
| All figures/tables have numbers and titles | ☐ |
|
||||||
|
| Update all indices (Ctrl+A, F9 in Word) | ☐ |
|
||||||
|
| Page count: 50-90 pages (excl. cover, indices, annexes) | ☐ |
|
||||||
|
| Final format: PDF for deposit | ☐ |
|
||||||
|
|
||||||
|
## Automated Checks (apply_content.py)
|
||||||
|
|
||||||
|
The following are automatically handled by the generation scripts:
|
||||||
|
|
||||||
|
- ✓ Table/Figure sequential numbering
|
||||||
|
- ✓ Anexo items use A1, A2... prefix
|
||||||
|
- ✓ TC fields for Anexo items (appear in indices)
|
||||||
|
- ✓ Piedefoto-tabla style for Fuente/Leyenda
|
||||||
|
- ✓ MsoCaption style for titles
|
||||||
|
- ✓ APA table borders (horizontal only)
|
||||||
|
- ✓ MsoBibliography style for references
|
||||||
|
- ✓ MsoQuote style for blockquotes
|
||||||
|
- ✓ List paragraph classes (First/Middle/Last)
|
||||||
|
- ✓ Bold H4 headings (unnumbered)
|
||||||
|
|
||||||
|
## Color Palette (UNIR Theme)
|
||||||
|
|
||||||
|
| Color | Hex | Usage |
|
||||||
|
|-------|-----|-------|
|
||||||
|
| Primary Blue | `#0098CD` | Headings, diagram borders |
|
||||||
|
| Light Blue BG | `#E6F4F9` | Diagram backgrounds |
|
||||||
|
| Dark Gray | `#404040` | Body text |
|
||||||
|
| Accent Blue | `#5B9BD5` | Table headers |
|
||||||
|
| Light Accent | `#9CC2E5` | Table borders |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Reference:** UNIR TFE Guidelines (`instructions/instrucciones.pdf`, `instructions/plantilla_individual.pdf`)
|
||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user