generation test

This commit is contained in:
2025-12-16 00:25:16 +01:00
parent 8a587e4620
commit 34a1cb4766
8 changed files with 54 additions and 54 deletions

View File

@@ -32,7 +32,7 @@ def md_to_html_para(text):
# Italic # Italic
text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text) text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
# Inline code # Inline code
text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt;background:#f5f5f5">\1</span>', text) text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt">\1</span>', text)
return text return text
def extract_table_title(lines, current_index): def extract_table_title(lines, current_index):
@@ -104,9 +104,10 @@ def parse_md_to_html_blocks(md_content):
fig_file = f'figures/figura_{figure_counter}.png' fig_file = f'figures/figura_{figure_counter}.png'
fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file) fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
# Create figure with MsoCaption class and bookmark for Word cross-reference # Create figure with MsoCaption class and proper Word SEQ field for cross-reference
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
bookmark_id = f"_TocFigura{figure_counter}" bookmark_id = f"_TocFigura{figure_counter}"
html_blocks.append(f'''<p class=MsoCaption style="text-align:center"><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {figure_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{fig_title}</span></i></p>''') html_blocks.append(f'''<p class=MsoCaption style="text-align:center"><a name="{bookmark_id}"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura </span></b></a><!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">{figure_counter}</span></b><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{fig_title}</span></i></p>''')
if os.path.exists(fig_path): if os.path.exists(fig_path):
# Use actual image with proper Word-compatible format (max 400px width, 500px height to fit page) # Use actual image with proper Word-compatible format (max 400px width, 500px height to fit page)
@@ -131,7 +132,7 @@ def parse_md_to_html_blocks(md_content):
code = '\n'.join(code_lines) code = '\n'.join(code_lines)
# Escape HTML entities in code # Escape HTML entities in code
code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
html_blocks.append(f'<p class=MsoNormal style="margin-left:1cm;background:#F5F5F5;padding:10px"><span style="font-family:Consolas;font-size:9pt"><pre>{code}</pre></span></p>') html_blocks.append(f'<p class=MsoNormal style="margin-left:1cm"><span style="font-family:Consolas;font-size:9pt"><pre>{code}</pre></span></p>')
i += 1 i += 1
continue continue
@@ -186,25 +187,30 @@ def parse_md_to_html_blocks(md_content):
table_source = lines[i].replace('*', '').replace('Fuente:', '').strip() table_source = lines[i].replace('*', '').replace('Fuente:', '').strip()
i += 1 i += 1
# Add table title with MsoCaption class and bookmark for Word cross-reference # Add table title with MsoCaption class and proper Word SEQ field for cross-reference
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
bookmark_id = f"_TocTabla{table_counter}" bookmark_id = f"_TocTabla{table_counter}"
if table_title: if table_title:
clean_title = table_title.replace(f"Tabla {table_counter}.", "").strip() clean_title = table_title.replace(f"Tabla {table_counter}.", "").strip()
html_blocks.append(f'<p class=MsoCaption><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{clean_title}</span></i></p>')
else: else:
html_blocks.append(f'<p class=MsoCaption><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">Tabla de datos.</span></i></p>') clean_title = "Tabla de datos."
html_blocks.append(f'''<p class=MsoCaption><a name="{bookmark_id}"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla </span></b></a><!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">{table_counter}</span></b><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{clean_title}</span></i></p>''')
# Build table HTML # Build table HTML with APA style (horizontal lines only, no vertical)
table_html = '<table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 style="border-collapse:collapse;border:none;mso-border-alt:solid windowtext .5pt">' table_html = '<table class=MsoTableGrid border=0 cellspacing=0 cellpadding=0 style="border-collapse:collapse;border:none">'
for j, tline in enumerate(table_lines): for j, tline in enumerate(table_lines):
cells = [c.strip() for c in tline.split('|')[1:-1]] cells = [c.strip() for c in tline.split('|')[1:-1]]
table_html += '<tr>' table_html += '<tr>'
for cell in cells: for cell in cells:
if j == 0: if j == 0:
# Header row # Header row: top and bottom border, bold text
table_html += f'<td style="border:solid windowtext 1.0pt;padding:5px;background:#F0F0F0"><p class=MsoNormal style="margin:0"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>' table_html += f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
elif j == len(table_lines) - 1:
# Last row: bottom border only
table_html += f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
else: else:
table_html += f'<td style="border:solid windowtext 1.0pt;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>' # Middle rows: no borders
table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
table_html += '</tr>' table_html += '</tr>'
table_html += '</table>' table_html += '</table>'
html_blocks.append(table_html) html_blocks.append(table_html)

View File

@@ -165,38 +165,16 @@ Los métodos de HPO incluyen:
La combinación Ray Tune + Optuna permite búsquedas eficientes en espacios de alta dimensionalidad. La combinación Ray Tune + Optuna permite búsquedas eficientes en espacios de alta dimensionalidad.
```mermaid ```mermaid
flowchart TD flowchart LR
subgraph "Ray Tune" A["Espacio de<br/>búsqueda"] --> B["Ray Tune<br/>Scheduler"]
A["Espacio de<br/>búsqueda"] B --> C["Trials<br/>paralelos"]
B["Scheduler<br/>(gestión de trials)"] C --> D["Evaluación<br/>OCR"]
C["Trial 1"] D --> E["Métricas<br/>CER/WER"]
D["Trial 2"] E --> F["Optuna<br/>TPE"]
E["Trial N"] F -->|"Nueva config"| B
end
subgraph "Optuna (TPE)"
F["Modelo probabilístico<br/>de la función objetivo"]
G["Sugiere nueva<br/>configuración"]
end
subgraph "Evaluación"
H["Ejecuta modelo OCR<br/>con config"]
I["Calcula métricas<br/>(CER, WER)"]
end
A --> B
B --> C & D & E
C & D & E --> H
H --> I
I -->|"Resultados"| F
F --> G
G -->|"Nueva config"| B
style A fill:#fff3e0
style I fill:#e8f5e9
``` ```
*Figura 2. Arquitectura de optimización de hiperparámetros con Ray Tune y Optuna.* *Figura 2. Ciclo de optimización de hiperparámetros con Ray Tune y Optuna.*
#### HPO en Sistemas OCR #### HPO en Sistemas OCR

View File

@@ -15,19 +15,29 @@ El repositorio incluye:
## A.2 Estructura del Repositorio ## A.2 Estructura del Repositorio
```mermaid
flowchart LR
root["MastersThesis/"] --> docs["docs/"]
root --> src["src/"]
root --> results["results/"]
root --> instructions["instructions/"]
root --> readme["README.md"]
src --> nb1["paddle_ocr_fine_tune_unir_raytune.ipynb"]
src --> py1["paddle_ocr_tuning.py"]
src --> py2["dataset_manager.py"]
src --> nb2["prepare_dataset.ipynb"]
src --> csv["raytune_results_*.csv"]
``` ```
MastersThesis/
├── docs/ # Capítulos de la tesis en Markdown *Figura 8. Estructura del repositorio del proyecto.*
├── src/
│ ├── paddle_ocr_fine_tune_unir_raytune.ipynb # Experimento principal **Descripción de componentes:**
│ ├── paddle_ocr_tuning.py # Script de evaluación CLI
│ ├── dataset_manager.py # Clase ImageTextDataset - **docs/**: Capítulos de la tesis en Markdown
│ ├── prepare_dataset.ipynb # Preparación del dataset - **src/**: Código fuente (notebooks y scripts)
│ └── raytune_paddle_subproc_results_*.csv # Resultados de 64 trials - **results/**: Resultados de benchmarks en CSV
├── results/ # Resultados de benchmarks - **instructions/**: Instrucciones y plantilla UNIR
├── instructions/ # Instrucciones y plantilla UNIR
└── README.md
```
## A.3 Requisitos de Software ## A.3 Requisitos de Software

View File

@@ -19,6 +19,7 @@ def extract_mermaid_diagrams():
'02_contexto_estado_arte.md', '02_contexto_estado_arte.md',
'03_objetivos_metodologia.md', '03_objetivos_metodologia.md',
'04_desarrollo_especifico.md', '04_desarrollo_especifico.md',
'07_anexo_a.md',
] ]
for md_file in md_files: for md_file in md_files:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 115 KiB

After

Width:  |  Height:  |  Size: 43 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 100 KiB

View File

@@ -33,5 +33,10 @@
"file": "figura_7.png", "file": "figura_7.png",
"title": "Comparación Baseline vs Optimizado (24 páginas)", "title": "Comparación Baseline vs Optimizado (24 páginas)",
"index": 7 "index": 7
},
{
"file": "figura_8.png",
"title": "Diagrama de 07_anexo_a.md",
"index": 8
} }
] ]