generation test
This commit is contained in:
@@ -32,7 +32,7 @@ def md_to_html_para(text):
|
|||||||
# Italic
|
# Italic
|
||||||
text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
|
text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
|
||||||
# Inline code
|
# Inline code
|
||||||
text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt;background:#f5f5f5">\1</span>', text)
|
text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt">\1</span>', text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def extract_table_title(lines, current_index):
|
def extract_table_title(lines, current_index):
|
||||||
@@ -104,9 +104,10 @@ def parse_md_to_html_blocks(md_content):
|
|||||||
fig_file = f'figures/figura_{figure_counter}.png'
|
fig_file = f'figures/figura_{figure_counter}.png'
|
||||||
fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
|
fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
|
||||||
|
|
||||||
# Create figure with MsoCaption class and bookmark for Word cross-reference
|
# Create figure with MsoCaption class and proper Word SEQ field for cross-reference
|
||||||
|
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
|
||||||
bookmark_id = f"_TocFigura{figure_counter}"
|
bookmark_id = f"_TocFigura{figure_counter}"
|
||||||
html_blocks.append(f'''<p class=MsoCaption style="text-align:center"><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {figure_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{fig_title}</span></i></p>''')
|
html_blocks.append(f'''<p class=MsoCaption style="text-align:center"><a name="{bookmark_id}"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura </span></b></a><!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">{figure_counter}</span></b><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{fig_title}</span></i></p>''')
|
||||||
|
|
||||||
if os.path.exists(fig_path):
|
if os.path.exists(fig_path):
|
||||||
# Use actual image with proper Word-compatible format (max 400px width, 500px height to fit page)
|
# Use actual image with proper Word-compatible format (max 400px width, 500px height to fit page)
|
||||||
@@ -131,7 +132,7 @@ def parse_md_to_html_blocks(md_content):
|
|||||||
code = '\n'.join(code_lines)
|
code = '\n'.join(code_lines)
|
||||||
# Escape HTML entities in code
|
# Escape HTML entities in code
|
||||||
code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
||||||
html_blocks.append(f'<p class=MsoNormal style="margin-left:1cm;background:#F5F5F5;padding:10px"><span style="font-family:Consolas;font-size:9pt"><pre>{code}</pre></span></p>')
|
html_blocks.append(f'<p class=MsoNormal style="margin-left:1cm"><span style="font-family:Consolas;font-size:9pt"><pre>{code}</pre></span></p>')
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -186,25 +187,30 @@ def parse_md_to_html_blocks(md_content):
|
|||||||
table_source = lines[i].replace('*', '').replace('Fuente:', '').strip()
|
table_source = lines[i].replace('*', '').replace('Fuente:', '').strip()
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
# Add table title with MsoCaption class and bookmark for Word cross-reference
|
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference
|
||||||
|
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
|
||||||
bookmark_id = f"_TocTabla{table_counter}"
|
bookmark_id = f"_TocTabla{table_counter}"
|
||||||
if table_title:
|
if table_title:
|
||||||
clean_title = table_title.replace(f"Tabla {table_counter}.", "").strip()
|
clean_title = table_title.replace(f"Tabla {table_counter}.", "").strip()
|
||||||
html_blocks.append(f'<p class=MsoCaption><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{clean_title}</span></i></p>')
|
|
||||||
else:
|
else:
|
||||||
html_blocks.append(f'<p class=MsoCaption><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">Tabla de datos.</span></i></p>')
|
clean_title = "Tabla de datos."
|
||||||
|
html_blocks.append(f'''<p class=MsoCaption><a name="{bookmark_id}"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla </span></b></a><!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">{table_counter}</span></b><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{clean_title}</span></i></p>''')
|
||||||
|
|
||||||
# Build table HTML
|
# Build table HTML with APA style (horizontal lines only, no vertical)
|
||||||
table_html = '<table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 style="border-collapse:collapse;border:none;mso-border-alt:solid windowtext .5pt">'
|
table_html = '<table class=MsoTableGrid border=0 cellspacing=0 cellpadding=0 style="border-collapse:collapse;border:none">'
|
||||||
for j, tline in enumerate(table_lines):
|
for j, tline in enumerate(table_lines):
|
||||||
cells = [c.strip() for c in tline.split('|')[1:-1]]
|
cells = [c.strip() for c in tline.split('|')[1:-1]]
|
||||||
table_html += '<tr>'
|
table_html += '<tr>'
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
if j == 0:
|
if j == 0:
|
||||||
# Header row
|
# Header row: top and bottom border, bold text
|
||||||
table_html += f'<td style="border:solid windowtext 1.0pt;padding:5px;background:#F0F0F0"><p class=MsoNormal style="margin:0"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
|
table_html += f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
|
||||||
|
elif j == len(table_lines) - 1:
|
||||||
|
# Last row: bottom border only
|
||||||
|
table_html += f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
|
||||||
else:
|
else:
|
||||||
table_html += f'<td style="border:solid windowtext 1.0pt;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
|
# Middle rows: no borders
|
||||||
|
table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
|
||||||
table_html += '</tr>'
|
table_html += '</tr>'
|
||||||
table_html += '</table>'
|
table_html += '</table>'
|
||||||
html_blocks.append(table_html)
|
html_blocks.append(table_html)
|
||||||
|
|||||||
@@ -165,38 +165,16 @@ Los métodos de HPO incluyen:
|
|||||||
La combinación Ray Tune + Optuna permite búsquedas eficientes en espacios de alta dimensionalidad.
|
La combinación Ray Tune + Optuna permite búsquedas eficientes en espacios de alta dimensionalidad.
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
flowchart TD
|
flowchart LR
|
||||||
subgraph "Ray Tune"
|
A["Espacio de<br/>búsqueda"] --> B["Ray Tune<br/>Scheduler"]
|
||||||
A["Espacio de<br/>búsqueda"]
|
B --> C["Trials<br/>paralelos"]
|
||||||
B["Scheduler<br/>(gestión de trials)"]
|
C --> D["Evaluación<br/>OCR"]
|
||||||
C["Trial 1"]
|
D --> E["Métricas<br/>CER/WER"]
|
||||||
D["Trial 2"]
|
E --> F["Optuna<br/>TPE"]
|
||||||
E["Trial N"]
|
F -->|"Nueva config"| B
|
||||||
end
|
|
||||||
|
|
||||||
subgraph "Optuna (TPE)"
|
|
||||||
F["Modelo probabilístico<br/>de la función objetivo"]
|
|
||||||
G["Sugiere nueva<br/>configuración"]
|
|
||||||
end
|
|
||||||
|
|
||||||
subgraph "Evaluación"
|
|
||||||
H["Ejecuta modelo OCR<br/>con config"]
|
|
||||||
I["Calcula métricas<br/>(CER, WER)"]
|
|
||||||
end
|
|
||||||
|
|
||||||
A --> B
|
|
||||||
B --> C & D & E
|
|
||||||
C & D & E --> H
|
|
||||||
H --> I
|
|
||||||
I -->|"Resultados"| F
|
|
||||||
F --> G
|
|
||||||
G -->|"Nueva config"| B
|
|
||||||
|
|
||||||
style A fill:#fff3e0
|
|
||||||
style I fill:#e8f5e9
|
|
||||||
```
|
```
|
||||||
|
|
||||||
*Figura 2. Arquitectura de optimización de hiperparámetros con Ray Tune y Optuna.*
|
*Figura 2. Ciclo de optimización de hiperparámetros con Ray Tune y Optuna.*
|
||||||
|
|
||||||
#### HPO en Sistemas OCR
|
#### HPO en Sistemas OCR
|
||||||
|
|
||||||
|
|||||||
@@ -15,19 +15,29 @@ El repositorio incluye:
|
|||||||
|
|
||||||
## A.2 Estructura del Repositorio
|
## A.2 Estructura del Repositorio
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
root["MastersThesis/"] --> docs["docs/"]
|
||||||
|
root --> src["src/"]
|
||||||
|
root --> results["results/"]
|
||||||
|
root --> instructions["instructions/"]
|
||||||
|
root --> readme["README.md"]
|
||||||
|
|
||||||
|
src --> nb1["paddle_ocr_fine_tune_unir_raytune.ipynb"]
|
||||||
|
src --> py1["paddle_ocr_tuning.py"]
|
||||||
|
src --> py2["dataset_manager.py"]
|
||||||
|
src --> nb2["prepare_dataset.ipynb"]
|
||||||
|
src --> csv["raytune_results_*.csv"]
|
||||||
```
|
```
|
||||||
MastersThesis/
|
|
||||||
├── docs/ # Capítulos de la tesis en Markdown
|
*Figura 8. Estructura del repositorio del proyecto.*
|
||||||
├── src/
|
|
||||||
│ ├── paddle_ocr_fine_tune_unir_raytune.ipynb # Experimento principal
|
**Descripción de componentes:**
|
||||||
│ ├── paddle_ocr_tuning.py # Script de evaluación CLI
|
|
||||||
│ ├── dataset_manager.py # Clase ImageTextDataset
|
- **docs/**: Capítulos de la tesis en Markdown
|
||||||
│ ├── prepare_dataset.ipynb # Preparación del dataset
|
- **src/**: Código fuente (notebooks y scripts)
|
||||||
│ └── raytune_paddle_subproc_results_*.csv # Resultados de 64 trials
|
- **results/**: Resultados de benchmarks en CSV
|
||||||
├── results/ # Resultados de benchmarks
|
- **instructions/**: Instrucciones y plantilla UNIR
|
||||||
├── instructions/ # Instrucciones y plantilla UNIR
|
|
||||||
└── README.md
|
|
||||||
```
|
|
||||||
|
|
||||||
## A.3 Requisitos de Software
|
## A.3 Requisitos de Software
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ def extract_mermaid_diagrams():
|
|||||||
'02_contexto_estado_arte.md',
|
'02_contexto_estado_arte.md',
|
||||||
'03_objetivos_metodologia.md',
|
'03_objetivos_metodologia.md',
|
||||||
'04_desarrollo_especifico.md',
|
'04_desarrollo_especifico.md',
|
||||||
|
'07_anexo_a.md',
|
||||||
]
|
]
|
||||||
|
|
||||||
for md_file in md_files:
|
for md_file in md_files:
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 115 KiB After Width: | Height: | Size: 43 KiB |
BIN
thesis_output/figures/figura_8.png
Normal file
BIN
thesis_output/figures/figura_8.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 100 KiB |
@@ -33,5 +33,10 @@
|
|||||||
"file": "figura_7.png",
|
"file": "figura_7.png",
|
||||||
"title": "Comparación Baseline vs Optimizado (24 páginas)",
|
"title": "Comparación Baseline vs Optimizado (24 páginas)",
|
||||||
"index": 7
|
"index": 7
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"file": "figura_8.png",
|
||||||
|
"title": "Diagrama de 07_anexo_a.md",
|
||||||
|
"index": 8
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
Binary file not shown.
Reference in New Issue
Block a user