generation test
This commit is contained in:
@@ -32,7 +32,7 @@ def md_to_html_para(text):
|
||||
# Italic
|
||||
text = re.sub(r'\*([^*]+)\*', r'<i>\1</i>', text)
|
||||
# Inline code
|
||||
text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt;background:#f5f5f5">\1</span>', text)
|
||||
text = re.sub(r'`([^`]+)`', r'<span style="font-family:Consolas;font-size:10pt">\1</span>', text)
|
||||
return text
|
||||
|
||||
def extract_table_title(lines, current_index):
|
||||
@@ -104,9 +104,10 @@ def parse_md_to_html_blocks(md_content):
|
||||
fig_file = f'figures/figura_{figure_counter}.png'
|
||||
fig_path = os.path.join(BASE_DIR, 'thesis_output', fig_file)
|
||||
|
||||
# Create figure with MsoCaption class and bookmark for Word cross-reference
|
||||
# Create figure with MsoCaption class and proper Word SEQ field for cross-reference
|
||||
# Format: "Figura X." in bold, title in italic (per UNIR guidelines)
|
||||
bookmark_id = f"_TocFigura{figure_counter}"
|
||||
html_blocks.append(f'''<p class=MsoCaption style="text-align:center"><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Figura {figure_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{fig_title}</span></i></p>''')
|
||||
html_blocks.append(f'''<p class=MsoCaption style="text-align:center"><a name="{bookmark_id}"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Figura </span></b></a><!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Figura \\* ARABIC <span style='mso-element:field-separator'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">{figure_counter}</span></b><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{fig_title}</span></i></p>''')
|
||||
|
||||
if os.path.exists(fig_path):
|
||||
# Use actual image with proper Word-compatible format (max 400px width, 500px height to fit page)
|
||||
@@ -131,7 +132,7 @@ def parse_md_to_html_blocks(md_content):
|
||||
code = '\n'.join(code_lines)
|
||||
# Escape HTML entities in code
|
||||
code = code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
||||
html_blocks.append(f'<p class=MsoNormal style="margin-left:1cm;background:#F5F5F5;padding:10px"><span style="font-family:Consolas;font-size:9pt"><pre>{code}</pre></span></p>')
|
||||
html_blocks.append(f'<p class=MsoNormal style="margin-left:1cm"><span style="font-family:Consolas;font-size:9pt"><pre>{code}</pre></span></p>')
|
||||
i += 1
|
||||
continue
|
||||
|
||||
@@ -186,25 +187,30 @@ def parse_md_to_html_blocks(md_content):
|
||||
table_source = lines[i].replace('*', '').replace('Fuente:', '').strip()
|
||||
i += 1
|
||||
|
||||
# Add table title with MsoCaption class and bookmark for Word cross-reference
|
||||
# Add table title with MsoCaption class and proper Word SEQ field for cross-reference
|
||||
# Format: "Tabla X." in bold, title in italic (per UNIR guidelines)
|
||||
bookmark_id = f"_TocTabla{table_counter}"
|
||||
if table_title:
|
||||
clean_title = table_title.replace(f"Tabla {table_counter}.", "").strip()
|
||||
html_blocks.append(f'<p class=MsoCaption><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{clean_title}</span></i></p>')
|
||||
else:
|
||||
html_blocks.append(f'<p class=MsoCaption><a name="{bookmark_id}"><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla {table_counter}. </span></a><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">Tabla de datos.</span></i></p>')
|
||||
clean_title = "Tabla de datos."
|
||||
html_blocks.append(f'''<p class=MsoCaption><a name="{bookmark_id}"><b><span lang=ES style="font-size:12.0pt;line-height:150%">Tabla </span></b></a><!--[if supportFields]><span style='mso-element:field-begin'></span> SEQ Tabla \\* ARABIC <span style='mso-element:field-separator'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">{table_counter}</span></b><!--[if supportFields]><span style='mso-element:field-end'></span><![endif]--><b><span lang=ES style="font-size:12.0pt;line-height:150%">.</span></b><span lang=ES style="font-size:12.0pt;line-height:150%"> </span><i><span lang=ES style="font-size:12.0pt;line-height:150%;font-weight:normal">{clean_title}</span></i></p>''')
|
||||
|
||||
# Build table HTML
|
||||
table_html = '<table class=MsoTableGrid border=1 cellspacing=0 cellpadding=0 style="border-collapse:collapse;border:none;mso-border-alt:solid windowtext .5pt">'
|
||||
# Build table HTML with APA style (horizontal lines only, no vertical)
|
||||
table_html = '<table class=MsoTableGrid border=0 cellspacing=0 cellpadding=0 style="border-collapse:collapse;border:none">'
|
||||
for j, tline in enumerate(table_lines):
|
||||
cells = [c.strip() for c in tline.split('|')[1:-1]]
|
||||
table_html += '<tr>'
|
||||
for cell in cells:
|
||||
if j == 0:
|
||||
# Header row
|
||||
table_html += f'<td style="border:solid windowtext 1.0pt;padding:5px;background:#F0F0F0"><p class=MsoNormal style="margin:0"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
|
||||
# Header row: top and bottom border, bold text
|
||||
table_html += f'<td style="border-top:solid windowtext 1.0pt;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><b><span lang=ES>{md_to_html_para(cell)}</span></b></p></td>'
|
||||
elif j == len(table_lines) - 1:
|
||||
# Last row: bottom border only
|
||||
table_html += f'<td style="border-top:none;border-bottom:solid windowtext 1.0pt;border-left:none;border-right:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
|
||||
else:
|
||||
table_html += f'<td style="border:solid windowtext 1.0pt;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
|
||||
# Middle rows: no borders
|
||||
table_html += f'<td style="border:none;padding:5px"><p class=MsoNormal style="margin:0"><span lang=ES>{md_to_html_para(cell)}</span></p></td>'
|
||||
table_html += '</tr>'
|
||||
table_html += '</table>'
|
||||
html_blocks.append(table_html)
|
||||
|
||||
@@ -165,38 +165,16 @@ Los métodos de HPO incluyen:
|
||||
La combinación Ray Tune + Optuna permite búsquedas eficientes en espacios de alta dimensionalidad.
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
subgraph "Ray Tune"
|
||||
A["Espacio de<br/>búsqueda"]
|
||||
B["Scheduler<br/>(gestión de trials)"]
|
||||
C["Trial 1"]
|
||||
D["Trial 2"]
|
||||
E["Trial N"]
|
||||
end
|
||||
|
||||
subgraph "Optuna (TPE)"
|
||||
F["Modelo probabilístico<br/>de la función objetivo"]
|
||||
G["Sugiere nueva<br/>configuración"]
|
||||
end
|
||||
|
||||
subgraph "Evaluación"
|
||||
H["Ejecuta modelo OCR<br/>con config"]
|
||||
I["Calcula métricas<br/>(CER, WER)"]
|
||||
end
|
||||
|
||||
A --> B
|
||||
B --> C & D & E
|
||||
C & D & E --> H
|
||||
H --> I
|
||||
I -->|"Resultados"| F
|
||||
F --> G
|
||||
G -->|"Nueva config"| B
|
||||
|
||||
style A fill:#fff3e0
|
||||
style I fill:#e8f5e9
|
||||
flowchart LR
|
||||
A["Espacio de<br/>búsqueda"] --> B["Ray Tune<br/>Scheduler"]
|
||||
B --> C["Trials<br/>paralelos"]
|
||||
C --> D["Evaluación<br/>OCR"]
|
||||
D --> E["Métricas<br/>CER/WER"]
|
||||
E --> F["Optuna<br/>TPE"]
|
||||
F -->|"Nueva config"| B
|
||||
```
|
||||
|
||||
*Figura 2. Arquitectura de optimización de hiperparámetros con Ray Tune y Optuna.*
|
||||
*Figura 2. Ciclo de optimización de hiperparámetros con Ray Tune y Optuna.*
|
||||
|
||||
#### HPO en Sistemas OCR
|
||||
|
||||
|
||||
@@ -15,19 +15,29 @@ El repositorio incluye:
|
||||
|
||||
## A.2 Estructura del Repositorio
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
root["MastersThesis/"] --> docs["docs/"]
|
||||
root --> src["src/"]
|
||||
root --> results["results/"]
|
||||
root --> instructions["instructions/"]
|
||||
root --> readme["README.md"]
|
||||
|
||||
src --> nb1["paddle_ocr_fine_tune_unir_raytune.ipynb"]
|
||||
src --> py1["paddle_ocr_tuning.py"]
|
||||
src --> py2["dataset_manager.py"]
|
||||
src --> nb2["prepare_dataset.ipynb"]
|
||||
src --> csv["raytune_results_*.csv"]
|
||||
```
|
||||
MastersThesis/
|
||||
├── docs/ # Capítulos de la tesis en Markdown
|
||||
├── src/
|
||||
│ ├── paddle_ocr_fine_tune_unir_raytune.ipynb # Experimento principal
|
||||
│ ├── paddle_ocr_tuning.py # Script de evaluación CLI
|
||||
│ ├── dataset_manager.py # Clase ImageTextDataset
|
||||
│ ├── prepare_dataset.ipynb # Preparación del dataset
|
||||
│ └── raytune_paddle_subproc_results_*.csv # Resultados de 64 trials
|
||||
├── results/ # Resultados de benchmarks
|
||||
├── instructions/ # Instrucciones y plantilla UNIR
|
||||
└── README.md
|
||||
```
|
||||
|
||||
*Figura 8. Estructura del repositorio del proyecto.*
|
||||
|
||||
**Descripción de componentes:**
|
||||
|
||||
- **docs/**: Capítulos de la tesis en Markdown
|
||||
- **src/**: Código fuente (notebooks y scripts)
|
||||
- **results/**: Resultados de benchmarks en CSV
|
||||
- **instructions/**: Instrucciones y plantilla UNIR
|
||||
|
||||
## A.3 Requisitos de Software
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ def extract_mermaid_diagrams():
|
||||
'02_contexto_estado_arte.md',
|
||||
'03_objetivos_metodologia.md',
|
||||
'04_desarrollo_especifico.md',
|
||||
'07_anexo_a.md',
|
||||
]
|
||||
|
||||
for md_file in md_files:
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 115 KiB After Width: | Height: | Size: 43 KiB |
BIN
thesis_output/figures/figura_8.png
Normal file
BIN
thesis_output/figures/figura_8.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 100 KiB |
@@ -33,5 +33,10 @@
|
||||
"file": "figura_7.png",
|
||||
"title": "Comparación Baseline vs Optimizado (24 páginas)",
|
||||
"index": 7
|
||||
},
|
||||
{
|
||||
"file": "figura_8.png",
|
||||
"title": "Diagrama de 07_anexo_a.md",
|
||||
"index": 8
|
||||
}
|
||||
]
|
||||
Binary file not shown.
Reference in New Issue
Block a user