#!/usr/bin/env python3
"""Convert custom OCR dataset to Hugging Face format."""

import json
import shutil
from pathlib import Path

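# Expected input layout (one folder per document):
#   <source_dir>/<doc_id>/img/*.png   page images
#   <source_dir>/<doc_id>/txt/*.txt   ground-truth text, one file per image
#
# Output layout (Hugging Face ImageFolder format):
#   <output_dir>/data/*.png        renamed page images
#   <output_dir>/metadata.jsonl    one JSON record per image
#   <output_dir>/README.md         dataset card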
def convert_dataset(source_dir: str, output_dir: str):
    """Convert folder-based dataset to HF ImageFolder format."""
    source = Path(source_dir)
    output = Path(output_dir)
    data_dir = output / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    metadata = []

    for doc_folder in sorted(source.iterdir()):
        if not doc_folder.is_dir():
            continue

        doc_id = doc_folder.name
        img_dir = doc_folder / "img"
        txt_dir = doc_folder / "txt"

        if not img_dir.exists() or not txt_dir.exists():
            continue

        for img_file in sorted(img_dir.glob("*.png")):
            txt_file = txt_dir / f"{img_file.stem}.txt"
            if not txt_file.exists():
                continue

            # Extract the page number from the stem (assumes names ending
            # in "_<number>", e.g. "page_0003.png")
            page_num = int(img_file.stem.split("_")[-1])

            # New filename: page_{doc_id}_{page_num:04d}.png
            new_name = f"page_{doc_id}_{page_num:04d}.png"

            # Copy the image into the flat data/ directory
            shutil.copy(img_file, data_dir / new_name)

            # Read the ground-truth text
            text = txt_file.read_text(encoding="utf-8").strip()

            # Add metadata entry
            metadata.append({
                "file_name": f"data/{new_name}",
                "text": text,
                "document_id": doc_id,
                "page_number": page_num,
            })

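    # Note: the ImageFolder loader discovers metadata.jsonl automatically, and
    # the "file_name" values above are resolved relative to the directory that
    # contains the metadata file.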
    # Write metadata.jsonl
    with open(output / "metadata.jsonl", "w", encoding="utf-8") as f:
        for entry in metadata:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Write the dataset card
    write_dataset_card(output, len(metadata))

    print(f"Converted {len(metadata)} samples to {output}")


def write_dataset_card(output_dir: Path, num_samples: int):
    """Write the HF dataset card (README.md with YAML front matter)."""
    # The YAML block between the --- markers is parsed by the Hugging Face
    # Hub as structured dataset metadata.
    card = f'''---
dataset_info:
  features:
  - name: image
    dtype: image
  - name: text
    dtype: string
  - name: document_id
    dtype: string
  - name: page_number
    dtype: int32
  splits:
  - name: train
    num_examples: {num_samples}
license: cc-by-4.0
language:
- es
task_categories:
- image-to-text
tags:
- ocr
- spanish
- academic-documents
- unir
---

# UNIR OCR Dataset

A dataset of Spanish academic documents for evaluating OCR systems.

## Description

- **Language**: Spanish
- **Domain**: Academic documents (UNIR TFE instructions)
- **Format**: PNG images (300 DPI) + ground-truth text
- **Total**: {num_samples} image-text pairs

## Usage

```python
from datasets import load_dataset

dataset = load_dataset("path/to/dataset")

for sample in dataset["train"]:
    image = sample["image"]
    text = sample["text"]
```
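
Since the data is stored in the ImageFolder layout, a local copy can also be
loaded explicitly:

```python
dataset = load_dataset("imagefolder", data_dir="path/to/dataset")
```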

## Structure

Each sample contains:
- `image`: page image (PIL.Image)
- `text`: ground-truth text extracted from the source PDF
- `document_id`: ID of the source document
- `page_number`: page number

## Citation

Part of the TFM (Master's thesis) "Optimización de Hiperparámetros OCR con Ray Tune" - UNIR 2025
'''
    (output_dir / "README.md").write_text(card, encoding="utf-8")


if __name__ == "__main__":
    import sys

    # Optional CLI arguments: source_dir and output_dir (repo-relative defaults)
    source = sys.argv[1] if len(sys.argv) > 1 else "src/dataset"
    output = sys.argv[2] if len(sys.argv) > 2 else "src/dataset_hf"

    convert_dataset(source, output)
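

# Quick sanity check for the converted output, as a sketch: assumes the
# `datasets` library is installed ("src/dataset_hf" is the default output
# path above):
#
#   from datasets import load_dataset
#   ds = load_dataset("imagefolder", data_dir="src/dataset_hf", split="train")
#   print(len(ds), ds[0]["page_number"], ds[0]["text"][:80])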