src/dataset_formatting/convert_to_hf_dataset.py

#!/usr/bin/env python3
"""Convert custom OCR dataset to Hugging Face format."""

import json
import shutil
from pathlib import Path


def convert_dataset(source_dir: str, output_dir: str):
    """Convert folder-based dataset to HF ImageFolder format.

    Expected input layout: ``<source_dir>/<doc_id>/img/*.png`` with one
    ``<source_dir>/<doc_id>/txt/<same stem>.txt`` per image.  The last
    ``_``-separated token of the image stem is parsed as the page number.

    Written under ``output_dir``:

    * ``data/page_<doc_id>_<page:04d>.png`` -- a copy of every paired image
    * ``metadata.jsonl`` -- one JSON object per sample with the keys
      ``file_name``, ``text``, ``document_id`` and ``page_number``
    * ``README.md`` -- the dataset card, produced by ``write_dataset_card``

    Entries that cannot be converted are skipped rather than aborting the
    run: non-directory entries in ``source_dir``, document folders lacking
    ``img`` or ``txt``, images without a text file, and images whose stem
    does not end in an integer (the last case is reported on stdout).

    Args:
        source_dir: Root folder containing one sub-folder per document.
        output_dir: Destination folder; created if it does not exist.

    Raises:
        OSError: If ``source_dir`` cannot be listed, e.g.
            ``FileNotFoundError`` when it does not exist.
    """

    source = Path(source_dir)
    output = Path(output_dir)

    # List the source before touching the output location, so that a wrong
    # source path fails without leaving an empty output tree behind.
    doc_folders = sorted(source.iterdir())

    data_dir = output / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    metadata = []

    for doc_folder in doc_folders:
        if not doc_folder.is_dir():
            continue

        doc_id = doc_folder.name
        img_dir = doc_folder / "img"
        txt_dir = doc_folder / "txt"

        if not img_dir.exists() or not txt_dir.exists():
            continue

        for img_file in sorted(img_dir.glob("*.png")):
            txt_file = txt_dir / f"{img_file.stem}.txt"
            if not txt_file.exists():
                continue

            # Extract page number.  A stray PNG without a numeric suffix must
            # not abort the run half-way (images copied, no metadata written).
            try:
                page_num = int(img_file.stem.split("_")[-1])
            except ValueError:
                print(f"Skipping {img_file}: file name has no numeric page suffix")
                continue

            # Read the text before copying, so an unreadable text file does
            # not leave an image in data/ that has no metadata entry.
            text = txt_file.read_text(encoding="utf-8").strip()

            # New filename: page_{doc_id}_{page_num:04d}.png
            new_name = f"page_{doc_id}_{page_num:04d}.png"

            # Copy image
            shutil.copy(img_file, data_dir / new_name)

            # Add metadata entry; file_name is relative to metadata.jsonl.
            metadata.append({
                "file_name": f"data/{new_name}",
                "text": text,
                "document_id": doc_id,
                "page_number": page_num,
            })

    # Write metadata.jsonl (ensure_ascii=False keeps accented characters
    # readable instead of \u-escaping them).
    with open(output / "metadata.jsonl", "w", encoding="utf-8") as f:
        for entry in metadata:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Write dataset card
    write_dataset_card(output, len(metadata))

    print(f"Converted {len(metadata)} samples to {output}")


def write_dataset_card(output_dir: Path, num_samples: int):
    """Render the dataset card and save it as ``README.md`` in *output_dir*.

    The card is a YAML front-matter block (feature list, size of the
    ``train`` split, license, language, task category, tags) followed by a
    Spanish-language Markdown description.  ``num_samples`` is interpolated
    twice: as the split's ``num_examples`` and in the "Total" bullet.

    Args:
        output_dir: Directory that receives ``README.md``; it is not created
            here, so it must already exist.
        num_samples: Number of image/text pairs to report in the card.
    """
    # Machine-readable header.
    front_matter = f'''---
dataset_info:
  features:
    - name: image
      dtype: image
    - name: text
      dtype: string
    - name: document_id
      dtype: string
    - name: page_number
      dtype: int32
  splits:
    - name: train
      num_examples: {num_samples}
license: cc-by-4.0
language:
  - es
task_categories:
  - image-to-text
tags:
  - ocr
  - spanish
  - academic-documents
  - unir
---
'''

    # Human-readable description; the leading newline is the blank line that
    # separates it from the front matter.
    description = f'''
# UNIR OCR Dataset

Dataset de documentos académicos en español para evaluación de sistemas OCR.

## Descripción

- **Idioma**: Español
- **Dominio**: Documentos académicos (instrucciones TFE de UNIR)
- **Formato**: Imágenes PNG (300 DPI) + texto ground truth
- **Total**: {num_samples} pares imagen-texto

## Uso

```python
from datasets import load_dataset

dataset = load_dataset("path/to/dataset")

for sample in dataset["train"]:
    image = sample["image"]
    text = sample["text"]
```

## Estructura

Cada muestra contiene:
- `image`: Imagen de la página (PIL.Image)
- `text`: Texto ground truth extraído del PDF
- `document_id`: ID del documento fuente
- `page_number`: Número de página

## Citación

Parte del TFM "Optimización de Hiperparámetros OCR con Ray Tune" - UNIR 2025
'''

    readme_path = output_dir / "README.md"
    readme_path.write_text(front_matter + description, encoding="utf-8")


if __name__ == "__main__":
    # Command-line entry point: ``<script> [source_dir [output_dir]]``.
    import sys

    # Take at most two positional arguments and fill whatever is missing
    # from the tail of the defaults tuple.
    defaults = ("src/dataset", "src/dataset_hf")
    cli_args = sys.argv[1:3]
    source, output = (*cli_args, *defaults[len(cli_args):])

    convert_dataset(source, output)
readme and hf dataset formater. 2026-01-19 14:00:28 +01:00			`#!/usr/bin/env python3`
			`"""Convert custom OCR dataset to Hugging Face format."""`

			`import json`
			`import shutil`
			`from pathlib import Path`


			`def convert_dataset(source_dir: str, output_dir: str):`
			`"""Convert folder-based dataset to HF ImageFolder format."""`

			`source = Path(source_dir)`
			`output = Path(output_dir)`
			`data_dir = output / "data"`
			`data_dir.mkdir(parents=True, exist_ok=True)`

			`metadata = []`

			`for doc_folder in sorted(source.iterdir()):`
			`if not doc_folder.is_dir():`
			`continue`

			`doc_id = doc_folder.name`
			`img_dir = doc_folder / "img"`
			`txt_dir = doc_folder / "txt"`

			`if not img_dir.exists() or not txt_dir.exists():`
			`continue`

			`for img_file in sorted(img_dir.glob("*.png")):`
			`txt_file = txt_dir / f"{img_file.stem}.txt"`
			`if not txt_file.exists():`
			`continue`

			`# Extract page number`
			`page_num = int(img_file.stem.split("_")[-1])`

			`# New filename: page_{doc_id}_{page_num:04d}.png`
			`new_name = f"page_{doc_id}_{page_num:04d}.png"`

			`# Copy image`
			`shutil.copy(img_file, data_dir / new_name)`

			`# Read text`
			`text = txt_file.read_text(encoding="utf-8").strip()`

			`# Add metadata entry`
			`metadata.append({`
			`"file_name": f"data/{new_name}",`
			`"text": text,`
			`"document_id": doc_id,`
			`"page_number": page_num`
			`})`

			`# Write metadata.jsonl`
			`with open(output / "metadata.jsonl", "w", encoding="utf-8") as f:`
			`for entry in metadata:`
			`f.write(json.dumps(entry, ensure_ascii=False) + "\n")`

			`# Write dataset card`
			`write_dataset_card(output, len(metadata))`

			`print(f"Converted {len(metadata)} samples to {output}")`


			`def write_dataset_card(output_dir: Path, num_samples: int):`
			`"""Write HF dataset card."""`
			`card = f'''---`
			`dataset_info:`
			`features:`
			`- name: image`
			`dtype: image`
			`- name: text`
			`dtype: string`
			`- name: document_id`
			`dtype: string`
			`- name: page_number`
			`dtype: int32`
			`splits:`
			`- name: train`
			`num_examples: {num_samples}`
			`license: cc-by-4.0`
			`language:`
			`- es`
			`task_categories:`
			`- image-to-text`
			`tags:`
			`- ocr`
			`- spanish`
			`- academic-documents`
			`- unir`
			`---`

			`# UNIR OCR Dataset`

			`Dataset de documentos académicos en español para evaluación de sistemas OCR.`

			`## Descripción`

			`- Idioma: Español`
			`- Dominio: Documentos académicos (instrucciones TFE de UNIR)`
			`- Formato: Imágenes PNG (300 DPI) + texto ground truth`
			`- Total: {num_samples} pares imagen-texto`

			`## Uso`

			```python
			`from datasets import load_dataset`

			`dataset = load_dataset("path/to/dataset")`

			`for sample in dataset["train"]:`
			`image = sample["image"]`
			`text = sample["text"]`
			```

			`## Estructura`

			`Cada muestra contiene:`
			- `image`: Imagen de la página (PIL.Image)
			- `text`: Texto ground truth extraído del PDF
			- `document_id`: ID del documento fuente
			- `page_number`: Número de página

			`## Citación`

			`Parte del TFM "Optimización de Hiperparámetros OCR con Ray Tune" - UNIR 2025`
			`'''`
			`(output_dir / "README.md").write_text(card, encoding="utf-8")`


			`if __name__ == "__main__":`
			`import sys`

			`source = sys.argv[1] if len(sys.argv) > 1 else "src/dataset"`
			`output = sys.argv[2] if len(sys.argv) > 2 else "src/dataset_hf"`

			`convert_dataset(source, output)`