#!/usr/bin/env python3
"""Convert custom OCR dataset to Hugging Face format."""

import json
import shutil
from pathlib import Path


def convert_dataset(source_dir: str, output_dir: str):
    """Convert a folder-based OCR dataset to the HF ImageFolder layout.

    Expected input layout::

        <source_dir>/<doc_id>/img/<name>_<page>.png
        <source_dir>/<doc_id>/txt/<name>_<page>.txt

    Produced output::

        <output_dir>/data/page_<doc_id>_<page:04d>.png
        <output_dir>/metadata.jsonl   (one JSON object per converted page)
        <output_dir>/README.md        (written by ``write_dataset_card``)

    Document folders lacking ``img`` or ``txt`` and images without a
    matching text file are skipped silently. Images whose stem does not end
    in ``_<integer>``, or whose output name was already produced by another
    image, are skipped with a message on stderr instead of aborting the run.

    Args:
        source_dir: Root folder containing one sub-folder per document.
        output_dir: Destination folder; created if it does not exist.

    Raises:
        FileNotFoundError: If ``source_dir`` does not exist.
        NotADirectoryError: If ``source_dir`` exists but is not a directory.
    """
    # Local import: at module level ``sys`` is only imported under __main__.
    import sys

    source = Path(source_dir)
    output = Path(output_dir)

    # Validate the input before touching the filesystem so that a wrong path
    # does not leave an empty output tree behind.
    if not source.exists():
        raise FileNotFoundError(f"Source directory not found: {source}")
    if not source.is_dir():
        raise NotADirectoryError(f"Source path is not a directory: {source}")

    data_dir = output / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    metadata = []
    used_names = set()  # output file names already written in this run

    for doc_folder in sorted(source.iterdir()):
        if not doc_folder.is_dir():
            continue

        doc_id = doc_folder.name
        img_dir = doc_folder / "img"
        txt_dir = doc_folder / "txt"

        if not img_dir.exists() or not txt_dir.exists():
            continue

        for img_file in sorted(img_dir.glob("*.png")):
            txt_file = txt_dir / f"{img_file.stem}.txt"
            if not txt_file.exists():
                continue

            # The page number is the token after the last underscore.
            try:
                page_num = int(img_file.stem.split("_")[-1])
            except ValueError:
                print(
                    f"Skipping {img_file}: file name has no numeric page suffix",
                    file=sys.stderr,
                )
                continue

            new_name = f"page_{doc_id}_{page_num:04d}.png"

            # Two inputs can map to the same output name (e.g. "a_1.png" and
            # "b_01.png"); keeping both would overwrite the image and emit
            # duplicate metadata rows with conflicting text.
            if new_name in used_names:
                print(
                    f"Skipping {img_file}: output name {new_name} already used",
                    file=sys.stderr,
                )
                continue

            # Read the text before copying so a failed read does not leave an
            # image in data/ without a metadata row.
            text = txt_file.read_text(encoding="utf-8").strip()
            shutil.copy(img_file, data_dir / new_name)
            used_names.add(new_name)

            metadata.append({
                "file_name": f"data/{new_name}",
                "text": text,
                "document_id": doc_id,
                "page_number": page_num,
            })

    # Write metadata.jsonl: one JSON object per line, non-ASCII kept verbatim.
    with open(output / "metadata.jsonl", "w", encoding="utf-8") as f:
        for entry in metadata:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Write dataset card
    write_dataset_card(output, len(metadata))

    print(f"Converted {len(metadata)} samples to {output}")


def write_dataset_card(output_dir: Path, num_samples: int):
    """Render the Hugging Face dataset card and save it as ``README.md``.

    The card consists of a YAML front-matter section (features, split size,
    license, language, task category, tags) followed by a Markdown
    description. ``num_samples`` is interpolated into both parts.

    Args:
        output_dir: Directory in which ``README.md`` is written.
        num_samples: Number of image/text pairs reported in the card.
    """
    # YAML front matter, delimited by the two '---' lines.
    front_matter = f'''---
dataset_info:
  features:
    - name: image
      dtype: image
    - name: text
      dtype: string
    - name: document_id
      dtype: string
    - name: page_number
      dtype: int32
  splits:
    - name: train
      num_examples: {num_samples}
license: cc-by-4.0
language:
  - es
task_categories:
  - image-to-text
tags:
  - ocr
  - spanish
  - academic-documents
  - unir
---
'''

    # Markdown body; the leading newline is the blank line after the front matter.
    description = f'''
# UNIR OCR Dataset

Dataset de documentos académicos en español para evaluación de sistemas OCR.

## Descripción

- **Idioma**: Español
- **Dominio**: Documentos académicos (instrucciones TFE de UNIR)
- **Formato**: Imágenes PNG (300 DPI) + texto ground truth
- **Total**: {num_samples} pares imagen-texto

## Uso

```python
from datasets import load_dataset

dataset = load_dataset("path/to/dataset")

for sample in dataset["train"]:
    image = sample["image"]
    text = sample["text"]
```

## Estructura

Cada muestra contiene:
- `image`: Imagen de la página (PIL.Image)
- `text`: Texto ground truth extraído del PDF
- `document_id`: ID del documento fuente
- `page_number`: Número de página

## Citación

Parte del TFM "Optimización de Hiperparámetros OCR con Ray Tune" - UNIR 2025
'''

    readme_path = output_dir / "README.md"
    with open(readme_path, "w", encoding="utf-8") as handle:
        handle.write(front_matter + description)


if __name__ == "__main__":
    import sys

    # Optional positional arguments: <source_dir> [<output_dir>].
    # Anything not supplied falls back to the default paths below.
    cli_args = sys.argv[1:]
    source = cli_args[0] if cli_args else "src/dataset"
    output = cli_args[1] if len(cli_args) >= 2 else "src/dataset_hf"

    convert_dataset(source, output)