#!/usr/bin/env python3
"""Convert custom OCR dataset to Hugging Face format.

Expected source layout (one sub-folder per document)::

    <source_dir>/<doc_id>/img/<anything>_<page>.png
    <source_dir>/<doc_id>/txt/<anything>_<page>.txt

Produced output layout::

    <output_dir>/data/page_<doc_id>_<page:04d>.png
    <output_dir>/metadata.jsonl
    <output_dir>/README.md
"""

import json
import shutil
import sys
from pathlib import Path


def convert_dataset(source_dir: str, output_dir: str) -> None:
    """Convert folder-based dataset to HF ImageFolder format.

    Every ``*.png`` under ``<doc>/img`` that has a same-stem ``.txt`` under
    ``<doc>/txt`` is copied to ``<output_dir>/data`` and described by one
    line of ``<output_dir>/metadata.jsonl``. A dataset card (``README.md``)
    is written next to the metadata file.

    Inputs that cannot be converted are skipped rather than aborting the
    run: non-directory entries, documents missing ``img`` or ``txt``,
    images without a matching text file, images whose stem does not end in
    ``_<integer>`` and images that would collide with an already converted
    page. The last two cases emit a warning on stderr.

    Args:
        source_dir: Root folder containing one sub-folder per document.
        output_dir: Destination folder; created if it does not exist.

    Raises:
        FileNotFoundError: If ``source_dir`` does not exist.
        NotADirectoryError: If ``source_dir`` exists but is not a directory.
    """
    source = Path(source_dir)
    output = Path(output_dir)

    # Validate the input before touching the filesystem so that a typo in
    # the source path does not leave an empty output tree behind.
    if not source.exists():
        raise FileNotFoundError(f"Source directory not found: {source}")
    if not source.is_dir():
        raise NotADirectoryError(f"Source path is not a directory: {source}")

    data_dir = output / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    metadata = []
    # Target names already produced in this run; used to detect two source
    # images that map to the same output file.
    used_names = set()

    for doc_folder in sorted(source.iterdir()):
        if not doc_folder.is_dir():
            continue

        doc_id = doc_folder.name
        img_dir = doc_folder / "img"
        txt_dir = doc_folder / "txt"

        if not img_dir.exists() or not txt_dir.exists():
            continue

        for img_file in sorted(img_dir.glob("*.png")):
            txt_file = txt_dir / f"{img_file.stem}.txt"
            if not txt_file.exists():
                continue

            # The page number is the last "_"-separated token of the stem.
            try:
                page_num = int(img_file.stem.split("_")[-1])
            except ValueError:
                print(
                    f"Skipping {img_file}: no page number in file name",
                    file=sys.stderr,
                )
                continue

            new_name = f"page_{doc_id}_{page_num:04d}.png"
            if new_name in used_names:
                # Copying would overwrite the earlier image and leave two
                # metadata rows with different text for the same file.
                print(
                    f"Skipping {img_file}: {new_name} already produced",
                    file=sys.stderr,
                )
                continue
            used_names.add(new_name)

            shutil.copy(img_file, data_dir / new_name)
            text = txt_file.read_text(encoding="utf-8").strip()

            metadata.append({
                "file_name": f"data/{new_name}",
                "text": text,
                "document_id": doc_id,
                "page_number": page_num,
            })

    # One JSON object per line; ensure_ascii=False keeps accented
    # characters readable in the output file.
    with open(output / "metadata.jsonl", "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in metadata
        )

    write_dataset_card(output, len(metadata))

    print(f"Converted {len(metadata)} samples to {output}")


def write_dataset_card(output_dir: Path, num_samples: int) -> None:
    """Write HF dataset card.

    Args:
        output_dir: Existing folder in which ``README.md`` is written.
        num_samples: Number of image/text pairs, interpolated into the
            YAML front matter and the description section of the card.
    """
    # The card body is flush-left on purpose: the text is written verbatim
    # and the YAML front matter must start in the first column.
    card = f'''---
dataset_info:
  features:
    - name: image
      dtype: image
    - name: text
      dtype: string
    - name: document_id
      dtype: string
    - name: page_number
      dtype: int32
  splits:
    - name: train
      num_examples: {num_samples}
license: cc-by-4.0
language:
  - es
task_categories:
  - image-to-text
tags:
  - ocr
  - spanish
  - academic-documents
  - unir
---

# UNIR OCR Dataset

Dataset de documentos académicos en español para evaluación de sistemas OCR.

## Descripción

- **Idioma**: Español
- **Dominio**: Documentos académicos (instrucciones TFE de UNIR)
- **Formato**: Imágenes PNG (300 DPI) + texto ground truth
- **Total**: {num_samples} pares imagen-texto

## Uso

```python
from datasets import load_dataset

dataset = load_dataset("path/to/dataset")

for sample in dataset["train"]:
    image = sample["image"]
    text = sample["text"]
```

## Estructura

Cada muestra contiene:
- `image`: Imagen de la página (PIL.Image)
- `text`: Texto ground truth extraído del PDF
- `document_id`: ID del documento fuente
- `page_number`: Número de página

## Citación

Parte del TFM "Optimización de Hiperparámetros OCR con Ray Tune" - UNIR 2025
'''
    (output_dir / "README.md").write_text(card, encoding="utf-8")


if __name__ == "__main__":
    # Optional positional arguments: source folder, then output folder.
    source = sys.argv[1] if len(sys.argv) > 1 else "src/dataset"
    output = sys.argv[2] if len(sys.argv) > 2 else "src/dataset_hf"
    convert_dataset(source, output)