readme and hf dataset formatter.
All checks were successful
build_docker / build_cpu (pull_request) Successful in 5m46s
build_docker / build_gpu (pull_request) Successful in 19m24s
build_docker / build_easyocr (pull_request) Successful in 16m51s
build_docker / build_easyocr_gpu (pull_request) Successful in 15m50s
build_docker / build_doctr (pull_request) Successful in 18m54s
build_docker / build_doctr_gpu (pull_request) Successful in 13m36s
build_docker / essential (pull_request) Successful in 1s
138
src/dataset_formatting/convert_to_hf_dataset.py
Executable file
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""Convert custom OCR dataset to Hugging Face format."""

import json
import shutil
from pathlib import Path


def convert_dataset(source_dir: str, output_dir: str):
    """Convert folder-based dataset to HF ImageFolder format."""

    source = Path(source_dir)
    output = Path(output_dir)
    data_dir = output / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    metadata = []

    for doc_folder in sorted(source.iterdir()):
        if not doc_folder.is_dir():
            continue

        doc_id = doc_folder.name
        img_dir = doc_folder / "img"
        txt_dir = doc_folder / "txt"

        if not img_dir.exists() or not txt_dir.exists():
            continue

        for img_file in sorted(img_dir.glob("*.png")):
            txt_file = txt_dir / f"{img_file.stem}.txt"
            if not txt_file.exists():
                continue

            # Extract page number
            page_num = int(img_file.stem.split("_")[-1])

            # New filename: page_{doc_id}_{page_num:04d}.png
            new_name = f"page_{doc_id}_{page_num:04d}.png"

            # Copy image
            shutil.copy(img_file, data_dir / new_name)

            # Read text
            text = txt_file.read_text(encoding="utf-8").strip()

            # Add metadata entry
            metadata.append({
                "file_name": f"data/{new_name}",
                "text": text,
                "document_id": doc_id,
                "page_number": page_num
            })

    # Write metadata.jsonl
    with open(output / "metadata.jsonl", "w", encoding="utf-8") as f:
        for entry in metadata:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Write dataset card
    write_dataset_card(output, len(metadata))

    print(f"Converted {len(metadata)} samples to {output}")


def write_dataset_card(output_dir: Path, num_samples: int):
    """Write HF dataset card."""
    card = f'''---
dataset_info:
  features:
    - name: image
      dtype: image
    - name: text
      dtype: string
    - name: document_id
      dtype: string
    - name: page_number
      dtype: int32
  splits:
    - name: train
      num_examples: {num_samples}
license: cc-by-4.0
language:
  - es
task_categories:
  - image-to-text
tags:
  - ocr
  - spanish
  - academic-documents
  - unir
---

# UNIR OCR Dataset

Dataset de documentos académicos en español para evaluación de sistemas OCR.

## Descripción

- **Idioma**: Español
- **Dominio**: Documentos académicos (instrucciones TFE de UNIR)
- **Formato**: Imágenes PNG (300 DPI) + texto ground truth
- **Total**: {num_samples} pares imagen-texto

## Uso

```python
from datasets import load_dataset

dataset = load_dataset("path/to/dataset")

for sample in dataset["train"]:
    image = sample["image"]
    text = sample["text"]
```

## Estructura

Cada muestra contiene:
- `image`: Imagen de la página (PIL.Image)
- `text`: Texto ground truth extraído del PDF
- `document_id`: ID del documento fuente
- `page_number`: Número de página

## Citación

Parte del TFM "Optimización de Hiperparámetros OCR con Ray Tune" - UNIR 2025
'''
    (output_dir / "README.md").write_text(card, encoding="utf-8")


if __name__ == "__main__":
    import sys

    source = sys.argv[1] if len(sys.argv) > 1 else "src/dataset"
    output = sys.argv[2] if len(sys.argv) > 2 else "src/dataset_hf"

    convert_dataset(source, output)
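Not part of the commit, but a quick way to sanity-check the converter's output: the `data/` folder plus `metadata.jsonl` layout written above is what the `datasets` ImageFolder builder expects, so the result can be loaded locally. A minimal sketch, assuming the converter was run with its default output path `src/dataset_hf`:

```python
from datasets import load_dataset

# Load the converted output; metadata.jsonl contributes the extra
# text, document_id and page_number columns alongside the images.
ds = load_dataset("imagefolder", data_dir="src/dataset_hf", split="train")

sample = ds[0]
print(ds.features)                                    # image, text, document_id, page_number
print(sample["document_id"], sample["page_number"])
print(sample["text"][:200])                           # start of the ground-truth text
```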
63
src/dataset_formatting/upload-dataset.sh
Executable file
@@ -0,0 +1,63 @@
#!/bin/bash
# Upload OCR dataset to Gitea generic packages
#
# Usage:
#   ./src/dataset_formatting/upload-dataset.sh [token]
#
# Environment variables:
#   GITEA_TOKEN - Gitea API token

set -e

GITEA_URL="https://seryus.ddns.net"
GITEA_ORG="unir"
PACKAGE_NAME="ocr-dataset-spanish"
VERSION="1.0.0"
DATASET_DIR="src/dataset_hf"
TARBALL="dataset-${VERSION}.tar.gz"

# Get token
TOKEN="${1:-${GITEA_TOKEN}}"
if [ -z "$TOKEN" ]; then
    echo "Error: No token provided"
    echo "Usage: $0 [token]"
    echo "  or set GITEA_TOKEN environment variable"
    exit 1
fi

# Check dataset exists
if [ ! -d "$DATASET_DIR" ]; then
    echo "Error: Dataset not found at $DATASET_DIR"
    echo "Run: python src/dataset_formatting/convert_to_hf_dataset.py first"
    exit 1
fi

# Create tarball
echo "Creating tarball..."
tar -czvf "$TARBALL" -C "$DATASET_DIR" .
echo "Created: $TARBALL ($(du -h $TARBALL | cut -f1))"

# Upload
echo "Uploading to Gitea packages..."
echo "  URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$TARBALL"

HTTP_CODE=$(curl -sS -w "%{http_code}" -o /tmp/upload_response.txt \
    -X PUT \
    -H "Authorization: token $TOKEN" \
    -H "Content-Type: application/octet-stream" \
    --data-binary "@$TARBALL" \
    "$GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$TARBALL")

if [ "$HTTP_CODE" = "201" ] || [ "$HTTP_CODE" = "200" ]; then
    echo "Success! Dataset uploaded."
    echo "Download URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$TARBALL"
    rm "$TARBALL"
elif [ "$HTTP_CODE" = "409" ]; then
    echo "Package version already exists (HTTP 409)"
    rm "$TARBALL"
else
    echo "Error: Upload failed with HTTP $HTTP_CODE"
    cat /tmp/upload_response.txt
    rm "$TARBALL"
    exit 1
fi
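For completeness, a hedged sketch of the matching download step (not included in the commit): it fetches the tarball from the same generic-package URL the upload script prints on success, using the same package coordinates, and unpacks it locally. It assumes `GITEA_TOKEN` is set in the environment and that the default dataset path is wanted.

```python
#!/usr/bin/env python3
# Sketch only: download and unpack the dataset tarball published by upload-dataset.sh.
import os
import tarfile
import urllib.request

GITEA_URL = "https://seryus.ddns.net"
GITEA_ORG = "unir"
PACKAGE_NAME = "ocr-dataset-spanish"
VERSION = "1.0.0"
TARBALL = f"dataset-{VERSION}.tar.gz"

url = f"{GITEA_URL}/api/packages/{GITEA_ORG}/generic/{PACKAGE_NAME}/{VERSION}/{TARBALL}"
req = urllib.request.Request(url, headers={"Authorization": f"token {os.environ['GITEA_TOKEN']}"})

# Fetch the tarball, then extract it to the converter's default output location.
with urllib.request.urlopen(req) as resp, open(TARBALL, "wb") as f:
    f.write(resp.read())

os.makedirs("src/dataset_hf", exist_ok=True)
with tarfile.open(TARBALL, "r:gz") as tar:
    tar.extractall("src/dataset_hf")
```

Extracting recreates `data/`, `metadata.jsonl` and `README.md` side by side, matching what `convert_to_hf_dataset.py` wrote before the upload.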