readme and hf dataset formatter.
All checks were successful
build_docker / build_cpu (pull_request) Successful in 5m46s
build_docker / build_gpu (pull_request) Successful in 19m24s
build_docker / build_easyocr (pull_request) Successful in 16m51s
build_docker / build_easyocr_gpu (pull_request) Successful in 15m50s
build_docker / build_doctr (pull_request) Successful in 18m54s
build_docker / build_doctr_gpu (pull_request) Successful in 13m36s
build_docker / essential (pull_request) Successful in 1s
138
src/dataset_formatting/convert_to_hf_dataset.py
Executable file
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""Convert custom OCR dataset to Hugging Face format."""

import json
import shutil
from pathlib import Path


def convert_dataset(source_dir: str, output_dir: str):
    """Convert folder-based dataset to HF ImageFolder format."""

    source = Path(source_dir)
    output = Path(output_dir)
    data_dir = output / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    metadata = []

    for doc_folder in sorted(source.iterdir()):
        if not doc_folder.is_dir():
            continue

        doc_id = doc_folder.name
        img_dir = doc_folder / "img"
        txt_dir = doc_folder / "txt"

        if not img_dir.exists() or not txt_dir.exists():
            continue

        for img_file in sorted(img_dir.glob("*.png")):
            txt_file = txt_dir / f"{img_file.stem}.txt"
            if not txt_file.exists():
                continue

            # Extract page number
            page_num = int(img_file.stem.split("_")[-1])

            # New filename: page_{doc_id}_{page_num:04d}.png
            new_name = f"page_{doc_id}_{page_num:04d}.png"

            # Copy image
            shutil.copy(img_file, data_dir / new_name)

            # Read text
            text = txt_file.read_text(encoding="utf-8").strip()

            # Add metadata entry
            metadata.append({
                "file_name": f"data/{new_name}",
                "text": text,
                "document_id": doc_id,
                "page_number": page_num
            })

    # Write metadata.jsonl
    with open(output / "metadata.jsonl", "w", encoding="utf-8") as f:
        for entry in metadata:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Write dataset card
    write_dataset_card(output, len(metadata))

    print(f"Converted {len(metadata)} samples to {output}")


def write_dataset_card(output_dir: Path, num_samples: int):
    """Write HF dataset card."""
    card = f'''---
dataset_info:
  features:
    - name: image
      dtype: image
    - name: text
      dtype: string
    - name: document_id
      dtype: string
    - name: page_number
      dtype: int32
  splits:
    - name: train
      num_examples: {num_samples}
license: cc-by-4.0
language:
  - es
task_categories:
  - image-to-text
tags:
  - ocr
  - spanish
  - academic-documents
  - unir
---

# UNIR OCR Dataset

Dataset de documentos académicos en español para evaluación de sistemas OCR.

## Descripción

- **Idioma**: Español
- **Dominio**: Documentos académicos (instrucciones TFE de UNIR)
- **Formato**: Imágenes PNG (300 DPI) + texto ground truth
- **Total**: {num_samples} pares imagen-texto

## Uso

```python
from datasets import load_dataset

dataset = load_dataset("path/to/dataset")

for sample in dataset["train"]:
    image = sample["image"]
    text = sample["text"]
```

## Estructura

Cada muestra contiene:
- `image`: Imagen de la página (PIL.Image)
- `text`: Texto ground truth extraído del PDF
- `document_id`: ID del documento fuente
- `page_number`: Número de página

## Citación

Parte del TFM "Optimización de Hiperparámetros OCR con Ray Tune" - UNIR 2025
'''
    (output_dir / "README.md").write_text(card, encoding="utf-8")


if __name__ == "__main__":
    import sys

    source = sys.argv[1] if len(sys.argv) > 1 else "src/dataset"
    output = sys.argv[2] if len(sys.argv) > 2 else "src/dataset_hf"

    convert_dataset(source, output)
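Not part of the commit, but a quick way to sanity-check the converter's output: the `data/` folder plus `metadata.jsonl` layout written above is what the `datasets` ImageFolder builder expects, so the result can be loaded locally. A minimal sketch, assuming the converter was run with its default output path `src/dataset_hf`:

```python
from datasets import load_dataset

# Load the converted output; metadata.jsonl contributes the extra
# text, document_id and page_number columns alongside the images.
ds = load_dataset("imagefolder", data_dir="src/dataset_hf", split="train")

sample = ds[0]
print(ds.features)                                    # image, text, document_id, page_number
print(sample["document_id"], sample["page_number"])
print(sample["text"][:200])                           # start of the ground-truth text
```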
63
src/dataset_formatting/upload-dataset.sh
Executable file
@@ -0,0 +1,63 @@
#!/bin/bash
# Upload OCR dataset to Gitea generic packages
#
# Usage:
#   ./src/dataset_formatting/upload-dataset.sh [token]
#
# Environment variables:
#   GITEA_TOKEN - Gitea API token

set -e

GITEA_URL="https://seryus.ddns.net"
GITEA_ORG="unir"
PACKAGE_NAME="ocr-dataset-spanish"
VERSION="1.0.0"
DATASET_DIR="src/dataset_hf"
TARBALL="dataset-${VERSION}.tar.gz"

# Get token
TOKEN="${1:-${GITEA_TOKEN}}"
if [ -z "$TOKEN" ]; then
    echo "Error: No token provided"
    echo "Usage: $0 [token]"
    echo "  or set GITEA_TOKEN environment variable"
    exit 1
fi

# Check dataset exists
if [ ! -d "$DATASET_DIR" ]; then
    echo "Error: Dataset not found at $DATASET_DIR"
    echo "Run: python src/dataset_formatting/convert_to_hf_dataset.py first"
    exit 1
fi

# Create tarball
echo "Creating tarball..."
tar -czvf "$TARBALL" -C "$DATASET_DIR" .
echo "Created: $TARBALL ($(du -h $TARBALL | cut -f1))"

# Upload
echo "Uploading to Gitea packages..."
echo "  URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$TARBALL"

HTTP_CODE=$(curl -sS -w "%{http_code}" -o /tmp/upload_response.txt \
    -X PUT \
    -H "Authorization: token $TOKEN" \
    -H "Content-Type: application/octet-stream" \
    --data-binary "@$TARBALL" \
    "$GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$TARBALL")

if [ "$HTTP_CODE" = "201" ] || [ "$HTTP_CODE" = "200" ]; then
    echo "Success! Dataset uploaded."
    echo "Download URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$TARBALL"
    rm "$TARBALL"
elif [ "$HTTP_CODE" = "409" ]; then
    echo "Package version already exists (HTTP 409)"
    rm "$TARBALL"
else
    echo "Error: Upload failed with HTTP $HTTP_CODE"
    cat /tmp/upload_response.txt
    rm "$TARBALL"
    exit 1
fi
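For completeness, a hedged sketch of the matching download step (not included in the commit): it fetches the tarball from the same generic-package URL the upload script prints on success, using the same package coordinates, and unpacks it locally. It assumes `GITEA_TOKEN` is set in the environment and that the default dataset path is wanted.

```python
#!/usr/bin/env python3
# Sketch only: download and unpack the dataset tarball published by upload-dataset.sh.
import os
import tarfile
import urllib.request

GITEA_URL = "https://seryus.ddns.net"
GITEA_ORG = "unir"
PACKAGE_NAME = "ocr-dataset-spanish"
VERSION = "1.0.0"
TARBALL = f"dataset-{VERSION}.tar.gz"

url = f"{GITEA_URL}/api/packages/{GITEA_ORG}/generic/{PACKAGE_NAME}/{VERSION}/{TARBALL}"
req = urllib.request.Request(url, headers={"Authorization": f"token {os.environ['GITEA_TOKEN']}"})

# Fetch the tarball, then extract it to the converter's default output location.
with urllib.request.urlopen(req) as resp, open(TARBALL, "wb") as f:
    f.write(resp.read())

os.makedirs("src/dataset_hf", exist_ok=True)
with tarfile.open(TARBALL, "r:gz") as tar:
    tar.extractall("src/dataset_hf")
```

Extracting recreates `data/`, `metadata.jsonl` and `README.md` side by side, matching what `convert_to_hf_dataset.py` wrote before the upload.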