diff --git a/src/paddle_ocr/Dockerfile.cpu b/src/paddle_ocr/Dockerfile.cpu
new file mode 100644
index 0000000..f9c6bab
--- /dev/null
+++ b/src/paddle_ocr/Dockerfile.cpu
@@ -0,0 +1,58 @@
+# Dockerfile.cpu - CPU-only PaddleOCR REST API
+# Multi-arch: supports both amd64 and arm64
+
+FROM python:3.11-slim
+
+LABEL maintainer="Sergio Jimenez"
+LABEL description="PaddleOCR Tuning REST API - CPU version"
+
+WORKDIR /app
+
+# Install system dependencies for OpenCV and PaddleOCR
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ libgl1 \
+ libglib2.0-0 \
+ libsm6 \
+ libxext6 \
+ libxrender1 \
+ libgomp1 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies from requirements file
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY paddle_ocr_tuning_rest.py .
+COPY dataset_manager.py .
+
+# Build arguments for models to bake into image
+ARG DET_MODEL=PP-OCRv5_server_det
+ARG REC_MODEL=PP-OCRv5_server_rec
+
+# Set as environment variables (can be overridden at runtime)
+ENV PADDLE_DET_MODEL=${DET_MODEL}
+ENV PADDLE_REC_MODEL=${REC_MODEL}
+
+# Download models during build (not at runtime)
+RUN python -c "\
+import os; \
+from paddleocr import PaddleOCR; \
+det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
+rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
+print(f'Downloading models: det={det}, rec={rec}'); \
+ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
+print('Models downloaded successfully!')"
+
+# Volumes for the dataset and an optional cache for models downloaded at runtime
+VOLUME ["/app/dataset", "/root/.paddlex"]
+
+# Expose API port
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
+
+# Run the API server
+CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/src/paddle_ocr/Dockerfile.gpu b/src/paddle_ocr/Dockerfile.gpu
new file mode 100644
index 0000000..5c3ca27
--- /dev/null
+++ b/src/paddle_ocr/Dockerfile.gpu
@@ -0,0 +1,68 @@
+# Dockerfile.gpu - CUDA-enabled PaddleOCR REST API
+# Supports: x86_64 with NVIDIA GPU (CUDA 12.x)
+# For DGX Spark (ARM64 + CUDA): build natively on the device
+
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+
+LABEL maintainer="Sergio Jimenez"
+LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version"
+
+WORKDIR /app
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+ENV CUDA_VISIBLE_DEVICES=0
+
+# Install Python 3.11 and system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ python3.11 \
+ python3.11-venv \
+ python3-pip \
+ libgl1 \
+ libglib2.0-0 \
+ libsm6 \
+ libxext6 \
+ libxrender1 \
+ libgomp1 \
+ && rm -rf /var/lib/apt/lists/* \
+ && ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Install Python dependencies with the Python 3.11 interpreter
+# (the bare `pip` from apt targets the system Python 3.10)
+COPY requirements-gpu.txt .
+RUN python -m pip install --no-cache-dir -r requirements-gpu.txt
+
+# Copy application code
+COPY paddle_ocr_tuning_rest.py .
+COPY dataset_manager.py .
+
+# Build arguments for models to bake into image
+ARG DET_MODEL=PP-OCRv5_server_det
+ARG REC_MODEL=PP-OCRv5_server_rec
+
+# Set as environment variables (can be overridden at runtime)
+ENV PADDLE_DET_MODEL=${DET_MODEL}
+ENV PADDLE_REC_MODEL=${REC_MODEL}
+
+# Download models during build (not at runtime)
+RUN python -c "\
+import os; \
+from paddleocr import PaddleOCR; \
+det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
+rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
+print(f'Downloading models: det={det}, rec={rec}'); \
+ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
+print('Models downloaded successfully!')"
+
+# Volumes for the dataset and an optional cache for models downloaded at runtime
+VOLUME ["/app/dataset", "/root/.paddlex"]
+
+# Expose API port
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
+
+# Run the API server
+CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/src/paddle_ocr/README.md b/src/paddle_ocr/README.md
new file mode 100644
index 0000000..1012a2b
--- /dev/null
+++ b/src/paddle_ocr/README.md
@@ -0,0 +1,329 @@
+# PaddleOCR Tuning REST API
+
+REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
+
+## Quick Start with Docker Compose
+
+Docker Compose manages building and running containers. The `docker-compose.yml` defines two services:
+- `ocr-cpu` - CPU-only version (works everywhere)
+- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit)
+
+### Run CPU Version
+
+```bash
+cd src/paddle_ocr
+
+# Build and start (first time takes ~2-3 min to build, ~30s to load model)
+docker compose up ocr-cpu
+
+# Or run in background (detached)
+docker compose up -d ocr-cpu
+
+# View logs
+docker compose logs -f ocr-cpu
+
+# Stop
+docker compose down
+```
+
+### Run GPU Version
+
+```bash
+# Requires: NVIDIA GPU + nvidia-container-toolkit installed
+docker compose up ocr-gpu
+```
+
+### Test the API
+
+Once running, test with:
+```bash
+# Check health
+curl http://localhost:8000/health
+
+# Or use the test script
+pip install requests
+python test.py --url http://localhost:8000
+```
+
+### What Docker Compose Does
+
+```
+docker compose up ocr-cpu
+ │
+ ├─► Builds image from Dockerfile.cpu (if not exists)
+ ├─► Creates container "paddle-ocr-cpu"
+ ├─► Mounts ../dataset → /app/dataset (your PDF images)
+ ├─► Mounts paddlex-cache volume (persists downloaded models)
+ ├─► Exposes port 8000
+ └─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
+```
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
+| `dataset_manager.py` | Dataset loader |
+| `test.py` | API test client |
+| `Dockerfile.cpu` | CPU-only image (multi-arch) |
+| `Dockerfile.gpu` | GPU/CUDA image (x86_64) |
+| `docker-compose.yml` | Service orchestration |
+
+## API Endpoints
+
+### `GET /health`
+Check if service is ready.
+
+```json
+{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24}
+```
+
+### `POST /evaluate`
+Run OCR evaluation with given hyperparameters.
+
+**Request:**
+```json
+{
+ "pdf_folder": "/app/dataset",
+ "textline_orientation": true,
+ "use_doc_orientation_classify": false,
+ "use_doc_unwarping": false,
+ "text_det_thresh": 0.469,
+ "text_det_box_thresh": 0.5412,
+ "text_det_unclip_ratio": 0.0,
+ "text_rec_score_thresh": 0.635,
+ "start_page": 5,
+ "end_page": 10
+}
+```
+
+**Response:**
+```json
+{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1}
+```
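+
+For a client script, a minimal Python sketch of calling `/evaluate` looks like this (it assumes the service is reachable at `http://localhost:8000` and the dataset is mounted at `/app/dataset`; fields omitted from the payload fall back to the schema defaults):
+
+```python
+import requests
+
+# Any field left out of the payload uses the server-side default.
+payload = {
+    "pdf_folder": "/app/dataset",
+    "textline_orientation": True,
+    "text_det_thresh": 0.469,
+    "text_det_box_thresh": 0.5412,
+    "start_page": 5,
+    "end_page": 10,
+}
+
+resp = requests.post("http://localhost:8000/evaluate", json=payload, timeout=600)
+resp.raise_for_status()
+metrics = resp.json()
+print(f"CER={metrics['CER']:.4f}  WER={metrics['WER']:.4f}  ({metrics['PAGES']} pages)")
+```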
+
+### `POST /evaluate_full`
+Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page).
+
+## Building Images
+
+### CPU Image (Multi-Architecture)
+
+```bash
+# Local build (current architecture)
+docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
+
+# Multi-arch build with buildx (amd64 + arm64)
+docker buildx create --name multiarch --use
+docker buildx build -f Dockerfile.cpu \
+ --platform linux/amd64,linux/arm64 \
+ -t paddle-ocr-api:cpu \
+ --push .
+```
+
+### GPU Image (x86_64 only)
+
+```bash
+docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
+```
+
+## Running
+
+### CPU (Any machine)
+
+```bash
+docker run -d -p 8000:8000 \
+ -v $(pwd)/../dataset:/app/dataset:ro \
+ -v paddlex-cache:/root/.paddlex \
+ paddle-ocr-api:cpu
+```
+
+### GPU (NVIDIA)
+
+```bash
+docker run -d -p 8000:8000 --gpus all \
+ -v $(pwd)/../dataset:/app/dataset:ro \
+ -v paddlex-cache:/root/.paddlex \
+ paddle-ocr-api:gpu
+```
+
+## DGX Spark (ARM64 + CUDA)
+
+DGX Spark pairs an ARM64 Grace CPU with an NVIDIA Blackwell GPU. You have three options:
+
+### Option 1: Native ARM64 Build (Recommended)
+
+PaddlePaddle has ARM64 support. Build natively:
+
+```bash
+# On DGX Spark or ARM64 machine
+docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
+```
+
+For GPU acceleration on ARM64, the `FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04` line in `Dockerfile.gpu` does not need to change: the image is published as a multi-arch manifest, so building on an ARM64 machine pulls the ARM64 variant automatically. Do check that the pinned `paddlepaddle-gpu` wheel in `requirements-gpu.txt` is available for aarch64, and adjust the pin if it is not.
+
+Then build on the DGX Spark:
+```bash
+docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu-arm64 .
+```
+
+### Option 2: x86_64 Emulation via QEMU (Slow)
+
+You CAN run x86_64 images on ARM via emulation, but it's ~10-20x slower:
+
+```bash
+# On DGX Spark, enable QEMU emulation
+docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+
+# Run x86_64 image with emulation
+docker run --platform linux/amd64 -p 8000:8000 \
+ -v $(pwd)/../dataset:/app/dataset:ro \
+ paddle-ocr-api:cpu
+```
+
+**Not recommended** for production due to severe performance penalty.
+
+### Option 3: Cross-compile from x86_64
+
+Build ARM64 images from your x86_64 machine:
+
+```bash
+# Setup buildx for multi-arch
+docker buildx create --name mybuilder --use
+
+# Build ARM64 image from x86_64 machine
+docker buildx build -f Dockerfile.cpu \
+ --platform linux/arm64 \
+ -t paddle-ocr-api:arm64 \
+ --load .
+
+# Save and transfer to DGX Spark
+docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
+scp paddle-ocr-arm64.tar.gz dgx-spark:~/
+# On DGX Spark:
+docker load < paddle-ocr-arm64.tar.gz
+```
+
+## Using with Ray Tune
+
+Update your notebook's `trainable_paddle_ocr` function:
+
+```python
+import requests
+from ray import tune
+
+API_URL = "http://localhost:8000/evaluate"
+
+def trainable_paddle_ocr(config):
+ """Call OCR API instead of subprocess."""
+ payload = {
+ "pdf_folder": "/app/dataset",
+ "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
+ "use_doc_unwarping": config.get("use_doc_unwarping", False),
+ "textline_orientation": config.get("textline_orientation", True),
+ "text_det_thresh": config.get("text_det_thresh", 0.0),
+ "text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
+ "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
+ "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
+ }
+
+ try:
+ response = requests.post(API_URL, json=payload, timeout=600)
+ response.raise_for_status()
+ metrics = response.json()
+ tune.report(metrics=metrics)
+ except Exception as e:
+ tune.report({"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]})
+```
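+
+For completeness, here is a sketch of wiring this trainable into a `Tuner`. The search-space bounds and `num_samples` below are illustrative placeholders, not the values behind the reported results:
+
+```python
+from ray import tune
+
+# Illustrative search space over the API's hyperparameters.
+search_space = {
+    "textline_orientation": tune.choice([True, False]),
+    "text_det_thresh": tune.uniform(0.0, 0.9),
+    "text_det_box_thresh": tune.uniform(0.0, 0.9),
+    "text_det_unclip_ratio": tune.uniform(0.0, 3.0),
+    "text_rec_score_thresh": tune.uniform(0.0, 0.9),
+}
+
+tuner = tune.Tuner(
+    trainable_paddle_ocr,
+    param_space=search_space,
+    tune_config=tune.TuneConfig(metric="CER", mode="min", num_samples=64),
+)
+results = tuner.fit()
+print(results.get_best_result().config)
+```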
+
+## Architecture: Model Lifecycle
+
+The model is loaded **once** at container startup and stays in memory for all requests:
+
+```mermaid
+flowchart TB
+ subgraph Container["Docker Container Lifecycle"]
+        Start([Container Start]) --> Load[Load PaddleOCR Models<br/>~10-30s one-time cost]
+        Load --> Ready[API Ready<br/>Models in RAM ~500MB]
+
+ subgraph Requests["Incoming Requests - Models Stay Loaded"]
+ Ready --> R1[Request 1] --> Ready
+ Ready --> R2[Request 2] --> Ready
+ Ready --> RN[Request N...] --> Ready
+ end
+
+ Ready --> Stop([Container Stop])
+ Stop --> Free[Models Freed]
+ end
+
+ style Load fill:#f9f,stroke:#333
+ style Ready fill:#9f9,stroke:#333
+ style Requests fill:#e8f4ea,stroke:#090
+```
+
+**Subprocess vs REST API comparison:**
+
+```mermaid
+flowchart LR
+ subgraph Subprocess["❌ Subprocess Approach"]
+ direction TB
+ S1[Trial 1] --> L1[Load Model ~10s]
+ L1 --> E1[Evaluate ~60s]
+ E1 --> U1[Unload]
+ U1 --> S2[Trial 2]
+ S2 --> L2[Load Model ~10s]
+ L2 --> E2[Evaluate ~60s]
+ end
+
+ subgraph REST["✅ REST API Approach"]
+ direction TB
+ Start2[Start Container] --> Load2[Load Model ~10s]
+ Load2 --> Ready2[Model in Memory]
+ Ready2 --> T1[Trial 1 ~60s]
+ T1 --> Ready2
+ Ready2 --> T2[Trial 2 ~60s]
+ T2 --> Ready2
+ Ready2 --> TN[Trial N ~60s]
+ end
+
+ style L1 fill:#faa
+ style L2 fill:#faa
+ style Load2 fill:#afa
+ style Ready2 fill:#afa
+```
+
+## Performance Comparison
+
+| Approach | Model Load | Per-Trial Overhead | 64 Trials |
+|----------|------------|-------------------|-----------|
+| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours |
+| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours |
+| **REST API** | **Once** | **~0.1s** | **~5.8 hours** |
+
+By loading the model only once, the REST API saves over an hour across a 64-trial run.
+
+## Troubleshooting
+
+### Model download slow on first run
+The first run downloads ~500MB of models. Use volume `paddlex-cache` to persist them.
+
+### Out of memory
+Reduce `max_concurrent_trials` in Ray Tune, or increase container memory:
+```bash
+docker run --memory=8g ...
+```
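+
+On the Ray Tune side, trial concurrency can be capped via `TuneConfig` (a sketch; the value `2` is just an example):
+
+```python
+from ray import tune
+
+tune_config = tune.TuneConfig(
+    metric="CER",
+    mode="min",
+    num_samples=64,
+    max_concurrent_trials=2,  # fewer parallel API requests -> lower peak memory
+)
+```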
+
+### GPU not detected
+Ensure NVIDIA Container Toolkit is installed:
+```bash
+nvidia-smi # Should work
+docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi  # Should work
+```
diff --git a/src/paddle_ocr/dataset_manager.py b/src/paddle_ocr/dataset_manager.py
new file mode 100644
index 0000000..2d3ccac
--- /dev/null
+++ b/src/paddle_ocr/dataset_manager.py
@@ -0,0 +1,45 @@
+# Imports
+import os
+from PIL import Image
+
+
+class ImageTextDataset:
+ def __init__(self, root):
+ self.samples = []
+
+ for folder in sorted(os.listdir(root)):
+ sub = os.path.join(root, folder)
+ img_dir = os.path.join(sub, "img")
+ txt_dir = os.path.join(sub, "txt")
+
+ if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
+ continue
+
+ for fname in sorted(os.listdir(img_dir)):
+ if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
+ continue
+
+ img_path = os.path.join(img_dir, fname)
+
+ # text file must have same name but .txt
+ txt_name = os.path.splitext(fname)[0] + ".txt"
+ txt_path = os.path.join(txt_dir, txt_name)
+
+ if not os.path.exists(txt_path):
+ continue
+
+ self.samples.append((img_path, txt_path))
+ def __len__(self):
+ return len(self.samples)
+
+ def __getitem__(self, idx):
+ img_path, txt_path = self.samples[idx]
+
+ # Load image
+ image = Image.open(img_path).convert("RGB")
+
+ # Load text
+ with open(txt_path, "r", encoding="utf-8") as f:
+ text = f.read()
+
+ return image, text
\ No newline at end of file
diff --git a/src/paddle_ocr/docker-compose.yml b/src/paddle_ocr/docker-compose.yml
new file mode 100644
index 0000000..1bbd6e0
--- /dev/null
+++ b/src/paddle_ocr/docker-compose.yml
@@ -0,0 +1,83 @@
+# docker-compose.yml - PaddleOCR REST API
+# Usage:
+# CPU: docker compose up ocr-cpu
+# GPU: docker compose up ocr-gpu
+# Test: docker compose run --rm test
+
+services:
+ # CPU-only service (works on any architecture)
+ ocr-cpu:
+ build:
+ context: .
+ dockerfile: Dockerfile.cpu
+ args:
+ # Models to bake into image (change before building):
+ DET_MODEL: PP-OCRv5_server_det
+ REC_MODEL: PP-OCRv5_server_rec
+ image: paddle-ocr-api:cpu
+ container_name: paddle-ocr-cpu
+ ports:
+ - "8000:8000"
+ volumes:
+ - ../dataset:/app/dataset:ro # Your dataset
+ - paddlex-cache:/root/.paddlex # For additional models at runtime
+ environment:
+ - PYTHONUNBUFFERED=1
+ # Override models at runtime (uncomment to use different models):
+ # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
+ # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
+ restart: unless-stopped
+ healthcheck:
+ test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ start_period: 60s
+
+ # GPU service (requires NVIDIA Container Toolkit)
+ ocr-gpu:
+ build:
+ context: .
+ dockerfile: Dockerfile.gpu
+ args:
+ DET_MODEL: PP-OCRv5_server_det
+ REC_MODEL: PP-OCRv5_server_rec
+ image: paddle-ocr-api:gpu
+ container_name: paddle-ocr-gpu
+ ports:
+ - "8000:8000"
+ volumes:
+ - ../dataset:/app/dataset:ro
+ - paddlex-cache:/root/.paddlex
+ environment:
+ - PYTHONUNBUFFERED=1
+ - CUDA_VISIBLE_DEVICES=0
+ # Override models at runtime:
+ # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
+ # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ count: 1
+ capabilities: [gpu]
+ restart: unless-stopped
+
+ # Test client (runs once and exits)
+ test:
+ image: python:3.11-slim
+ container_name: paddle-ocr-test
+ depends_on:
+ ocr-cpu:
+ condition: service_healthy
+ volumes:
+ - ./test.py:/app/test.py:ro
+ working_dir: /app
+ command: >
+ sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset"
+ network_mode: "service:ocr-cpu"
+
+volumes:
+ paddlex-cache:
+ name: paddlex-model-cache
diff --git a/src/paddle_ocr/paddle_ocr_tuning_rest.py b/src/paddle_ocr/paddle_ocr_tuning_rest.py
new file mode 100644
index 0000000..9a34c78
--- /dev/null
+++ b/src/paddle_ocr/paddle_ocr_tuning_rest.py
@@ -0,0 +1,263 @@
+# paddle_ocr_tuning_rest.py
+# FastAPI REST service for PaddleOCR hyperparameter evaluation
+# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
+
+import os
+import re
+import time
+from typing import Optional
+from contextlib import asynccontextmanager
+
+import numpy as np
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+
+from paddleocr import PaddleOCR
+from jiwer import wer, cer
+from dataset_manager import ImageTextDataset
+
+
+# Model configuration via environment variables (with defaults)
+DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det")
+DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec")
+
+
+# Global state for model and dataset
+class AppState:
+ ocr: Optional[PaddleOCR] = None
+ dataset: Optional[ImageTextDataset] = None
+ dataset_path: Optional[str] = None
+ det_model: str = DEFAULT_DET_MODEL
+ rec_model: str = DEFAULT_REC_MODEL
+
+
+state = AppState()
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ """Load OCR model at startup."""
+ print(f"Loading PaddleOCR models...")
+ print(f" Detection: {state.det_model}")
+ print(f" Recognition: {state.rec_model}")
+ state.ocr = PaddleOCR(
+ text_detection_model_name=state.det_model,
+ text_recognition_model_name=state.rec_model,
+ )
+ print("Model loaded successfully!")
+ yield
+ # Cleanup on shutdown
+ state.ocr = None
+ state.dataset = None
+
+
+app = FastAPI(
+ title="PaddleOCR Tuning API",
+ description="REST API for OCR hyperparameter evaluation",
+ version="1.0.0",
+ lifespan=lifespan,
+)
+
+
+class EvaluateRequest(BaseModel):
+ """Request schema matching CLI arguments."""
+ pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
+ use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification")
+ use_doc_unwarping: bool = Field(False, description="Use document unwarping")
+ textline_orientation: bool = Field(True, description="Use textline orientation classification")
+ text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold")
+ text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold")
+ text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient")
+ text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
+ start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
+ end_page: int = Field(10, ge=1, description="End page index (exclusive)")
+
+
+class EvaluateResponse(BaseModel):
+ """Response schema matching CLI output."""
+ CER: float
+ WER: float
+ TIME: float
+ PAGES: int
+ TIME_PER_PAGE: float
+
+
+class HealthResponse(BaseModel):
+ status: str
+ model_loaded: bool
+ dataset_loaded: bool
+ dataset_size: Optional[int] = None
+ det_model: Optional[str] = None
+ rec_model: Optional[str] = None
+
+
+def _normalize_box_xyxy(box):
+ """Normalize bounding box to (x0, y0, x1, y1) format."""
+ if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
+ xs = [p[0] for p in box]
+ ys = [p[1] for p in box]
+ return min(xs), min(ys), max(xs), max(ys)
+
+ if isinstance(box, (list, tuple)):
+ if len(box) == 4:
+ x0, y0, x1, y1 = box
+ return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
+ if len(box) == 8:
+ xs = box[0::2]
+ ys = box[1::2]
+ return min(xs), min(ys), max(xs), max(ys)
+
+ raise ValueError(f"Unrecognized box format: {box!r}")
+
+
+def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
+ """
+ Robust line grouping for PaddleOCR outputs.
+ Normalizes boxes, groups by line, and returns assembled text.
+ """
+ boxes_all = []
+ for item in paddleocr_predict:
+ res = item.json.get("res", {})
+ boxes = res.get("rec_boxes", []) or []
+ texts = res.get("rec_texts", []) or []
+ scores = res.get("rec_scores", None)
+
+ for i, (box, text) in enumerate(zip(boxes, texts)):
+ try:
+ x0, y0, x1, y1 = _normalize_box_xyxy(box)
+ except Exception:
+ continue
+
+ y_mid = 0.5 * (y0 + y1)
+ score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0
+
+ t = re.sub(r"\s+", " ", str(text)).strip()
+ if not t:
+ continue
+
+ boxes_all.append((x0, y0, x1, y1, y_mid, t, score))
+
+ if min_score > 0:
+ boxes_all = [b for b in boxes_all if b[6] >= min_score]
+
+ if not boxes_all:
+ return ""
+
+ # Adaptive line tolerance
+ heights = [b[3] - b[1] for b in boxes_all]
+ median_h = float(np.median(heights)) if heights else 20.0
+ line_tol = max(8.0, line_tol_factor * median_h)
+
+ # Sort by vertical mid, then x0
+ boxes_all.sort(key=lambda b: (b[4], b[0]))
+
+ # Group into lines
+ lines, cur, last_y = [], [], None
+ for x0, y0, x1, y1, y_mid, text, score in boxes_all:
+ if last_y is None or abs(y_mid - last_y) <= line_tol:
+ cur.append((x0, text))
+ else:
+ cur.sort(key=lambda t: t[0])
+ lines.append(" ".join(t[1] for t in cur))
+ cur = [(x0, text)]
+ last_y = y_mid
+
+ if cur:
+ cur.sort(key=lambda t: t[0])
+ lines.append(" ".join(t[1] for t in cur))
+
+ res = "\n".join(lines)
+ res = re.sub(r"\s+\n", "\n", res).strip()
+ return res
+
+
+def evaluate_text(reference: str, prediction: str) -> dict:
+ """Calculate WER and CER metrics."""
+ return {"WER": wer(reference, prediction), "CER": cer(reference, prediction)}
+
+
+@app.get("/health", response_model=HealthResponse)
+def health_check():
+ """Check if the service is ready."""
+ return HealthResponse(
+ status="ok" if state.ocr is not None else "initializing",
+ model_loaded=state.ocr is not None,
+ dataset_loaded=state.dataset is not None,
+ dataset_size=len(state.dataset) if state.dataset else None,
+ det_model=state.det_model,
+ rec_model=state.rec_model,
+ )
+
+
+@app.post("/evaluate", response_model=EvaluateResponse)
+def evaluate(request: EvaluateRequest):
+ """
+ Evaluate OCR with given hyperparameters.
+ Returns CER, WER, and timing metrics.
+ """
+ if state.ocr is None:
+ raise HTTPException(status_code=503, detail="Model not loaded yet")
+
+ # Load or reload dataset if path changed
+ if state.dataset is None or state.dataset_path != request.pdf_folder:
+ if not os.path.isdir(request.pdf_folder):
+ raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
+ state.dataset = ImageTextDataset(request.pdf_folder)
+ state.dataset_path = request.pdf_folder
+
+ if len(state.dataset) == 0:
+ raise HTTPException(status_code=400, detail="Dataset is empty")
+
+ # Validate page range
+ start = request.start_page
+ end = min(request.end_page, len(state.dataset))
+ if start >= end:
+ raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
+
+ cer_list, wer_list = [], []
+ time_per_page_list = []
+ t0 = time.time()
+
+ for idx in range(start, end):
+ img, ref = state.dataset[idx]
+ arr = np.array(img)
+
+ tp0 = time.time()
+ out = state.ocr.predict(
+ arr,
+ use_doc_orientation_classify=request.use_doc_orientation_classify,
+ use_doc_unwarping=request.use_doc_unwarping,
+ use_textline_orientation=request.textline_orientation,
+ text_det_thresh=request.text_det_thresh,
+ text_det_box_thresh=request.text_det_box_thresh,
+ text_det_unclip_ratio=request.text_det_unclip_ratio,
+ text_rec_score_thresh=request.text_rec_score_thresh,
+ )
+
+ pred = assemble_from_paddle_result(out)
+ time_per_page_list.append(float(time.time() - tp0))
+
+ m = evaluate_text(ref, pred)
+ cer_list.append(m["CER"])
+ wer_list.append(m["WER"])
+
+ return EvaluateResponse(
+ CER=float(np.mean(cer_list)) if cer_list else 1.0,
+ WER=float(np.mean(wer_list)) if wer_list else 1.0,
+ TIME=float(time.time() - t0),
+ PAGES=len(cer_list),
+ TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
+ )
+
+
+@app.post("/evaluate_full", response_model=EvaluateResponse)
+def evaluate_full(request: EvaluateRequest):
+ """Evaluate on ALL pages (ignores start_page/end_page)."""
+ request.start_page = 0
+ request.end_page = 9999 # Will be clamped to dataset size
+ return evaluate(request)
+
+
+if __name__ == "__main__":
+ import uvicorn
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/src/paddle_ocr/requirements-gpu.txt b/src/paddle_ocr/requirements-gpu.txt
new file mode 100644
index 0000000..56b4832
--- /dev/null
+++ b/src/paddle_ocr/requirements-gpu.txt
@@ -0,0 +1,22 @@
+# PaddleOCR REST API - GPU Requirements
+# Install: pip install -r requirements-gpu.txt
+
+# PaddlePaddle (GPU version with CUDA)
+paddlepaddle-gpu==3.0.0
+
+# PaddleOCR
+paddleocr==3.3.2
+
+# OCR evaluation metrics
+jiwer
+
+# Numerical computing
+numpy
+
+# REST API framework
+fastapi
+uvicorn[standard]
+pydantic
+
+# Image processing
+Pillow
diff --git a/src/paddle_ocr/requirements.txt b/src/paddle_ocr/requirements.txt
new file mode 100644
index 0000000..4ea8bf7
--- /dev/null
+++ b/src/paddle_ocr/requirements.txt
@@ -0,0 +1,22 @@
+# PaddleOCR REST API - CPU Requirements
+# Install: pip install -r requirements.txt
+
+# PaddlePaddle (CPU version)
+paddlepaddle==3.2.2
+
+# PaddleOCR
+paddleocr==3.3.2
+
+# OCR evaluation metrics
+jiwer
+
+# Numerical computing
+numpy
+
+# REST API framework
+fastapi
+uvicorn[standard]
+pydantic
+
+# Image processing (pulled by paddleocr, but explicit)
+Pillow
diff --git a/src/paddle_ocr/test.py b/src/paddle_ocr/test.py
new file mode 100644
index 0000000..544da55
--- /dev/null
+++ b/src/paddle_ocr/test.py
@@ -0,0 +1,114 @@
+# test.py - Simple client to test PaddleOCR REST API
+# Usage: python test.py [--url URL] [--dataset PATH]
+
+import argparse
+import requests
+import time
+import sys
+
+
+def wait_for_health(url: str, timeout: int = 120) -> bool:
+ """Wait for API to be ready."""
+ health_url = f"{url}/health"
+ start = time.time()
+
+ print(f"Waiting for API at {health_url}...")
+ while time.time() - start < timeout:
+ try:
+ resp = requests.get(health_url, timeout=5)
+ if resp.status_code == 200:
+ data = resp.json()
+ if data.get("model_loaded"):
+ print(f"API ready! Model loaded in {time.time() - start:.1f}s")
+ return True
+ print(f" Model loading... ({time.time() - start:.0f}s)")
+ except requests.exceptions.ConnectionError:
+ print(f" Connecting... ({time.time() - start:.0f}s)")
+ except Exception as e:
+ print(f" Error: {e}")
+ time.sleep(2)
+
+ print("Timeout waiting for API")
+ return False
+
+
+def test_evaluate(url: str, config: dict) -> dict:
+ """Run evaluation with given config."""
+ eval_url = f"{url}/evaluate"
+
+ print(f"\nTesting config: {config}")
+ start = time.time()
+
+ resp = requests.post(eval_url, json=config, timeout=600)
+ resp.raise_for_status()
+
+ result = resp.json()
+ elapsed = time.time() - start
+
+ print(f"Results (took {elapsed:.1f}s):")
+ print(f" CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)")
+ print(f" WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)")
+ print(f" Pages: {result['PAGES']}")
+ print(f" Time/page: {result['TIME_PER_PAGE']:.2f}s")
+
+ return result
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
+ parser.add_argument("--url", default="http://localhost:8000", help="API base URL")
+ parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
+ parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
+ args = parser.parse_args()
+
+ # Wait for API to be ready
+ if not args.skip_health:
+ if not wait_for_health(args.url):
+ sys.exit(1)
+
+ # Test 1: Baseline config (default PaddleOCR)
+ print("\n" + "="*50)
+ print("TEST 1: Baseline Configuration")
+ print("="*50)
+ baseline = test_evaluate(args.url, {
+ "pdf_folder": args.dataset,
+ "use_doc_orientation_classify": False,
+ "use_doc_unwarping": False,
+ "textline_orientation": False, # Baseline: disabled
+ "text_det_thresh": 0.0,
+ "text_det_box_thresh": 0.0,
+ "text_det_unclip_ratio": 1.5,
+ "text_rec_score_thresh": 0.0,
+ "start_page": 5,
+ "end_page": 10,
+ })
+
+ # Test 2: Optimized config (from Ray Tune results)
+ print("\n" + "="*50)
+ print("TEST 2: Optimized Configuration")
+ print("="*50)
+ optimized = test_evaluate(args.url, {
+ "pdf_folder": args.dataset,
+ "use_doc_orientation_classify": False,
+ "use_doc_unwarping": False,
+ "textline_orientation": True, # KEY: enabled
+ "text_det_thresh": 0.4690,
+ "text_det_box_thresh": 0.5412,
+ "text_det_unclip_ratio": 0.0,
+ "text_rec_score_thresh": 0.6350,
+ "start_page": 5,
+ "end_page": 10,
+ })
+
+ # Summary
+ print("\n" + "="*50)
+ print("SUMMARY")
+ print("="*50)
+ cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0
+ print(f"Baseline CER: {baseline['CER']*100:.2f}%")
+ print(f"Optimized CER: {optimized['CER']*100:.2f}%")
+ print(f"Improvement: {cer_reduction:.1f}% reduction in errors")
+
+
+if __name__ == "__main__":
+ main()