Paddle ocr gpu support. #4
58
src/paddle_ocr/Dockerfile.cpu
Normal file
58
src/paddle_ocr/Dockerfile.cpu
Normal file
@@ -0,0 +1,58 @@
|
||||
# Dockerfile.cpu - CPU-only PaddleOCR REST API
# Multi-arch: supports both amd64 and arm64

FROM python:3.11-slim

LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - CPU version"

WORKDIR /app

# Parity with the GPU image: unbuffered stdout so `docker logs` streams
# output in real time.
ENV PYTHONUNBUFFERED=1

# Install system dependencies for OpenCV and PaddleOCR.
# update+install in one layer (avoids the stale apt-cache bug) and the list
# cleanup happens in the same layer so it does not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    libsm6 \
    libxext6 \
    libxrender1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first (own layer) so application-code edits
# do not invalidate the dependency cache.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .

# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec

# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}

# Download models during build (not at runtime) so container start is fast
RUN python -c "\
import os; \
from paddleocr import PaddleOCR; \
det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
print(f'Downloading models: det={det}, rec={rec}'); \
ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
print('Models downloaded successfully!')"

# Volume for dataset and optional additional model cache.
# Declared AFTER the model download above so the baked-in models persist
# in the image layer.
VOLUME ["/app/dataset", "/root/.paddlex"]

# Expose API port (documentation only; publish with -p / compose)
EXPOSE 8000

# Health check: probe the app's own /health endpoint using the stdlib
# (python:slim ships no curl/wget).
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the API server (exec form: uvicorn is PID 1 and receives SIGTERM)
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
68
src/paddle_ocr/Dockerfile.gpu
Normal file
68
src/paddle_ocr/Dockerfile.gpu
Normal file
@@ -0,0 +1,68 @@
|
||||
# Dockerfile.gpu - CUDA-enabled PaddleOCR REST API
# Supports: x86_64 with NVIDIA GPU (CUDA 12.x)
# For DGX Spark (ARM64 + CUDA): build natively on the device

FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version"

WORKDIR /app

# Runtime environment. DEBIAN_FRONTEND is deliberately NOT baked in here:
# it is a build-time-only knob, set inline on the apt RUN below.
ENV PYTHONUNBUFFERED=1
ENV CUDA_VISIBLE_DEVICES=0

# Install Python 3.11 and system dependencies.
#
# BUGFIX: Ubuntu 22.04's python3-pip targets the distro python3.10, so a
# bare `pip install` would place packages where python3.11 cannot import
# them and the model-download step below would fail. Instead, bootstrap
# pip for python3.11 with ensurepip (shipped via python3.11-venv) and
# always invoke it as `python -m pip`.
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    libsm6 \
    libxext6 \
    libxrender1 \
    python3.11 \
    python3.11-venv \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.11 /usr/bin/python \
    && python -m ensurepip --upgrade

# Install Python dependencies first (own layer) so application-code edits
# do not invalidate the dependency cache.
COPY requirements-gpu.txt .
RUN python -m pip install --no-cache-dir -r requirements-gpu.txt

# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .

# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec

# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}

# Download models during build (not at runtime) so container start is fast
RUN python -c "\
import os; \
from paddleocr import PaddleOCR; \
det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
print(f'Downloading models: det={det}, rec={rec}'); \
ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
print('Models downloaded successfully!')"

# Volume for dataset and optional additional model cache.
# Declared AFTER the model download so the baked-in models persist.
VOLUME ["/app/dataset", "/root/.paddlex"]

# Expose API port (documentation only; publish with -p / compose)
EXPOSE 8000

# Health check: probe the app's own /health endpoint using the stdlib
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the API server (exec form: uvicorn is PID 1 and receives SIGTERM)
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
329
src/paddle_ocr/README.md
Normal file
329
src/paddle_ocr/README.md
Normal file
@@ -0,0 +1,329 @@
|
||||
# PaddleOCR Tuning REST API
|
||||
|
||||
REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
|
||||
|
||||
## Quick Start with Docker Compose
|
||||
|
||||
Docker Compose manages building and running containers. The `docker-compose.yml` defines two services:
|
||||
- `ocr-cpu` - CPU-only version (works everywhere)
|
||||
- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit)
|
||||
|
||||
### Run CPU Version
|
||||
|
||||
```bash
|
||||
cd src/paddle_ocr
|
||||
|
||||
# Build and start (first time takes ~2-3 min to build, ~30s to load model)
|
||||
docker compose up ocr-cpu
|
||||
|
||||
# Or run in background (detached)
|
||||
docker compose up -d ocr-cpu
|
||||
|
||||
# View logs
|
||||
docker compose logs -f ocr-cpu
|
||||
|
||||
# Stop
|
||||
docker compose down
|
||||
```
|
||||
|
||||
### Run GPU Version
|
||||
|
||||
```bash
|
||||
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
|
||||
docker compose up ocr-gpu
|
||||
```
|
||||
|
||||
### Test the API
|
||||
|
||||
Once running, test with:
|
||||
```bash
|
||||
# Check health
|
||||
curl http://localhost:8000/health
|
||||
|
||||
# Or use the test script
|
||||
pip install requests
|
||||
python test.py --url http://localhost:8000
|
||||
```
|
||||
|
||||
### What Docker Compose Does
|
||||
|
||||
```
|
||||
docker compose up ocr-cpu
|
||||
│
|
||||
├─► Builds image from Dockerfile.cpu (if not exists)
|
||||
├─► Creates container "paddle-ocr-cpu"
|
||||
├─► Mounts ../dataset → /app/dataset (your PDF images)
|
||||
├─► Mounts paddlex-cache volume (persists downloaded models)
|
||||
├─► Exposes port 8000
|
||||
└─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
## Files
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
|
||||
| `dataset_manager.py` | Dataset loader |
|
||||
| `test.py` | API test client |
|
||||
| `Dockerfile.cpu` | CPU-only image (multi-arch) |
|
||||
| `Dockerfile.gpu` | GPU/CUDA image (x86_64) |
|
||||
| `docker-compose.yml` | Service orchestration |
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### `GET /health`
|
||||
Check if service is ready.
|
||||
|
||||
```json
|
||||
{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24}
|
||||
```
|
||||
|
||||
### `POST /evaluate`
|
||||
Run OCR evaluation with given hyperparameters.
|
||||
|
||||
**Request:**
|
||||
```json
|
||||
{
|
||||
"pdf_folder": "/app/dataset",
|
||||
"textline_orientation": true,
|
||||
"use_doc_orientation_classify": false,
|
||||
"use_doc_unwarping": false,
|
||||
"text_det_thresh": 0.469,
|
||||
"text_det_box_thresh": 0.5412,
|
||||
"text_det_unclip_ratio": 0.0,
|
||||
"text_rec_score_thresh": 0.635,
|
||||
"start_page": 5,
|
||||
"end_page": 10
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1}
|
||||
```
|
||||
|
||||
### `POST /evaluate_full`
|
||||
Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page).
|
||||
|
||||
## Building Images
|
||||
|
||||
### CPU Image (Multi-Architecture)
|
||||
|
||||
```bash
|
||||
# Local build (current architecture)
|
||||
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
|
||||
|
||||
# Multi-arch build with buildx (amd64 + arm64)
|
||||
docker buildx create --name multiarch --use
|
||||
docker buildx build -f Dockerfile.cpu \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
-t paddle-ocr-api:cpu \
|
||||
--push .
|
||||
```
|
||||
|
||||
### GPU Image (x86_64 only)
|
||||
|
||||
```bash
|
||||
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
### CPU (Any machine)
|
||||
|
||||
```bash
|
||||
docker run -d -p 8000:8000 \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
-v paddlex-cache:/root/.paddlex \
|
||||
paddle-ocr-api:cpu
|
||||
```
|
||||
|
||||
### GPU (NVIDIA)
|
||||
|
||||
```bash
|
||||
docker run -d -p 8000:8000 --gpus all \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
-v paddlex-cache:/root/.paddlex \
|
||||
paddle-ocr-api:gpu
|
||||
```
|
||||
|
||||
## DGX Spark (ARM64 + CUDA)
|
||||
|
||||
DGX Spark uses ARM64 (Grace CPU) with NVIDIA Hopper GPU. You have two options:
|
||||
|
||||
### Option 1: Native ARM64 Build (Recommended)
|
||||
|
||||
PaddlePaddle has ARM64 support. Build natively:
|
||||
|
||||
```bash
|
||||
# On DGX Spark or ARM64 machine
|
||||
docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
|
||||
```
|
||||
|
||||
For GPU acceleration on ARM64, no change to `Dockerfile.gpu` is required: the
`nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04` base image is multi-arch, so
pulling it on an ARM64 machine automatically selects the ARM64 variant. Simply
build `Dockerfile.gpu` natively on the device (next step).
|
||||
|
||||
Then build on the DGX Spark:
|
||||
```bash
|
||||
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu-arm64 .
|
||||
```
|
||||
|
||||
### Option 2: x86_64 Emulation via QEMU (Slow)
|
||||
|
||||
You CAN run x86_64 images on ARM via emulation, but it's ~10-20x slower:
|
||||
|
||||
```bash
|
||||
# On DGX Spark, enable QEMU emulation
|
||||
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||
|
||||
# Run x86_64 image with emulation
|
||||
docker run --platform linux/amd64 -p 8000:8000 \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
paddle-ocr-api:cpu
|
||||
```
|
||||
|
||||
**Not recommended** for production due to severe performance penalty.
|
||||
|
||||
### Option 3: Cross-compile from x86_64
|
||||
|
||||
Build ARM64 images from your x86_64 machine:
|
||||
|
||||
```bash
|
||||
# Setup buildx for multi-arch
|
||||
docker buildx create --name mybuilder --use
|
||||
|
||||
# Build ARM64 image from x86_64 machine
|
||||
docker buildx build -f Dockerfile.cpu \
|
||||
--platform linux/arm64 \
|
||||
-t paddle-ocr-api:arm64 \
|
||||
--load .
|
||||
|
||||
# Save and transfer to DGX Spark
|
||||
docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
|
||||
scp paddle-ocr-arm64.tar.gz dgx-spark:~/
|
||||
# On DGX Spark:
|
||||
docker load < paddle-ocr-arm64.tar.gz
|
||||
```
|
||||
|
||||
## Using with Ray Tune
|
||||
|
||||
Update your notebook's `trainable_paddle_ocr` function:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
API_URL = "http://localhost:8000/evaluate"
|
||||
|
||||
def trainable_paddle_ocr(config):
|
||||
"""Call OCR API instead of subprocess."""
|
||||
payload = {
|
||||
"pdf_folder": "/app/dataset",
|
||||
"use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
|
||||
"use_doc_unwarping": config.get("use_doc_unwarping", False),
|
||||
"textline_orientation": config.get("textline_orientation", True),
|
||||
"text_det_thresh": config.get("text_det_thresh", 0.0),
|
||||
"text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
|
||||
"text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
|
||||
"text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(API_URL, json=payload, timeout=600)
|
||||
response.raise_for_status()
|
||||
metrics = response.json()
|
||||
tune.report(metrics=metrics)
|
||||
except Exception as e:
|
||||
tune.report(metrics={"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]})
|
||||
```
|
||||
|
||||
## Architecture: Model Lifecycle
|
||||
|
||||
The model is loaded **once** at container startup and stays in memory for all requests:
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph Container["Docker Container Lifecycle"]
|
||||
Start([Container Start]) --> Load[Load PaddleOCR Models<br/>~10-30s one-time cost]
|
||||
Load --> Ready[API Ready<br/>Models in RAM ~500MB]
|
||||
|
||||
subgraph Requests["Incoming Requests - Models Stay Loaded"]
|
||||
Ready --> R1[Request 1] --> Ready
|
||||
Ready --> R2[Request 2] --> Ready
|
||||
Ready --> RN[Request N...] --> Ready
|
||||
end
|
||||
|
||||
Ready --> Stop([Container Stop])
|
||||
Stop --> Free[Models Freed]
|
||||
end
|
||||
|
||||
style Load fill:#f9f,stroke:#333
|
||||
style Ready fill:#9f9,stroke:#333
|
||||
style Requests fill:#e8f4ea,stroke:#090
|
||||
```
|
||||
|
||||
**Subprocess vs REST API comparison:**
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph Subprocess["❌ Subprocess Approach"]
|
||||
direction TB
|
||||
S1[Trial 1] --> L1[Load Model ~10s]
|
||||
L1 --> E1[Evaluate ~60s]
|
||||
E1 --> U1[Unload]
|
||||
U1 --> S2[Trial 2]
|
||||
S2 --> L2[Load Model ~10s]
|
||||
L2 --> E2[Evaluate ~60s]
|
||||
end
|
||||
|
||||
subgraph REST["✅ REST API Approach"]
|
||||
direction TB
|
||||
Start2[Start Container] --> Load2[Load Model ~10s]
|
||||
Load2 --> Ready2[Model in Memory]
|
||||
Ready2 --> T1[Trial 1 ~60s]
|
||||
T1 --> Ready2
|
||||
Ready2 --> T2[Trial 2 ~60s]
|
||||
T2 --> Ready2
|
||||
Ready2 --> TN[Trial N ~60s]
|
||||
end
|
||||
|
||||
style L1 fill:#faa
|
||||
style L2 fill:#faa
|
||||
style Load2 fill:#afa
|
||||
style Ready2 fill:#afa
|
||||
```
|
||||
|
||||
## Performance Comparison
|
||||
|
||||
| Approach | Model Load | Per-Trial Overhead | 64 Trials |
|
||||
|----------|------------|-------------------|-----------|
|
||||
| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours |
|
||||
| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours |
|
||||
| **REST API** | **Once** | **~0.1s** | **~5.8 hours** |
|
||||
|
||||
The REST API saves ~1+ hour by loading the model only once.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Model download slow on first run
|
||||
The first run downloads ~500MB of models. Use volume `paddlex-cache` to persist them.
|
||||
|
||||
### Out of memory
|
||||
Reduce `max_concurrent_trials` in Ray Tune, or increase container memory:
|
||||
```bash
|
||||
docker run --memory=8g ...
|
||||
```
|
||||
|
||||
### GPU not detected
|
||||
Ensure NVIDIA Container Toolkit is installed:
|
||||
```bash
|
||||
nvidia-smi  # Should work
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi  # Should work
```
|
||||
45
src/paddle_ocr/dataset_manager.py
Normal file
45
src/paddle_ocr/dataset_manager.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# Imports
|
||||
import os
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class ImageTextDataset:
    """Pairs page images with their ground-truth transcription files.

    Expects ``root`` to contain sub-folders, each holding an ``img/`` and a
    ``txt/`` directory; an image ``img/<name>.<ext>`` is paired with
    ``txt/<name>.txt``. Images lacking a matching text file are skipped.
    Samples are collected in deterministic (sorted) order.
    """

    def __init__(self, root):
        # List of (image_path, text_path) tuples.
        self.samples = []

        for entry in sorted(os.listdir(root)):
            img_dir = os.path.join(root, entry, "img")
            txt_dir = os.path.join(root, entry, "txt")

            # Skip anything that is not a properly structured sub-folder.
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue

            for image_name in sorted(os.listdir(img_dir)):
                # Only accept common raster formats (case-insensitive).
                if not image_name.lower().endswith((".png", ".jpg", ".jpeg")):
                    continue

                # Ground truth must share the stem, with a .txt extension.
                stem = os.path.splitext(image_name)[0]
                text_path = os.path.join(txt_dir, stem + ".txt")
                if not os.path.exists(text_path):
                    continue

                self.samples.append((os.path.join(img_dir, image_name), text_path))

    def __len__(self):
        """Number of (image, text) pairs discovered."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (PIL RGB image, ground-truth text) for sample ``idx``."""
        image_path, text_path = self.samples[idx]

        picture = Image.open(image_path).convert("RGB")

        with open(text_path, "r", encoding="utf-8") as handle:
            transcript = handle.read()

        return picture, transcript
|
||||
83
src/paddle_ocr/docker-compose.yml
Normal file
83
src/paddle_ocr/docker-compose.yml
Normal file
@@ -0,0 +1,83 @@
|
||||
# docker-compose.yml - PaddleOCR REST API
# Usage:
#   CPU: docker compose up ocr-cpu
#   GPU: docker compose up ocr-gpu
#   Test: docker compose run --rm test
# NOTE: ocr-cpu and ocr-gpu both publish host port 8000 - start only one at a time.

services:
  # CPU-only service (works on any architecture)
  ocr-cpu:
    build:
      context: .
      dockerfile: Dockerfile.cpu
      args:
        # Models to bake into image (change before building):
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:cpu
    container_name: paddle-ocr-cpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro  # Your dataset (read-only inside the container)
      - paddlex-cache:/root/.paddlex  # For additional models at runtime
    environment:
      - PYTHONUNBUFFERED=1
      # Override models at runtime (uncomment to use different models):
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    restart: unless-stopped
    # Mirrors the image's HEALTHCHECK; used by the `test` service's depends_on.
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU service (requires NVIDIA Container Toolkit)
  ocr-gpu:
    build:
      context: .
      dockerfile: Dockerfile.gpu
      args:
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:gpu
    container_name: paddle-ocr-gpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - paddlex-cache:/root/.paddlex
    environment:
      - PYTHONUNBUFFERED=1
      - CUDA_VISIBLE_DEVICES=0
      # Override models at runtime:
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    # Reserve one NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

  # Test client (runs once and exits)
  test:
    image: python:3.11-slim
    container_name: paddle-ocr-test
    # Only starts after ocr-cpu's healthcheck reports healthy.
    depends_on:
      ocr-cpu:
        condition: service_healthy
    volumes:
      - ./test.py:/app/test.py:ro
    working_dir: /app
    command: >
      sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset"
    # Shares ocr-cpu's network namespace, so the API is reachable without
    # publishing any extra ports.
    network_mode: "service:ocr-cpu"

volumes:
  paddlex-cache:
    name: paddlex-model-cache
|
||||
263
src/paddle_ocr/paddle_ocr_tuning_rest.py
Normal file
263
src/paddle_ocr/paddle_ocr_tuning_rest.py
Normal file
@@ -0,0 +1,263 @@
|
||||
# paddle_ocr_tuning_rest.py
|
||||
# FastAPI REST service for PaddleOCR hyperparameter evaluation
|
||||
# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import numpy as np
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from paddleocr import PaddleOCR
|
||||
from jiwer import wer, cer
|
||||
from dataset_manager import ImageTextDataset
|
||||
|
||||
|
||||
# Model configuration via environment variables (with defaults)
|
||||
DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det")
|
||||
DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec")
|
||||
|
||||
|
||||
# Global state for model and dataset
|
||||
class AppState:
    """Process-wide singleton holding the loaded model and cached dataset.

    Populated by the FastAPI lifespan hook (model) and lazily by /evaluate
    (dataset). Class attributes serve as defaults for the single instance.
    """
    ocr: Optional[PaddleOCR] = None             # loaded once at startup by lifespan()
    dataset: Optional[ImageTextDataset] = None  # (re)built when pdf_folder changes
    dataset_path: Optional[str] = None          # folder the current dataset was built from
    det_model: str = DEFAULT_DET_MODEL          # detection model name (env-configurable)
    rec_model: str = DEFAULT_REC_MODEL          # recognition model name (env-configurable)
|
||||
|
||||
|
||||
state = AppState()
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load OCR model at startup; release references on shutdown.

    Loading once here (instead of per request) is the entire point of the
    service: the model stays in memory across all evaluations.
    """
    print(f"Loading PaddleOCR models...")
    print(f"  Detection: {state.det_model}")
    print(f"  Recognition: {state.rec_model}")
    # Blocking call; /health reports "initializing" until this completes.
    state.ocr = PaddleOCR(
        text_detection_model_name=state.det_model,
        text_recognition_model_name=state.rec_model,
    )
    print("Model loaded successfully!")
    yield
    # Cleanup on shutdown: drop references so memory can be reclaimed.
    state.ocr = None
    state.dataset = None
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="PaddleOCR Tuning API",
|
||||
description="REST API for OCR hyperparameter evaluation",
|
||||
version="1.0.0",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
|
||||
class EvaluateRequest(BaseModel):
    """Request schema matching CLI arguments.

    Page range is [start_page, end_page) and is clamped to the dataset size
    by the /evaluate handler. Threshold defaults of 0.0 presumably mean
    "no filtering" — TODO confirm against PaddleOCR's parameter docs.
    """
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
    use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification")
    use_doc_unwarping: bool = Field(False, description="Use document unwarping")
    textline_orientation: bool = Field(True, description="Use textline orientation classification")
    text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold")
    text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold")
    text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient")
    text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
|
||||
|
||||
|
||||
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output."""
    CER: float            # mean character error rate over evaluated pages (0.0 = perfect)
    WER: float            # mean word error rate over evaluated pages
    TIME: float           # total wall-clock seconds for the whole evaluation
    PAGES: int            # number of pages actually evaluated
    TIME_PER_PAGE: float  # mean OCR seconds per page
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
    """Schema for GET /health."""
    status: str                        # "ok" once the model is loaded, else "initializing"
    model_loaded: bool                 # True after the lifespan hook finishes
    dataset_loaded: bool               # True after the first successful /evaluate
    dataset_size: Optional[int] = None # sample count, None until dataset loads
    det_model: Optional[str] = None    # configured detection model name
    rec_model: Optional[str] = None    # configured recognition model name
|
||||
|
||||
|
||||
def _normalize_box_xyxy(box):
|
||||
"""Normalize bounding box to (x0, y0, x1, y1) format."""
|
||||
if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
|
||||
xs = [p[0] for p in box]
|
||||
ys = [p[1] for p in box]
|
||||
return min(xs), min(ys), max(xs), max(ys)
|
||||
|
||||
if isinstance(box, (list, tuple)):
|
||||
if len(box) == 4:
|
||||
x0, y0, x1, y1 = box
|
||||
return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
|
||||
if len(box) == 8:
|
||||
xs = box[0::2]
|
||||
ys = box[1::2]
|
||||
return min(xs), min(ys), max(xs), max(ys)
|
||||
|
||||
raise ValueError(f"Unrecognized box format: {box!r}")
|
||||
|
||||
|
||||
def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """
    Robust line grouping for PaddleOCR outputs.
    Normalizes boxes, groups by line, and returns assembled text.

    Args:
        paddleocr_predict: iterable of PaddleOCR result items; each item's
            ``.json`` is expected to hold a "res" dict carrying "rec_boxes",
            "rec_texts" and optionally "rec_scores" — TODO confirm shape
            against the installed paddleocr version.
        min_score: drop text fragments whose recognition score is below this.
        line_tol_factor: fraction of the median box height used as the
            vertical tolerance when deciding two boxes share a line.

    Returns:
        Assembled text, one visual line per output line (joined by newlines);
        empty string when no usable boxes remain.
    """
    # Flatten all items into (x0, y0, x1, y1, y_mid, text, score) tuples.
    boxes_all = []
    for item in paddleocr_predict:
        res = item.json.get("res", {})
        boxes = res.get("rec_boxes", []) or []
        texts = res.get("rec_texts", []) or []
        scores = res.get("rec_scores", None)

        for i, (box, text) in enumerate(zip(boxes, texts)):
            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(box)
            except Exception:
                # Skip boxes in a layout we cannot parse.
                continue

            y_mid = 0.5 * (y0 + y1)
            # Missing or short score list -> assume full confidence.
            score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0

            # Collapse internal whitespace; drop empty fragments.
            t = re.sub(r"\s+", " ", str(text)).strip()
            if not t:
                continue

            boxes_all.append((x0, y0, x1, y1, y_mid, t, score))

    if min_score > 0:
        boxes_all = [b for b in boxes_all if b[6] >= min_score]

    if not boxes_all:
        return ""

    # Adaptive line tolerance: scales with the median box height, floored at
    # 8px so very small fonts do not fragment into many lines.
    heights = [b[3] - b[1] for b in boxes_all]
    median_h = float(np.median(heights)) if heights else 20.0
    line_tol = max(8.0, line_tol_factor * median_h)

    # Sort by vertical mid, then x0 (top-to-bottom, left-to-right).
    boxes_all.sort(key=lambda b: (b[4], b[0]))

    # Group into lines: a box extends the current line when its vertical
    # middle is within line_tol of the PREVIOUS box's middle.
    lines, cur, last_y = [], [], None
    for x0, y0, x1, y1, y_mid, text, score in boxes_all:
        if last_y is None or abs(y_mid - last_y) <= line_tol:
            cur.append((x0, text))
        else:
            # Close the current line: order fragments left-to-right.
            cur.sort(key=lambda t: t[0])
            lines.append(" ".join(t[1] for t in cur))
            cur = [(x0, text)]
        last_y = y_mid

    # Flush the trailing line.
    if cur:
        cur.sort(key=lambda t: t[0])
        lines.append(" ".join(t[1] for t in cur))

    res = "\n".join(lines)
    # Strip trailing whitespace before newlines and surrounding whitespace.
    res = re.sub(r"\s+\n", "\n", res).strip()
    return res
|
||||
|
||||
|
||||
def evaluate_text(reference: str, prediction: str) -> dict:
    """Score an OCR prediction against its ground truth.

    Args:
        reference: Ground-truth text.
        prediction: OCR output to score.

    Returns:
        Dict with "WER" and "CER" keys (jiwer metrics; 0.0 is a perfect match).
    """
    word_error = wer(reference, prediction)
    char_error = cer(reference, prediction)
    return {"WER": word_error, "CER": char_error}
|
||||
|
||||
|
||||
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready.

    Reports model/dataset load state plus the configured model names; used
    by the Docker HEALTHCHECK and the compose `test` service.
    """
    return HealthResponse(
        # "initializing" until the lifespan hook finishes loading the model.
        status="ok" if state.ocr is not None else "initializing",
        model_loaded=state.ocr is not None,
        dataset_loaded=state.dataset is not None,
        # Dataset is loaded lazily by the first /evaluate call.
        dataset_size=len(state.dataset) if state.dataset else None,
        det_model=state.det_model,
        rec_model=state.rec_model,
    )
|
||||
|
||||
|
||||
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
    """
    Evaluate OCR with given hyperparameters.
    Returns CER, WER, and timing metrics.

    Raises:
        HTTPException 503: model not loaded yet (startup still in progress).
        HTTPException 400: dataset folder missing/empty or invalid page range.
    """
    if state.ocr is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Load or reload dataset if path changed.
    # NOTE(review): this cached global state is not guarded by a lock;
    # concurrent requests with different pdf_folder values could race —
    # confirm whether concurrent tuning clients are expected.
    if state.dataset is None or state.dataset_path != request.pdf_folder:
        if not os.path.isdir(request.pdf_folder):
            raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
        state.dataset = ImageTextDataset(request.pdf_folder)
        state.dataset_path = request.pdf_folder

    if len(state.dataset) == 0:
        raise HTTPException(status_code=400, detail="Dataset is empty")

    # Validate page range; end is clamped to the dataset size.
    start = request.start_page
    end = min(request.end_page, len(state.dataset))
    if start >= end:
        raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")

    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()  # total wall-clock timer for the whole request

    for idx in range(start, end):
        img, ref = state.dataset[idx]
        # PaddleOCR's predict accepts an ndarray image.
        arr = np.array(img)

        tp0 = time.time()  # per-page timer (OCR + text assembly)
        out = state.ocr.predict(
            arr,
            use_doc_orientation_classify=request.use_doc_orientation_classify,
            use_doc_unwarping=request.use_doc_unwarping,
            use_textline_orientation=request.textline_orientation,
            text_det_thresh=request.text_det_thresh,
            text_det_box_thresh=request.text_det_box_thresh,
            text_det_unclip_ratio=request.text_det_unclip_ratio,
            text_rec_score_thresh=request.text_rec_score_thresh,
        )

        # Reassemble predicted text in reading order before scoring.
        pred = assemble_from_paddle_result(out)
        time_per_page_list.append(float(time.time() - tp0))

        m = evaluate_text(ref, pred)
        cer_list.append(m["CER"])
        wer_list.append(m["WER"])

    # Mean metrics over evaluated pages; worst-case 1.0 if nothing scored.
    return EvaluateResponse(
        CER=float(np.mean(cer_list)) if cer_list else 1.0,
        WER=float(np.mean(wer_list)) if wer_list else 1.0,
        TIME=float(time.time() - t0),
        PAGES=len(cer_list),
        TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
    )
|
||||
|
||||
|
||||
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
    """Evaluate on ALL pages (ignores start_page/end_page).

    Mutating the request model is safe here: FastAPI builds a fresh
    instance per request, and /evaluate clamps end_page to the dataset size.
    """
    request.start_page = 0
    request.end_page = 9999  # Will be clamped to dataset size
    return evaluate(request)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Convenience entry point: `python paddle_ocr_tuning_rest.py`
    # (the Docker images launch uvicorn directly instead).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
22
src/paddle_ocr/requirements-gpu.txt
Normal file
22
src/paddle_ocr/requirements-gpu.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# PaddleOCR REST API - GPU Requirements
|
||||
# Install: pip install -r requirements-gpu.txt
|
||||
|
||||
# PaddlePaddle (GPU version with CUDA).
# NOTE: pinned at 3.0.0 while requirements.txt pins the CPU build at 3.2.2 —
# keep the two in sync when upgrading. CUDA wheels may require PaddlePaddle's
# extra package index (see the official install guide).
paddlepaddle-gpu==3.0.0
|
||||
|
||||
# PaddleOCR
|
||||
paddleocr==3.3.2
|
||||
|
||||
# OCR evaluation metrics
|
||||
jiwer
|
||||
|
||||
# Numerical computing
|
||||
numpy
|
||||
|
||||
# REST API framework
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
pydantic
|
||||
|
||||
# Image processing
|
||||
Pillow
|
||||
22
src/paddle_ocr/requirements.txt
Normal file
22
src/paddle_ocr/requirements.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# PaddleOCR REST API - CPU Requirements
|
||||
# Install: pip install -r requirements.txt
|
||||
|
||||
# PaddlePaddle (CPU version)
|
||||
paddlepaddle==3.2.2
|
||||
|
||||
# PaddleOCR
|
||||
paddleocr==3.3.2
|
||||
|
||||
# OCR evaluation metrics
|
||||
jiwer
|
||||
|
||||
# Numerical computing
|
||||
numpy
|
||||
|
||||
# REST API framework
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
pydantic
|
||||
|
||||
# Image processing (pulled by paddleocr, but explicit)
|
||||
Pillow
|
||||
114
src/paddle_ocr/test.py
Normal file
114
src/paddle_ocr/test.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# test.py - Simple client to test PaddleOCR REST API
|
||||
# Usage: python test.py [--url URL] [--dataset PATH]
|
||||
|
||||
import argparse
|
||||
import requests
|
||||
import time
|
||||
import sys
|
||||
|
||||
|
||||
def wait_for_health(url: str, timeout: int = 120) -> bool:
    """Poll the API's /health endpoint until the model reports loaded.

    Args:
        url: Base URL of the API (no trailing slash).
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True once /health reports model_loaded; False if the timeout expires.
    """
    health_endpoint = f"{url}/health"
    began = time.time()

    print(f"Waiting for API at {health_endpoint}...")
    while time.time() - began < timeout:
        try:
            reply = requests.get(health_endpoint, timeout=5)
            if reply.status_code == 200:
                if reply.json().get("model_loaded"):
                    print(f"API ready! Model loaded in {time.time() - began:.1f}s")
                    return True
                print(f"  Model loading... ({time.time() - began:.0f}s)")
        except requests.exceptions.ConnectionError:
            # Server not accepting connections yet.
            print(f"  Connecting... ({time.time() - began:.0f}s)")
        except Exception as e:
            print(f"  Error: {e}")
        # Back off briefly between polls.
        time.sleep(2)

    print("Timeout waiting for API")
    return False
|
||||
|
||||
|
||||
def test_evaluate(url: str, config: dict) -> dict:
    """Run evaluation with given config.

    POSTs ``config`` to /evaluate, pretty-prints the returned metrics and
    returns the decoded JSON response.

    Args:
        url: API base URL.
        config: Payload matching the API's EvaluateRequest schema.

    Returns:
        Response dict (CER/WER/TIME/PAGES/TIME_PER_PAGE).

    Raises:
        requests.HTTPError: if the API returns a non-2xx status.
    """
    eval_url = f"{url}/evaluate"

    print(f"\nTesting config: {config}")
    start = time.time()

    # Generous timeout: a multi-page evaluation can take minutes on CPU.
    resp = requests.post(eval_url, json=config, timeout=600)
    resp.raise_for_status()

    result = resp.json()
    elapsed = time.time() - start

    print(f"Results (took {elapsed:.1f}s):")
    print(f"  CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)")
    print(f"  WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)")
    print(f"  Pages: {result['PAGES']}")
    print(f"  Time/page: {result['TIME_PER_PAGE']:.2f}s")

    return result
|
||||
|
||||
|
||||
def main():
    """Smoke-test the API: run a baseline and a tuned config, then compare."""
    parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
    parser.add_argument("--url", default="http://localhost:8000", help="API base URL")
    parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
    parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
    args = parser.parse_args()

    # Wait for API to be ready (model load can take ~30s after container start).
    if not args.skip_health:
        if not wait_for_health(args.url):
            sys.exit(1)

    # Test 1: Baseline config (default PaddleOCR)
    print("\n" + "="*50)
    print("TEST 1: Baseline Configuration")
    print("="*50)
    baseline = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": False,  # Baseline: disabled
        "text_det_thresh": 0.0,
        "text_det_box_thresh": 0.0,
        "text_det_unclip_ratio": 1.5,
        "text_rec_score_thresh": 0.0,
        "start_page": 5,
        "end_page": 10,
    })

    # Test 2: Optimized config (from Ray Tune results)
    print("\n" + "="*50)
    print("TEST 2: Optimized Configuration")
    print("="*50)
    optimized = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": True,  # KEY: enabled
        "text_det_thresh": 0.4690,
        "text_det_box_thresh": 0.5412,
        "text_det_unclip_ratio": 0.0,
        "text_rec_score_thresh": 0.6350,
        "start_page": 5,
        "end_page": 10,
    })

    # Summary: relative CER improvement, guarded against division by zero.
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)
    cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0
    print(f"Baseline CER:  {baseline['CER']*100:.2f}%")
    print(f"Optimized CER: {optimized['CER']*100:.2f}%")
    print(f"Improvement:   {cer_reduction:.1f}% reduction in errors")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    main()
|
||||
Reference in New Issue
Block a user