PaddleOCR GPU support #4

Merged
Seryusjj merged 40 commits from gpu_support into main 2026-01-19 17:35:25 +00:00
9 changed files with 1004 additions and 0 deletions
Showing only changes of commit c4ab0ffad1

View File

@@ -0,0 +1,58 @@
# Dockerfile.cpu - CPU-only PaddleOCR REST API
# Multi-arch: supports both amd64 and arm64
FROM python:3.11-slim
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - CPU version"
WORKDIR /app
# Install system dependencies for OpenCV and PaddleOCR
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    libgomp1 \
 && rm -rf /var/lib/apt/lists/*
# Install Python dependencies from requirements file
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .
# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec
# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}
# Download models during build (not at runtime)
RUN python -c "\
import os; \
from paddleocr import PaddleOCR; \
det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
print(f'Downloading models: det={det}, rec={rec}'); \
ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
print('Models downloaded successfully!')"
# Volume for dataset and optional additional model cache
VOLUME ["/app/dataset", "/root/.paddlex"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,68 @@
# Dockerfile.gpu - CUDA-enabled PaddleOCR REST API
# Supports: x86_64 with NVIDIA GPU (CUDA 12.x)
# For DGX Spark (ARM64 + CUDA): build natively on the device
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version"
WORKDIR /app
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CUDA_VISIBLE_DEVICES=0
# Install Python 3.11 and system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.11 \
    python3.11-venv \
    python3-pip \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    libgomp1 \
 && rm -rf /var/lib/apt/lists/* \
 && ln -sf /usr/bin/python3.11 /usr/bin/python
# Install Python dependencies from requirements file
# (apt's python3-pip targets the distro default Python 3.10, so bootstrap pip for 3.11 via ensurepip)
COPY requirements-gpu.txt .
RUN python -m ensurepip --upgrade \
 && python -m pip install --no-cache-dir -r requirements-gpu.txt
# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .
# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec
# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}
# Download models during build (not at runtime)
RUN python -c "\
import os; \
from paddleocr import PaddleOCR; \
det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
print(f'Downloading models: det={det}, rec={rec}'); \
ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
print('Models downloaded successfully!')"
# Volume for dataset and optional additional model cache
VOLUME ["/app/dataset", "/root/.paddlex"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

src/paddle_ocr/README.md
View File

@@ -0,0 +1,329 @@
# PaddleOCR Tuning REST API
REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
## Quick Start with Docker Compose
Docker Compose manages building and running containers. The `docker-compose.yml` defines two services:
- `ocr-cpu` - CPU-only version (works everywhere)
- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit)
### Run CPU Version
```bash
cd src/paddle_ocr
# Build and start (first time takes ~2-3 min to build, ~30s to load model)
docker compose up ocr-cpu
# Or run in background (detached)
docker compose up -d ocr-cpu
# View logs
docker compose logs -f ocr-cpu
# Stop
docker compose down
```
### Run GPU Version
```bash
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
docker compose up ocr-gpu
```
### Test the API
Once running, test with:
```bash
# Check health
curl http://localhost:8000/health
# Or use the test script
pip install requests
python test.py --url http://localhost:8000
```
### What Docker Compose Does
```
docker compose up ocr-cpu
├─► Builds image from Dockerfile.cpu (if it doesn't already exist)
├─► Creates container "paddle-ocr-cpu"
├─► Mounts ../dataset → /app/dataset (your PDF images)
├─► Mounts paddlex-cache volume (persists downloaded models)
├─► Exposes port 8000
└─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
```
## Files
| File | Description |
|------|-------------|
| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
| `dataset_manager.py` | Dataset loader |
| `test.py` | API test client |
| `Dockerfile.cpu` | CPU-only image (multi-arch) |
| `Dockerfile.gpu` | GPU/CUDA image (x86_64) |
| `docker-compose.yml` | Service orchestration |
## API Endpoints
### `GET /health`
Check if service is ready.
```json
{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24}
```
### `POST /evaluate`
Run OCR evaluation with given hyperparameters.
**Request:**
```json
{
  "pdf_folder": "/app/dataset",
  "textline_orientation": true,
  "use_doc_orientation_classify": false,
  "use_doc_unwarping": false,
  "text_det_thresh": 0.469,
  "text_det_box_thresh": 0.5412,
  "text_det_unclip_ratio": 0.0,
  "text_rec_score_thresh": 0.635,
  "start_page": 5,
  "end_page": 10
}
```
**Response:**
```json
{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1}
```
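For example, a minimal Python client call (port and paths assume the Docker setup above; fields omitted from the payload fall back to the service defaults):
```python
import requests

# Minimal /evaluate call; unspecified hyperparameters use the API defaults
resp = requests.post(
    "http://localhost:8000/evaluate",
    json={"pdf_folder": "/app/dataset", "start_page": 5, "end_page": 10},
    timeout=600,  # evaluation can take minutes per request
)
resp.raise_for_status()
print(resp.json())  # {"CER": ..., "WER": ..., "TIME": ..., "PAGES": ..., "TIME_PER_PAGE": ...}
```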
### `POST /evaluate_full`
Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page).
## Building Images
### CPU Image (Multi-Architecture)
```bash
# Local build (current architecture)
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
# Multi-arch build with buildx (amd64 + arm64)
docker buildx create --name multiarch --use
docker buildx build -f Dockerfile.cpu \
--platform linux/amd64,linux/arm64 \
-t paddle-ocr-api:cpu \
--push .
```
### GPU Image (x86_64 only)
```bash
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
```
## Running
### CPU (Any machine)
```bash
docker run -d -p 8000:8000 \
-v $(pwd)/../dataset:/app/dataset:ro \
-v paddlex-cache:/root/.paddlex \
paddle-ocr-api:cpu
```
### GPU (NVIDIA)
```bash
docker run -d -p 8000:8000 --gpus all \
-v $(pwd)/../dataset:/app/dataset:ro \
-v paddlex-cache:/root/.paddlex \
paddle-ocr-api:gpu
```
## DGX Spark (ARM64 + CUDA)
DGX Spark pairs an ARM64 Grace CPU with an NVIDIA Blackwell GPU. You have three options:
### Option 1: Native ARM64 Build (Recommended)
PaddlePaddle has ARM64 support. Build natively:
```bash
# On DGX Spark or ARM64 machine
docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
```
For GPU acceleration on ARM64, `Dockerfile.gpu` needs no base-image change, because the CUDA base image is multi-arch:
```dockerfile
# This line in Dockerfile.gpu already works on ARM64:
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
# (the multi-arch tag resolves to the ARM64 variant when pulled on an ARM machine)
```
Then build on the DGX Spark:
```bash
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu-arm64 .
```
### Option 2: x86_64 Emulation via QEMU (Slow)
You CAN run x86_64 images on ARM via emulation, but it's ~10-20x slower:
```bash
# On DGX Spark, enable QEMU emulation
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
# Run x86_64 image with emulation
docker run --platform linux/amd64 -p 8000:8000 \
-v $(pwd)/../dataset:/app/dataset:ro \
paddle-ocr-api:cpu
```
**Not recommended** for production due to severe performance penalty.
### Option 3: Cross-compile from x86_64
Build ARM64 images from your x86_64 machine:
```bash
# Setup buildx for multi-arch
docker buildx create --name mybuilder --use
# Build ARM64 image from x86_64 machine
docker buildx build -f Dockerfile.cpu \
--platform linux/arm64 \
-t paddle-ocr-api:arm64 \
--load .
# Save and transfer to DGX Spark
docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
scp paddle-ocr-arm64.tar.gz dgx-spark:~/
# On DGX Spark:
docker load < paddle-ocr-arm64.tar.gz
```
## Using with Ray Tune
Update your notebook's `trainable_paddle_ocr` function:
```python
import requests
from ray import tune

API_URL = "http://localhost:8000/evaluate"

def trainable_paddle_ocr(config):
    """Call the OCR API instead of spawning a subprocess."""
    payload = {
        "pdf_folder": "/app/dataset",
        "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
        "use_doc_unwarping": config.get("use_doc_unwarping", False),
        "textline_orientation": config.get("textline_orientation", True),
        "text_det_thresh": config.get("text_det_thresh", 0.0),
        "text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
        "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
        "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
    }
    try:
        response = requests.post(API_URL, json=payload, timeout=600)
        response.raise_for_status()
        tune.report(response.json())  # report CER/WER/TIME metrics to Ray
    except Exception as e:
        tune.report({"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]})
```
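To wire this into a search, a minimal sketch of launching the tuner (the parameter ranges below are illustrative, not the tuned values):
```python
# Hypothetical search space around trainable_paddle_ocr; adjust ranges to taste
search_space = {
    "textline_orientation": tune.choice([True, False]),
    "text_det_thresh": tune.uniform(0.0, 0.8),
    "text_det_box_thresh": tune.uniform(0.0, 0.8),
    "text_det_unclip_ratio": tune.uniform(0.0, 2.5),
    "text_rec_score_thresh": tune.uniform(0.0, 0.8),
}
tuner = tune.Tuner(
    trainable_paddle_ocr,
    param_space=search_space,
    tune_config=tune.TuneConfig(metric="CER", mode="min", num_samples=64),
)
results = tuner.fit()
print(results.get_best_result().config)  # best hyperparameters found
```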
## Architecture: Model Lifecycle
The model is loaded **once** at container startup and stays in memory for all requests:
```mermaid
flowchart TB
subgraph Container["Docker Container Lifecycle"]
Start([Container Start]) --> Load[Load PaddleOCR Models<br/>~10-30s one-time cost]
Load --> Ready[API Ready<br/>Models in RAM ~500MB]
subgraph Requests["Incoming Requests - Models Stay Loaded"]
Ready --> R1[Request 1] --> Ready
Ready --> R2[Request 2] --> Ready
Ready --> RN[Request N...] --> Ready
end
Ready --> Stop([Container Stop])
Stop --> Free[Models Freed]
end
style Load fill:#f9f,stroke:#333
style Ready fill:#9f9,stroke:#333
style Requests fill:#e8f4ea,stroke:#090
```
**Subprocess vs REST API comparison:**
```mermaid
flowchart LR
subgraph Subprocess["❌ Subprocess Approach"]
direction TB
S1[Trial 1] --> L1[Load Model ~10s]
L1 --> E1[Evaluate ~60s]
E1 --> U1[Unload]
U1 --> S2[Trial 2]
S2 --> L2[Load Model ~10s]
L2 --> E2[Evaluate ~60s]
end
subgraph REST["✅ REST API Approach"]
direction TB
Start2[Start Container] --> Load2[Load Model ~10s]
Load2 --> Ready2[Model in Memory]
Ready2 --> T1[Trial 1 ~60s]
T1 --> Ready2
Ready2 --> T2[Trial 2 ~60s]
T2 --> Ready2
Ready2 --> TN[Trial N ~60s]
end
style L1 fill:#faa
style L2 fill:#faa
style Load2 fill:#afa
style Ready2 fill:#afa
```
## Performance Comparison
| Approach | Model Load | Per-Trial Overhead | 64 Trials |
|----------|------------|-------------------|-----------|
| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours |
| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours |
| **REST API** | **Once** | **~0.1s** | **~5.8 hours** |
Across 64 trials, the REST API saves over an hour by loading the model only once.
## Troubleshooting
### Model download slow on first run
The default models are baked into the image at build time. If you override `PADDLE_DET_MODEL`/`PADDLE_REC_MODEL` at runtime, the first request downloads the new models (~500MB); the `paddlex-cache` volume persists them across restarts.
### Out of memory
Reduce `max_concurrent_trials` in Ray Tune, or increase container memory:
```bash
docker run --memory=8g ...
```
### GPU not detected
Ensure NVIDIA Container Toolkit is installed:
```bash
nvidia-smi # Should work
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi # Should work
```
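Inside the GPU container you can also check that Paddle itself sees the GPU (a quick sanity check, assuming `paddlepaddle-gpu` is installed):
```python
import paddle

# True if this Paddle build was compiled with CUDA support
print(paddle.device.is_compiled_with_cuda())
# Runs a small end-to-end check and reports the devices Paddle can use
paddle.utils.run_check()
```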

View File

@@ -0,0 +1,45 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
    def __init__(self, root):
        self.samples = []
        for folder in sorted(os.listdir(root)):
            sub = os.path.join(root, folder)
            img_dir = os.path.join(sub, "img")
            txt_dir = os.path.join(sub, "txt")
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue
            for fname in sorted(os.listdir(img_dir)):
                if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
                    continue
                img_path = os.path.join(img_dir, fname)
                # text file must have same name but .txt
                txt_name = os.path.splitext(fname)[0] + ".txt"
                txt_path = os.path.join(txt_dir, txt_name)
                if not os.path.exists(txt_path):
                    continue
                self.samples.append((img_path, txt_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path = self.samples[idx]
        # Load image
        image = Image.open(img_path).convert("RGB")
        # Load text
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()
        return image, text
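A quick usage sketch of the loader, using the `/app/dataset` mount from the compose file (it expects `<root>/<folder>/img/*.png` paired with `<root>/<folder>/txt/*.txt`):
```python
from dataset_manager import ImageTextDataset

ds = ImageTextDataset("/app/dataset")  # dataset mount point from docker-compose.yml
print(f"{len(ds)} image/text pairs")
image, text = ds[0]  # PIL RGB image and its ground-truth text
print(image.size, text[:80])
```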

View File

@@ -0,0 +1,83 @@
# docker-compose.yml - PaddleOCR REST API
# Usage:
# CPU: docker compose up ocr-cpu
# GPU: docker compose up ocr-gpu
# Test: docker compose run --rm test
services:
  # CPU-only service (works on any architecture)
  ocr-cpu:
    build:
      context: .
      dockerfile: Dockerfile.cpu
      args:
        # Models to bake into image (change before building):
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:cpu
    container_name: paddle-ocr-cpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro   # Your dataset
      - paddlex-cache:/root/.paddlex # For additional models at runtime
    environment:
      - PYTHONUNBUFFERED=1
      # Override models at runtime (uncomment to use different models):
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU service (requires NVIDIA Container Toolkit)
  ocr-gpu:
    build:
      context: .
      dockerfile: Dockerfile.gpu
      args:
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:gpu
    container_name: paddle-ocr-gpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - paddlex-cache:/root/.paddlex
    environment:
      - PYTHONUNBUFFERED=1
      - CUDA_VISIBLE_DEVICES=0
      # Override models at runtime:
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

  # Test client (runs once and exits)
  test:
    image: python:3.11-slim
    container_name: paddle-ocr-test
    depends_on:
      ocr-cpu:
        condition: service_healthy
    volumes:
      - ./test.py:/app/test.py:ro
    working_dir: /app
    command: >
      sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset"
    network_mode: "service:ocr-cpu"

volumes:
  paddlex-cache:
    name: paddlex-model-cache

View File

@@ -0,0 +1,263 @@
# paddle_ocr_tuning_rest.py
# FastAPI REST service for PaddleOCR hyperparameter evaluation
# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
import os
import re
import time
from typing import Optional
from contextlib import asynccontextmanager
import numpy as np
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from paddleocr import PaddleOCR
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
# Model configuration via environment variables (with defaults)
DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det")
DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec")
# Global state for model and dataset
class AppState:
    ocr: Optional[PaddleOCR] = None
    dataset: Optional[ImageTextDataset] = None
    dataset_path: Optional[str] = None
    det_model: str = DEFAULT_DET_MODEL
    rec_model: str = DEFAULT_REC_MODEL

state = AppState()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load OCR model at startup."""
    print("Loading PaddleOCR models...")
    print(f"  Detection: {state.det_model}")
    print(f"  Recognition: {state.rec_model}")
    state.ocr = PaddleOCR(
        text_detection_model_name=state.det_model,
        text_recognition_model_name=state.rec_model,
    )
    print("Model loaded successfully!")
    yield
    # Cleanup on shutdown
    state.ocr = None
    state.dataset = None
app = FastAPI(
    title="PaddleOCR Tuning API",
    description="REST API for OCR hyperparameter evaluation",
    version="1.0.0",
    lifespan=lifespan,
)
class EvaluateRequest(BaseModel):
    """Request schema matching CLI arguments."""
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
    use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification")
    use_doc_unwarping: bool = Field(False, description="Use document unwarping")
    textline_orientation: bool = Field(True, description="Use textline orientation classification")
    text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold")
    text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold")
    text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient")
    text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output."""
    CER: float
    WER: float
    TIME: float
    PAGES: int
    TIME_PER_PAGE: float

class HealthResponse(BaseModel):
    status: str
    model_loaded: bool
    dataset_loaded: bool
    dataset_size: Optional[int] = None
    det_model: Optional[str] = None
    rec_model: Optional[str] = None
def _normalize_box_xyxy(box):
    """Normalize bounding box to (x0, y0, x1, y1) format."""
    if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        return min(xs), min(ys), max(xs), max(ys)
    if isinstance(box, (list, tuple)):
        if len(box) == 4:
            x0, y0, x1, y1 = box
            return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
        if len(box) == 8:
            xs = box[0::2]
            ys = box[1::2]
            return min(xs), min(ys), max(xs), max(ys)
    raise ValueError(f"Unrecognized box format: {box!r}")
def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """
    Robust line grouping for PaddleOCR outputs.
    Normalizes boxes, groups by line, and returns assembled text.
    """
    boxes_all = []
    for item in paddleocr_predict:
        res = item.json.get("res", {})
        boxes = res.get("rec_boxes", []) or []
        texts = res.get("rec_texts", []) or []
        scores = res.get("rec_scores", None)
        for i, (box, text) in enumerate(zip(boxes, texts)):
            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(box)
            except Exception:
                continue
            y_mid = 0.5 * (y0 + y1)
            score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0
            t = re.sub(r"\s+", " ", str(text)).strip()
            if not t:
                continue
            boxes_all.append((x0, y0, x1, y1, y_mid, t, score))
    if min_score > 0:
        boxes_all = [b for b in boxes_all if b[6] >= min_score]
    if not boxes_all:
        return ""
    # Adaptive line tolerance
    heights = [b[3] - b[1] for b in boxes_all]
    median_h = float(np.median(heights)) if heights else 20.0
    line_tol = max(8.0, line_tol_factor * median_h)
    # Sort by vertical mid, then x0
    boxes_all.sort(key=lambda b: (b[4], b[0]))
    # Group into lines
    lines, cur, last_y = [], [], None
    for x0, y0, x1, y1, y_mid, text, score in boxes_all:
        if last_y is None or abs(y_mid - last_y) <= line_tol:
            cur.append((x0, text))
        else:
            cur.sort(key=lambda t: t[0])
            lines.append(" ".join(t[1] for t in cur))
            cur = [(x0, text)]
        last_y = y_mid
    if cur:
        cur.sort(key=lambda t: t[0])
        lines.append(" ".join(t[1] for t in cur))
    res = "\n".join(lines)
    res = re.sub(r"\s+\n", "\n", res).strip()
    return res
def evaluate_text(reference: str, prediction: str) -> dict:
    """Calculate WER and CER metrics."""
    return {"WER": wer(reference, prediction), "CER": cer(reference, prediction)}
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready."""
    return HealthResponse(
        status="ok" if state.ocr is not None else "initializing",
        model_loaded=state.ocr is not None,
        dataset_loaded=state.dataset is not None,
        dataset_size=len(state.dataset) if state.dataset else None,
        det_model=state.det_model,
        rec_model=state.rec_model,
    )
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
"""
Evaluate OCR with given hyperparameters.
Returns CER, WER, and timing metrics.
"""
if state.ocr is None:
raise HTTPException(status_code=503, detail="Model not loaded yet")
# Load or reload dataset if path changed
if state.dataset is None or state.dataset_path != request.pdf_folder:
if not os.path.isdir(request.pdf_folder):
raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
state.dataset = ImageTextDataset(request.pdf_folder)
state.dataset_path = request.pdf_folder
if len(state.dataset) == 0:
raise HTTPException(status_code=400, detail="Dataset is empty")
# Validate page range
start = request.start_page
end = min(request.end_page, len(state.dataset))
if start >= end:
raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
for idx in range(start, end):
img, ref = state.dataset[idx]
arr = np.array(img)
tp0 = time.time()
out = state.ocr.predict(
arr,
use_doc_orientation_classify=request.use_doc_orientation_classify,
use_doc_unwarping=request.use_doc_unwarping,
use_textline_orientation=request.textline_orientation,
text_det_thresh=request.text_det_thresh,
text_det_box_thresh=request.text_det_box_thresh,
text_det_unclip_ratio=request.text_det_unclip_ratio,
text_rec_score_thresh=request.text_rec_score_thresh,
)
pred = assemble_from_paddle_result(out)
time_per_page_list.append(float(time.time() - tp0))
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
return EvaluateResponse(
CER=float(np.mean(cer_list)) if cer_list else 1.0,
WER=float(np.mean(wer_list)) if wer_list else 1.0,
TIME=float(time.time() - t0),
PAGES=len(cer_list),
TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
)
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
"""Evaluate on ALL pages (ignores start_page/end_page)."""
request.start_page = 0
request.end_page = 9999 # Will be clamped to dataset size
return evaluate(request)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
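As a sanity check of the line-grouping logic, a small sketch with a stubbed result object (illustrative only; real PaddleOCR results carry more fields, but only `rec_boxes`/`rec_texts`/`rec_scores` matter here):
```python
class _FakeResult:
    """Stub mimicking the .json attribute read by assemble_from_paddle_result."""
    def __init__(self, res):
        self.json = {"res": res}

item = _FakeResult({
    "rec_boxes": [[10, 10, 120, 30], [130, 12, 220, 32], [10, 50, 200, 70]],
    "rec_texts": ["Hello", "world", "second line"],
    "rec_scores": [0.99, 0.98, 0.97],
})
# The first two boxes share a line (y-mids 20 and 22, within tolerance);
# the third starts a new line.
print(assemble_from_paddle_result([item]))  # -> "Hello world\nsecond line"
```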

View File

@@ -0,0 +1,22 @@
# PaddleOCR REST API - GPU Requirements
# Install: pip install -r requirements-gpu.txt
# PaddlePaddle (GPU version with CUDA)
# NOTE: CUDA 12.x builds may need PaddlePaddle's own package index rather than PyPI;
# see the install guide at paddlepaddle.org.cn
paddlepaddle-gpu==3.0.0
# PaddleOCR
paddleocr==3.3.2
# OCR evaluation metrics
jiwer
# Numerical computing
numpy
# REST API framework
fastapi
uvicorn[standard]
pydantic
# Image processing
Pillow

View File

@@ -0,0 +1,22 @@
# PaddleOCR REST API - CPU Requirements
# Install: pip install -r requirements.txt
# PaddlePaddle (CPU version)
paddlepaddle==3.2.2
# PaddleOCR
paddleocr==3.3.2
# OCR evaluation metrics
jiwer
# Numerical computing
numpy
# REST API framework
fastapi
uvicorn[standard]
pydantic
# Image processing (pulled by paddleocr, but explicit)
Pillow

src/paddle_ocr/test.py
View File

@@ -0,0 +1,114 @@
# test.py - Simple client to test PaddleOCR REST API
# Usage: python test.py [--url URL] [--dataset PATH]
import argparse
import requests
import time
import sys
def wait_for_health(url: str, timeout: int = 120) -> bool:
    """Wait for API to be ready."""
    health_url = f"{url}/health"
    start = time.time()
    print(f"Waiting for API at {health_url}...")
    while time.time() - start < timeout:
        try:
            resp = requests.get(health_url, timeout=5)
            if resp.status_code == 200:
                data = resp.json()
                if data.get("model_loaded"):
                    print(f"API ready! Model loaded in {time.time() - start:.1f}s")
                    return True
                print(f"  Model loading... ({time.time() - start:.0f}s)")
        except requests.exceptions.ConnectionError:
            print(f"  Connecting... ({time.time() - start:.0f}s)")
        except Exception as e:
            print(f"  Error: {e}")
        time.sleep(2)
    print("Timeout waiting for API")
    return False
def test_evaluate(url: str, config: dict) -> dict:
    """Run evaluation with given config."""
    eval_url = f"{url}/evaluate"
    print(f"\nTesting config: {config}")
    start = time.time()
    resp = requests.post(eval_url, json=config, timeout=600)
    resp.raise_for_status()
    result = resp.json()
    elapsed = time.time() - start
    print(f"Results (took {elapsed:.1f}s):")
    print(f"  CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)")
    print(f"  WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)")
    print(f"  Pages: {result['PAGES']}")
    print(f"  Time/page: {result['TIME_PER_PAGE']:.2f}s")
    return result
def main():
    parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
    parser.add_argument("--url", default="http://localhost:8000", help="API base URL")
    parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
    parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
    args = parser.parse_args()
    # Wait for API to be ready
    if not args.skip_health:
        if not wait_for_health(args.url):
            sys.exit(1)
    # Test 1: Baseline config (default PaddleOCR)
    print("\n" + "=" * 50)
    print("TEST 1: Baseline Configuration")
    print("=" * 50)
    baseline = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": False,  # Baseline: disabled
        "text_det_thresh": 0.0,
        "text_det_box_thresh": 0.0,
        "text_det_unclip_ratio": 1.5,
        "text_rec_score_thresh": 0.0,
        "start_page": 5,
        "end_page": 10,
    })
    # Test 2: Optimized config (from Ray Tune results)
    print("\n" + "=" * 50)
    print("TEST 2: Optimized Configuration")
    print("=" * 50)
    optimized = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": True,  # KEY: enabled
        "text_det_thresh": 0.4690,
        "text_det_box_thresh": 0.5412,
        "text_det_unclip_ratio": 0.0,
        "text_rec_score_thresh": 0.6350,
        "start_page": 5,
        "end_page": 10,
    })
    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0
    print(f"Baseline CER:  {baseline['CER']*100:.2f}%")
    print(f"Optimized CER: {optimized['CER']*100:.2f}%")
    print(f"Improvement:   {cer_reduction:.1f}% reduction in errors")

if __name__ == "__main__":
    main()