From c4ab0ffad1be638b28e95289b9848af6ba81d477 Mon Sep 17 00:00:00 2001 From: sergio Date: Sat, 17 Jan 2026 10:24:00 +0100 Subject: [PATCH] doceker support --- src/paddle_ocr/Dockerfile.cpu | 58 ++++ src/paddle_ocr/Dockerfile.gpu | 68 +++++ src/paddle_ocr/README.md | 329 +++++++++++++++++++++++ src/paddle_ocr/dataset_manager.py | 45 ++++ src/paddle_ocr/docker-compose.yml | 83 ++++++ src/paddle_ocr/paddle_ocr_tuning_rest.py | 263 ++++++++++++++++++ src/paddle_ocr/requirements-gpu.txt | 22 ++ src/paddle_ocr/requirements.txt | 22 ++ src/paddle_ocr/test.py | 114 ++++++++ 9 files changed, 1004 insertions(+) create mode 100644 src/paddle_ocr/Dockerfile.cpu create mode 100644 src/paddle_ocr/Dockerfile.gpu create mode 100644 src/paddle_ocr/README.md create mode 100644 src/paddle_ocr/dataset_manager.py create mode 100644 src/paddle_ocr/docker-compose.yml create mode 100644 src/paddle_ocr/paddle_ocr_tuning_rest.py create mode 100644 src/paddle_ocr/requirements-gpu.txt create mode 100644 src/paddle_ocr/requirements.txt create mode 100644 src/paddle_ocr/test.py diff --git a/src/paddle_ocr/Dockerfile.cpu b/src/paddle_ocr/Dockerfile.cpu new file mode 100644 index 0000000..f9c6bab --- /dev/null +++ b/src/paddle_ocr/Dockerfile.cpu @@ -0,0 +1,58 @@ +# Dockerfile.cpu - CPU-only PaddleOCR REST API +# Multi-arch: supports both amd64 and arm64 + +FROM python:3.11-slim + +LABEL maintainer="Sergio Jimenez" +LABEL description="PaddleOCR Tuning REST API - CPU version" + +WORKDIR /app + +# Install system dependencies for OpenCV and PaddleOCR +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgl1 \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies from requirements file +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY paddle_ocr_tuning_rest.py . +COPY dataset_manager.py . 
+ +# Build arguments for models to bake into image +ARG DET_MODEL=PP-OCRv5_server_det +ARG REC_MODEL=PP-OCRv5_server_rec + +# Set as environment variables (can be overridden at runtime) +ENV PADDLE_DET_MODEL=${DET_MODEL} +ENV PADDLE_REC_MODEL=${REC_MODEL} + +# Download models during build (not at runtime) +RUN python -c "\ +import os; \ +from paddleocr import PaddleOCR; \ +det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \ +rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \ +print(f'Downloading models: det={det}, rec={rec}'); \ +ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \ +print('Models downloaded successfully!')" + +# Volume for dataset and optional additional model cache +VOLUME ["/app/dataset", "/root/.paddlex"] + +# Expose API port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1 + +# Run the API server +CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/src/paddle_ocr/Dockerfile.gpu b/src/paddle_ocr/Dockerfile.gpu new file mode 100644 index 0000000..5c3ca27 --- /dev/null +++ b/src/paddle_ocr/Dockerfile.gpu @@ -0,0 +1,68 @@ +# Dockerfile.gpu - CUDA-enabled PaddleOCR REST API +# Supports: x86_64 with NVIDIA GPU (CUDA 12.x) +# For DGX Spark (ARM64 + CUDA): build natively on the device + +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 + +LABEL maintainer="Sergio Jimenez" +LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version" + +WORKDIR /app + +# Set environment variables +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 +ENV CUDA_VISIBLE_DEVICES=0 + +# Install Python 3.11 and system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.11 \ + python3.11-venv \ + python3-pip \ + libgl1 \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + 
libxrender1 \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/bin/python3.11 /usr/bin/python + +# Install Python dependencies from requirements file +COPY requirements-gpu.txt . +RUN pip install --no-cache-dir -r requirements-gpu.txt + +# Copy application code +COPY paddle_ocr_tuning_rest.py . +COPY dataset_manager.py . + +# Build arguments for models to bake into image +ARG DET_MODEL=PP-OCRv5_server_det +ARG REC_MODEL=PP-OCRv5_server_rec + +# Set as environment variables (can be overridden at runtime) +ENV PADDLE_DET_MODEL=${DET_MODEL} +ENV PADDLE_REC_MODEL=${REC_MODEL} + +# Download models during build (not at runtime) +RUN python -c "\ +import os; \ +from paddleocr import PaddleOCR; \ +det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \ +rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \ +print(f'Downloading models: det={det}, rec={rec}'); \ +ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \ +print('Models downloaded successfully!')" + +# Volume for dataset and optional additional model cache +VOLUME ["/app/dataset", "/root/.paddlex"] + +# Expose API port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1 + +# Run the API server +CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/src/paddle_ocr/README.md b/src/paddle_ocr/README.md new file mode 100644 index 0000000..1012a2b --- /dev/null +++ b/src/paddle_ocr/README.md @@ -0,0 +1,329 @@ +# PaddleOCR Tuning REST API + +REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search. + +## Quick Start with Docker Compose + +Docker Compose manages building and running containers. 
The `docker-compose.yml` defines two services: +- `ocr-cpu` - CPU-only version (works everywhere) +- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit) + +### Run CPU Version + +```bash +cd src/paddle_ocr + +# Build and start (first time takes ~2-3 min to build, ~30s to load model) +docker compose up ocr-cpu + +# Or run in background (detached) +docker compose up -d ocr-cpu + +# View logs +docker compose logs -f ocr-cpu + +# Stop +docker compose down +``` + +### Run GPU Version + +```bash +# Requires: NVIDIA GPU + nvidia-container-toolkit installed +docker compose up ocr-gpu +``` + +### Test the API + +Once running, test with: +```bash +# Check health +curl http://localhost:8000/health + +# Or use the test script +pip install requests +python test.py --url http://localhost:8000 +``` + +### What Docker Compose Does + +``` +docker compose up ocr-cpu + │ + ├─► Builds image from Dockerfile.cpu (if not exists) + ├─► Creates container "paddle-ocr-cpu" + ├─► Mounts ../dataset → /app/dataset (your PDF images) + ├─► Mounts paddlex-cache volume (persists downloaded models) + ├─► Exposes port 8000 + └─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000 +``` + +## Files + +| File | Description | +|------|-------------| +| `paddle_ocr_tuning_rest.py` | FastAPI REST service | +| `dataset_manager.py` | Dataset loader | +| `test.py` | API test client | +| `Dockerfile.cpu` | CPU-only image (multi-arch) | +| `Dockerfile.gpu` | GPU/CUDA image (x86_64) | +| `docker-compose.yml` | Service orchestration | + +## API Endpoints + +### `GET /health` +Check if service is ready. + +```json +{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24} +``` + +### `POST /evaluate` +Run OCR evaluation with given hyperparameters. 
+ +**Request:** +```json +{ + "pdf_folder": "/app/dataset", + "textline_orientation": true, + "use_doc_orientation_classify": false, + "use_doc_unwarping": false, + "text_det_thresh": 0.469, + "text_det_box_thresh": 0.5412, + "text_det_unclip_ratio": 0.0, + "text_rec_score_thresh": 0.635, + "start_page": 5, + "end_page": 10 +} +``` + +**Response:** +```json +{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1} +``` + +### `POST /evaluate_full` +Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page). + +## Building Images + +### CPU Image (Multi-Architecture) + +```bash +# Local build (current architecture) +docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu . + +# Multi-arch build with buildx (amd64 + arm64) +docker buildx create --name multiarch --use +docker buildx build -f Dockerfile.cpu \ + --platform linux/amd64,linux/arm64 \ + -t paddle-ocr-api:cpu \ + --push . +``` + +### GPU Image (x86_64 only) + +```bash +docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu . +``` + +## Running + +### CPU (Any machine) + +```bash +docker run -d -p 8000:8000 \ + -v $(pwd)/../dataset:/app/dataset:ro \ + -v paddlex-cache:/root/.paddlex \ + paddle-ocr-api:cpu +``` + +### GPU (NVIDIA) + +```bash +docker run -d -p 8000:8000 --gpus all \ + -v $(pwd)/../dataset:/app/dataset:ro \ + -v paddlex-cache:/root/.paddlex \ + paddle-ocr-api:gpu +``` + +## DGX Spark (ARM64 + CUDA) + +DGX Spark uses ARM64 (Grace CPU) with NVIDIA Hopper GPU. You have two options: + +### Option 1: Native ARM64 Build (Recommended) + +PaddlePaddle has ARM64 support. Build natively: + +```bash +# On DGX Spark or ARM64 machine +docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 . 
For GPU acceleration on ARM64, no base-image change is needed — the CUDA base image in `Dockerfile.gpu` is a multi-arch tag, so Docker automatically pulls the ARM64 variant: +```dockerfile +# This line in Dockerfile.gpu: +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 + +# already resolves to the ARM64 image on ARM machines: +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 +# (same multi-arch tag; the manifest list selects the arm64 variant) +```
+ +# Save and transfer to DGX Spark +docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz +scp paddle-ocr-arm64.tar.gz dgx-spark:~/ +# On DGX Spark: +docker load < paddle-ocr-arm64.tar.gz +``` + +## Using with Ray Tune + +Update your notebook's `trainable_paddle_ocr` function: + +```python +import requests + +API_URL = "http://localhost:8000/evaluate" + +def trainable_paddle_ocr(config): + """Call OCR API instead of subprocess.""" + payload = { + "pdf_folder": "/app/dataset", + "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False), + "use_doc_unwarping": config.get("use_doc_unwarping", False), + "textline_orientation": config.get("textline_orientation", True), + "text_det_thresh": config.get("text_det_thresh", 0.0), + "text_det_box_thresh": config.get("text_det_box_thresh", 0.0), + "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5), + "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0), + } + + try: + response = requests.post(API_URL, json=payload, timeout=600) + response.raise_for_status() + metrics = response.json() + tune.report(metrics=metrics) + except Exception as e: + tune.report({"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]}) +``` + +## Architecture: Model Lifecycle + +The model is loaded **once** at container startup and stays in memory for all requests: + +```mermaid +flowchart TB + subgraph Container["Docker Container Lifecycle"] + Start([Container Start]) --> Load[Load PaddleOCR Models
~10-30s one-time cost] + Load --> Ready[API Ready
Models in RAM ~500MB] + + subgraph Requests["Incoming Requests - Models Stay Loaded"] + Ready --> R1[Request 1] --> Ready + Ready --> R2[Request 2] --> Ready + Ready --> RN[Request N...] --> Ready + end + + Ready --> Stop([Container Stop]) + Stop --> Free[Models Freed] + end + + style Load fill:#f9f,stroke:#333 + style Ready fill:#9f9,stroke:#333 + style Requests fill:#e8f4ea,stroke:#090 +``` + +**Subprocess vs REST API comparison:** + +```mermaid +flowchart LR + subgraph Subprocess["❌ Subprocess Approach"] + direction TB + S1[Trial 1] --> L1[Load Model ~10s] + L1 --> E1[Evaluate ~60s] + E1 --> U1[Unload] + U1 --> S2[Trial 2] + S2 --> L2[Load Model ~10s] + L2 --> E2[Evaluate ~60s] + end + + subgraph REST["✅ REST API Approach"] + direction TB + Start2[Start Container] --> Load2[Load Model ~10s] + Load2 --> Ready2[Model in Memory] + Ready2 --> T1[Trial 1 ~60s] + T1 --> Ready2 + Ready2 --> T2[Trial 2 ~60s] + T2 --> Ready2 + Ready2 --> TN[Trial N ~60s] + end + + style L1 fill:#faa + style L2 fill:#faa + style Load2 fill:#afa + style Ready2 fill:#afa +``` + +## Performance Comparison + +| Approach | Model Load | Per-Trial Overhead | 64 Trials | +|----------|------------|-------------------|-----------| +| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours | +| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours | +| **REST API** | **Once** | **~0.1s** | **~5.8 hours** | + +The REST API saves ~1+ hour by loading the model only once. + +## Troubleshooting + +### Model download slow on first run +The first run downloads ~500MB of models. Use volume `paddlex-cache` to persist them. + +### Out of memory +Reduce `max_concurrent_trials` in Ray Tune, or increase container memory: +```bash +docker run --memory=8g ... 
+``` + +### GPU not detected +Ensure NVIDIA Container Toolkit is installed: +```bash +nvidia-smi # Should work +docker run --rm --gpus all nvidia/cuda:12.0-base nvidia-smi # Should work +``` diff --git a/src/paddle_ocr/dataset_manager.py b/src/paddle_ocr/dataset_manager.py new file mode 100644 index 0000000..2d3ccac --- /dev/null +++ b/src/paddle_ocr/dataset_manager.py @@ -0,0 +1,45 @@ +# Imports +import os +from PIL import Image + + +class ImageTextDataset: + def __init__(self, root): + self.samples = [] + + for folder in sorted(os.listdir(root)): + sub = os.path.join(root, folder) + img_dir = os.path.join(sub, "img") + txt_dir = os.path.join(sub, "txt") + + if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)): + continue + + for fname in sorted(os.listdir(img_dir)): + if not fname.lower().endswith((".png", ".jpg", ".jpeg")): + continue + + img_path = os.path.join(img_dir, fname) + + # text file must have same name but .txt + txt_name = os.path.splitext(fname)[0] + ".txt" + txt_path = os.path.join(txt_dir, txt_name) + + if not os.path.exists(txt_path): + continue + + self.samples.append((img_path, txt_path)) + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + img_path, txt_path = self.samples[idx] + + # Load image + image = Image.open(img_path).convert("RGB") + + # Load text + with open(txt_path, "r", encoding="utf-8") as f: + text = f.read() + + return image, text \ No newline at end of file diff --git a/src/paddle_ocr/docker-compose.yml b/src/paddle_ocr/docker-compose.yml new file mode 100644 index 0000000..1bbd6e0 --- /dev/null +++ b/src/paddle_ocr/docker-compose.yml @@ -0,0 +1,83 @@ +# docker-compose.yml - PaddleOCR REST API +# Usage: +# CPU: docker compose up ocr-cpu +# GPU: docker compose up ocr-gpu +# Test: docker compose run --rm test + +services: + # CPU-only service (works on any architecture) + ocr-cpu: + build: + context: . 
+ dockerfile: Dockerfile.cpu + args: + # Models to bake into image (change before building): + DET_MODEL: PP-OCRv5_server_det + REC_MODEL: PP-OCRv5_server_rec + image: paddle-ocr-api:cpu + container_name: paddle-ocr-cpu + ports: + - "8000:8000" + volumes: + - ../dataset:/app/dataset:ro # Your dataset + - paddlex-cache:/root/.paddlex # For additional models at runtime + environment: + - PYTHONUNBUFFERED=1 + # Override models at runtime (uncomment to use different models): + # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det + # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec + restart: unless-stopped + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + # GPU service (requires NVIDIA Container Toolkit) + ocr-gpu: + build: + context: . + dockerfile: Dockerfile.gpu + args: + DET_MODEL: PP-OCRv5_server_det + REC_MODEL: PP-OCRv5_server_rec + image: paddle-ocr-api:gpu + container_name: paddle-ocr-gpu + ports: + - "8000:8000" + volumes: + - ../dataset:/app/dataset:ro + - paddlex-cache:/root/.paddlex + environment: + - PYTHONUNBUFFERED=1 + - CUDA_VISIBLE_DEVICES=0 + # Override models at runtime: + # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det + # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + restart: unless-stopped + + # Test client (runs once and exits) + test: + image: python:3.11-slim + container_name: paddle-ocr-test + depends_on: + ocr-cpu: + condition: service_healthy + volumes: + - ./test.py:/app/test.py:ro + working_dir: /app + command: > + sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset" + network_mode: "service:ocr-cpu" + +volumes: + paddlex-cache: + name: paddlex-model-cache diff --git a/src/paddle_ocr/paddle_ocr_tuning_rest.py b/src/paddle_ocr/paddle_ocr_tuning_rest.py new file mode 100644 index 
0000000..9a34c78 --- /dev/null +++ b/src/paddle_ocr/paddle_ocr_tuning_rest.py @@ -0,0 +1,263 @@ +# paddle_ocr_tuning_rest.py +# FastAPI REST service for PaddleOCR hyperparameter evaluation +# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000 + +import os +import re +import time +from typing import Optional +from contextlib import asynccontextmanager + +import numpy as np +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field + +from paddleocr import PaddleOCR +from jiwer import wer, cer +from dataset_manager import ImageTextDataset + + +# Model configuration via environment variables (with defaults) +DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det") +DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec") + + +# Global state for model and dataset +class AppState: + ocr: Optional[PaddleOCR] = None + dataset: Optional[ImageTextDataset] = None + dataset_path: Optional[str] = None + det_model: str = DEFAULT_DET_MODEL + rec_model: str = DEFAULT_REC_MODEL + + +state = AppState() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Load OCR model at startup.""" + print(f"Loading PaddleOCR models...") + print(f" Detection: {state.det_model}") + print(f" Recognition: {state.rec_model}") + state.ocr = PaddleOCR( + text_detection_model_name=state.det_model, + text_recognition_model_name=state.rec_model, + ) + print("Model loaded successfully!") + yield + # Cleanup on shutdown + state.ocr = None + state.dataset = None + + +app = FastAPI( + title="PaddleOCR Tuning API", + description="REST API for OCR hyperparameter evaluation", + version="1.0.0", + lifespan=lifespan, +) + + +class EvaluateRequest(BaseModel): + """Request schema matching CLI arguments.""" + pdf_folder: str = Field("/app/dataset", description="Path to dataset folder") + use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification") + use_doc_unwarping: bool = 
Field(False, description="Use document unwarping") + textline_orientation: bool = Field(True, description="Use textline orientation classification") + text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold") + text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold") + text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient") + text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold") + start_page: int = Field(5, ge=0, description="Start page index (inclusive)") + end_page: int = Field(10, ge=1, description="End page index (exclusive)") + + +class EvaluateResponse(BaseModel): + """Response schema matching CLI output.""" + CER: float + WER: float + TIME: float + PAGES: int + TIME_PER_PAGE: float + + +class HealthResponse(BaseModel): + status: str + model_loaded: bool + dataset_loaded: bool + dataset_size: Optional[int] = None + det_model: Optional[str] = None + rec_model: Optional[str] = None + + +def _normalize_box_xyxy(box): + """Normalize bounding box to (x0, y0, x1, y1) format.""" + if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)): + xs = [p[0] for p in box] + ys = [p[1] for p in box] + return min(xs), min(ys), max(xs), max(ys) + + if isinstance(box, (list, tuple)): + if len(box) == 4: + x0, y0, x1, y1 = box + return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1) + if len(box) == 8: + xs = box[0::2] + ys = box[1::2] + return min(xs), min(ys), max(xs), max(ys) + + raise ValueError(f"Unrecognized box format: {box!r}") + + +def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6): + """ + Robust line grouping for PaddleOCR outputs. + Normalizes boxes, groups by line, and returns assembled text. 
+ """ + boxes_all = [] + for item in paddleocr_predict: + res = item.json.get("res", {}) + boxes = res.get("rec_boxes", []) or [] + texts = res.get("rec_texts", []) or [] + scores = res.get("rec_scores", None) + + for i, (box, text) in enumerate(zip(boxes, texts)): + try: + x0, y0, x1, y1 = _normalize_box_xyxy(box) + except Exception: + continue + + y_mid = 0.5 * (y0 + y1) + score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0 + + t = re.sub(r"\s+", " ", str(text)).strip() + if not t: + continue + + boxes_all.append((x0, y0, x1, y1, y_mid, t, score)) + + if min_score > 0: + boxes_all = [b for b in boxes_all if b[6] >= min_score] + + if not boxes_all: + return "" + + # Adaptive line tolerance + heights = [b[3] - b[1] for b in boxes_all] + median_h = float(np.median(heights)) if heights else 20.0 + line_tol = max(8.0, line_tol_factor * median_h) + + # Sort by vertical mid, then x0 + boxes_all.sort(key=lambda b: (b[4], b[0])) + + # Group into lines + lines, cur, last_y = [], [], None + for x0, y0, x1, y1, y_mid, text, score in boxes_all: + if last_y is None or abs(y_mid - last_y) <= line_tol: + cur.append((x0, text)) + else: + cur.sort(key=lambda t: t[0]) + lines.append(" ".join(t[1] for t in cur)) + cur = [(x0, text)] + last_y = y_mid + + if cur: + cur.sort(key=lambda t: t[0]) + lines.append(" ".join(t[1] for t in cur)) + + res = "\n".join(lines) + res = re.sub(r"\s+\n", "\n", res).strip() + return res + + +def evaluate_text(reference: str, prediction: str) -> dict: + """Calculate WER and CER metrics.""" + return {"WER": wer(reference, prediction), "CER": cer(reference, prediction)} + + +@app.get("/health", response_model=HealthResponse) +def health_check(): + """Check if the service is ready.""" + return HealthResponse( + status="ok" if state.ocr is not None else "initializing", + model_loaded=state.ocr is not None, + dataset_loaded=state.dataset is not None, + dataset_size=len(state.dataset) if state.dataset else None, + 
det_model=state.det_model, + rec_model=state.rec_model, + ) + + +@app.post("/evaluate", response_model=EvaluateResponse) +def evaluate(request: EvaluateRequest): + """ + Evaluate OCR with given hyperparameters. + Returns CER, WER, and timing metrics. + """ + if state.ocr is None: + raise HTTPException(status_code=503, detail="Model not loaded yet") + + # Load or reload dataset if path changed + if state.dataset is None or state.dataset_path != request.pdf_folder: + if not os.path.isdir(request.pdf_folder): + raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}") + state.dataset = ImageTextDataset(request.pdf_folder) + state.dataset_path = request.pdf_folder + + if len(state.dataset) == 0: + raise HTTPException(status_code=400, detail="Dataset is empty") + + # Validate page range + start = request.start_page + end = min(request.end_page, len(state.dataset)) + if start >= end: + raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}") + + cer_list, wer_list = [], [] + time_per_page_list = [] + t0 = time.time() + + for idx in range(start, end): + img, ref = state.dataset[idx] + arr = np.array(img) + + tp0 = time.time() + out = state.ocr.predict( + arr, + use_doc_orientation_classify=request.use_doc_orientation_classify, + use_doc_unwarping=request.use_doc_unwarping, + use_textline_orientation=request.textline_orientation, + text_det_thresh=request.text_det_thresh, + text_det_box_thresh=request.text_det_box_thresh, + text_det_unclip_ratio=request.text_det_unclip_ratio, + text_rec_score_thresh=request.text_rec_score_thresh, + ) + + pred = assemble_from_paddle_result(out) + time_per_page_list.append(float(time.time() - tp0)) + + m = evaluate_text(ref, pred) + cer_list.append(m["CER"]) + wer_list.append(m["WER"]) + + return EvaluateResponse( + CER=float(np.mean(cer_list)) if cer_list else 1.0, + WER=float(np.mean(wer_list)) if wer_list else 1.0, + TIME=float(time.time() - t0), + PAGES=len(cer_list), + 
TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0, + ) + + +@app.post("/evaluate_full", response_model=EvaluateResponse) +def evaluate_full(request: EvaluateRequest): + """Evaluate on ALL pages (ignores start_page/end_page).""" + request.start_page = 0 + request.end_page = 9999 # Will be clamped to dataset size + return evaluate(request) + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/src/paddle_ocr/requirements-gpu.txt b/src/paddle_ocr/requirements-gpu.txt new file mode 100644 index 0000000..56b4832 --- /dev/null +++ b/src/paddle_ocr/requirements-gpu.txt @@ -0,0 +1,22 @@ +# PaddleOCR REST API - GPU Requirements +# Install: pip install -r requirements-gpu.txt + +# PaddlePaddle (GPU version with CUDA) +paddlepaddle-gpu==3.0.0 + +# PaddleOCR +paddleocr==3.3.2 + +# OCR evaluation metrics +jiwer + +# Numerical computing +numpy + +# REST API framework +fastapi +uvicorn[standard] +pydantic + +# Image processing +Pillow diff --git a/src/paddle_ocr/requirements.txt b/src/paddle_ocr/requirements.txt new file mode 100644 index 0000000..4ea8bf7 --- /dev/null +++ b/src/paddle_ocr/requirements.txt @@ -0,0 +1,22 @@ +# PaddleOCR REST API - CPU Requirements +# Install: pip install -r requirements.txt + +# PaddlePaddle (CPU version) +paddlepaddle==3.2.2 + +# PaddleOCR +paddleocr==3.3.2 + +# OCR evaluation metrics +jiwer + +# Numerical computing +numpy + +# REST API framework +fastapi +uvicorn[standard] +pydantic + +# Image processing (pulled by paddleocr, but explicit) +Pillow diff --git a/src/paddle_ocr/test.py b/src/paddle_ocr/test.py new file mode 100644 index 0000000..544da55 --- /dev/null +++ b/src/paddle_ocr/test.py @@ -0,0 +1,114 @@ +# test.py - Simple client to test PaddleOCR REST API +# Usage: python test.py [--url URL] [--dataset PATH] + +import argparse +import requests +import time +import sys + + +def wait_for_health(url: str, timeout: int = 120) -> bool: + """Wait for API to be 
ready.""" + health_url = f"{url}/health" + start = time.time() + + print(f"Waiting for API at {health_url}...") + while time.time() - start < timeout: + try: + resp = requests.get(health_url, timeout=5) + if resp.status_code == 200: + data = resp.json() + if data.get("model_loaded"): + print(f"API ready! Model loaded in {time.time() - start:.1f}s") + return True + print(f" Model loading... ({time.time() - start:.0f}s)") + except requests.exceptions.ConnectionError: + print(f" Connecting... ({time.time() - start:.0f}s)") + except Exception as e: + print(f" Error: {e}") + time.sleep(2) + + print("Timeout waiting for API") + return False + + +def test_evaluate(url: str, config: dict) -> dict: + """Run evaluation with given config.""" + eval_url = f"{url}/evaluate" + + print(f"\nTesting config: {config}") + start = time.time() + + resp = requests.post(eval_url, json=config, timeout=600) + resp.raise_for_status() + + result = resp.json() + elapsed = time.time() - start + + print(f"Results (took {elapsed:.1f}s):") + print(f" CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)") + print(f" WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)") + print(f" Pages: {result['PAGES']}") + print(f" Time/page: {result['TIME_PER_PAGE']:.2f}s") + + return result + + +def main(): + parser = argparse.ArgumentParser(description="Test PaddleOCR REST API") + parser.add_argument("--url", default="http://localhost:8000", help="API base URL") + parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)") + parser.add_argument("--skip-health", action="store_true", help="Skip health check wait") + args = parser.parse_args() + + # Wait for API to be ready + if not args.skip_health: + if not wait_for_health(args.url): + sys.exit(1) + + # Test 1: Baseline config (default PaddleOCR) + print("\n" + "="*50) + print("TEST 1: Baseline Configuration") + print("="*50) + baseline = test_evaluate(args.url, { + "pdf_folder": args.dataset, + 
"use_doc_orientation_classify": False, + "use_doc_unwarping": False, + "textline_orientation": False, # Baseline: disabled + "text_det_thresh": 0.0, + "text_det_box_thresh": 0.0, + "text_det_unclip_ratio": 1.5, + "text_rec_score_thresh": 0.0, + "start_page": 5, + "end_page": 10, + }) + + # Test 2: Optimized config (from Ray Tune results) + print("\n" + "="*50) + print("TEST 2: Optimized Configuration") + print("="*50) + optimized = test_evaluate(args.url, { + "pdf_folder": args.dataset, + "use_doc_orientation_classify": False, + "use_doc_unwarping": False, + "textline_orientation": True, # KEY: enabled + "text_det_thresh": 0.4690, + "text_det_box_thresh": 0.5412, + "text_det_unclip_ratio": 0.0, + "text_rec_score_thresh": 0.6350, + "start_page": 5, + "end_page": 10, + }) + + # Summary + print("\n" + "="*50) + print("SUMMARY") + print("="*50) + cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0 + print(f"Baseline CER: {baseline['CER']*100:.2f}%") + print(f"Optimized CER: {optimized['CER']*100:.2f}%") + print(f"Improvement: {cer_reduction:.1f}% reduction in errors") + + +if __name__ == "__main__": + main()