PaddleOCR GPU support #4

Merged
Seryusjj merged 40 commits from gpu_support into main 2026-01-19 17:35:25 +00:00
9 changed files with 1004 additions and 0 deletions
Showing only changes of commit c4ab0ffad1

View File

@@ -0,0 +1,58 @@
# Dockerfile.cpu - CPU-only PaddleOCR REST API
# Multi-arch: supports both amd64 and arm64
FROM python:3.11-slim
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - CPU version"
WORKDIR /app
# Install system dependencies for OpenCV and PaddleOCR
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    libgomp1 \
 && rm -rf /var/lib/apt/lists/*
# Install Python dependencies from requirements file
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .
# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec
# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}
# Download models during build (not at runtime)
RUN python -c "\
import os; \
from paddleocr import PaddleOCR; \
det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
print(f'Downloading models: det={det}, rec={rec}'); \
ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
print('Models downloaded successfully!')"
# Volume for dataset and optional additional model cache
VOLUME ["/app/dataset", "/root/.paddlex"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,68 @@
# Dockerfile.gpu - CUDA-enabled PaddleOCR REST API
# Supports: x86_64 with NVIDIA GPU (CUDA 12.x)
# For DGX Spark (ARM64 + CUDA): build natively on the device
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version"
WORKDIR /app
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CUDA_VISIBLE_DEVICES=0
# Install Python 3.11 and system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.11 \
    python3.11-venv \
    python3-pip \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    libgomp1 \
 && rm -rf /var/lib/apt/lists/* \
 && ln -sf /usr/bin/python3.11 /usr/bin/python
# Install Python dependencies from requirements file
# (apt's python3-pip targets the distro default Python 3.10, so bootstrap pip for 3.11 via ensurepip)
COPY requirements-gpu.txt .
RUN python -m ensurepip --upgrade \
 && python -m pip install --no-cache-dir -r requirements-gpu.txt
# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .
# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec
# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}
# Download models during build (not at runtime)
RUN python -c "\
import os; \
from paddleocr import PaddleOCR; \
det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
print(f'Downloading models: det={det}, rec={rec}'); \
ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
print('Models downloaded successfully!')"
# Volume for dataset and optional additional model cache
VOLUME ["/app/dataset", "/root/.paddlex"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

src/paddle_ocr/README.md
View File

@@ -0,0 +1,329 @@
# PaddleOCR Tuning REST API
REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
## Quick Start with Docker Compose
Docker Compose manages building and running containers. The `docker-compose.yml` defines two services:
- `ocr-cpu` - CPU-only version (works everywhere)
- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit)
### Run CPU Version
```bash
cd src/paddle_ocr
# Build and start (first time takes ~2-3 min to build, ~30s to load model)
docker compose up ocr-cpu
# Or run in background (detached)
docker compose up -d ocr-cpu
# View logs
docker compose logs -f ocr-cpu
# Stop
docker compose down
```
### Run GPU Version
```bash
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
docker compose up ocr-gpu
```
### Test the API
Once running, test with:
```bash
# Check health
curl http://localhost:8000/health
# Or use the test script
pip install requests
python test.py --url http://localhost:8000
```
### What Docker Compose Does
```
docker compose up ocr-cpu
├─► Builds image from Dockerfile.cpu (if it doesn't already exist)
├─► Creates container "paddle-ocr-cpu"
├─► Mounts ../dataset → /app/dataset (your PDF images)
├─► Mounts paddlex-cache volume (persists downloaded models)
├─► Exposes port 8000
└─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
```
## Files
| File | Description |
|------|-------------|
| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
| `dataset_manager.py` | Dataset loader |
| `test.py` | API test client |
| `Dockerfile.cpu` | CPU-only image (multi-arch) |
| `Dockerfile.gpu` | GPU/CUDA image (x86_64) |
| `docker-compose.yml` | Service orchestration |
## API Endpoints
### `GET /health`
Check if service is ready.
```json
{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24}
```
### `POST /evaluate`
Run OCR evaluation with given hyperparameters.
**Request:**
```json
{
  "pdf_folder": "/app/dataset",
  "textline_orientation": true,
  "use_doc_orientation_classify": false,
  "use_doc_unwarping": false,
  "text_det_thresh": 0.469,
  "text_det_box_thresh": 0.5412,
  "text_det_unclip_ratio": 0.0,
  "text_rec_score_thresh": 0.635,
  "start_page": 5,
  "end_page": 10
}
```
**Response:**
```json
{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1}
```
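For example, a minimal Python client call (port and paths assume the Docker setup above; fields omitted from the payload fall back to the service defaults):
```python
import requests

# Minimal /evaluate call; unspecified hyperparameters use the API defaults
resp = requests.post(
    "http://localhost:8000/evaluate",
    json={"pdf_folder": "/app/dataset", "start_page": 5, "end_page": 10},
    timeout=600,  # evaluation can take minutes per request
)
resp.raise_for_status()
print(resp.json())  # {"CER": ..., "WER": ..., "TIME": ..., "PAGES": ..., "TIME_PER_PAGE": ...}
```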
### `POST /evaluate_full`
Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page).
## Building Images
### CPU Image (Multi-Architecture)
```bash
# Local build (current architecture)
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
# Multi-arch build with buildx (amd64 + arm64)
docker buildx create --name multiarch --use
docker buildx build -f Dockerfile.cpu \
--platform linux/amd64,linux/arm64 \
-t paddle-ocr-api:cpu \
--push .
```
### GPU Image (x86_64 only)
```bash
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
```
## Running
### CPU (Any machine)
```bash
docker run -d -p 8000:8000 \
-v $(pwd)/../dataset:/app/dataset:ro \
-v paddlex-cache:/root/.paddlex \
paddle-ocr-api:cpu
```
### GPU (NVIDIA)
```bash
docker run -d -p 8000:8000 --gpus all \
-v $(pwd)/../dataset:/app/dataset:ro \
-v paddlex-cache:/root/.paddlex \
paddle-ocr-api:gpu
```
## DGX Spark (ARM64 + CUDA)
DGX Spark pairs an ARM64 Grace CPU with an NVIDIA Blackwell GPU. You have three options:
### Option 1: Native ARM64 Build (Recommended)
PaddlePaddle has ARM64 support. Build natively:
```bash
# On DGX Spark or ARM64 machine
docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
```
For GPU acceleration on ARM64, `Dockerfile.gpu` needs no base-image change, because the CUDA base image is multi-arch:
```dockerfile
# This line in Dockerfile.gpu already works on ARM64:
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
# (the multi-arch tag resolves to the ARM64 variant when pulled on an ARM machine)
```
Then build on the DGX Spark:
```bash
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu-arm64 .
```
### Option 2: x86_64 Emulation via QEMU (Slow)
You CAN run x86_64 images on ARM via emulation, but it's ~10-20x slower:
```bash
# On DGX Spark, enable QEMU emulation
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
# Run x86_64 image with emulation
docker run --platform linux/amd64 -p 8000:8000 \
-v $(pwd)/../dataset:/app/dataset:ro \
paddle-ocr-api:cpu
```
**Not recommended** for production due to severe performance penalty.
### Option 3: Cross-compile from x86_64
Build ARM64 images from your x86_64 machine:
```bash
# Setup buildx for multi-arch
docker buildx create --name mybuilder --use
# Build ARM64 image from x86_64 machine
docker buildx build -f Dockerfile.cpu \
--platform linux/arm64 \
-t paddle-ocr-api:arm64 \
--load .
# Save and transfer to DGX Spark
docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
scp paddle-ocr-arm64.tar.gz dgx-spark:~/
# On DGX Spark:
docker load < paddle-ocr-arm64.tar.gz
```
## Using with Ray Tune
Update your notebook's `trainable_paddle_ocr` function:
```python
import requests
from ray import tune

API_URL = "http://localhost:8000/evaluate"

def trainable_paddle_ocr(config):
    """Call the OCR API instead of spawning a subprocess."""
    payload = {
        "pdf_folder": "/app/dataset",
        "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
        "use_doc_unwarping": config.get("use_doc_unwarping", False),
        "textline_orientation": config.get("textline_orientation", True),
        "text_det_thresh": config.get("text_det_thresh", 0.0),
        "text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
        "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
        "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
    }
    try:
        response = requests.post(API_URL, json=payload, timeout=600)
        response.raise_for_status()
        tune.report(response.json())  # report CER/WER/TIME metrics to Ray
    except Exception as e:
        tune.report({"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]})
```
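To wire this into a search, a minimal sketch of launching the tuner (the parameter ranges below are illustrative, not the tuned values):
```python
# Hypothetical search space around trainable_paddle_ocr; adjust ranges to taste
search_space = {
    "textline_orientation": tune.choice([True, False]),
    "text_det_thresh": tune.uniform(0.0, 0.8),
    "text_det_box_thresh": tune.uniform(0.0, 0.8),
    "text_det_unclip_ratio": tune.uniform(0.0, 2.5),
    "text_rec_score_thresh": tune.uniform(0.0, 0.8),
}
tuner = tune.Tuner(
    trainable_paddle_ocr,
    param_space=search_space,
    tune_config=tune.TuneConfig(metric="CER", mode="min", num_samples=64),
)
results = tuner.fit()
print(results.get_best_result().config)  # best hyperparameters found
```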
## Architecture: Model Lifecycle
The model is loaded **once** at container startup and stays in memory for all requests:
```mermaid
flowchart TB
subgraph Container["Docker Container Lifecycle"]
Start([Container Start]) --> Load[Load PaddleOCR Models<br/>~10-30s one-time cost]
Load --> Ready[API Ready<br/>Models in RAM ~500MB]
subgraph Requests["Incoming Requests - Models Stay Loaded"]
Ready --> R1[Request 1] --> Ready
Ready --> R2[Request 2] --> Ready
Ready --> RN[Request N...] --> Ready
end
Ready --> Stop([Container Stop])
Stop --> Free[Models Freed]
end
style Load fill:#f9f,stroke:#333
style Ready fill:#9f9,stroke:#333
style Requests fill:#e8f4ea,stroke:#090
```
**Subprocess vs REST API comparison:**
```mermaid
flowchart LR
subgraph Subprocess["❌ Subprocess Approach"]
direction TB
S1[Trial 1] --> L1[Load Model ~10s]
L1 --> E1[Evaluate ~60s]
E1 --> U1[Unload]
U1 --> S2[Trial 2]
S2 --> L2[Load Model ~10s]
L2 --> E2[Evaluate ~60s]
end
subgraph REST["✅ REST API Approach"]
direction TB
Start2[Start Container] --> Load2[Load Model ~10s]
Load2 --> Ready2[Model in Memory]
Ready2 --> T1[Trial 1 ~60s]
T1 --> Ready2
Ready2 --> T2[Trial 2 ~60s]
T2 --> Ready2
Ready2 --> TN[Trial N ~60s]
end
style L1 fill:#faa
style L2 fill:#faa
style Load2 fill:#afa
style Ready2 fill:#afa
```
## Performance Comparison
| Approach | Model Load | Per-Trial Overhead | 64 Trials |
|----------|------------|-------------------|-----------|
| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours |
| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours |
| **REST API** | **Once** | **~0.1s** | **~5.8 hours** |
Across 64 trials, the REST API saves over an hour by loading the model only once.
## Troubleshooting
### Model download slow on first run
The default models are baked into the image at build time. If you override `PADDLE_DET_MODEL`/`PADDLE_REC_MODEL` at runtime, the first request downloads the new models (~500MB); the `paddlex-cache` volume persists them across restarts.
### Out of memory
Reduce `max_concurrent_trials` in Ray Tune, or increase container memory:
```bash
docker run --memory=8g ...
```
### GPU not detected
Ensure NVIDIA Container Toolkit is installed:
```bash
nvidia-smi # Should work
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi # Should work
```
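Inside the GPU container you can also check that Paddle itself sees the GPU (a quick sanity check, assuming `paddlepaddle-gpu` is installed):
```python
import paddle

# True if this Paddle build was compiled with CUDA support
print(paddle.device.is_compiled_with_cuda())
# Runs a small end-to-end check and reports the devices Paddle can use
paddle.utils.run_check()
```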

View File

@@ -0,0 +1,45 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
    def __init__(self, root):
        self.samples = []
        for folder in sorted(os.listdir(root)):
            sub = os.path.join(root, folder)
            img_dir = os.path.join(sub, "img")
            txt_dir = os.path.join(sub, "txt")
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue
            for fname in sorted(os.listdir(img_dir)):
                if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
                    continue
                img_path = os.path.join(img_dir, fname)
                # text file must have same name but .txt
                txt_name = os.path.splitext(fname)[0] + ".txt"
                txt_path = os.path.join(txt_dir, txt_name)
                if not os.path.exists(txt_path):
                    continue
                self.samples.append((img_path, txt_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path = self.samples[idx]
        # Load image
        image = Image.open(img_path).convert("RGB")
        # Load text
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()
        return image, text
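A quick usage sketch of the loader, using the `/app/dataset` mount from the compose file (it expects `<root>/<folder>/img/*.png` paired with `<root>/<folder>/txt/*.txt`):
```python
from dataset_manager import ImageTextDataset

ds = ImageTextDataset("/app/dataset")  # dataset mount point from docker-compose.yml
print(f"{len(ds)} image/text pairs")
image, text = ds[0]  # PIL RGB image and its ground-truth text
print(image.size, text[:80])
```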

View File

@@ -0,0 +1,83 @@
# docker-compose.yml - PaddleOCR REST API
# Usage:
# CPU: docker compose up ocr-cpu
# GPU: docker compose up ocr-gpu
# Test: docker compose run --rm test
services:
  # CPU-only service (works on any architecture)
  ocr-cpu:
    build:
      context: .
      dockerfile: Dockerfile.cpu
      args:
        # Models to bake into image (change before building):
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:cpu
    container_name: paddle-ocr-cpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro   # Your dataset
      - paddlex-cache:/root/.paddlex # For additional models at runtime
    environment:
      - PYTHONUNBUFFERED=1
      # Override models at runtime (uncomment to use different models):
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU service (requires NVIDIA Container Toolkit)
  ocr-gpu:
    build:
      context: .
      dockerfile: Dockerfile.gpu
      args:
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:gpu
    container_name: paddle-ocr-gpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - paddlex-cache:/root/.paddlex
    environment:
      - PYTHONUNBUFFERED=1
      - CUDA_VISIBLE_DEVICES=0
      # Override models at runtime:
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

  # Test client (runs once and exits)
  test:
    image: python:3.11-slim
    container_name: paddle-ocr-test
    depends_on:
      ocr-cpu:
        condition: service_healthy
    volumes:
      - ./test.py:/app/test.py:ro
    working_dir: /app
    command: >
      sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset"
    network_mode: "service:ocr-cpu"

volumes:
  paddlex-cache:
    name: paddlex-model-cache

View File

@@ -0,0 +1,263 @@
# paddle_ocr_tuning_rest.py
# FastAPI REST service for PaddleOCR hyperparameter evaluation
# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
import os
import re
import time
from typing import Optional
from contextlib import asynccontextmanager
import numpy as np
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from paddleocr import PaddleOCR
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
# Model configuration via environment variables (with defaults)
DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det")
DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec")
# Global state for model and dataset
class AppState:
    ocr: Optional[PaddleOCR] = None
    dataset: Optional[ImageTextDataset] = None
    dataset_path: Optional[str] = None
    det_model: str = DEFAULT_DET_MODEL
    rec_model: str = DEFAULT_REC_MODEL

state = AppState()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load OCR model at startup."""
    print("Loading PaddleOCR models...")
    print(f"  Detection: {state.det_model}")
    print(f"  Recognition: {state.rec_model}")
    state.ocr = PaddleOCR(
        text_detection_model_name=state.det_model,
        text_recognition_model_name=state.rec_model,
    )
    print("Model loaded successfully!")
    yield
    # Cleanup on shutdown
    state.ocr = None
    state.dataset = None
app = FastAPI(
    title="PaddleOCR Tuning API",
    description="REST API for OCR hyperparameter evaluation",
    version="1.0.0",
    lifespan=lifespan,
)
class EvaluateRequest(BaseModel):
    """Request schema matching CLI arguments."""
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
    use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification")
    use_doc_unwarping: bool = Field(False, description="Use document unwarping")
    textline_orientation: bool = Field(True, description="Use textline orientation classification")
    text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold")
    text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold")
    text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient")
    text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output."""
    CER: float
    WER: float
    TIME: float
    PAGES: int
    TIME_PER_PAGE: float

class HealthResponse(BaseModel):
    status: str
    model_loaded: bool
    dataset_loaded: bool
    dataset_size: Optional[int] = None
    det_model: Optional[str] = None
    rec_model: Optional[str] = None
def _normalize_box_xyxy(box):
    """Normalize bounding box to (x0, y0, x1, y1) format."""
    if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        return min(xs), min(ys), max(xs), max(ys)
    if isinstance(box, (list, tuple)):
        if len(box) == 4:
            x0, y0, x1, y1 = box
            return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
        if len(box) == 8:
            xs = box[0::2]
            ys = box[1::2]
            return min(xs), min(ys), max(xs), max(ys)
    raise ValueError(f"Unrecognized box format: {box!r}")
def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """
    Robust line grouping for PaddleOCR outputs.
    Normalizes boxes, groups by line, and returns assembled text.
    """
    boxes_all = []
    for item in paddleocr_predict:
        res = item.json.get("res", {})
        boxes = res.get("rec_boxes", []) or []
        texts = res.get("rec_texts", []) or []
        scores = res.get("rec_scores", None)
        for i, (box, text) in enumerate(zip(boxes, texts)):
            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(box)
            except Exception:
                continue
            y_mid = 0.5 * (y0 + y1)
            score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0
            t = re.sub(r"\s+", " ", str(text)).strip()
            if not t:
                continue
            boxes_all.append((x0, y0, x1, y1, y_mid, t, score))
    if min_score > 0:
        boxes_all = [b for b in boxes_all if b[6] >= min_score]
    if not boxes_all:
        return ""
    # Adaptive line tolerance
    heights = [b[3] - b[1] for b in boxes_all]
    median_h = float(np.median(heights)) if heights else 20.0
    line_tol = max(8.0, line_tol_factor * median_h)
    # Sort by vertical mid, then x0
    boxes_all.sort(key=lambda b: (b[4], b[0]))
    # Group into lines
    lines, cur, last_y = [], [], None
    for x0, y0, x1, y1, y_mid, text, score in boxes_all:
        if last_y is None or abs(y_mid - last_y) <= line_tol:
            cur.append((x0, text))
        else:
            cur.sort(key=lambda t: t[0])
            lines.append(" ".join(t[1] for t in cur))
            cur = [(x0, text)]
        last_y = y_mid
    if cur:
        cur.sort(key=lambda t: t[0])
        lines.append(" ".join(t[1] for t in cur))
    res = "\n".join(lines)
    res = re.sub(r"\s+\n", "\n", res).strip()
    return res
def evaluate_text(reference: str, prediction: str) -> dict:
    """Calculate WER and CER metrics."""
    return {"WER": wer(reference, prediction), "CER": cer(reference, prediction)}
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready."""
    return HealthResponse(
        status="ok" if state.ocr is not None else "initializing",
        model_loaded=state.ocr is not None,
        dataset_loaded=state.dataset is not None,
        dataset_size=len(state.dataset) if state.dataset else None,
        det_model=state.det_model,
        rec_model=state.rec_model,
    )
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
"""
Evaluate OCR with given hyperparameters.
Returns CER, WER, and timing metrics.
"""
if state.ocr is None:
raise HTTPException(status_code=503, detail="Model not loaded yet")
# Load or reload dataset if path changed
if state.dataset is None or state.dataset_path != request.pdf_folder:
if not os.path.isdir(request.pdf_folder):
raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
state.dataset = ImageTextDataset(request.pdf_folder)
state.dataset_path = request.pdf_folder
if len(state.dataset) == 0:
raise HTTPException(status_code=400, detail="Dataset is empty")
# Validate page range
start = request.start_page
end = min(request.end_page, len(state.dataset))
if start >= end:
raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
for idx in range(start, end):
img, ref = state.dataset[idx]
arr = np.array(img)
tp0 = time.time()
out = state.ocr.predict(
arr,
use_doc_orientation_classify=request.use_doc_orientation_classify,
use_doc_unwarping=request.use_doc_unwarping,
use_textline_orientation=request.textline_orientation,
text_det_thresh=request.text_det_thresh,
text_det_box_thresh=request.text_det_box_thresh,
text_det_unclip_ratio=request.text_det_unclip_ratio,
text_rec_score_thresh=request.text_rec_score_thresh,
)
pred = assemble_from_paddle_result(out)
time_per_page_list.append(float(time.time() - tp0))
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
return EvaluateResponse(
CER=float(np.mean(cer_list)) if cer_list else 1.0,
WER=float(np.mean(wer_list)) if wer_list else 1.0,
TIME=float(time.time() - t0),
PAGES=len(cer_list),
TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
)
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
"""Evaluate on ALL pages (ignores start_page/end_page)."""
request.start_page = 0
request.end_page = 9999 # Will be clamped to dataset size
return evaluate(request)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
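As a sanity check of the line-grouping logic, a small sketch with a stubbed result object (illustrative only; real PaddleOCR results carry more fields, but only `rec_boxes`/`rec_texts`/`rec_scores` matter here):
```python
class _FakeResult:
    """Stub mimicking the .json attribute read by assemble_from_paddle_result."""
    def __init__(self, res):
        self.json = {"res": res}

item = _FakeResult({
    "rec_boxes": [[10, 10, 120, 30], [130, 12, 220, 32], [10, 50, 200, 70]],
    "rec_texts": ["Hello", "world", "second line"],
    "rec_scores": [0.99, 0.98, 0.97],
})
# The first two boxes share a line (y-mids 20 and 22, within tolerance);
# the third starts a new line.
print(assemble_from_paddle_result([item]))  # -> "Hello world\nsecond line"
```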

View File

@@ -0,0 +1,22 @@
# PaddleOCR REST API - GPU Requirements
# Install: pip install -r requirements-gpu.txt
# PaddlePaddle (GPU version with CUDA)
# NOTE: CUDA 12.x builds may need PaddlePaddle's own package index rather than PyPI;
# see the install guide at paddlepaddle.org.cn
paddlepaddle-gpu==3.0.0
# PaddleOCR
paddleocr==3.3.2
# OCR evaluation metrics
jiwer
# Numerical computing
numpy
# REST API framework
fastapi
uvicorn[standard]
pydantic
# Image processing
Pillow

View File

@@ -0,0 +1,22 @@
# PaddleOCR REST API - CPU Requirements
# Install: pip install -r requirements.txt
# PaddlePaddle (CPU version)
paddlepaddle==3.2.2
# PaddleOCR
paddleocr==3.3.2
# OCR evaluation metrics
jiwer
# Numerical computing
numpy
# REST API framework
fastapi
uvicorn[standard]
pydantic
# Image processing (pulled by paddleocr, but explicit)
Pillow

src/paddle_ocr/test.py
View File

@@ -0,0 +1,114 @@
# test.py - Simple client to test PaddleOCR REST API
# Usage: python test.py [--url URL] [--dataset PATH]
import argparse
import requests
import time
import sys
def wait_for_health(url: str, timeout: int = 120) -> bool:
    """Wait for API to be ready."""
    health_url = f"{url}/health"
    start = time.time()
    print(f"Waiting for API at {health_url}...")
    while time.time() - start < timeout:
        try:
            resp = requests.get(health_url, timeout=5)
            if resp.status_code == 200:
                data = resp.json()
                if data.get("model_loaded"):
                    print(f"API ready! Model loaded in {time.time() - start:.1f}s")
                    return True
                print(f"  Model loading... ({time.time() - start:.0f}s)")
        except requests.exceptions.ConnectionError:
            print(f"  Connecting... ({time.time() - start:.0f}s)")
        except Exception as e:
            print(f"  Error: {e}")
        time.sleep(2)
    print("Timeout waiting for API")
    return False
def test_evaluate(url: str, config: dict) -> dict:
    """Run evaluation with given config."""
    eval_url = f"{url}/evaluate"
    print(f"\nTesting config: {config}")
    start = time.time()
    resp = requests.post(eval_url, json=config, timeout=600)
    resp.raise_for_status()
    result = resp.json()
    elapsed = time.time() - start
    print(f"Results (took {elapsed:.1f}s):")
    print(f"  CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)")
    print(f"  WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)")
    print(f"  Pages: {result['PAGES']}")
    print(f"  Time/page: {result['TIME_PER_PAGE']:.2f}s")
    return result
def main():
    parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
    parser.add_argument("--url", default="http://localhost:8000", help="API base URL")
    parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
    parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
    args = parser.parse_args()
    # Wait for API to be ready
    if not args.skip_health:
        if not wait_for_health(args.url):
            sys.exit(1)
    # Test 1: Baseline config (default PaddleOCR)
    print("\n" + "=" * 50)
    print("TEST 1: Baseline Configuration")
    print("=" * 50)
    baseline = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": False,  # Baseline: disabled
        "text_det_thresh": 0.0,
        "text_det_box_thresh": 0.0,
        "text_det_unclip_ratio": 1.5,
        "text_rec_score_thresh": 0.0,
        "start_page": 5,
        "end_page": 10,
    })
    # Test 2: Optimized config (from Ray Tune results)
    print("\n" + "=" * 50)
    print("TEST 2: Optimized Configuration")
    print("=" * 50)
    optimized = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": True,  # KEY: enabled
        "text_det_thresh": 0.4690,
        "text_det_box_thresh": 0.5412,
        "text_det_unclip_ratio": 0.0,
        "text_rec_score_thresh": 0.6350,
        "start_page": 5,
        "end_page": 10,
    })
    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0
    print(f"Baseline CER:  {baseline['CER']*100:.2f}%")
    print(f"Optimized CER: {optimized['CER']*100:.2f}%")
    print(f"Improvement:   {cer_reduction:.1f}% reduction in errors")

if __name__ == "__main__":
    main()