Paddle ocr gpu support. #4
58
src/paddle_ocr/Dockerfile.cpu
Normal file
58
src/paddle_ocr/Dockerfile.cpu
Normal file
@@ -0,0 +1,58 @@
|
||||
# Dockerfile.cpu - CPU-only PaddleOCR REST API
# Multi-arch: supports both amd64 and arm64

FROM python:3.11-slim

LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - CPU version"

WORKDIR /app

# Parity with the GPU image: unbuffered stdout so `docker logs` streams
# output in real time.
ENV PYTHONUNBUFFERED=1

# Install system dependencies for OpenCV and PaddleOCR.
# update+install in one layer (avoids the stale apt-cache bug) and the list
# cleanup happens in the same layer so it does not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    libsm6 \
    libxext6 \
    libxrender1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first (own layer) so application-code edits
# do not invalidate the dependency cache.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .

# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec

# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}

# Download models during build (not at runtime) so container start is fast
RUN python -c "\
import os; \
from paddleocr import PaddleOCR; \
det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
print(f'Downloading models: det={det}, rec={rec}'); \
ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
print('Models downloaded successfully!')"

# Volume for dataset and optional additional model cache.
# Declared AFTER the model download above so the baked-in models persist
# in the image layer.
VOLUME ["/app/dataset", "/root/.paddlex"]

# Expose API port (documentation only; publish with -p / compose)
EXPOSE 8000

# Health check: probe the app's own /health endpoint using the stdlib
# (python:slim ships no curl/wget).
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the API server (exec form: uvicorn is PID 1 and receives SIGTERM)
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
68
src/paddle_ocr/Dockerfile.gpu
Normal file
68
src/paddle_ocr/Dockerfile.gpu
Normal file
@@ -0,0 +1,68 @@
|
||||
# Dockerfile.gpu - CUDA-enabled PaddleOCR REST API
# Supports: x86_64 with NVIDIA GPU (CUDA 12.x)
# For DGX Spark (ARM64 + CUDA): build natively on the device

FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version"

WORKDIR /app

# Runtime environment. DEBIAN_FRONTEND is deliberately NOT baked in here:
# it is a build-time-only knob, set inline on the apt RUN below.
ENV PYTHONUNBUFFERED=1
ENV CUDA_VISIBLE_DEVICES=0

# Install Python 3.11 and system dependencies.
#
# BUGFIX: Ubuntu 22.04's python3-pip targets the distro python3.10, so a
# bare `pip install` would place packages where python3.11 cannot import
# them and the model-download step below would fail. Instead, bootstrap
# pip for python3.11 with ensurepip (shipped via python3.11-venv) and
# always invoke it as `python -m pip`.
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    libsm6 \
    libxext6 \
    libxrender1 \
    python3.11 \
    python3.11-venv \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.11 /usr/bin/python \
    && python -m ensurepip --upgrade

# Install Python dependencies first (own layer) so application-code edits
# do not invalidate the dependency cache.
COPY requirements-gpu.txt .
RUN python -m pip install --no-cache-dir -r requirements-gpu.txt

# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .

# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec

# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}

# Download models during build (not at runtime) so container start is fast
RUN python -c "\
import os; \
from paddleocr import PaddleOCR; \
det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det'); \
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec'); \
print(f'Downloading models: det={det}, rec={rec}'); \
ocr = PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec); \
print('Models downloaded successfully!')"

# Volume for dataset and optional additional model cache.
# Declared AFTER the model download so the baked-in models persist.
VOLUME ["/app/dataset", "/root/.paddlex"]

# Expose API port (documentation only; publish with -p / compose)
EXPOSE 8000

# Health check: probe the app's own /health endpoint using the stdlib
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the API server (exec form: uvicorn is PID 1 and receives SIGTERM)
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
329
src/paddle_ocr/README.md
Normal file
329
src/paddle_ocr/README.md
Normal file
@@ -0,0 +1,329 @@
|
||||
# PaddleOCR Tuning REST API
|
||||
|
||||
REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
|
||||
|
||||
## Quick Start with Docker Compose
|
||||
|
||||
Docker Compose manages building and running containers. The `docker-compose.yml` defines two services:
|
||||
- `ocr-cpu` - CPU-only version (works everywhere)
|
||||
- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit)
|
||||
|
||||
### Run CPU Version
|
||||
|
||||
```bash
|
||||
cd src/paddle_ocr
|
||||
|
||||
# Build and start (first time takes ~2-3 min to build, ~30s to load model)
|
||||
docker compose up ocr-cpu
|
||||
|
||||
# Or run in background (detached)
|
||||
docker compose up -d ocr-cpu
|
||||
|
||||
# View logs
|
||||
docker compose logs -f ocr-cpu
|
||||
|
||||
# Stop
|
||||
docker compose down
|
||||
```
|
||||
|
||||
### Run GPU Version
|
||||
|
||||
```bash
|
||||
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
|
||||
docker compose up ocr-gpu
|
||||
```
|
||||
|
||||
### Test the API
|
||||
|
||||
Once running, test with:
|
||||
```bash
|
||||
# Check health
|
||||
curl http://localhost:8000/health
|
||||
|
||||
# Or use the test script
|
||||
pip install requests
|
||||
python test.py --url http://localhost:8000
|
||||
```
|
||||
|
||||
### What Docker Compose Does
|
||||
|
||||
```
|
||||
docker compose up ocr-cpu
|
||||
│
|
||||
├─► Builds image from Dockerfile.cpu (if not exists)
|
||||
├─► Creates container "paddle-ocr-cpu"
|
||||
├─► Mounts ../dataset → /app/dataset (your PDF images)
|
||||
├─► Mounts paddlex-cache volume (persists downloaded models)
|
||||
├─► Exposes port 8000
|
||||
└─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
## Files
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
|
||||
| `dataset_manager.py` | Dataset loader |
|
||||
| `test.py` | API test client |
|
||||
| `Dockerfile.cpu` | CPU-only image (multi-arch) |
|
||||
| `Dockerfile.gpu` | GPU/CUDA image (x86_64) |
|
||||
| `docker-compose.yml` | Service orchestration |
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### `GET /health`
|
||||
Check if service is ready.
|
||||
|
||||
```json
|
||||
{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24}
|
||||
```
|
||||
|
||||
### `POST /evaluate`
|
||||
Run OCR evaluation with given hyperparameters.
|
||||
|
||||
**Request:**
|
||||
```json
|
||||
{
|
||||
"pdf_folder": "/app/dataset",
|
||||
"textline_orientation": true,
|
||||
"use_doc_orientation_classify": false,
|
||||
"use_doc_unwarping": false,
|
||||
"text_det_thresh": 0.469,
|
||||
"text_det_box_thresh": 0.5412,
|
||||
"text_det_unclip_ratio": 0.0,
|
||||
"text_rec_score_thresh": 0.635,
|
||||
"start_page": 5,
|
||||
"end_page": 10
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1}
|
||||
```
|
||||
|
||||
### `POST /evaluate_full`
|
||||
Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page).
|
||||
|
||||
## Building Images
|
||||
|
||||
### CPU Image (Multi-Architecture)
|
||||
|
||||
```bash
|
||||
# Local build (current architecture)
|
||||
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
|
||||
|
||||
# Multi-arch build with buildx (amd64 + arm64)
|
||||
docker buildx create --name multiarch --use
|
||||
docker buildx build -f Dockerfile.cpu \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
-t paddle-ocr-api:cpu \
|
||||
--push .
|
||||
```
|
||||
|
||||
### GPU Image (x86_64 only)
|
||||
|
||||
```bash
|
||||
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
### CPU (Any machine)
|
||||
|
||||
```bash
|
||||
docker run -d -p 8000:8000 \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
-v paddlex-cache:/root/.paddlex \
|
||||
paddle-ocr-api:cpu
|
||||
```
|
||||
|
||||
### GPU (NVIDIA)
|
||||
|
||||
```bash
|
||||
docker run -d -p 8000:8000 --gpus all \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
-v paddlex-cache:/root/.paddlex \
|
||||
paddle-ocr-api:gpu
|
||||
```
|
||||
|
||||
## DGX Spark (ARM64 + CUDA)
|
||||
|
||||
DGX Spark uses ARM64 (Grace CPU) with NVIDIA Hopper GPU. You have two options:
|
||||
|
||||
### Option 1: Native ARM64 Build (Recommended)
|
||||
|
||||
PaddlePaddle has ARM64 support. Build natively:
|
||||
|
||||
```bash
|
||||
# On DGX Spark or ARM64 machine
|
||||
docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
|
||||
```
|
||||
|
||||
For GPU acceleration on ARM64, no change to `Dockerfile.gpu` is required: the
`nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04` base image is multi-arch, so
pulling it on an ARM64 machine automatically selects the ARM64 variant. Simply
build `Dockerfile.gpu` natively on the device (next step).
|
||||
|
||||
Then build on the DGX Spark:
|
||||
```bash
|
||||
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu-arm64 .
|
||||
```
|
||||
|
||||
### Option 2: x86_64 Emulation via QEMU (Slow)
|
||||
|
||||
You CAN run x86_64 images on ARM via emulation, but it's ~10-20x slower:
|
||||
|
||||
```bash
|
||||
# On DGX Spark, enable QEMU emulation
|
||||
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||
|
||||
# Run x86_64 image with emulation
|
||||
docker run --platform linux/amd64 -p 8000:8000 \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
paddle-ocr-api:cpu
|
||||
```
|
||||
|
||||
**Not recommended** for production due to severe performance penalty.
|
||||
|
||||
### Option 3: Cross-compile from x86_64
|
||||
|
||||
Build ARM64 images from your x86_64 machine:
|
||||
|
||||
```bash
|
||||
# Setup buildx for multi-arch
|
||||
docker buildx create --name mybuilder --use
|
||||
|
||||
# Build ARM64 image from x86_64 machine
|
||||
docker buildx build -f Dockerfile.cpu \
|
||||
--platform linux/arm64 \
|
||||
-t paddle-ocr-api:arm64 \
|
||||
--load .
|
||||
|
||||
# Save and transfer to DGX Spark
|
||||
docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
|
||||
scp paddle-ocr-arm64.tar.gz dgx-spark:~/
|
||||
# On DGX Spark:
|
||||
docker load < paddle-ocr-arm64.tar.gz
|
||||
```
|
||||
|
||||
## Using with Ray Tune
|
||||
|
||||
Update your notebook's `trainable_paddle_ocr` function:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
API_URL = "http://localhost:8000/evaluate"
|
||||
|
||||
def trainable_paddle_ocr(config):
|
||||
"""Call OCR API instead of subprocess."""
|
||||
payload = {
|
||||
"pdf_folder": "/app/dataset",
|
||||
"use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
|
||||
"use_doc_unwarping": config.get("use_doc_unwarping", False),
|
||||
"textline_orientation": config.get("textline_orientation", True),
|
||||
"text_det_thresh": config.get("text_det_thresh", 0.0),
|
||||
"text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
|
||||
"text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
|
||||
"text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(API_URL, json=payload, timeout=600)
|
||||
response.raise_for_status()
|
||||
metrics = response.json()
|
||||
tune.report(metrics=metrics)
|
||||
except Exception as e:
|
||||
tune.report(metrics={"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]})
|
||||
```
|
||||
|
||||
## Architecture: Model Lifecycle
|
||||
|
||||
The model is loaded **once** at container startup and stays in memory for all requests:
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph Container["Docker Container Lifecycle"]
|
||||
Start([Container Start]) --> Load[Load PaddleOCR Models<br/>~10-30s one-time cost]
|
||||
Load --> Ready[API Ready<br/>Models in RAM ~500MB]
|
||||
|
||||
subgraph Requests["Incoming Requests - Models Stay Loaded"]
|
||||
Ready --> R1[Request 1] --> Ready
|
||||
Ready --> R2[Request 2] --> Ready
|
||||
Ready --> RN[Request N...] --> Ready
|
||||
end
|
||||
|
||||
Ready --> Stop([Container Stop])
|
||||
Stop --> Free[Models Freed]
|
||||
end
|
||||
|
||||
style Load fill:#f9f,stroke:#333
|
||||
style Ready fill:#9f9,stroke:#333
|
||||
style Requests fill:#e8f4ea,stroke:#090
|
||||
```
|
||||
|
||||
**Subprocess vs REST API comparison:**
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph Subprocess["❌ Subprocess Approach"]
|
||||
direction TB
|
||||
S1[Trial 1] --> L1[Load Model ~10s]
|
||||
L1 --> E1[Evaluate ~60s]
|
||||
E1 --> U1[Unload]
|
||||
U1 --> S2[Trial 2]
|
||||
S2 --> L2[Load Model ~10s]
|
||||
L2 --> E2[Evaluate ~60s]
|
||||
end
|
||||
|
||||
subgraph REST["✅ REST API Approach"]
|
||||
direction TB
|
||||
Start2[Start Container] --> Load2[Load Model ~10s]
|
||||
Load2 --> Ready2[Model in Memory]
|
||||
Ready2 --> T1[Trial 1 ~60s]
|
||||
T1 --> Ready2
|
||||
Ready2 --> T2[Trial 2 ~60s]
|
||||
T2 --> Ready2
|
||||
Ready2 --> TN[Trial N ~60s]
|
||||
end
|
||||
|
||||
style L1 fill:#faa
|
||||
style L2 fill:#faa
|
||||
style Load2 fill:#afa
|
||||
style Ready2 fill:#afa
|
||||
```
|
||||
|
||||
## Performance Comparison
|
||||
|
||||
| Approach | Model Load | Per-Trial Overhead | 64 Trials |
|
||||
|----------|------------|-------------------|-----------|
|
||||
| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours |
|
||||
| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours |
|
||||
| **REST API** | **Once** | **~0.1s** | **~5.8 hours** |
|
||||
|
||||
The REST API saves ~1+ hour by loading the model only once.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Model download slow on first run
|
||||
The first run downloads ~500MB of models. Use volume `paddlex-cache` to persist them.
|
||||
|
||||
### Out of memory
|
||||
Reduce `max_concurrent_trials` in Ray Tune, or increase container memory:
|
||||
```bash
|
||||
docker run --memory=8g ...
|
||||
```
|
||||
|
||||
### GPU not detected
|
||||
Ensure NVIDIA Container Toolkit is installed:
|
||||
```bash
|
||||
nvidia-smi  # Should work
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi  # Should work
```
|
||||
45
src/paddle_ocr/dataset_manager.py
Normal file
45
src/paddle_ocr/dataset_manager.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# Imports
|
||||
import os
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class ImageTextDataset:
    """Pairs page images with their ground-truth transcription files.

    Expects ``root`` to contain sub-folders, each holding an ``img/`` and a
    ``txt/`` directory; an image ``img/<name>.<ext>`` is paired with
    ``txt/<name>.txt``. Images lacking a matching text file are skipped.
    Samples are collected in deterministic (sorted) order.
    """

    def __init__(self, root):
        # List of (image_path, text_path) tuples.
        self.samples = []

        for entry in sorted(os.listdir(root)):
            img_dir = os.path.join(root, entry, "img")
            txt_dir = os.path.join(root, entry, "txt")

            # Skip anything that is not a properly structured sub-folder.
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue

            for image_name in sorted(os.listdir(img_dir)):
                # Only accept common raster formats (case-insensitive).
                if not image_name.lower().endswith((".png", ".jpg", ".jpeg")):
                    continue

                # Ground truth must share the stem, with a .txt extension.
                stem = os.path.splitext(image_name)[0]
                text_path = os.path.join(txt_dir, stem + ".txt")
                if not os.path.exists(text_path):
                    continue

                self.samples.append((os.path.join(img_dir, image_name), text_path))

    def __len__(self):
        """Number of (image, text) pairs discovered."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (PIL RGB image, ground-truth text) for sample ``idx``."""
        image_path, text_path = self.samples[idx]

        picture = Image.open(image_path).convert("RGB")

        with open(text_path, "r", encoding="utf-8") as handle:
            transcript = handle.read()

        return picture, transcript
|
||||
83
src/paddle_ocr/docker-compose.yml
Normal file
83
src/paddle_ocr/docker-compose.yml
Normal file
@@ -0,0 +1,83 @@
|
||||
# docker-compose.yml - PaddleOCR REST API
# Usage:
#   CPU: docker compose up ocr-cpu
#   GPU: docker compose up ocr-gpu
#   Test: docker compose run --rm test
# NOTE: ocr-cpu and ocr-gpu both publish host port 8000 - start only one at a time.

services:
  # CPU-only service (works on any architecture)
  ocr-cpu:
    build:
      context: .
      dockerfile: Dockerfile.cpu
      args:
        # Models to bake into image (change before building):
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:cpu
    container_name: paddle-ocr-cpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro  # Your dataset (read-only inside the container)
      - paddlex-cache:/root/.paddlex  # For additional models at runtime
    environment:
      - PYTHONUNBUFFERED=1
      # Override models at runtime (uncomment to use different models):
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    restart: unless-stopped
    # Mirrors the image's HEALTHCHECK; used by the `test` service's depends_on.
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU service (requires NVIDIA Container Toolkit)
  ocr-gpu:
    build:
      context: .
      dockerfile: Dockerfile.gpu
      args:
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:gpu
    container_name: paddle-ocr-gpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - paddlex-cache:/root/.paddlex
    environment:
      - PYTHONUNBUFFERED=1
      - CUDA_VISIBLE_DEVICES=0
      # Override models at runtime:
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    # Reserve one NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

  # Test client (runs once and exits)
  test:
    image: python:3.11-slim
    container_name: paddle-ocr-test
    # Only starts after ocr-cpu's healthcheck reports healthy.
    depends_on:
      ocr-cpu:
        condition: service_healthy
    volumes:
      - ./test.py:/app/test.py:ro
    working_dir: /app
    command: >
      sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset"
    # Shares ocr-cpu's network namespace, so the API is reachable without
    # publishing any extra ports.
    network_mode: "service:ocr-cpu"

volumes:
  paddlex-cache:
    name: paddlex-model-cache
|
||||
263
src/paddle_ocr/paddle_ocr_tuning_rest.py
Normal file
263
src/paddle_ocr/paddle_ocr_tuning_rest.py
Normal file
@@ -0,0 +1,263 @@
|
||||
# paddle_ocr_tuning_rest.py
|
||||
# FastAPI REST service for PaddleOCR hyperparameter evaluation
|
||||
# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import numpy as np
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from paddleocr import PaddleOCR
|
||||
from jiwer import wer, cer
|
||||
from dataset_manager import ImageTextDataset
|
||||
|
||||
|
||||
# Model configuration via environment variables (with defaults)
|
||||
DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det")
|
||||
DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec")
|
||||
|
||||
|
||||
# Global state for model and dataset
|
||||
class AppState:
    """Process-wide singleton holding the loaded model and cached dataset.

    Populated by the FastAPI lifespan hook (model) and lazily by /evaluate
    (dataset). Class attributes serve as defaults for the single instance.
    """
    ocr: Optional[PaddleOCR] = None             # loaded once at startup by lifespan()
    dataset: Optional[ImageTextDataset] = None  # (re)built when pdf_folder changes
    dataset_path: Optional[str] = None          # folder the current dataset was built from
    det_model: str = DEFAULT_DET_MODEL          # detection model name (env-configurable)
    rec_model: str = DEFAULT_REC_MODEL          # recognition model name (env-configurable)
|
||||
|
||||
|
||||
state = AppState()
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load OCR model at startup; release references on shutdown.

    Loading once here (instead of per request) is the entire point of the
    service: the model stays in memory across all evaluations.
    """
    print(f"Loading PaddleOCR models...")
    print(f"  Detection: {state.det_model}")
    print(f"  Recognition: {state.rec_model}")
    # Blocking call; /health reports "initializing" until this completes.
    state.ocr = PaddleOCR(
        text_detection_model_name=state.det_model,
        text_recognition_model_name=state.rec_model,
    )
    print("Model loaded successfully!")
    yield
    # Cleanup on shutdown: drop references so memory can be reclaimed.
    state.ocr = None
    state.dataset = None
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="PaddleOCR Tuning API",
|
||||
description="REST API for OCR hyperparameter evaluation",
|
||||
version="1.0.0",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
|
||||
class EvaluateRequest(BaseModel):
    """Request schema matching CLI arguments.

    Page range is [start_page, end_page) and is clamped to the dataset size
    by the /evaluate handler. Threshold defaults of 0.0 presumably mean
    "no filtering" — TODO confirm against PaddleOCR's parameter docs.
    """
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
    use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification")
    use_doc_unwarping: bool = Field(False, description="Use document unwarping")
    textline_orientation: bool = Field(True, description="Use textline orientation classification")
    text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold")
    text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold")
    text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient")
    text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
|
||||
|
||||
|
||||
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output."""
    CER: float            # mean character error rate over evaluated pages (0.0 = perfect)
    WER: float            # mean word error rate over evaluated pages
    TIME: float           # total wall-clock seconds for the whole evaluation
    PAGES: int            # number of pages actually evaluated
    TIME_PER_PAGE: float  # mean OCR seconds per page
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
    """Schema for GET /health."""
    status: str                        # "ok" once the model is loaded, else "initializing"
    model_loaded: bool                 # True after the lifespan hook finishes
    dataset_loaded: bool               # True after the first successful /evaluate
    dataset_size: Optional[int] = None # sample count, None until dataset loads
    det_model: Optional[str] = None    # configured detection model name
    rec_model: Optional[str] = None    # configured recognition model name
|
||||
|
||||
|
||||
def _normalize_box_xyxy(box):
|
||||
"""Normalize bounding box to (x0, y0, x1, y1) format."""
|
||||
if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
|
||||
xs = [p[0] for p in box]
|
||||
ys = [p[1] for p in box]
|
||||
return min(xs), min(ys), max(xs), max(ys)
|
||||
|
||||
if isinstance(box, (list, tuple)):
|
||||
if len(box) == 4:
|
||||
x0, y0, x1, y1 = box
|
||||
return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
|
||||
if len(box) == 8:
|
||||
xs = box[0::2]
|
||||
ys = box[1::2]
|
||||
return min(xs), min(ys), max(xs), max(ys)
|
||||
|
||||
raise ValueError(f"Unrecognized box format: {box!r}")
|
||||
|
||||
|
||||
def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """
    Robust line grouping for PaddleOCR outputs.
    Normalizes boxes, groups by line, and returns assembled text.

    Args:
        paddleocr_predict: iterable of PaddleOCR result items; each item's
            ``.json`` is expected to hold a "res" dict carrying "rec_boxes",
            "rec_texts" and optionally "rec_scores" — TODO confirm shape
            against the installed paddleocr version.
        min_score: drop text fragments whose recognition score is below this.
        line_tol_factor: fraction of the median box height used as the
            vertical tolerance when deciding two boxes share a line.

    Returns:
        Assembled text, one visual line per output line (joined by newlines);
        empty string when no usable boxes remain.
    """
    # Flatten all items into (x0, y0, x1, y1, y_mid, text, score) tuples.
    boxes_all = []
    for item in paddleocr_predict:
        res = item.json.get("res", {})
        boxes = res.get("rec_boxes", []) or []
        texts = res.get("rec_texts", []) or []
        scores = res.get("rec_scores", None)

        for i, (box, text) in enumerate(zip(boxes, texts)):
            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(box)
            except Exception:
                # Skip boxes in a layout we cannot parse.
                continue

            y_mid = 0.5 * (y0 + y1)
            # Missing or short score list -> assume full confidence.
            score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0

            # Collapse internal whitespace; drop empty fragments.
            t = re.sub(r"\s+", " ", str(text)).strip()
            if not t:
                continue

            boxes_all.append((x0, y0, x1, y1, y_mid, t, score))

    if min_score > 0:
        boxes_all = [b for b in boxes_all if b[6] >= min_score]

    if not boxes_all:
        return ""

    # Adaptive line tolerance: scales with the median box height, floored at
    # 8px so very small fonts do not fragment into many lines.
    heights = [b[3] - b[1] for b in boxes_all]
    median_h = float(np.median(heights)) if heights else 20.0
    line_tol = max(8.0, line_tol_factor * median_h)

    # Sort by vertical mid, then x0 (top-to-bottom, left-to-right).
    boxes_all.sort(key=lambda b: (b[4], b[0]))

    # Group into lines: a box extends the current line when its vertical
    # middle is within line_tol of the PREVIOUS box's middle.
    lines, cur, last_y = [], [], None
    for x0, y0, x1, y1, y_mid, text, score in boxes_all:
        if last_y is None or abs(y_mid - last_y) <= line_tol:
            cur.append((x0, text))
        else:
            # Close the current line: order fragments left-to-right.
            cur.sort(key=lambda t: t[0])
            lines.append(" ".join(t[1] for t in cur))
            cur = [(x0, text)]
        last_y = y_mid

    # Flush the trailing line.
    if cur:
        cur.sort(key=lambda t: t[0])
        lines.append(" ".join(t[1] for t in cur))

    res = "\n".join(lines)
    # Strip trailing whitespace before newlines and surrounding whitespace.
    res = re.sub(r"\s+\n", "\n", res).strip()
    return res
|
||||
|
||||
|
||||
def evaluate_text(reference: str, prediction: str) -> dict:
    """Score an OCR prediction against its ground truth.

    Args:
        reference: Ground-truth text.
        prediction: OCR output to score.

    Returns:
        Dict with "WER" and "CER" keys (jiwer metrics; 0.0 is a perfect match).
    """
    word_error = wer(reference, prediction)
    char_error = cer(reference, prediction)
    return {"WER": word_error, "CER": char_error}
|
||||
|
||||
|
||||
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready.

    Reports model/dataset load state plus the configured model names; used
    by the Docker HEALTHCHECK and the compose `test` service.
    """
    return HealthResponse(
        # "initializing" until the lifespan hook finishes loading the model.
        status="ok" if state.ocr is not None else "initializing",
        model_loaded=state.ocr is not None,
        dataset_loaded=state.dataset is not None,
        # Dataset is loaded lazily by the first /evaluate call.
        dataset_size=len(state.dataset) if state.dataset else None,
        det_model=state.det_model,
        rec_model=state.rec_model,
    )
|
||||
|
||||
|
||||
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
    """
    Evaluate OCR with given hyperparameters.
    Returns CER, WER, and timing metrics.

    Raises:
        HTTPException 503: model not loaded yet (startup still in progress).
        HTTPException 400: dataset folder missing/empty or invalid page range.
    """
    if state.ocr is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Load or reload dataset if path changed.
    # NOTE(review): this cached global state is not guarded by a lock;
    # concurrent requests with different pdf_folder values could race —
    # confirm whether concurrent tuning clients are expected.
    if state.dataset is None or state.dataset_path != request.pdf_folder:
        if not os.path.isdir(request.pdf_folder):
            raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
        state.dataset = ImageTextDataset(request.pdf_folder)
        state.dataset_path = request.pdf_folder

    if len(state.dataset) == 0:
        raise HTTPException(status_code=400, detail="Dataset is empty")

    # Validate page range; end is clamped to the dataset size.
    start = request.start_page
    end = min(request.end_page, len(state.dataset))
    if start >= end:
        raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")

    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()  # total wall-clock timer for the whole request

    for idx in range(start, end):
        img, ref = state.dataset[idx]
        # PaddleOCR's predict accepts an ndarray image.
        arr = np.array(img)

        tp0 = time.time()  # per-page timer (OCR + text assembly)
        out = state.ocr.predict(
            arr,
            use_doc_orientation_classify=request.use_doc_orientation_classify,
            use_doc_unwarping=request.use_doc_unwarping,
            use_textline_orientation=request.textline_orientation,
            text_det_thresh=request.text_det_thresh,
            text_det_box_thresh=request.text_det_box_thresh,
            text_det_unclip_ratio=request.text_det_unclip_ratio,
            text_rec_score_thresh=request.text_rec_score_thresh,
        )

        # Reassemble predicted text in reading order before scoring.
        pred = assemble_from_paddle_result(out)
        time_per_page_list.append(float(time.time() - tp0))

        m = evaluate_text(ref, pred)
        cer_list.append(m["CER"])
        wer_list.append(m["WER"])

    # Mean metrics over evaluated pages; worst-case 1.0 if nothing scored.
    return EvaluateResponse(
        CER=float(np.mean(cer_list)) if cer_list else 1.0,
        WER=float(np.mean(wer_list)) if wer_list else 1.0,
        TIME=float(time.time() - t0),
        PAGES=len(cer_list),
        TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
    )
|
||||
|
||||
|
||||
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
    """Evaluate on ALL pages (ignores start_page/end_page).

    Mutating the request model is safe here: FastAPI builds a fresh
    instance per request, and /evaluate clamps end_page to the dataset size.
    """
    request.start_page = 0
    request.end_page = 9999  # Will be clamped to dataset size
    return evaluate(request)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Convenience entry point: `python paddle_ocr_tuning_rest.py`
    # (the Docker images launch uvicorn directly instead).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
22
src/paddle_ocr/requirements-gpu.txt
Normal file
22
src/paddle_ocr/requirements-gpu.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# PaddleOCR REST API - GPU Requirements
|
||||
# Install: pip install -r requirements-gpu.txt
|
||||
|
||||
# PaddlePaddle (GPU version with CUDA).
# NOTE: pinned at 3.0.0 while requirements.txt pins the CPU build at 3.2.2 —
# keep the two in sync when upgrading. CUDA wheels may require PaddlePaddle's
# extra package index (see the official install guide).
paddlepaddle-gpu==3.0.0
|
||||
|
||||
# PaddleOCR
|
||||
paddleocr==3.3.2
|
||||
|
||||
# OCR evaluation metrics
|
||||
jiwer
|
||||
|
||||
# Numerical computing
|
||||
numpy
|
||||
|
||||
# REST API framework
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
pydantic
|
||||
|
||||
# Image processing
|
||||
Pillow
|
||||
22
src/paddle_ocr/requirements.txt
Normal file
22
src/paddle_ocr/requirements.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# PaddleOCR REST API - CPU Requirements
|
||||
# Install: pip install -r requirements.txt
|
||||
|
||||
# PaddlePaddle (CPU version)
|
||||
paddlepaddle==3.2.2
|
||||
|
||||
# PaddleOCR
|
||||
paddleocr==3.3.2
|
||||
|
||||
# OCR evaluation metrics
|
||||
jiwer
|
||||
|
||||
# Numerical computing
|
||||
numpy
|
||||
|
||||
# REST API framework
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
pydantic
|
||||
|
||||
# Image processing (pulled by paddleocr, but explicit)
|
||||
Pillow
|
||||
114
src/paddle_ocr/test.py
Normal file
114
src/paddle_ocr/test.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# test.py - Simple client to test PaddleOCR REST API
|
||||
# Usage: python test.py [--url URL] [--dataset PATH]
|
||||
|
||||
import argparse
|
||||
import requests
|
||||
import time
|
||||
import sys
|
||||
|
||||
|
||||
def wait_for_health(url: str, timeout: int = 120) -> bool:
    """Poll the API's /health endpoint until the model reports loaded.

    Args:
        url: Base URL of the API (no trailing slash).
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True once /health reports model_loaded; False if the timeout expires.
    """
    health_endpoint = f"{url}/health"
    began = time.time()

    print(f"Waiting for API at {health_endpoint}...")
    while time.time() - began < timeout:
        try:
            reply = requests.get(health_endpoint, timeout=5)
            if reply.status_code == 200:
                if reply.json().get("model_loaded"):
                    print(f"API ready! Model loaded in {time.time() - began:.1f}s")
                    return True
                print(f"  Model loading... ({time.time() - began:.0f}s)")
        except requests.exceptions.ConnectionError:
            # Server not accepting connections yet.
            print(f"  Connecting... ({time.time() - began:.0f}s)")
        except Exception as e:
            print(f"  Error: {e}")
        # Back off briefly between polls.
        time.sleep(2)

    print("Timeout waiting for API")
    return False
|
||||
|
||||
|
||||
def test_evaluate(url: str, config: dict) -> dict:
    """Run evaluation with given config.

    POSTs ``config`` to /evaluate, pretty-prints the returned metrics and
    returns the decoded JSON response.

    Args:
        url: API base URL.
        config: Payload matching the API's EvaluateRequest schema.

    Returns:
        Response dict (CER/WER/TIME/PAGES/TIME_PER_PAGE).

    Raises:
        requests.HTTPError: if the API returns a non-2xx status.
    """
    eval_url = f"{url}/evaluate"

    print(f"\nTesting config: {config}")
    start = time.time()

    # Generous timeout: a multi-page evaluation can take minutes on CPU.
    resp = requests.post(eval_url, json=config, timeout=600)
    resp.raise_for_status()

    result = resp.json()
    elapsed = time.time() - start

    print(f"Results (took {elapsed:.1f}s):")
    print(f"  CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)")
    print(f"  WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)")
    print(f"  Pages: {result['PAGES']}")
    print(f"  Time/page: {result['TIME_PER_PAGE']:.2f}s")

    return result
|
||||
|
||||
|
||||
def main():
    """Smoke-test the API: run a baseline and a tuned config, then compare."""
    parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
    parser.add_argument("--url", default="http://localhost:8000", help="API base URL")
    parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
    parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
    args = parser.parse_args()

    # Wait for API to be ready (model load can take ~30s after container start).
    if not args.skip_health:
        if not wait_for_health(args.url):
            sys.exit(1)

    # Test 1: Baseline config (default PaddleOCR)
    print("\n" + "="*50)
    print("TEST 1: Baseline Configuration")
    print("="*50)
    baseline = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": False,  # Baseline: disabled
        "text_det_thresh": 0.0,
        "text_det_box_thresh": 0.0,
        "text_det_unclip_ratio": 1.5,
        "text_rec_score_thresh": 0.0,
        "start_page": 5,
        "end_page": 10,
    })

    # Test 2: Optimized config (from Ray Tune results)
    print("\n" + "="*50)
    print("TEST 2: Optimized Configuration")
    print("="*50)
    optimized = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": True,  # KEY: enabled
        "text_det_thresh": 0.4690,
        "text_det_box_thresh": 0.5412,
        "text_det_unclip_ratio": 0.0,
        "text_rec_score_thresh": 0.6350,
        "start_page": 5,
        "end_page": 10,
    })

    # Summary: relative CER improvement, guarded against division by zero.
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)
    cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0
    print(f"Baseline CER:  {baseline['CER']*100:.2f}%")
    print(f"Optimized CER: {optimized['CER']*100:.2f}%")
    print(f"Improvement:   {cer_reduction:.1f}% reduction in errors")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    main()
|
||||
Reference in New Issue
Block a user