PaddleOCR, EasyOCR and DocTR GPU support. (#4)
All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_cpu (push) Successful in 5m0s
build_docker / build_gpu (push) Successful in 22m55s
build_docker / build_easyocr (push) Successful in 18m47s
build_docker / build_easyocr_gpu (push) Successful in 19m0s
build_docker / build_raytune (push) Successful in 3m27s
build_docker / build_doctr (push) Successful in 19m42s
build_docker / build_doctr_gpu (push) Successful in 14m49s
All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_cpu (push) Successful in 5m0s
build_docker / build_gpu (push) Successful in 22m55s
build_docker / build_easyocr (push) Successful in 18m47s
build_docker / build_easyocr_gpu (push) Successful in 19m0s
build_docker / build_raytune (push) Successful in 3m27s
build_docker / build_doctr (push) Successful in 19m42s
build_docker / build_doctr_gpu (push) Successful in 14m49s
This commit was merged in pull request #4.
This commit is contained in:
49
src/doctr_service/Dockerfile
Normal file
49
src/doctr_service/Dockerfile
Normal file
@@ -0,0 +1,49 @@
|
||||
# Dockerfile - DocTR Tuning REST API (CPU-only image)
#
# Build:
#   docker build -t doctr-api:latest .
#
# Run:
#   docker run -p 8003:8000 -v ./dataset:/app/dataset doctr-api:latest

FROM python:3.11-slim

LABEL maintainer="Sergio Jimenez"
LABEL description="DocTR Tuning REST API"

WORKDIR /app

# Set environment variables
ENV PYTHONUNBUFFERED=1
# Default model architectures; read by doctr_tuning_rest.py at startup
ENV DOCTR_DET_ARCH=db_resnet50
ENV DOCTR_RECO_ARCH=crnn_vgg16_bn

# Install system dependencies for OpenCV and image processing
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    && rm -rf /var/lib/apt/lists/*

# Copy and install Python dependencies (separate layer for build caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY doctr_tuning_rest.py .
COPY dataset_manager.py .

# Volume for dataset and model cache (weights survive container recreation)
VOLUME ["/app/dataset", "/root/.cache/doctr"]

# Expose API port
EXPOSE 8000

# Health check (longer start period for model download)
HEALTHCHECK --interval=30s --timeout=10s --start-period=180s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the API server
CMD ["uvicorn", "doctr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
63
src/doctr_service/Dockerfile.gpu
Normal file
63
src/doctr_service/Dockerfile.gpu
Normal file
@@ -0,0 +1,63 @@
|
||||
# Dockerfile.gpu - DocTR GPU Dockerfile for amd64/arm64
#
# Build:
#   docker build -t doctr-gpu:latest -f Dockerfile.gpu .
#
# Run:
#   docker run --gpus all -p 8003:8000 -v ./dataset:/app/dataset doctr-gpu:latest

# CUDA 13.0 for Blackwell (sm_121) and GH200/GB200 support
FROM nvidia/cuda:13.0.2-cudnn-runtime-ubuntu24.04

LABEL maintainer="Sergio Jimenez"
LABEL description="DocTR Tuning REST API - GPU/CUDA version"

WORKDIR /app

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CUDA_VISIBLE_DEVICES=0
# Default model architectures; read by doctr_tuning_rest.py at startup
ENV DOCTR_DET_ARCH=db_resnet50
ENV DOCTR_RECO_ARCH=crnn_vgg16_bn

# Install Python 3.12 and system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.12 \
    python3.12-venv \
    python3-pip \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.12 /usr/bin/python

# Copy requirements first for better caching
COPY requirements.txt .

# Install PyTorch with CUDA support first (cu128 index has amd64 + arm64 wheels)
RUN python -m pip install --no-cache-dir --break-system-packages \
    torch torchvision --index-url https://download.pytorch.org/whl/cu128

# Install remaining dependencies from requirements.txt (skip torch, already installed)
RUN grep -v "^torch" requirements.txt | python -m pip install --no-cache-dir --break-system-packages -r /dev/stdin

# Copy application code
COPY doctr_tuning_rest.py .
COPY dataset_manager.py .

# Volume for dataset and model cache (weights survive container recreation)
VOLUME ["/app/dataset", "/root/.cache/doctr"]

# Expose API port
EXPOSE 8000

# Health check (longer start period for model download)
HEALTHCHECK --interval=30s --timeout=10s --start-period=180s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the API server
CMD ["uvicorn", "doctr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
261
src/doctr_service/README.md
Normal file
261
src/doctr_service/README.md
Normal file
@@ -0,0 +1,261 @@
|
||||
# DocTR Tuning REST API
|
||||
|
||||
REST API service for DocTR (Document Text Recognition) hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### CPU Version
|
||||
|
||||
```bash
|
||||
cd src/doctr_service
|
||||
|
||||
# Build
|
||||
docker build -t doctr-api:cpu .
|
||||
|
||||
# Run
|
||||
docker run -d -p 8003:8000 \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
-v doctr-cache:/root/.cache/doctr \
|
||||
doctr-api:cpu
|
||||
|
||||
# Test
|
||||
curl http://localhost:8003/health
|
||||
```
|
||||
|
||||
### GPU Version
|
||||
|
||||
```bash
|
||||
# Build GPU image
|
||||
docker build -f Dockerfile.gpu -t doctr-api:gpu .
|
||||
|
||||
# Run with GPU
|
||||
docker run -d -p 8003:8000 --gpus all \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
-v doctr-cache:/root/.cache/doctr \
|
||||
doctr-api:gpu
|
||||
```
|
||||
|
||||
## Files
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `doctr_tuning_rest.py` | FastAPI REST service with 9 tunable hyperparameters |
|
||||
| `dataset_manager.py` | Dataset loader (shared with other services) |
|
||||
| `Dockerfile` | CPU-only image (amd64 + arm64) |
|
||||
| `Dockerfile.gpu` | GPU/CUDA image (amd64 + arm64) |
|
||||
| `requirements.txt` | Python dependencies |
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### `GET /health`
|
||||
|
||||
Check if service is ready.
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "ok",
|
||||
"model_loaded": true,
|
||||
"dataset_loaded": true,
|
||||
"dataset_size": 24,
|
||||
"det_arch": "db_resnet50",
|
||||
"reco_arch": "crnn_vgg16_bn",
|
||||
"cuda_available": true,
|
||||
"device": "cuda",
|
||||
"gpu_name": "NVIDIA GB10"
|
||||
}
|
||||
```
|
||||
|
||||
### `POST /evaluate`
|
||||
|
||||
Run OCR evaluation with given hyperparameters.
|
||||
|
||||
**Request (9 tunable parameters):**
|
||||
```json
|
||||
{
|
||||
"pdf_folder": "/app/dataset",
|
||||
"assume_straight_pages": true,
|
||||
"straighten_pages": false,
|
||||
"preserve_aspect_ratio": true,
|
||||
"symmetric_pad": true,
|
||||
"disable_page_orientation": false,
|
||||
"disable_crop_orientation": false,
|
||||
"resolve_lines": true,
|
||||
"resolve_blocks": false,
|
||||
"paragraph_break": 0.035,
|
||||
"start_page": 5,
|
||||
"end_page": 10
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"CER": 0.0189,
|
||||
"WER": 0.1023,
|
||||
"TIME": 52.3,
|
||||
"PAGES": 5,
|
||||
"TIME_PER_PAGE": 10.46,
|
||||
"model_reinitialized": false
|
||||
}
|
||||
```
|
||||
|
||||
**Note:** `model_reinitialized` indicates if the model was reloaded due to changed processing flags (adds ~2-5s overhead).
|
||||
|
||||
## Debug Output (debugset)
|
||||
|
||||
The `debugset` folder allows saving OCR predictions for debugging and analysis. When `save_output=True` is passed to `/evaluate`, predictions are written to `/app/debugset`.
|
||||
|
||||
### Enable Debug Output
|
||||
|
||||
```json
|
||||
{
|
||||
"pdf_folder": "/app/dataset",
|
||||
"save_output": true,
|
||||
"start_page": 5,
|
||||
"end_page": 10
|
||||
}
|
||||
```
|
||||
|
||||
### Output Structure
|
||||
|
||||
```
|
||||
debugset/
|
||||
├── doc1/
|
||||
│ └── doctr/
|
||||
│ ├── page_0005.txt
|
||||
│ ├── page_0006.txt
|
||||
│ └── ...
|
||||
├── doc2/
|
||||
│ └── doctr/
|
||||
│ └── ...
|
||||
```
|
||||
|
||||
Each `.txt` file contains the OCR-extracted text for that page.
|
||||
|
||||
### Docker Mount
|
||||
|
||||
Add the debugset volume to your docker run command:
|
||||
|
||||
```bash
|
||||
docker run -d -p 8003:8000 \
|
||||
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||
-v $(pwd)/../debugset:/app/debugset:rw \
|
||||
-v doctr-cache:/root/.cache/doctr \
|
||||
doctr-api:cpu
|
||||
```
|
||||
|
||||
### Use Cases
|
||||
|
||||
- **Compare OCR engines**: Run same pages through PaddleOCR, DocTR, EasyOCR with `save_output=True`, then diff results
|
||||
- **Debug hyperparameters**: See how different settings affect text extraction
|
||||
- **Ground truth comparison**: Compare predictions against expected output
|
||||
|
||||
## Hyperparameters
|
||||
|
||||
### Processing Flags (Require Model Reinitialization)
|
||||
|
||||
| Parameter | Default | Description |
|
||||
|-----------|---------|-------------|
|
||||
| `assume_straight_pages` | true | Skip rotation handling for straight documents |
|
||||
| `straighten_pages` | false | Pre-straighten pages before detection |
|
||||
| `preserve_aspect_ratio` | true | Maintain document proportions during resize |
|
||||
| `symmetric_pad` | true | Use symmetric padding when preserving aspect ratio |
|
||||
|
||||
**Note:** Changing these flags requires model reinitialization (~2-5s).
|
||||
|
||||
### Orientation Flags
|
||||
|
||||
| Parameter | Default | Description |
|
||||
|-----------|---------|-------------|
|
||||
| `disable_page_orientation` | false | Skip page orientation classification |
|
||||
| `disable_crop_orientation` | false | Skip crop orientation detection |
|
||||
|
||||
### Output Grouping
|
||||
|
||||
| Parameter | Default | Range | Description |
|
||||
|-----------|---------|-------|-------------|
|
||||
| `resolve_lines` | true | bool | Group words into lines |
|
||||
| `resolve_blocks` | false | bool | Group lines into blocks |
|
||||
| `paragraph_break` | 0.035 | 0.0-1.0 | Minimum space ratio separating paragraphs |
|
||||
|
||||
## Model Architecture
|
||||
|
||||
DocTR uses a two-stage pipeline:
|
||||
|
||||
1. **Detection** (`det_arch`): Localizes text regions
|
||||
- Default: `db_resnet50` (DBNet with ResNet-50 backbone)
|
||||
- Alternatives: `linknet_resnet18`, `db_mobilenet_v3_large`
|
||||
|
||||
2. **Recognition** (`reco_arch`): Recognizes characters
|
||||
- Default: `crnn_vgg16_bn` (CRNN with VGG-16 backbone)
|
||||
- Alternatives: `sar_resnet31`, `master`, `vitstr_small`
|
||||
|
||||
Architecture is set via environment variables (fixed at startup).
|
||||
|
||||
## GPU Support
|
||||
|
||||
### Platform Support
|
||||
|
||||
| Platform | CPU | GPU |
|
||||
|----------|-----|-----|
|
||||
| Linux x86_64 (amd64) | ✅ | ✅ PyTorch CUDA |
|
||||
| Linux ARM64 (GH200/GB200/DGX Spark) | ✅ | ✅ PyTorch CUDA (cu128 index) |
|
||||
| macOS ARM64 (M1/M2) | ✅ | ❌ |
|
||||
|
||||
### PyTorch CUDA on ARM64
|
||||
|
||||
Unlike PaddlePaddle, PyTorch provides **official ARM64 CUDA wheels** on the cu128 index:
|
||||
|
||||
```bash
|
||||
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128
|
||||
```
|
||||
|
||||
This works on both amd64 and arm64 platforms with CUDA support.
|
||||
|
||||
### GPU Detection
|
||||
|
||||
DocTR automatically uses GPU when available:
|
||||
|
||||
```python
|
||||
import torch
|
||||
print(torch.cuda.is_available()) # True if GPU available
|
||||
|
||||
# DocTR model moves to GPU
|
||||
model = ocr_predictor(pretrained=True)
|
||||
if torch.cuda.is_available():
|
||||
model = model.cuda()
|
||||
```
|
||||
|
||||
The `/health` endpoint shows GPU status:
|
||||
```json
|
||||
{
|
||||
"cuda_available": true,
|
||||
"device": "cuda",
|
||||
"gpu_name": "NVIDIA GB10",
|
||||
"gpu_memory_total": "128.00 GB"
|
||||
}
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `DOCTR_DET_ARCH` | `db_resnet50` | Detection architecture |
|
||||
| `DOCTR_RECO_ARCH` | `crnn_vgg16_bn` | Recognition architecture |
|
||||
| `CUDA_VISIBLE_DEVICES` | `0` | GPU device selection |
|
||||
|
||||
## CI/CD
|
||||
|
||||
Built images available from registry:
|
||||
|
||||
| Image | Architecture |
|
||||
|-------|--------------|
|
||||
| `seryus.ddns.net/unir/doctr-cpu:latest` | amd64, arm64 |
|
||||
| `seryus.ddns.net/unir/doctr-gpu:latest` | amd64, arm64 |
|
||||
|
||||
## Sources
|
||||
|
||||
- [DocTR Documentation](https://mindee.github.io/doctr/)
|
||||
- [DocTR GitHub](https://github.com/mindee/doctr)
|
||||
- [DocTR Model Usage](https://mindee.github.io/doctr/latest/using_doctr/using_models.html)
|
||||
- [PyTorch ARM64 CUDA Wheels](https://github.com/pytorch/pytorch/issues/160162)
|
||||
74
src/doctr_service/dataset_manager.py
Normal file
74
src/doctr_service/dataset_manager.py
Normal file
@@ -0,0 +1,74 @@
|
||||
# Imports
|
||||
import os
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class ImageTextDataset:
    """Paired image / ground-truth-text dataset.

    Expects a layout of ``root/<doc>/img/*.png`` (or .jpg/.jpeg) with a
    matching ``root/<doc>/txt/<same-basename>.txt`` ground-truth file.
    Images without a matching text file are silently skipped.
    """

    def __init__(self, root):
        # (img_path, txt_path) pairs, sorted by folder then filename
        self.samples = []

        for folder in sorted(os.listdir(root)):
            sub = os.path.join(root, folder)
            img_dir = os.path.join(sub, "img")
            txt_dir = os.path.join(sub, "txt")

            # Skip entries that are not document folders with img/ and txt/
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue

            for fname in sorted(os.listdir(img_dir)):
                if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
                    continue

                img_path = os.path.join(img_dir, fname)

                # text file must have same name but .txt
                txt_name = os.path.splitext(fname)[0] + ".txt"
                txt_path = os.path.join(txt_dir, txt_name)

                if not os.path.exists(txt_path):
                    continue

                self.samples.append((img_path, txt_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(PIL.Image in RGB, ground-truth text)`` for sample *idx*."""
        img_path, txt_path = self.samples[idx]

        # Load image
        image = Image.open(img_path).convert("RGB")

        # Load text
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()

        return image, text

    def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"):
        """Get output path for saving OCR result to debugset folder.

        Args:
            idx: Sample index
            output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text')
            debugset_root: Root folder for debug output (default: /app/debugset)

        Returns:
            Path like /app/debugset/doc1/{output_subdir}/page_001.txt.
            The output directory is created if missing.
        """
        img_path, _ = self.samples[idx]
        # Normalize separators so the '/dataset/' and '/img/' splits below
        # behave the same on every platform.
        norm = img_path.replace(os.sep, "/")

        # img_path: /app/dataset/doc1/img/page_001.png
        # Extract relative path: doc1/img/page_001.png
        parts = norm.split("/dataset/", 1)
        rel_path = parts[1] if len(parts) == 2 else os.path.basename(img_path)

        # Replace /img/ with /{output_subdir}/.
        # Fix: the previous code indexed rel_parts[1] unconditionally, which
        # raised IndexError whenever the dataset root did not contain
        # '/dataset/' (the basename fallback has no '/img/' to split on).
        rel_parts = rel_path.rsplit("/img/", 1)
        if len(rel_parts) == 2:
            doc_folder, page_name = rel_parts  # e.g. 'doc1', 'page_001.png'
        else:
            # No '/img/' component: write directly under debugset_root.
            doc_folder, page_name = "", rel_parts[0]

        fname = os.path.splitext(page_name)[0] + ".txt"  # page_001.txt

        out_dir = os.path.join(debugset_root, doc_folder, output_subdir)
        os.makedirs(out_dir, exist_ok=True)
        return os.path.join(out_dir, fname)
|
||||
63
src/doctr_service/docker-compose.yml
Normal file
63
src/doctr_service/docker-compose.yml
Normal file
@@ -0,0 +1,63 @@
|
||||
# docker-compose.yml - DocTR REST API
# Usage:
#   CPU: docker compose up ocr-cpu
#   GPU: docker compose up ocr-gpu
#
# Port: 8003 (host) -> 8000 (container)

services:
  # CPU-only service
  ocr-cpu:
    image: seryus.ddns.net/unir/doctr-cpu:latest
    container_name: doctr-cpu
    ports:
      - "8003:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - ../debugset:/app/debugset:rw
      - doctr-cache:/root/.cache/doctr
    environment:
      - PYTHONUNBUFFERED=1
      - DOCTR_DET_ARCH=db_resnet50
      - DOCTR_RECO_ARCH=crnn_vgg16_bn
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      # generous start period: first boot downloads model weights
      start_period: 180s

  # GPU service (requires NVIDIA Container Toolkit)
  ocr-gpu:
    image: seryus.ddns.net/unir/doctr-gpu:latest
    container_name: doctr-gpu
    ports:
      - "8003:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - ../debugset:/app/debugset:rw
      - doctr-cache:/root/.cache/doctr
    environment:
      - PYTHONUNBUFFERED=1
      - CUDA_VISIBLE_DEVICES=0
      - DOCTR_DET_ARCH=db_resnet50
      - DOCTR_RECO_ARCH=crnn_vgg16_bn
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 180s

# Named volume so downloaded model weights survive container recreation
volumes:
  doctr-cache:
    name: doctr-model-cache
|
||||
336
src/doctr_service/doctr_tuning_rest.py
Normal file
336
src/doctr_service/doctr_tuning_rest.py
Normal file
@@ -0,0 +1,336 @@
|
||||
# doctr_tuning_rest.py
|
||||
# FastAPI REST service for DocTR hyperparameter evaluation
|
||||
# Usage: uvicorn doctr_tuning_rest:app --host 0.0.0.0 --port 8000
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import threading
|
||||
from typing import Optional
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from doctr.models import ocr_predictor
|
||||
from jiwer import wer, cer
|
||||
from dataset_manager import ImageTextDataset
|
||||
|
||||
|
||||
def get_gpu_info() -> dict:
    """Return a snapshot of PyTorch's CUDA/GPU status as a plain dict.

    Always contains the keys ``cuda_available``, ``device``, ``gpu_count``,
    ``gpu_name``, ``gpu_memory_total`` and ``gpu_memory_used``; a
    ``gpu_error`` key is added only if probing the device fails.
    """
    cuda_ok = torch.cuda.is_available()
    info = {
        "cuda_available": cuda_ok,
        "device": "cuda" if cuda_ok else "cpu",
        "gpu_count": 0,
        "gpu_name": None,
        "gpu_memory_total": None,
        "gpu_memory_used": None,
    }

    if not cuda_ok:
        return info

    try:
        count = torch.cuda.device_count()
        info["gpu_count"] = count
        if count > 0:
            props = torch.cuda.get_device_properties(0)
            info["gpu_name"] = torch.cuda.get_device_name(0)
            info["gpu_memory_total"] = f"{props.total_memory / (1024 ** 3):.2f} GB"
            info["gpu_memory_used"] = f"{torch.cuda.memory_allocated(0) / (1024 ** 3):.2f} GB"
    except Exception as e:
        # Report probe failures instead of crashing the health endpoint.
        info["gpu_error"] = str(e)

    return info
|
||||
|
||||
|
||||
# Model configuration via environment variables (fixed for the process
# lifetime; only processing flags can change per request)
DEFAULT_DET_ARCH = os.environ.get("DOCTR_DET_ARCH", "db_resnet50")
DEFAULT_RECO_ARCH = os.environ.get("DOCTR_RECO_ARCH", "crnn_vgg16_bn")


# Global state for model and dataset
class AppState:
    # Loaded DocTR predictor (None until lifespan startup completes)
    model: Optional[object] = None
    # Cached dataset and the folder it was loaded from (reloaded on path change)
    dataset: Optional[ImageTextDataset] = None
    dataset_path: Optional[str] = None
    det_arch: str = DEFAULT_DET_ARCH
    reco_arch: str = DEFAULT_RECO_ARCH
    # Track current model config for cache invalidation
    current_config: Optional[dict] = None
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    lock: threading.Lock = None  # Protects OCR model from concurrent access

    def __init__(self):
        # Each instance gets its own lock (class attribute is just a placeholder)
        self.lock = threading.Lock()


# Single shared instance used by all request handlers
state = AppState()
|
||||
|
||||
|
||||
def create_model(
    assume_straight_pages: bool = True,
    straighten_pages: bool = False,
    preserve_aspect_ratio: bool = True,
    symmetric_pad: bool = True,
    disable_page_orientation: bool = False,
    disable_crop_orientation: bool = False,
) -> object:
    """Create DocTR model with given configuration.

    Architectures come from the global ``state`` (environment-driven, fixed
    at startup); only the processing flags vary per call. The returned
    predictor is moved to GPU when ``state.device == "cuda"``.
    """
    model = ocr_predictor(
        det_arch=state.det_arch,
        reco_arch=state.reco_arch,
        pretrained=True,
        assume_straight_pages=assume_straight_pages,
        straighten_pages=straighten_pages,
        preserve_aspect_ratio=preserve_aspect_ratio,
        symmetric_pad=symmetric_pad,
    )

    # Apply orientation settings if supported
    # (hasattr guard -- presumably for python-doctr version compatibility;
    # TODO confirm minimum version that exposes these attributes)
    if hasattr(model, 'disable_page_orientation'):
        model.disable_page_orientation = disable_page_orientation
    if hasattr(model, 'disable_crop_orientation'):
        model.disable_crop_orientation = disable_crop_orientation

    # Move to GPU if available
    if state.device == "cuda":
        model = model.cuda()

    return model
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load DocTR model at startup with default configuration.

    Runs before the server accepts requests; the teardown after ``yield``
    drops the model/dataset references so they can be garbage-collected.
    """
    gpu_info = get_gpu_info()
    print("=" * 50)
    print("GPU STATUS")
    print("=" * 50)
    print(f" CUDA available: {gpu_info['cuda_available']}")
    print(f" Device: {gpu_info['device']}")
    if gpu_info['cuda_available']:
        print(f" GPU count: {gpu_info['gpu_count']}")
        print(f" GPU name: {gpu_info['gpu_name']}")
        print(f" GPU memory total: {gpu_info['gpu_memory_total']}")
    print("=" * 50)

    print(f"Loading DocTR models...")
    print(f" Detection: {state.det_arch}")
    print(f" Recognition: {state.reco_arch}")

    # Load with default config
    state.model = create_model()
    # Must mirror create_model()'s defaults exactly, otherwise the first
    # /evaluate call would needlessly reinitialize the model.
    state.current_config = {
        "assume_straight_pages": True,
        "straighten_pages": False,
        "preserve_aspect_ratio": True,
        "symmetric_pad": True,
        "disable_page_orientation": False,
        "disable_crop_orientation": False,
    }

    if gpu_info['cuda_available']:
        # Re-probe to show how much GPU memory the loaded model occupies
        gpu_after = get_gpu_info()
        print(f" GPU memory after load: {gpu_after.get('gpu_memory_used', 'N/A')}")

    print("Model loaded successfully!")
    yield
    # Shutdown: release references
    state.model = None
    state.dataset = None
|
||||
|
||||
|
||||
# ASGI application; `lifespan` loads the model before any request is served
app = FastAPI(
    title="DocTR Tuning API",
    description="REST API for DocTR hyperparameter evaluation",
    version="1.0.0",
    lifespan=lifespan,
)
|
||||
|
||||
|
||||
class EvaluateRequest(BaseModel):
    """Request schema with all tunable DocTR hyperparameters."""
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")

    # Processing flags (require model reinit)
    assume_straight_pages: bool = Field(True, description="Skip rotation handling for straight documents")
    straighten_pages: bool = Field(False, description="Pre-straighten pages before detection")
    preserve_aspect_ratio: bool = Field(True, description="Maintain document proportions during resize")
    symmetric_pad: bool = Field(True, description="Use symmetric padding when preserving aspect ratio")

    # Orientation flags
    disable_page_orientation: bool = Field(False, description="Skip page orientation classification")
    disable_crop_orientation: bool = Field(False, description="Skip crop orientation detection")

    # Output grouping
    resolve_lines: bool = Field(True, description="Group words into lines")
    resolve_blocks: bool = Field(False, description="Group lines into blocks")
    # NOTE(review): paragraph_break is validated here but never referenced by
    # the /evaluate handler in this file -- confirm whether it should be
    # forwarded to the predictor.
    paragraph_break: float = Field(0.035, ge=0.0, le=1.0, description="Minimum space ratio separating paragraphs")

    # Page range
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
    save_output: bool = Field(False, description="Save OCR predictions to debugset folder")
|
||||
|
||||
|
||||
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output."""
    CER: float            # mean character error rate over evaluated pages
    WER: float            # mean word error rate over evaluated pages
    TIME: float           # total wall-clock seconds for the request
    PAGES: int            # number of pages actually evaluated
    TIME_PER_PAGE: float  # mean OCR seconds per page
    model_reinitialized: bool = False  # True if changed flags forced a model reload
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
    """Health/readiness payload; GPU fields are None on CPU-only hosts."""
    status: str  # "ok" once the model is loaded, else "initializing"
    model_loaded: bool
    dataset_loaded: bool
    dataset_size: Optional[int] = None
    det_arch: Optional[str] = None
    reco_arch: Optional[str] = None
    cuda_available: Optional[bool] = None
    device: Optional[str] = None
    gpu_name: Optional[str] = None
    gpu_memory_used: Optional[str] = None
    gpu_memory_total: Optional[str] = None
|
||||
|
||||
|
||||
def doctr_result_to_text(result, resolve_lines: bool = True, resolve_blocks: bool = False) -> str:
    """
    Convert a DocTR result to whitespace-normalized plain text.

    Structure: Document -> pages -> blocks -> lines -> words.

    Args:
        result: DocTR Document result object.
        resolve_lines: Kept for interface compatibility; the trailing
            whitespace normalization makes it a no-op in this function
            (line grouping only matters upstream in the predictor).
        resolve_blocks: Kept for interface compatibility; the previous
            implementation appended an empty "" paragraph separator that
            the join immediately filtered out again, so it never affected
            the output -- that dead branch has been removed.

    Returns:
        All recognized words joined by single spaces.
    """
    lines = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                lines.append(" ".join(w.value for w in line.words))

    # Collapse every whitespace run to a single space.
    text = " ".join(l for l in lines if l)
    return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def evaluate_text(reference: str, prediction: str) -> dict:
    """Calculate WER and CER metrics (via jiwer) for one page's text."""
    return {"WER": wer(reference, prediction), "CER": cer(reference, prediction)}
|
||||
|
||||
|
||||
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready and report model/dataset/GPU status."""
    gpu_info = get_gpu_info()
    return HealthResponse(
        status="ok" if state.model is not None else "initializing",
        model_loaded=state.model is not None,
        dataset_loaded=state.dataset is not None,
        # dataset is None until the first /evaluate call loads it
        dataset_size=len(state.dataset) if state.dataset else None,
        det_arch=state.det_arch,
        reco_arch=state.reco_arch,
        cuda_available=gpu_info.get("cuda_available"),
        device=gpu_info.get("device"),
        gpu_name=gpu_info.get("gpu_name"),
        gpu_memory_used=gpu_info.get("gpu_memory_used"),
        gpu_memory_total=gpu_info.get("gpu_memory_total"),
    )
|
||||
|
||||
|
||||
@app.post("/evaluate", response_model=EvaluateRequest if False else EvaluateResponse)  # noqa: keep original response model
def evaluate(request: EvaluateRequest):
    """
    Evaluate OCR with given hyperparameters.
    Returns CER, WER, and timing metrics.
    Note: Model will be reinitialized if processing flags change.
    """
    if state.model is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Load or reload dataset if path changed
    if state.dataset is None or state.dataset_path != request.pdf_folder:
        if not os.path.isdir(request.pdf_folder):
            raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
        state.dataset = ImageTextDataset(request.pdf_folder)
        state.dataset_path = request.pdf_folder

    if len(state.dataset) == 0:
        raise HTTPException(status_code=400, detail="Dataset is empty")

    # Validate page range (end is clamped to the dataset size)
    start = request.start_page
    end = min(request.end_page, len(state.dataset))
    if start >= end:
        raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")

    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()

    # Lock to prevent concurrent OCR access (model is not thread-safe)
    with state.lock:
        # Check if model needs to be reinitialized
        new_config = {
            "assume_straight_pages": request.assume_straight_pages,
            "straighten_pages": request.straighten_pages,
            "preserve_aspect_ratio": request.preserve_aspect_ratio,
            "symmetric_pad": request.symmetric_pad,
            "disable_page_orientation": request.disable_page_orientation,
            "disable_crop_orientation": request.disable_crop_orientation,
        }

        model_reinitialized = False
        if state.current_config != new_config:
            print(f"Model config changed, reinitializing...")
            state.model = create_model(**new_config)
            state.current_config = new_config
            model_reinitialized = True

        for idx in range(start, end):
            img, ref = state.dataset[idx]
            arr = np.array(img)

            tp0 = time.time()
            # DocTR expects a list of images
            result = state.model([arr])

            # NOTE(review): request.paragraph_break is not forwarded here --
            # confirm whether it should influence text extraction.
            pred = doctr_result_to_text(
                result,
                resolve_lines=request.resolve_lines,
                resolve_blocks=request.resolve_blocks,
            )
            time_per_page_list.append(float(time.time() - tp0))

            # Save prediction to debugset if requested
            if request.save_output:
                out_path = state.dataset.get_output_path(idx, "doctr_text")
                with open(out_path, "w", encoding="utf-8") as f:
                    f.write(pred)

            m = evaluate_text(ref, pred)
            cer_list.append(m["CER"])
            wer_list.append(m["WER"])

    # Empty lists cannot occur here (start < end guaranteed above), but the
    # 1.0 fallbacks keep the response well-defined regardless.
    return EvaluateResponse(
        CER=float(np.mean(cer_list)) if cer_list else 1.0,
        WER=float(np.mean(wer_list)) if wer_list else 1.0,
        TIME=float(time.time() - t0),
        PAGES=len(cer_list),
        TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
        model_reinitialized=model_reinitialized,
    )
|
||||
|
||||
|
||||
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
    """Evaluate on ALL pages (ignores start_page/end_page).

    Mutates the incoming request in place before delegating; harmless over
    HTTP since each request gets a fresh model instance.
    """
    request.start_page = 0
    request.end_page = 9999  # evaluate() clamps this to len(dataset)
    return evaluate(request)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running directly (python doctr_tuning_rest.py) without the uvicorn CLI
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
8
src/doctr_service/requirements.txt
Normal file
8
src/doctr_service/requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
python-doctr[torch]>=0.8.0
|
||||
fastapi>=0.104.0
|
||||
uvicorn>=0.24.0
|
||||
pydantic>=2.0.0
|
||||
jiwer>=3.0.0
|
||||
numpy>=1.24.0
|
||||
pillow>=10.0.0
|
||||
torch>=2.0.0
|
||||
Reference in New Issue
Block a user