Paddle ocr gpu support. #4
58
src/paddle_ocr/Dockerfile.cpu
Normal file
58
src/paddle_ocr/Dockerfile.cpu
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
# syntax=docker/dockerfile:1
# Dockerfile.cpu - CPU-only PaddleOCR REST API
# Multi-arch: supports both amd64 and arm64

FROM python:3.11-slim

LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - CPU version"

WORKDIR /app

# Install system dependencies for OpenCV and PaddleOCR.
# Lists are sorted alphabetically; apt caches are removed in the same
# layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    libsm6 \
    libxext6 \
    libxrender1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer stays cached until
# requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code (after deps, so code edits don't bust the dep layer)
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .

# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec

# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL} \
    PADDLE_REC_MODEL=${REC_MODEL}

# Download models during build (not at runtime). A BuildKit heredoc is
# easier to read and edit than a backslash-continued `python -c` one-liner.
RUN python <<'PY'
import os
from paddleocr import PaddleOCR

det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det')
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec')
print(f'Downloading models: det={det}, rec={rec}')
PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec)
print('Models downloaded successfully!')
PY

# Volume for dataset and optional additional model cache
# (declared AFTER the model download so the baked models are kept)
VOLUME ["/app/dataset", "/root/.paddlex"]

# Expose API port (documentation only; publish with -p / compose ports)
EXPOSE 8000

# Health check: stdlib-only probe, no curl/wget needed in the image
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the API server (exec form: uvicorn is PID 1 and receives SIGTERM)
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
68
src/paddle_ocr/Dockerfile.gpu
Normal file
68
src/paddle_ocr/Dockerfile.gpu
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
# syntax=docker/dockerfile:1
# Dockerfile.gpu - CUDA-enabled PaddleOCR REST API
# Supports: x86_64 with NVIDIA GPU (CUDA 12.x)
# For DGX Spark (ARM64 + CUDA): build natively on the device

FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version"

WORKDIR /app

# Runtime environment variables. DEBIAN_FRONTEND is applied inline in the
# apt layer below instead of here, so it does not leak into the runtime env.
ENV PYTHONUNBUFFERED=1 \
    CUDA_VISIBLE_DEVICES=0

# Install Python 3.11 and system dependencies.
# BUGFIX: Ubuntu 22.04's `python3-pip` package targets the system Python
# 3.10, so `pip install` put packages where `python` (symlinked to 3.11)
# could not import them, breaking the model-download step below. Bootstrap
# pip for 3.11 via ensurepip (shipped with python3.11-venv) instead.
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    libsm6 \
    libxext6 \
    libxrender1 \
    python3.11 \
    python3.11-venv \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.11 /usr/bin/python \
    && python -m ensurepip --upgrade

# Install Python dependencies from requirements file (into Python 3.11 --
# always go through `python -m pip` so the interpreter and pip agree)
COPY requirements-gpu.txt .
RUN python -m pip install --no-cache-dir -r requirements-gpu.txt

# Copy application code
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .

# Build arguments for models to bake into image
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec

# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL} \
    PADDLE_REC_MODEL=${REC_MODEL}

# Download models during build (not at runtime)
RUN python <<'PY'
import os
from paddleocr import PaddleOCR

det = os.environ.get('PADDLE_DET_MODEL', 'PP-OCRv5_server_det')
rec = os.environ.get('PADDLE_REC_MODEL', 'PP-OCRv5_server_rec')
print(f'Downloading models: det={det}, rec={rec}')
PaddleOCR(text_detection_model_name=det, text_recognition_model_name=rec)
print('Models downloaded successfully!')
PY

# Volume for dataset and optional additional model cache
# (declared AFTER the model download so the baked models are kept)
VOLUME ["/app/dataset", "/root/.paddlex"]

# Expose API port
EXPOSE 8000

# Health check: stdlib-only probe, no curl/wget needed in the image
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run the API server via `python -m` so the 3.11 interpreter is guaranteed
CMD ["python", "-m", "uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
329
src/paddle_ocr/README.md
Normal file
329
src/paddle_ocr/README.md
Normal file
@@ -0,0 +1,329 @@
|
|||||||
|
# PaddleOCR Tuning REST API
|
||||||
|
|
||||||
|
REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
|
||||||
|
|
||||||
|
## Quick Start with Docker Compose
|
||||||
|
|
||||||
|
Docker Compose manages building and running containers. The `docker-compose.yml` defines two services:
|
||||||
|
- `ocr-cpu` - CPU-only version (works everywhere)
|
||||||
|
- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit)
|
||||||
|
|
||||||
|
### Run CPU Version
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd src/paddle_ocr
|
||||||
|
|
||||||
|
# Build and start (first time takes ~2-3 min to build, ~30s to load model)
|
||||||
|
docker compose up ocr-cpu
|
||||||
|
|
||||||
|
# Or run in background (detached)
|
||||||
|
docker compose up -d ocr-cpu
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
docker compose logs -f ocr-cpu
|
||||||
|
|
||||||
|
# Stop
|
||||||
|
docker compose down
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run GPU Version
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
|
||||||
|
docker compose up ocr-gpu
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test the API
|
||||||
|
|
||||||
|
Once running, test with:
|
||||||
|
```bash
|
||||||
|
# Check health
|
||||||
|
curl http://localhost:8000/health
|
||||||
|
|
||||||
|
# Or use the test script
|
||||||
|
pip install requests
|
||||||
|
python test.py --url http://localhost:8000
|
||||||
|
```
|
||||||
|
|
||||||
|
### What Docker Compose Does
|
||||||
|
|
||||||
|
```
|
||||||
|
docker compose up ocr-cpu
|
||||||
|
│
|
||||||
|
├─► Builds image from Dockerfile.cpu (if not exists)
|
||||||
|
├─► Creates container "paddle-ocr-cpu"
|
||||||
|
├─► Mounts ../dataset → /app/dataset (your PDF images)
|
||||||
|
├─► Mounts paddlex-cache volume (persists downloaded models)
|
||||||
|
├─► Exposes port 8000
|
||||||
|
└─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
| File | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
|
||||||
|
| `dataset_manager.py` | Dataset loader |
|
||||||
|
| `test.py` | API test client |
|
||||||
|
| `Dockerfile.cpu` | CPU-only image (multi-arch) |
|
||||||
|
| `Dockerfile.gpu` | GPU/CUDA image (x86_64) |
|
||||||
|
| `docker-compose.yml` | Service orchestration |
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### `GET /health`
|
||||||
|
Check if service is ready.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `POST /evaluate`
|
||||||
|
Run OCR evaluation with given hyperparameters.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"pdf_folder": "/app/dataset",
|
||||||
|
"textline_orientation": true,
|
||||||
|
"use_doc_orientation_classify": false,
|
||||||
|
"use_doc_unwarping": false,
|
||||||
|
"text_det_thresh": 0.469,
|
||||||
|
"text_det_box_thresh": 0.5412,
|
||||||
|
"text_det_unclip_ratio": 0.0,
|
||||||
|
"text_rec_score_thresh": 0.635,
|
||||||
|
"start_page": 5,
|
||||||
|
"end_page": 10
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `POST /evaluate_full`
|
||||||
|
Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page).
|
||||||
|
|
||||||
|
## Building Images
|
||||||
|
|
||||||
|
### CPU Image (Multi-Architecture)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Local build (current architecture)
|
||||||
|
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
|
||||||
|
|
||||||
|
# Multi-arch build with buildx (amd64 + arm64)
|
||||||
|
docker buildx create --name multiarch --use
|
||||||
|
docker buildx build -f Dockerfile.cpu \
|
||||||
|
--platform linux/amd64,linux/arm64 \
|
||||||
|
-t paddle-ocr-api:cpu \
|
||||||
|
--push .
|
||||||
|
```
|
||||||
|
|
||||||
|
### GPU Image (x86_64 only)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running
|
||||||
|
|
||||||
|
### CPU (Any machine)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -d -p 8000:8000 \
|
||||||
|
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||||
|
-v paddlex-cache:/root/.paddlex \
|
||||||
|
paddle-ocr-api:cpu
|
||||||
|
```
|
||||||
|
|
||||||
|
### GPU (NVIDIA)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -d -p 8000:8000 --gpus all \
|
||||||
|
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||||
|
-v paddlex-cache:/root/.paddlex \
|
||||||
|
paddle-ocr-api:gpu
|
||||||
|
```
|
||||||
|
|
||||||
|
## DGX Spark (ARM64 + CUDA)
|
||||||
|
|
||||||
|
DGX Spark uses ARM64 (Grace CPU) with NVIDIA Hopper GPU. You have two options:
|
||||||
|
|
||||||
|
### Option 1: Native ARM64 Build (Recommended)
|
||||||
|
|
||||||
|
PaddlePaddle has ARM64 support. Build natively:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On DGX Spark or ARM64 machine
|
||||||
|
docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
|
||||||
|
```
|
||||||
|
|
||||||
|
For GPU acceleration on ARM64, no base-image change is needed: the
`nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04` image used by `Dockerfile.gpu`
is published as a multi-arch image, so pulling it on an ARM64 machine
automatically selects the ARM64 variant. Simply build `Dockerfile.gpu`
natively on the device.
|
||||||
|
|
||||||
|
Then build on the DGX Spark:
|
||||||
|
```bash
|
||||||
|
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu-arm64 .
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 2: x86_64 Emulation via QEMU (Slow)
|
||||||
|
|
||||||
|
You CAN run x86_64 images on ARM via emulation, but it's ~10-20x slower:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On DGX Spark, enable QEMU emulation
|
||||||
|
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||||
|
|
||||||
|
# Run x86_64 image with emulation
|
||||||
|
docker run --platform linux/amd64 -p 8000:8000 \
|
||||||
|
-v $(pwd)/../dataset:/app/dataset:ro \
|
||||||
|
paddle-ocr-api:cpu
|
||||||
|
```
|
||||||
|
|
||||||
|
**Not recommended** for production due to severe performance penalty.
|
||||||
|
|
||||||
|
### Option 3: Cross-compile from x86_64
|
||||||
|
|
||||||
|
Build ARM64 images from your x86_64 machine:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Setup buildx for multi-arch
|
||||||
|
docker buildx create --name mybuilder --use
|
||||||
|
|
||||||
|
# Build ARM64 image from x86_64 machine
|
||||||
|
docker buildx build -f Dockerfile.cpu \
|
||||||
|
--platform linux/arm64 \
|
||||||
|
-t paddle-ocr-api:arm64 \
|
||||||
|
--load .
|
||||||
|
|
||||||
|
# Save and transfer to DGX Spark
|
||||||
|
docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
|
||||||
|
scp paddle-ocr-arm64.tar.gz dgx-spark:~/
|
||||||
|
# On DGX Spark:
|
||||||
|
docker load < paddle-ocr-arm64.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using with Ray Tune
|
||||||
|
|
||||||
|
Update your notebook's `trainable_paddle_ocr` function:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
API_URL = "http://localhost:8000/evaluate"
|
||||||
|
|
||||||
|
def trainable_paddle_ocr(config):
|
||||||
|
"""Call OCR API instead of subprocess."""
|
||||||
|
payload = {
|
||||||
|
"pdf_folder": "/app/dataset",
|
||||||
|
"use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
|
||||||
|
"use_doc_unwarping": config.get("use_doc_unwarping", False),
|
||||||
|
"textline_orientation": config.get("textline_orientation", True),
|
||||||
|
"text_det_thresh": config.get("text_det_thresh", 0.0),
|
||||||
|
"text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
|
||||||
|
"text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
|
||||||
|
"text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.post(API_URL, json=payload, timeout=600)
|
||||||
|
response.raise_for_status()
|
||||||
|
metrics = response.json()
|
||||||
|
tune.report(metrics=metrics)
|
||||||
|
except Exception as e:
|
||||||
|
tune.report({"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]})
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture: Model Lifecycle
|
||||||
|
|
||||||
|
The model is loaded **once** at container startup and stays in memory for all requests:
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TB
|
||||||
|
subgraph Container["Docker Container Lifecycle"]
|
||||||
|
Start([Container Start]) --> Load[Load PaddleOCR Models<br/>~10-30s one-time cost]
|
||||||
|
Load --> Ready[API Ready<br/>Models in RAM ~500MB]
|
||||||
|
|
||||||
|
subgraph Requests["Incoming Requests - Models Stay Loaded"]
|
||||||
|
Ready --> R1[Request 1] --> Ready
|
||||||
|
Ready --> R2[Request 2] --> Ready
|
||||||
|
Ready --> RN[Request N...] --> Ready
|
||||||
|
end
|
||||||
|
|
||||||
|
Ready --> Stop([Container Stop])
|
||||||
|
Stop --> Free[Models Freed]
|
||||||
|
end
|
||||||
|
|
||||||
|
style Load fill:#f9f,stroke:#333
|
||||||
|
style Ready fill:#9f9,stroke:#333
|
||||||
|
style Requests fill:#e8f4ea,stroke:#090
|
||||||
|
```
|
||||||
|
|
||||||
|
**Subprocess vs REST API comparison:**
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
subgraph Subprocess["❌ Subprocess Approach"]
|
||||||
|
direction TB
|
||||||
|
S1[Trial 1] --> L1[Load Model ~10s]
|
||||||
|
L1 --> E1[Evaluate ~60s]
|
||||||
|
E1 --> U1[Unload]
|
||||||
|
U1 --> S2[Trial 2]
|
||||||
|
S2 --> L2[Load Model ~10s]
|
||||||
|
L2 --> E2[Evaluate ~60s]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph REST["✅ REST API Approach"]
|
||||||
|
direction TB
|
||||||
|
Start2[Start Container] --> Load2[Load Model ~10s]
|
||||||
|
Load2 --> Ready2[Model in Memory]
|
||||||
|
Ready2 --> T1[Trial 1 ~60s]
|
||||||
|
T1 --> Ready2
|
||||||
|
Ready2 --> T2[Trial 2 ~60s]
|
||||||
|
T2 --> Ready2
|
||||||
|
Ready2 --> TN[Trial N ~60s]
|
||||||
|
end
|
||||||
|
|
||||||
|
style L1 fill:#faa
|
||||||
|
style L2 fill:#faa
|
||||||
|
style Load2 fill:#afa
|
||||||
|
style Ready2 fill:#afa
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Comparison
|
||||||
|
|
||||||
|
| Approach | Model Load | Per-Trial Overhead | 64 Trials |
|
||||||
|
|----------|------------|-------------------|-----------|
|
||||||
|
| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours |
|
||||||
|
| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours |
|
||||||
|
| **REST API** | **Once** | **~0.1s** | **~5.8 hours** |
|
||||||
|
|
||||||
|
The REST API saves ~1+ hour by loading the model only once.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Model download slow on first run
|
||||||
|
The first run downloads ~500MB of models. Use volume `paddlex-cache` to persist them.
|
||||||
|
|
||||||
|
### Out of memory
|
||||||
|
Reduce `max_concurrent_trials` in Ray Tune, or increase container memory:
|
||||||
|
```bash
|
||||||
|
docker run --memory=8g ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### GPU not detected
|
||||||
|
Ensure NVIDIA Container Toolkit is installed:
|
||||||
|
```bash
|
||||||
|
nvidia-smi # Should work
|
||||||
|
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi  # Should work
|
||||||
|
```
|
||||||
45
src/paddle_ocr/dataset_manager.py
Normal file
45
src/paddle_ocr/dataset_manager.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# Imports
|
||||||
|
import os
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
class ImageTextDataset:
    """Dataset of (image, ground-truth text) pairs.

    `root` must contain subfolders, each holding an `img/` directory with
    images and a `txt/` directory with a same-named `.txt` file per image.
    Images without a matching text file are skipped.
    """

    def __init__(self, root):
        # Collect (image_path, text_path) pairs in a deterministic order.
        self.samples = []

        for folder in sorted(os.listdir(root)):
            img_dir = os.path.join(root, folder, "img")
            txt_dir = os.path.join(root, folder, "txt")

            # Only subfolders with both img/ and txt/ count as dataset entries.
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue

            for fname in sorted(os.listdir(img_dir)):
                # Accept common raster formats only (case-insensitive).
                if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
                    continue

                # Ground truth lives in txt/ under the same basename + ".txt".
                stem, _ = os.path.splitext(fname)
                txt_path = os.path.join(txt_dir, stem + ".txt")
                if os.path.exists(txt_path):
                    self.samples.append((os.path.join(img_dir, fname), txt_path))

    def __len__(self):
        """Number of (image, text) pairs discovered."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (PIL RGB image, ground-truth text) for sample `idx`."""
        img_path, txt_path = self.samples[idx]

        image = Image.open(img_path).convert("RGB")
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()

        return image, text
|
||||||
83
src/paddle_ocr/docker-compose.yml
Normal file
83
src/paddle_ocr/docker-compose.yml
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
# docker-compose.yml - PaddleOCR REST API
# Usage:
#   CPU:  docker compose up ocr-cpu
#   GPU:  docker compose up ocr-gpu
#   Test: docker compose run --rm test

services:
  # CPU-only service (works on any architecture)
  ocr-cpu:
    build:
      context: .
      dockerfile: Dockerfile.cpu
      args:
        # Models to bake into image (change before building):
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:cpu
    container_name: paddle-ocr-cpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro       # Your dataset
      - paddlex-cache:/root/.paddlex     # For additional models at runtime
    environment:
      - PYTHONUNBUFFERED=1
      # Override models at runtime (uncomment to use different models):
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    restart: unless-stopped
    healthcheck:
      # Mirrors the image HEALTHCHECK: stdlib-only HTTP probe.
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # GPU service (requires NVIDIA Container Toolkit)
  # NOTE(review): ocr-cpu and ocr-gpu both publish host port 8000, so they
  # are alternatives -- run one at a time.
  ocr-gpu:
    build:
      context: .
      dockerfile: Dockerfile.gpu
      args:
        DET_MODEL: PP-OCRv5_server_det
        REC_MODEL: PP-OCRv5_server_rec
    image: paddle-ocr-api:gpu
    container_name: paddle-ocr-gpu
    ports:
      - "8000:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - paddlex-cache:/root/.paddlex
    environment:
      - PYTHONUNBUFFERED=1
      - CUDA_VISIBLE_DEVICES=0
      # Override models at runtime:
      # - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
      # - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

  # Test client (runs once and exits)
  test:
    image: python:3.11-slim
    container_name: paddle-ocr-test
    depends_on:
      ocr-cpu:
        condition: service_healthy
    volumes:
      - ./test.py:/app/test.py:ro
    working_dir: /app
    # NOTE(review): network_mode below shares ocr-cpu's network namespace,
    # so the API is also reachable at http://localhost:8000 -- confirm the
    # "ocr-cpu" hostname actually resolves inside the shared namespace.
    # NOTE(review): /app/dataset is NOT mounted into this container
    # (network_mode shares only the network, not volumes) -- confirm
    # test.py tolerates a missing --dataset path.
    command: >
      sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset"
    network_mode: "service:ocr-cpu"

volumes:
  # Named volume holding downloaded PaddleX model files across runs.
  paddlex-cache:
    name: paddlex-model-cache
|
||||||
263
src/paddle_ocr/paddle_ocr_tuning_rest.py
Normal file
263
src/paddle_ocr/paddle_ocr_tuning_rest.py
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
# paddle_ocr_tuning_rest.py
|
||||||
|
# FastAPI REST service for PaddleOCR hyperparameter evaluation
|
||||||
|
# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
from jiwer import wer, cer
|
||||||
|
from dataset_manager import ImageTextDataset
|
||||||
|
|
||||||
|
|
||||||
|
# Model configuration via environment variables (with defaults)
|
||||||
|
DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det")
|
||||||
|
DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec")
|
||||||
|
|
||||||
|
|
||||||
|
# Global state for model and dataset
|
||||||
|
# Global state for model and dataset
class AppState:
    """Process-wide container for the loaded OCR model and cached dataset."""
    # PaddleOCR pipeline; None until the startup hook loads it.
    ocr: Optional[PaddleOCR] = None
    # Cached dataset, (re)loaded lazily when a request names a new folder.
    dataset: Optional[ImageTextDataset] = None
    # Folder the cached dataset was loaded from (cache key).
    dataset_path: Optional[str] = None
    # Model names, fixed at import time from environment variables.
    det_model: str = DEFAULT_DET_MODEL
    rec_model: str = DEFAULT_REC_MODEL


# Single shared instance used by all request handlers.
state = AppState()
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the OCR model once at startup; release references on shutdown.

    Runs before the app begins serving; /health reports "initializing"
    until `state.ocr` is set.
    """
    # Plain string: the original used an f-string with no placeholders (F541).
    print("Loading PaddleOCR models...")
    print(f" Detection: {state.det_model}")
    print(f" Recognition: {state.rec_model}")
    state.ocr = PaddleOCR(
        text_detection_model_name=state.det_model,
        text_recognition_model_name=state.rec_model,
    )
    print("Model loaded successfully!")
    yield
    # Cleanup on shutdown: drop references so memory can be reclaimed.
    state.ocr = None
    state.dataset = None
|
||||||
|
|
||||||
|
|
||||||
|
# FastAPI application; `lifespan` loads the OCR model before serving begins.
app = FastAPI(
    title="PaddleOCR Tuning API",
    description="REST API for OCR hyperparameter evaluation",
    version="1.0.0",
    lifespan=lifespan,
)
|
||||||
|
|
||||||
|
|
||||||
|
class EvaluateRequest(BaseModel):
|
||||||
|
"""Request schema matching CLI arguments."""
|
||||||
|
pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
|
||||||
|
use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification")
|
||||||
|
use_doc_unwarping: bool = Field(False, description="Use document unwarping")
|
||||||
|
textline_orientation: bool = Field(True, description="Use textline orientation classification")
|
||||||
|
text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold")
|
||||||
|
text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold")
|
||||||
|
text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient")
|
||||||
|
text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
|
||||||
|
start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
|
||||||
|
end_page: int = Field(10, ge=1, description="End page index (exclusive)")
|
||||||
|
|
||||||
|
|
||||||
|
class EvaluateResponse(BaseModel):
|
||||||
|
"""Response schema matching CLI output."""
|
||||||
|
CER: float
|
||||||
|
WER: float
|
||||||
|
TIME: float
|
||||||
|
PAGES: int
|
||||||
|
TIME_PER_PAGE: float
|
||||||
|
|
||||||
|
|
||||||
|
class HealthResponse(BaseModel):
|
||||||
|
status: str
|
||||||
|
model_loaded: bool
|
||||||
|
dataset_loaded: bool
|
||||||
|
dataset_size: Optional[int] = None
|
||||||
|
det_model: Optional[str] = None
|
||||||
|
rec_model: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_box_xyxy(box):
|
||||||
|
"""Normalize bounding box to (x0, y0, x1, y1) format."""
|
||||||
|
if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
|
||||||
|
xs = [p[0] for p in box]
|
||||||
|
ys = [p[1] for p in box]
|
||||||
|
return min(xs), min(ys), max(xs), max(ys)
|
||||||
|
|
||||||
|
if isinstance(box, (list, tuple)):
|
||||||
|
if len(box) == 4:
|
||||||
|
x0, y0, x1, y1 = box
|
||||||
|
return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
|
||||||
|
if len(box) == 8:
|
||||||
|
xs = box[0::2]
|
||||||
|
ys = box[1::2]
|
||||||
|
return min(xs), min(ys), max(xs), max(ys)
|
||||||
|
|
||||||
|
raise ValueError(f"Unrecognized box format: {box!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
    """
    Assemble plain text from PaddleOCR prediction output.

    Normalizes every detected box to (x0, y0, x1, y1), whitespace-cleans the
    recognized fragments, optionally filters by recognition score, groups
    fragments into visual lines with an adaptive vertical tolerance, orders
    each line left-to-right, and joins everything with newlines.
    """
    # Each fragment: (x0, y0, x1, y1, y_mid, text, score).
    fragments = []
    for item in paddleocr_predict:
        res = item.json.get("res", {})
        rec_boxes = res.get("rec_boxes", []) or []
        rec_texts = res.get("rec_texts", []) or []
        rec_scores = res.get("rec_scores", None)

        for i, (raw_box, raw_text) in enumerate(zip(rec_boxes, rec_texts)):
            try:
                x0, y0, x1, y1 = _normalize_box_xyxy(raw_box)
            except Exception:
                # Unrecognized box layout: skip this fragment.
                continue

            if rec_scores is not None and i < len(rec_scores):
                score = float(rec_scores[i])
            else:
                score = 1.0

            # Collapse internal whitespace; drop empty fragments.
            cleaned = re.sub(r"\s+", " ", str(raw_text)).strip()
            if not cleaned:
                continue

            fragments.append((x0, y0, x1, y1, 0.5 * (y0 + y1), cleaned, score))

    if min_score > 0:
        fragments = [f for f in fragments if f[6] >= min_score]

    if not fragments:
        return ""

    # Adaptive line tolerance: fraction of the median fragment height,
    # never below 8 px.
    heights = [f[3] - f[1] for f in fragments]
    median_h = float(np.median(heights)) if heights else 20.0
    line_tol = max(8.0, line_tol_factor * median_h)

    # Reading order: top-to-bottom, then left-to-right.
    fragments.sort(key=lambda f: (f[4], f[0]))

    def _flush(line):
        # Order a finished line by x and join its fragments with spaces.
        line.sort(key=lambda t: t[0])
        return " ".join(t[1] for t in line)

    lines = []
    current = []
    prev_mid = None
    for x0, y0, x1, y1, y_mid, text, score in fragments:
        if prev_mid is None or abs(y_mid - prev_mid) <= line_tol:
            current.append((x0, text))
        else:
            lines.append(_flush(current))
            current = [(x0, text)]
        prev_mid = y_mid

    if current:
        lines.append(_flush(current))

    assembled = "\n".join(lines)
    return re.sub(r"\s+\n", "\n", assembled).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_text(reference: str, prediction: str) -> dict:
    """Compute word and character error rates of `prediction` vs `reference`."""
    return {
        "WER": wer(reference, prediction),
        "CER": cer(reference, prediction),
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Report readiness: model/dataset load state and configured model names."""
    model_ready = state.ocr is not None
    return HealthResponse(
        # "initializing" until the startup hook finishes loading the model.
        status="ok" if model_ready else "initializing",
        model_loaded=model_ready,
        dataset_loaded=state.dataset is not None,
        dataset_size=len(state.dataset) if state.dataset else None,
        det_model=state.det_model,
        rec_model=state.rec_model,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
    """
    Evaluate OCR with given hyperparameters.
    Returns CER, WER, and timing metrics.
    """
    # The model is loaded once by the lifespan handler; refuse until ready.
    if state.ocr is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Load or reload dataset if path changed (cached across requests).
    if state.dataset is None or state.dataset_path != request.pdf_folder:
        if not os.path.isdir(request.pdf_folder):
            raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
        state.dataset = ImageTextDataset(request.pdf_folder)
        state.dataset_path = request.pdf_folder

    if len(state.dataset) == 0:
        raise HTTPException(status_code=400, detail="Dataset is empty")

    # Validate page range. `end` is clamped to the dataset size, so large
    # end_page values (e.g. from /evaluate_full) are safe.
    start = request.start_page
    end = min(request.end_page, len(state.dataset))
    if start >= end:
        raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")

    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()  # wall clock for the whole evaluation

    for idx in range(start, end):
        img, ref = state.dataset[idx]
        arr = np.array(img)  # PaddleOCR takes a numpy image

        tp0 = time.time()  # per-page timer (OCR + assembly)
        # All detection/recognition hyperparameters come from the request.
        out = state.ocr.predict(
            arr,
            use_doc_orientation_classify=request.use_doc_orientation_classify,
            use_doc_unwarping=request.use_doc_unwarping,
            use_textline_orientation=request.textline_orientation,
            text_det_thresh=request.text_det_thresh,
            text_det_box_thresh=request.text_det_box_thresh,
            text_det_unclip_ratio=request.text_det_unclip_ratio,
            text_rec_score_thresh=request.text_rec_score_thresh,
        )

        # Reassemble page text from detected fragments before scoring.
        pred = assemble_from_paddle_result(out)
        time_per_page_list.append(float(time.time() - tp0))

        m = evaluate_text(ref, pred)
        cer_list.append(m["CER"])
        wer_list.append(m["WER"])

    # Worst-case metrics (1.0) if no page produced a score.
    return EvaluateResponse(
        CER=float(np.mean(cer_list)) if cer_list else 1.0,
        WER=float(np.mean(wer_list)) if wer_list else 1.0,
        TIME=float(time.time() - t0),
        PAGES=len(cer_list),
        TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
    """Evaluate on ALL pages (ignores start_page/end_page)."""
    # Overwrite the caller-supplied range with a span that covers the
    # whole dataset; evaluate() clamps end_page to the dataset size,
    # so the large sentinel is safe for any dataset length.
    full_range = {"start_page": 0, "end_page": 9999}
    for field, value in full_range.items():
        setattr(request, field, value)
    return evaluate(request)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Development entry point: serve the FastAPI app directly with uvicorn.
    import uvicorn

    bind_host, bind_port = "0.0.0.0", 8000
    uvicorn.run(app, host=bind_host, port=bind_port)
|
||||||
22
src/paddle_ocr/requirements-gpu.txt
Normal file
22
src/paddle_ocr/requirements-gpu.txt
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# PaddleOCR REST API - GPU Requirements
# Install: pip install -r requirements-gpu.txt

# PaddlePaddle (GPU version with CUDA)
# NOTE(review): pinned to 3.0.0 while the CPU requirements pin
# paddlepaddle==3.2.2 and both use paddleocr==3.3.2 — confirm this
# version skew is intentional and compatible with paddleocr 3.3.2.
paddlepaddle-gpu==3.0.0

# PaddleOCR
paddleocr==3.3.2

# OCR evaluation metrics (CER/WER)
# NOTE(review): the packages below are unpinned — consider pinning
# exact versions for reproducible image builds.
jiwer

# Numerical computing
numpy

# REST API framework
fastapi
uvicorn[standard]
pydantic

# Image processing
Pillow
|
||||||
22
src/paddle_ocr/requirements.txt
Normal file
22
src/paddle_ocr/requirements.txt
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# PaddleOCR REST API - CPU Requirements
# Install: pip install -r requirements.txt

# PaddlePaddle (CPU version)
paddlepaddle==3.2.2

# PaddleOCR
paddleocr==3.3.2

# OCR evaluation metrics (CER/WER)
# NOTE(review): the packages below are unpinned — consider pinning
# exact versions for reproducible image builds.
jiwer

# Numerical computing
numpy

# REST API framework
fastapi
uvicorn[standard]
pydantic

# Image processing (pulled by paddleocr, but explicit)
Pillow
|
||||||
114
src/paddle_ocr/test.py
Normal file
114
src/paddle_ocr/test.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
# test.py - Simple client to test PaddleOCR REST API
|
||||||
|
# Usage: python test.py [--url URL] [--dataset PATH]
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_health(url: str, timeout: int = 120) -> bool:
    """Wait for API to be ready.

    Polls GET {url}/health every 2s until the response reports
    model_loaded, the timeout elapses, or the process is interrupted.
    Returns True once the model is loaded, False on timeout.
    """
    health_url = f"{url}/health"
    started_at = time.time()

    print(f"Waiting for API at {health_url}...")
    while (time.time() - started_at) < timeout:
        try:
            response = requests.get(health_url, timeout=5)
            if response.status_code == 200:
                payload = response.json()
                if payload.get("model_loaded"):
                    print(f"API ready! Model loaded in {time.time() - started_at:.1f}s")
                    return True
                # Server is up but still loading the OCR model.
                print(f" Model loading... ({time.time() - started_at:.0f}s)")
        except requests.exceptions.ConnectionError:
            # Server not accepting connections yet — keep polling.
            print(f" Connecting... ({time.time() - started_at:.0f}s)")
        except Exception as err:
            # Best-effort polling loop: report and retry on any other error.
            print(f" Error: {err}")
        time.sleep(2)

    print("Timeout waiting for API")
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def test_evaluate(url: str, config: dict) -> dict:
    """Run evaluation with given config.

    POSTs the config to {url}/evaluate, prints the returned metrics,
    and returns the decoded JSON response. Raises on HTTP errors.
    """
    endpoint = f"{url}/evaluate"

    print(f"\nTesting config: {config}")
    t_start = time.time()

    # OCR evaluation can be slow, hence the generous 10-minute timeout.
    response = requests.post(endpoint, json=config, timeout=600)
    response.raise_for_status()

    metrics = response.json()
    duration = time.time() - t_start

    print(f"Results (took {duration:.1f}s):")
    print(f" CER: {metrics['CER']:.4f} ({metrics['CER']*100:.2f}%)")
    print(f" WER: {metrics['WER']:.4f} ({metrics['WER']*100:.2f}%)")
    print(f" Pages: {metrics['PAGES']}")
    print(f" Time/page: {metrics['TIME_PER_PAGE']:.2f}s")

    return metrics
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: run a baseline and an optimized evaluation, then compare."""
    parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
    parser.add_argument("--url", default="http://localhost:8000", help="API base URL")
    parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
    parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
    args = parser.parse_args()

    # Block until the API reports its model loaded, unless skipped.
    if not args.skip_health and not wait_for_health(args.url):
        sys.exit(1)

    def banner(title: str) -> None:
        # Three-line section header used by every test section below.
        print("\n" + "=" * 50)
        print(title)
        print("=" * 50)

    # Test 1: Baseline config (default PaddleOCR)
    banner("TEST 1: Baseline Configuration")
    baseline = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": False,  # Baseline: disabled
        "text_det_thresh": 0.0,
        "text_det_box_thresh": 0.0,
        "text_det_unclip_ratio": 1.5,
        "text_rec_score_thresh": 0.0,
        "start_page": 5,
        "end_page": 10,
    })

    # Test 2: Optimized config (from Ray Tune results)
    banner("TEST 2: Optimized Configuration")
    optimized = test_evaluate(args.url, {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": True,  # KEY: enabled
        "text_det_thresh": 0.4690,
        "text_det_box_thresh": 0.5412,
        "text_det_unclip_ratio": 0.0,
        "text_rec_score_thresh": 0.6350,
        "start_page": 5,
        "end_page": 10,
    })

    # Summary: relative error reduction, guarded against a zero baseline CER.
    banner("SUMMARY")
    if baseline["CER"] > 0:
        cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100
    else:
        cer_reduction = 0
    print(f"Baseline CER: {baseline['CER']*100:.2f}%")
    print(f"Optimized CER: {optimized['CER']*100:.2f}%")
    print(f"Improvement: {cer_reduction:.1f}% reduction in errors")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the test client only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user