PaddleOCR, EasyOCR and docTR GPU support. (#4)
All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_cpu (push) Successful in 5m0s
build_docker / build_gpu (push) Successful in 22m55s
build_docker / build_easyocr (push) Successful in 18m47s
build_docker / build_easyocr_gpu (push) Successful in 19m0s
build_docker / build_raytune (push) Successful in 3m27s
build_docker / build_doctr (push) Successful in 19m42s
build_docker / build_doctr_gpu (push) Successful in 14m49s

This commit was merged in pull request #4.
2026-01-19 17:35:24 +00:00
committed by Sergio Jimenez Jimenez
parent 8e2b7a5096
commit c7ed7b2b9c
105 changed files with 8170 additions and 1263 deletions


@@ -0,0 +1,213 @@
# syntax=docker/dockerfile:1.4
# Dockerfile.build-paddle - Build PaddlePaddle GPU wheel for ARM64
#
# This Dockerfile compiles PaddlePaddle from source with CUDA support for ARM64.
# The resulting wheel can be used in Dockerfile.gpu for ARM64 GPU acceleration.
#
# Build time: ~1-2 hours with caching, 2-4 hours first build
# Output: /output/paddlepaddle_gpu-*.whl
#
# Usage:
# CUDA_ARCH=90 docker compose --profile build run --rm build-paddle
#
# Features:
# - ccache for compiler caching (survives rebuilds)
# - Split build stages for better layer caching
# - ARM64 -m64 patch applied automatically
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddlePaddle GPU wheel builder for ARM64"
# Build arguments
ARG PADDLE_VERSION=v3.0.0
ARG PYTHON_VERSION=3.11
ARG CUDA_ARCH=90
# Environment setup
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CCACHE_DIR=/ccache
ENV PATH="/usr/lib/ccache:${PATH}"
# Install build dependencies + ccache
RUN apt-get update && apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
python3-pip \
build-essential \
cmake \
ninja-build \
git \
wget \
curl \
pkg-config \
ccache \
libssl-dev \
libffi-dev \
zlib1g-dev \
libbz2-dev \
libreadline-dev \
libsqlite3-dev \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libgflags-dev \
libgoogle-glog-dev \
libprotobuf-dev \
protobuf-compiler \
patchelf \
libopenblas-dev \
liblapack-dev \
swig \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
# Setup ccache symlinks for CUDA
RUN mkdir -p /usr/lib/ccache && \
ln -sf /usr/bin/ccache /usr/lib/ccache/nvcc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/gcc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/g++ && \
ln -sf /usr/bin/ccache /usr/lib/ccache/cc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/c++
# Upgrade pip and install Python build dependencies
RUN python -m pip install --upgrade pip setuptools wheel && \
python -m pip install numpy protobuf pyyaml requests packaging astor decorator paddle-bfloat opt-einsum
WORKDIR /build
# Clone PaddlePaddle repository
RUN git clone --depth 1 --branch ${PADDLE_VERSION} https://github.com/PaddlePaddle/Paddle.git
WORKDIR /build/Paddle
# Patch for ARM64: Remove -m64 flag (x86_64 specific, causes build failure on aarch64)
RUN sed -i 's/-m64//g' cmake/flags.cmake && \
sed -i 's/-m64//g' CMakeLists.txt 2>/dev/null || true && \
find . -name "*.cmake" -exec sed -i 's/-m64//g' {} \; 2>/dev/null || true && \
echo "Patched -m64 flag for ARM64 compatibility"
# Patch for ARM64: Install sse2neon to translate x86 SSE intrinsics to ARM NEON
# sse2neon provides drop-in replacements for x86 SIMD headers
RUN git clone --depth 1 https://github.com/DLTcollab/sse2neon.git /tmp/sse2neon && \
mkdir -p /usr/local/include/sse2neon && \
cp /tmp/sse2neon/sse2neon.h /usr/local/include/sse2neon/ && \
rm -rf /tmp/sse2neon && \
echo "Installed sse2neon for x86->ARM NEON translation"
# Create wrapper headers that use sse2neon for ARM64
RUN mkdir -p /usr/local/include/x86_stubs && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/immintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/immintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/immintrin.h && \
echo "#include_next <immintrin.h>" >> /usr/local/include/x86_stubs/immintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/immintrin.h && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#include_next <xmmintrin.h>" >> /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/emmintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/emmintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/emmintrin.h && \
echo "#include_next <emmintrin.h>" >> /usr/local/include/x86_stubs/emmintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/emmintrin.h && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#include_next <pmmintrin.h>" >> /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/smmintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/smmintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/smmintrin.h && \
echo "#include_next <smmintrin.h>" >> /usr/local/include/x86_stubs/smmintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/smmintrin.h && \
echo "Created x86 intrinsic wrapper headers for ARM64 using sse2neon"
# Install additional Python requirements for building
RUN pip install -r python/requirements.txt || true
# Create build directory
RUN mkdir -p build
WORKDIR /build/Paddle/build
# Configure CMake for ARM64 + CUDA build
# Note: -Wno-class-memaccess fixes Eigen NEON warning on ARM64
RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}" && \
cmake .. \
-GNinja \
-DCMAKE_BUILD_TYPE=Release \
-DPY_VERSION=${PYTHON_VERSION} \
-DWITH_GPU=ON \
-DWITH_TESTING=OFF \
-DWITH_DISTRIBUTE=OFF \
-DWITH_NCCL=OFF \
-DWITH_MKL=OFF \
-DWITH_MKLDNN=OFF \
-DON_INFER=OFF \
-DWITH_PYTHON=ON \
-DWITH_AVX=OFF \
-DCUDA_ARCH_NAME=Manual \
-DCUDA_ARCH_BIN="${CUDA_ARCH}" \
-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}" \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_FLAGS="-Wno-class-memaccess -Wno-error=class-memaccess -I/usr/local/include/x86_stubs" \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
# Build external dependencies first (cacheable layer)
RUN --mount=type=cache,target=/ccache \
ninja extern_gflags extern_glog extern_protobuf extern_zlib extern_eigen3
# Build flashattn (heaviest dependency, separate layer for caching)
RUN --mount=type=cache,target=/ccache \
ninja extern_flashattn
# Build remaining external dependencies
RUN --mount=type=cache,target=/ccache \
ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo extern_xbyak
# Build main PaddlePaddle (with ccache, fallback to fewer jobs if OOM)
RUN --mount=type=cache,target=/ccache \
ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4
# Build the Python wheel
RUN ninja paddle_python || true
# Create output directory
RUN mkdir -p /output
# Build wheel package - try multiple methods since PaddlePaddle build structure varies
WORKDIR /build/Paddle
RUN echo "=== Looking for wheel build method ===" && \
ls -la python/ 2>/dev/null && \
ls -la build/python/ 2>/dev/null && \
if [ -f build/python/setup.py ]; then \
echo "Using build/python/setup.py" && \
cd build/python && python setup.py bdist_wheel; \
elif [ -f python/setup.py ]; then \
echo "Using python/setup.py" && \
cd python && python setup.py bdist_wheel; \
else \
echo "Looking for existing wheel..." && \
find /build -name "paddlepaddle*.whl" -type f 2>/dev/null; \
fi
# Copy wheel to output
RUN find /build -name "paddlepaddle*.whl" -type f -exec cp {} /output/ \; && \
ls -la /output/ && \
if [ ! "$(ls -A /output/*.whl 2>/dev/null)" ]; then \
echo "ERROR: No wheel found!" && exit 1; \
fi
# List what was built
RUN ls -la /output/ && \
echo "=== Build complete ===" && \
find /build -name "*.whl" -type f 2>/dev/null
# Default command: copy wheel to mounted volume
CMD ["sh", "-c", "cp /output/*.whl /wheels/ 2>/dev/null && echo 'Wheel copied to /wheels/' && ls -la /wheels/ || echo 'No wheel found in /output, checking other locations...' && find /build -name '*.whl' -exec cp {} /wheels/ \\; && ls -la /wheels/"]


@@ -0,0 +1,149 @@
# syntax=docker/dockerfile:1.4
# Dockerfile.build-paddle-cpu - Build PaddlePaddle CPU wheel for ARM64
#
# Required because PyPI wheels don't work on ARM64 (x86 SSE instructions).
#
# Build time: ~1-2 hours
# Output: /output/paddlepaddle-*.whl
#
# Usage:
# docker build -t paddle-builder:cpu-arm64 -f Dockerfile.build-paddle-cpu .
# docker run --rm -v ./wheels:/wheels paddle-builder:cpu-arm64
FROM ubuntu:22.04
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddlePaddle CPU wheel builder for ARM64"
ARG PADDLE_VERSION=v3.0.0
ARG PYTHON_VERSION=3.11
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CCACHE_DIR=/ccache
ENV PATH="/usr/lib/ccache:${PATH}"
# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
python3-pip \
build-essential \
cmake \
ninja-build \
git \
wget \
curl \
pkg-config \
ccache \
libssl-dev \
libffi-dev \
zlib1g-dev \
libbz2-dev \
libreadline-dev \
libsqlite3-dev \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libgflags-dev \
libgoogle-glog-dev \
libprotobuf-dev \
protobuf-compiler \
patchelf \
libopenblas-dev \
liblapack-dev \
swig \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
# Setup ccache
RUN mkdir -p /usr/lib/ccache && \
ln -sf /usr/bin/ccache /usr/lib/ccache/gcc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/g++ && \
ln -sf /usr/bin/ccache /usr/lib/ccache/cc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/c++
RUN python -m pip install --upgrade pip setuptools wheel && \
python -m pip install numpy protobuf pyyaml requests packaging astor decorator paddle-bfloat opt-einsum
WORKDIR /build
RUN git clone --depth 1 --branch ${PADDLE_VERSION} https://github.com/PaddlePaddle/Paddle.git
WORKDIR /build/Paddle
# Patch -m64 flag (x86_64 specific)
RUN sed -i 's/-m64//g' cmake/flags.cmake && \
sed -i 's/-m64//g' CMakeLists.txt 2>/dev/null || true && \
find . -name "*.cmake" -exec sed -i 's/-m64//g' {} \; 2>/dev/null || true
# Install sse2neon for x86 SSE -> ARM NEON translation
RUN git clone --depth 1 https://github.com/DLTcollab/sse2neon.git /tmp/sse2neon && \
mkdir -p /usr/local/include/sse2neon && \
cp /tmp/sse2neon/sse2neon.h /usr/local/include/sse2neon/ && \
rm -rf /tmp/sse2neon
# Create x86 intrinsic wrapper headers
RUN mkdir -p /usr/local/include/x86_stubs && \
for h in immintrin xmmintrin emmintrin pmmintrin smmintrin; do \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/${h}.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/${h}.h && \
echo "#else" >> /usr/local/include/x86_stubs/${h}.h && \
echo "#include_next <${h}.h>" >> /usr/local/include/x86_stubs/${h}.h && \
echo "#endif" >> /usr/local/include/x86_stubs/${h}.h; \
done
RUN pip install -r python/requirements.txt || true
RUN mkdir -p build
WORKDIR /build/Paddle/build
# Configure for CPU-only ARM64 build
# WITH_ARM=ON enables ARM NEON optimizations and disables x86-specific code (XBYAK, MKL)
RUN cmake .. \
-GNinja \
-DCMAKE_BUILD_TYPE=Release \
-DPY_VERSION=${PYTHON_VERSION} \
-DWITH_GPU=OFF \
-DWITH_ARM=ON \
-DWITH_TESTING=OFF \
-DWITH_DISTRIBUTE=OFF \
-DWITH_NCCL=OFF \
-DWITH_MKL=OFF \
-DWITH_MKLDNN=OFF \
-DWITH_XBYAK=OFF \
-DON_INFER=OFF \
-DWITH_PYTHON=ON \
-DWITH_AVX=OFF \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_FLAGS="-Wno-class-memaccess -Wno-error=class-memaccess -I/usr/local/include/x86_stubs"
# Build external dependencies
RUN --mount=type=cache,target=/ccache \
ninja extern_gflags extern_glog extern_protobuf extern_zlib extern_eigen3
# Note: extern_xbyak excluded - it's x86-only and disabled with WITH_ARM=ON
RUN --mount=type=cache,target=/ccache \
ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo
# Build PaddlePaddle
RUN --mount=type=cache,target=/ccache \
ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4
RUN ninja paddle_python || true
RUN mkdir -p /output
WORKDIR /build/Paddle
RUN if [ -f build/python/setup.py ]; then \
cd build/python && python setup.py bdist_wheel; \
elif [ -f python/setup.py ]; then \
cd python && python setup.py bdist_wheel; \
fi
RUN find /build -name "paddlepaddle*.whl" -type f -exec cp {} /output/ \; && \
ls -la /output/
CMD ["sh", "-c", "cp /output/*.whl /wheels/ && ls -la /wheels/"]


@@ -0,0 +1,81 @@
# Dockerfile.cpu - Multi-stage CPU Dockerfile
#
# Build base only (push to registry, rarely changes):
# docker build --target base -t seryus.ddns.net/unir/paddle-ocr-cpu-base:latest -f Dockerfile.cpu .
#
# Build deploy (uses base, fast - code only):
# docker build --target deploy -t seryus.ddns.net/unir/paddle-ocr-cpu:latest -f Dockerfile.cpu .
#
# Or build all at once:
# docker build -t paddle-ocr-api:cpu -f Dockerfile.cpu .
# =============================================================================
# STAGE 1: BASE - All dependencies (rarely changes)
# =============================================================================
FROM python:3.11-slim AS base
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Base Image - CPU dependencies"
WORKDIR /app
# Install system dependencies for OpenCV and PaddleOCR
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender1 \
libgomp1 \
&& rm -rf /var/lib/apt/lists/*
# Copy local wheels directory (may contain ARM64 wheel from build-paddle-cpu)
COPY wheels/ /tmp/wheels/
# Install paddlepaddle: prefer local wheel (ARM64), fallback to PyPI (x86_64)
RUN if ls /tmp/wheels/paddlepaddle*.whl 1>/dev/null 2>&1; then \
echo "=== Installing PaddlePaddle from local wheel (ARM64) ===" && \
pip install --no-cache-dir /tmp/wheels/paddlepaddle*.whl; \
else \
echo "=== Installing PaddlePaddle from PyPI (x86_64) ===" && \
pip install --no-cache-dir paddlepaddle==3.0.0; \
fi && \
rm -rf /tmp/wheels
# Install remaining Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# =============================================================================
# STAGE 2: DEPLOY - Application code (changes frequently)
# =============================================================================
FROM base AS deploy
LABEL description="PaddleOCR Tuning REST API - CPU version"
WORKDIR /app
# Copy application code (this is the only layer that changes frequently)
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .
# Build arguments for models
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec
# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}
# Volume for dataset and model cache
VOLUME ["/app/dataset", "/root/.paddlex"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -0,0 +1,105 @@
# Dockerfile.gpu - Multi-stage GPU Dockerfile
#
# Build base only (push to registry, rarely changes):
# docker build --target base -t seryus.ddns.net/unir/paddle-ocr-gpu-base:latest -f Dockerfile.gpu .
#
# Build deploy (uses base, fast - code only):
# docker build --target deploy -t seryus.ddns.net/unir/paddle-ocr-gpu:latest -f Dockerfile.gpu .
#
# Or build all at once:
# docker build -t paddle-ocr-api:gpu -f Dockerfile.gpu .
# =============================================================================
# STAGE 1: BASE - All dependencies (rarely changes)
# =============================================================================
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS base
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Base Image - GPU/CUDA dependencies"
WORKDIR /app
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CUDA_VISIBLE_DEVICES=0
# Install Python 3.11 and system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.11 \
python3.11-venv \
python3-pip \
libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender1 \
libgomp1 \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python3.11 /usr/bin/python
# Fix cuDNN library path for ARM64 only (PaddlePaddle looks in /usr/local/cuda/lib64)
# x86_64 doesn't need this - PyPI wheel handles paths correctly
RUN if [ "$(uname -m)" = "aarch64" ]; then \
mkdir -p /usr/local/cuda/lib64 && \
ln -sf /usr/lib/aarch64-linux-gnu/libcudnn*.so* /usr/local/cuda/lib64/ && \
ln -sf /usr/lib/aarch64-linux-gnu/libcudnn.so.9 /usr/local/cuda/lib64/libcudnn.so && \
ldconfig; \
fi
# Copy local wheels directory (may contain ARM64 wheel from build-paddle)
COPY wheels/ /tmp/wheels/
# Install paddlepaddle: prefer local wheel (ARM64), fallback to CUDA index (x86_64)
RUN if ls /tmp/wheels/paddlepaddle*.whl 1>/dev/null 2>&1; then \
echo "=== Installing PaddlePaddle from local wheel (ARM64) ===" && \
python -m pip install --no-cache-dir /tmp/wheels/paddlepaddle*.whl; \
else \
echo "=== Installing PaddlePaddle from CUDA index (x86_64) ===" && \
python -m pip install --no-cache-dir paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/; \
fi && \
rm -rf /tmp/wheels
# Install remaining dependencies
RUN python -m pip install --no-cache-dir \
paddleocr==3.3.2 \
jiwer \
numpy \
fastapi \
"uvicorn[standard]" \
pydantic \
Pillow
# =============================================================================
# STAGE 2: DEPLOY - Application code (changes frequently)
# =============================================================================
FROM base AS deploy
LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version"
WORKDIR /app
# Copy application code (this is the only layer that changes frequently)
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .
# Build arguments for models
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec
# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}
# Volume for dataset and model cache
VOLUME ["/app/dataset", "/root/.paddlex"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

src/paddle_ocr/README.md (824 added lines)

@@ -0,0 +1,824 @@
# PaddleOCR Tuning REST API
REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
## Quick Start with Docker Compose
Docker Compose manages building and running the containers. The `docker-compose.yml` defines two main OCR services (plus a `build-paddle` wheel builder and a `test` client):
- `ocr-cpu` - CPU-only version (works everywhere)
- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit)
### Run CPU Version
```bash
cd src/paddle_ocr
# Build and start (first time takes ~2-3 min to build, ~30s to load model)
docker compose up ocr-cpu
# Or run in background (detached)
docker compose up -d ocr-cpu
# View logs
docker compose logs -f ocr-cpu
# Stop
docker compose down
```
### Run GPU Version
```bash
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
docker compose up ocr-gpu
```
### Test the API
Once running, test with:
```bash
# Check health
curl http://localhost:8000/health
# Or use the test script
pip install requests
python test.py --url http://localhost:8000
```
### What Docker Compose Does
```
docker compose up ocr-cpu
├─► Builds image from Dockerfile.cpu (if not exists)
├─► Creates container "paddle-ocr-cpu"
├─► Mounts ../dataset → /app/dataset (your PDF images)
├─► Mounts paddlex-cache volume (persists downloaded models)
├─► Exposes port 8000
└─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
```
## Files
| File | Description |
|------|-------------|
| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
| `dataset_manager.py` | Dataset loader |
| `test.py` | API test client |
| `Dockerfile.cpu` | CPU-only image (x86_64 + ARM64 with local wheel) |
| `Dockerfile.gpu` | GPU/CUDA image (x86_64 + ARM64 with local wheel) |
| `Dockerfile.build-paddle` | PaddlePaddle GPU wheel builder for ARM64 |
| `Dockerfile.build-paddle-cpu` | PaddlePaddle CPU wheel builder for ARM64 |
| `docker-compose.yml` | Service orchestration |
| `docker-compose.cpu-registry.yml` | Pull CPU image from registry |
| `docker-compose.gpu-registry.yml` | Pull GPU image from registry |
| `wheels/` | Local PaddlePaddle wheels (created by build-paddle) |
## API Endpoints
### `GET /health`
Check if service is ready.
```json
{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24}
```
### `POST /evaluate`
Run OCR evaluation with given hyperparameters.
**Request:**
```json
{
"pdf_folder": "/app/dataset",
"textline_orientation": true,
"use_doc_orientation_classify": false,
"use_doc_unwarping": false,
"text_det_thresh": 0.469,
"text_det_box_thresh": 0.5412,
"text_det_unclip_ratio": 0.0,
"text_rec_score_thresh": 0.635,
"start_page": 5,
"end_page": 10
}
```
**Response:**
```json
{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1}
```
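A minimal Python client for `/evaluate` (a sketch; it assumes the service is reachable on `localhost:8000` and only sets a subset of the fields shown above, the rest fall back to their defaults):
```python
# Hedged sketch: query /health, then run one evaluation via /evaluate.
import requests

BASE_URL = "http://localhost:8000"  # adjust if the container maps a different host port

print(requests.get(f"{BASE_URL}/health", timeout=10).json())

payload = {
    "pdf_folder": "/app/dataset",
    "text_det_thresh": 0.469,
    "text_det_box_thresh": 0.5412,
    "text_rec_score_thresh": 0.635,
    "start_page": 5,
    "end_page": 10,
}
resp = requests.post(f"{BASE_URL}/evaluate", json=payload, timeout=600)
resp.raise_for_status()
metrics = resp.json()  # {"CER": ..., "WER": ..., "TIME": ..., "PAGES": ..., "TIME_PER_PAGE": ...}
print(f"CER={metrics['CER']:.4f}  WER={metrics['WER']:.4f}  time/page={metrics['TIME_PER_PAGE']:.1f}s")
```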
### `POST /evaluate_full`
Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page).
## Debug Output (debugset)
The `debugset` folder allows saving OCR predictions for debugging and analysis. When `save_output=True` is passed to `/evaluate`, predictions are written to `/app/debugset`.
### Enable Debug Output
```json
{
"pdf_folder": "/app/dataset",
"save_output": true,
"start_page": 5,
"end_page": 10
}
```
### Output Structure
```
debugset/
├── doc1/
│ └── paddle_ocr/
│ ├── page_0005.txt
│ ├── page_0006.txt
│ └── ...
├── doc2/
│ └── paddle_ocr/
│ └── ...
```
Each `.txt` file contains the OCR-extracted text for that page.
### Docker Mount
The `debugset` folder is mounted read-write in docker-compose:
```yaml
volumes:
- ../debugset:/app/debugset:rw
```
### Use Cases
- **Compare OCR engines**: Run the same pages through PaddleOCR, DocTR, EasyOCR with `save_output=True`, then diff the results
- **Debug hyperparameters**: See how different settings affect text extraction
- **Ground truth comparison**: Compare predictions against the expected output (see the sketch after this list)
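For example, the saved predictions can be scored against the ground-truth text with `jiwer` (already a dependency of the service). This is a sketch; it assumes the layout shown above, with predictions under `debugset/<doc>/paddle_ocr/` and references under `dataset/<doc>/txt/`:
```python
# Hedged sketch: score saved OCR predictions against dataset ground truth with jiwer.
from pathlib import Path
from jiwer import cer, wer

DATASET = Path("dataset")    # <doc>/txt/page_XXXX.txt holds the reference text
DEBUGSET = Path("debugset")  # <doc>/<engine>/page_XXXX.txt holds the predictions
ENGINE = "paddle_ocr"        # subfolder written by the engine under test

for pred_path in sorted(DEBUGSET.glob(f"*/{ENGINE}/page_*.txt")):
    doc = pred_path.parts[-3]                      # e.g. "doc1"
    ref_path = DATASET / doc / "txt" / pred_path.name
    if not ref_path.exists():
        continue
    reference = ref_path.read_text(encoding="utf-8")
    prediction = pred_path.read_text(encoding="utf-8")
    print(f"{doc}/{pred_path.name}: CER={cer(reference, prediction):.4f}  WER={wer(reference, prediction):.4f}")
```
Swapping `ENGINE` to another engine's subfolder gives directly comparable scores for the same pages.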
## Building Images
### CPU Image (Multi-Architecture)
```bash
# Local build (current architecture)
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
# Multi-arch build with buildx (amd64 + arm64)
docker buildx create --name multiarch --use
docker buildx build -f Dockerfile.cpu \
--platform linux/amd64,linux/arm64 \
-t paddle-ocr-api:cpu \
--push .
```
### GPU Image (x86_64 + ARM64 with local wheel)
```bash
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
```
> **Note:** PaddlePaddle GPU 3.x packages are **not on PyPI**. The Dockerfile installs from PaddlePaddle's official CUDA index (`paddlepaddle.org.cn/packages/stable/cu126/`). This is handled automatically during build.
## Running
### CPU (Any machine)
```bash
docker run -d -p 8000:8000 \
-v $(pwd)/../dataset:/app/dataset:ro \
-v paddlex-cache:/root/.paddlex \
paddle-ocr-api:cpu
```
### GPU (NVIDIA)
```bash
docker run -d -p 8000:8000 --gpus all \
-v $(pwd)/../dataset:/app/dataset:ro \
-v paddlex-cache:/root/.paddlex \
paddle-ocr-api:gpu
```
## GPU Support Analysis
### Host System Reference (DGX Spark)
This section documents GPU support findings based on testing on an NVIDIA DGX Spark:
| Component | Value |
|-----------|-------|
| Architecture | ARM64 (aarch64) |
| CPU | NVIDIA Grace (ARM) |
| GPU | NVIDIA GB10 |
| CUDA Version | 13.0 |
| Driver | 580.95.05 |
| OS | Ubuntu 24.04 LTS |
| Container Toolkit | nvidia-container-toolkit 1.18.1 |
| Docker | 28.5.1 |
| Docker Compose | v2.40.0 |
### PaddlePaddle GPU Platform Support
**Note:** PaddlePaddle-GPU does NOT have prebuilt ARM64 wheels on PyPI, but ARM64 support is available via custom-built wheels.
| Platform | CPU | GPU |
|----------|-----|-----|
| Linux x86_64 | ✅ | ✅ CUDA 10.2/11.x/12.x |
| Windows x64 | ✅ | ✅ CUDA 10.2/11.x/12.x |
| macOS x64 | ✅ | ❌ |
| macOS ARM64 (M1/M2) | ✅ | ❌ |
| Linux ARM64 (Jetson/DGX) | ✅ | ⚠️ Limited - see Blackwell note |
**Source:** [PaddlePaddle-GPU PyPI](https://pypi.org/project/paddlepaddle-gpu/) - only `manylinux_x86_64` and `win_amd64` wheels available on PyPI. ARM64 wheels must be built from source or downloaded from Gitea packages.
### ARM64 GPU Support
ARM64 GPU support is available but requires custom-built wheels:
1. **No prebuilt PyPI wheels**: `pip install paddlepaddle-gpu` fails on ARM64 - no compatible wheels exist on PyPI
2. **Custom wheels work**: This project provides Dockerfiles to build ARM64 GPU wheels from source
3. **CI/CD builds ARM64 GPU images**: Pre-built wheels are available from Gitea packages
**To use GPU on ARM64:**
- Use the pre-built images from the container registry, or
- Build the wheel locally using `Dockerfile.build-paddle` (see Option 2 below), or
- Download the wheel from Gitea packages: `wheels/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl`
### ⚠️ Known Limitation: Blackwell GPU (sm_121 / GB10)
**Status: GPU inference does NOT work on NVIDIA Blackwell GPUs (DGX Spark, GB200, etc.)**
#### Symptoms
When running PaddleOCR on Blackwell GPUs:
- CUDA loads successfully ✅
- Basic tensor operations work ✅
- **Detection model outputs constant values** ❌
- 0 text regions detected
- CER/WER = 100% (nothing recognized)
#### Root Cause
**Confirmed:** PaddlePaddle's entire CUDA backend does NOT support Blackwell (sm_121). This is NOT just an inference model problem - even basic operations fail.
**Test Results (January 2026):**
1. **PTX JIT Test** (`CUDA_FORCE_PTX_JIT=1`):
```
OSError: CUDA error(209), no kernel image is available for execution on the device.
[Hint: 'cudaErrorNoKernelImageForDevice']
```
→ Confirmed: No PTX code exists in PaddlePaddle binaries
2. **Dynamic Graph Mode Test** (bypassing inference models):
```
Conv2D + BatchNorm output:
Output min: 0.0000
Output max: 0.0000
Output mean: 0.0000
Dynamic graph mode: BROKEN (constant output)
```
→ Confirmed: Even simple nn.Conv2D produces zeros on Blackwell
**Conclusion:** The issue is PaddlePaddle's compiled CUDA kernels (cubins), not just the inference models. The entire framework was compiled without sm_121 support and without PTX for JIT compilation.
**Why building PaddlePaddle from source doesn't fix it:**
1. ⚠️ Building with `CUDA_ARCH=121` requires CUDA 13.0+ (PaddlePaddle only supports up to CUDA 12.6)
2. ❌ Even if you could build it, PaddleOCR models contain pre-compiled CUDA ops
3. ❌ These model files were exported/compiled targeting sm_80/sm_90 architectures
4. ❌ The model kernels execute on GPU but produce garbage output on sm_121
**To truly fix this**, the PaddlePaddle team would need to:
1. Add sm_121 to their model export pipeline
2. Re-export all PaddleOCR models (PP-OCRv4, PP-OCRv5, etc.) with Blackwell support
3. Release new model versions
This is tracked in [GitHub Issue #17327](https://github.com/PaddlePaddle/PaddleOCR/issues/17327).
#### Debug Script
Use the included debug script to verify this issue:
```bash
docker exec paddle-ocr-gpu python /app/scripts/debug_gpu_detection.py /app/dataset/0/img/page_0001.png
```
Expected output showing the problem:
```
OUTPUT ANALYSIS:
Shape: (1, 1, 640, 640)
Min: 0.000010
Max: 0.000010 # <-- Same as min = constant output
Mean: 0.000010
DIAGNOSIS:
PROBLEM: Output is constant - model inference is broken!
This typically indicates GPU compute capability mismatch.
```
#### Workarounds
1. **Use CPU mode** (recommended):
```bash
docker compose up ocr-cpu
```
The ARM Grace CPU is fast (~2-5 sec/page). This is the reliable option.
2. **Use EasyOCR or DocTR with GPU**:
These use PyTorch which has official ARM64 CUDA wheels (cu128 index):
```bash
# EasyOCR with GPU on DGX Spark
docker build -f ../easyocr_service/Dockerfile.gpu -t easyocr-gpu ../easyocr_service
docker run --gpus all -p 8002:8000 easyocr-gpu
```
3. **Wait for PaddlePaddle Blackwell support**:
Track [GitHub Issue #17327](https://github.com/PaddlePaddle/PaddleOCR/issues/17327) for updates.
#### GPU Support Matrix (Updated)
| GPU Architecture | Compute | CPU | GPU |
|------------------|---------|-----|-----|
| Ampere (A100, A10) | sm_80 | ✅ | ✅ |
| Hopper (H100, H200) | sm_90 | ✅ | ✅ |
| **Blackwell (GB10, GB200)** | sm_121 | ✅ | ❌ Not supported |
#### FAQ: Why Doesn't CUDA Backward Compatibility Work?
**Q: CUDA normally runs older kernels on newer GPUs. Why doesn't this work for Blackwell?**
Per [NVIDIA Blackwell Compatibility Guide](https://docs.nvidia.com/cuda/blackwell-compatibility-guide/):
CUDA **can** run older code on newer GPUs via **PTX JIT compilation**:
1. PTX (Parallel Thread Execution) is NVIDIA's intermediate representation
2. If an app includes PTX code, the driver JIT-compiles it for the target GPU
3. This allows sm_80 code to run on sm_121
**The problem**: PaddleOCR inference models contain only pre-compiled **cubins** (SASS binary), not PTX. Without PTX, there's nothing to JIT-compile.
We tested PTX JIT (January 2026):
```bash
# Force PTX JIT compilation
docker run --gpus all -e CUDA_FORCE_PTX_JIT=1 paddle-ocr-gpu \
python /app/scripts/debug_gpu_detection.py /app/dataset/0/img/page_0001.png
# Result:
# OSError: CUDA error(209), no kernel image is available for execution on the device.
```
**Confirmed: No PTX exists** in PaddlePaddle binaries. The CUDA kernels are cubins-only (SASS binary), compiled for sm_80/sm_90 without PTX fallback.
**Note on sm_121**: Per NVIDIA docs, "sm_121 is the same as sm_120 since the only difference is physically integrated CPU+GPU memory of Spark." The issue is general Blackwell (sm_12x) support, not Spark-specific.
#### FAQ: Does Dynamic Graph Mode Work on Blackwell?
**Q: Can I bypass inference models and use PaddlePaddle's dynamic graph mode?**
**No.** We tested dynamic graph mode (January 2026):
```bash
# Test script runs: paddle.nn.Conv2D + paddle.nn.BatchNorm2D
python /app/scripts/test_dynamic_mode.py
# Result:
# Input shape: [1, 3, 224, 224]
# Output shape: [1, 64, 112, 112]
# Output min: 0.0000
# Output max: 0.0000 # <-- All zeros!
# Output mean: 0.0000
# Dynamic graph mode: BROKEN (constant output)
```
**Conclusion:** The problem isn't limited to inference models. PaddlePaddle's core CUDA kernels (Conv2D, BatchNorm, etc.) produce garbage on sm_121. The entire framework lacks Blackwell support.
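For reference, a check of this kind can be reproduced with a few lines of the standard `paddle.nn` API (a sketch, not the exact contents of the bundled `test_dynamic_mode.py`):
```python
# Hedged sketch: verify that basic dynamic-graph ops produce non-constant output on the GPU.
import paddle

paddle.device.set_device("gpu:0")

x = paddle.randn([1, 3, 224, 224])
conv = paddle.nn.Conv2D(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3)
bn = paddle.nn.BatchNorm2D(64)
y = bn(conv(x))

y_min, y_max = float(y.min()), float(y.max())
print(f"Output min: {y_min:.4f}  max: {y_max:.4f}  mean: {float(y.mean()):.4f}")
print("BROKEN (constant output)" if y_min == y_max else "OK (non-constant output)")
```
On a healthy GPU the convolution output has a non-zero spread; on sm_121 it collapses to a constant, as shown above.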
#### FAQ: Can I Run AMD64 Containers on ARM64 DGX Spark?
**Q: Can I just run the working x86_64 GPU image via emulation?**
**Short answer: Yes for CPU, No for GPU.**
You can run amd64 containers via QEMU emulation:
```bash
# Install QEMU
sudo apt-get install qemu binfmt-support qemu-user-static
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
# Run amd64 container
docker run --platform linux/amd64 paddle-ocr-gpu:amd64 ...
```
**But GPU doesn't work:**
- QEMU emulates CPU instructions (x86 → ARM)
- **QEMU user-mode does NOT support GPU passthrough**
- GPU calls from emulated x86 code cannot reach the ARM64 GPU
So even if the amd64 image works on x86_64:
- ❌ No GPU access through QEMU
- ❌ CPU emulation is 10-100x slower than native ARM64
- ❌ Defeats the purpose entirely
| Approach | CPU | GPU | Speed |
|----------|-----|-----|-------|
| ARM64 native (CPU) | ✅ | N/A | Fast (~2-5s/page) |
| ARM64 native (GPU) | ✅ | ❌ Blackwell issue | - |
| AMD64 via QEMU | ⚠️ Works | ❌ No passthrough | 10-100x slower |
### Options for ARM64 Systems
#### Option 1: CPU-Only (Recommended)
Use `Dockerfile.cpu` which works on ARM64:
```bash
# On DGX Spark
docker compose up ocr-cpu
# Or build directly
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
```
**Performance:** CPU inference on ARM64 Grace is surprisingly fast due to high core count. Expect ~2-5 seconds per page.
#### Option 2: Build PaddlePaddle from Source (Docker-based)
Use the included Docker builder to compile PaddlePaddle GPU for ARM64:
```bash
cd src/paddle_ocr
# Step 1: Build the PaddlePaddle GPU wheel (one-time, 2-4 hours)
docker compose --profile build run --rm build-paddle
# Verify wheel was created
ls -la wheels/paddlepaddle*.whl
# Step 2: Build the GPU image (uses local wheel)
docker compose build ocr-gpu
# Step 3: Run with GPU
docker compose up ocr-gpu
# Verify GPU is working
docker compose exec ocr-gpu python -c "import paddle; print(paddle.device.is_compiled_with_cuda())"
```
**What this does:**
1. `build-paddle` compiles PaddlePaddle from source inside a CUDA container
2. The wheel is saved to `./wheels/` directory
3. `Dockerfile.gpu` detects the local wheel and uses it instead of PyPI
**Caveats:**
- Build takes 2-4 hours on first run
- Requires ~20GB disk space during build
- Not officially supported by PaddlePaddle team
- May need adjustments for future PaddlePaddle versions
See: [GitHub Issue #17327](https://github.com/PaddlePaddle/PaddleOCR/issues/17327)
#### Option 3: Alternative OCR Engines
For ARM64 GPU acceleration, consider alternatives:
| Engine | ARM64 GPU | Notes |
|--------|-----------|-------|
| **Tesseract** | ❌ CPU-only | Good fallback, widely available |
| **EasyOCR** | ⚠️ Via PyTorch | PyTorch has ARM64 GPU support |
| **TrOCR** | ⚠️ Via Transformers | Hugging Face Transformers + PyTorch |
| **docTR** | ⚠️ Via TensorFlow/PyTorch | Both backends have ARM64 support |
EasyOCR with PyTorch is a viable alternative:
```bash
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
pip install easyocr
```
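A quick way to confirm GPU-backed OCR once those packages are installed (a sketch; the sample image path is a placeholder):
```python
# Hedged sketch: run EasyOCR on one page, using the GPU when CUDA is available.
import torch
import easyocr

use_gpu = torch.cuda.is_available()
print("CUDA available:", use_gpu)

reader = easyocr.Reader(["en"], gpu=use_gpu)               # downloads models on first use
results = reader.readtext("dataset/0/img/page_0001.png")   # placeholder path
for bbox, text, confidence in results:
    print(f"{confidence:.2f}  {text}")
```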
### x86_64 GPU Setup (Working)
For x86_64 systems with an NVIDIA GPU, the GPU Docker image works out of the box:
```bash
# Verify GPU is accessible
nvidia-smi
# Verify Docker GPU access
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
# Build and run GPU version
docker compose up ocr-gpu
```
### GPU Docker Compose Configuration
The `docker-compose.yml` configures GPU access via:
```yaml
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
```
This requires Docker Compose v2 and nvidia-container-toolkit.
## DGX Spark / ARM64 Quick Start
For ARM64 systems (DGX Spark, Jetson, Graviton), use CPU-only:
```bash
cd src/paddle_ocr
# Build ARM64-native CPU image
docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
# Run
docker run -d -p 8000:8000 \
-v $(pwd)/../dataset:/app/dataset:ro \
paddle-ocr-api:arm64
# Test
curl http://localhost:8000/health
```
### Cross-Compile from x86_64
Build ARM64 images from an x86_64 machine:
```bash
# Setup buildx for multi-arch
docker buildx create --name mybuilder --use
# Build ARM64 image from x86_64 machine
docker buildx build -f Dockerfile.cpu \
--platform linux/arm64 \
-t paddle-ocr-api:arm64 \
--load .
# Save and transfer to DGX Spark
docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
scp paddle-ocr-arm64.tar.gz dgx-spark:~/
# On DGX Spark:
docker load < paddle-ocr-arm64.tar.gz
```
## Using with Ray Tune
### Multi-Worker Setup for Parallel Trials
Run multiple workers for parallel hyperparameter tuning:
```bash
cd src/paddle_ocr
# Start CPU workers (ports 8001-8005; the compose file defines five per profile)
sudo docker compose -f docker-compose.workers.yml --profile cpu up -d
# Or for GPU workers (if supported)
sudo docker compose -f docker-compose.workers.yml --profile gpu up -d
# Check workers are healthy
curl http://localhost:8001/health
curl http://localhost:8002/health
```
Then run the notebook with `max_concurrent_trials` set to the number of workers you started (e.g., 2) so trials run in parallel, and point each trial at one of the worker URLs (see the sketch below).
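One simple way to spread trials across the workers is to pick the target URL inside the trainable itself. This is a naive sketch (it assumes two workers on ports 8001 and 8002; the per-model lock in the API serializes any requests that land on the same worker):
```python
# Hedged sketch: choose one of several OCR worker URLs per trial process.
import os

WORKER_URLS = [
    "http://localhost:8001/evaluate",
    "http://localhost:8002/evaluate",
]

def pick_worker_url() -> str:
    # Each Ray Tune trial runs in its own worker process, so the PID gives a
    # cheap, roughly even spread across the OCR workers.
    return WORKER_URLS[os.getpid() % len(WORKER_URLS)]
```
The single-worker trainable below can then call `pick_worker_url()` instead of the fixed `API_URL`.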
### Single Worker Setup
Update your notebook's `trainable_paddle_ocr` function:
```python
import requests
from ray import tune
API_URL = "http://localhost:8000/evaluate"
def trainable_paddle_ocr(config):
"""Call OCR API instead of subprocess."""
payload = {
"pdf_folder": "/app/dataset",
"use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
"use_doc_unwarping": config.get("use_doc_unwarping", False),
"textline_orientation": config.get("textline_orientation", True),
"text_det_thresh": config.get("text_det_thresh", 0.0),
"text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
"text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
"text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
}
try:
response = requests.post(API_URL, json=payload, timeout=600)
response.raise_for_status()
metrics = response.json()
tune.report(metrics=metrics)
except Exception as e:
tune.report({"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]})
```
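Wiring this trainable into a search looks roughly as follows (a sketch; the search-space bounds and sampler are illustrative, not the exact values used in the project):
```python
# Hedged sketch: minimize CER over 64 trials using the REST API trainable above.
from ray import tune

search_space = {
    "text_det_thresh": tune.uniform(0.0, 0.9),
    "text_det_box_thresh": tune.uniform(0.0, 0.9),
    "text_det_unclip_ratio": tune.uniform(0.5, 3.0),
    "text_rec_score_thresh": tune.uniform(0.0, 0.9),
    "textline_orientation": tune.choice([True, False]),
}

tuner = tune.Tuner(
    trainable_paddle_ocr,
    param_space=search_space,
    tune_config=tune.TuneConfig(metric="CER", mode="min", num_samples=64, max_concurrent_trials=2),
)
results = tuner.fit()
print(results.get_best_result().config)
```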
## Architecture: Model Lifecycle
The model is loaded **once** at container startup and stays in memory for all requests:
```mermaid
flowchart TB
subgraph Container["Docker Container Lifecycle"]
Start([Container Start]) --> Load[Load PaddleOCR Models<br/>~10-30s one-time cost]
Load --> Ready[API Ready<br/>Models in RAM ~500MB]
subgraph Requests["Incoming Requests - Models Stay Loaded"]
Ready --> R1[Request 1] --> Ready
Ready --> R2[Request 2] --> Ready
Ready --> RN[Request N...] --> Ready
end
Ready --> Stop([Container Stop])
Stop --> Free[Models Freed]
end
style Load fill:#f9f,stroke:#333
style Ready fill:#9f9,stroke:#333
style Requests fill:#e8f4ea,stroke:#090
```
**Subprocess vs REST API comparison:**
```mermaid
flowchart LR
subgraph Subprocess["❌ Subprocess Approach"]
direction TB
S1[Trial 1] --> L1[Load Model ~10s]
L1 --> E1[Evaluate ~60s]
E1 --> U1[Unload]
U1 --> S2[Trial 2]
S2 --> L2[Load Model ~10s]
L2 --> E2[Evaluate ~60s]
end
subgraph REST["✅ REST API Approach"]
direction TB
Start2[Start Container] --> Load2[Load Model ~10s]
Load2 --> Ready2[Model in Memory]
Ready2 --> T1[Trial 1 ~60s]
T1 --> Ready2
Ready2 --> T2[Trial 2 ~60s]
T2 --> Ready2
Ready2 --> TN[Trial N ~60s]
end
style L1 fill:#faa
style L2 fill:#faa
style Load2 fill:#afa
style Ready2 fill:#afa
```
## Performance Comparison
| Approach | Model Load | Per-Trial Overhead | 64 Trials |
|----------|------------|-------------------|-----------|
| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours |
| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours |
| **REST API** | **Once** | **~0.1s** | **~5.8 hours** |
The REST API saves ~1+ hour by loading the model only once.
## Troubleshooting
### Model download slow on first run
The first run downloads ~500MB of models. Use volume `paddlex-cache` to persist them.
### Out of memory
Reduce `max_concurrent_trials` in Ray Tune, or increase container memory:
```bash
docker run --memory=8g ...
```
### GPU not detected
Ensure NVIDIA Container Toolkit is installed:
```bash
nvidia-smi # Should work
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi # Should work
```
### PaddlePaddle GPU installation fails
PaddlePaddle 3.x GPU packages are **not available on PyPI**. They must be installed from PaddlePaddle's official index:
```bash
# For CUDA 12.x
pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
# For CUDA 11.8
pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
```
The Dockerfile.gpu handles this automatically.
## CI/CD Pipeline
The project includes a Gitea Actions workflow (`.gitea/workflows/ci.yaml`) for automated builds.
### What CI Builds
| Image | Architecture | Source |
|-------|--------------|--------|
| `paddle-ocr-cpu:amd64` | amd64 | PyPI paddlepaddle |
| `paddle-ocr-cpu:arm64` | arm64 | Pre-built wheel from Gitea packages |
| `paddle-ocr-gpu:amd64` | amd64 | PyPI paddlepaddle-gpu |
| `paddle-ocr-gpu:arm64` | arm64 | Pre-built wheel from Gitea packages |
### ARM64 Wheel Workflow
Since the PyPI wheels don't work on ARM64 (they contain x86 SSE instructions), the wheels are built from source with sse2neon and distributed as follows:
1. Built manually on an ARM64 machine (one-time)
2. Uploaded to Gitea generic packages
3. Downloaded by CI when building ARM64 images
#### Step 1: Build ARM64 Wheels (One-time, on ARM64 machine)
```bash
cd src/paddle_ocr
# Build GPU wheel (requires NVIDIA GPU, takes 1-2 hours)
sudo docker build -t paddle-builder:gpu-arm64 -f Dockerfile.build-paddle .
sudo docker run --rm -v ./wheels:/wheels paddle-builder:gpu-arm64
# Build CPU wheel (no GPU required, takes 1-2 hours)
sudo docker build -t paddle-builder:cpu-arm64 -f Dockerfile.build-paddle-cpu .
sudo docker run --rm -v ./wheels:/wheels paddle-builder:cpu-arm64
# Verify wheels were created
ls -la wheels/paddlepaddle*.whl
# paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl (GPU)
# paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl (CPU)
```
#### Step 2: Upload Wheels to Gitea Packages
```bash
export GITEA_TOKEN="your-token-here"
# Upload GPU wheel
curl -X PUT \
-H "Authorization: token $GITEA_TOKEN" \
--upload-file wheels/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl \
"https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-gpu-arm64/3.0.0/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl"
# Upload CPU wheel
curl -X PUT \
-H "Authorization: token $GITEA_TOKEN" \
--upload-file wheels/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl \
"https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-cpu-arm64/3.0.0/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl"
```
Wheels available at:
```
https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-gpu-arm64/3.0.0/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl
https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-cpu-arm64/3.0.0/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl
```
#### Step 3: CI Builds Images
CI automatically:
1. Downloads ARM64 wheels from Gitea packages (for arm64 builds only)
2. Builds both CPU and GPU images for amd64 and arm64
3. Pushes to registry with arch-specific tags
### Required CI Secrets
Configure these in Gitea repository settings:
| Secret | Description |
|--------|-------------|
| `CI_READWRITE` | Gitea token with registry read/write access |
### Manual Image Push
```bash
# Login to registry
docker login seryus.ddns.net
# Build and push CPU (multi-arch)
docker buildx build -f Dockerfile.cpu \
--platform linux/amd64,linux/arm64 \
-t seryus.ddns.net/unir/paddle-ocr-api:cpu \
--push .
# Build and push GPU (x86_64)
docker build -f Dockerfile.gpu -t seryus.ddns.net/unir/paddle-ocr-api:gpu-amd64 .
docker push seryus.ddns.net/unir/paddle-ocr-api:gpu-amd64
# Build and push GPU (ARM64) - requires wheel in wheels/
docker buildx build -f Dockerfile.gpu \
--platform linux/arm64 \
-t seryus.ddns.net/unir/paddle-ocr-api:gpu-arm64 \
--push .
```
### Updating the ARM64 Wheels
When PaddlePaddle releases a new version:
1. Update `PADDLE_VERSION` in `Dockerfile.build-paddle` and `Dockerfile.build-paddle-cpu`
2. Rebuild both wheels on an ARM64 machine
3. Upload to Gitea packages with new version
4. Update `PADDLE_VERSION` in `.gitea/workflows/ci.yaml`


@@ -0,0 +1,74 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
def __init__(self, root):
self.samples = []
for folder in sorted(os.listdir(root)):
sub = os.path.join(root, folder)
img_dir = os.path.join(sub, "img")
txt_dir = os.path.join(sub, "txt")
if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
continue
for fname in sorted(os.listdir(img_dir)):
if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
continue
img_path = os.path.join(img_dir, fname)
# text file must have same name but .txt
txt_name = os.path.splitext(fname)[0] + ".txt"
txt_path = os.path.join(txt_dir, txt_name)
if not os.path.exists(txt_path):
continue
self.samples.append((img_path, txt_path))
def __len__(self):
return len(self.samples)
def __getitem__(self, idx):
img_path, txt_path = self.samples[idx]
# Load image
image = Image.open(img_path).convert("RGB")
# Load text
with open(txt_path, "r", encoding="utf-8") as f:
text = f.read()
return image, text
def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"):
"""Get output path for saving OCR result to debugset folder.
Args:
idx: Sample index
output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text')
debugset_root: Root folder for debug output (default: /app/debugset)
Returns:
Path like /app/debugset/doc1/{output_subdir}/page_001.txt
"""
img_path, _ = self.samples[idx]
# img_path: /app/dataset/doc1/img/page_001.png
# Extract relative path: doc1/img/page_001.png
parts = img_path.split("/dataset/", 1)
if len(parts) == 2:
rel_path = parts[1] # doc1/img/page_001.png
else:
rel_path = os.path.basename(img_path)
# Replace /img/ with /{output_subdir}/
rel_parts = rel_path.rsplit("/img/", 1)
doc_folder = rel_parts[0] # doc1
fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt
out_dir = os.path.join(debugset_root, doc_folder, output_subdir)
os.makedirs(out_dir, exist_ok=True)
return os.path.join(out_dir, fname)


@@ -0,0 +1,26 @@
# docker-compose.cpu-registry.yml - Pull CPU image from registry
# Usage: docker compose -f docker-compose.cpu-registry.yml up
services:
ocr-cpu:
image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
container_name: paddle-ocr-cpu-registry
ports:
- "8001:8000"
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
volumes:
paddlex-cache:
name: paddlex-model-cache


@@ -0,0 +1,39 @@
# docker-compose.gpu-registry.yml - Pull GPU image from registry
# Usage: docker compose -f docker-compose.gpu-registry.yml up
#
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
services:
ocr-gpu:
image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
container_name: paddle-ocr-gpu-registry
ports:
- "8002:8000"
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
- ./scripts:/app/scripts:ro
environment:
- PYTHONUNBUFFERED=1
- CUDA_VISIBLE_DEVICES=0
- PADDLE_DET_MODEL=PP-OCRv5_mobile_det
- PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
volumes:
paddlex-cache:
name: paddlex-model-cache


@@ -0,0 +1,140 @@
# docker-compose.workers.yml - Multiple PaddleOCR workers for parallel Ray Tune
#
# Usage:
# GPU (5 workers sharing one GPU):
# docker compose -f docker-compose.workers.yml --profile gpu up
#
# CPU (5 workers):
# docker compose -f docker-compose.workers.yml --profile cpu up
#
# Start only a subset (e.g., 2 CPU workers):
# docker compose -f docker-compose.workers.yml --profile cpu up ocr-cpu-worker-1 ocr-cpu-worker-2
#
# Each worker runs on a separate port: 8001, 8002, 8003, 8004, 8005
x-ocr-gpu-common: &ocr-gpu-common
image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
- CUDA_VISIBLE_DEVICES=0
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 120s
x-ocr-cpu-common: &ocr-cpu-common
image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 120s
services:
# GPU Workers (gpu profile) - share single GPU
ocr-worker-1:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-1
ports:
- "8001:8000"
profiles:
- gpu
ocr-worker-2:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-2
ports:
- "8002:8000"
profiles:
- gpu
ocr-worker-3:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-3
ports:
- "8003:8000"
profiles:
- gpu
ocr-worker-4:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-4
ports:
- "8004:8000"
profiles:
- gpu
ocr-worker-5:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-5
ports:
- "8005:8000"
profiles:
- gpu
# CPU Workers (cpu profile) - for systems without GPU
ocr-cpu-worker-1:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-1
ports:
- "8001:8000"
profiles:
- cpu
ocr-cpu-worker-2:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-2
ports:
- "8002:8000"
profiles:
- cpu
ocr-cpu-worker-3:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-3
ports:
- "8003:8000"
profiles:
- cpu
ocr-cpu-worker-4:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-4
ports:
- "8004:8000"
profiles:
- cpu
ocr-cpu-worker-5:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-5
ports:
- "8005:8000"
profiles:
- cpu
volumes:
paddlex-cache:
name: paddlex-model-cache


@@ -0,0 +1,111 @@
# docker-compose.yml - PaddleOCR REST API
# Usage:
# CPU: docker compose up ocr-cpu
# GPU: docker compose up ocr-gpu
# Test: docker compose run --rm test
# Build: CUDA_ARCH=120 docker compose --profile build run --rm build-paddle
#
# Auto-detect CUDA arch before building:
# export CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
# docker compose --profile build run --rm build-paddle
services:
# PaddlePaddle GPU wheel builder (ARM64 only, one-time build)
# Creates ./wheels/paddlepaddle_gpu-*.whl for ARM64 GPU support
# CUDA_ARCH env var controls target GPU architecture (default: 120 for Blackwell base)
build-paddle:
build:
context: .
dockerfile: Dockerfile.build-paddle
args:
CUDA_ARCH: ${CUDA_ARCH:-120}
volumes:
- ./wheels:/wheels
profiles:
- build
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
# CPU-only service (works on any architecture)
ocr-cpu:
build:
context: .
dockerfile: Dockerfile.cpu
args:
# Models to bake into image (change before building):
DET_MODEL: PP-OCRv5_server_det
REC_MODEL: PP-OCRv5_server_rec
image: paddle-ocr-api:cpu
container_name: paddle-ocr-cpu
ports:
- "8000:8000"
volumes:
- ../dataset:/app/dataset:ro # Your dataset
- ../debugset:/app/debugset:rw # OCR debug output
- paddlex-cache:/root/.paddlex # For additional models at runtime
environment:
- PYTHONUNBUFFERED=1
# Override models at runtime (uncomment to use different models):
# - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
# - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
# GPU service (requires NVIDIA Container Toolkit)
ocr-gpu:
build:
context: .
dockerfile: Dockerfile.gpu
args:
DET_MODEL: PP-OCRv5_server_det
REC_MODEL: PP-OCRv5_server_rec
image: paddle-ocr-api:gpu
container_name: paddle-ocr-gpu
ports:
- "8000:8000"
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
- CUDA_VISIBLE_DEVICES=0
# Override models at runtime:
# - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
# - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
restart: unless-stopped
# Test client (runs once and exits)
test:
image: python:3.11-slim
container_name: paddle-ocr-test
depends_on:
ocr-cpu:
condition: service_healthy
volumes:
- ./test.py:/app/test.py:ro
working_dir: /app
command: >
sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset"
network_mode: "service:ocr-cpu"
volumes:
paddlex-cache:
name: paddlex-model-cache


@@ -0,0 +1,340 @@
# paddle_ocr_tuning_rest.py
# FastAPI REST service for PaddleOCR hyperparameter evaluation
# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
import os
import re
import time
import threading
from typing import Optional
from contextlib import asynccontextmanager
import numpy as np
import paddle
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from paddleocr import PaddleOCR
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
def get_gpu_info() -> dict:
"""Get GPU status information from PaddlePaddle."""
info = {
"cuda_available": paddle.device.is_compiled_with_cuda(),
"device": str(paddle.device.get_device()),
"gpu_count": 0,
"gpu_name": None,
"gpu_memory_total": None,
"gpu_memory_used": None,
}
if info["cuda_available"]:
try:
info["gpu_count"] = paddle.device.cuda.device_count()
if info["gpu_count"] > 0:
# Get GPU properties
props = paddle.device.cuda.get_device_properties(0)
info["gpu_name"] = props.name
info["gpu_memory_total"] = f"{props.total_memory / (1024**3):.2f} GB"
# Get current memory usage
mem_reserved = paddle.device.cuda.memory_reserved(0)
mem_allocated = paddle.device.cuda.memory_allocated(0)
info["gpu_memory_used"] = f"{mem_allocated / (1024**3):.2f} GB"
info["gpu_memory_reserved"] = f"{mem_reserved / (1024**3):.2f} GB"
except Exception as e:
info["gpu_error"] = str(e)
return info
# Model configuration via environment variables (with defaults)
DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det")
DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec")
# Global state for model and dataset
class AppState:
ocr: Optional[PaddleOCR] = None
dataset: Optional[ImageTextDataset] = None
dataset_path: Optional[str] = None
det_model: str = DEFAULT_DET_MODEL
rec_model: str = DEFAULT_REC_MODEL
lock: threading.Lock = None # Protects OCR model from concurrent access
def __init__(self):
self.lock = threading.Lock()
state = AppState()
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Load OCR model at startup."""
# Log GPU status
gpu_info = get_gpu_info()
print("=" * 50)
print("GPU STATUS")
print("=" * 50)
print(f" CUDA available: {gpu_info['cuda_available']}")
print(f" Device: {gpu_info['device']}")
if gpu_info['cuda_available']:
print(f" GPU count: {gpu_info['gpu_count']}")
print(f" GPU name: {gpu_info['gpu_name']}")
print(f" GPU memory total: {gpu_info['gpu_memory_total']}")
print("=" * 50)
print(f"Loading PaddleOCR models...")
print(f" Detection: {state.det_model}")
print(f" Recognition: {state.rec_model}")
state.ocr = PaddleOCR(
text_detection_model_name=state.det_model,
text_recognition_model_name=state.rec_model,
)
# Log GPU memory after model load
if gpu_info['cuda_available']:
gpu_after = get_gpu_info()
print(f" GPU memory after load: {gpu_after.get('gpu_memory_used', 'N/A')}")
print("Model loaded successfully!")
yield
# Cleanup on shutdown
state.ocr = None
state.dataset = None
app = FastAPI(
title="PaddleOCR Tuning API",
description="REST API for OCR hyperparameter evaluation",
version="1.0.0",
lifespan=lifespan,
)
class EvaluateRequest(BaseModel):
"""Request schema matching CLI arguments."""
pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification")
use_doc_unwarping: bool = Field(False, description="Use document unwarping")
textline_orientation: bool = Field(True, description="Use textline orientation classification")
text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold")
text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold")
text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient")
text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
end_page: int = Field(10, ge=1, description="End page index (exclusive)")
save_output: bool = Field(False, description="Save OCR predictions to debugset folder")
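# Example /evaluate payload matching the schema above (a sketch; the folder path is
# simply the schema default and the threshold values are illustrative, not tuned):
#   {
#     "pdf_folder": "/app/dataset",
#     "textline_orientation": true,
#     "text_det_box_thresh": 0.5,
#     "start_page": 5,
#     "end_page": 10
#   }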
class EvaluateResponse(BaseModel):
"""Response schema matching CLI output."""
CER: float
WER: float
TIME: float
PAGES: int
TIME_PER_PAGE: float
class HealthResponse(BaseModel):
status: str
model_loaded: bool
dataset_loaded: bool
dataset_size: Optional[int] = None
det_model: Optional[str] = None
rec_model: Optional[str] = None
# GPU info
cuda_available: Optional[bool] = None
device: Optional[str] = None
gpu_name: Optional[str] = None
gpu_memory_used: Optional[str] = None
gpu_memory_total: Optional[str] = None
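# Illustrative /health response for a working GPU deployment (placeholder values,
# not captured output):
#   {"status": "ok", "model_loaded": true, "dataset_loaded": false,
#    "cuda_available": true, "device": "gpu:0", "gpu_name": "<device name>", ...}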
def _normalize_box_xyxy(box):
"""Normalize bounding box to (x0, y0, x1, y1) format."""
if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
xs = [p[0] for p in box]
ys = [p[1] for p in box]
return min(xs), min(ys), max(xs), max(ys)
if isinstance(box, (list, tuple)):
if len(box) == 4:
x0, y0, x1, y1 = box
return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
if len(box) == 8:
xs = box[0::2]
ys = box[1::2]
return min(xs), min(ys), max(xs), max(ys)
raise ValueError(f"Unrecognized box format: {box!r}")
def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
"""
Robust line grouping for PaddleOCR outputs.
Normalizes boxes, groups by line, and returns assembled text.
"""
boxes_all = []
for item in paddleocr_predict:
res = item.json.get("res", {})
boxes = res.get("rec_boxes", []) or []
texts = res.get("rec_texts", []) or []
scores = res.get("rec_scores", None)
for i, (box, text) in enumerate(zip(boxes, texts)):
try:
x0, y0, x1, y1 = _normalize_box_xyxy(box)
except Exception:
continue
y_mid = 0.5 * (y0 + y1)
score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0
t = re.sub(r"\s+", " ", str(text)).strip()
if not t:
continue
boxes_all.append((x0, y0, x1, y1, y_mid, t, score))
if min_score > 0:
boxes_all = [b for b in boxes_all if b[6] >= min_score]
if not boxes_all:
return ""
# Adaptive line tolerance
heights = [b[3] - b[1] for b in boxes_all]
median_h = float(np.median(heights)) if heights else 20.0
line_tol = max(8.0, line_tol_factor * median_h)
# Sort by vertical mid, then x0
boxes_all.sort(key=lambda b: (b[4], b[0]))
# Group into lines
lines, cur, last_y = [], [], None
for x0, y0, x1, y1, y_mid, text, score in boxes_all:
if last_y is None or abs(y_mid - last_y) <= line_tol:
cur.append((x0, text))
else:
cur.sort(key=lambda t: t[0])
lines.append(" ".join(t[1] for t in cur))
cur = [(x0, text)]
last_y = y_mid
if cur:
cur.sort(key=lambda t: t[0])
lines.append(" ".join(t[1] for t in cur))
res = "\n".join(lines)
res = re.sub(r"\s+\n", "\n", res).strip()
return res
def evaluate_text(reference: str, prediction: str) -> dict:
"""Calculate WER and CER metrics."""
return {"WER": wer(reference, prediction), "CER": cer(reference, prediction)}
@app.get("/health", response_model=HealthResponse)
def health_check():
"""Check if the service is ready."""
gpu_info = get_gpu_info()
return HealthResponse(
status="ok" if state.ocr is not None else "initializing",
model_loaded=state.ocr is not None,
dataset_loaded=state.dataset is not None,
dataset_size=len(state.dataset) if state.dataset else None,
det_model=state.det_model,
rec_model=state.rec_model,
cuda_available=gpu_info.get("cuda_available"),
device=gpu_info.get("device"),
gpu_name=gpu_info.get("gpu_name"),
gpu_memory_used=gpu_info.get("gpu_memory_used"),
gpu_memory_total=gpu_info.get("gpu_memory_total"),
)
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
"""
Evaluate OCR with given hyperparameters.
Returns CER, WER, and timing metrics.
"""
if state.ocr is None:
raise HTTPException(status_code=503, detail="Model not loaded yet")
# Load or reload dataset if path changed
if state.dataset is None or state.dataset_path != request.pdf_folder:
if not os.path.isdir(request.pdf_folder):
raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
state.dataset = ImageTextDataset(request.pdf_folder)
state.dataset_path = request.pdf_folder
if len(state.dataset) == 0:
raise HTTPException(status_code=400, detail="Dataset is empty")
# Validate page range
start = request.start_page
end = min(request.end_page, len(state.dataset))
if start >= end:
raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
# Lock to prevent concurrent OCR access (model is not thread-safe)
with state.lock:
for idx in range(start, end):
img, ref = state.dataset[idx]
arr = np.array(img)
tp0 = time.time()
out = state.ocr.predict(
arr,
use_doc_orientation_classify=request.use_doc_orientation_classify,
use_doc_unwarping=request.use_doc_unwarping,
use_textline_orientation=request.textline_orientation,
text_det_thresh=request.text_det_thresh,
text_det_box_thresh=request.text_det_box_thresh,
text_det_unclip_ratio=request.text_det_unclip_ratio,
text_rec_score_thresh=request.text_rec_score_thresh,
)
pred = assemble_from_paddle_result(out)
time_per_page_list.append(float(time.time() - tp0))
# Save prediction to debugset if requested
if request.save_output:
out_path = state.dataset.get_output_path(idx, "paddle_text")
with open(out_path, "w", encoding="utf-8") as f:
f.write(pred)
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
return EvaluateResponse(
CER=float(np.mean(cer_list)) if cer_list else 1.0,
WER=float(np.mean(wer_list)) if wer_list else 1.0,
TIME=float(time.time() - t0),
PAGES=len(cer_list),
TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
)
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
"""Evaluate on ALL pages (ignores start_page/end_page)."""
request.start_page = 0
request.end_page = 9999 # Will be clamped to dataset size
return evaluate(request)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
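# Quick manual check once the server is running (a sketch; port 8000 matches the
# uvicorn call above, adjust if the container maps it to another host port):
#   curl http://localhost:8000/health
#   curl -X POST http://localhost:8000/evaluate \
#        -H "Content-Type: application/json" \
#        -d '{"pdf_folder": "/app/dataset", "start_page": 5, "end_page": 10}'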

View File

@@ -0,0 +1,22 @@
# PaddleOCR REST API - GPU Requirements
# Install: pip install -r requirements-gpu.txt
# PaddlePaddle (GPU version with CUDA)
paddlepaddle-gpu==3.2.0
# PaddleOCR
paddleocr==3.3.2
# OCR evaluation metrics
jiwer
# Numerical computing
numpy
# REST API framework
fastapi
uvicorn[standard]
pydantic
# Image processing
Pillow

View File

@@ -0,0 +1,22 @@
# PaddleOCR REST API - CPU Requirements
# Install: pip install -r requirements.txt
# PaddlePaddle (CPU version)
paddlepaddle==3.2.2
# PaddleOCR
paddleocr==3.3.2
# OCR evaluation metrics
jiwer
# Numerical computing
numpy
# REST API framework
fastapi
uvicorn[standard]
pydantic
# Image processing (already pulled in by paddleocr; listed explicitly)
Pillow

View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Debug script for GPU OCR detection issues.
This script tests the raw inference output from PaddlePaddle detection models
to diagnose why detection might fail on certain GPU architectures (e.g., Blackwell/sm_121).
Usage:
docker exec paddle-ocr-gpu python /app/debug_gpu_detection.py [image_path]
Expected behavior:
- Working GPU: Output stats should show min close to 0, max close to 1, mean ~0.1-0.5
- Broken GPU: Output stats show constant values (e.g., min=max=mean=0.00001)
"""
import os
import sys
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
import numpy as np
import paddle
from PIL import Image
def check_gpu_status():
"""Check GPU availability and properties."""
print("=" * 60)
print("GPU STATUS")
print("=" * 60)
print(f"Device: {paddle.device.get_device()}")
print(f"CUDA compiled: {paddle.device.is_compiled_with_cuda()}")
if paddle.device.is_compiled_with_cuda():
print(f"GPU count: {paddle.device.cuda.device_count()}")
if paddle.device.cuda.device_count() > 0:
props = paddle.device.cuda.get_device_properties(0)
print(f"GPU name: {props.name}")
print(f"Compute capability: {props.major}.{props.minor}")
print(f"Total memory: {props.total_memory / (1024**3):.2f} GB")
print()
def test_basic_ops():
"""Test basic GPU tensor operations."""
print("=" * 60)
print("BASIC GPU OPERATIONS")
print("=" * 60)
# Test tensor creation
x = paddle.randn([2, 3])
print(f"Tensor place: {x.place}")
# Test conv2d
x = paddle.randn([1, 3, 64, 64])
conv = paddle.nn.Conv2D(3, 16, 3, padding=1)
y = conv(x)
print(f"Conv2d output shape: {y.shape}, place: {y.place}")
# Test softmax
s = paddle.nn.functional.softmax(y, axis=1)
print(f"Softmax output shape: {s.shape}")
print("Basic operations: OK")
print()
def test_detection_model(image_path: str):
"""Test detection model raw output."""
print("=" * 60)
print("DETECTION MODEL TEST")
print("=" * 60)
from paddle.inference import Config, create_predictor
model_dir = '/root/.paddlex/official_models/PP-OCRv4_mobile_det'
inference_file = f'{model_dir}/inference.json'
params_file = f'{model_dir}/inference.pdiparams'
if not os.path.exists(inference_file):
print(f"Model not found at {model_dir}")
print("Run PaddleOCR once to download models first.")
return
# Create config
config = Config()
config.set_prog_file(inference_file)
config.set_params_file(params_file)
config.enable_use_gpu(1024, 0)
print("Creating predictor...")
predictor = create_predictor(config)
# Get input/output names
input_names = predictor.get_input_names()
output_names = predictor.get_output_names()
print(f"Input names: {input_names}")
print(f"Output names: {output_names}")
# Load and preprocess image
    img = Image.open(image_path).convert("RGB")  # force 3 channels; a PNG may carry an alpha channel
    img = img.resize((640, 640))
arr = np.array(img).astype('float32')
arr = arr / 255.0
arr = arr.transpose(2, 0, 1)[np.newaxis, ...] # NCHW
print(f"Input tensor shape: {arr.shape}")
# Set input
input_handle = predictor.get_input_handle(input_names[0])
input_handle.reshape(arr.shape)
input_handle.copy_from_cpu(arr)
# Run prediction
print("Running inference...")
predictor.run()
# Get output
output_handle = predictor.get_output_handle(output_names[0])
output = output_handle.copy_to_cpu()
print()
print("OUTPUT ANALYSIS:")
print(f" Shape: {output.shape}")
print(f" Min: {output.min():.6f}")
print(f" Max: {output.max():.6f}")
print(f" Mean: {output.mean():.6f}")
print(f" Std: {output.std():.6f}")
print(f" Has NaN: {np.isnan(output).any()}")
print(f" Has Inf: {np.isinf(output).any()}")
# Diagnosis
print()
print("DIAGNOSIS:")
if output.min() == output.max():
print(" PROBLEM: Output is constant - model inference is broken!")
print(" This typically indicates GPU compute capability mismatch.")
print(" GB10 (sm_121) may need CUDA 13.0+ for native support.")
elif output.max() < 0.01:
print(" PROBLEM: Output values too low - detection will find nothing.")
elif np.isnan(output).any() or np.isinf(output).any():
print(" PROBLEM: Output contains NaN/Inf - numerical instability.")
else:
print(" OK: Output values look reasonable.")
print(f" Detection threshold typically 0.3-0.6, max output is {output.max():.3f}")
def test_paddleocr_output(image_path: str):
"""Test full PaddleOCR pipeline."""
print()
print("=" * 60)
print("PADDLEOCR PIPELINE TEST")
print("=" * 60)
from paddleocr import PaddleOCR
ocr = PaddleOCR(
text_detection_model_name='PP-OCRv4_mobile_det',
text_recognition_model_name='PP-OCRv4_mobile_rec',
)
img = Image.open(image_path)
arr = np.array(img)
out = ocr.predict(arr)
res = out[0].json['res']
dt_polys = res.get('dt_polys', [])
rec_texts = res.get('rec_texts', [])
print(f"Detection polygons: {len(dt_polys)}")
print(f"Recognition texts: {len(rec_texts)}")
if rec_texts:
print(f"Sample texts: {rec_texts[:5]}")
else:
print("No text detected!")
def main():
# Default test image
image_path = '/app/dataset/0/img/page_0001.png'
if len(sys.argv) > 1:
image_path = sys.argv[1]
if not os.path.exists(image_path):
print(f"Image not found: {image_path}")
print("Usage: python debug_gpu_detection.py [image_path]")
sys.exit(1)
print(f"Testing with image: {image_path}")
print()
check_gpu_status()
test_basic_ops()
test_detection_model(image_path)
test_paddleocr_output(image_path)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,207 @@
#!/usr/bin/env python3
"""
Test PaddleOCR in dynamic graph mode (not inference mode).
Dynamic mode compiles kernels at runtime, which may work on Blackwell.
Inference mode uses pre-compiled kernels which fail on sm_121.
Usage:
python test_dynamic_mode.py [image_path]
"""
import os
import sys
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
# Force dynamic graph mode
os.environ['FLAGS_enable_pir_api'] = '0'
import numpy as np
import paddle
from PIL import Image
def check_gpu():
"""Check GPU status."""
print("=" * 60)
print("GPU STATUS")
print("=" * 60)
print(f"Device: {paddle.device.get_device()}")
print(f"CUDA compiled: {paddle.device.is_compiled_with_cuda()}")
if paddle.device.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0:
props = paddle.device.cuda.get_device_properties(0)
print(f"GPU: {props.name} (sm_{props.major}{props.minor})")
print(f"Memory: {props.total_memory / (1024**3):.1f} GB")
print()
def test_paddleocr_dynamic(image_path: str):
"""Test PaddleOCR with dynamic execution."""
print("=" * 60)
print("PADDLEOCR DYNAMIC MODE TEST")
print("=" * 60)
# Import PaddleOCR
from paddleocr import PaddleOCR
# Try to force dynamic mode by setting use_static=False if available
# or by using the model in eval mode directly
print("Creating PaddleOCR instance...")
print("(This may download models on first run)")
try:
# Create OCR instance - this might still use inference internally
ocr = PaddleOCR(
text_detection_model_name='PP-OCRv4_mobile_det',
text_recognition_model_name='PP-OCRv4_mobile_rec',
use_angle_cls=False, # Simplify
lang='es',
)
# Load image
img = Image.open(image_path)
arr = np.array(img)
print(f"Image shape: {arr.shape}")
# Run prediction
print("Running OCR prediction...")
result = ocr.predict(arr)
# Parse results
res = result[0].json['res']
dt_polys = res.get('dt_polys', [])
rec_texts = res.get('rec_texts', [])
print()
print("RESULTS:")
print(f" Detected boxes: {len(dt_polys)}")
print(f" Recognized texts: {len(rec_texts)}")
if rec_texts:
print(f" First 5 texts: {rec_texts[:5]}")
return True
else:
print(" WARNING: No text recognized!")
return False
except Exception as e:
print(f"ERROR: {e}")
return False
def test_paddle_dynamic_model():
"""Test loading a paddle model in dynamic graph mode."""
print()
print("=" * 60)
print("PADDLE DYNAMIC GRAPH TEST")
print("=" * 60)
# Ensure we're in dynamic mode
paddle.disable_static()
# Test a simple model forward pass
print("Testing dynamic graph execution...")
# Create a simple ResNet-like block
x = paddle.randn([1, 3, 224, 224])
# Conv -> BN -> ReLU
conv = paddle.nn.Conv2D(3, 64, 7, stride=2, padding=3)
bn = paddle.nn.BatchNorm2D(64)
# Forward pass (dynamic mode - compiles at runtime)
y = conv(x)
y = bn(y)
y = paddle.nn.functional.relu(y)
print(f"Input shape: {x.shape}")
print(f"Output shape: {y.shape}")
print(f"Output min: {y.min().item():.4f}")
print(f"Output max: {y.max().item():.4f}")
print(f"Output mean: {y.mean().item():.4f}")
if y.min() != y.max():
print("Dynamic graph mode: WORKING")
return True
else:
print("Dynamic graph mode: BROKEN (constant output)")
return False
def test_ppocr_model_direct():
"""Try loading PPOCRv4 model directly in dynamic mode."""
print()
print("=" * 60)
print("PPOCR MODEL DIRECT LOAD TEST")
print("=" * 60)
try:
# Try to import ppocr modules directly
# This bypasses the inference predictor
from paddleocr.ppocr.modeling.architectures import build_model
from paddleocr.ppocr.postprocess import build_post_process
from paddleocr.ppocr.utils.save_load import load_model
print("Direct model import available")
# Note: This approach requires model config files
# which may or may not be bundled with paddleocr
except ImportError as e:
print(f"Direct model import not available: {e}")
print("PaddleOCR may only support inference mode")
return False
def main():
# Default test image
image_path = '/app/dataset/0/img/page_0001.png'
if len(sys.argv) > 1:
image_path = sys.argv[1]
if not os.path.exists(image_path):
print(f"Image not found: {image_path}")
sys.exit(1)
print(f"Testing with image: {image_path}")
print()
check_gpu()
# Test 1: Basic dynamic graph
dynamic_works = test_paddle_dynamic_model()
if not dynamic_works:
print("\nDynamic graph mode is broken - GPU likely unsupported")
sys.exit(1)
# Test 2: Direct model load
test_ppocr_model_direct()
# Test 3: PaddleOCR pipeline
ocr_works = test_paddleocr_dynamic(image_path)
print()
print("=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Dynamic graph mode: {'WORKS' if dynamic_works else 'BROKEN'}")
print(f"PaddleOCR pipeline: {'WORKS' if ocr_works else 'BROKEN'}")
if dynamic_works and not ocr_works:
print()
print("DIAGNOSIS: Dynamic mode works but PaddleOCR fails.")
print("This means PaddleOCR internally uses inference predictor")
print("which has pre-compiled kernels without Blackwell support.")
print()
print("Potential solutions:")
print("1. Modify PaddleOCR to use dynamic mode")
print("2. Use ONNX export + ONNXRuntime")
print("3. Wait for PaddlePaddle Blackwell support")
if __name__ == '__main__':
main()
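# If the summary points at the inference predictor, option 2 above (ONNX export +
# ONNXRuntime) could be sketched roughly as follows; the paddle2onnx flags are an
# assumption, so check `paddle2onnx --help` for the installed version:
#   paddle2onnx --model_dir /root/.paddlex/official_models/PP-OCRv4_mobile_det \
#               --save_file det_model.onnx
#   python -c "import onnxruntime as ort; print(ort.get_available_providers())"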

View File

@@ -0,0 +1,69 @@
#!/bin/bash
# Upload PaddlePaddle ARM64 wheel to Gitea generic packages
#
# Usage:
# ./scripts/upload-wheel.sh [wheel_file] [token]
#
# Environment variables (alternative to arguments):
# GITEA_TOKEN - Gitea API token
# WHEEL_FILE - Path to wheel file (default: auto-detect in wheels/)
set -e
GITEA_URL="https://seryus.ddns.net"
GITEA_ORG="unir"
PACKAGE_NAME="paddlepaddle-gpu-arm64"
# Get wheel file
WHEEL_FILE="${1:-${WHEEL_FILE:-$(ls wheels/paddlepaddle*.whl 2>/dev/null | head -1)}}"
if [ -z "$WHEEL_FILE" ] || [ ! -f "$WHEEL_FILE" ]; then
echo "Error: No wheel file found"
echo "Usage: $0 [wheel_file] [token]"
echo " or set WHEEL_FILE environment variable"
exit 1
fi
# Get token
TOKEN="${2:-${GITEA_TOKEN}}"
if [ -z "$TOKEN" ]; then
echo "Error: No token provided"
echo "Usage: $0 [wheel_file] [token]"
echo " or set GITEA_TOKEN environment variable"
exit 1
fi
# Extract version from wheel filename
# Format: paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl
FILENAME=$(basename "$WHEEL_FILE")
VERSION=$(echo "$FILENAME" | sed -E 's/paddlepaddle[_-]gpu-([0-9.]+)-.*/\1/')
if [ -z "$VERSION" ]; then
echo "Error: Could not extract version from filename: $FILENAME"
exit 1
fi
echo "Uploading wheel to Gitea packages..."
echo " File: $WHEEL_FILE"
echo " Package: $PACKAGE_NAME"
echo " Version: $VERSION"
echo " URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME"
# Upload using PUT request
HTTP_CODE=$(curl -sS -w "%{http_code}" -o /tmp/upload_response.txt \
-X PUT \
-H "Authorization: token $TOKEN" \
-H "Content-Type: application/octet-stream" \
--data-binary "@$WHEEL_FILE" \
"$GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME")
if [ "$HTTP_CODE" = "201" ] || [ "$HTTP_CODE" = "200" ]; then
echo "Success! Wheel uploaded."
echo "Download URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME"
elif [ "$HTTP_CODE" = "409" ]; then
echo "Package version already exists (HTTP 409)"
echo "To update, delete the existing version first in Gitea UI"
else
echo "Error: Upload failed with HTTP $HTTP_CODE"
cat /tmp/upload_response.txt
exit 1
fi
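# After a successful upload the wheel can be installed straight from the registry,
# for example (substitute the version and filename echoed above; add an
# Authorization header if the package is not public):
#   pip install "$GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/<version>/<filename>"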

src/paddle_ocr/test.py
View File

@@ -0,0 +1,114 @@
# test.py - Simple client to test PaddleOCR REST API
# Usage: python test.py [--url URL] [--dataset PATH]
import argparse
import requests
import time
import sys
def wait_for_health(url: str, timeout: int = 120) -> bool:
"""Wait for API to be ready."""
health_url = f"{url}/health"
start = time.time()
print(f"Waiting for API at {health_url}...")
while time.time() - start < timeout:
try:
resp = requests.get(health_url, timeout=5)
if resp.status_code == 200:
data = resp.json()
if data.get("model_loaded"):
print(f"API ready! Model loaded in {time.time() - start:.1f}s")
return True
print(f" Model loading... ({time.time() - start:.0f}s)")
except requests.exceptions.ConnectionError:
print(f" Connecting... ({time.time() - start:.0f}s)")
except Exception as e:
print(f" Error: {e}")
time.sleep(2)
print("Timeout waiting for API")
return False
def test_evaluate(url: str, config: dict) -> dict:
"""Run evaluation with given config."""
eval_url = f"{url}/evaluate"
print(f"\nTesting config: {config}")
start = time.time()
resp = requests.post(eval_url, json=config, timeout=600)
resp.raise_for_status()
result = resp.json()
elapsed = time.time() - start
print(f"Results (took {elapsed:.1f}s):")
print(f" CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)")
print(f" WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)")
print(f" Pages: {result['PAGES']}")
print(f" Time/page: {result['TIME_PER_PAGE']:.2f}s")
return result
def main():
parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
parser.add_argument("--url", default="http://localhost:8001", help="API base URL")
parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
args = parser.parse_args()
# Wait for API to be ready
if not args.skip_health:
if not wait_for_health(args.url):
sys.exit(1)
# Test 1: Baseline config (default PaddleOCR)
print("\n" + "="*50)
print("TEST 1: Baseline Configuration")
print("="*50)
baseline = test_evaluate(args.url, {
"pdf_folder": args.dataset,
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"textline_orientation": False, # Baseline: disabled
"text_det_thresh": 0.0,
"text_det_box_thresh": 0.0,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"start_page": 5,
"end_page": 10,
})
# Test 2: Optimized config (from Ray Tune results)
print("\n" + "="*50)
print("TEST 2: Optimized Configuration")
print("="*50)
optimized = test_evaluate(args.url, {
"pdf_folder": args.dataset,
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"textline_orientation": True, # KEY: enabled
"text_det_thresh": 0.4690,
"text_det_box_thresh": 0.5412,
"text_det_unclip_ratio": 0.0,
"text_rec_score_thresh": 0.6350,
"start_page": 5,
"end_page": 10,
})
# Summary
print("\n" + "="*50)
print("SUMMARY")
print("="*50)
cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0
print(f"Baseline CER: {baseline['CER']*100:.2f}%")
print(f"Optimized CER: {optimized['CER']*100:.2f}%")
print(f"Improvement: {cer_reduction:.1f}% reduction in errors")
if __name__ == "__main__":
main()

View File