PaddleOCR, EasyOCR and docTR GPU support. (#4)
All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_cpu (push) Successful in 5m0s
build_docker / build_gpu (push) Successful in 22m55s
build_docker / build_easyocr (push) Successful in 18m47s
build_docker / build_easyocr_gpu (push) Successful in 19m0s
build_docker / build_raytune (push) Successful in 3m27s
build_docker / build_doctr (push) Successful in 19m42s
build_docker / build_doctr_gpu (push) Successful in 14m49s

This commit was merged in pull request #4.
2026-01-19 17:35:24 +00:00
committed by Sergio Jimenez Jimenez
parent 8e2b7a5096
commit c7ed7b2b9c
105 changed files with 8170 additions and 1263 deletions


@@ -0,0 +1,213 @@
# syntax=docker/dockerfile:1.4
# Dockerfile.build-paddle - Build PaddlePaddle GPU wheel for ARM64
#
# This Dockerfile compiles PaddlePaddle from source with CUDA support for ARM64.
# The resulting wheel can be used in Dockerfile.gpu for ARM64 GPU acceleration.
#
# Build time: ~1-2 hours with caching, 2-4 hours first build
# Output: /output/paddlepaddle_gpu-*.whl
#
# Usage:
# CUDA_ARCH=90 docker compose --profile build run --rm build-paddle
#
# Features:
# - ccache for compiler caching (survives rebuilds)
# - Split build stages for better layer caching
# - ARM64 -m64 patch applied automatically
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddlePaddle GPU wheel builder for ARM64"
# Build arguments
ARG PADDLE_VERSION=v3.0.0
ARG PYTHON_VERSION=3.11
ARG CUDA_ARCH=90
# Environment setup
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CCACHE_DIR=/ccache
ENV PATH="/usr/lib/ccache:${PATH}"
# Install build dependencies + ccache
RUN apt-get update && apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
python3-pip \
build-essential \
cmake \
ninja-build \
git \
wget \
curl \
pkg-config \
ccache \
libssl-dev \
libffi-dev \
zlib1g-dev \
libbz2-dev \
libreadline-dev \
libsqlite3-dev \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libgflags-dev \
libgoogle-glog-dev \
libprotobuf-dev \
protobuf-compiler \
patchelf \
libopenblas-dev \
liblapack-dev \
swig \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
# Setup ccache symlinks for CUDA
RUN mkdir -p /usr/lib/ccache && \
ln -sf /usr/bin/ccache /usr/lib/ccache/nvcc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/gcc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/g++ && \
ln -sf /usr/bin/ccache /usr/lib/ccache/cc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/c++
# Upgrade pip and install Python build dependencies
RUN python -m pip install --upgrade pip setuptools wheel && \
python -m pip install numpy protobuf pyyaml requests packaging astor decorator paddle-bfloat opt-einsum
WORKDIR /build
# Clone PaddlePaddle repository
RUN git clone --depth 1 --branch ${PADDLE_VERSION} https://github.com/PaddlePaddle/Paddle.git
WORKDIR /build/Paddle
# Patch for ARM64: Remove -m64 flag (x86_64 specific, causes build failure on aarch64)
RUN sed -i 's/-m64//g' cmake/flags.cmake && \
sed -i 's/-m64//g' CMakeLists.txt 2>/dev/null || true && \
find . -name "*.cmake" -exec sed -i 's/-m64//g' {} \; 2>/dev/null || true && \
echo "Patched -m64 flag for ARM64 compatibility"
# Patch for ARM64: Install sse2neon to translate x86 SSE intrinsics to ARM NEON
# sse2neon provides drop-in replacements for x86 SIMD headers
RUN git clone --depth 1 https://github.com/DLTcollab/sse2neon.git /tmp/sse2neon && \
mkdir -p /usr/local/include/sse2neon && \
cp /tmp/sse2neon/sse2neon.h /usr/local/include/sse2neon/ && \
rm -rf /tmp/sse2neon && \
echo "Installed sse2neon for x86->ARM NEON translation"
# Create wrapper headers that use sse2neon for ARM64
RUN mkdir -p /usr/local/include/x86_stubs && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/immintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/immintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/immintrin.h && \
echo "#include_next <immintrin.h>" >> /usr/local/include/x86_stubs/immintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/immintrin.h && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#include_next <xmmintrin.h>" >> /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/xmmintrin.h && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/emmintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/emmintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/emmintrin.h && \
echo "#include_next <emmintrin.h>" >> /usr/local/include/x86_stubs/emmintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/emmintrin.h && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#include_next <pmmintrin.h>" >> /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/pmmintrin.h && \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/smmintrin.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/smmintrin.h && \
echo "#else" >> /usr/local/include/x86_stubs/smmintrin.h && \
echo "#include_next <smmintrin.h>" >> /usr/local/include/x86_stubs/smmintrin.h && \
echo "#endif" >> /usr/local/include/x86_stubs/smmintrin.h && \
echo "Created x86 intrinsic wrapper headers for ARM64 using sse2neon"
# Install additional Python requirements for building
RUN pip install -r python/requirements.txt || true
# Create build directory
RUN mkdir -p build
WORKDIR /build/Paddle/build
# Configure CMake for ARM64 + CUDA build
# Note: -Wno-class-memaccess fixes Eigen NEON warning on ARM64
RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}" && \
cmake .. \
-GNinja \
-DCMAKE_BUILD_TYPE=Release \
-DPY_VERSION=${PYTHON_VERSION} \
-DWITH_GPU=ON \
-DWITH_TESTING=OFF \
-DWITH_DISTRIBUTE=OFF \
-DWITH_NCCL=OFF \
-DWITH_MKL=OFF \
-DWITH_MKLDNN=OFF \
-DON_INFER=OFF \
-DWITH_PYTHON=ON \
-DWITH_AVX=OFF \
-DCUDA_ARCH_NAME=Manual \
-DCUDA_ARCH_BIN="${CUDA_ARCH}" \
-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}" \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_FLAGS="-Wno-class-memaccess -Wno-error=class-memaccess -I/usr/local/include/x86_stubs" \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
# Build external dependencies first (cacheable layer)
RUN --mount=type=cache,target=/ccache \
ninja extern_gflags extern_glog extern_protobuf extern_zlib extern_eigen3
# Build flashattn (heaviest dependency, separate layer for caching)
RUN --mount=type=cache,target=/ccache \
ninja extern_flashattn
# Build remaining external dependencies
RUN --mount=type=cache,target=/ccache \
ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo extern_xbyak
# Build main PaddlePaddle (with ccache, fallback to fewer jobs if OOM)
RUN --mount=type=cache,target=/ccache \
ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4
# Build the Python wheel
RUN ninja paddle_python || true
# Create output directory
RUN mkdir -p /output
# Build wheel package - try multiple methods since PaddlePaddle build structure varies
WORKDIR /build/Paddle
RUN echo "=== Looking for wheel build method ===" && \
ls -la python/ 2>/dev/null && \
ls -la build/python/ 2>/dev/null && \
if [ -f build/python/setup.py ]; then \
echo "Using build/python/setup.py" && \
cd build/python && python setup.py bdist_wheel; \
elif [ -f python/setup.py ]; then \
echo "Using python/setup.py" && \
cd python && python setup.py bdist_wheel; \
else \
echo "Looking for existing wheel..." && \
find /build -name "paddlepaddle*.whl" -type f 2>/dev/null; \
fi
# Copy wheel to output
RUN find /build -name "paddlepaddle*.whl" -type f -exec cp {} /output/ \; && \
ls -la /output/ && \
if [ ! "$(ls -A /output/*.whl 2>/dev/null)" ]; then \
echo "ERROR: No wheel found!" && exit 1; \
fi
# List what was built
RUN ls -la /output/ && \
echo "=== Build complete ===" && \
find /build -name "*.whl" -type f 2>/dev/null
# Default command: copy wheel to mounted volume
CMD ["sh", "-c", "cp /output/*.whl /wheels/ 2>/dev/null && echo 'Wheel copied to /wheels/' && ls -la /wheels/ || echo 'No wheel found in /output, checking other locations...' && find /build -name '*.whl' -exec cp {} /wheels/ \\; && ls -la /wheels/"]


@@ -0,0 +1,149 @@
# syntax=docker/dockerfile:1.4
# Dockerfile.build-paddle-cpu - Build PaddlePaddle CPU wheel for ARM64
#
# Required because PyPI wheels don't work on ARM64 (x86 SSE instructions).
#
# Build time: ~1-2 hours
# Output: /output/paddlepaddle-*.whl
#
# Usage:
# docker build -t paddle-builder:cpu-arm64 -f Dockerfile.build-paddle-cpu .
# docker run --rm -v ./wheels:/wheels paddle-builder:cpu-arm64
FROM ubuntu:22.04
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddlePaddle CPU wheel builder for ARM64"
ARG PADDLE_VERSION=v3.0.0
ARG PYTHON_VERSION=3.11
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CCACHE_DIR=/ccache
ENV PATH="/usr/lib/ccache:${PATH}"
# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
python3-pip \
build-essential \
cmake \
ninja-build \
git \
wget \
curl \
pkg-config \
ccache \
libssl-dev \
libffi-dev \
zlib1g-dev \
libbz2-dev \
libreadline-dev \
libsqlite3-dev \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libgflags-dev \
libgoogle-glog-dev \
libprotobuf-dev \
protobuf-compiler \
patchelf \
libopenblas-dev \
liblapack-dev \
swig \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
# Setup ccache
RUN mkdir -p /usr/lib/ccache && \
ln -sf /usr/bin/ccache /usr/lib/ccache/gcc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/g++ && \
ln -sf /usr/bin/ccache /usr/lib/ccache/cc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/c++
RUN python -m pip install --upgrade pip setuptools wheel && \
python -m pip install numpy protobuf pyyaml requests packaging astor decorator paddle-bfloat opt-einsum
WORKDIR /build
RUN git clone --depth 1 --branch ${PADDLE_VERSION} https://github.com/PaddlePaddle/Paddle.git
WORKDIR /build/Paddle
# Patch -m64 flag (x86_64 specific)
RUN sed -i 's/-m64//g' cmake/flags.cmake && \
sed -i 's/-m64//g' CMakeLists.txt 2>/dev/null || true && \
find . -name "*.cmake" -exec sed -i 's/-m64//g' {} \; 2>/dev/null || true
# Install sse2neon for x86 SSE -> ARM NEON translation
RUN git clone --depth 1 https://github.com/DLTcollab/sse2neon.git /tmp/sse2neon && \
mkdir -p /usr/local/include/sse2neon && \
cp /tmp/sse2neon/sse2neon.h /usr/local/include/sse2neon/ && \
rm -rf /tmp/sse2neon
# Create x86 intrinsic wrapper headers
RUN mkdir -p /usr/local/include/x86_stubs && \
for h in immintrin xmmintrin emmintrin pmmintrin smmintrin; do \
echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/${h}.h && \
echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/${h}.h && \
echo "#else" >> /usr/local/include/x86_stubs/${h}.h && \
echo "#include_next <${h}.h>" >> /usr/local/include/x86_stubs/${h}.h && \
echo "#endif" >> /usr/local/include/x86_stubs/${h}.h; \
done
RUN pip install -r python/requirements.txt || true
RUN mkdir -p build
WORKDIR /build/Paddle/build
# Configure for CPU-only ARM64 build
# WITH_ARM=ON enables ARM NEON optimizations and disables x86-specific code (XBYAK, MKL)
RUN cmake .. \
-GNinja \
-DCMAKE_BUILD_TYPE=Release \
-DPY_VERSION=${PYTHON_VERSION} \
-DWITH_GPU=OFF \
-DWITH_ARM=ON \
-DWITH_TESTING=OFF \
-DWITH_DISTRIBUTE=OFF \
-DWITH_NCCL=OFF \
-DWITH_MKL=OFF \
-DWITH_MKLDNN=OFF \
-DWITH_XBYAK=OFF \
-DON_INFER=OFF \
-DWITH_PYTHON=ON \
-DWITH_AVX=OFF \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_FLAGS="-Wno-class-memaccess -Wno-error=class-memaccess -I/usr/local/include/x86_stubs"
# Build external dependencies
RUN --mount=type=cache,target=/ccache \
ninja extern_gflags extern_glog extern_protobuf extern_zlib extern_eigen3
# Note: extern_xbyak excluded - it's x86-only and disabled with WITH_ARM=ON
RUN --mount=type=cache,target=/ccache \
ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo
# Build PaddlePaddle
RUN --mount=type=cache,target=/ccache \
ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4
RUN ninja paddle_python || true
RUN mkdir -p /output
WORKDIR /build/Paddle
RUN if [ -f build/python/setup.py ]; then \
cd build/python && python setup.py bdist_wheel; \
elif [ -f python/setup.py ]; then \
cd python && python setup.py bdist_wheel; \
fi
RUN find /build -name "paddlepaddle*.whl" -type f -exec cp {} /output/ \; && \
ls -la /output/
CMD ["sh", "-c", "cp /output/*.whl /wheels/ && ls -la /wheels/"]


@@ -0,0 +1,81 @@
# Dockerfile.cpu - Multi-stage CPU Dockerfile
#
# Build base only (push to registry, rarely changes):
# docker build --target base -t seryus.ddns.net/unir/paddle-ocr-cpu-base:latest -f Dockerfile.cpu .
#
# Build deploy (uses base, fast - code only):
# docker build --target deploy -t seryus.ddns.net/unir/paddle-ocr-cpu:latest -f Dockerfile.cpu .
#
# Or build all at once:
# docker build -t paddle-ocr-api:cpu -f Dockerfile.cpu .
# =============================================================================
# STAGE 1: BASE - All dependencies (rarely changes)
# =============================================================================
FROM python:3.11-slim AS base
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Base Image - CPU dependencies"
WORKDIR /app
# Install system dependencies for OpenCV and PaddleOCR
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender1 \
libgomp1 \
&& rm -rf /var/lib/apt/lists/*
# Copy local wheels directory (may contain ARM64 wheel from build-paddle-cpu)
COPY wheels/ /tmp/wheels/
# Install paddlepaddle: prefer local wheel (ARM64), fallback to PyPI (x86_64)
RUN if ls /tmp/wheels/paddlepaddle*.whl 1>/dev/null 2>&1; then \
echo "=== Installing PaddlePaddle from local wheel (ARM64) ===" && \
pip install --no-cache-dir /tmp/wheels/paddlepaddle*.whl; \
else \
echo "=== Installing PaddlePaddle from PyPI (x86_64) ===" && \
pip install --no-cache-dir paddlepaddle==3.0.0; \
fi && \
rm -rf /tmp/wheels
# Install remaining Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# =============================================================================
# STAGE 2: DEPLOY - Application code (changes frequently)
# =============================================================================
FROM base AS deploy
LABEL description="PaddleOCR Tuning REST API - CPU version"
WORKDIR /app
# Copy application code (this is the only layer that changes frequently)
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .
# Build arguments for models
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec
# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}
# Volume for dataset and model cache
VOLUME ["/app/dataset", "/root/.paddlex"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -0,0 +1,105 @@
# Dockerfile.gpu - Multi-stage GPU Dockerfile
#
# Build base only (push to registry, rarely changes):
# docker build --target base -t seryus.ddns.net/unir/paddle-ocr-gpu-base:latest -f Dockerfile.gpu .
#
# Build deploy (uses base, fast - code only):
# docker build --target deploy -t seryus.ddns.net/unir/paddle-ocr-gpu:latest -f Dockerfile.gpu .
#
# Or build all at once:
# docker build -t paddle-ocr-api:gpu -f Dockerfile.gpu .
# =============================================================================
# STAGE 1: BASE - All dependencies (rarely changes)
# =============================================================================
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS base
LABEL maintainer="Sergio Jimenez"
LABEL description="PaddleOCR Base Image - GPU/CUDA dependencies"
WORKDIR /app
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CUDA_VISIBLE_DEVICES=0
# Install Python 3.11 and system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.11 \
python3.11-venv \
python3-pip \
libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender1 \
libgomp1 \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python3.11 /usr/bin/python
# Fix cuDNN library path for ARM64 only (PaddlePaddle looks in /usr/local/cuda/lib64)
# x86_64 doesn't need this - PyPI wheel handles paths correctly
RUN if [ "$(uname -m)" = "aarch64" ]; then \
mkdir -p /usr/local/cuda/lib64 && \
ln -sf /usr/lib/aarch64-linux-gnu/libcudnn*.so* /usr/local/cuda/lib64/ && \
ln -sf /usr/lib/aarch64-linux-gnu/libcudnn.so.9 /usr/local/cuda/lib64/libcudnn.so && \
ldconfig; \
fi
# Copy local wheels directory (may contain ARM64 wheel from build-paddle)
COPY wheels/ /tmp/wheels/
# Install paddlepaddle: prefer local wheel (ARM64), fallback to CUDA index (x86_64)
RUN if ls /tmp/wheels/paddlepaddle*.whl 1>/dev/null 2>&1; then \
echo "=== Installing PaddlePaddle from local wheel (ARM64) ===" && \
python -m pip install --no-cache-dir /tmp/wheels/paddlepaddle*.whl; \
else \
echo "=== Installing PaddlePaddle from CUDA index (x86_64) ===" && \
python -m pip install --no-cache-dir paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/; \
fi && \
rm -rf /tmp/wheels
# Install remaining dependencies
RUN python -m pip install --no-cache-dir \
paddleocr==3.3.2 \
jiwer \
numpy \
fastapi \
"uvicorn[standard]" \
pydantic \
Pillow
# =============================================================================
# STAGE 2: DEPLOY - Application code (changes frequently)
# =============================================================================
FROM base AS deploy
LABEL description="PaddleOCR Tuning REST API - GPU/CUDA version"
WORKDIR /app
# Copy application code (this is the only layer that changes frequently)
COPY paddle_ocr_tuning_rest.py .
COPY dataset_manager.py .
# Build arguments for models
ARG DET_MODEL=PP-OCRv5_server_det
ARG REC_MODEL=PP-OCRv5_server_rec
# Set as environment variables (can be overridden at runtime)
ENV PADDLE_DET_MODEL=${DET_MODEL}
ENV PADDLE_REC_MODEL=${REC_MODEL}
# Volume for dataset and model cache
VOLUME ["/app/dataset", "/root/.paddlex"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "paddle_ocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

src/paddle_ocr/README.md (824 added lines)

@@ -0,0 +1,824 @@
# PaddleOCR Tuning REST API
REST API service for PaddleOCR hyperparameter evaluation. Keeps the model loaded in memory for fast repeated evaluations during hyperparameter search.
## Quick Start with Docker Compose
Docker Compose manages building and running the containers. The `docker-compose.yml` defines two main OCR services (plus a `build-paddle` wheel builder and a `test` client):
- `ocr-cpu` - CPU-only version (works everywhere)
- `ocr-gpu` - GPU version (requires NVIDIA GPU + Container Toolkit)
### Run CPU Version
```bash
cd src/paddle_ocr
# Build and start (first time takes ~2-3 min to build, ~30s to load model)
docker compose up ocr-cpu
# Or run in background (detached)
docker compose up -d ocr-cpu
# View logs
docker compose logs -f ocr-cpu
# Stop
docker compose down
```
### Run GPU Version
```bash
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
docker compose up ocr-gpu
```
### Test the API
Once running, test with:
```bash
# Check health
curl http://localhost:8000/health
# Or use the test script
pip install requests
python test.py --url http://localhost:8000
```
### What Docker Compose Does
```
docker compose up ocr-cpu
├─► Builds image from Dockerfile.cpu (if not exists)
├─► Creates container "paddle-ocr-cpu"
├─► Mounts ../dataset → /app/dataset (your PDF images)
├─► Mounts paddlex-cache volume (persists downloaded models)
├─► Exposes port 8000
└─► Runs: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
```
## Files
| File | Description |
|------|-------------|
| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
| `dataset_manager.py` | Dataset loader |
| `test.py` | API test client |
| `Dockerfile.cpu` | CPU-only image (x86_64 + ARM64 with local wheel) |
| `Dockerfile.gpu` | GPU/CUDA image (x86_64 + ARM64 with local wheel) |
| `Dockerfile.build-paddle` | PaddlePaddle GPU wheel builder for ARM64 |
| `Dockerfile.build-paddle-cpu` | PaddlePaddle CPU wheel builder for ARM64 |
| `docker-compose.yml` | Service orchestration |
| `docker-compose.cpu-registry.yml` | Pull CPU image from registry |
| `docker-compose.gpu-registry.yml` | Pull GPU image from registry |
| `wheels/` | Local PaddlePaddle wheels (created by build-paddle) |
## API Endpoints
### `GET /health`
Check if service is ready.
```json
{"status": "ok", "model_loaded": true, "dataset_loaded": true, "dataset_size": 24}
```
### `POST /evaluate`
Run OCR evaluation with given hyperparameters.
**Request:**
```json
{
"pdf_folder": "/app/dataset",
"textline_orientation": true,
"use_doc_orientation_classify": false,
"use_doc_unwarping": false,
"text_det_thresh": 0.469,
"text_det_box_thresh": 0.5412,
"text_det_unclip_ratio": 0.0,
"text_rec_score_thresh": 0.635,
"start_page": 5,
"end_page": 10
}
```
**Response:**
```json
{"CER": 0.0115, "WER": 0.0989, "TIME": 330.5, "PAGES": 5, "TIME_PER_PAGE": 66.1}
```
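A minimal Python client for `/evaluate` (a sketch; it assumes the service is reachable on `localhost:8000` and only sets a subset of the fields shown above, the rest fall back to their defaults):
```python
# Hedged sketch: query /health, then run one evaluation via /evaluate.
import requests

BASE_URL = "http://localhost:8000"  # adjust if the container maps a different host port

print(requests.get(f"{BASE_URL}/health", timeout=10).json())

payload = {
    "pdf_folder": "/app/dataset",
    "text_det_thresh": 0.469,
    "text_det_box_thresh": 0.5412,
    "text_rec_score_thresh": 0.635,
    "start_page": 5,
    "end_page": 10,
}
resp = requests.post(f"{BASE_URL}/evaluate", json=payload, timeout=600)
resp.raise_for_status()
metrics = resp.json()  # {"CER": ..., "WER": ..., "TIME": ..., "PAGES": ..., "TIME_PER_PAGE": ...}
print(f"CER={metrics['CER']:.4f}  WER={metrics['WER']:.4f}  time/page={metrics['TIME_PER_PAGE']:.1f}s")
```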
### `POST /evaluate_full`
Same as `/evaluate` but runs on ALL pages (ignores start_page/end_page).
## Debug Output (debugset)
The `debugset` folder allows saving OCR predictions for debugging and analysis. When `save_output=True` is passed to `/evaluate`, predictions are written to `/app/debugset`.
### Enable Debug Output
```json
{
"pdf_folder": "/app/dataset",
"save_output": true,
"start_page": 5,
"end_page": 10
}
```
### Output Structure
```
debugset/
├── doc1/
│ └── paddle_ocr/
│ ├── page_0005.txt
│ ├── page_0006.txt
│ └── ...
├── doc2/
│ └── paddle_ocr/
│ └── ...
```
Each `.txt` file contains the OCR-extracted text for that page.
### Docker Mount
The `debugset` folder is mounted read-write in docker-compose:
```yaml
volumes:
- ../debugset:/app/debugset:rw
```
### Use Cases
- **Compare OCR engines**: Run the same pages through PaddleOCR, DocTR, EasyOCR with `save_output=True`, then diff the results
- **Debug hyperparameters**: See how different settings affect text extraction
- **Ground truth comparison**: Compare predictions against the expected output (see the sketch after this list)
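For example, the saved predictions can be scored against the ground-truth text with `jiwer` (already a dependency of the service). This is a sketch; it assumes the layout shown above, with predictions under `debugset/<doc>/paddle_ocr/` and references under `dataset/<doc>/txt/`:
```python
# Hedged sketch: score saved OCR predictions against dataset ground truth with jiwer.
from pathlib import Path
from jiwer import cer, wer

DATASET = Path("dataset")    # <doc>/txt/page_XXXX.txt holds the reference text
DEBUGSET = Path("debugset")  # <doc>/<engine>/page_XXXX.txt holds the predictions
ENGINE = "paddle_ocr"        # subfolder written by the engine under test

for pred_path in sorted(DEBUGSET.glob(f"*/{ENGINE}/page_*.txt")):
    doc = pred_path.parts[-3]                      # e.g. "doc1"
    ref_path = DATASET / doc / "txt" / pred_path.name
    if not ref_path.exists():
        continue
    reference = ref_path.read_text(encoding="utf-8")
    prediction = pred_path.read_text(encoding="utf-8")
    print(f"{doc}/{pred_path.name}: CER={cer(reference, prediction):.4f}  WER={wer(reference, prediction):.4f}")
```
Swapping `ENGINE` to another engine's subfolder gives directly comparable scores for the same pages.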
## Building Images
### CPU Image (Multi-Architecture)
```bash
# Local build (current architecture)
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
# Multi-arch build with buildx (amd64 + arm64)
docker buildx create --name multiarch --use
docker buildx build -f Dockerfile.cpu \
--platform linux/amd64,linux/arm64 \
-t paddle-ocr-api:cpu \
--push .
```
### GPU Image (x86_64 + ARM64 with local wheel)
```bash
docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
```
> **Note:** PaddlePaddle GPU 3.x packages are **not on PyPI**. The Dockerfile installs from PaddlePaddle's official CUDA index (`paddlepaddle.org.cn/packages/stable/cu126/`). This is handled automatically during build.
## Running
### CPU (Any machine)
```bash
docker run -d -p 8000:8000 \
-v $(pwd)/../dataset:/app/dataset:ro \
-v paddlex-cache:/root/.paddlex \
paddle-ocr-api:cpu
```
### GPU (NVIDIA)
```bash
docker run -d -p 8000:8000 --gpus all \
-v $(pwd)/../dataset:/app/dataset:ro \
-v paddlex-cache:/root/.paddlex \
paddle-ocr-api:gpu
```
## GPU Support Analysis
### Host System Reference (DGX Spark)
This section documents GPU support findings based on testing on an NVIDIA DGX Spark:
| Component | Value |
|-----------|-------|
| Architecture | ARM64 (aarch64) |
| CPU | NVIDIA Grace (ARM) |
| GPU | NVIDIA GB10 |
| CUDA Version | 13.0 |
| Driver | 580.95.05 |
| OS | Ubuntu 24.04 LTS |
| Container Toolkit | nvidia-container-toolkit 1.18.1 |
| Docker | 28.5.1 |
| Docker Compose | v2.40.0 |
### PaddlePaddle GPU Platform Support
**Note:** PaddlePaddle-GPU does NOT have prebuilt ARM64 wheels on PyPI, but ARM64 support is available via custom-built wheels.
| Platform | CPU | GPU |
|----------|-----|-----|
| Linux x86_64 | ✅ | ✅ CUDA 10.2/11.x/12.x |
| Windows x64 | ✅ | ✅ CUDA 10.2/11.x/12.x |
| macOS x64 | ✅ | ❌ |
| macOS ARM64 (M1/M2) | ✅ | ❌ |
| Linux ARM64 (Jetson/DGX) | ✅ | ⚠️ Limited - see Blackwell note |
**Source:** [PaddlePaddle-GPU PyPI](https://pypi.org/project/paddlepaddle-gpu/) - only `manylinux_x86_64` and `win_amd64` wheels available on PyPI. ARM64 wheels must be built from source or downloaded from Gitea packages.
### ARM64 GPU Support
ARM64 GPU support is available but requires custom-built wheels:
1. **No prebuilt PyPI wheels**: `pip install paddlepaddle-gpu` fails on ARM64 - no compatible wheels exist on PyPI
2. **Custom wheels work**: This project provides Dockerfiles to build ARM64 GPU wheels from source
3. **CI/CD builds ARM64 GPU images**: Pre-built wheels are available from Gitea packages
**To use GPU on ARM64:**
- Use the pre-built images from the container registry, or
- Build the wheel locally using `Dockerfile.build-paddle` (see Option 2 below), or
- Download the wheel from Gitea packages: `wheels/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl`
### ⚠️ Known Limitation: Blackwell GPU (sm_121 / GB10)
**Status: GPU inference does NOT work on NVIDIA Blackwell GPUs (DGX Spark, GB200, etc.)**
#### Symptoms
When running PaddleOCR on Blackwell GPUs:
- CUDA loads successfully ✅
- Basic tensor operations work ✅
- **Detection model outputs constant values** ❌
- 0 text regions detected
- CER/WER = 100% (nothing recognized)
#### Root Cause
**Confirmed:** PaddlePaddle's entire CUDA backend does NOT support Blackwell (sm_121). This is NOT just an inference model problem - even basic operations fail.
**Test Results (January 2026):**
1. **PTX JIT Test** (`CUDA_FORCE_PTX_JIT=1`):
```
OSError: CUDA error(209), no kernel image is available for execution on the device.
[Hint: 'cudaErrorNoKernelImageForDevice']
```
→ Confirmed: No PTX code exists in PaddlePaddle binaries
2. **Dynamic Graph Mode Test** (bypassing inference models):
```
Conv2D + BatchNorm output:
Output min: 0.0000
Output max: 0.0000
Output mean: 0.0000
Dynamic graph mode: BROKEN (constant output)
```
→ Confirmed: Even simple nn.Conv2D produces zeros on Blackwell
**Conclusion:** The issue is PaddlePaddle's compiled CUDA kernels (cubins), not just the inference models. The entire framework was compiled without sm_121 support and without PTX for JIT compilation.
**Why building PaddlePaddle from source doesn't fix it:**
1. ⚠️ Building with `CUDA_ARCH=121` requires CUDA 13.0+ (PaddlePaddle only supports up to CUDA 12.6)
2. ❌ Even if you could build it, PaddleOCR models contain pre-compiled CUDA ops
3. ❌ These model files were exported/compiled targeting sm_80/sm_90 architectures
4. ❌ The model kernels execute on GPU but produce garbage output on sm_121
**To truly fix this**, the PaddlePaddle team would need to:
1. Add sm_121 to their model export pipeline
2. Re-export all PaddleOCR models (PP-OCRv4, PP-OCRv5, etc.) with Blackwell support
3. Release new model versions
This is tracked in [GitHub Issue #17327](https://github.com/PaddlePaddle/PaddleOCR/issues/17327).
#### Debug Script
Use the included debug script to verify this issue:
```bash
docker exec paddle-ocr-gpu python /app/scripts/debug_gpu_detection.py /app/dataset/0/img/page_0001.png
```
Expected output showing the problem:
```
OUTPUT ANALYSIS:
Shape: (1, 1, 640, 640)
Min: 0.000010
Max: 0.000010 # <-- Same as min = constant output
Mean: 0.000010
DIAGNOSIS:
PROBLEM: Output is constant - model inference is broken!
This typically indicates GPU compute capability mismatch.
```
#### Workarounds
1. **Use CPU mode** (recommended):
```bash
docker compose up ocr-cpu
```
The ARM Grace CPU is fast (~2-5 sec/page). This is the reliable option.
2. **Use EasyOCR or DocTR with GPU**:
These use PyTorch which has official ARM64 CUDA wheels (cu128 index):
```bash
# EasyOCR with GPU on DGX Spark
docker build -f ../easyocr_service/Dockerfile.gpu -t easyocr-gpu ../easyocr_service
docker run --gpus all -p 8002:8000 easyocr-gpu
```
3. **Wait for PaddlePaddle Blackwell support**:
Track [GitHub Issue #17327](https://github.com/PaddlePaddle/PaddleOCR/issues/17327) for updates.
#### GPU Support Matrix (Updated)
| GPU Architecture | Compute | CPU | GPU |
|------------------|---------|-----|-----|
| Ampere (A100, A10) | sm_80 | ✅ | ✅ |
| Hopper (H100, H200) | sm_90 | ✅ | ✅ |
| **Blackwell (GB10, GB200)** | sm_121 | ✅ | ❌ Not supported |
#### FAQ: Why Doesn't CUDA Backward Compatibility Work?
**Q: CUDA normally runs older kernels on newer GPUs. Why doesn't this work for Blackwell?**
Per [NVIDIA Blackwell Compatibility Guide](https://docs.nvidia.com/cuda/blackwell-compatibility-guide/):
CUDA **can** run older code on newer GPUs via **PTX JIT compilation**:
1. PTX (Parallel Thread Execution) is NVIDIA's intermediate representation
2. If an app includes PTX code, the driver JIT-compiles it for the target GPU
3. This allows sm_80 code to run on sm_121
**The problem**: PaddleOCR inference models contain only pre-compiled **cubins** (SASS binary), not PTX. Without PTX, there's nothing to JIT-compile.
We tested PTX JIT (January 2026):
```bash
# Force PTX JIT compilation
docker run --gpus all -e CUDA_FORCE_PTX_JIT=1 paddle-ocr-gpu \
python /app/scripts/debug_gpu_detection.py /app/dataset/0/img/page_0001.png
# Result:
# OSError: CUDA error(209), no kernel image is available for execution on the device.
```
**Confirmed: No PTX exists** in PaddlePaddle binaries. The CUDA kernels are cubins-only (SASS binary), compiled for sm_80/sm_90 without PTX fallback.
**Note on sm_121**: Per NVIDIA docs, "sm_121 is the same as sm_120 since the only difference is physically integrated CPU+GPU memory of Spark." The issue is general Blackwell (sm_12x) support, not Spark-specific.
#### FAQ: Does Dynamic Graph Mode Work on Blackwell?
**Q: Can I bypass inference models and use PaddlePaddle's dynamic graph mode?**
**No.** We tested dynamic graph mode (January 2026):
```bash
# Test script runs: paddle.nn.Conv2D + paddle.nn.BatchNorm2D
python /app/scripts/test_dynamic_mode.py
# Result:
# Input shape: [1, 3, 224, 224]
# Output shape: [1, 64, 112, 112]
# Output min: 0.0000
# Output max: 0.0000 # <-- All zeros!
# Output mean: 0.0000
# Dynamic graph mode: BROKEN (constant output)
```
**Conclusion:** The problem isn't limited to inference models. PaddlePaddle's core CUDA kernels (Conv2D, BatchNorm, etc.) produce garbage on sm_121. The entire framework lacks Blackwell support.
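For reference, a check of this kind can be reproduced with a few lines of the standard `paddle.nn` API (a sketch, not the exact contents of the bundled `test_dynamic_mode.py`):
```python
# Hedged sketch: verify that basic dynamic-graph ops produce non-constant output on the GPU.
import paddle

paddle.device.set_device("gpu:0")

x = paddle.randn([1, 3, 224, 224])
conv = paddle.nn.Conv2D(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3)
bn = paddle.nn.BatchNorm2D(64)
y = bn(conv(x))

y_min, y_max = float(y.min()), float(y.max())
print(f"Output min: {y_min:.4f}  max: {y_max:.4f}  mean: {float(y.mean()):.4f}")
print("BROKEN (constant output)" if y_min == y_max else "OK (non-constant output)")
```
On a healthy GPU the convolution output has a non-zero spread; on sm_121 it collapses to a constant, as shown above.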
#### FAQ: Can I Run AMD64 Containers on ARM64 DGX Spark?
**Q: Can I just run the working x86_64 GPU image via emulation?**
**Short answer: Yes for CPU, No for GPU.**
You can run amd64 containers via QEMU emulation:
```bash
# Install QEMU
sudo apt-get install qemu binfmt-support qemu-user-static
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
# Run amd64 container
docker run --platform linux/amd64 paddle-ocr-gpu:amd64 ...
```
**But GPU doesn't work:**
- QEMU emulates CPU instructions (x86 → ARM)
- **QEMU user-mode does NOT support GPU passthrough**
- GPU calls from emulated x86 code cannot reach the ARM64 GPU
So even if the amd64 image works on x86_64:
- ❌ No GPU access through QEMU
- ❌ CPU emulation is 10-100x slower than native ARM64
- ❌ Defeats the purpose entirely
| Approach | CPU | GPU | Speed |
|----------|-----|-----|-------|
| ARM64 native (CPU) | ✅ | N/A | Fast (~2-5s/page) |
| ARM64 native (GPU) | ✅ | ❌ Blackwell issue | - |
| AMD64 via QEMU | ⚠️ Works | ❌ No passthrough | 10-100x slower |
### Options for ARM64 Systems
#### Option 1: CPU-Only (Recommended)
Use `Dockerfile.cpu` which works on ARM64:
```bash
# On DGX Spark
docker compose up ocr-cpu
# Or build directly
docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
```
**Performance:** CPU inference on ARM64 Grace is surprisingly fast due to high core count. Expect ~2-5 seconds per page.
#### Option 2: Build PaddlePaddle from Source (Docker-based)
Use the included Docker builder to compile PaddlePaddle GPU for ARM64:
```bash
cd src/paddle_ocr
# Step 1: Build the PaddlePaddle GPU wheel (one-time, 2-4 hours)
docker compose --profile build run --rm build-paddle
# Verify wheel was created
ls -la wheels/paddlepaddle*.whl
# Step 2: Build the GPU image (uses local wheel)
docker compose build ocr-gpu
# Step 3: Run with GPU
docker compose up ocr-gpu
# Verify GPU is working
docker compose exec ocr-gpu python -c "import paddle; print(paddle.device.is_compiled_with_cuda())"
```
**What this does:**
1. `build-paddle` compiles PaddlePaddle from source inside a CUDA container
2. The wheel is saved to `./wheels/` directory
3. `Dockerfile.gpu` detects the local wheel and uses it instead of PyPI
**Caveats:**
- Build takes 2-4 hours on first run
- Requires ~20GB disk space during build
- Not officially supported by PaddlePaddle team
- May need adjustments for future PaddlePaddle versions
See: [GitHub Issue #17327](https://github.com/PaddlePaddle/PaddleOCR/issues/17327)
#### Option 3: Alternative OCR Engines
For ARM64 GPU acceleration, consider alternatives:
| Engine | ARM64 GPU | Notes |
|--------|-----------|-------|
| **Tesseract** | ❌ CPU-only | Good fallback, widely available |
| **EasyOCR** | ⚠️ Via PyTorch | PyTorch has ARM64 GPU support |
| **TrOCR** | ⚠️ Via Transformers | Hugging Face Transformers + PyTorch |
| **docTR** | ⚠️ Via TensorFlow/PyTorch | Both backends have ARM64 support |
EasyOCR with PyTorch is a viable alternative:
```bash
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
pip install easyocr
```
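A quick way to confirm GPU-backed OCR once those packages are installed (a sketch; the sample image path is a placeholder):
```python
# Hedged sketch: run EasyOCR on one page, using the GPU when CUDA is available.
import torch
import easyocr

use_gpu = torch.cuda.is_available()
print("CUDA available:", use_gpu)

reader = easyocr.Reader(["en"], gpu=use_gpu)               # downloads models on first use
results = reader.readtext("dataset/0/img/page_0001.png")   # placeholder path
for bbox, text, confidence in results:
    print(f"{confidence:.2f}  {text}")
```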
### x86_64 GPU Setup (Working)
For x86_64 systems with an NVIDIA GPU, the GPU Docker image works out of the box:
```bash
# Verify GPU is accessible
nvidia-smi
# Verify Docker GPU access
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
# Build and run GPU version
docker compose up ocr-gpu
```
### GPU Docker Compose Configuration
The `docker-compose.yml` configures GPU access via:
```yaml
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
```
This requires Docker Compose v2 and nvidia-container-toolkit.
## DGX Spark / ARM64 Quick Start
For ARM64 systems (DGX Spark, Jetson, Graviton), use CPU-only:
```bash
cd src/paddle_ocr
# Build ARM64-native CPU image
docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
# Run
docker run -d -p 8000:8000 \
-v $(pwd)/../dataset:/app/dataset:ro \
paddle-ocr-api:arm64
# Test
curl http://localhost:8000/health
```
### Cross-Compile from x86_64
Build ARM64 images from an x86_64 machine:
```bash
# Setup buildx for multi-arch
docker buildx create --name mybuilder --use
# Build ARM64 image from x86_64 machine
docker buildx build -f Dockerfile.cpu \
--platform linux/arm64 \
-t paddle-ocr-api:arm64 \
--load .
# Save and transfer to DGX Spark
docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
scp paddle-ocr-arm64.tar.gz dgx-spark:~/
# On DGX Spark:
docker load < paddle-ocr-arm64.tar.gz
```
## Using with Ray Tune
### Multi-Worker Setup for Parallel Trials
Run multiple workers for parallel hyperparameter tuning:
```bash
cd src/paddle_ocr
# Start CPU workers (ports 8001-8005; the compose file defines five per profile)
sudo docker compose -f docker-compose.workers.yml --profile cpu up -d
# Or for GPU workers (if supported)
sudo docker compose -f docker-compose.workers.yml --profile gpu up -d
# Check workers are healthy
curl http://localhost:8001/health
curl http://localhost:8002/health
```
Then run the notebook with `max_concurrent_trials` set to the number of workers you started (e.g., 2) so trials run in parallel, and point each trial at one of the worker URLs (see the sketch below).
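One simple way to spread trials across the workers is to pick the target URL inside the trainable itself. This is a naive sketch (it assumes two workers on ports 8001 and 8002; the per-model lock in the API serializes any requests that land on the same worker):
```python
# Hedged sketch: choose one of several OCR worker URLs per trial process.
import os

WORKER_URLS = [
    "http://localhost:8001/evaluate",
    "http://localhost:8002/evaluate",
]

def pick_worker_url() -> str:
    # Each Ray Tune trial runs in its own worker process, so the PID gives a
    # cheap, roughly even spread across the OCR workers.
    return WORKER_URLS[os.getpid() % len(WORKER_URLS)]
```
The single-worker trainable below can then call `pick_worker_url()` instead of the fixed `API_URL`.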
### Single Worker Setup
Update your notebook's `trainable_paddle_ocr` function:
```python
import requests
from ray import tune
API_URL = "http://localhost:8000/evaluate"
def trainable_paddle_ocr(config):
"""Call OCR API instead of subprocess."""
payload = {
"pdf_folder": "/app/dataset",
"use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
"use_doc_unwarping": config.get("use_doc_unwarping", False),
"textline_orientation": config.get("textline_orientation", True),
"text_det_thresh": config.get("text_det_thresh", 0.0),
"text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
"text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
"text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
}
try:
response = requests.post(API_URL, json=payload, timeout=600)
response.raise_for_status()
metrics = response.json()
tune.report(metrics=metrics)
except Exception as e:
tune.report({"CER": 1.0, "WER": 1.0, "ERROR": str(e)[:500]})
```
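Wiring this trainable into a search looks roughly as follows (a sketch; the search-space bounds and sampler are illustrative, not the exact values used in the project):
```python
# Hedged sketch: minimize CER over 64 trials using the REST API trainable above.
from ray import tune

search_space = {
    "text_det_thresh": tune.uniform(0.0, 0.9),
    "text_det_box_thresh": tune.uniform(0.0, 0.9),
    "text_det_unclip_ratio": tune.uniform(0.5, 3.0),
    "text_rec_score_thresh": tune.uniform(0.0, 0.9),
    "textline_orientation": tune.choice([True, False]),
}

tuner = tune.Tuner(
    trainable_paddle_ocr,
    param_space=search_space,
    tune_config=tune.TuneConfig(metric="CER", mode="min", num_samples=64, max_concurrent_trials=2),
)
results = tuner.fit()
print(results.get_best_result().config)
```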
## Architecture: Model Lifecycle
The model is loaded **once** at container startup and stays in memory for all requests:
```mermaid
flowchart TB
subgraph Container["Docker Container Lifecycle"]
Start([Container Start]) --> Load[Load PaddleOCR Models<br/>~10-30s one-time cost]
Load --> Ready[API Ready<br/>Models in RAM ~500MB]
subgraph Requests["Incoming Requests - Models Stay Loaded"]
Ready --> R1[Request 1] --> Ready
Ready --> R2[Request 2] --> Ready
Ready --> RN[Request N...] --> Ready
end
Ready --> Stop([Container Stop])
Stop --> Free[Models Freed]
end
style Load fill:#f9f,stroke:#333
style Ready fill:#9f9,stroke:#333
style Requests fill:#e8f4ea,stroke:#090
```
**Subprocess vs REST API comparison:**
```mermaid
flowchart LR
subgraph Subprocess["❌ Subprocess Approach"]
direction TB
S1[Trial 1] --> L1[Load Model ~10s]
L1 --> E1[Evaluate ~60s]
E1 --> U1[Unload]
U1 --> S2[Trial 2]
S2 --> L2[Load Model ~10s]
L2 --> E2[Evaluate ~60s]
end
subgraph REST["✅ REST API Approach"]
direction TB
Start2[Start Container] --> Load2[Load Model ~10s]
Load2 --> Ready2[Model in Memory]
Ready2 --> T1[Trial 1 ~60s]
T1 --> Ready2
Ready2 --> T2[Trial 2 ~60s]
T2 --> Ready2
Ready2 --> TN[Trial N ~60s]
end
style L1 fill:#faa
style L2 fill:#faa
style Load2 fill:#afa
style Ready2 fill:#afa
```
## Performance Comparison
| Approach | Model Load | Per-Trial Overhead | 64 Trials |
|----------|------------|-------------------|-----------|
| Subprocess (original) | Every trial (~10s) | ~10s | ~7 hours |
| Docker per trial | Every trial (~10s) | ~12-15s | ~7.5 hours |
| **REST API** | **Once** | **~0.1s** | **~5.8 hours** |
The REST API saves ~1+ hour by loading the model only once.
## Troubleshooting
### Model download slow on first run
The first run downloads ~500MB of models. Use volume `paddlex-cache` to persist them.
### Out of memory
Reduce `max_concurrent_trials` in Ray Tune, or increase container memory:
```bash
docker run --memory=8g ...
```
### GPU not detected
Ensure NVIDIA Container Toolkit is installed:
```bash
nvidia-smi # Should work
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi # Should work
```
### PaddlePaddle GPU installation fails
PaddlePaddle 3.x GPU packages are **not available on PyPI**. They must be installed from PaddlePaddle's official index:
```bash
# For CUDA 12.x
pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
# For CUDA 11.8
pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
```
The Dockerfile.gpu handles this automatically.
## CI/CD Pipeline
The project includes a Gitea Actions workflow (`.gitea/workflows/ci.yaml`) for automated builds.
### What CI Builds
| Image | Architecture | Source |
|-------|--------------|--------|
| `paddle-ocr-cpu:amd64` | amd64 | PyPI paddlepaddle |
| `paddle-ocr-cpu:arm64` | arm64 | Pre-built wheel from Gitea packages |
| `paddle-ocr-gpu:amd64` | amd64 | PyPI paddlepaddle-gpu |
| `paddle-ocr-gpu:arm64` | arm64 | Pre-built wheel from Gitea packages |
### ARM64 Wheel Workflow
Since the PyPI wheels don't work on ARM64 (they contain x86 SSE instructions), the wheels are built from source with sse2neon and distributed as follows:
1. Built manually on an ARM64 machine (one-time)
2. Uploaded to Gitea generic packages
3. Downloaded by CI when building ARM64 images
#### Step 1: Build ARM64 Wheels (One-time, on ARM64 machine)
```bash
cd src/paddle_ocr
# Build GPU wheel (requires NVIDIA GPU, takes 1-2 hours)
sudo docker build -t paddle-builder:gpu-arm64 -f Dockerfile.build-paddle .
sudo docker run --rm -v ./wheels:/wheels paddle-builder:gpu-arm64
# Build CPU wheel (no GPU required, takes 1-2 hours)
sudo docker build -t paddle-builder:cpu-arm64 -f Dockerfile.build-paddle-cpu .
sudo docker run --rm -v ./wheels:/wheels paddle-builder:cpu-arm64
# Verify wheels were created
ls -la wheels/paddlepaddle*.whl
# paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl (GPU)
# paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl (CPU)
```
#### Step 2: Upload Wheels to Gitea Packages
```bash
export GITEA_TOKEN="your-token-here"
# Upload GPU wheel
curl -X PUT \
-H "Authorization: token $GITEA_TOKEN" \
--upload-file wheels/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl \
"https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-gpu-arm64/3.0.0/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl"
# Upload CPU wheel
curl -X PUT \
-H "Authorization: token $GITEA_TOKEN" \
--upload-file wheels/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl \
"https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-cpu-arm64/3.0.0/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl"
```
Wheels available at:
```
https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-gpu-arm64/3.0.0/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl
https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-cpu-arm64/3.0.0/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl
```
#### Step 3: CI Builds Images
CI automatically:
1. Downloads ARM64 wheels from Gitea packages (for arm64 builds only)
2. Builds both CPU and GPU images for amd64 and arm64
3. Pushes to registry with arch-specific tags
### Required CI Secrets
Configure these in Gitea repository settings:
| Secret | Description |
|--------|-------------|
| `CI_READWRITE` | Gitea token with registry read/write access |
### Manual Image Push
```bash
# Login to registry
docker login seryus.ddns.net
# Build and push CPU (multi-arch)
docker buildx build -f Dockerfile.cpu \
--platform linux/amd64,linux/arm64 \
-t seryus.ddns.net/unir/paddle-ocr-api:cpu \
--push .
# Build and push GPU (x86_64)
docker build -f Dockerfile.gpu -t seryus.ddns.net/unir/paddle-ocr-api:gpu-amd64 .
docker push seryus.ddns.net/unir/paddle-ocr-api:gpu-amd64
# Build and push GPU (ARM64) - requires wheel in wheels/
docker buildx build -f Dockerfile.gpu \
--platform linux/arm64 \
-t seryus.ddns.net/unir/paddle-ocr-api:gpu-arm64 \
--push .
```
### Updating the ARM64 Wheels
When PaddlePaddle releases a new version:
1. Update `PADDLE_VERSION` in `Dockerfile.build-paddle` and `Dockerfile.build-paddle-cpu`
2. Rebuild both wheels on an ARM64 machine
3. Upload to Gitea packages with new version
4. Update `PADDLE_VERSION` in `.gitea/workflows/ci.yaml`


@@ -0,0 +1,74 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
def __init__(self, root):
self.samples = []
for folder in sorted(os.listdir(root)):
sub = os.path.join(root, folder)
img_dir = os.path.join(sub, "img")
txt_dir = os.path.join(sub, "txt")
if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
continue
for fname in sorted(os.listdir(img_dir)):
if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
continue
img_path = os.path.join(img_dir, fname)
# text file must have same name but .txt
txt_name = os.path.splitext(fname)[0] + ".txt"
txt_path = os.path.join(txt_dir, txt_name)
if not os.path.exists(txt_path):
continue
self.samples.append((img_path, txt_path))
def __len__(self):
return len(self.samples)
def __getitem__(self, idx):
img_path, txt_path = self.samples[idx]
# Load image
image = Image.open(img_path).convert("RGB")
# Load text
with open(txt_path, "r", encoding="utf-8") as f:
text = f.read()
return image, text
def get_output_path(self, idx, output_subdir, debugset_root="/app/debugset"):
"""Get output path for saving OCR result to debugset folder.
Args:
idx: Sample index
output_subdir: Subdirectory name (e.g., 'paddle_text', 'doctr_text')
debugset_root: Root folder for debug output (default: /app/debugset)
Returns:
Path like /app/debugset/doc1/{output_subdir}/page_001.txt
"""
img_path, _ = self.samples[idx]
# img_path: /app/dataset/doc1/img/page_001.png
# Extract relative path: doc1/img/page_001.png
parts = img_path.split("/dataset/", 1)
if len(parts) == 2:
rel_path = parts[1] # doc1/img/page_001.png
else:
rel_path = os.path.basename(img_path)
# Replace /img/ with /{output_subdir}/
rel_parts = rel_path.rsplit("/img/", 1)
doc_folder = rel_parts[0] # doc1
fname = os.path.splitext(rel_parts[1])[0] + ".txt" # page_001.txt
out_dir = os.path.join(debugset_root, doc_folder, output_subdir)
os.makedirs(out_dir, exist_ok=True)
return os.path.join(out_dir, fname)


@@ -0,0 +1,26 @@
# docker-compose.cpu-registry.yml - Pull CPU image from registry
# Usage: docker compose -f docker-compose.cpu-registry.yml up
services:
ocr-cpu:
image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
container_name: paddle-ocr-cpu-registry
ports:
- "8001:8000"
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
volumes:
paddlex-cache:
name: paddlex-model-cache


@@ -0,0 +1,39 @@
# docker-compose.gpu-registry.yml - Pull GPU image from registry
# Usage: docker compose -f docker-compose.gpu-registry.yml up
#
# Requires: NVIDIA GPU + nvidia-container-toolkit installed
services:
ocr-gpu:
image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
container_name: paddle-ocr-gpu-registry
ports:
- "8002:8000"
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
- ./scripts:/app/scripts:ro
environment:
- PYTHONUNBUFFERED=1
- CUDA_VISIBLE_DEVICES=0
- PADDLE_DET_MODEL=PP-OCRv5_mobile_det
- PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
volumes:
paddlex-cache:
name: paddlex-model-cache


@@ -0,0 +1,140 @@
# docker-compose.workers.yml - Multiple PaddleOCR workers for parallel Ray Tune
#
# Usage:
# GPU (5 workers sharing one GPU):
# docker compose -f docker-compose.workers.yml --profile gpu up
#
# CPU (5 workers):
# docker compose -f docker-compose.workers.yml --profile cpu up
#
# Start only a subset (e.g., 2 CPU workers):
# docker compose -f docker-compose.workers.yml --profile cpu up ocr-cpu-worker-1 ocr-cpu-worker-2
#
# Each worker runs on a separate port: 8001, 8002, 8003, 8004, 8005
x-ocr-gpu-common: &ocr-gpu-common
image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
- CUDA_VISIBLE_DEVICES=0
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 120s
x-ocr-cpu-common: &ocr-cpu-common
image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 120s
services:
# GPU Workers (gpu profile) - share single GPU
ocr-worker-1:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-1
ports:
- "8001:8000"
profiles:
- gpu
ocr-worker-2:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-2
ports:
- "8002:8000"
profiles:
- gpu
ocr-worker-3:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-3
ports:
- "8003:8000"
profiles:
- gpu
ocr-worker-4:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-4
ports:
- "8004:8000"
profiles:
- gpu
ocr-worker-5:
<<: *ocr-gpu-common
container_name: paddle-ocr-worker-5
ports:
- "8005:8000"
profiles:
- gpu
# CPU Workers (cpu profile) - for systems without GPU
ocr-cpu-worker-1:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-1
ports:
- "8001:8000"
profiles:
- cpu
ocr-cpu-worker-2:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-2
ports:
- "8002:8000"
profiles:
- cpu
ocr-cpu-worker-3:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-3
ports:
- "8003:8000"
profiles:
- cpu
ocr-cpu-worker-4:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-4
ports:
- "8004:8000"
profiles:
- cpu
ocr-cpu-worker-5:
<<: *ocr-cpu-common
container_name: paddle-ocr-cpu-worker-5
ports:
- "8005:8000"
profiles:
- cpu
volumes:
paddlex-cache:
name: paddlex-model-cache


@@ -0,0 +1,111 @@
# docker-compose.yml - PaddleOCR REST API
# Usage:
# CPU: docker compose up ocr-cpu
# GPU: docker compose up ocr-gpu
# Test: docker compose run --rm test
# Build: CUDA_ARCH=120 docker compose --profile build run --rm build-paddle
#
# Auto-detect CUDA arch before building:
# export CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
# docker compose --profile build run --rm build-paddle
services:
# PaddlePaddle GPU wheel builder (ARM64 only, one-time build)
# Creates ./wheels/paddlepaddle_gpu-*.whl for ARM64 GPU support
# CUDA_ARCH env var controls target GPU architecture (default: 120 for Blackwell base)
build-paddle:
build:
context: .
dockerfile: Dockerfile.build-paddle
args:
CUDA_ARCH: ${CUDA_ARCH:-120}
volumes:
- ./wheels:/wheels
profiles:
- build
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
# CPU-only service (works on any architecture)
ocr-cpu:
build:
context: .
dockerfile: Dockerfile.cpu
args:
# Models to bake into image (change before building):
DET_MODEL: PP-OCRv5_server_det
REC_MODEL: PP-OCRv5_server_rec
image: paddle-ocr-api:cpu
container_name: paddle-ocr-cpu
ports:
- "8000:8000"
volumes:
- ../dataset:/app/dataset:ro # Your dataset
- ../debugset:/app/debugset:rw # OCR debug output
- paddlex-cache:/root/.paddlex # For additional models at runtime
environment:
- PYTHONUNBUFFERED=1
# Override models at runtime (uncomment to use different models):
# - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
# - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
# GPU service (requires NVIDIA Container Toolkit)
ocr-gpu:
build:
context: .
dockerfile: Dockerfile.gpu
args:
DET_MODEL: PP-OCRv5_server_det
REC_MODEL: PP-OCRv5_server_rec
image: paddle-ocr-api:gpu
container_name: paddle-ocr-gpu
ports:
- "8000:8000"
volumes:
- ../dataset:/app/dataset:ro
- ../debugset:/app/debugset:rw
- paddlex-cache:/root/.paddlex
environment:
- PYTHONUNBUFFERED=1
- CUDA_VISIBLE_DEVICES=0
# Override models at runtime:
# - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
# - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
restart: unless-stopped
# Test client (runs once and exits)
test:
image: python:3.11-slim
container_name: paddle-ocr-test
depends_on:
ocr-cpu:
condition: service_healthy
volumes:
- ./test.py:/app/test.py:ro
working_dir: /app
command: >
sh -c "pip install -q requests && python test.py --url http://ocr-cpu:8000 --dataset /app/dataset"
network_mode: "service:ocr-cpu"
volumes:
paddlex-cache:
name: paddlex-model-cache


@@ -0,0 +1,340 @@
# paddle_ocr_tuning_rest.py
# FastAPI REST service for PaddleOCR hyperparameter evaluation
# Usage: uvicorn paddle_ocr_tuning_rest:app --host 0.0.0.0 --port 8000
import os
import re
import time
import threading
from typing import Optional
from contextlib import asynccontextmanager
import numpy as np
import paddle
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from paddleocr import PaddleOCR
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
def get_gpu_info() -> dict:
"""Get GPU status information from PaddlePaddle."""
info = {
"cuda_available": paddle.device.is_compiled_with_cuda(),
"device": str(paddle.device.get_device()),
"gpu_count": 0,
"gpu_name": None,
"gpu_memory_total": None,
"gpu_memory_used": None,
}
if info["cuda_available"]:
try:
info["gpu_count"] = paddle.device.cuda.device_count()
if info["gpu_count"] > 0:
# Get GPU properties
props = paddle.device.cuda.get_device_properties(0)
info["gpu_name"] = props.name
info["gpu_memory_total"] = f"{props.total_memory / (1024**3):.2f} GB"
# Get current memory usage
mem_reserved = paddle.device.cuda.memory_reserved(0)
mem_allocated = paddle.device.cuda.memory_allocated(0)
info["gpu_memory_used"] = f"{mem_allocated / (1024**3):.2f} GB"
info["gpu_memory_reserved"] = f"{mem_reserved / (1024**3):.2f} GB"
except Exception as e:
info["gpu_error"] = str(e)
return info
# Model configuration via environment variables (with defaults)
DEFAULT_DET_MODEL = os.environ.get("PADDLE_DET_MODEL", "PP-OCRv5_server_det")
DEFAULT_REC_MODEL = os.environ.get("PADDLE_REC_MODEL", "PP-OCRv5_server_rec")
# Global state for model and dataset
class AppState:
ocr: Optional[PaddleOCR] = None
dataset: Optional[ImageTextDataset] = None
dataset_path: Optional[str] = None
det_model: str = DEFAULT_DET_MODEL
rec_model: str = DEFAULT_REC_MODEL
lock: threading.Lock = None # Protects OCR model from concurrent access
def __init__(self):
self.lock = threading.Lock()
state = AppState()
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Load OCR model at startup."""
# Log GPU status
gpu_info = get_gpu_info()
print("=" * 50)
print("GPU STATUS")
print("=" * 50)
print(f" CUDA available: {gpu_info['cuda_available']}")
print(f" Device: {gpu_info['device']}")
if gpu_info['cuda_available']:
print(f" GPU count: {gpu_info['gpu_count']}")
print(f" GPU name: {gpu_info['gpu_name']}")
print(f" GPU memory total: {gpu_info['gpu_memory_total']}")
print("=" * 50)
print(f"Loading PaddleOCR models...")
print(f" Detection: {state.det_model}")
print(f" Recognition: {state.rec_model}")
state.ocr = PaddleOCR(
text_detection_model_name=state.det_model,
text_recognition_model_name=state.rec_model,
)
# Log GPU memory after model load
if gpu_info['cuda_available']:
gpu_after = get_gpu_info()
print(f" GPU memory after load: {gpu_after.get('gpu_memory_used', 'N/A')}")
print("Model loaded successfully!")
yield
# Cleanup on shutdown
state.ocr = None
state.dataset = None
app = FastAPI(
title="PaddleOCR Tuning API",
description="REST API for OCR hyperparameter evaluation",
version="1.0.0",
lifespan=lifespan,
)
class EvaluateRequest(BaseModel):
"""Request schema matching CLI arguments."""
pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
use_doc_orientation_classify: bool = Field(False, description="Use document orientation classification")
use_doc_unwarping: bool = Field(False, description="Use document unwarping")
textline_orientation: bool = Field(True, description="Use textline orientation classification")
text_det_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection pixel threshold")
text_det_box_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Detection box threshold")
text_det_unclip_ratio: float = Field(1.5, ge=0.0, description="Text detection expansion coefficient")
text_rec_score_thresh: float = Field(0.0, ge=0.0, le=1.0, description="Recognition score threshold")
start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
end_page: int = Field(10, ge=1, description="End page index (exclusive)")
save_output: bool = Field(False, description="Save OCR predictions to debugset folder")
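# Example /evaluate payload matching the schema above (a sketch; the folder path is
# simply the schema default and the threshold values are illustrative, not tuned):
#   {
#     "pdf_folder": "/app/dataset",
#     "textline_orientation": true,
#     "text_det_box_thresh": 0.5,
#     "start_page": 5,
#     "end_page": 10
#   }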
class EvaluateResponse(BaseModel):
"""Response schema matching CLI output."""
CER: float
WER: float
TIME: float
PAGES: int
TIME_PER_PAGE: float
class HealthResponse(BaseModel):
status: str
model_loaded: bool
dataset_loaded: bool
dataset_size: Optional[int] = None
det_model: Optional[str] = None
rec_model: Optional[str] = None
# GPU info
cuda_available: Optional[bool] = None
device: Optional[str] = None
gpu_name: Optional[str] = None
gpu_memory_used: Optional[str] = None
gpu_memory_total: Optional[str] = None
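# Illustrative /health response for a working GPU deployment (placeholder values,
# not captured output):
#   {"status": "ok", "model_loaded": true, "dataset_loaded": false,
#    "cuda_available": true, "device": "gpu:0", "gpu_name": "<device name>", ...}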
def _normalize_box_xyxy(box):
"""Normalize bounding box to (x0, y0, x1, y1) format."""
if isinstance(box, (list, tuple)) and box and isinstance(box[0], (list, tuple)):
xs = [p[0] for p in box]
ys = [p[1] for p in box]
return min(xs), min(ys), max(xs), max(ys)
if isinstance(box, (list, tuple)):
if len(box) == 4:
x0, y0, x1, y1 = box
return min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)
if len(box) == 8:
xs = box[0::2]
ys = box[1::2]
return min(xs), min(ys), max(xs), max(ys)
raise ValueError(f"Unrecognized box format: {box!r}")
def assemble_from_paddle_result(paddleocr_predict, min_score=0.0, line_tol_factor=0.6):
"""
Robust line grouping for PaddleOCR outputs.
Normalizes boxes, groups by line, and returns assembled text.
"""
boxes_all = []
for item in paddleocr_predict:
res = item.json.get("res", {})
boxes = res.get("rec_boxes", []) or []
texts = res.get("rec_texts", []) or []
scores = res.get("rec_scores", None)
for i, (box, text) in enumerate(zip(boxes, texts)):
try:
x0, y0, x1, y1 = _normalize_box_xyxy(box)
except Exception:
continue
y_mid = 0.5 * (y0 + y1)
score = float(scores[i]) if (scores is not None and i < len(scores)) else 1.0
t = re.sub(r"\s+", " ", str(text)).strip()
if not t:
continue
boxes_all.append((x0, y0, x1, y1, y_mid, t, score))
if min_score > 0:
boxes_all = [b for b in boxes_all if b[6] >= min_score]
if not boxes_all:
return ""
# Adaptive line tolerance
heights = [b[3] - b[1] for b in boxes_all]
median_h = float(np.median(heights)) if heights else 20.0
line_tol = max(8.0, line_tol_factor * median_h)
# Sort by vertical mid, then x0
boxes_all.sort(key=lambda b: (b[4], b[0]))
# Group into lines
lines, cur, last_y = [], [], None
for x0, y0, x1, y1, y_mid, text, score in boxes_all:
if last_y is None or abs(y_mid - last_y) <= line_tol:
cur.append((x0, text))
else:
cur.sort(key=lambda t: t[0])
lines.append(" ".join(t[1] for t in cur))
cur = [(x0, text)]
last_y = y_mid
if cur:
cur.sort(key=lambda t: t[0])
lines.append(" ".join(t[1] for t in cur))
res = "\n".join(lines)
res = re.sub(r"\s+\n", "\n", res).strip()
return res
def evaluate_text(reference: str, prediction: str) -> dict:
"""Calculate WER and CER metrics."""
return {"WER": wer(reference, prediction), "CER": cer(reference, prediction)}
@app.get("/health", response_model=HealthResponse)
def health_check():
"""Check if the service is ready."""
gpu_info = get_gpu_info()
return HealthResponse(
status="ok" if state.ocr is not None else "initializing",
model_loaded=state.ocr is not None,
dataset_loaded=state.dataset is not None,
dataset_size=len(state.dataset) if state.dataset else None,
det_model=state.det_model,
rec_model=state.rec_model,
cuda_available=gpu_info.get("cuda_available"),
device=gpu_info.get("device"),
gpu_name=gpu_info.get("gpu_name"),
gpu_memory_used=gpu_info.get("gpu_memory_used"),
gpu_memory_total=gpu_info.get("gpu_memory_total"),
)
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
"""
Evaluate OCR with given hyperparameters.
Returns CER, WER, and timing metrics.
"""
if state.ocr is None:
raise HTTPException(status_code=503, detail="Model not loaded yet")
# Load or reload dataset if path changed
if state.dataset is None or state.dataset_path != request.pdf_folder:
if not os.path.isdir(request.pdf_folder):
raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
state.dataset = ImageTextDataset(request.pdf_folder)
state.dataset_path = request.pdf_folder
if len(state.dataset) == 0:
raise HTTPException(status_code=400, detail="Dataset is empty")
# Validate page range
start = request.start_page
end = min(request.end_page, len(state.dataset))
if start >= end:
raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
cer_list, wer_list = [], []
time_per_page_list = []
t0 = time.time()
# Lock to prevent concurrent OCR access (model is not thread-safe)
with state.lock:
for idx in range(start, end):
img, ref = state.dataset[idx]
arr = np.array(img)
tp0 = time.time()
out = state.ocr.predict(
arr,
use_doc_orientation_classify=request.use_doc_orientation_classify,
use_doc_unwarping=request.use_doc_unwarping,
use_textline_orientation=request.textline_orientation,
text_det_thresh=request.text_det_thresh,
text_det_box_thresh=request.text_det_box_thresh,
text_det_unclip_ratio=request.text_det_unclip_ratio,
text_rec_score_thresh=request.text_rec_score_thresh,
)
pred = assemble_from_paddle_result(out)
time_per_page_list.append(float(time.time() - tp0))
# Save prediction to debugset if requested
if request.save_output:
out_path = state.dataset.get_output_path(idx, "paddle_text")
with open(out_path, "w", encoding="utf-8") as f:
f.write(pred)
m = evaluate_text(ref, pred)
cer_list.append(m["CER"])
wer_list.append(m["WER"])
return EvaluateResponse(
CER=float(np.mean(cer_list)) if cer_list else 1.0,
WER=float(np.mean(wer_list)) if wer_list else 1.0,
TIME=float(time.time() - t0),
PAGES=len(cer_list),
TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
)
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
"""Evaluate on ALL pages (ignores start_page/end_page)."""
request.start_page = 0
request.end_page = 9999 # Will be clamped to dataset size
return evaluate(request)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
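# Quick manual check once the server is running (a sketch; port 8000 matches the
# uvicorn call above, adjust if the container maps it to another host port):
#   curl http://localhost:8000/health
#   curl -X POST http://localhost:8000/evaluate \
#        -H "Content-Type: application/json" \
#        -d '{"pdf_folder": "/app/dataset", "start_page": 5, "end_page": 10}'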

View File

@@ -0,0 +1,22 @@
# PaddleOCR REST API - GPU Requirements
# Install: pip install -r requirements-gpu.txt
# PaddlePaddle (GPU version with CUDA)
paddlepaddle-gpu==3.2.0
# PaddleOCR
paddleocr==3.3.2
# OCR evaluation metrics
jiwer
# Numerical computing
numpy
# REST API framework
fastapi
uvicorn[standard]
pydantic
# Image processing
Pillow

View File

@@ -0,0 +1,22 @@
# PaddleOCR REST API - CPU Requirements
# Install: pip install -r requirements.txt
# PaddlePaddle (CPU version)
paddlepaddle==3.2.2
# PaddleOCR
paddleocr==3.3.2
# OCR evaluation metrics
jiwer
# Numerical computing
numpy
# REST API framework
fastapi
uvicorn[standard]
pydantic
# Image processing (already pulled in by paddleocr; listed explicitly)
Pillow

View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Debug script for GPU OCR detection issues.
This script tests the raw inference output from PaddlePaddle detection models
to diagnose why detection might fail on certain GPU architectures (e.g., Blackwell/sm_121).
Usage:
docker exec paddle-ocr-gpu python /app/debug_gpu_detection.py [image_path]
Expected behavior:
- Working GPU: Output stats should show min close to 0, max close to 1, mean ~0.1-0.5
- Broken GPU: Output stats show constant values (e.g., min=max=mean=0.00001)
"""
import os
import sys
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
import numpy as np
import paddle
from PIL import Image
def check_gpu_status():
"""Check GPU availability and properties."""
print("=" * 60)
print("GPU STATUS")
print("=" * 60)
print(f"Device: {paddle.device.get_device()}")
print(f"CUDA compiled: {paddle.device.is_compiled_with_cuda()}")
if paddle.device.is_compiled_with_cuda():
print(f"GPU count: {paddle.device.cuda.device_count()}")
if paddle.device.cuda.device_count() > 0:
props = paddle.device.cuda.get_device_properties(0)
print(f"GPU name: {props.name}")
print(f"Compute capability: {props.major}.{props.minor}")
print(f"Total memory: {props.total_memory / (1024**3):.2f} GB")
print()
def test_basic_ops():
"""Test basic GPU tensor operations."""
print("=" * 60)
print("BASIC GPU OPERATIONS")
print("=" * 60)
# Test tensor creation
x = paddle.randn([2, 3])
print(f"Tensor place: {x.place}")
# Test conv2d
x = paddle.randn([1, 3, 64, 64])
conv = paddle.nn.Conv2D(3, 16, 3, padding=1)
y = conv(x)
print(f"Conv2d output shape: {y.shape}, place: {y.place}")
# Test softmax
s = paddle.nn.functional.softmax(y, axis=1)
print(f"Softmax output shape: {s.shape}")
print("Basic operations: OK")
print()
def test_detection_model(image_path: str):
"""Test detection model raw output."""
print("=" * 60)
print("DETECTION MODEL TEST")
print("=" * 60)
from paddle.inference import Config, create_predictor
model_dir = '/root/.paddlex/official_models/PP-OCRv4_mobile_det'
inference_file = f'{model_dir}/inference.json'
params_file = f'{model_dir}/inference.pdiparams'
if not os.path.exists(inference_file):
print(f"Model not found at {model_dir}")
print("Run PaddleOCR once to download models first.")
return
# Create config
config = Config()
config.set_prog_file(inference_file)
config.set_params_file(params_file)
config.enable_use_gpu(1024, 0)
print("Creating predictor...")
predictor = create_predictor(config)
# Get input/output names
input_names = predictor.get_input_names()
output_names = predictor.get_output_names()
print(f"Input names: {input_names}")
print(f"Output names: {output_names}")
# Load and preprocess image
    img = Image.open(image_path).convert("RGB")  # force 3 channels; a PNG may carry an alpha channel
    img = img.resize((640, 640))
arr = np.array(img).astype('float32')
arr = arr / 255.0
arr = arr.transpose(2, 0, 1)[np.newaxis, ...] # NCHW
print(f"Input tensor shape: {arr.shape}")
# Set input
input_handle = predictor.get_input_handle(input_names[0])
input_handle.reshape(arr.shape)
input_handle.copy_from_cpu(arr)
# Run prediction
print("Running inference...")
predictor.run()
# Get output
output_handle = predictor.get_output_handle(output_names[0])
output = output_handle.copy_to_cpu()
print()
print("OUTPUT ANALYSIS:")
print(f" Shape: {output.shape}")
print(f" Min: {output.min():.6f}")
print(f" Max: {output.max():.6f}")
print(f" Mean: {output.mean():.6f}")
print(f" Std: {output.std():.6f}")
print(f" Has NaN: {np.isnan(output).any()}")
print(f" Has Inf: {np.isinf(output).any()}")
# Diagnosis
print()
print("DIAGNOSIS:")
if output.min() == output.max():
print(" PROBLEM: Output is constant - model inference is broken!")
print(" This typically indicates GPU compute capability mismatch.")
print(" GB10 (sm_121) may need CUDA 13.0+ for native support.")
elif output.max() < 0.01:
print(" PROBLEM: Output values too low - detection will find nothing.")
elif np.isnan(output).any() or np.isinf(output).any():
print(" PROBLEM: Output contains NaN/Inf - numerical instability.")
else:
print(" OK: Output values look reasonable.")
print(f" Detection threshold typically 0.3-0.6, max output is {output.max():.3f}")
def test_paddleocr_output(image_path: str):
"""Test full PaddleOCR pipeline."""
print()
print("=" * 60)
print("PADDLEOCR PIPELINE TEST")
print("=" * 60)
from paddleocr import PaddleOCR
ocr = PaddleOCR(
text_detection_model_name='PP-OCRv4_mobile_det',
text_recognition_model_name='PP-OCRv4_mobile_rec',
)
img = Image.open(image_path)
arr = np.array(img)
out = ocr.predict(arr)
res = out[0].json['res']
dt_polys = res.get('dt_polys', [])
rec_texts = res.get('rec_texts', [])
print(f"Detection polygons: {len(dt_polys)}")
print(f"Recognition texts: {len(rec_texts)}")
if rec_texts:
print(f"Sample texts: {rec_texts[:5]}")
else:
print("No text detected!")
def main():
# Default test image
image_path = '/app/dataset/0/img/page_0001.png'
if len(sys.argv) > 1:
image_path = sys.argv[1]
if not os.path.exists(image_path):
print(f"Image not found: {image_path}")
print("Usage: python debug_gpu_detection.py [image_path]")
sys.exit(1)
print(f"Testing with image: {image_path}")
print()
check_gpu_status()
test_basic_ops()
test_detection_model(image_path)
test_paddleocr_output(image_path)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,207 @@
#!/usr/bin/env python3
"""
Test PaddleOCR in dynamic graph mode (not inference mode).
Dynamic mode compiles kernels at runtime, which may work on Blackwell.
Inference mode uses pre-compiled kernels which fail on sm_121.
Usage:
python test_dynamic_mode.py [image_path]
"""
import os
import sys
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
# Force dynamic graph mode
os.environ['FLAGS_enable_pir_api'] = '0'
import numpy as np
import paddle
from PIL import Image
def check_gpu():
"""Check GPU status."""
print("=" * 60)
print("GPU STATUS")
print("=" * 60)
print(f"Device: {paddle.device.get_device()}")
print(f"CUDA compiled: {paddle.device.is_compiled_with_cuda()}")
if paddle.device.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0:
props = paddle.device.cuda.get_device_properties(0)
print(f"GPU: {props.name} (sm_{props.major}{props.minor})")
print(f"Memory: {props.total_memory / (1024**3):.1f} GB")
print()
def test_paddleocr_dynamic(image_path: str):
"""Test PaddleOCR with dynamic execution."""
print("=" * 60)
print("PADDLEOCR DYNAMIC MODE TEST")
print("=" * 60)
# Import PaddleOCR
from paddleocr import PaddleOCR
# Try to force dynamic mode by setting use_static=False if available
# or by using the model in eval mode directly
print("Creating PaddleOCR instance...")
print("(This may download models on first run)")
try:
# Create OCR instance - this might still use inference internally
ocr = PaddleOCR(
text_detection_model_name='PP-OCRv4_mobile_det',
text_recognition_model_name='PP-OCRv4_mobile_rec',
use_angle_cls=False, # Simplify
lang='es',
)
# Load image
img = Image.open(image_path)
arr = np.array(img)
print(f"Image shape: {arr.shape}")
# Run prediction
print("Running OCR prediction...")
result = ocr.predict(arr)
# Parse results
res = result[0].json['res']
dt_polys = res.get('dt_polys', [])
rec_texts = res.get('rec_texts', [])
print()
print("RESULTS:")
print(f" Detected boxes: {len(dt_polys)}")
print(f" Recognized texts: {len(rec_texts)}")
if rec_texts:
print(f" First 5 texts: {rec_texts[:5]}")
return True
else:
print(" WARNING: No text recognized!")
return False
except Exception as e:
print(f"ERROR: {e}")
return False
def test_paddle_dynamic_model():
"""Test loading a paddle model in dynamic graph mode."""
print()
print("=" * 60)
print("PADDLE DYNAMIC GRAPH TEST")
print("=" * 60)
# Ensure we're in dynamic mode
paddle.disable_static()
# Test a simple model forward pass
print("Testing dynamic graph execution...")
# Create a simple ResNet-like block
x = paddle.randn([1, 3, 224, 224])
# Conv -> BN -> ReLU
conv = paddle.nn.Conv2D(3, 64, 7, stride=2, padding=3)
bn = paddle.nn.BatchNorm2D(64)
# Forward pass (dynamic mode - compiles at runtime)
y = conv(x)
y = bn(y)
y = paddle.nn.functional.relu(y)
print(f"Input shape: {x.shape}")
print(f"Output shape: {y.shape}")
print(f"Output min: {y.min().item():.4f}")
print(f"Output max: {y.max().item():.4f}")
print(f"Output mean: {y.mean().item():.4f}")
if y.min() != y.max():
print("Dynamic graph mode: WORKING")
return True
else:
print("Dynamic graph mode: BROKEN (constant output)")
return False
def test_ppocr_model_direct():
"""Try loading PPOCRv4 model directly in dynamic mode."""
print()
print("=" * 60)
print("PPOCR MODEL DIRECT LOAD TEST")
print("=" * 60)
try:
# Try to import ppocr modules directly
# This bypasses the inference predictor
from paddleocr.ppocr.modeling.architectures import build_model
from paddleocr.ppocr.postprocess import build_post_process
from paddleocr.ppocr.utils.save_load import load_model
print("Direct model import available")
# Note: This approach requires model config files
# which may or may not be bundled with paddleocr
except ImportError as e:
print(f"Direct model import not available: {e}")
print("PaddleOCR may only support inference mode")
return False
def main():
# Default test image
image_path = '/app/dataset/0/img/page_0001.png'
if len(sys.argv) > 1:
image_path = sys.argv[1]
if not os.path.exists(image_path):
print(f"Image not found: {image_path}")
sys.exit(1)
print(f"Testing with image: {image_path}")
print()
check_gpu()
# Test 1: Basic dynamic graph
dynamic_works = test_paddle_dynamic_model()
if not dynamic_works:
print("\nDynamic graph mode is broken - GPU likely unsupported")
sys.exit(1)
# Test 2: Direct model load
test_ppocr_model_direct()
# Test 3: PaddleOCR pipeline
ocr_works = test_paddleocr_dynamic(image_path)
print()
print("=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Dynamic graph mode: {'WORKS' if dynamic_works else 'BROKEN'}")
print(f"PaddleOCR pipeline: {'WORKS' if ocr_works else 'BROKEN'}")
if dynamic_works and not ocr_works:
print()
print("DIAGNOSIS: Dynamic mode works but PaddleOCR fails.")
print("This means PaddleOCR internally uses inference predictor")
print("which has pre-compiled kernels without Blackwell support.")
print()
print("Potential solutions:")
print("1. Modify PaddleOCR to use dynamic mode")
print("2. Use ONNX export + ONNXRuntime")
print("3. Wait for PaddlePaddle Blackwell support")
if __name__ == '__main__':
main()
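# If the summary points at the inference predictor, option 2 above (ONNX export +
# ONNXRuntime) could be sketched roughly as follows; the paddle2onnx flags are an
# assumption, so check `paddle2onnx --help` for the installed version:
#   paddle2onnx --model_dir /root/.paddlex/official_models/PP-OCRv4_mobile_det \
#               --save_file det_model.onnx
#   python -c "import onnxruntime as ort; print(ort.get_available_providers())"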

View File

@@ -0,0 +1,69 @@
#!/bin/bash
# Upload PaddlePaddle ARM64 wheel to Gitea generic packages
#
# Usage:
# ./scripts/upload-wheel.sh [wheel_file] [token]
#
# Environment variables (alternative to arguments):
# GITEA_TOKEN - Gitea API token
# WHEEL_FILE - Path to wheel file (default: auto-detect in wheels/)
set -e
GITEA_URL="https://seryus.ddns.net"
GITEA_ORG="unir"
PACKAGE_NAME="paddlepaddle-gpu-arm64"
# Get wheel file
WHEEL_FILE="${1:-${WHEEL_FILE:-$(ls wheels/paddlepaddle*.whl 2>/dev/null | head -1)}}"
if [ -z "$WHEEL_FILE" ] || [ ! -f "$WHEEL_FILE" ]; then
echo "Error: No wheel file found"
echo "Usage: $0 [wheel_file] [token]"
echo " or set WHEEL_FILE environment variable"
exit 1
fi
# Get token
TOKEN="${2:-${GITEA_TOKEN}}"
if [ -z "$TOKEN" ]; then
echo "Error: No token provided"
echo "Usage: $0 [wheel_file] [token]"
echo " or set GITEA_TOKEN environment variable"
exit 1
fi
# Extract version from wheel filename
# Format: paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl
FILENAME=$(basename "$WHEEL_FILE")
VERSION=$(echo "$FILENAME" | sed -E 's/paddlepaddle[_-]gpu-([0-9.]+)-.*/\1/')
if [ -z "$VERSION" ]; then
echo "Error: Could not extract version from filename: $FILENAME"
exit 1
fi
echo "Uploading wheel to Gitea packages..."
echo " File: $WHEEL_FILE"
echo " Package: $PACKAGE_NAME"
echo " Version: $VERSION"
echo " URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME"
# Upload using PUT request
HTTP_CODE=$(curl -sS -w "%{http_code}" -o /tmp/upload_response.txt \
-X PUT \
-H "Authorization: token $TOKEN" \
-H "Content-Type: application/octet-stream" \
--data-binary "@$WHEEL_FILE" \
"$GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME")
if [ "$HTTP_CODE" = "201" ] || [ "$HTTP_CODE" = "200" ]; then
echo "Success! Wheel uploaded."
echo "Download URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME"
elif [ "$HTTP_CODE" = "409" ]; then
echo "Package version already exists (HTTP 409)"
echo "To update, delete the existing version first in Gitea UI"
else
echo "Error: Upload failed with HTTP $HTTP_CODE"
cat /tmp/upload_response.txt
exit 1
fi
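# After a successful upload the wheel can be installed straight from the registry,
# for example (substitute the version and filename echoed above; add an
# Authorization header if the package is not public):
#   pip install "$GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/<version>/<filename>"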

src/paddle_ocr/test.py
View File

@@ -0,0 +1,114 @@
# test.py - Simple client to test PaddleOCR REST API
# Usage: python test.py [--url URL] [--dataset PATH]
import argparse
import requests
import time
import sys
def wait_for_health(url: str, timeout: int = 120) -> bool:
"""Wait for API to be ready."""
health_url = f"{url}/health"
start = time.time()
print(f"Waiting for API at {health_url}...")
while time.time() - start < timeout:
try:
resp = requests.get(health_url, timeout=5)
if resp.status_code == 200:
data = resp.json()
if data.get("model_loaded"):
print(f"API ready! Model loaded in {time.time() - start:.1f}s")
return True
print(f" Model loading... ({time.time() - start:.0f}s)")
except requests.exceptions.ConnectionError:
print(f" Connecting... ({time.time() - start:.0f}s)")
except Exception as e:
print(f" Error: {e}")
time.sleep(2)
print("Timeout waiting for API")
return False
def test_evaluate(url: str, config: dict) -> dict:
"""Run evaluation with given config."""
eval_url = f"{url}/evaluate"
print(f"\nTesting config: {config}")
start = time.time()
resp = requests.post(eval_url, json=config, timeout=600)
resp.raise_for_status()
result = resp.json()
elapsed = time.time() - start
print(f"Results (took {elapsed:.1f}s):")
print(f" CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)")
print(f" WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)")
print(f" Pages: {result['PAGES']}")
print(f" Time/page: {result['TIME_PER_PAGE']:.2f}s")
return result
def main():
parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
parser.add_argument("--url", default="http://localhost:8001", help="API base URL")
parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
args = parser.parse_args()
# Wait for API to be ready
if not args.skip_health:
if not wait_for_health(args.url):
sys.exit(1)
# Test 1: Baseline config (default PaddleOCR)
print("\n" + "="*50)
print("TEST 1: Baseline Configuration")
print("="*50)
baseline = test_evaluate(args.url, {
"pdf_folder": args.dataset,
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"textline_orientation": False, # Baseline: disabled
"text_det_thresh": 0.0,
"text_det_box_thresh": 0.0,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"start_page": 5,
"end_page": 10,
})
# Test 2: Optimized config (from Ray Tune results)
print("\n" + "="*50)
print("TEST 2: Optimized Configuration")
print("="*50)
optimized = test_evaluate(args.url, {
"pdf_folder": args.dataset,
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"textline_orientation": True, # KEY: enabled
"text_det_thresh": 0.4690,
"text_det_box_thresh": 0.5412,
"text_det_unclip_ratio": 0.0,
"text_rec_score_thresh": 0.6350,
"start_page": 5,
"end_page": 10,
})
# Summary
print("\n" + "="*50)
print("SUMMARY")
print("="*50)
cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0
print(f"Baseline CER: {baseline['CER']*100:.2f}%")
print(f"Optimized CER: {optimized['CER']*100:.2f}%")
print(f"Improvement: {cer_reduction:.1f}% reduction in errors")
if __name__ == "__main__":
main()

View File