From 78fe3e8c81fd88952109298b57775cc367cd4e22 Mon Sep 17 00:00:00 2001
From: Sergio Jimenez Jimenez
Date: Sat, 17 Jan 2026 10:46:36 +0100
Subject: [PATCH] gpu dgx

---
 src/paddle_ocr/Dockerfile.build-paddle | 141 ++++++++++++++++++
 src/paddle_ocr/Dockerfile.gpu          |  39 ++++-
 src/paddle_ocr/README.md               | 197 ++++++++++++++++++++-----
 src/paddle_ocr/docker-compose.yml      |  26 +++-
 src/paddle_ocr/wheels/.gitkeep         |   0
 5 files changed, 358 insertions(+), 45 deletions(-)
 create mode 100644 src/paddle_ocr/Dockerfile.build-paddle
 create mode 100644 src/paddle_ocr/wheels/.gitkeep

diff --git a/src/paddle_ocr/Dockerfile.build-paddle b/src/paddle_ocr/Dockerfile.build-paddle
new file mode 100644
index 0000000..e5caf69
--- /dev/null
+++ b/src/paddle_ocr/Dockerfile.build-paddle
@@ -0,0 +1,141 @@
+# Dockerfile.build-paddle - Build PaddlePaddle GPU wheel for ARM64
+#
+# This Dockerfile compiles PaddlePaddle from source with CUDA support for ARM64.
+# The resulting wheel can be used in Dockerfile.gpu for ARM64 GPU acceleration.
+#
+# Build time: 2-4 hours depending on hardware
+# Output: /output/paddlepaddle_gpu-*.whl
+#
+# Usage:
+#   docker compose run build-paddle
+#   # or
+#   docker build -f Dockerfile.build-paddle -t paddle-builder .
+#   docker run -v ./wheels:/output paddle-builder
+
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+LABEL maintainer="Sergio Jimenez"
+LABEL description="PaddlePaddle GPU wheel builder for ARM64"
+
+# Build arguments
+ARG PADDLE_VERSION=v3.0.0
+ARG PYTHON_VERSION=3.11
+
+# Environment setup
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    # Python
+    python${PYTHON_VERSION} \
+    python${PYTHON_VERSION}-dev \
+    python${PYTHON_VERSION}-venv \
+    python3-pip \
+    # Build tools
+    build-essential \
+    cmake \
+    ninja-build \
+    git \
+    wget \
+    curl \
+    pkg-config \
+    # Libraries
+    libssl-dev \
+    libffi-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    liblzma-dev \
+    libncurses5-dev \
+    libncursesw5-dev \
+    libgflags-dev \
+    libgoogle-glog-dev \
+    libprotobuf-dev \
+    protobuf-compiler \
+    patchelf \
+    # Additional dependencies for Paddle
+    libopenblas-dev \
+    liblapack-dev \
+    swig \
+    && rm -rf /var/lib/apt/lists/* \
+    && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
+    && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
+
+# Upgrade pip and install Python build dependencies
+RUN python -m pip install --upgrade pip setuptools wheel \
+    && python -m pip install \
+    numpy \
+    protobuf \
+    pyyaml \
+    requests \
+    packaging \
+    astor \
+    decorator \
+    paddle-bfloat \
+    opt-einsum
+
+WORKDIR /build
+
+# Clone PaddlePaddle repository
+RUN git clone --depth 1 --branch ${PADDLE_VERSION} \
+    https://github.com/PaddlePaddle/Paddle.git
+
+WORKDIR /build/Paddle
+
+# Install additional Python requirements for building
+RUN pip install -r python/requirements.txt || true
+
+# Create build directory
+RUN mkdir -p build
+WORKDIR /build/Paddle/build
+
+# Configure CMake for ARM64 + CUDA build
+# Note: Adjust CUDA_ARCH_NAME based on your GPU architecture
+# Common values: Auto, Ampere, Ada, Hopper
+RUN cmake .. \
+    -GNinja \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DPY_VERSION=${PYTHON_VERSION} \
+    -DWITH_GPU=ON \
+    -DWITH_TESTING=OFF \
+    -DWITH_DISTRIBUTE=OFF \
+    -DWITH_NCCL=OFF \
+    -DWITH_MKL=OFF \
+    -DWITH_MKLDNN=OFF \
+    -DON_INFER=OFF \
+    -DWITH_PYTHON=ON \
+    -DWITH_AVX=OFF \
+    -DCUDA_ARCH_NAME=Auto \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+
+# Build PaddlePaddle (this takes 2-4 hours)
+RUN ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4
+
+# Build the Python wheel
+WORKDIR /build/Paddle/build
+RUN ninja paddle_python
+
+# Create output directory and copy wheel
+RUN mkdir -p /output
+
+# The wheel should be in python/dist/
+WORKDIR /build/Paddle
+
+# Build wheel package
+RUN cd python && python setup.py bdist_wheel
+
+# Copy wheel to output
+RUN cp python/dist/*.whl /output/ 2>/dev/null || \
+    cp build/python/dist/*.whl /output/ 2>/dev/null || \
+    echo "Wheel location may vary, checking build artifacts..."
+
+# List what was built
+RUN ls -la /output/ && \
+    echo "=== Build complete ===" && \
+    echo "Wheel files:" && \
+    find /build -name "*.whl" -type f 2>/dev/null
+
+# Default command: copy wheel to mounted volume
+CMD ["sh", "-c", "if cp /output/*.whl /wheels/ 2>/dev/null; then echo 'Wheel copied to /wheels/'; else echo 'No wheel found in /output, checking other locations...'; find /build -name '*.whl' -exec cp {} /wheels/ \\; ; fi; ls -la /wheels/"]
diff --git a/src/paddle_ocr/Dockerfile.gpu b/src/paddle_ocr/Dockerfile.gpu
index 5c3ca27..df0e4df 100644
--- a/src/paddle_ocr/Dockerfile.gpu
+++ b/src/paddle_ocr/Dockerfile.gpu
@@ -1,6 +1,15 @@
 # Dockerfile.gpu - CUDA-enabled PaddleOCR REST API
-# Supports: x86_64 with NVIDIA GPU (CUDA 12.x)
-# For DGX Spark (ARM64 + CUDA): build natively on the device
+#
+# Supports:
+#   - x86_64: Uses prebuilt paddlepaddle-gpu wheel from PyPI
+#   - ARM64:  Uses locally compiled wheel from ./wheels/ directory
+#
+# For ARM64, first build the wheel:
+#   docker compose run build-paddle
+# Then build this image:
+#   docker compose build ocr-gpu
+#
+# See README.md for detailed ARM64 GPU build instructions.

 FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

@@ -28,9 +37,31 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/* \
     && ln -sf /usr/bin/python3.11 /usr/bin/python

-# Install Python dependencies from requirements file
+# Copy local wheels directory (may be empty or contain an ARM64 wheel)
+# The wheels/ directory is created by: docker compose run build-paddle
+COPY wheels/ /tmp/wheels/
+
+# Install Python dependencies
+# Strategy:
+#   1. If a local paddlepaddle wheel exists (ARM64), install it first
+#   2. Then install the remaining dependencies (excluding paddlepaddle-gpu from requirements)
 COPY requirements-gpu.txt .
-RUN pip install --no-cache-dir -r requirements-gpu.txt
+
+# Install paddlepaddle: prefer the local wheel, fall back to pip
+RUN if ls /tmp/wheels/paddlepaddle*.whl 1>/dev/null 2>&1; then \
+        echo "=== Installing PaddlePaddle from local wheel (ARM64) ===" && \
+        pip install --no-cache-dir /tmp/wheels/paddlepaddle*.whl; \
+    else \
+        echo "=== Installing PaddlePaddle from PyPI (x86_64) ===" && \
+        pip install --no-cache-dir paddlepaddle-gpu==3.0.0; \
+    fi
+
+# Install remaining dependencies (skip the paddlepaddle-gpu line)
+RUN grep -v "paddlepaddle-gpu" requirements-gpu.txt > /tmp/requirements-no-paddle.txt && \
+    pip install --no-cache-dir -r /tmp/requirements-no-paddle.txt
+
+# Cleanup
+RUN rm -rf /tmp/wheels /tmp/requirements-no-paddle.txt

 # Copy application code
 COPY paddle_ocr_tuning_rest.py .
diff --git a/src/paddle_ocr/README.md b/src/paddle_ocr/README.md
index 1012a2b..113298d 100644
--- a/src/paddle_ocr/README.md
+++ b/src/paddle_ocr/README.md
@@ -66,8 +66,10 @@ docker compose up ocr-cpu
 | `dataset_manager.py` | Dataset loader |
 | `test.py` | API test client |
 | `Dockerfile.cpu` | CPU-only image (multi-arch) |
-| `Dockerfile.gpu` | GPU/CUDA image (x86_64) |
+| `Dockerfile.gpu` | GPU/CUDA image (x86_64 + ARM64 with local wheel) |
+| `Dockerfile.build-paddle` | PaddlePaddle GPU wheel builder for ARM64 |
 | `docker-compose.yml` | Service orchestration |
+| `wheels/` | Local PaddlePaddle wheels (created by build-paddle) |

 ## API Endpoints

@@ -147,54 +149,172 @@ docker run -d -p 8000:8000 --gpus all \
   paddle-ocr-api:gpu
 ```

-## DGX Spark (ARM64 + CUDA)
+## GPU Support Analysis

-DGX Spark uses ARM64 (Grace CPU) with NVIDIA Hopper GPU. You have two options:
+### Host System Reference (DGX Spark)

-### Option 1: Native ARM64 Build (Recommended)
+This section documents GPU support findings based on testing on an NVIDIA DGX Spark:

-PaddlePaddle has ARM64 support. Build natively:
+| Component | Value |
+|-----------|-------|
+| Architecture | ARM64 (aarch64) |
+| CPU | NVIDIA Grace (ARM) |
+| GPU | NVIDIA GB10 |
+| CUDA Version | 13.0 |
+| Driver | 580.95.05 |
+| OS | Ubuntu 24.04 LTS |
+| Container Toolkit | nvidia-container-toolkit 1.18.1 |
+| Docker | 28.5.1 |
+| Docker Compose | v2.40.0 |
+
+### PaddlePaddle GPU Platform Support
+
+**Critical Finding:** PaddlePaddle-GPU does **NOT** support the ARM64/aarch64 architecture.
+
+| Platform | CPU | GPU |
+|----------|-----|-----|
+| Linux x86_64 | ✅ | ✅ CUDA 10.2/11.x/12.x |
+| Windows x64 | ✅ | ✅ CUDA 10.2/11.x/12.x |
+| macOS x64 | ✅ | ❌ |
+| macOS ARM64 (M1/M2) | ✅ | ❌ |
+| Linux ARM64 (Jetson/DGX) | ✅ | ❌ No wheels |
+
+**Source:** [PaddlePaddle-GPU PyPI](https://pypi.org/project/paddlepaddle-gpu/) - only `manylinux_x86_64` and `win_amd64` wheels are available.
+
+### Why GPU Doesn't Work on ARM64
+
+1. **No prebuilt wheels**: `pip install paddlepaddle-gpu` fails on ARM64 because no compatible wheels exist
+2. **Not a CUDA issue**: The NVIDIA CUDA base images work fine on ARM64 (`nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04`)
+3. **Not a container toolkit issue**: `nvidia-container-toolkit` is installed and functional
+4. **PaddlePaddle limitation**: The Paddle team does not publish GPU wheels for ARM64
+
+When you run `pip install paddlepaddle-gpu` on ARM64:
+```
+ERROR: No matching distribution found for paddlepaddle-gpu
+```
+
+### Options for ARM64 Systems
+
+#### Option 1: CPU-Only (Recommended)
+
+Use `Dockerfile.cpu`, which works on ARM64:

 ```bash
-# On DGX Spark or ARM64 machine
+# On DGX Spark
+docker compose up ocr-cpu
+
+# Or build directly
+docker build -f Dockerfile.cpu -t paddle-ocr-api:cpu .
+```
+
+**Performance:** CPU inference on the ARM64 Grace CPU is fast thanks to its high core count. Expect ~2-5 seconds per page.
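+
+To confirm the container is actually using the CPU build of Paddle, a quick check (a sketch; it assumes the `ocr-cpu` service from `docker-compose.yml` is already running and that `python` is on the container's PATH):
+
+```bash
+# Should print "cpu" when the CPU-only paddlepaddle package is installed
+docker compose exec ocr-cpu python -c "import paddle; print(paddle.device.get_device())"
+```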
+
+#### Option 2: Build PaddlePaddle from Source (Docker-based)
+
+Use the included Docker builder to compile PaddlePaddle GPU for ARM64:
+
+```bash
+cd src/paddle_ocr
+
+# Step 1: Build the PaddlePaddle GPU wheel (one-time, 2-4 hours)
+docker compose --profile build run --rm build-paddle
+
+# Verify the wheel was created
+ls -la wheels/paddlepaddle*.whl
+
+# Step 2: Build the GPU image (uses the local wheel)
+docker compose build ocr-gpu
+
+# Step 3: Run with GPU
+docker compose up ocr-gpu
+
+# Verify GPU is working
+docker compose exec ocr-gpu python -c "import paddle; print(paddle.device.is_compiled_with_cuda())"
+```
+
+**What this does:**
+1. `build-paddle` compiles PaddlePaddle from source inside a CUDA container
+2. The wheel is saved to the `./wheels/` directory
+3. `Dockerfile.gpu` detects the local wheel and installs it instead of pulling from PyPI
+
+**Caveats:**
+- The build takes 2-4 hours on the first run
+- Requires ~20GB of disk space during the build
+- Not officially supported by the PaddlePaddle team
+- May need adjustments for future PaddlePaddle versions
+
+See: [GitHub Issue #17327](https://github.com/PaddlePaddle/PaddleOCR/issues/17327)
+
+#### Option 3: Alternative OCR Engines
+
+For ARM64 GPU acceleration, consider these alternatives:
+
+| Engine | ARM64 GPU | Notes |
+|--------|-----------|-------|
+| **Tesseract** | ❌ CPU-only | Good fallback, widely available |
+| **EasyOCR** | ⚠️ Via PyTorch | PyTorch has ARM64 GPU support |
+| **TrOCR** | ⚠️ Via Transformers | Hugging Face Transformers + PyTorch |
+| **docTR** | ⚠️ Via TensorFlow/PyTorch | Both backends have ARM64 support |
+
+EasyOCR with PyTorch is a viable alternative:
+```bash
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
+pip install easyocr
+```
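+
+A minimal usage sketch for the EasyOCR route (illustrative only, not part of this service; `gpu=True` assumes a CUDA-enabled PyTorch build, and the image path is hypothetical):
+
+```python
+import easyocr
+
+# Build a reader for English; EasyOCR falls back to CPU with a warning if CUDA is unavailable.
+reader = easyocr.Reader(["en"], gpu=True)
+
+# readtext returns a list of (bounding_box, text, confidence) tuples.
+for bbox, text, confidence in reader.readtext("sample_page.png"):
+    print(f"{confidence:.2f}  {text}")
+```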
+
+### x86_64 GPU Setup (Working)
+
+For x86_64 systems with an NVIDIA GPU, the GPU image works out of the box:
+
+```bash
+# Verify the GPU is accessible
+nvidia-smi
+
+# Verify Docker GPU access
+docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
+
+# Build and run the GPU version
+docker compose up ocr-gpu
+```
+
+### GPU Docker Compose Configuration
+
+The `docker-compose.yml` configures GPU access via:
+
+```yaml
+deploy:
+  resources:
+    reservations:
+      devices:
+        - driver: nvidia
+          count: 1
+          capabilities: [gpu]
+```
+
+This requires Docker Compose v2 and the nvidia-container-toolkit package.
+
+## DGX Spark / ARM64 Quick Start
+
+For ARM64 systems (DGX Spark, Jetson, Graviton), use the CPU-only image:
+
+```bash
+cd src/paddle_ocr
+
+# Build the ARM64-native CPU image
 docker build -f Dockerfile.cpu -t paddle-ocr-api:arm64 .
-```
-For GPU acceleration on ARM64, you'll need to modify `Dockerfile.gpu` to use ARM-compatible base image:
-
-```dockerfile
-# Change this line in Dockerfile.gpu:
-FROM nvcr.io/nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
-
-# To ARM64-compatible version:
-FROM nvcr.io/nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
-# (same image works on ARM64 when pulled on ARM machine)
-```
-
-Then build on the DGX Spark:
-```bash
-docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu-arm64 .
-```
-
-### Option 2: x86_64 Emulation via QEMU (Slow)
-
-You CAN run x86_64 images on ARM via emulation, but it's ~10-20x slower:
-
-```bash
-# On DGX Spark, enable QEMU emulation
-docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
-
-# Run x86_64 image with emulation
-docker run --platform linux/amd64 -p 8000:8000 \
+docker run -d -p 8000:8000 \
   -v $(pwd)/../dataset:/app/dataset:ro \
-  paddle-ocr-api:cpu
+  paddle-ocr-api:arm64
+
+# Test
+curl http://localhost:8000/health
 ```

-**Not recommended** for production due to severe performance penalty.
+### Cross-Compile from x86_64

-### Option 3: Cross-compile from x86_64
-
-Build ARM64 images from your x86_64 machine:
+Build ARM64 images from an x86_64 machine:

 ```bash
 # Setup buildx for multi-arch
@@ -209,6 +329,7 @@ docker buildx build -f Dockerfile.cpu \
 # Save and transfer to DGX Spark
 docker save paddle-ocr-api:arm64 | gzip > paddle-ocr-arm64.tar.gz
 scp paddle-ocr-arm64.tar.gz dgx-spark:~/
+
 # On DGX Spark:
 docker load < paddle-ocr-arm64.tar.gz
 ```
diff --git a/src/paddle_ocr/docker-compose.yml b/src/paddle_ocr/docker-compose.yml
index 1bbd6e0..5f27afd 100644
--- a/src/paddle_ocr/docker-compose.yml
+++ b/src/paddle_ocr/docker-compose.yml
@@ -1,10 +1,30 @@
 # docker-compose.yml - PaddleOCR REST API
 # Usage:
-#   CPU:  docker compose up ocr-cpu
-#   GPU:  docker compose up ocr-gpu
-#   Test: docker compose run --rm test
+#   CPU:   docker compose up ocr-cpu
+#   GPU:   docker compose up ocr-gpu
+#   Test:  docker compose run --rm test
+#   Build: docker compose run --rm build-paddle  (ARM64 GPU wheel, one-time)

 services:
+  # PaddlePaddle GPU wheel builder (ARM64 only, one-time build)
+  # Creates ./wheels/paddlepaddle_gpu-*.whl for ARM64 GPU support
+  # Run once: docker compose run --rm build-paddle
+  build-paddle:
+    build:
+      context: .
+      dockerfile: Dockerfile.build-paddle
+    volumes:
+      - ./wheels:/wheels
+    profiles:
+      - build
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
   # CPU-only service (works on any architecture)
   ocr-cpu:
     build:
diff --git a/src/paddle_ocr/wheels/.gitkeep b/src/paddle_ocr/wheels/.gitkeep
new file mode 100644
index 0000000..e69de29