From b96dc1ed91bd340896239c27a25fe74b9b149c5b Mon Sep 17 00:00:00 2001
From: Sergio Jimenez Jimenez <sergiojj932@gmail.com>
Date: Sat, 17 Jan 2026 17:25:05 +0100
Subject: [PATCH] build multi arch

---
 .gitea/workflows/ci.yaml                      | 42 +++++++++++++++++++
 src/paddle_ocr/Dockerfile.build-paddle-cpu    |  8 +++-
 src/paddle_ocr/README.md                      | 27 ++++++------
 .../docker-compose.cpu-registry.yml           |  2 +-
 .../docker-compose.gpu-registry.yml           |  2 +-
 5 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/.gitea/workflows/ci.yaml b/.gitea/workflows/ci.yaml
index 12f3712..c3aa000 100644
--- a/.gitea/workflows/ci.yaml
+++ b/.gitea/workflows/ci.yaml
@@ -137,3 +137,45 @@ jobs:
           tags: |
             ${{ needs.essential.outputs.image_gpu }}:${{ needs.essential.outputs.Version }}-${{ steps.arch.outputs.suffix }}
             ${{ needs.essential.outputs.image_gpu }}:${{ steps.arch.outputs.suffix }}
+
+  # Publish multi-arch manifest tags (:latest and :<Version>) for the CPU image
+  manifest_cpu:
+    runs-on: ubuntu-latest
+    needs: [essential, build_cpu]  # steps below read repo/image_cpu/Version from essential's outputs
+    steps:
+      - name: Login to Gitea Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ needs.essential.outputs.repo }}
+          username: username  # NOTE(review): literal string, not a secret or variable - confirm this is intended
+          password: ${{ secrets.CI_READWRITE }}
+
+      - name: Create multi-arch manifest (CPU)  # joins the existing per-arch amd64/arm64 tags under one tag
+        run: |
+          docker buildx imagetools create -t ${{ needs.essential.outputs.image_cpu }}:latest \
+            ${{ needs.essential.outputs.image_cpu }}:amd64 \
+            ${{ needs.essential.outputs.image_cpu }}:arm64
+          docker buildx imagetools create -t ${{ needs.essential.outputs.image_cpu }}:${{ needs.essential.outputs.Version }} \
+            ${{ needs.essential.outputs.image_cpu }}:${{ needs.essential.outputs.Version }}-amd64 \
+            ${{ needs.essential.outputs.image_cpu }}:${{ needs.essential.outputs.Version }}-arm64
+
+  # Publish multi-arch manifest tags (:latest and :<Version>) for the GPU image
+  manifest_gpu:
+    runs-on: ubuntu-latest
+    needs: [essential, build_gpu]  # steps below read repo/image_gpu/Version from essential's outputs
+    steps:
+      - name: Login to Gitea Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ needs.essential.outputs.repo }}
+          username: username  # NOTE(review): literal string, not a secret or variable - confirm this is intended
+          password: ${{ secrets.CI_READWRITE }}
+
+      - name: Create multi-arch manifest (GPU)  # joins the existing per-arch amd64/arm64 tags under one tag
+        run: |
+          docker buildx imagetools create -t ${{ needs.essential.outputs.image_gpu }}:latest \
+            ${{ needs.essential.outputs.image_gpu }}:amd64 \
+            ${{ needs.essential.outputs.image_gpu }}:arm64
+          docker buildx imagetools create -t ${{ needs.essential.outputs.image_gpu }}:${{ needs.essential.outputs.Version }} \
+            ${{ needs.essential.outputs.image_gpu }}:${{ needs.essential.outputs.Version }}-amd64 \
+            ${{ needs.essential.outputs.image_gpu }}:${{ needs.essential.outputs.Version }}-arm64
diff --git a/src/paddle_ocr/Dockerfile.build-paddle-cpu b/src/paddle_ocr/Dockerfile.build-paddle-cpu
index 688c465..d95b89a 100644
--- a/src/paddle_ocr/Dockerfile.build-paddle-cpu
+++ b/src/paddle_ocr/Dockerfile.build-paddle-cpu
@@ -99,17 +99,20 @@ RUN pip install -r python/requirements.txt || true
 RUN mkdir -p build
 WORKDIR /build/Paddle/build
 
-# Configure for CPU-only build
+# Configure for CPU-only ARM64 build
+# WITH_ARM=ON enables ARM NEON optimizations and disables x86-specific code (XBYAK, MKL)
 RUN cmake .. \
     -GNinja \
     -DCMAKE_BUILD_TYPE=Release \
     -DPY_VERSION=${PYTHON_VERSION} \
     -DWITH_GPU=OFF \
+    -DWITH_ARM=ON \
     -DWITH_TESTING=OFF \
     -DWITH_DISTRIBUTE=OFF \
     -DWITH_NCCL=OFF \
     -DWITH_MKL=OFF \
     -DWITH_MKLDNN=OFF \
+    -DWITH_XBYAK=OFF \
     -DON_INFER=OFF \
     -DWITH_PYTHON=ON \
     -DWITH_AVX=OFF \
@@ -121,8 +124,9 @@ RUN cmake .. \
 RUN --mount=type=cache,target=/ccache \
     ninja extern_gflags extern_glog extern_protobuf extern_zlib extern_eigen3
 
+# Note: extern_xbyak excluded - it's x86-only and disabled with WITH_ARM=ON
 RUN --mount=type=cache,target=/ccache \
-    ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo extern_xbyak
+    ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo
 
 # Build PaddlePaddle
 RUN --mount=type=cache,target=/ccache \
diff --git a/src/paddle_ocr/README.md b/src/paddle_ocr/README.md
index 492d23f..99c3ebf 100644
--- a/src/paddle_ocr/README.md
+++ b/src/paddle_ocr/README.md
@@ -126,7 +126,7 @@ docker buildx build -f Dockerfile.cpu \
   --push .
 ```
 
-### GPU Image (x86_64 only)
+### GPU Image (x86_64 + ARM64 with local wheel)
 
 ```bash
 docker build -f Dockerfile.gpu -t paddle-ocr-api:gpu .
@@ -174,7 +174,7 @@ This section documents GPU support findings based on testing on an NVIDIA DGX Sp
 
 ### PaddlePaddle GPU Platform Support
 
-**Critical Finding:** PaddlePaddle-GPU does **NOT** support ARM64/aarch64 architecture.
+**Note:** PaddlePaddle-GPU does NOT have prebuilt ARM64 wheels on PyPI, but ARM64 support is available via custom-built wheels.
 
 | Platform | CPU | GPU |
 |----------|-----|-----|
@@ -182,21 +182,22 @@ This section documents GPU support findings based on testing on an NVIDIA DGX Sp
 | Windows x64 | ✅ | ✅ CUDA 10.2/11.x/12.x |
 | macOS x64 | ✅ | ❌ |
 | macOS ARM64 (M1/M2) | ✅ | ❌ |
-| Linux ARM64 (Jetson/DGX) | ✅ | ❌ No wheels |
+| Linux ARM64 (Jetson/DGX) | ✅ | ✅ Custom wheel required |
 
-**Source:** [PaddlePaddle-GPU PyPI](https://pypi.org/project/paddlepaddle-gpu/) - only `manylinux_x86_64` and `win_amd64` wheels available.
+**Source:** [PaddlePaddle-GPU PyPI](https://pypi.org/project/paddlepaddle-gpu/) - only `manylinux_x86_64` and `win_amd64` wheels are available on PyPI. ARM64 wheels must be built from source or downloaded from Gitea packages.
 
-### Why GPU Doesn't Work on ARM64
+### ARM64 GPU Support
 
-1. **No prebuilt wheels**: `pip install paddlepaddle-gpu` fails on ARM64 - no compatible wheels exist
-2. **Not a CUDA issue**: The NVIDIA CUDA base images work fine on ARM64 (`nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04`)
-3. **Not a container toolkit issue**: `nvidia-container-toolkit` is installed and functional
-4. **PaddlePaddle limitation**: The Paddle team hasn't compiled GPU wheels for ARM64
+ARM64 GPU support is available but requires custom-built wheels:
 
-When you run `pip install paddlepaddle-gpu` on ARM64:
-```
-ERROR: No matching distribution found for paddlepaddle-gpu
-```
+1. **No prebuilt PyPI wheels**: `pip install paddlepaddle-gpu` fails on ARM64 - no compatible wheels exist on PyPI
+2. **Custom wheels work**: This project provides Dockerfiles to build ARM64 GPU wheels from source
+3. **CI/CD builds ARM64 GPU images**: Pre-built images are published to the container registry, and pre-built wheels are available from Gitea packages
+
+**To use GPU on ARM64:**
+- Use the pre-built images from the container registry, or
+- Build the wheel locally using `Dockerfile.build-paddle` (see Option 2 below), or
+- Download the wheel from Gitea packages: `wheels/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl`
 
 ### Options for ARM64 Systems
 
diff --git a/src/paddle_ocr/docker-compose.cpu-registry.yml b/src/paddle_ocr/docker-compose.cpu-registry.yml
index a9d67b0..1d9246f 100644
--- a/src/paddle_ocr/docker-compose.cpu-registry.yml
+++ b/src/paddle_ocr/docker-compose.cpu-registry.yml
@@ -3,7 +3,7 @@
 
 services:
   ocr-cpu:
-    image: seryus.ddns.net/unir/paddle-ocr-cpu:arm64
+    image: seryus.ddns.net/unir/paddle-ocr-cpu:latest
     container_name: paddle-ocr-cpu-registry
     ports:
       - "8001:8000"
diff --git a/src/paddle_ocr/docker-compose.gpu-registry.yml b/src/paddle_ocr/docker-compose.gpu-registry.yml
index c1629d9..ed37626 100644
--- a/src/paddle_ocr/docker-compose.gpu-registry.yml
+++ b/src/paddle_ocr/docker-compose.gpu-registry.yml
@@ -5,7 +5,7 @@
 
 services:
   ocr-gpu:
-    image: seryus.ddns.net/unir/paddle-ocr-gpu:arm64
+    image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
     container_name: paddle-ocr-gpu-registry
     ports:
       - "8002:8000"