ci update
Some checks failed
build_docker / build_cpu (linux/arm64) (pull_request) Has been cancelled
build_docker / build_gpu (linux/amd64) (pull_request) Has been cancelled
build_docker / build_gpu (linux/arm64) (pull_request) Has been cancelled
build_docker / build_cpu (linux/amd64) (pull_request) Has been cancelled
build_docker / essential (push) Successful in 0s
build_docker / build_gpu (linux/amd64) (push) Has been cancelled
build_docker / build_gpu (linux/arm64) (push) Has been cancelled
build_docker / build_cpu (linux/amd64) (push) Has been cancelled
build_docker / build_cpu (linux/arm64) (push) Has been cancelled
build_docker / essential (pull_request) Successful in 1s
.gitea/workflows/ci.yaml
@@ -9,6 +9,11 @@ on:
  push:
    branches:
      - main
      - gpu_support

env:
  PADDLE_VERSION: "3.0.0"
  WHEEL_BASE_URL: "https://seryus.ddns.net/api/packages/unir/generic"

jobs:
  essential:
@@ -25,7 +30,7 @@ jobs:
          echo "Version: 1.0.${{ gitea.run_number }}" >> $GITHUB_STEP_SUMMARY
          echo "Event: ${{ gitea.event_name }}" >> $GITHUB_STEP_SUMMARY

  # CPU image: Matrix build for amd64 and arm64 (each pushes as soon as done)
  # CPU image: Matrix build for amd64 and arm64
  build_cpu:
    runs-on: ubuntu-latest
    needs: essential
@@ -60,6 +65,14 @@ jobs:
            echo "suffix=arm64" >> $GITHUB_OUTPUT
          fi

      - name: Download ARM64 wheel from Gitea packages
        if: matrix.platform == 'linux/arm64'
        run: |
          mkdir -p src/paddle_ocr/wheels
          curl -L -o src/paddle_ocr/wheels/paddlepaddle-${{ env.PADDLE_VERSION }}-cp311-cp311-linux_aarch64.whl \
            "${{ env.WHEEL_BASE_URL }}/paddlepaddle-cpu-arm64/${{ env.PADDLE_VERSION }}/paddlepaddle-${{ env.PADDLE_VERSION }}-cp311-cp311-linux_aarch64.whl"
          ls -la src/paddle_ocr/wheels/

      - name: Build and push CPU image (${{ matrix.platform }})
        uses: docker/build-push-action@v5
        with:
@@ -71,29 +84,56 @@ jobs:
            ${{ needs.essential.outputs.image_cpu }}:${{ needs.essential.outputs.Version }}-${{ steps.arch.outputs.suffix }}
            ${{ needs.essential.outputs.image_cpu }}:${{ steps.arch.outputs.suffix }}

  # GPU image: x86_64 only (PaddlePaddle GPU doesn't support ARM64)
  # GPU image: Matrix build for amd64 and arm64
  build_gpu:
    runs-on: ubuntu-latest
    needs: essential
    strategy:
      matrix:
        platform:
          - linux/amd64
          - linux/arm64
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Gitea Registry
        run: |
          echo ${{ secrets.CI_READWRITE }} | docker login \
            -u username \
            --password-stdin ${{ needs.essential.outputs.repo }}
        uses: docker/login-action@v3
        with:
          registry: ${{ needs.essential.outputs.repo }}
          username: username
          password: ${{ secrets.CI_READWRITE }}

      - name: Build GPU image (x86_64)
      - name: Get arch suffix
        id: arch
        run: |
          docker build \
            -f src/paddle_ocr/Dockerfile.gpu \
            -t ${{ needs.essential.outputs.image_gpu }}:${{ needs.essential.outputs.Version }} \
            -t ${{ needs.essential.outputs.image_gpu }}:latest \
            src/paddle_ocr/
          if [ "${{ matrix.platform }}" = "linux/amd64" ]; then
            echo "suffix=amd64" >> $GITHUB_OUTPUT
          else
            echo "suffix=arm64" >> $GITHUB_OUTPUT
          fi

      - name: Push GPU image
      - name: Download ARM64 GPU wheel from Gitea packages
        if: matrix.platform == 'linux/arm64'
        run: |
          docker push ${{ needs.essential.outputs.image_gpu }}:${{ needs.essential.outputs.Version }}
          docker push ${{ needs.essential.outputs.image_gpu }}:latest
          mkdir -p src/paddle_ocr/wheels
          curl -L -o src/paddle_ocr/wheels/paddlepaddle_gpu-${{ env.PADDLE_VERSION }}-cp311-cp311-linux_aarch64.whl \
            "${{ env.WHEEL_BASE_URL }}/paddlepaddle-gpu-arm64/${{ env.PADDLE_VERSION }}/paddlepaddle_gpu-${{ env.PADDLE_VERSION }}-cp311-cp311-linux_aarch64.whl"
          ls -la src/paddle_ocr/wheels/

      - name: Build and push GPU image (${{ matrix.platform }})
        uses: docker/build-push-action@v5
        with:
          context: src/paddle_ocr
          file: src/paddle_ocr/Dockerfile.gpu
          platforms: ${{ matrix.platform }}
          push: true
          tags: |
            ${{ needs.essential.outputs.image_gpu }}:${{ needs.essential.outputs.Version }}-${{ steps.arch.outputs.suffix }}
            ${{ needs.essential.outputs.image_gpu }}:${{ steps.arch.outputs.suffix }}

src/paddle_ocr/Dockerfile.build-paddle
@@ -91,6 +91,43 @@ RUN sed -i 's/-m64//g' cmake/flags.cmake && \
    find . -name "*.cmake" -exec sed -i 's/-m64//g' {} \; 2>/dev/null || true && \
    echo "Patched -m64 flag for ARM64 compatibility"

# Patch for ARM64: Install sse2neon to translate x86 SSE intrinsics to ARM NEON
# sse2neon provides drop-in replacements for x86 SIMD headers
RUN git clone --depth 1 https://github.com/DLTcollab/sse2neon.git /tmp/sse2neon && \
    mkdir -p /usr/local/include/sse2neon && \
    cp /tmp/sse2neon/sse2neon.h /usr/local/include/sse2neon/ && \
    rm -rf /tmp/sse2neon && \
    echo "Installed sse2neon for x86->ARM NEON translation"

# Create wrapper headers that use sse2neon for ARM64
RUN mkdir -p /usr/local/include/x86_stubs && \
    echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/immintrin.h && \
    echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/immintrin.h && \
    echo "#else" >> /usr/local/include/x86_stubs/immintrin.h && \
    echo "#include_next <immintrin.h>" >> /usr/local/include/x86_stubs/immintrin.h && \
    echo "#endif" >> /usr/local/include/x86_stubs/immintrin.h && \
    echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/xmmintrin.h && \
    echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/xmmintrin.h && \
    echo "#else" >> /usr/local/include/x86_stubs/xmmintrin.h && \
    echo "#include_next <xmmintrin.h>" >> /usr/local/include/x86_stubs/xmmintrin.h && \
    echo "#endif" >> /usr/local/include/x86_stubs/xmmintrin.h && \
    echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/emmintrin.h && \
    echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/emmintrin.h && \
    echo "#else" >> /usr/local/include/x86_stubs/emmintrin.h && \
    echo "#include_next <emmintrin.h>" >> /usr/local/include/x86_stubs/emmintrin.h && \
    echo "#endif" >> /usr/local/include/x86_stubs/emmintrin.h && \
    echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/pmmintrin.h && \
    echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/pmmintrin.h && \
    echo "#else" >> /usr/local/include/x86_stubs/pmmintrin.h && \
    echo "#include_next <pmmintrin.h>" >> /usr/local/include/x86_stubs/pmmintrin.h && \
    echo "#endif" >> /usr/local/include/x86_stubs/pmmintrin.h && \
    echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/smmintrin.h && \
    echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/smmintrin.h && \
    echo "#else" >> /usr/local/include/x86_stubs/smmintrin.h && \
    echo "#include_next <smmintrin.h>" >> /usr/local/include/x86_stubs/smmintrin.h && \
    echo "#endif" >> /usr/local/include/x86_stubs/smmintrin.h && \
    echo "Created x86 intrinsic wrapper headers for ARM64 using sse2neon"

# Install additional Python requirements for building
RUN pip install -r python/requirements.txt || true

@@ -99,6 +136,7 @@ RUN mkdir -p build
WORKDIR /build/Paddle/build

# Configure CMake for ARM64 + CUDA build
# Note: -Wno-class-memaccess fixes Eigen NEON warning on ARM64
RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}" && \
    cmake .. \
    -GNinja \
@@ -118,6 +156,7 @@ RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}" && \
    -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}" \
    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
    -DCMAKE_CXX_FLAGS="-Wno-class-memaccess -Wno-error=class-memaccess -I/usr/local/include/x86_stubs" \
    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON

# Build external dependencies first (cacheable layer)
@@ -142,14 +181,28 @@ RUN ninja paddle_python || true
# Create output directory
RUN mkdir -p /output

# Build wheel package
# Build wheel package - try multiple methods since PaddlePaddle build structure varies
WORKDIR /build/Paddle
RUN cd python && python setup.py bdist_wheel || pip wheel . -w dist/
RUN echo "=== Looking for wheel build method ===" && \
    ls -la python/ 2>/dev/null && \
    ls -la build/python/ 2>/dev/null && \
    if [ -f build/python/setup.py ]; then \
        echo "Using build/python/setup.py" && \
        cd build/python && python setup.py bdist_wheel; \
    elif [ -f python/setup.py ]; then \
        echo "Using python/setup.py" && \
        cd python && python setup.py bdist_wheel; \
    else \
        echo "Looking for existing wheel..." && \
        find /build -name "paddlepaddle*.whl" -type f 2>/dev/null; \
    fi

# Copy wheel to output
RUN cp python/dist/*.whl /output/ 2>/dev/null || \
    cp build/python/dist/*.whl /output/ 2>/dev/null || \
    find /build -name "paddlepaddle*.whl" -exec cp {} /output/ \;
RUN find /build -name "paddlepaddle*.whl" -type f -exec cp {} /output/ \; && \
    ls -la /output/ && \
    if [ ! "$(ls -A /output/*.whl 2>/dev/null)" ]; then \
        echo "ERROR: No wheel found!" && exit 1; \
    fi

# List what was built
RUN ls -la /output/ && \

145 src/paddle_ocr/Dockerfile.build-paddle-cpu Normal file
@@ -0,0 +1,145 @@
# Dockerfile.build-paddle-cpu - Build PaddlePaddle CPU wheel for ARM64
#
# Required because PyPI wheels don't work on ARM64 (x86 SSE instructions).
#
# Build time: ~1-2 hours
# Output: /output/paddlepaddle-*.whl
#
# Usage:
#   docker build -t paddle-builder:cpu-arm64 -f Dockerfile.build-paddle-cpu .
#   docker run --rm -v ./wheels:/wheels paddle-builder:cpu-arm64

# syntax=docker/dockerfile:1.4
FROM ubuntu:22.04

LABEL maintainer="Sergio Jimenez"
LABEL description="PaddlePaddle CPU wheel builder for ARM64"

ARG PADDLE_VERSION=v3.0.0
ARG PYTHON_VERSION=3.11

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CCACHE_DIR=/ccache
ENV PATH="/usr/lib/ccache:${PATH}"

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    python${PYTHON_VERSION} \
    python${PYTHON_VERSION}-dev \
    python${PYTHON_VERSION}-venv \
    python3-pip \
    build-essential \
    cmake \
    ninja-build \
    git \
    wget \
    curl \
    pkg-config \
    ccache \
    libssl-dev \
    libffi-dev \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    liblzma-dev \
    libncurses5-dev \
    libncursesw5-dev \
    libgflags-dev \
    libgoogle-glog-dev \
    libprotobuf-dev \
    protobuf-compiler \
    patchelf \
    libopenblas-dev \
    liblapack-dev \
    swig \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
    && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3

# Setup ccache
RUN mkdir -p /usr/lib/ccache && \
    ln -sf /usr/bin/ccache /usr/lib/ccache/gcc && \
    ln -sf /usr/bin/ccache /usr/lib/ccache/g++ && \
    ln -sf /usr/bin/ccache /usr/lib/ccache/cc && \
    ln -sf /usr/bin/ccache /usr/lib/ccache/c++

RUN python -m pip install --upgrade pip setuptools wheel && \
    python -m pip install numpy protobuf pyyaml requests packaging astor decorator paddle-bfloat opt-einsum

WORKDIR /build
RUN git clone --depth 1 --branch ${PADDLE_VERSION} https://github.com/PaddlePaddle/Paddle.git

WORKDIR /build/Paddle

# Patch -m64 flag (x86_64 specific)
RUN sed -i 's/-m64//g' cmake/flags.cmake && \
    sed -i 's/-m64//g' CMakeLists.txt 2>/dev/null || true && \
    find . -name "*.cmake" -exec sed -i 's/-m64//g' {} \; 2>/dev/null || true

# Install sse2neon for x86 SSE -> ARM NEON translation
RUN git clone --depth 1 https://github.com/DLTcollab/sse2neon.git /tmp/sse2neon && \
    mkdir -p /usr/local/include/sse2neon && \
    cp /tmp/sse2neon/sse2neon.h /usr/local/include/sse2neon/ && \
    rm -rf /tmp/sse2neon

# Create x86 intrinsic wrapper headers
RUN mkdir -p /usr/local/include/x86_stubs && \
    for h in immintrin xmmintrin emmintrin pmmintrin smmintrin; do \
        echo "#ifndef __x86_64__" > /usr/local/include/x86_stubs/${h}.h && \
        echo "#include <sse2neon/sse2neon.h>" >> /usr/local/include/x86_stubs/${h}.h && \
        echo "#else" >> /usr/local/include/x86_stubs/${h}.h && \
        echo "#include_next <${h}.h>" >> /usr/local/include/x86_stubs/${h}.h && \
        echo "#endif" >> /usr/local/include/x86_stubs/${h}.h; \
    done

RUN pip install -r python/requirements.txt || true

RUN mkdir -p build
WORKDIR /build/Paddle/build

# Configure for CPU-only build
RUN cmake .. \
    -GNinja \
    -DCMAKE_BUILD_TYPE=Release \
    -DPY_VERSION=${PYTHON_VERSION} \
    -DWITH_GPU=OFF \
    -DWITH_TESTING=OFF \
    -DWITH_DISTRIBUTE=OFF \
    -DWITH_NCCL=OFF \
    -DWITH_MKL=OFF \
    -DWITH_MKLDNN=OFF \
    -DON_INFER=OFF \
    -DWITH_PYTHON=ON \
    -DWITH_AVX=OFF \
    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
    -DCMAKE_CXX_FLAGS="-Wno-class-memaccess -Wno-error=class-memaccess -I/usr/local/include/x86_stubs"

# Build external dependencies
RUN --mount=type=cache,target=/ccache \
    ninja extern_gflags extern_glog extern_protobuf extern_zlib extern_eigen3

RUN --mount=type=cache,target=/ccache \
    ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo extern_xbyak

# Build PaddlePaddle
RUN --mount=type=cache,target=/ccache \
    ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4

RUN ninja paddle_python || true

RUN mkdir -p /output

WORKDIR /build/Paddle
RUN if [ -f build/python/setup.py ]; then \
        cd build/python && python setup.py bdist_wheel; \
    elif [ -f python/setup.py ]; then \
        cd python && python setup.py bdist_wheel; \
    fi

RUN find /build -name "paddlepaddle*.whl" -type f -exec cp {} /output/ \; && \
    ls -la /output/

CMD ["sh", "-c", "cp /output/*.whl /wheels/ && ls -la /wheels/"]

src/paddle_ocr/Dockerfile.cpu
@@ -29,7 +29,20 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
# Copy local wheels directory (may contain ARM64 wheel from build-paddle-cpu)
COPY wheels/ /tmp/wheels/

# Install paddlepaddle: prefer local wheel (ARM64), fallback to PyPI (x86_64)
RUN if ls /tmp/wheels/paddlepaddle*.whl 1>/dev/null 2>&1; then \
        echo "=== Installing PaddlePaddle from local wheel (ARM64) ===" && \
        pip install --no-cache-dir /tmp/wheels/paddlepaddle*.whl; \
    else \
        echo "=== Installing PaddlePaddle from PyPI (x86_64) ===" && \
        pip install --no-cache-dir paddlepaddle==3.0.0; \
    fi && \
    rm -rf /tmp/wheels

# Install remaining Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

src/paddle_ocr/Dockerfile.gpu
@@ -38,6 +38,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.11 /usr/bin/python

# Fix cuDNN library path for ARM64 only (PaddlePaddle looks in /usr/local/cuda/lib64)
# x86_64 doesn't need this - PyPI wheel handles paths correctly
RUN if [ "$(uname -m)" = "aarch64" ]; then \
        mkdir -p /usr/local/cuda/lib64 && \
        ln -sf /usr/lib/aarch64-linux-gnu/libcudnn*.so* /usr/local/cuda/lib64/ && \
        ln -sf /usr/lib/aarch64-linux-gnu/libcudnn.so.9 /usr/local/cuda/lib64/libcudnn.so && \
        ldconfig; \
    fi

# Copy local wheels directory (may contain ARM64 wheel from build-paddle)
COPY wheels/ /tmp/wheels/

src/paddle_ocr/README.md
@@ -65,10 +65,13 @@ docker compose up ocr-cpu
| `paddle_ocr_tuning_rest.py` | FastAPI REST service |
| `dataset_manager.py` | Dataset loader |
| `test.py` | API test client |
| `Dockerfile.cpu` | CPU-only image (multi-arch) |
| `Dockerfile.cpu` | CPU-only image (x86_64 + ARM64 with local wheel) |
| `Dockerfile.gpu` | GPU/CUDA image (x86_64 + ARM64 with local wheel) |
| `Dockerfile.build-paddle` | PaddlePaddle GPU wheel builder for ARM64 |
| `Dockerfile.build-paddle-cpu` | PaddlePaddle CPU wheel builder for ARM64 |
| `docker-compose.yml` | Service orchestration |
| `docker-compose.cpu-registry.yml` | Pull CPU image from registry |
| `docker-compose.gpu-registry.yml` | Pull GPU image from registry |
| `wheels/` | Local PaddlePaddle wheels (created by build-paddle) |

## API Endpoints

@@ -461,3 +464,114 @@ pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/
pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
```
The Dockerfile.gpu handles this automatically.

## CI/CD Pipeline

The project includes a Gitea Actions workflow (`.gitea/workflows/ci.yaml`) for automated builds.

### What CI Builds

| Image | Architecture | Source |
|-------|--------------|--------|
| `paddle-ocr-cpu:amd64` | amd64 | PyPI paddlepaddle |
| `paddle-ocr-cpu:arm64` | arm64 | Pre-built wheel from Gitea packages |
| `paddle-ocr-gpu:amd64` | amd64 | PyPI paddlepaddle-gpu |
| `paddle-ocr-gpu:arm64` | arm64 | Pre-built wheel from Gitea packages |

### ARM64 Wheel Workflow

Since PyPI wheels don't work on ARM64 (x86 SSE instructions), wheels must be built from source using sse2neon:

1. Built manually on an ARM64 machine (one-time)
2. Uploaded to Gitea generic packages
3. Downloaded by CI when building ARM64 images
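
The builder Dockerfiles make this possible by wrapping the usual x86 intrinsic headers (`immintrin.h`, `emmintrin.h`, ...) so that, on ARM64, they pull in sse2neon instead. A minimal sanity check of that mechanism, assuming `gcc` on an aarch64 host where the builder's `/usr/local/include/x86_stubs` and sse2neon headers are already installed (the snippet and file names below are illustrative, not part of the repo):

```bash
# Compile a tiny program that uses an SSE2 intrinsic; on aarch64 the
# x86_stubs/emmintrin.h wrapper includes sse2neon instead of the real header.
cat > /tmp/sse_check.c <<'EOF'
#include <emmintrin.h>
int main(void) {
    __m128i v = _mm_setzero_si128();  /* SSE2 intrinsic, provided by sse2neon on ARM */
    (void)v;
    return 0;
}
EOF
gcc -I/usr/local/include/x86_stubs /tmp/sse_check.c -o /tmp/sse_check \
  && echo "x86 intrinsics resolved via sse2neon"
```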

#### Step 1: Build ARM64 Wheels (One-time, on ARM64 machine)

```bash
cd src/paddle_ocr

# Build GPU wheel (requires NVIDIA GPU, takes 1-2 hours)
sudo docker build -t paddle-builder:gpu-arm64 -f Dockerfile.build-paddle .
sudo docker run --rm -v ./wheels:/wheels paddle-builder:gpu-arm64

# Build CPU wheel (no GPU required, takes 1-2 hours)
sudo docker build -t paddle-builder:cpu-arm64 -f Dockerfile.build-paddle-cpu .
sudo docker run --rm -v ./wheels:/wheels paddle-builder:cpu-arm64

# Verify wheels were created
ls -la wheels/paddlepaddle*.whl
# paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl (GPU)
# paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl (CPU)
```

#### Step 2: Upload Wheels to Gitea Packages

```bash
export GITEA_TOKEN="your-token-here"

# Upload GPU wheel
curl -X PUT \
  -H "Authorization: token $GITEA_TOKEN" \
  --upload-file wheels/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl \
  "https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-gpu-arm64/3.0.0/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl"

# Upload CPU wheel
curl -X PUT \
  -H "Authorization: token $GITEA_TOKEN" \
  --upload-file wheels/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl \
  "https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-cpu-arm64/3.0.0/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl"
```

Wheels available at:
```
https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-gpu-arm64/3.0.0/paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl
https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-cpu-arm64/3.0.0/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl
```
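
A quick way to confirm an upload before relying on it in CI is a HEAD request against the download URL (a hedged sketch; if the package is private, Gitea generic packages need the same token for download):

```bash
# Expect a 2xx status line if CI will be able to download the wheel
curl -fsSI -H "Authorization: token $GITEA_TOKEN" \
  "https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-cpu-arm64/3.0.0/paddlepaddle-3.0.0-cp311-cp311-linux_aarch64.whl" \
  | head -n 1
```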

#### Step 3: CI Builds Images

CI automatically:
1. Downloads ARM64 wheels from Gitea packages (for arm64 builds only)
2. Builds both CPU and GPU images for amd64 and arm64
3. Pushes to registry with arch-specific tags
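
A hedged spot-check of the result, using the image names from the registry compose files (adjust the repository and tags to whatever the `essential` job actually outputs):

```bash
# Pull an arch-specific tag and confirm which platform it was built for
docker pull seryus.ddns.net/unir/paddle-ocr-cpu:arm64
docker image inspect seryus.ddns.net/unir/paddle-ocr-cpu:arm64 \
  --format '{{.Os}}/{{.Architecture}}'   # expected: linux/arm64
```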

### Required CI Secrets

Configure these in Gitea repository settings:

| Secret | Description |
|--------|-------------|
| `CI_READWRITE` | Gitea token with registry read/write access |

### Manual Image Push

```bash
# Login to registry
docker login seryus.ddns.net

# Build and push CPU (multi-arch)
docker buildx build -f Dockerfile.cpu \
  --platform linux/amd64,linux/arm64 \
  -t seryus.ddns.net/unir/paddle-ocr-api:cpu \
  --push .

# Build and push GPU (x86_64)
docker build -f Dockerfile.gpu -t seryus.ddns.net/unir/paddle-ocr-api:gpu-amd64 .
docker push seryus.ddns.net/unir/paddle-ocr-api:gpu-amd64

# Build and push GPU (ARM64) - requires wheel in wheels/
docker buildx build -f Dockerfile.gpu \
  --platform linux/arm64 \
  -t seryus.ddns.net/unir/paddle-ocr-api:gpu-arm64 \
  --push .
```
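
If a single tag that works on both architectures is wanted, the per-arch GPU tags pushed above can be combined into a manifest list. This is not part of the current pipeline, just a possible follow-up sketched with the same illustrative image names:

```bash
# Stitch the per-arch GPU tags into one multi-arch tag
docker buildx imagetools create \
  -t seryus.ddns.net/unir/paddle-ocr-api:gpu \
  seryus.ddns.net/unir/paddle-ocr-api:gpu-amd64 \
  seryus.ddns.net/unir/paddle-ocr-api:gpu-arm64

# Inspect the resulting manifest list
docker buildx imagetools inspect seryus.ddns.net/unir/paddle-ocr-api:gpu
```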

### Updating the ARM64 Wheels

When PaddlePaddle releases a new version:

1. Update `PADDLE_VERSION` in `Dockerfile.build-paddle` and `Dockerfile.build-paddle-cpu`
2. Rebuild both wheels on an ARM64 machine
3. Upload to Gitea packages with new version
4. Update `PADDLE_VERSION` in `.gitea/workflows/ci.yaml`
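
A condensed sketch of that bump, reusing the commands from Steps 1-2 above (the `3.1.0` version is only a hypothetical example):

```bash
# On an ARM64 machine, from src/paddle_ocr/
NEW_VERSION=3.1.0   # hypothetical future release

# Steps 1-2: rebuild the wheels after bumping PADDLE_VERSION in both builder Dockerfiles
sudo docker build -t paddle-builder:cpu-arm64 -f Dockerfile.build-paddle-cpu .
sudo docker run --rm -v ./wheels:/wheels paddle-builder:cpu-arm64

# Step 3: upload the new wheel under the new version path
curl -X PUT -H "Authorization: token $GITEA_TOKEN" \
  --upload-file wheels/paddlepaddle-${NEW_VERSION}-cp311-cp311-linux_aarch64.whl \
  "https://seryus.ddns.net/api/packages/unir/generic/paddlepaddle-cpu-arm64/${NEW_VERSION}/paddlepaddle-${NEW_VERSION}-cp311-cp311-linux_aarch64.whl"

# Step 4: bump PADDLE_VERSION in .gitea/workflows/ci.yaml so CI downloads the new wheel
```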

25 src/paddle_ocr/docker-compose.cpu-registry.yml Normal file
@@ -0,0 +1,25 @@
# docker-compose.cpu-registry.yml - Pull CPU image from registry
# Usage: docker compose -f docker-compose.cpu-registry.yml up

services:
  ocr-cpu:
    image: seryus.ddns.net/unir/paddle-ocr-cpu:arm64
    container_name: paddle-ocr-cpu-registry
    ports:
      - "8001:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - paddlex-cache:/root/.paddlex
    environment:
      - PYTHONUNBUFFERED=1
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

volumes:
  paddlex-cache:
    name: paddlex-model-cache
35 src/paddle_ocr/docker-compose.gpu-registry.yml Normal file
@@ -0,0 +1,35 @@
# docker-compose.gpu-registry.yml - Pull GPU image from registry
# Usage: docker compose -f docker-compose.gpu-registry.yml up
#
# Requires: NVIDIA GPU + nvidia-container-toolkit installed

services:
  ocr-gpu:
    image: seryus.ddns.net/unir/paddle-ocr-gpu:arm64
    container_name: paddle-ocr-gpu-registry
    ports:
      - "8002:8000"
    volumes:
      - ../dataset:/app/dataset:ro
      - paddlex-cache:/root/.paddlex
    environment:
      - PYTHONUNBUFFERED=1
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

volumes:
  paddlex-cache:
    name: paddlex-model-cache
69 src/paddle_ocr/scripts/upload-wheel.sh Executable file
@@ -0,0 +1,69 @@
#!/bin/bash
# Upload PaddlePaddle ARM64 wheel to Gitea generic packages
#
# Usage:
#   ./scripts/upload-wheel.sh [wheel_file] [token]
#
# Environment variables (alternative to arguments):
#   GITEA_TOKEN - Gitea API token
#   WHEEL_FILE  - Path to wheel file (default: auto-detect in wheels/)

set -e

GITEA_URL="https://seryus.ddns.net"
GITEA_ORG="unir"
PACKAGE_NAME="paddlepaddle-gpu-arm64"

# Get wheel file
WHEEL_FILE="${1:-${WHEEL_FILE:-$(ls wheels/paddlepaddle*.whl 2>/dev/null | head -1)}}"
if [ -z "$WHEEL_FILE" ] || [ ! -f "$WHEEL_FILE" ]; then
    echo "Error: No wheel file found"
    echo "Usage: $0 [wheel_file] [token]"
    echo "  or set WHEEL_FILE environment variable"
    exit 1
fi

# Get token
TOKEN="${2:-${GITEA_TOKEN}}"
if [ -z "$TOKEN" ]; then
    echo "Error: No token provided"
    echo "Usage: $0 [wheel_file] [token]"
    echo "  or set GITEA_TOKEN environment variable"
    exit 1
fi

# Extract version from wheel filename
# Format: paddlepaddle_gpu-3.0.0-cp311-cp311-linux_aarch64.whl
FILENAME=$(basename "$WHEEL_FILE")
VERSION=$(echo "$FILENAME" | sed -E 's/paddlepaddle[_-]gpu-([0-9.]+)-.*/\1/')

if [ -z "$VERSION" ]; then
    echo "Error: Could not extract version from filename: $FILENAME"
    exit 1
fi

echo "Uploading wheel to Gitea packages..."
echo "  File: $WHEEL_FILE"
echo "  Package: $PACKAGE_NAME"
echo "  Version: $VERSION"
echo "  URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME"

# Upload using PUT request
HTTP_CODE=$(curl -sS -w "%{http_code}" -o /tmp/upload_response.txt \
    -X PUT \
    -H "Authorization: token $TOKEN" \
    -H "Content-Type: application/octet-stream" \
    --data-binary "@$WHEEL_FILE" \
    "$GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME")

if [ "$HTTP_CODE" = "201" ] || [ "$HTTP_CODE" = "200" ]; then
    echo "Success! Wheel uploaded."
    echo "Download URL: $GITEA_URL/api/packages/$GITEA_ORG/generic/$PACKAGE_NAME/$VERSION/$FILENAME"
elif [ "$HTTP_CODE" = "409" ]; then
    echo "Package version already exists (HTTP 409)"
    echo "To update, delete the existing version first in Gitea UI"
else
    echo "Error: Upload failed with HTTP $HTTP_CODE"
    cat /tmp/upload_response.txt
    exit 1
fi