diff --git a/src/paddle_ocr/Dockerfile.build-paddle b/src/paddle_ocr/Dockerfile.build-paddle
index 3a48270..5f67f0c 100644
--- a/src/paddle_ocr/Dockerfile.build-paddle
+++ b/src/paddle_ocr/Dockerfile.build-paddle
@@ -3,15 +3,18 @@
 # This Dockerfile compiles PaddlePaddle from source with CUDA support for ARM64.
 # The resulting wheel can be used in Dockerfile.gpu for ARM64 GPU acceleration.
 #
-# Build time: 2-4 hours depending on hardware
+# Build time: ~1-2 hours with caching, 2-4 hours first build
 # Output: /output/paddlepaddle_gpu-*.whl
 #
 # Usage:
-#   docker compose run build-paddle
-#   # or
-#   docker build -f Dockerfile.build-paddle -t paddle-builder .
-#   docker run -v ./wheels:/output paddle-builder
+#   CUDA_ARCH=90 docker compose --profile build run --rm build-paddle
+#
+# Features:
+# - ccache compiler cache kept in a BuildKit cache mount (survives rebuilds)
+# - External dependencies and the main build run as separate RUN stages for better layer caching
+# - x86_64-only -m64 flag is stripped automatically for ARM64
 
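+# NOTE(review): a "# syntax=docker/dockerfile:1.4" directive is only honored on the first line of the file; here it is a plain comment. BuildKit is still required for RUN --mount.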
 FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
 
 LABEL maintainer="Sergio Jimenez"
@@ -20,19 +23,20 @@ LABEL description="PaddlePaddle GPU wheel builder for ARM64"
 # Build arguments
 ARG PADDLE_VERSION=v3.0.0
 ARG PYTHON_VERSION=3.11
+ARG CUDA_ARCH=90
 
 # Environment setup
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
+ENV CCACHE_DIR=/ccache
+ENV PATH="/usr/lib/ccache:${PATH}"
 
-# Install build dependencies
+# Install build dependencies + ccache
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    # Python
     python${PYTHON_VERSION} \
     python${PYTHON_VERSION}-dev \
     python${PYTHON_VERSION}-venv \
     python3-pip \
-    # Build tools
     build-essential \
     cmake \
     ninja-build \
@@ -40,7 +44,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     wget \
     curl \
     pkg-config \
-    # Libraries
+    ccache \
     libssl-dev \
     libffi-dev \
     zlib1g-dev \
@@ -55,7 +59,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libprotobuf-dev \
     protobuf-compiler \
     patchelf \
-    # Additional dependencies for Paddle
     libopenblas-dev \
     liblapack-dev \
     swig \
@@ -63,27 +66,31 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
     && ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
 
+# Masquerade symlinks: nvcc and the GNU compiler drivers resolve to ccache
+# when looked up through /usr/lib/ccache.
+# Links are created in order and the first failure aborts the step.
+RUN mkdir -p /usr/lib/ccache && \
+    for tool in nvcc gcc g++ cc c++; do \
+        ln -sf /usr/bin/ccache "/usr/lib/ccache/${tool}" || exit 1; \
+    done
+
 # Upgrade pip and install Python build dependencies
-RUN python -m pip install --upgrade pip setuptools wheel \
-    && python -m pip install \
-    numpy \
-    protobuf \
-    pyyaml \
-    requests \
-    packaging \
-    astor \
-    decorator \
-    paddle-bfloat \
-    opt-einsum
+RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    python -m pip install --no-cache-dir numpy protobuf pyyaml requests packaging astor decorator paddle-bfloat opt-einsum
 
 WORKDIR /build
 
 # Clone PaddlePaddle repository
-RUN git clone --depth 1 --branch ${PADDLE_VERSION} \
-    https://github.com/PaddlePaddle/Paddle.git
+RUN git clone --depth 1 --branch ${PADDLE_VERSION} https://github.com/PaddlePaddle/Paddle.git
 
 WORKDIR /build/Paddle
 
+# Patch for ARM64: strip the x86_64-only -m64 flag, which causes build failure on aarch64.
+# No "|| true" here: a failed patch must fail the build instead of being hidden ("-exec ... +" propagates sed errors).
+RUN find . -name "*.cmake" -type f -exec sed -i 's/-m64//g' {} + && \
+    sed -i 's/-m64//g' CMakeLists.txt && \
+    echo "Patched -m64 flag for ARM64 compatibility"
+
 # Install additional Python requirements for building
 RUN pip install -r python/requirements.txt || true
 
@@ -92,17 +99,8 @@ RUN mkdir -p build
 WORKDIR /build/Paddle/build
 
 # Configure CMake for ARM64 + CUDA build
-#
-# CUDA_ARCH is auto-detected from host GPU and passed via docker-compose.
-# To detect: nvidia-smi --query-gpu=compute_cap --format=csv,noheader
-# Example: 12.1 -> use "90" (Hopper, closest supported), 9.0 -> use "90"
-#
-# Build time: ~30-60 min with single arch vs 2-4 hours with all archs
-
-ARG CUDA_ARCH=90
-RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}"
-
-RUN cmake .. \
+RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}" && \
+    cmake .. \
     -GNinja \
     -DCMAKE_BUILD_TYPE=Release \
     -DPY_VERSION=${PYTHON_VERSION} \
@@ -118,33 +116,44 @@ RUN cmake .. \
     -DCUDA_ARCH_NAME=Manual \
     -DCUDA_ARCH_BIN="${CUDA_ARCH}" \
     -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}" \
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 
-# Build PaddlePaddle (this takes 2-4 hours)
-RUN ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4
+# Stage 1: core external dependencies, built in their own layer; /ccache is a BuildKit cache mount reused across builds
+RUN --mount=type=cache,target=/ccache \
+    ninja extern_gflags extern_glog extern_protobuf extern_zlib extern_eigen3
+
+# Stage 2: flashattn alone (noted as the heaviest dependency), so later failures do not force rebuilding its layer
+RUN --mount=type=cache,target=/ccache \
+    ninja extern_flashattn
+
+# Stage 3: remaining external dependencies. NOTE(review): ninja errors on unknown targets - confirm each extern_* exists for this configuration
+RUN --mount=type=cache,target=/ccache \
+    ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo extern_xbyak
+
+# Build main PaddlePaddle (ccache-backed). Retries with fewer jobs on failure (e.g. OOM); the half is rounded up so it is never -j0 (ninja: unlimited jobs)
+RUN --mount=type=cache,target=/ccache \
+    ninja -j"$(nproc)" || ninja -j"$(( ($(nproc) + 1) / 2 ))" || ninja -j4
 
 # Build the Python wheel
-WORKDIR /build/Paddle/build
-RUN ninja paddle_python
+RUN ninja paddle_python || echo "WARNING: 'ninja paddle_python' failed; continuing to the wheel packaging steps" >&2
 
-# Create output directory and copy wheel
+# Create output directory
 RUN mkdir -p /output
 
-# The wheel should be in python/dist/
-WORKDIR /build/Paddle
-
 # Build wheel package
-RUN cd python && python setup.py bdist_wheel
+WORKDIR /build/Paddle
+RUN cd python && { python setup.py bdist_wheel || python -m pip wheel . -w dist/; }
 
 # Copy wheel to output
 RUN cp python/dist/*.whl /output/ 2>/dev/null || \
     cp build/python/dist/*.whl /output/ 2>/dev/null || \
-    echo "Wheel location may vary, checking build artifacts..."
+    { find /build -name "paddlepaddle*.whl" -exec cp {} /output/ \; && ls /output/*.whl >/dev/null; }
 
 # List what was built
 RUN ls -la /output/ && \
     echo "=== Build complete ===" && \
-    echo "Wheel files:" && \
     find /build -name "*.whl" -type f 2>/dev/null
 
 # Default command: copy wheel to mounted volume