PaddleOCR GPU support #4

Merged
Seryusjj merged 40 commits from gpu_support into main 2026-01-19 17:35:25 +00:00
Showing only changes of commit 5459c9d660

@@ -3,15 +3,18 @@
# This Dockerfile compiles PaddlePaddle from source with CUDA support for ARM64.
# The resulting wheel can be used in Dockerfile.gpu for ARM64 GPU acceleration.
#
# Build time: 2-4 hours depending on hardware
# Build time: ~1-2 hours with caching, 2-4 hours first build
# Output: /output/paddlepaddle_gpu-*.whl
#
# Usage:
# docker compose run build-paddle
# # or
# docker build -f Dockerfile.build-paddle -t paddle-builder .
# docker run -v ./wheels:/output paddle-builder
# CUDA_ARCH=90 docker compose --profile build run --rm build-paddle
#
# Features:
# - ccache for compiler caching (survives rebuilds)
# - Split build stages for better layer caching
# - ARM64 -m64 patch applied automatically
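For orientation, a minimal end-to-end invocation matching the Usage notes above; the build-paddle service name, the build profile, and the ./wheels host directory are taken from those comments and may differ in your compose file:

    CUDA_ARCH=90 docker compose --profile build run --rm build-paddle
    ls ./wheels/paddlepaddle_gpu-*.whl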
# syntax=docker/dockerfile:1.4
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
LABEL maintainer="Sergio Jimenez"
@@ -20,19 +23,20 @@ LABEL description="PaddlePaddle GPU wheel builder for ARM64"
# Build arguments
ARG PADDLE_VERSION=v3.0.0
ARG PYTHON_VERSION=3.11
ARG CUDA_ARCH=90
# Environment setup
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CCACHE_DIR=/ccache
ENV PATH="/usr/lib/ccache:${PATH}"
# Install build dependencies
# Install build dependencies + ccache
RUN apt-get update && apt-get install -y --no-install-recommends \
# Python
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
python3-pip \
# Build tools
build-essential \
cmake \
ninja-build \
@@ -40,7 +44,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
curl \
pkg-config \
# Libraries
ccache \
libssl-dev \
libffi-dev \
zlib1g-dev \
@@ -55,7 +59,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libprotobuf-dev \
protobuf-compiler \
patchelf \
# Additional dependencies for Paddle
libopenblas-dev \
liblapack-dev \
swig \
@@ -63,27 +66,31 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python \
&& ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
# Setup ccache symlinks for CUDA
RUN mkdir -p /usr/lib/ccache && \
ln -sf /usr/bin/ccache /usr/lib/ccache/nvcc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/gcc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/g++ && \
ln -sf /usr/bin/ccache /usr/lib/ccache/cc && \
ln -sf /usr/bin/ccache /usr/lib/ccache/c++
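If it is unclear whether the ccache symlinks are actually intercepting gcc/g++/nvcc, this is a sketch of how one might check from the builder image, assuming the paddle-builder tag from the Usage notes; note that with the BuildKit cache mount used further down, a plain docker run will see an empty /ccache:

    # PATH puts /usr/lib/ccache first, so gcc should resolve to the ccache shim
    docker run --rm paddle-builder sh -c 'which gcc && readlink -f $(which gcc)'
    docker run --rm paddle-builder ccache --show-stats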
# Upgrade pip and install Python build dependencies
RUN python -m pip install --upgrade pip setuptools wheel \
&& python -m pip install \
numpy \
protobuf \
pyyaml \
requests \
packaging \
astor \
decorator \
paddle-bfloat \
opt-einsum
RUN python -m pip install --upgrade pip setuptools wheel && \
python -m pip install numpy protobuf pyyaml requests packaging astor decorator paddle-bfloat opt-einsum
WORKDIR /build
# Clone PaddlePaddle repository
RUN git clone --depth 1 --branch ${PADDLE_VERSION} \
https://github.com/PaddlePaddle/Paddle.git
RUN git clone --depth 1 --branch ${PADDLE_VERSION} https://github.com/PaddlePaddle/Paddle.git
WORKDIR /build/Paddle
# Patch for ARM64: Remove -m64 flag (x86_64 specific, causes build failure on aarch64)
RUN sed -i 's/-m64//g' cmake/flags.cmake && \
sed -i 's/-m64//g' CMakeLists.txt 2>/dev/null || true && \
find . -name "*.cmake" -exec sed -i 's/-m64//g' {} \; 2>/dev/null || true && \
echo "Patched -m64 flag for ARM64 compatibility"
# Install additional Python requirements for building
RUN pip install -r python/requirements.txt || true
@@ -92,17 +99,8 @@ RUN mkdir -p build
WORKDIR /build/Paddle/build
# Configure CMake for ARM64 + CUDA build
#
# CUDA_ARCH is auto-detected from host GPU and passed via docker-compose.
# To detect: nvidia-smi --query-gpu=compute_cap --format=csv,noheader
# Example: 9.0 (Hopper) -> use "90"; 12.1 -> use "90" (closest arch supported by this CUDA toolkit)
#
# Build time: ~30-60 min with single arch vs 2-4 hours with all archs
ARG CUDA_ARCH=90
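To make the compute-capability mapping above concrete, a host-side sketch (bash; assumes a recent driver where nvidia-smi exposes compute_cap, and a single GPU):

    cc=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n1)  # e.g. "9.0"
    CUDA_ARCH=${cc/./}                                                         # "9.0" -> "90"
    CUDA_ARCH=$CUDA_ARCH docker compose --profile build run --rm build-paddle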
RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}"
RUN cmake .. \
RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}" && \
cmake .. \
-GNinja \
-DCMAKE_BUILD_TYPE=Release \
-DPY_VERSION=${PYTHON_VERSION} \
@@ -118,33 +116,44 @@ RUN cmake .. \
-DCUDA_ARCH_NAME=Manual \
-DCUDA_ARCH_BIN="${CUDA_ARCH}" \
-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}" \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
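After the configure step, one way to verify the arch flags landed where expected is to read them back from the CMake cache (a sketch; variable names may vary across Paddle versions):

    grep -E 'CUDA_ARCH_BIN|CMAKE_CUDA_ARCHITECTURES' /build/Paddle/build/CMakeCache.txt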
# Build PaddlePaddle (this takes 2-4 hours)
RUN ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4
# Build external dependencies first (cacheable layer)
RUN --mount=type=cache,target=/ccache \
ninja extern_gflags extern_glog extern_protobuf extern_zlib extern_eigen3
# Build flashattn (heaviest dependency, separate layer for caching)
RUN --mount=type=cache,target=/ccache \
ninja extern_flashattn
# Build remaining external dependencies
RUN --mount=type=cache,target=/ccache \
ninja extern_openblas extern_pybind extern_utf8proc extern_xxhash extern_yaml extern_cryptopp extern_warpctc extern_warprnnt extern_gloo extern_xbyak
# Build main PaddlePaddle (with ccache, fallback to fewer jobs if OOM)
RUN --mount=type=cache,target=/ccache \
ninja -j$(nproc) || ninja -j$(($(nproc)/2)) || ninja -j4
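The extern_* names used above are ninja targets generated by Paddle's CMake; if one of them is renamed in a newer Paddle release, the available targets can be listed from the build directory (a sketch):

    cd /build/Paddle/build && ninja -t targets all | grep -o 'extern_[a-z0-9_]*' | sort -u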
# Build the Python wheel
WORKDIR /build/Paddle/build
RUN ninja paddle_python
RUN ninja paddle_python || true
# Create output directory and copy wheel
# Create output directory
RUN mkdir -p /output
# The wheel should be in python/dist/
WORKDIR /build/Paddle
# Build wheel package
RUN cd python && python setup.py bdist_wheel
WORKDIR /build/Paddle
RUN cd python && python setup.py bdist_wheel || pip wheel . -w dist/
# Copy wheel to output
RUN cp python/dist/*.whl /output/ 2>/dev/null || \
cp build/python/dist/*.whl /output/ 2>/dev/null || \
(echo "Wheel location may vary, checking build artifacts..." && \
find /build -name "paddlepaddle*.whl" -exec cp {} /output/ \;)
# List what was built
RUN ls -la /output/ && \
echo "=== Build complete ===" && \
echo "Wheel files:" && \
find /build -name "*.whl" -type f 2>/dev/null
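After the wheel lands in the mounted volume, a quick smoke test on the target GPU host might look like the following; paddle.utils.run_check() is PaddlePaddle's built-in self-check, and the wheel name pattern is assumed from the Output note at the top of this file:

    pip install ./wheels/paddlepaddle_gpu-*.whl
    python -c "import paddle; print(paddle.device.is_compiled_with_cuda())"
    python -c "import paddle; paddle.utils.run_check()"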
# Default command: copy wheel to mounted volume