# syntax=docker/dockerfile:1.4

# Dockerfile.build-paddle - Build PaddlePaddle GPU wheel for ARM64
#
# This Dockerfile compiles PaddlePaddle from source with CUDA support for ARM64.
# The resulting wheel can be used in Dockerfile.gpu for ARM64 GPU acceleration.
#
# Build time: ~1-2 hours with caching, 2-4 hours first build
# Output: /output/paddlepaddle_gpu-*.whl
#
# Usage:
#   CUDA_ARCH=90 docker compose --profile build run --rm build-paddle
#
# Features:
# - ccache for compiler caching (survives rebuilds)
# - Split build stages for better layer caching
# - ARM64 -m64 patch applied automatically
#
# NOTE: the `# syntax=` parser directive must stay on the very first line.
# Once any comment, blank line or instruction has been read, the builder
# treats a later `# syntax=` line as a plain comment and ignores it.

# CUDA + cuDNN development image: provides nvcc and the CUDA headers/libraries
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

LABEL maintainer="Sergio Jimenez" \
      description="PaddlePaddle GPU wheel builder for ARM64"

# Build arguments (override with --build-arg)
ARG PADDLE_VERSION=v3.0.0
ARG PYTHON_VERSION=3.11
ARG CUDA_ARCH=90

# Build-time only: keeps apt/debconf from prompting during RUN steps.
# Declared as ARG (not ENV) so it is visible to RUN commands in this stage
# but is not baked into the environment of containers started from the image.
ARG DEBIAN_FRONTEND=noninteractive

# Environment setup
# - PYTHONUNBUFFERED: unbuffered Python stdout/stderr
# - CCACHE_DIR: where ccache keeps its cache (the cache-mount target used by
#   the ninja build steps)
# - PATH: /usr/lib/ccache is searched first
ENV PYTHONUNBUFFERED=1 \
    CCACHE_DIR=/ccache \
    PATH="/usr/lib/ccache:${PATH}"

# System packages needed to compile PaddlePaddle (toolchain, Python headers,
# development libraries) plus ccache. Package list is kept alphabetical.
# The apt lists are removed in the same layer, and `python` / `python3` are
# re-pointed at the interpreter selected by PYTHON_VERSION.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        build-essential \
        ccache \
        cmake \
        curl \
        git \
        libbz2-dev \
        libffi-dev \
        libgflags-dev \
        libgoogle-glog-dev \
        liblapack-dev \
        liblzma-dev \
        libncurses5-dev \
        libncursesw5-dev \
        libopenblas-dev \
        libprotobuf-dev \
        libreadline-dev \
        libsqlite3-dev \
        libssl-dev \
        ninja-build \
        patchelf \
        pkg-config \
        protobuf-compiler \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-dev \
        python${PYTHON_VERSION}-venv \
        python3-pip \
        swig \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/* \
    && for alias in python python3; do \
           ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/${alias} || exit 1; \
       done

# ccache masquerade links: every compiler driver name below (including nvcc)
# becomes a symlink to ccache inside /usr/lib/ccache
RUN mkdir -p /usr/lib/ccache \
    && for tool in nvcc gcc g++ cc c++; do \
           ln -sf /usr/bin/ccache "/usr/lib/ccache/${tool}" || exit 1; \
       done

# Refresh the packaging toolchain first, then install the Python modules
# required by the build
RUN python -m pip install --upgrade \
        pip \
        setuptools \
        wheel \
    && python -m pip install \
        numpy \
        protobuf \
        pyyaml \
        requests \
        packaging \
        astor \
        decorator \
        paddle-bfloat \
        opt-einsum

# Shallow-clone the requested PaddlePaddle ref (PADDLE_VERSION build arg)
# into /build/Paddle and continue from inside the checkout
WORKDIR /build
RUN git clone \
        --branch ${PADDLE_VERSION} \
        --depth 1 \
        https://github.com/PaddlePaddle/Paddle.git \
        /build/Paddle

WORKDIR /build/Paddle

# Patch for ARM64: Remove -m64 flag (x86_64 specific, causes build failure on aarch64)
#
# Each sed/find pass is best-effort on its own (a file may be absent in some
# Paddle versions), so the individual commands are allowed to fail.
# What must hold afterwards is the post-condition: no "-m64" may remain in
# CMakeLists.txt or in any *.cmake file. The grep check enforces that and
# stops the build here, rather than letting a silently failed patch surface
# as a compiler error much later. grep -l prints the offending file names.
RUN sed -i 's/-m64//g' cmake/flags.cmake 2>/dev/null || true; \
    sed -i 's/-m64//g' CMakeLists.txt 2>/dev/null || true; \
    find . -name "*.cmake" -exec sed -i 's/-m64//g' {} + 2>/dev/null || true; \
    if grep -rl --include='*.cmake' -e '-m64' . 2>/dev/null \
       || grep -l -e '-m64' CMakeLists.txt 2>/dev/null; then \
        echo "ERROR: -m64 is still present in the files listed above" >&2; \
        exit 1; \
    fi && \
    echo "Patched -m64 flag for ARM64 compatibility"

# Install additional Python requirements for building.
# Best-effort: a failure does not stop the build, but it is reported on stderr
# instead of being discarded. Uses `python -m pip` like the earlier pip step
# so the same interpreter is targeted.
RUN python -m pip install -r python/requirements.txt || \
    echo "WARNING: could not install everything from python/requirements.txt; continuing" >&2

# Switch to the out-of-source build directory.
# WORKDIR creates the directory when it does not exist, so no separate
# `RUN mkdir` layer is required.
WORKDIR /build/Paddle/build

# Generate the Ninja build tree for an ARM64 + CUDA Python wheel.
# Flag groups, in order: generic CMake settings, ccache launchers for C/C++,
# CUDA architecture selection (driven by the CUDA_ARCH build arg), Python
# target version, then the Paddle feature switches.
RUN echo "Building for CUDA architecture: sm_${CUDA_ARCH}" \
    && cmake .. -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}" \
        -DCUDA_ARCH_NAME=Manual \
        -DCUDA_ARCH_BIN="${CUDA_ARCH}" \
        -DPY_VERSION=${PYTHON_VERSION} \
        -DWITH_PYTHON=ON \
        -DWITH_GPU=ON \
        -DWITH_AVX=OFF \
        -DWITH_DISTRIBUTE=OFF \
        -DWITH_MKL=OFF \
        -DWITH_MKLDNN=OFF \
        -DWITH_NCCL=OFF \
        -DWITH_TESTING=OFF \
        -DON_INFER=OFF

# Third-party targets are built in three separate layers so each can be
# cached independently; every step mounts the ccache directory as a cache.

# Layer 1: first group of external dependencies
RUN --mount=type=cache,target=/ccache \
    ninja \
        extern_gflags \
        extern_glog \
        extern_protobuf \
        extern_zlib \
        extern_eigen3

# Layer 2: flashattn on its own (heaviest dependency)
RUN --mount=type=cache,target=/ccache \
    ninja \
        extern_flashattn

# Layer 3: the remaining external dependencies
RUN --mount=type=cache,target=/ccache \
    ninja \
        extern_openblas \
        extern_pybind \
        extern_utf8proc \
        extern_xxhash \
        extern_yaml \
        extern_cryptopp \
        extern_warpctc \
        extern_warprnnt \
        extern_gloo \
        extern_xbyak

# Build main PaddlePaddle (with ccache), retrying with fewer parallel jobs if
# an attempt fails (e.g. out of memory).
#
# Job counts are computed up front so the retry chain never increases:
#   full = nproc
#   half = max(1, full / 2)          -- never 0: ninja treats -j0 as "unlimited"
#   low  = min(4, max(1, half / 2))  -- never higher than the previous attempt
# ninja is incremental, so each retry resumes from the objects already built.
RUN --mount=type=cache,target=/ccache \
    full="$(nproc)"; \
    half="$(( full / 2 ))"; \
    [ "${half}" -ge 1 ] || half=1; \
    low="$(( half / 2 ))"; \
    [ "${low}" -ge 1 ] || low=1; \
    [ "${low}" -le 4 ] || low=4; \
    ninja -j"${full}" || ninja -j"${half}" || ninja -j"${low}"

# Build the Python wheel target.
# Best-effort: a failure does not stop the build, but it is reported on stderr
# instead of being discarded. The ccache cache mount is attached here as in
# the other ninja steps, so anything compiled by this target uses the shared
# cache rather than an empty /ccache directory in the image layer.
RUN --mount=type=cache,target=/ccache \
    ninja paddle_python || \
    echo "WARNING: 'ninja paddle_python' failed; continuing (best-effort step)" >&2

# Directory that will hold the finished wheel(s)
RUN mkdir --parents /output

# Build wheel package
#
# If a wheel is already present in build/python/dist (presumably left there by
# the ninja build above -- TODO confirm for the selected PADDLE_VERSION), the
# manual packaging is skipped, so a failure of that redundant step cannot
# abort the image build.
# Otherwise fall back to packaging from python/: setup.py first, then
# `pip wheel`. The subshell and braces make the original implicit &&/||
# grouping explicit and keep the `cd` from leaking into later commands.
WORKDIR /build/Paddle
RUN if ls build/python/dist/*.whl >/dev/null 2>&1; then \
        echo "Wheel already present in build/python/dist; skipping manual packaging"; \
    else \
        ( cd python && { python setup.py bdist_wheel || python -m pip wheel . -w dist/; } ); \
    fi

# Copy wheel to output
#
# Candidate locations are tried in order: python/dist, build/python/dist,
# then any paddlepaddle*.whl under /build. `find` exits 0 even when it matches
# nothing, so the copy chain alone cannot detect a missing wheel; the explicit
# check afterwards fails the build when /output ends up without a wheel.
RUN mkdir -p /output && \
    { cp python/dist/*.whl /output/ 2>/dev/null || \
      cp build/python/dist/*.whl /output/ 2>/dev/null || \
      find /build -name "paddlepaddle*.whl" -exec cp {} /output/ \; ; } && \
    if ! ls /output/*.whl >/dev/null 2>&1; then \
        echo "ERROR: no wheel was produced (nothing to copy into /output)" >&2; \
        exit 1; \
    fi

# Build summary: show the contents of /output, print a completion banner,
# then list every wheel file found anywhere under /build
RUN ls -al /output/ \
    && echo "=== Build complete ===" \
    && find /build -type f -name "*.whl" 2>/dev/null

# Default command: copy wheel to mounted volume (/wheels)
#
# Written as if/else so the fallback search under /build runs only when the
# copy from /output fails. With the previous `a && b || c && d` chain, shell
# precedence made the fallback `find ... -exec cp` run after a successful
# copy as well, copying every wheel under /build into /wheels.
# The final `ls` shows the result and sets the exit status.
CMD ["sh", "-c", "if cp /output/*.whl /wheels/ 2>/dev/null; then echo 'Wheel copied to /wheels/'; else echo 'No wheel found in /output, checking other locations...'; find /build -name '*.whl' -exec cp {} /wheels/ \\; ; fi; ls -la /wheels/"]