From 580d1b114b51167ca19fda3d3128b0e5d93080df Mon Sep 17 00:00:00 2001 From: Sergio Jimenez Jimenez Date: Sun, 18 Jan 2026 07:13:51 +0100 Subject: [PATCH] More docs on gpu for paddle --- src/paddle_ocr/README.md | 55 ++++- .../docker-compose.gpu-registry.yml | 1 + src/paddle_ocr/scripts/test_dynamic_mode.py | 207 ++++++++++++++++++ 3 files changed, 257 insertions(+), 6 deletions(-) create mode 100644 src/paddle_ocr/scripts/test_dynamic_mode.py diff --git a/src/paddle_ocr/README.md b/src/paddle_ocr/README.md index 1cfaa36..26fa2f9 100644 --- a/src/paddle_ocr/README.md +++ b/src/paddle_ocr/README.md @@ -214,12 +214,33 @@ When running PaddleOCR on Blackwell GPUs: #### Root Cause -PaddleOCR uses **pre-compiled inference models** (PP-OCRv4_mobile_det, PP-OCRv5_server_det, etc.) that contain embedded CUDA kernels. These kernels were compiled for older GPU architectures (sm_80 Ampere, sm_90 Hopper) and **do not support Blackwell (sm_121)**. +**Confirmed:** PaddlePaddle's entire CUDA backend does NOT support Blackwell (sm_121). This is NOT just an inference model problem - even basic operations fail. + +**Test Results (January 2026):** + +1. **PTX JIT Test** (`CUDA_FORCE_PTX_JIT=1`): + ``` + OSError: CUDA error(209), no kernel image is available for execution on the device. + [Hint: 'cudaErrorNoKernelImageForDevice'] + ``` + → Confirmed: No PTX code exists in PaddlePaddle binaries + +2. **Dynamic Graph Mode Test** (bypassing inference models): + ``` + Conv2D + BatchNorm output: + Output min: 0.0000 + Output max: 0.0000 + Output mean: 0.0000 + Dynamic graph mode: BROKEN (constant output) + ``` + → Confirmed: Even simple nn.Conv2D produces zeros on Blackwell + +**Conclusion:** The issue is PaddlePaddle's compiled CUDA kernels (cubins), not just the inference models. The entire framework was compiled without sm_121 support and without PTX for JIT compilation. **Why building PaddlePaddle from source doesn't fix it:** -1. 
✅ You can build `paddlepaddle-gpu` with `CUDA_ARCH=121` - this creates a Blackwell-compatible framework -2. ❌ But the **PaddleOCR inference models** (`.pdiparams`, `.pdmodel` files) contain pre-compiled CUDA ops +1. ⚠️ Building with `CUDA_ARCH=121` requires CUDA 13.0+ (PaddlePaddle only supports up to CUDA 12.6) +2. ❌ Even if you could build it, PaddleOCR models contain pre-compiled CUDA ops 3. ❌ These model files were exported/compiled targeting sm_80/sm_90 architectures 4. ❌ The model kernels execute on GPU but produce garbage output on sm_121 @@ -291,17 +312,39 @@ CUDA **can** run older code on newer GPUs via **PTX JIT compilation**: **The problem**: PaddleOCR inference models contain only pre-compiled **cubins** (SASS binary), not PTX. Without PTX, there's nothing to JIT-compile. -You can test if PTX exists: +We tested PTX JIT (January 2026): ```bash # Force PTX JIT compilation docker run --gpus all -e CUDA_FORCE_PTX_JIT=1 paddle-ocr-gpu \ python /app/scripts/debug_gpu_detection.py /app/dataset/0/img/page_0001.png + +# Result: +# OSError: CUDA error(209), no kernel image is available for execution on the device. ``` -- If output is still constant → No PTX in models (confirmed) -- If output varies → PTX worked +**Confirmed: No PTX exists** in PaddlePaddle binaries. The CUDA kernels are cubins-only (SASS binary), compiled for sm_80/sm_90 without PTX fallback. **Note on sm_121**: Per NVIDIA docs, "sm_121 is the same as sm_120 since the only difference is physically integrated CPU+GPU memory of Spark." The issue is general Blackwell (sm_12x) support, not Spark-specific. +#### FAQ: Does Dynamic Graph Mode Work on Blackwell? 
+ +**Q: Can I bypass inference models and use PaddlePaddle's dynamic graph mode?** + +**No.** We tested dynamic graph mode (January 2026): +```bash +# Test script runs: paddle.nn.Conv2D + paddle.nn.BatchNorm2D +python /app/scripts/test_dynamic_mode.py + +# Result: +# Input shape: [1, 3, 224, 224] +# Output shape: [1, 64, 112, 112] +# Output min: 0.0000 +# Output max: 0.0000 # <-- All zeros! +# Output mean: 0.0000 +# Dynamic graph mode: BROKEN (constant output) +``` + +**Conclusion:** The problem isn't limited to inference models. PaddlePaddle's core CUDA kernels (Conv2D, BatchNorm, etc.) produce garbage on sm_121. The entire framework lacks Blackwell support. + #### FAQ: Can I Run AMD64 Containers on ARM64 DGX Spark? **Q: Can I just run the working x86_64 GPU image via emulation?** diff --git a/src/paddle_ocr/docker-compose.gpu-registry.yml b/src/paddle_ocr/docker-compose.gpu-registry.yml index ed37626..6e606c2 100644 --- a/src/paddle_ocr/docker-compose.gpu-registry.yml +++ b/src/paddle_ocr/docker-compose.gpu-registry.yml @@ -12,6 +12,7 @@ services: volumes: - ../dataset:/app/dataset:ro - paddlex-cache:/root/.paddlex + - ./scripts:/app/scripts:ro environment: - PYTHONUNBUFFERED=1 - CUDA_VISIBLE_DEVICES=0 diff --git a/src/paddle_ocr/scripts/test_dynamic_mode.py b/src/paddle_ocr/scripts/test_dynamic_mode.py new file mode 100644 index 0000000..9759eb7 --- /dev/null +++ b/src/paddle_ocr/scripts/test_dynamic_mode.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +""" +Test PaddleOCR in dynamic graph mode (not inference mode). + +Dynamic mode runs ops eagerly but still uses Paddle's pre-built CUDA kernels (no runtime compilation), so it may fail on Blackwell too. +Inference mode uses pre-compiled kernels which fail on sm_121.
+
+Usage:
+    python test_dynamic_mode.py [image_path]
+"""
+
+import math
+import os
+import sys
+
+os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+# Set before importing paddle. NOTE(review): FLAGS_enable_pir_api appears to
+# toggle Paddle's PIR (new IR) API, not dynamic graph mode itself; dynamic
+# mode is selected explicitly via paddle.disable_static() further down.
+# Confirm whether this flag is still needed.
+os.environ['FLAGS_enable_pir_api'] = '0'
+
+import numpy as np
+import paddle
+from PIL import Image
+
+
+def check_gpu() -> None:
+    """Print the active Paddle device and, if CUDA is usable, GPU 0's details."""
+    print("=" * 60)
+    print("GPU STATUS")
+    print("=" * 60)
+    cuda_compiled = paddle.device.is_compiled_with_cuda()
+    print(f"Device: {paddle.device.get_device()}")
+    print(f"CUDA compiled: {cuda_compiled}")
+
+    if cuda_compiled and paddle.device.cuda.device_count() > 0:
+        props = paddle.device.cuda.get_device_properties(0)
+        print(f"GPU: {props.name} (sm_{props.major}{props.minor})")
+        print(f"Memory: {props.total_memory / (1024**3):.1f} GB")
+    print()
+
+
+def test_paddleocr_dynamic(image_path: str) -> bool:
+    """Run the PaddleOCR pipeline on one image and report what it recognized.
+
+    Args:
+        image_path: Path of the image to run OCR on.
+
+    Returns:
+        True if at least one text was recognized; False if nothing was
+        recognized or any step raised an exception.
+    """
+    print("=" * 60)
+    print("PADDLEOCR DYNAMIC MODE TEST")
+    print("=" * 60)
+
+    print("Creating PaddleOCR instance...")
+    print("(This may download models on first run)")
+
+    try:
+        # Imported inside the try so a missing or broken paddleocr install
+        # is reported as a failed test instead of aborting the script.
+        from paddleocr import PaddleOCR
+
+        # Create OCR instance - this might still use inference internally
+        ocr = PaddleOCR(
+            text_detection_model_name='PP-OCRv4_mobile_det',
+            text_recognition_model_name='PP-OCRv4_mobile_rec',
+            use_angle_cls=False,  # Simplify
+            lang='es',
+        )
+
+        # Convert to RGB so RGBA, palette and grayscale files all yield an
+        # HxWx3 array; the context manager closes the file handle.
+        # NOTE(review): confirm the channel order predict() expects for
+        # ndarray input.
+        with Image.open(image_path) as img:
+            arr = np.array(img.convert('RGB'))
+        print(f"Image shape: {arr.shape}")
+
+        print("Running OCR prediction...")
+        result = ocr.predict(arr)
+        if not result:
+            # Nothing to index into; treat it like any other empty result.
+            print("  WARNING: No text recognized!")
+            return False
+
+        # Parse results
+        res = result[0].json['res']
+        dt_polys = res.get('dt_polys', [])
+        rec_texts = res.get('rec_texts', [])
+
+        print()
+        print("RESULTS:")
+        print(f"  Detected boxes: {len(dt_polys)}")
+        print(f"  Recognized texts: {len(rec_texts)}")
+
+        if rec_texts:
+            print(f"  First 5 texts: {rec_texts[:5]}")
+            return True
+        print("  WARNING: No text recognized!")
+        return False
+
+    except Exception as e:
+        # Deliberately broad: in this diagnostic script any failure here
+        # just means "pipeline broken". The exception type is printed
+        # because some exceptions carry an empty message.
+        print(f"ERROR: {type(e).__name__}: {e}")
+        return False
+
+
+def test_paddle_dynamic_model() -> bool:
+    """Run Conv2D -> BatchNorm2D -> ReLU eagerly and sanity-check the output.
+
+    Returns:
+        True if the output is finite and not constant, False otherwise.
+    """
+    print()
+    print("=" * 60)
+    print("PADDLE DYNAMIC GRAPH TEST")
+    print("=" * 60)
+
+    # Ensure we're in dynamic mode
+    paddle.disable_static()
+
+    print("Testing dynamic graph execution...")
+
+    # Random input through a simple ResNet-like block: Conv -> BN -> ReLU
+    x = paddle.randn([1, 3, 224, 224])
+    conv = paddle.nn.Conv2D(3, 64, 7, stride=2, padding=3)
+    bn = paddle.nn.BatchNorm2D(64)
+
+    # Forward pass, run op by op through the layers above
+    y = conv(x)
+    y = bn(y)
+    y = paddle.nn.functional.relu(y)
+
+    # Read each statistic once and reuse it for printing and the verdict
+    y_min = y.min().item()
+    y_max = y.max().item()
+    y_mean = y.mean().item()
+
+    print(f"Input shape: {x.shape}")
+    print(f"Output shape: {y.shape}")
+    print(f"Output min: {y_min:.4f}")
+    print(f"Output max: {y_max:.4f}")
+    print(f"Output mean: {y_mean:.4f}")
+
+    # NaN != NaN is True, so a bare min != max test would report an all-NaN
+    # output as working; reject non-finite values first.
+    if not all(math.isfinite(v) for v in (y_min, y_max, y_mean)):
+        print("Dynamic graph mode: BROKEN (non-finite output)")
+        return False
+    if y_min == y_max:
+        print("Dynamic graph mode: BROKEN (constant output)")
+        return False
+    print("Dynamic graph mode: WORKING")
+    return True
+
+
+def test_ppocr_model_direct() -> bool:
+    """Check whether PaddleOCR's model-building modules can be imported.
+
+    Nothing is built or loaded; this only probes for the imports.
+
+    Returns:
+        True if the imports succeed, False otherwise.
+    """
+    print()
+    print("=" * 60)
+    print("PPOCR MODEL DIRECT LOAD TEST")
+    print("=" * 60)
+
+    try:
+        # Try to import ppocr modules directly
+        # This bypasses the inference predictor
+        from paddleocr.ppocr.modeling.architectures import build_model
+        from paddleocr.ppocr.postprocess import build_post_process
+        from paddleocr.ppocr.utils.save_load import load_model
+    except ImportError as e:
+        print(f"Direct model import not available: {e}")
+        print("PaddleOCR may only support inference mode")
+        return False
+
+    # Note: This approach requires model config files
+    # which may or may not be bundled with paddleocr
+    print("Direct model import available")
+    return True
+
+
+def main() -> None:
+    """Run all checks in order.
+
+    Exits with status 1 if the basic dynamic graph test fails or if the
+    image needed for the OCR pipeline test does not exist.
+    """
+    # Default test image
+    image_path = '/app/dataset/0/img/page_0001.png'
+    if len(sys.argv) > 1:
+        image_path = sys.argv[1]
+
+    check_gpu()
+
+    # Test 1: Basic dynamic graph. It needs no image, so it runs before the
+    # image path is validated.
+    dynamic_works = test_paddle_dynamic_model()
+
+    if not dynamic_works:
+        print("\nDynamic graph mode is broken - GPU likely unsupported")
+        sys.exit(1)
+
+    # Test 2: Direct model load
+    test_ppocr_model_direct()
+
+    # Test 3: PaddleOCR pipeline (the only test that needs the image)
+    if not os.path.exists(image_path):
+        print(f"Image not found: {image_path}")
+        sys.exit(1)
+
+    print()
+    print(f"Testing with image: {image_path}")
+    print()
+
+    ocr_works = test_paddleocr_dynamic(image_path)
+
+    print()
+    print("=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    print(f"Dynamic graph mode: {'WORKS' if dynamic_works else 'BROKEN'}")
+    print(f"PaddleOCR pipeline: {'WORKS' if ocr_works else 'BROKEN'}")
+
+    # dynamic_works is always True here (the script exits above otherwise),
+    # so only the OCR result decides whether the diagnosis is shown.
+    if not ocr_works:
+        print()
+        print("DIAGNOSIS: Dynamic mode works but PaddleOCR fails.")
+        print("This means PaddleOCR internally uses inference predictor")
+        print("which has pre-compiled kernels without Blackwell support.")
+        print()
+        print("Potential solutions:")
+        print("1. Modify PaddleOCR to use dynamic mode")
+        print("2. Use ONNX export + ONNXRuntime")
+        print("3. Wait for PaddlePaddle Blackwell support")
+
+
+if __name__ == '__main__':
+    main()