From 580d1b114b51167ca19fda3d3128b0e5d93080df Mon Sep 17 00:00:00 2001
From: Sergio Jimenez Jimenez <sergiojj932@gmail.com>
Date: Sun, 18 Jan 2026 07:13:51 +0100
Subject: [PATCH] More docs on gpu for paddle

---
 src/paddle_ocr/README.md                      |  55 ++++-
 .../docker-compose.gpu-registry.yml           |   1 +
 src/paddle_ocr/scripts/test_dynamic_mode.py   | 207 ++++++++++++++++++
 3 files changed, 257 insertions(+), 6 deletions(-)
 create mode 100644 src/paddle_ocr/scripts/test_dynamic_mode.py

diff --git a/src/paddle_ocr/README.md b/src/paddle_ocr/README.md
index 1cfaa36..26fa2f9 100644
--- a/src/paddle_ocr/README.md
+++ b/src/paddle_ocr/README.md
@@ -214,12 +214,33 @@ When running PaddleOCR on Blackwell GPUs:
 
 #### Root Cause
 
-PaddleOCR uses **pre-compiled inference models** (PP-OCRv4_mobile_det, PP-OCRv5_server_det, etc.) that contain embedded CUDA kernels. These kernels were compiled for older GPU architectures (sm_80 Ampere, sm_90 Hopper) and **do not support Blackwell (sm_121)**.
+**Confirmed:** PaddlePaddle's entire CUDA backend does NOT support Blackwell (sm_121). This is NOT just an inference model problem - even basic operations fail.
+
+**Test Results (January 2026):**
+
+1. **PTX JIT Test** (`CUDA_FORCE_PTX_JIT=1`):
+   ```
+   OSError: CUDA error(209), no kernel image is available for execution on the device.
+   [Hint: 'cudaErrorNoKernelImageForDevice']
+   ```
+   → Confirmed: No PTX code exists in PaddlePaddle binaries
+
+2. **Dynamic Graph Mode Test** (bypassing inference models):
+   ```
+   Conv2D + BatchNorm output:
+     Output min: 0.0000
+     Output max: 0.0000
+     Output mean: 0.0000
+   Dynamic graph mode: BROKEN (constant output)
+   ```
+   → Confirmed: Even simple nn.Conv2D produces zeros on Blackwell
+
+**Conclusion:** The issue is PaddlePaddle's compiled CUDA kernels (cubins), not just the inference models. The entire framework was compiled without sm_121 support and without PTX for JIT compilation.
 
 **Why building PaddlePaddle from source doesn't fix it:**
 
-1. ✅ You can build `paddlepaddle-gpu` with `CUDA_ARCH=121` - this creates a Blackwell-compatible framework
-2. ❌ But the **PaddleOCR inference models** (`.pdiparams`, `.pdmodel` files) contain pre-compiled CUDA ops
+1. ⚠️ Building with `CUDA_ARCH=121` requires CUDA 13.0+ (PaddlePaddle only supports up to CUDA 12.6)
+2. ❌ Even if you could build it, PaddleOCR models contain pre-compiled CUDA ops
 3. ❌ These model files were exported/compiled targeting sm_80/sm_90 architectures
 4. ❌ The model kernels execute on GPU but produce garbage output on sm_121
 
@@ -291,17 +312,39 @@ CUDA **can** run older code on newer GPUs via **PTX JIT compilation**:
 
 **The problem**: PaddleOCR inference models contain only pre-compiled **cubins** (SASS binary), not PTX. Without PTX, there's nothing to JIT-compile.
 
-You can test if PTX exists:
+We tested PTX JIT (January 2026):
 ```bash
 # Force PTX JIT compilation
 docker run --gpus all -e CUDA_FORCE_PTX_JIT=1 paddle-ocr-gpu \
   python /app/scripts/debug_gpu_detection.py /app/dataset/0/img/page_0001.png
+
+# Result:
+# OSError: CUDA error(209), no kernel image is available for execution on the device.
 ```
-- If output is still constant → No PTX in models (confirmed)
-- If output varies → PTX worked
+**Confirmed: No PTX exists** in PaddlePaddle binaries. The CUDA kernels are cubins-only (SASS binary), compiled for sm_80/sm_90 without PTX fallback.
 
 **Note on sm_121**: Per NVIDIA docs, "sm_121 is the same as sm_120 since the only difference is physically integrated CPU+GPU memory of Spark." The issue is general Blackwell (sm_12x) support, not Spark-specific.
 
+#### FAQ: Does Dynamic Graph Mode Work on Blackwell?
+
+**Q: Can I bypass inference models and use PaddlePaddle's dynamic graph mode?**
+
+**No.** We tested dynamic graph mode (January 2026):
+```bash
+# Test script runs: paddle.nn.Conv2D + paddle.nn.BatchNorm2D + ReLU
+python /app/scripts/test_dynamic_mode.py
+
+# Result:
+# Input shape: [1, 3, 224, 224]
+# Output shape: [1, 64, 112, 112]
+# Output min: 0.0000
+# Output max: 0.0000  # <-- All zeros!
+# Output mean: 0.0000
+# Dynamic graph mode: BROKEN (constant output)
+```
+
+**Conclusion:** The problem isn't limited to inference models. PaddlePaddle's core CUDA kernels (Conv2D, BatchNorm, etc.) produce garbage on sm_121. The entire framework lacks Blackwell support.
+
 #### FAQ: Can I Run AMD64 Containers on ARM64 DGX Spark?
 
 **Q: Can I just run the working x86_64 GPU image via emulation?**
diff --git a/src/paddle_ocr/docker-compose.gpu-registry.yml b/src/paddle_ocr/docker-compose.gpu-registry.yml
index ed37626..6e606c2 100644
--- a/src/paddle_ocr/docker-compose.gpu-registry.yml
+++ b/src/paddle_ocr/docker-compose.gpu-registry.yml
@@ -12,6 +12,7 @@ services:
     volumes:
       - ../dataset:/app/dataset:ro
       - paddlex-cache:/root/.paddlex
+      - ./scripts:/app/scripts:ro
     environment:
       - PYTHONUNBUFFERED=1
       - CUDA_VISIBLE_DEVICES=0
diff --git a/src/paddle_ocr/scripts/test_dynamic_mode.py b/src/paddle_ocr/scripts/test_dynamic_mode.py
new file mode 100644
index 0000000..9759eb7
--- /dev/null
+++ b/src/paddle_ocr/scripts/test_dynamic_mode.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+"""
+Test PaddleOCR in dynamic graph mode (not inference mode).
+
+Dynamic mode runs PaddlePaddle's own CUDA kernels directly, bypassing the
+exported inference models, to see whether the framework itself works on sm_121.
+
+Usage:
+    python test_dynamic_mode.py [image_path]
+"""
+
+import os
+import sys
+
+os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+# Turn off Paddle's PIR API; dynamic mode is selected by paddle.disable_static()
+os.environ['FLAGS_enable_pir_api'] = '0'
+
+import numpy as np
+import paddle
+from PIL import Image
+
+
+def check_gpu():
+    """Print the active Paddle device, the CUDA build flag and GPU 0's properties."""
+    rule = "=" * 60
+    print(rule, "GPU STATUS", rule, sep="\n")
+    print(f"Device: {paddle.device.get_device()}")
+    cuda_build = paddle.device.is_compiled_with_cuda()
+    print(f"CUDA compiled: {cuda_build}")
+    # Device properties are only queried when a CUDA build sees at least one GPU
+    if cuda_build and paddle.device.cuda.device_count() > 0:
+        gpu = paddle.device.cuda.get_device_properties(0)
+        print(f"GPU: {gpu.name} (sm_{gpu.major}{gpu.minor})")
+        print(f"Memory: {gpu.total_memory / (1024**3):.1f} GB")
+    print()
+
+
+def test_paddleocr_dynamic(image_path: str):
+    """Run the PaddleOCR pipeline on image_path; True if any text is recognized."""
+    print("=" * 60)
+    print("PADDLEOCR DYNAMIC MODE TEST")
+    print("=" * 60)
+
+    # Function-scope import: paddleocr is only needed by this test
+    from paddleocr import PaddleOCR
+
+    # NOTE: nothing here forces dynamic mode; the pipeline is built normally
+    # and might still use the inference predictor internally.
+
+    print("Creating PaddleOCR instance...")
+    print("(This may download models on first run)")
+
+    try:
+        ocr = PaddleOCR(
+            text_detection_model_name='PP-OCRv4_mobile_det',
+            text_recognition_model_name='PP-OCRv4_mobile_rec',
+            use_angle_cls=False,  # Simplify
+            lang='es',
+        )
+
+        # Decode to 3-channel RGB (palette, grayscale or RGBA pages would give a
+        # 2-D or 4-channel array) and close the file once the pixels are copied
+        with Image.open(image_path) as img:
+            arr = np.array(img.convert('RGB'))
+        print(f"Image shape: {arr.shape}")
+
+        # Run prediction
+        print("Running OCR prediction...")
+        result = ocr.predict(arr)
+
+        # Parse results
+        res = result[0].json['res']
+        dt_polys = res.get('dt_polys', [])
+        rec_texts = res.get('rec_texts', [])
+
+        print()
+        print("RESULTS:")
+        print(f"  Detected boxes: {len(dt_polys)}")
+        print(f"  Recognized texts: {len(rec_texts)}")
+
+        if rec_texts:
+            print(f"  First 5 texts: {rec_texts[:5]}")
+            return True
+        print("  WARNING: No text recognized!")
+        return False
+
+    except Exception as e:
+        # Diagnostic boundary: any failure here is reported as a broken pipeline
+        print(f"ERROR: {e}")
+        return False
+
+
+def test_paddle_dynamic_model():
+    """Run Conv2D -> BatchNorm2D -> ReLU in dynamic mode; True if output is sane."""
+    print()
+    print("=" * 60)
+    print("PADDLE DYNAMIC GRAPH TEST")
+    print("=" * 60)
+
+    # Ensure we're in dynamic mode
+    paddle.disable_static()
+    print("Testing dynamic graph execution...")
+
+    # Random input pushed through a ResNet-like stem: Conv -> BN -> ReLU
+    x = paddle.randn([1, 3, 224, 224])
+    conv = paddle.nn.Conv2D(3, 64, 7, stride=2, padding=3)
+    bn = paddle.nn.BatchNorm2D(64)
+
+    # Forward pass in dynamic graph mode
+    y = conv(x)
+    y = bn(y)
+    y = paddle.nn.functional.relu(y)
+
+    print(f"Input shape: {x.shape}")
+    print(f"Output shape: {y.shape}")
+    y_min, y_max, y_mean = y.min().item(), y.max().item(), y.mean().item()
+    print(f"Output min: {y_min:.4f}")
+    print(f"Output max: {y_max:.4f}")
+    print(f"Output mean: {y_mean:.4f}")
+
+    # NaN/Inf must fail explicitly: NaN != NaN is True and would read as "varying"
+    if not np.isfinite([y_min, y_max, y_mean]).all():
+        print("Dynamic graph mode: BROKEN (non-finite output)")
+        return False
+    if y_min == y_max:
+        print("Dynamic graph mode: BROKEN (constant output)")
+        return False
+    print("Dynamic graph mode: WORKING")
+    return True
+
+
+def test_ppocr_model_direct():
+    """Probe whether ppocr's model-building modules import; always returns False."""
+    rule = "=" * 60
+    print()
+    print(rule)
+    print("PPOCR MODEL DIRECT LOAD TEST")
+    print(rule)
+
+    # Only importability is probed here. Actually building a model would
+    # also need model config files, which may or may not be bundled with
+    # paddleocr.
+    try:
+        # Importing ppocr modules directly bypasses the inference predictor
+        from paddleocr.ppocr.modeling.architectures import build_model
+        from paddleocr.ppocr.postprocess import build_post_process
+        from paddleocr.ppocr.utils.save_load import load_model
+    except ImportError as err:
+        print(f"Direct model import not available: {err}")
+        print("PaddleOCR may only support inference mode")
+    else:
+        print("Direct model import available")
+
+    # Informational probe: no model is loaded, so it never reports success
+    return False
+
+
+def main():
+    """Run the GPU report and the three tests; exit 1 if the dynamic or OCR test fails."""
+    # Default test image, overridable by the first CLI argument
+    image_path = '/app/dataset/0/img/page_0001.png'
+    if len(sys.argv) > 1:
+        image_path = sys.argv[1]
+
+    print(f"Testing with image: {image_path}")
+    print()
+
+    check_gpu()
+
+    # Test 1: Basic dynamic graph (needs no image, so it runs before the file check)
+    if not test_paddle_dynamic_model():
+        print("\nDynamic graph mode is broken - GPU likely unsupported")
+        sys.exit(1)
+
+    # Test 2: Direct model load
+    test_ppocr_model_direct()
+
+    # Test 3: PaddleOCR pipeline - the only step that reads the image
+    if not os.path.exists(image_path):
+        print(f"Image not found: {image_path}")
+        sys.exit(1)
+    ocr_works = test_paddleocr_dynamic(image_path)
+
+    print()
+    print("=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    # Reaching this point means Test 1 passed (a failure exits above)
+    print("Dynamic graph mode: WORKS")
+    print(f"PaddleOCR pipeline: {'WORKS' if ocr_works else 'BROKEN'}")
+
+    if not ocr_works:
+        print()
+        print("DIAGNOSIS: Dynamic mode works but PaddleOCR fails.")
+        print("This means PaddleOCR internally uses inference predictor")
+        print("which has pre-compiled kernels without Blackwell support.")
+        print()
+        print("Potential solutions:")
+        print("1. Modify PaddleOCR to use dynamic mode")
+        print("2. Use ONNX export + ONNXRuntime")
+        print("3. Wait for PaddlePaddle Blackwell support")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()