PaddleOCR, EasyOCR and docTR GPU support. (#4)
All checks were successful
build_docker / essential (push) Successful in 0s
build_docker / build_cpu (push) Successful in 5m0s
build_docker / build_gpu (push) Successful in 22m55s
build_docker / build_easyocr (push) Successful in 18m47s
build_docker / build_easyocr_gpu (push) Successful in 19m0s
build_docker / build_raytune (push) Successful in 3m27s
build_docker / build_doctr (push) Successful in 19m42s
build_docker / build_doctr_gpu (push) Successful in 14m49s
This commit was merged in pull request #4.
src/paddle_ocr/scripts/debug_gpu_detection.py (new file, 199 lines)
@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Debug script for GPU OCR detection issues.

This script tests the raw inference output from PaddlePaddle detection models
to diagnose why detection might fail on certain GPU architectures
(e.g., Blackwell/sm_121).

Usage:
    docker exec paddle-ocr-gpu python /app/debug_gpu_detection.py [image_path]

Expected behavior:
- Working GPU: output stats show min close to 0, max close to 1, mean ~0.1-0.5
- Broken GPU: output stats show constant values (e.g., min=max=mean=0.00001)
"""

import os
import sys

# Must be set before the PaddleOCR/PaddleX imports so the model source
# check is skipped.
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'

import numpy as np
import paddle
from PIL import Image


def check_gpu_status():
    """Check GPU availability and properties."""
    print("=" * 60)
    print("GPU STATUS")
    print("=" * 60)
    print(f"Device: {paddle.device.get_device()}")
    print(f"CUDA compiled: {paddle.device.is_compiled_with_cuda()}")

    if paddle.device.is_compiled_with_cuda():
        print(f"GPU count: {paddle.device.cuda.device_count()}")
        if paddle.device.cuda.device_count() > 0:
            props = paddle.device.cuda.get_device_properties(0)
            print(f"GPU name: {props.name}")
            print(f"Compute capability: {props.major}.{props.minor}")
            print(f"Total memory: {props.total_memory / (1024**3):.2f} GB")
    print()


def test_basic_ops():
    """Test basic GPU tensor operations."""
    print("=" * 60)
    print("BASIC GPU OPERATIONS")
    print("=" * 60)

    # Test tensor creation
    x = paddle.randn([2, 3])
    print(f"Tensor place: {x.place}")

    # Test conv2d
    x = paddle.randn([1, 3, 64, 64])
    conv = paddle.nn.Conv2D(3, 16, 3, padding=1)
    y = conv(x)
    print(f"Conv2d output shape: {y.shape}, place: {y.place}")

    # Test softmax
    s = paddle.nn.functional.softmax(y, axis=1)
    print(f"Softmax output shape: {s.shape}")
    print("Basic operations: OK")
    print()


def test_detection_model(image_path: str):
    """Test detection model raw output."""
    print("=" * 60)
    print("DETECTION MODEL TEST")
    print("=" * 60)

    from paddle.inference import Config, create_predictor

    model_dir = '/root/.paddlex/official_models/PP-OCRv4_mobile_det'
    inference_file = f'{model_dir}/inference.json'
    params_file = f'{model_dir}/inference.pdiparams'

    if not os.path.exists(inference_file):
        print(f"Model not found at {model_dir}")
        print("Run PaddleOCR once to download models first.")
        return

    # Create config: 1024 MB initial GPU memory pool on device 0
    config = Config()
    config.set_prog_file(inference_file)
    config.set_params_file(params_file)
    config.enable_use_gpu(1024, 0)

    print("Creating predictor...")
    predictor = create_predictor(config)

    # Get input/output names
    input_names = predictor.get_input_names()
    output_names = predictor.get_output_names()
    print(f"Input names: {input_names}")
    print(f"Output names: {output_names}")

    # Load and preprocess image. Convert to RGB so an RGBA or grayscale
    # image does not break the 3-channel assumption below. Plain [0, 1]
    # scaling is not the model's trained normalization, but it is enough
    # to spot constant or NaN outputs.
    img = Image.open(image_path).convert('RGB')
    img = img.resize((640, 640))
    arr = np.array(img).astype('float32')
    arr = arr / 255.0
    arr = arr.transpose(2, 0, 1)[np.newaxis, ...]  # HWC -> NCHW
    print(f"Input tensor shape: {arr.shape}")

    # Set input
    input_handle = predictor.get_input_handle(input_names[0])
    input_handle.reshape(arr.shape)
    input_handle.copy_from_cpu(arr)

    # Run prediction
    print("Running inference...")
    predictor.run()

    # Get output
    output_handle = predictor.get_output_handle(output_names[0])
    output = output_handle.copy_to_cpu()

    print()
    print("OUTPUT ANALYSIS:")
    print(f"  Shape: {output.shape}")
    print(f"  Min: {output.min():.6f}")
    print(f"  Max: {output.max():.6f}")
    print(f"  Mean: {output.mean():.6f}")
    print(f"  Std: {output.std():.6f}")
    print(f"  Has NaN: {np.isnan(output).any()}")
    print(f"  Has Inf: {np.isinf(output).any()}")

    # Diagnosis
    print()
    print("DIAGNOSIS:")
    if output.min() == output.max():
        print("  PROBLEM: Output is constant - model inference is broken!")
        print("  This typically indicates GPU compute capability mismatch.")
        print("  GB10 (sm_121) may need CUDA 13.0+ for native support.")
    elif output.max() < 0.01:
        print("  PROBLEM: Output values too low - detection will find nothing.")
    elif np.isnan(output).any() or np.isinf(output).any():
        print("  PROBLEM: Output contains NaN/Inf - numerical instability.")
    else:
        print("  OK: Output values look reasonable.")
        print(f"  Detection threshold is typically 0.3-0.6; max output is {output.max():.3f}")


def test_paddleocr_output(image_path: str):
    """Test full PaddleOCR pipeline."""
    print()
    print("=" * 60)
    print("PADDLEOCR PIPELINE TEST")
    print("=" * 60)

    from paddleocr import PaddleOCR

    ocr = PaddleOCR(
        text_detection_model_name='PP-OCRv4_mobile_det',
        text_recognition_model_name='PP-OCRv4_mobile_rec',
    )

    img = Image.open(image_path).convert('RGB')
    arr = np.array(img)

    out = ocr.predict(arr)
    res = out[0].json['res']

    dt_polys = res.get('dt_polys', [])
    rec_texts = res.get('rec_texts', [])

    print(f"Detection polygons: {len(dt_polys)}")
    print(f"Recognition texts: {len(rec_texts)}")

    if rec_texts:
        print(f"Sample texts: {rec_texts[:5]}")
    else:
        print("No text detected!")


def main():
    # Default test image
    image_path = '/app/dataset/0/img/page_0001.png'
    if len(sys.argv) > 1:
        image_path = sys.argv[1]

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        print("Usage: python debug_gpu_detection.py [image_path]")
        sys.exit(1)

    print(f"Testing with image: {image_path}")
    print()

    check_gpu_status()
    test_basic_ops()
    test_detection_model(image_path)
    test_paddleocr_output(image_path)


if __name__ == '__main__':
    main()
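
If the script's diagnosis reports constant output, a natural follow-up is to cross-check a single convolution on CPU vs GPU with identical weights: constant or wildly divergent GPU results point at kernel-level breakage of the kind described above, rather than a preprocessing mistake. The snippet below is a minimal sketch, not part of the committed script; it assumes a paddle build with CUDA and reuses only APIs the script itself relies on, plus set_state_dict to share weights between the two layers.

import numpy as np
import paddle

x = np.random.randn(1, 3, 64, 64).astype('float32')

# Run the conv on CPU first to get a reference output and reference weights.
paddle.device.set_device('cpu')
conv_cpu = paddle.nn.Conv2D(3, 8, 3, padding=1)
y_cpu = conv_cpu(paddle.to_tensor(x)).numpy()

# Rebuild the same layer on the GPU and copy the CPU weights into it.
paddle.device.set_device('gpu:0')
conv_gpu = paddle.nn.Conv2D(3, 8, 3, padding=1)
conv_gpu.set_state_dict(conv_cpu.state_dict())
y_gpu = conv_gpu(paddle.to_tensor(x)).numpy()

print("max |cpu - gpu|:", float(np.abs(y_cpu - y_gpu).max()))
print("gpu output constant:", bool(y_gpu.min() == y_gpu.max()))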