measuring

commit 38ba2d1f5a
parent b96dc1ed91
2026-01-17 17:28:33 +01:00
2 changed files with 208 additions and 0 deletions

.gitignore (vendored, 1 addition)

@@ -6,3 +6,4 @@ results
 .DS_Store
 .claude
 node_modules
+src/paddle_ocr/wheels

src/paddle_ocr/benchmark.py (new file, 207 additions)

@@ -0,0 +1,207 @@
# benchmark.py - Compare CPU vs GPU performance for PaddleOCR REST API
# Usage: python benchmark.py
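# Assumes both containers are already running and expose the /health and
# /evaluate endpoints used below.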
import requests
import time
import json
import sys
from datetime import datetime
CONTAINERS = {
    "GPU": {"url": "http://localhost:8000", "port": 8000},
    "CPU": {"url": "http://localhost:8002", "port": 8002},
}

DATASET_PATH = "/app/dataset"
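# NOTE: pdf_folder values are passed through to the OCR service, so this path
# is assumed to exist inside the PaddleOCR containers (e.g. a mounted dataset
# volume), not necessarily on the machine running this script.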
# Test configurations
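# "Baseline" keeps the detection/recognition thresholds at 0.0 (no score
# filtering), while "Optimized" uses tuned thresholds; both cover the same
# page range so the timing numbers stay comparable.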
TEST_CONFIGS = [
{
"name": "Baseline",
"config": {
"pdf_folder": DATASET_PATH,
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"textline_orientation": False,
"text_det_thresh": 0.0,
"text_det_box_thresh": 0.0,
"text_det_unclip_ratio": 1.5,
"text_rec_score_thresh": 0.0,
"start_page": 5,
"end_page": 10,
}
},
{
"name": "Optimized",
"config": {
"pdf_folder": DATASET_PATH,
"use_doc_orientation_classify": False,
"use_doc_unwarping": False,
"textline_orientation": True,
"text_det_thresh": 0.4690,
"text_det_box_thresh": 0.5412,
"text_det_unclip_ratio": 0.0,
"text_rec_score_thresh": 0.6350,
"start_page": 5,
"end_page": 10,
}
},
]
def check_health(url: str, timeout: int = 10) -> bool:
"""Check if API is healthy."""
try:
resp = requests.get(f"{url}/health", timeout=timeout)
if resp.status_code == 200:
data = resp.json()
return data.get("model_loaded", False)
except Exception as e:
print(f" Health check failed: {e}")
return False
def run_benchmark(url: str, config: dict, warmup: bool = False) -> dict:
"""Run a single benchmark test."""
eval_url = f"{url}/evaluate"
start = time.time()
resp = requests.post(eval_url, json=config, timeout=600)
resp.raise_for_status()
total_time = time.time() - start
result = resp.json()
result["total_request_time"] = total_time
return result
def main():
results = {
"timestamp": datetime.now().isoformat(),
"containers": {},
}
print("=" * 60)
print("PaddleOCR CPU vs GPU Benchmark")
print("=" * 60)
print()
# Check container health
print("Checking container health...")
for name, info in CONTAINERS.items():
healthy = check_health(info["url"])
status = "✓ Ready" if healthy else "✗ Not Ready"
print(f" {name} ({info['url']}): {status}")
if not healthy:
print(f" Skipping {name} - container not available")
continue
print()
# Run benchmarks for each container
for container_name, container_info in CONTAINERS.items():
url = container_info["url"]
if not check_health(url):
print(f"Skipping {container_name} - not healthy")
continue
print("=" * 60)
print(f"Testing: {container_name} Container")
print(f"URL: {url}")
print("=" * 60)
container_results = {
"url": url,
"tests": {},
}
# Warmup run (first run often slower due to model loading/caching)
print("\n Warmup run...")
try:
warmup_config = TEST_CONFIGS[0]["config"].copy()
warmup_config["start_page"] = 5
warmup_config["end_page"] = 6 # Just 1 page for warmup
run_benchmark(url, warmup_config, warmup=True)
print(" Warmup complete.")
except Exception as e:
print(f" Warmup failed: {e}")
        # Run each test configuration
        for test in TEST_CONFIGS:
            test_name = test["name"]
            config = test["config"]
            print(f"\n Running: {test_name} Configuration")
            print(f" Pages: {config['start_page']} to {config['end_page']}")
            try:
                result = run_benchmark(url, config)
                container_results["tests"][test_name] = {
                    "CER": result["CER"],
                    "WER": result["WER"],
                    "PAGES": result["PAGES"],
                    "TIME_PER_PAGE": result["TIME_PER_PAGE"],
                    "TOTAL_TIME": result["total_request_time"],
                }
                print(f" CER: {result['CER']*100:.2f}%")
                print(f" WER: {result['WER']*100:.2f}%")
                print(f" Pages: {result['PAGES']}")
                print(f" Time/page: {result['TIME_PER_PAGE']:.3f}s")
                print(f" Total time: {result['total_request_time']:.2f}s")
            except Exception as e:
                print(f" ERROR: {e}")
                container_results["tests"][test_name] = {"error": str(e)}

        results["containers"][container_name] = container_results

    # Print summary
    print("\n")
    print("=" * 60)
    print("BENCHMARK SUMMARY")
    print("=" * 60)

    # Table header
    print(f"\n{'Test':<12} {'Container':<8} {'CER %':<10} {'WER %':<10} {'Time/Page':<12} {'Total (s)':<10}")
    print("-" * 62)

    for test in TEST_CONFIGS:
        test_name = test["name"]
        for container_name in CONTAINERS.keys():
            if container_name in results["containers"]:
                tests = results["containers"][container_name].get("tests", {})
                if test_name in tests and "error" not in tests[test_name]:
                    t = tests[test_name]
                    print(f"{test_name:<12} {container_name:<8} {t['CER']*100:<10.2f} {t['WER']*100:<10.2f} {t['TIME_PER_PAGE']:<12.3f} {t['TOTAL_TIME']:<10.2f}")

    # Speed comparison
    print("\n" + "=" * 60)
    print("SPEED COMPARISON")
    print("=" * 60)
    for test in TEST_CONFIGS:
        test_name = test["name"]
        gpu_data = results["containers"].get("GPU", {}).get("tests", {}).get(test_name, {})
        cpu_data = results["containers"].get("CPU", {}).get("tests", {}).get(test_name, {})
        if gpu_data and cpu_data and "error" not in gpu_data and "error" not in cpu_data:
            speedup = cpu_data["TIME_PER_PAGE"] / gpu_data["TIME_PER_PAGE"]
            print(f"\n{test_name} Configuration:")
            print(f" GPU: {gpu_data['TIME_PER_PAGE']:.3f}s per page")
            print(f" CPU: {cpu_data['TIME_PER_PAGE']:.3f}s per page")
            print(f" GPU is {speedup:.2f}x faster than CPU")

    # Save results to JSON
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n\nResults saved to: {output_file}")

    return results


if __name__ == "__main__":
    main()