# test.py - Simple client to test PaddleOCR REST API # Usage: python test.py [--url URL] [--dataset PATH] import argparse import requests import time import sys def wait_for_health(url: str, timeout: int = 120) -> bool: """Wait for API to be ready.""" health_url = f"{url}/health" start = time.time() print(f"Waiting for API at {health_url}...") while time.time() - start < timeout: try: resp = requests.get(health_url, timeout=5) if resp.status_code == 200: data = resp.json() if data.get("model_loaded"): print(f"API ready! Model loaded in {time.time() - start:.1f}s") return True print(f" Model loading... ({time.time() - start:.0f}s)") except requests.exceptions.ConnectionError: print(f" Connecting... ({time.time() - start:.0f}s)") except Exception as e: print(f" Error: {e}") time.sleep(2) print("Timeout waiting for API") return False def test_evaluate(url: str, config: dict) -> dict: """Run evaluation with given config.""" eval_url = f"{url}/evaluate" print(f"\nTesting config: {config}") start = time.time() resp = requests.post(eval_url, json=config, timeout=600) resp.raise_for_status() result = resp.json() elapsed = time.time() - start print(f"Results (took {elapsed:.1f}s):") print(f" CER: {result['CER']:.4f} ({result['CER']*100:.2f}%)") print(f" WER: {result['WER']:.4f} ({result['WER']*100:.2f}%)") print(f" Pages: {result['PAGES']}") print(f" Time/page: {result['TIME_PER_PAGE']:.2f}s") return result def main(): parser = argparse.ArgumentParser(description="Test PaddleOCR REST API") parser.add_argument("--url", default="http://localhost:8000", help="API base URL") parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)") parser.add_argument("--skip-health", action="store_true", help="Skip health check wait") args = parser.parse_args() # Wait for API to be ready if not args.skip_health: if not wait_for_health(args.url): sys.exit(1) # Test 1: Baseline config (default PaddleOCR) print("\n" + "="*50) print("TEST 1: Baseline Configuration") print("="*50) baseline = test_evaluate(args.url, { "pdf_folder": args.dataset, "use_doc_orientation_classify": False, "use_doc_unwarping": False, "textline_orientation": False, # Baseline: disabled "text_det_thresh": 0.0, "text_det_box_thresh": 0.0, "text_det_unclip_ratio": 1.5, "text_rec_score_thresh": 0.0, "start_page": 5, "end_page": 10, }) # Test 2: Optimized config (from Ray Tune results) print("\n" + "="*50) print("TEST 2: Optimized Configuration") print("="*50) optimized = test_evaluate(args.url, { "pdf_folder": args.dataset, "use_doc_orientation_classify": False, "use_doc_unwarping": False, "textline_orientation": True, # KEY: enabled "text_det_thresh": 0.4690, "text_det_box_thresh": 0.5412, "text_det_unclip_ratio": 0.0, "text_rec_score_thresh": 0.6350, "start_page": 5, "end_page": 10, }) # Summary print("\n" + "="*50) print("SUMMARY") print("="*50) cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100 if baseline["CER"] > 0 else 0 print(f"Baseline CER: {baseline['CER']*100:.2f}%") print(f"Optimized CER: {optimized['CER']*100:.2f}%") print(f"Improvement: {cer_reduction:.1f}% reduction in errors") if __name__ == "__main__": main()