Docker support

This commit is contained in:
2026-01-17 10:24:00 +01:00
parent 8e2b7a5096
commit c4ab0ffad1
9 changed files with 1004 additions and 0 deletions

114
src/paddle_ocr/test.py Normal file
View File

@@ -0,0 +1,114 @@
# test.py - Simple client to test PaddleOCR REST API
# Usage: python test.py [--url URL] [--dataset PATH] [--skip-health]
import argparse
import requests
import time
import sys
def wait_for_health(url: str, timeout: int = 120) -> bool:
    """Poll the API's ``/health`` endpoint until it reports the model as loaded.

    Args:
        url: API base URL; a trailing slash is tolerated.
        timeout: Maximum number of seconds to keep polling. A value of zero
            or less skips polling entirely.

    Returns:
        True as soon as the endpoint answers with HTTP 200 and a truthy
        ``model_loaded`` field in its JSON body; False if the timeout
        elapses first.
    """
    # Strip a trailing slash so "http://host/" does not yield "//health".
    health_url = f"{url.rstrip('/')}/health"
    # Monotonic clock: the timeout must not be affected by wall-clock jumps.
    start = time.monotonic()
    print(f"Waiting for API at {health_url}...")
    while time.monotonic() - start < timeout:
        try:
            resp = requests.get(health_url, timeout=5)
            if resp.status_code == 200:
                data = resp.json()
                if data.get("model_loaded"):
                    print(f"API ready! Model loaded in {time.monotonic() - start:.1f}s")
                    return True
                print(f" Model loading... ({time.monotonic() - start:.0f}s)")
            else:
                # Previously silent; report it so the wait does not look hung.
                print(f" Unexpected status {resp.status_code} ({time.monotonic() - start:.0f}s)")
        except requests.exceptions.ConnectionError:
            # Server not accepting connections yet; keep retrying.
            print(f" Connecting... ({time.monotonic() - start:.0f}s)")
        except Exception as e:
            # Best-effort polling: report any other failure and try again.
            print(f" Error: {e}")
        time.sleep(2)
    print("Timeout waiting for API")
    return False
def test_evaluate(url: str, config: dict) -> dict:
    """POST ``config`` to the API's ``/evaluate`` endpoint and print the metrics.

    Args:
        url: API base URL.
        config: Evaluation settings, sent as the JSON request body.

    Returns:
        The decoded JSON response. The ``CER``, ``WER``, ``PAGES`` and
        ``TIME_PER_PAGE`` entries are read from it for the printed report.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status`` when the
            server answers with an error status.
    """
    print(f"\nTesting config: {config}")
    started_at = time.time()
    response = requests.post(f"{url}/evaluate", json=config, timeout=600)
    response.raise_for_status()
    metrics = response.json()
    duration = time.time() - started_at

    print(f"Results (took {duration:.1f}s):")
    # Error rates are shown both as a raw fraction and as a percentage.
    cer = metrics["CER"]
    print(f" CER: {cer:.4f} ({cer*100:.2f}%)")
    wer = metrics["WER"]
    print(f" WER: {wer:.4f} ({wer*100:.2f}%)")
    print(f" Pages: {metrics['PAGES']}")
    print(f" Time/page: {metrics['TIME_PER_PAGE']:.2f}s")
    return metrics
def main():
    """Evaluate a baseline and an optimized OCR configuration and compare CER.

    Parses ``--url``, ``--dataset`` and ``--skip-health``; unless the health
    check is skipped, waits for the API and exits with status 1 if it never
    becomes ready. Both configurations are then evaluated over the same page
    range and the relative CER reduction is printed.
    """
    cli = argparse.ArgumentParser(description="Test PaddleOCR REST API")
    cli.add_argument("--url", default="http://localhost:8000", help="API base URL")
    cli.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
    cli.add_argument("--skip-health", action="store_true", help="Skip health check wait")
    args = cli.parse_args()

    rule = "=" * 50

    def banner(title):
        """Print a section title framed by two separator rules."""
        print("\n" + rule)
        print(title)
        print(rule)

    # Wait for the API to be ready unless the caller opted out.
    if not args.skip_health and not wait_for_health(args.url):
        sys.exit(1)

    # Test 1: Baseline config (default PaddleOCR)
    baseline_config = {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": False,  # Baseline: disabled
        "text_det_thresh": 0.0,
        "text_det_box_thresh": 0.0,
        "text_det_unclip_ratio": 1.5,
        "text_rec_score_thresh": 0.0,
        "start_page": 5,
        "end_page": 10,
    }
    banner("TEST 1: Baseline Configuration")
    baseline = test_evaluate(args.url, baseline_config)

    # Test 2: Optimized config (from Ray Tune results)
    optimized_config = {
        "pdf_folder": args.dataset,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "textline_orientation": True,  # KEY: enabled
        "text_det_thresh": 0.4690,
        "text_det_box_thresh": 0.5412,
        "text_det_unclip_ratio": 0.0,
        "text_rec_score_thresh": 0.6350,
        "start_page": 5,
        "end_page": 10,
    }
    banner("TEST 2: Optimized Configuration")
    optimized = test_evaluate(args.url, optimized_config)

    # Summary
    banner("SUMMARY")
    # Guard the division: a zero baseline CER reports no reduction.
    if baseline["CER"] > 0:
        cer_reduction = (1 - optimized["CER"] / baseline["CER"]) * 100
    else:
        cer_reduction = 0
    print(f"Baseline CER: {baseline['CER']*100:.2f}%")
    print(f"Optimized CER: {optimized['CER']*100:.2f}%")
    print(f"Improvement: {cer_reduction:.1f}% reduction in errors")
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()