raytune as docker
Some checks failed
build_docker / essential (pull_request) Successful in 1s
build_docker / build_cpu (pull_request) Successful in 4m14s
build_docker / build_easyocr (pull_request) Successful in 12m19s
build_docker / build_easyocr_gpu (pull_request) Successful in 14m2s
build_docker / build_doctr (pull_request) Successful in 12m24s
build_docker / build_doctr_gpu (pull_request) Successful in 13m10s
build_docker / build_raytune (pull_request) Successful in 1m50s
build_docker / build_gpu (pull_request) Has been cancelled

2026-01-19 16:32:45 +01:00
parent d67cbd4677
commit 94b25f9752
20 changed files with 7214 additions and 112 deletions

src/raytune/Dockerfile Normal file

@@ -0,0 +1,18 @@
FROM python:3.12-slim
WORKDIR /app
# Install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application files
COPY raytune_ocr.py .
COPY run_tuning.py .
# Create results directory
RUN mkdir -p /app/results
ENV PYTHONUNBUFFERED=1
ENTRYPOINT ["python", "run_tuning.py"]

src/raytune/README.md Normal file

@@ -0,0 +1,131 @@
# Ray Tune OCR Hyperparameter Optimization
Docker-based hyperparameter tuning for OCR services using Ray Tune with Optuna search.
## Structure
```
raytune/
├── Dockerfile # Python 3.12-slim with Ray Tune + Optuna
├── requirements.txt # Dependencies
├── raytune_ocr.py # Shared utilities and search spaces
├── run_tuning.py # CLI entry point
└── README.md
```
## Quick Start
```bash
cd src
# Build the raytune image
docker compose -f docker-compose.tuning.paddle.yml build raytune
# Or pull from registry
docker pull seryus.ddns.net/unir/raytune:latest
```
## Usage
### PaddleOCR Tuning
```bash
# Start PaddleOCR service
docker compose -f docker-compose.tuning.paddle.yml up -d paddle-ocr-gpu
# Wait for health check, then run tuning
docker compose -f docker-compose.tuning.paddle.yml run raytune --service paddle --samples 64
# Stop when done
docker compose -f docker-compose.tuning.paddle.yml down
```
### DocTR Tuning
```bash
docker compose -f docker-compose.tuning.doctr.yml up -d doctr-gpu
docker compose -f docker-compose.tuning.doctr.yml run raytune --service doctr --samples 64
docker compose -f docker-compose.tuning.doctr.yml down
```
### EasyOCR Tuning
```bash
# Note: EasyOCR uses port 8002 (same as PaddleOCR). Cannot run simultaneously.
docker compose -f docker-compose.tuning.easyocr.yml up -d easyocr-gpu
docker compose -f docker-compose.tuning.easyocr.yml run raytune --service easyocr --samples 64
docker compose -f docker-compose.tuning.easyocr.yml down
```
## CLI Options
```
python run_tuning.py --service {paddle,doctr,easyocr} [--host HOST] [--port PORT] [--samples N]
```
| Option | Description | Default |
|------------|--------------------------------------|-----------|
| --service | OCR service to tune (required) | - |
| --host | OCR service host | localhost |
| --port | OCR service port | 8000 |
| --samples | Number of hyperparameter trials | 64 |
## Search Spaces
### PaddleOCR
- `use_doc_orientation_classify`: [True, False]
- `use_doc_unwarping`: [True, False]
- `textline_orientation`: [True, False]
- `text_det_thresh`: uniform(0.0, 0.7)
- `text_det_box_thresh`: uniform(0.0, 0.7)
- `text_det_unclip_ratio`: fixed at 0.0 (single-value choice)
- `text_rec_score_thresh`: uniform(0.0, 0.7)
### DocTR
- `assume_straight_pages`: [True, False]
- `straighten_pages`: [True, False]
- `preserve_aspect_ratio`: [True, False]
- `symmetric_pad`: [True, False]
- `disable_page_orientation`: [True, False]
- `disable_crop_orientation`: [True, False]
- `resolve_lines`: [True, False]
- `resolve_blocks`: [True, False]
- `paragraph_break`: uniform(0.01, 0.1)
### EasyOCR
- `text_threshold`: uniform(0.3, 0.9)
- `low_text`: uniform(0.2, 0.6)
- `link_threshold`: uniform(0.2, 0.6)
- `slope_ths`: uniform(0.0, 0.3)
- `ycenter_ths`: uniform(0.3, 1.0)
- `height_ths`: uniform(0.3, 1.0)
- `width_ths`: uniform(0.3, 1.0)
- `add_margin`: uniform(0.0, 0.3)
- `contrast_ths`: uniform(0.05, 0.3)
- `adjust_contrast`: uniform(0.3, 0.8)
- `decoder`: ["greedy", "beamsearch"]
- `beamWidth`: [3, 5, 7, 10]
- `min_size`: [5, 10, 15, 20]
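For intuition, the mixed continuous/categorical spaces above can be sampled in a few lines of plain Python. This is an illustrative random-search sketch only; the project defines the real spaces with `tune.uniform()`/`tune.choice()` and lets Optuna do the sampling:

```python
import random

# Illustrative stand-in for part of the EasyOCR space above.
SPACE = {
    "text_threshold": lambda: random.uniform(0.3, 0.9),
    "decoder": lambda: random.choice(["greedy", "beamsearch"]),
    "beamWidth": lambda: random.choice([3, 5, 7, 10]),
}

def sample_config():
    """Draw one random configuration from the space."""
    return {name: draw() for name, draw in SPACE.items()}

config = sample_config()
print(config)
```

Optuna's TPE sampler replaces this uniform draw with a model-guided one, but each trial still receives a plain dict like `config`.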
## Output
Results are saved to `src/results/` as CSV files:
- `raytune_paddle_results_YYYYMMDD_HHMMSS.csv`
- `raytune_doctr_results_YYYYMMDD_HHMMSS.csv`
- `raytune_easyocr_results_YYYYMMDD_HHMMSS.csv`
Each row contains:
- Configuration parameters (prefixed with `config/`)
- Metrics: CER, WER, TIME, PAGES, TIME_PER_PAGE
- Worker URL used for the trial
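To pick the best trial from a saved CSV, a pandas one-liner over the `CER` column is enough. The column names below follow the layout described above; the tiny inline frame is a stand-in for `pd.read_csv(...)` on a real results file:

```python
import pandas as pd

# Stand-in for pd.read_csv("results/raytune_paddle_results_<timestamp>.csv").
df = pd.DataFrame({
    "CER": [0.12, 0.05, 0.09],
    "WER": [0.30, 0.18, 0.25],
    "config/text_det_thresh": [0.10, 0.30, 0.50],
})

best = df.loc[df["CER"].idxmin()]  # lowest character error rate wins
print(best["config/text_det_thresh"])  # → 0.3
```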
## Network Mode
The raytune container uses `network_mode: host` to access OCR services on localhost ports:
- PaddleOCR: port 8002
- DocTR: port 8003
- EasyOCR: port 8002 (conflicts with PaddleOCR)
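The readiness polling that `check_workers` runs against these ports boils down to one JSON GET per worker. Here is a self-contained sketch that uses a throwaway stdlib HTTP server as a stand-in for a real OCR container; the `/health` payload shape mirrors what `raytune_ocr.py` expects:

```python
import json
import threading
import urllib.request
from http.server import BaseHTTPRequestHandler, HTTPServer

class FakeHealth(BaseHTTPRequestHandler):
    """Stand-in /health endpoint mimicking an OCR worker."""
    def do_GET(self):
        body = json.dumps({"status": "ok", "model_loaded": True}).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(body)
    def log_message(self, *args):  # keep the example quiet
        pass

server = HTTPServer(("localhost", 0), FakeHealth)  # port 0 = any free port
threading.Thread(target=server.serve_forever, daemon=True).start()

url = f"http://localhost:{server.server_port}"
health = json.loads(urllib.request.urlopen(f"{url}/health", timeout=10).read())
ready = health.get("status") == "ok" and health.get("model_loaded", False)
print(ready)  # → True
server.shutdown()
```

Against the real containers the URL would be `http://localhost:8002/health` or `:8003/health`, and `check_workers` simply retries this request until the worker reports ready or the timeout expires.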
## Dependencies
- ray[tune]==2.52.1
- optuna==4.7.0
- requests>=2.28.0
- pandas>=2.0.0

src/raytune/raytune_ocr.py Normal file

@@ -0,0 +1,371 @@
# raytune_ocr.py
# Shared Ray Tune utilities for OCR hyperparameter optimization
#
# Usage:
# from raytune_ocr import check_workers, create_trainable, run_tuner, analyze_results
#
# Environment variables:
# OCR_HOST: Host for OCR services (default: localhost)
import os
from datetime import datetime
from typing import List, Dict, Any, Callable
import requests
import pandas as pd
import ray
from ray import tune
from ray.tune.search.optuna import OptunaSearch
def check_workers(
ports: List[int],
service_name: str = "OCR",
timeout: int = 180,
interval: int = 5,
) -> List[str]:
"""
Wait for workers to be fully ready (model + dataset loaded) and return healthy URLs.
Args:
ports: List of port numbers to check
service_name: Name for error messages
timeout: Max seconds to wait for each worker
interval: Seconds between retries
Returns:
List of healthy worker URLs
Raises:
RuntimeError if no healthy workers found after timeout
"""
import time
host = os.environ.get("OCR_HOST", "localhost")
worker_urls = [f"http://{host}:{port}" for port in ports]
healthy_workers = []
for url in worker_urls:
print(f"Waiting for {url}...")
start = time.time()
while time.time() - start < timeout:
try:
health = requests.get(f"{url}/health", timeout=10).json()
model_ok = health.get('model_loaded', False)
dataset_ok = health.get('dataset_loaded', False)
                if health.get('status') == 'ok' and model_ok and dataset_ok:
                    gpu = health.get('gpu_name', 'CPU')
                    print(f"{url}: ready ({gpu})")
                    healthy_workers.append(url)
                    break
                elapsed = int(time.time() - start)
                print(f"  [{elapsed}s] model={model_ok} dataset={dataset_ok}")
except requests.exceptions.RequestException:
elapsed = int(time.time() - start)
print(f" [{elapsed}s] not reachable")
time.sleep(interval)
else:
print(f"{url}: timeout after {timeout}s")
if not healthy_workers:
raise RuntimeError(
f"No healthy {service_name} workers found.\n"
f"Checked ports: {ports}"
)
print(f"\n{len(healthy_workers)}/{len(worker_urls)} workers ready\n")
return healthy_workers
def create_trainable(ports: List[int], payload_fn: Callable[[Dict], Dict]) -> Callable:
"""
Factory to create a trainable function for Ray Tune.
Args:
ports: List of worker ports for load balancing
payload_fn: Function that takes config dict and returns API payload dict
Returns:
Trainable function for Ray Tune
Note:
Ray Tune 2.x API: tune.report(metrics_dict) - pass dict directly, NOT kwargs.
See: https://docs.ray.io/en/latest/tune/api/doc/ray.tune.report.html
"""
def trainable(config):
import os
import random
import requests
from ray.tune import report # Ray 2.x: report(dict), not report(**kwargs)
host = os.environ.get("OCR_HOST", "localhost")
api_url = f"http://{host}:{random.choice(ports)}"
payload = payload_fn(config)
try:
response = requests.post(f"{api_url}/evaluate", json=payload, timeout=None)
response.raise_for_status()
metrics = response.json()
metrics["worker"] = api_url
report(metrics) # Ray 2.x API: pass dict directly
except Exception as e:
report({ # Ray 2.x API: pass dict directly
"CER": 1.0,
"WER": 1.0,
"TIME": 0.0,
"PAGES": 0,
"TIME_PER_PAGE": 0,
"worker": api_url,
"ERROR": str(e)[:500]
})
return trainable
def run_tuner(
trainable: Callable,
search_space: Dict[str, Any],
num_samples: int = 64,
num_workers: int = 1,
metric: str = "CER",
mode: str = "min",
) -> tune.ResultGrid:
"""
Initialize Ray and run hyperparameter tuning.
Args:
trainable: Trainable function from create_trainable()
search_space: Dict of parameter names to tune.* search spaces
num_samples: Number of trials to run
num_workers: Max concurrent trials
metric: Metric to optimize
mode: "min" or "max"
Returns:
Ray Tune ResultGrid
"""
ray.init(
ignore_reinit_error=True,
include_dashboard=False,
configure_logging=False,
_metrics_export_port=0, # Disable metrics export to avoid connection warnings
)
print(f"Ray Tune ready (version: {ray.__version__})")
tuner = tune.Tuner(
trainable,
tune_config=tune.TuneConfig(
metric=metric,
mode=mode,
search_alg=OptunaSearch(),
num_samples=num_samples,
max_concurrent_trials=num_workers,
),
param_space=search_space,
)
return tuner.fit()
def analyze_results(
results: tune.ResultGrid,
output_folder: str = "results",
prefix: str = "raytune",
config_keys: List[str] = None,
) -> pd.DataFrame:
"""
Analyze and save tuning results.
Args:
results: Ray Tune ResultGrid
output_folder: Directory to save CSV
prefix: Filename prefix
config_keys: List of config keys to show in best result (without 'config/' prefix)
Returns:
Results DataFrame
"""
os.makedirs(output_folder, exist_ok=True)
df = results.get_dataframe()
# Save to CSV
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{prefix}_results_{timestamp}.csv"
filepath = os.path.join(output_folder, filename)
df.to_csv(filepath, index=False)
print(f"Results saved: {filepath}")
# Best configuration
best = df.loc[df["CER"].idxmin()]
print(f"\nBest CER: {best['CER']:.6f}")
print(f"Best WER: {best['WER']:.6f}")
if config_keys:
print(f"\nOptimal Configuration:")
for key in config_keys:
col = f"config/{key}"
if col in best:
val = best[col]
if isinstance(val, float):
print(f" {key}: {val:.4f}")
else:
print(f" {key}: {val}")
return df
def correlation_analysis(df: pd.DataFrame, param_keys: List[str]) -> None:
"""
Print correlation of numeric parameters with CER/WER.
Args:
df: Results DataFrame
param_keys: List of config keys (without 'config/' prefix)
"""
param_cols = [f"config/{k}" for k in param_keys if f"config/{k}" in df.columns]
numeric_cols = [c for c in param_cols if df[c].dtype in ['float64', 'int64']]
if not numeric_cols:
print("No numeric parameters for correlation analysis")
return
corr_cer = df[numeric_cols + ["CER"]].corr()["CER"].sort_values(ascending=False)
corr_wer = df[numeric_cols + ["WER"]].corr()["WER"].sort_values(ascending=False)
print("Correlation with CER:")
print(corr_cer)
print("\nCorrelation with WER:")
print(corr_wer)
# =============================================================================
# OCR-specific payload functions
# =============================================================================
def paddle_ocr_payload(config: Dict) -> Dict:
"""Create payload for PaddleOCR API. Uses pages 5-10 (first doc) for tuning."""
return {
"pdf_folder": "/app/dataset",
"use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
"use_doc_unwarping": config.get("use_doc_unwarping", False),
"textline_orientation": config.get("textline_orientation", True),
"text_det_thresh": config.get("text_det_thresh", 0.0),
"text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
"text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
"text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
"start_page": 5,
"end_page": 10,
"save_output": False,
}
def doctr_payload(config: Dict) -> Dict:
"""Create payload for DocTR API. Uses pages 5-10 (first doc) for tuning."""
return {
"pdf_folder": "/app/dataset",
"assume_straight_pages": config.get("assume_straight_pages", True),
"straighten_pages": config.get("straighten_pages", False),
"preserve_aspect_ratio": config.get("preserve_aspect_ratio", True),
"symmetric_pad": config.get("symmetric_pad", True),
"disable_page_orientation": config.get("disable_page_orientation", False),
"disable_crop_orientation": config.get("disable_crop_orientation", False),
"resolve_lines": config.get("resolve_lines", True),
"resolve_blocks": config.get("resolve_blocks", False),
"paragraph_break": config.get("paragraph_break", 0.035),
"start_page": 5,
"end_page": 10,
"save_output": False,
}
def easyocr_payload(config: Dict) -> Dict:
"""Create payload for EasyOCR API. Uses pages 5-10 (first doc) for tuning."""
return {
"pdf_folder": "/app/dataset",
"text_threshold": config.get("text_threshold", 0.7),
"low_text": config.get("low_text", 0.4),
"link_threshold": config.get("link_threshold", 0.4),
"slope_ths": config.get("slope_ths", 0.1),
"ycenter_ths": config.get("ycenter_ths", 0.5),
"height_ths": config.get("height_ths", 0.5),
"width_ths": config.get("width_ths", 0.5),
"add_margin": config.get("add_margin", 0.1),
"contrast_ths": config.get("contrast_ths", 0.1),
"adjust_contrast": config.get("adjust_contrast", 0.5),
"decoder": config.get("decoder", "greedy"),
"beamWidth": config.get("beamWidth", 5),
"min_size": config.get("min_size", 10),
"start_page": 5,
"end_page": 10,
"save_output": False,
}
# =============================================================================
# Search spaces
# =============================================================================
PADDLE_OCR_SEARCH_SPACE = {
"use_doc_orientation_classify": tune.choice([True, False]),
"use_doc_unwarping": tune.choice([True, False]),
"textline_orientation": tune.choice([True, False]),
"text_det_thresh": tune.uniform(0.0, 0.7),
"text_det_box_thresh": tune.uniform(0.0, 0.7),
"text_det_unclip_ratio": tune.choice([0.0]),
"text_rec_score_thresh": tune.uniform(0.0, 0.7),
}
DOCTR_SEARCH_SPACE = {
"assume_straight_pages": tune.choice([True, False]),
"straighten_pages": tune.choice([True, False]),
"preserve_aspect_ratio": tune.choice([True, False]),
"symmetric_pad": tune.choice([True, False]),
"disable_page_orientation": tune.choice([True, False]),
"disable_crop_orientation": tune.choice([True, False]),
"resolve_lines": tune.choice([True, False]),
"resolve_blocks": tune.choice([True, False]),
"paragraph_break": tune.uniform(0.01, 0.1),
}
EASYOCR_SEARCH_SPACE = {
"text_threshold": tune.uniform(0.3, 0.9),
"low_text": tune.uniform(0.2, 0.6),
"link_threshold": tune.uniform(0.2, 0.6),
"slope_ths": tune.uniform(0.0, 0.3),
"ycenter_ths": tune.uniform(0.3, 1.0),
"height_ths": tune.uniform(0.3, 1.0),
"width_ths": tune.uniform(0.3, 1.0),
"add_margin": tune.uniform(0.0, 0.3),
"contrast_ths": tune.uniform(0.05, 0.3),
"adjust_contrast": tune.uniform(0.3, 0.8),
"decoder": tune.choice(["greedy", "beamsearch"]),
"beamWidth": tune.choice([3, 5, 7, 10]),
"min_size": tune.choice([5, 10, 15, 20]),
}
# =============================================================================
# Config keys for results display
# =============================================================================
PADDLE_OCR_CONFIG_KEYS = [
"use_doc_orientation_classify", "use_doc_unwarping", "textline_orientation",
"text_det_thresh", "text_det_box_thresh", "text_det_unclip_ratio", "text_rec_score_thresh",
]
DOCTR_CONFIG_KEYS = [
"assume_straight_pages", "straighten_pages", "preserve_aspect_ratio", "symmetric_pad",
"disable_page_orientation", "disable_crop_orientation", "resolve_lines", "resolve_blocks",
"paragraph_break",
]
EASYOCR_CONFIG_KEYS = [
"text_threshold", "low_text", "link_threshold", "slope_ths", "ycenter_ths",
"height_ths", "width_ths", "add_margin", "contrast_ths", "adjust_contrast",
"decoder", "beamWidth", "min_size",
]

src/raytune/requirements.txt Normal file

@@ -0,0 +1,4 @@
ray[tune]==2.52.1
optuna==4.7.0
requests>=2.28.0
pandas>=2.0.0

src/raytune/run_tuning.py Normal file

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""Run hyperparameter tuning for OCR services."""
import os
import sys
import argparse
from raytune_ocr import (
check_workers, create_trainable, run_tuner, analyze_results,
paddle_ocr_payload, doctr_payload, easyocr_payload,
PADDLE_OCR_SEARCH_SPACE, DOCTR_SEARCH_SPACE, EASYOCR_SEARCH_SPACE,
PADDLE_OCR_CONFIG_KEYS, DOCTR_CONFIG_KEYS, EASYOCR_CONFIG_KEYS,
)
SERVICES = {
"paddle": {
"payload_fn": paddle_ocr_payload,
"search_space": PADDLE_OCR_SEARCH_SPACE,
"config_keys": PADDLE_OCR_CONFIG_KEYS,
"name": "PaddleOCR",
},
"doctr": {
"payload_fn": doctr_payload,
"search_space": DOCTR_SEARCH_SPACE,
"config_keys": DOCTR_CONFIG_KEYS,
"name": "DocTR",
},
"easyocr": {
"payload_fn": easyocr_payload,
"search_space": EASYOCR_SEARCH_SPACE,
"config_keys": EASYOCR_CONFIG_KEYS,
"name": "EasyOCR",
},
}
def main():
parser = argparse.ArgumentParser(description="Run OCR hyperparameter tuning")
parser.add_argument("--service", choices=["paddle", "doctr", "easyocr"], required=True)
parser.add_argument("--host", type=str, default="localhost", help="OCR service host")
parser.add_argument("--port", type=int, default=8000, help="OCR service port")
parser.add_argument("--samples", type=int, default=64, help="Number of samples")
args = parser.parse_args()
# Set environment variable for raytune_ocr module
os.environ["OCR_HOST"] = args.host
cfg = SERVICES[args.service]
ports = [args.port]
print(f"\n{'='*50}")
print(f"Hyperparameter Tuning: {cfg['name']}")
print(f"Host: {args.host}:{args.port}")
print(f"Samples: {args.samples}")
print(f"{'='*50}\n")
# Check workers
healthy = check_workers(ports, cfg["name"])
# Create trainable and run tuning
trainable = create_trainable(ports, cfg["payload_fn"])
results = run_tuner(
trainable=trainable,
search_space=cfg["search_space"],
num_samples=args.samples,
num_workers=len(healthy),
)
# Analyze results
df = analyze_results(
results,
output_folder="results",
prefix=f"raytune_{args.service}",
config_keys=cfg["config_keys"],
)
print(f"\n{'='*50}")
print("Tuning complete!")
print(f"{'='*50}")
if __name__ == "__main__":
main()