remove unneeded py file
All checks were successful
build_docker / manifest_easyocr_gpu (pull_request) Successful in 22s
build_docker / build_doctr_gpu (linux/arm64) (pull_request) Successful in 52m54s
build_docker / manifest_doctr_gpu (pull_request) Successful in 22s
build_docker / essential (pull_request) Successful in 1s
build_docker / build_cpu (linux/amd64) (pull_request) Successful in 5m4s
build_docker / build_cpu (linux/arm64) (pull_request) Successful in 22m30s
build_docker / build_gpu (linux/amd64) (pull_request) Successful in 21m10s
build_docker / build_easyocr (linux/amd64) (pull_request) Successful in 14m38s
build_docker / build_gpu (linux/arm64) (pull_request) Successful in 19m36s
build_docker / build_doctr (linux/amd64) (pull_request) Successful in 15m11s
build_docker / build_easyocr (linux/arm64) (pull_request) Successful in 19m55s
build_docker / build_easyocr_gpu (linux/amd64) (pull_request) Successful in 16m53s
build_docker / build_doctr (linux/arm64) (pull_request) Successful in 22m41s
build_docker / build_doctr_gpu (linux/amd64) (pull_request) Successful in 16m30s
build_docker / build_easyocr_gpu (linux/arm64) (pull_request) Successful in 54m14s
build_docker / manifest_cpu (pull_request) Successful in 23s
build_docker / manifest_gpu (pull_request) Successful in 24s
build_docker / manifest_easyocr (pull_request) Successful in 21s
build_docker / manifest_doctr (pull_request) Successful in 21s
All checks were successful
build_docker / manifest_easyocr_gpu (pull_request) Successful in 22s
build_docker / build_doctr_gpu (linux/arm64) (pull_request) Successful in 52m54s
build_docker / manifest_doctr_gpu (pull_request) Successful in 22s
build_docker / essential (pull_request) Successful in 1s
build_docker / build_cpu (linux/amd64) (pull_request) Successful in 5m4s
build_docker / build_cpu (linux/arm64) (pull_request) Successful in 22m30s
build_docker / build_gpu (linux/amd64) (pull_request) Successful in 21m10s
build_docker / build_easyocr (linux/amd64) (pull_request) Successful in 14m38s
build_docker / build_gpu (linux/arm64) (pull_request) Successful in 19m36s
build_docker / build_doctr (linux/amd64) (pull_request) Successful in 15m11s
build_docker / build_easyocr (linux/arm64) (pull_request) Successful in 19m55s
build_docker / build_easyocr_gpu (linux/amd64) (pull_request) Successful in 16m53s
build_docker / build_doctr (linux/arm64) (pull_request) Successful in 22m41s
build_docker / build_doctr_gpu (linux/amd64) (pull_request) Successful in 16m30s
build_docker / build_easyocr_gpu (linux/arm64) (pull_request) Successful in 54m14s
build_docker / manifest_cpu (pull_request) Successful in 23s
build_docker / manifest_gpu (pull_request) Successful in 24s
build_docker / manifest_easyocr (pull_request) Successful in 21s
build_docker / manifest_doctr (pull_request) Successful in 21s
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -188,62 +188,7 @@
|
|||||||
"id": "trainable",
|
"id": "trainable",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import tune\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n tune.report(**metrics)\n except Exception as e:\n tune.report(\n CER=1.0,\n WER=1.0,\n TIME=0.0,\n PAGES=0,\n TIME_PER_PAGE=0,\n worker=api_url,\n ERROR=str(e)[:500]\n )"
|
||||||
"def trainable_paddle_ocr(config):\n",
|
|
||||||
" \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\n",
|
|
||||||
" \n",
|
|
||||||
" Uses trial index to deterministically assign a worker (round-robin),\n",
|
|
||||||
" ensuring only 1 request per container at a time.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" import requests # Must be inside function for Ray workers\n",
|
|
||||||
" from ray import train\n",
|
|
||||||
"\n",
|
|
||||||
" # Worker URLs - round-robin assignment based on trial index\n",
|
|
||||||
" WORKER_PORTS = [8001, 8002]\n",
|
|
||||||
" NUM_WORKERS = len(WORKER_PORTS)\n",
|
|
||||||
" \n",
|
|
||||||
" # Get trial context for deterministic worker assignment\n",
|
|
||||||
" context = train.get_context()\n",
|
|
||||||
" trial_id = context.get_trial_id() if context else \"0\"\n",
|
|
||||||
" # Extract numeric part from trial ID (e.g., \"trainable_paddle_ocr_abc123_00001\" -> 1)\n",
|
|
||||||
" try:\n",
|
|
||||||
" trial_num = int(trial_id.split(\"_\")[-1])\n",
|
|
||||||
" except (ValueError, IndexError):\n",
|
|
||||||
" trial_num = hash(trial_id)\n",
|
|
||||||
" \n",
|
|
||||||
" worker_idx = trial_num % NUM_WORKERS\n",
|
|
||||||
" api_url = f\"http://localhost:{WORKER_PORTS[worker_idx]}\"\n",
|
|
||||||
"\n",
|
|
||||||
" payload = {\n",
|
|
||||||
" \"pdf_folder\": \"/app/dataset\",\n",
|
|
||||||
" \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n",
|
|
||||||
" \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n",
|
|
||||||
" \"textline_orientation\": config.get(\"textline_orientation\", True),\n",
|
|
||||||
" \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n",
|
|
||||||
" \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n",
|
|
||||||
" \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n",
|
|
||||||
" \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n",
|
|
||||||
" \"start_page\": 5,\n",
|
|
||||||
" \"end_page\": 10,\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" try:\n",
|
|
||||||
" response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None) # No timeout\n",
|
|
||||||
" response.raise_for_status()\n",
|
|
||||||
" metrics = response.json()\n",
|
|
||||||
" metrics[\"worker\"] = api_url\n",
|
|
||||||
" train.report(metrics)\n",
|
|
||||||
" except Exception as e:\n",
|
|
||||||
" train.report({\n",
|
|
||||||
" \"CER\": 1.0,\n",
|
|
||||||
" \"WER\": 1.0,\n",
|
|
||||||
" \"TIME\": 0.0,\n",
|
|
||||||
" \"PAGES\": 0,\n",
|
|
||||||
" \"TIME_PER_PAGE\": 0,\n",
|
|
||||||
" \"worker\": api_url,\n",
|
|
||||||
" \"ERROR\": str(e)[:500]\n",
|
|
||||||
" })"
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
|||||||
@@ -1,124 +0,0 @@
|
|||||||
#!/usr/bin/env python3
"""Ray Tune hyperparameter search for PaddleOCR via REST API."""

import os
from datetime import datetime

import requests
import pandas as pd

import ray
from ray import tune, train, air
from ray.tune.search.optuna import OptunaSearch

# Configuration: one REST worker per port; results land in OUTPUT_FOLDER.
WORKER_PORTS = [8001, 8002]
OUTPUT_FOLDER = "results"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Hyperparameter search space for the OCR pipeline.
search_space = {
    "use_doc_orientation_classify": tune.choice([True, False]),
    "use_doc_unwarping": tune.choice([True, False]),
    "textline_orientation": tune.choice([True, False]),
    "text_det_thresh": tune.uniform(0.0, 0.7),
    "text_det_box_thresh": tune.uniform(0.0, 0.7),
    "text_det_unclip_ratio": tune.choice([0.0]),
    "text_rec_score_thresh": tune.uniform(0.0, 0.7),
}
|
||||||
def trainable_paddle_ocr(config):
    """Call the PaddleOCR REST API with the given hyperparameter config.

    The trial index deterministically selects a worker (round-robin), so
    each container handles at most one request at a time.
    """
    import requests  # imported inside the function so Ray workers resolve it
    from ray import train

    pool_size = len(WORKER_PORTS)

    # Derive a stable trial number from the trial id for round-robin
    # worker assignment (e.g. "..._00001" -> 1); fall back to a hash.
    ctx = train.get_context()
    tid = ctx.get_trial_id() if ctx else "0"
    try:
        seq = int(tid.split("_")[-1])
    except (ValueError, IndexError):
        seq = hash(tid)

    api_url = f"http://localhost:{WORKER_PORTS[seq % pool_size]}"

    payload = {
        "pdf_folder": "/app/dataset",
        "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
        "use_doc_unwarping": config.get("use_doc_unwarping", False),
        "textline_orientation": config.get("textline_orientation", True),
        "text_det_thresh": config.get("text_det_thresh", 0.0),
        "text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
        "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
        "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
        "start_page": 5,
        "end_page": 10,
    }

    try:
        resp = requests.post(f"{api_url}/evaluate", json=payload, timeout=None)  # no timeout
        resp.raise_for_status()
        result = resp.json()
        result["worker"] = api_url
        train.report(result)
    except Exception as exc:
        # Report worst-case sentinel metrics so the failed trial is still recorded.
        train.report({
            "CER": 1.0,
            "WER": 1.0,
            "TIME": 0.0,
            "PAGES": 0,
            "TIME_PER_PAGE": 0,
            "worker": api_url,
            "ERROR": str(exc)[:500]
        })
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Health-check the workers, run the tuning sweep, and save results."""
    # Verify every worker answers its /health endpoint before launching.
    print("Checking workers...")
    for port in WORKER_PORTS:
        try:
            resp = requests.get(f"http://localhost:{port}/health", timeout=10)
            print(f" Worker {port}: {resp.json().get('status', 'unknown')}")
        except Exception as err:
            print(f" Worker {port}: ERROR - {err}")

    print("\nStarting Ray Tune...")
    ray.init(ignore_reinit_error=True)

    # Minimize character error rate; concurrency is capped at the worker count
    # so each worker serves at most one trial at a time.
    tuner = tune.Tuner(
        trainable_paddle_ocr,
        tune_config=tune.TuneConfig(
            metric="CER",
            mode="min",
            search_alg=OptunaSearch(),
            num_samples=64,
            max_concurrent_trials=len(WORKER_PORTS),
        ),
        run_config=air.RunConfig(verbose=2, log_to_file=True),
        param_space=search_space,
    )

    results = tuner.fit()

    # Persist the full trial dataframe, timestamped to avoid clobbering.
    frame = results.get_dataframe()
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(OUTPUT_FOLDER, f"raytune_paddle_results_{stamp}.csv")
    frame.to_csv(out_path, index=False)
    print(f"\nResults saved: {out_path}")

    # Report the best trial by CER, if any trial produced metrics.
    if len(frame) > 0 and "CER" in frame.columns:
        best = frame.loc[frame["CER"].idxmin()]
        print(f"\nBest CER: {best['CER']:.6f}")
        print(f"Best WER: {best['WER']:.6f}")

    ray.shutdown()


if __name__ == "__main__":
    main()
|
|
||||||
Reference in New Issue
Block a user