src/paddle_ocr_raytune_rest.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "header",
   "metadata": {},
   "source": [
    "# PaddleOCR Hyperparameter Optimization via REST API\n",
    "\n",
    "This notebook runs Ray Tune hyperparameter search calling the PaddleOCR REST API (Docker container).\n",
    "\n",
    "**Benefits:**\n",
    "- No model reload per trial - Model stays loaded in Docker container\n",
    "- Faster trials - Skip ~10s model load time per trial\n",
    "- Cleaner code - REST API replaces subprocess + CLI arg parsing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "prereq",
   "metadata": {},
   "source": [
    "## Prerequisites\n",
    "\n",
    "Start 2 PaddleOCR workers for parallel hyperparameter tuning:\n",
    "\n",
    "```bash\n",
    "cd src/paddle_ocr\n",
    "docker compose -f docker-compose.workers.yml up\n",
    "```\n",
    "\n",
    "This starts 2 GPU workers on ports 8001-8002, allowing 2 concurrent trials.\n",
    "\n",
    "For CPU-only systems:\n",
    "```bash\n",
    "docker compose -f docker-compose.workers.yml --profile cpu up\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ob9fsoilc4",
   "metadata": {},
   "source": [
    "## 0. Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "wyr2nsoj7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies (run once)\n",
    "%pip install -U \"ray[tune]\"\n",
    "%pip install optuna\n",
    "%pip install requests pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "imports-header",
   "metadata": {},
   "source": [
    "## 1. Imports & Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports",
   "metadata": {},
   "outputs": [],
   "source": "import os\nfrom datetime import datetime\n\nimport requests\nimport pandas as pd\n\nimport ray\nfrom ray import tune, train\nfrom ray.tune.search.optuna import OptunaSearch"
  },
  {
   "cell_type": "markdown",
   "id": "config-header",
   "metadata": {},
   "source": [
    "## 2. API Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PaddleOCR REST API endpoints - 2 workers for parallel trials\n",
    "# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\n",
    "WORKER_PORTS = [8001, 8002]\n",
    "WORKER_URLS = [f\"http://localhost:{port}\" for port in WORKER_PORTS]\n",
    "\n",
    "# Output folder for results\n",
    "OUTPUT_FOLDER = \"results\"\n",
    "os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n",
    "\n",
    "# Number of concurrent trials = number of workers\n",
    "NUM_WORKERS = len(WORKER_URLS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "health-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify all workers are running\n",
    "healthy_workers = []\n",
    "for url in WORKER_URLS:\n",
    "    try:\n",
    "        health = requests.get(f\"{url}/health\", timeout=10).json()\n",
    "        if health['status'] == 'ok' and health['model_loaded']:\n",
    "            healthy_workers.append(url)\n",
    "            print(f\"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})\")\n",
    "        else:\n",
    "            print(f\"✗ {url}: not ready yet\")\n",
    "    except requests.exceptions.ConnectionError:\n",
    "        print(f\"✗ {url}: not reachable\")\n",
    "\n",
    "if not healthy_workers:\n",
    "    raise RuntimeError(\n",
    "        \"No healthy workers found. Start them with:\\n\"\n",
    "        \"  cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\"\n",
    "    )\n",
    "\n",
    "print(f\"\\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "search-space-header",
   "metadata": {},
   "source": [
    "## 3. Search Space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "search-space",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_space = {\n",
    "    # Whether to use document image orientation classification\n",
    "    \"use_doc_orientation_classify\": tune.choice([True, False]),\n",
    "    # Whether to use text image unwarping\n",
    "    \"use_doc_unwarping\": tune.choice([True, False]),\n",
    "    # Whether to use text line orientation classification\n",
    "    \"textline_orientation\": tune.choice([True, False]),\n",
    "    # Detection pixel threshold (pixels > threshold are considered text)\n",
    "    \"text_det_thresh\": tune.uniform(0.0, 0.7),\n",
    "    # Detection box threshold (average score within border)\n",
    "    \"text_det_box_thresh\": tune.uniform(0.0, 0.7),\n",
    "    # Text detection expansion coefficient\n",
    "    \"text_det_unclip_ratio\": tune.choice([0.0]),\n",
    "    # Text recognition threshold (filter low confidence results)\n",
    "    \"text_rec_score_thresh\": tune.uniform(0.0, 0.7),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "trainable-header",
   "metadata": {},
   "source": [
    "## 4. Trainable Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "trainable",
   "metadata": {},
   "outputs": [],
   "source": "def trainable_paddle_ocr(config):\n    \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n    import random\n    import requests\n    from ray import train\n\n    # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n    WORKER_PORTS = [8001, 8002]\n    api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n    payload = {\n        \"pdf_folder\": \"/app/dataset\",\n        \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n        \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n        \"textline_orientation\": config.get(\"textline_orientation\", True),\n        \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n        \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n        \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n        \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n        \"start_page\": 5,\n        \"end_page\": 10,\n    }\n\n    try:\n        response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n        response.raise_for_status()\n        metrics = response.json()\n        metrics[\"worker\"] = api_url\n        train.report(metrics)\n    except Exception as e:\n        train.report({\n            \"CER\": 1.0,\n            \"WER\": 1.0,\n            \"TIME\": 0.0,\n            \"PAGES\": 0,\n            \"TIME_PER_PAGE\": 0,\n            \"worker\": api_url,\n            \"ERROR\": str(e)[:500]\n        })"
  },
  {
   "cell_type": "markdown",
   "id": "tuner-header",
   "metadata": {},
   "source": [
    "## 5. Run Tuner"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ray-init",
   "metadata": {},
   "outputs": [],
   "source": [
    "ray.init(ignore_reinit_error=True)\n",
    "print(f\"Ray Tune ready (version: {ray.__version__})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tuner",
   "metadata": {},
   "outputs": [],
   "source": "tuner = tune.Tuner(\n    trainable_paddle_ocr,\n    tune_config=tune.TuneConfig(\n        metric=\"CER\",\n        mode=\"min\",\n        search_alg=OptunaSearch(),\n        num_samples=64,\n        max_concurrent_trials=NUM_WORKERS,  # Run trials in parallel across workers\n    ),\n    param_space=search_space,\n)\n\nresults = tuner.fit()"
  },
  {
   "cell_type": "markdown",
   "id": "analysis-header",
   "metadata": {},
   "source": [
    "## 6. Results Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "results-df",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = results.get_dataframe()\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "save-results",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save results to CSV\n",
    "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
    "filename = f\"raytune_paddle_rest_results_{timestamp}.csv\"\n",
    "filepath = os.path.join(OUTPUT_FOLDER, filename)\n",
    "\n",
    "df.to_csv(filepath, index=False)\n",
    "print(f\"Results saved: {filepath}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "best-config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Best configuration\n",
    "best = df.loc[df[\"CER\"].idxmin()]\n",
    "\n",
    "print(f\"Best CER: {best['CER']:.6f}\")\n",
    "print(f\"Best WER: {best['WER']:.6f}\")\n",
    "print(f\"\\nOptimal Configuration:\")\n",
    "print(f\"  textline_orientation: {best['config/textline_orientation']}\")\n",
    "print(f\"  use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}\")\n",
    "print(f\"  use_doc_unwarping: {best['config/use_doc_unwarping']}\")\n",
    "print(f\"  text_det_thresh: {best['config/text_det_thresh']:.4f}\")\n",
    "print(f\"  text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}\")\n",
    "print(f\"  text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}\")\n",
    "print(f\"  text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "correlation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation analysis\n",
    "param_cols = [\n",
    "    \"config/text_det_thresh\",\n",
    "    \"config/text_det_box_thresh\",\n",
    "    \"config/text_det_unclip_ratio\",\n",
    "    \"config/text_rec_score_thresh\",\n",
    "]\n",
    "\n",
    "corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n",
    "corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n",
    "\n",
    "print(\"Correlation with CER:\")\n",
    "print(corr_cer)\n",
    "print(\"\\nCorrelation with WER:\")\n",
    "print(corr_wer)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
raytune rest 2026-01-18 08:19:34 +01:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"id": "header",`
			`"metadata": {},`
			`"source": [`
			`"# PaddleOCR Hyperparameter Optimization via REST API\n",`
			`"\n",`
			`"This notebook runs Ray Tune hyperparameter search calling the PaddleOCR REST API (Docker container).\n",`
			`"\n",`
			`"Benefits:\n",`
			`"- No model reload per trial - Model stays loaded in Docker container\n",`
			`"- Faster trials - Skip ~10s model load time per trial\n",`
			`"- Cleaner code - REST API replaces subprocess + CLI arg parsing"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "prereq",`
			`"metadata": {},`
			`"source": [`
			`"## Prerequisites\n",`
			`"\n",`
			`"Start 2 PaddleOCR workers for parallel hyperparameter tuning:\n",`
			`"\n",`
			"```bash\n",
			`"cd src/paddle_ocr\n",`
			`"docker compose -f docker-compose.workers.yml up\n",`
			"```\n",
			`"\n",`
			`"This starts 2 GPU workers on ports 8001-8002, allowing 2 concurrent trials.\n",`
			`"\n",`
			`"For CPU-only systems:\n",`
			"```bash\n",
			`"docker compose -f docker-compose.workers.yml --profile cpu up\n",`
			"```"
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "3ob9fsoilc4",`
			`"metadata": {},`
			`"source": [`
			`"## 0. Dependencies"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "wyr2nsoj7",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Install dependencies (run once)\n",`
			`"%pip install -U \"ray[tune]\"\n",`
			`"%pip install optuna\n",`
			`"%pip install requests pandas"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "imports-header",`
			`"metadata": {},`
			`"source": [`
			`"## 1. Imports & Setup"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "imports",`
			`"metadata": {},`
			`"outputs": [],`
lock model 2026-01-18 17:38:42 +01:00			`"source": "import os\nfrom datetime import datetime\n\nimport requests\nimport pandas as pd\n\nimport ray\nfrom ray import tune, train\nfrom ray.tune.search.optuna import OptunaSearch"`
raytune rest 2026-01-18 08:19:34 +01:00			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "config-header",`
			`"metadata": {},`
			`"source": [`
			`"## 2. API Configuration"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "config",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# PaddleOCR REST API endpoints - 2 workers for parallel trials\n",`
			`"# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\n",`
			`"WORKER_PORTS = [8001, 8002]\n",`
			`"WORKER_URLS = [f\"http://localhost:{port}\" for port in WORKER_PORTS]\n",`
			`"\n",`
			`"# Output folder for results\n",`
			`"OUTPUT_FOLDER = \"results\"\n",`
			`"os.makedirs(OUTPUT_FOLDER, exist_ok=True)\n",`
			`"\n",`
			`"# Number of concurrent trials = number of workers\n",`
			`"NUM_WORKERS = len(WORKER_URLS)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "health-check",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Verify all workers are running\n",`
			`"healthy_workers = []\n",`
			`"for url in WORKER_URLS:\n",`
			`" try:\n",`
			`" health = requests.get(f\"{url}/health\", timeout=10).json()\n",`
			`" if health['status'] == 'ok' and health['model_loaded']:\n",`
			`" healthy_workers.append(url)\n",`
			`" print(f\"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})\")\n",`
			`" else:\n",`
			`" print(f\"✗ {url}: not ready yet\")\n",`
			`" except requests.exceptions.ConnectionError:\n",`
			`" print(f\"✗ {url}: not reachable\")\n",`
			`"\n",`
			`"if not healthy_workers:\n",`
			`" raise RuntimeError(\n",`
			`" \"No healthy workers found. Start them with:\\n\"\n",`
			`" \" cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up\"\n",`
			`" )\n",`
			`"\n",`
			`"print(f\"\\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning\")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "search-space-header",`
			`"metadata": {},`
			`"source": [`
			`"## 3. Search Space"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "search-space",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"search_space = {\n",`
			`" # Whether to use document image orientation classification\n",`
			`" \"use_doc_orientation_classify\": tune.choice([True, False]),\n",`
			`" # Whether to use text image unwarping\n",`
			`" \"use_doc_unwarping\": tune.choice([True, False]),\n",`
			`" # Whether to use text line orientation classification\n",`
			`" \"textline_orientation\": tune.choice([True, False]),\n",`
			`" # Detection pixel threshold (pixels > threshold are considered text)\n",`
			`" \"text_det_thresh\": tune.uniform(0.0, 0.7),\n",`
			`" # Detection box threshold (average score within border)\n",`
			`" \"text_det_box_thresh\": tune.uniform(0.0, 0.7),\n",`
			`" # Text detection expansion coefficient\n",`
			`" \"text_det_unclip_ratio\": tune.choice([0.0]),\n",`
			`" # Text recognition threshold (filter low confidence results)\n",`
			`" \"text_rec_score_thresh\": tune.uniform(0.0, 0.7),\n",`
			`"}"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "trainable-header",`
			`"metadata": {},`
			`"source": [`
			`"## 4. Trainable Function"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "trainable",`
			`"metadata": {},`
			`"outputs": [],`
lock model 2026-01-18 17:38:42 +01:00			"source": "def trainable_paddle_ocr(config):\n \"\"\"Call PaddleOCR REST API with the given hyperparameter config.\"\"\"\n import random\n import requests\n from ray import train\n\n # Worker URLs - random selection (load balances with 2 workers, 2 concurrent trials)\n WORKER_PORTS = [8001, 8002]\n api_url = f\"http://localhost:{random.choice(WORKER_PORTS)}\"\n\n payload = {\n \"pdf_folder\": \"/app/dataset\",\n \"use_doc_orientation_classify\": config.get(\"use_doc_orientation_classify\", False),\n \"use_doc_unwarping\": config.get(\"use_doc_unwarping\", False),\n \"textline_orientation\": config.get(\"textline_orientation\", True),\n \"text_det_thresh\": config.get(\"text_det_thresh\", 0.0),\n \"text_det_box_thresh\": config.get(\"text_det_box_thresh\", 0.0),\n \"text_det_unclip_ratio\": config.get(\"text_det_unclip_ratio\", 1.5),\n \"text_rec_score_thresh\": config.get(\"text_rec_score_thresh\", 0.0),\n \"start_page\": 5,\n \"end_page\": 10,\n }\n\n try:\n response = requests.post(f\"{api_url}/evaluate\", json=payload, timeout=None)\n response.raise_for_status()\n metrics = response.json()\n metrics[\"worker\"] = api_url\n train.report(metrics)\n except Exception as e:\n train.report({\n \"CER\": 1.0,\n \"WER\": 1.0,\n \"TIME\": 0.0,\n \"PAGES\": 0,\n \"TIME_PER_PAGE\": 0,\n \"worker\": api_url,\n \"ERROR\": str(e)[:500]\n })"
raytune rest 2026-01-18 08:19:34 +01:00			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "tuner-header",`
			`"metadata": {},`
			`"source": [`
			`"## 5. Run Tuner"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "ray-init",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"ray.init(ignore_reinit_error=True)\n",`
			`"print(f\"Ray Tune ready (version: {ray.__version__})\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "tuner",`
			`"metadata": {},`
			`"outputs": [],`
lock model 2026-01-18 17:38:42 +01:00			`"source": "tuner = tune.Tuner(\n trainable_paddle_ocr,\n tune_config=tune.TuneConfig(\n metric=\"CER\",\n mode=\"min\",\n search_alg=OptunaSearch(),\n num_samples=64,\n max_concurrent_trials=NUM_WORKERS, # Run trials in parallel across workers\n ),\n param_space=search_space,\n)\n\nresults = tuner.fit()"`
raytune rest 2026-01-18 08:19:34 +01:00			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "analysis-header",`
			`"metadata": {},`
			`"source": [`
			`"## 6. Results Analysis"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "results-df",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"df = results.get_dataframe()\n",`
			`"df.describe()"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "save-results",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Save results to CSV\n",`
			`"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",`
			`"filename = f\"raytune_paddle_rest_results_{timestamp}.csv\"\n",`
			`"filepath = os.path.join(OUTPUT_FOLDER, filename)\n",`
			`"\n",`
			`"df.to_csv(filepath, index=False)\n",`
			`"print(f\"Results saved: {filepath}\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "best-config",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Best configuration\n",`
			`"best = df.loc[df[\"CER\"].idxmin()]\n",`
			`"\n",`
			`"print(f\"Best CER: {best['CER']:.6f}\")\n",`
			`"print(f\"Best WER: {best['WER']:.6f}\")\n",`
			`"print(f\"\\nOptimal Configuration:\")\n",`
			`"print(f\" textline_orientation: {best['config/textline_orientation']}\")\n",`
			`"print(f\" use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}\")\n",`
			`"print(f\" use_doc_unwarping: {best['config/use_doc_unwarping']}\")\n",`
			`"print(f\" text_det_thresh: {best['config/text_det_thresh']:.4f}\")\n",`
			`"print(f\" text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}\")\n",`
			`"print(f\" text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}\")\n",`
			`"print(f\" text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "correlation",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Correlation analysis\n",`
			`"param_cols = [\n",`
			`" \"config/text_det_thresh\",\n",`
			`" \"config/text_det_box_thresh\",\n",`
			`" \"config/text_det_unclip_ratio\",\n",`
			`" \"config/text_rec_score_thresh\",\n",`
			`"]\n",`
			`"\n",`
			`"corr_cer = df[param_cols + [\"CER\"]].corr()[\"CER\"].sort_values(ascending=False)\n",`
			`"corr_wer = df[param_cols + [\"WER\"]].corr()[\"WER\"].sort_values(ascending=False)\n",`
			`"\n",`
			`"print(\"Correlation with CER:\")\n",`
			`"print(corr_cer)\n",`
			`"print(\"\\nCorrelation with WER:\")\n",`
			`"print(corr_wer)"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": ".venv",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.12.3"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
remove unneded py file 2026-01-18 08:24:23 +01:00			`}`