# --- Cell: dependency install (run once) ---
# TODO(review): pin versions (e.g. "ray[tune]==2.x.y", "optuna==3.x") so the
# notebook is reproducible across environments.
%pip install -U "ray[tune]"
%pip install optuna
%pip install requests pandas

# --- Cell: imports ---
# All imports in one cell: stdlib, then third-party, then Ray.
import os
from datetime import datetime

import requests
import pandas as pd

import ray
from ray import tune, train
from ray.tune.search.optuna import OptunaSearch
# --- Cell: API configuration ---
# PaddleOCR REST API endpoints - 2 workers for parallel trials
# Start workers with: cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up
WORKER_PORTS = [8001, 8002]
WORKER_URLS = [f"http://localhost:{port}" for port in WORKER_PORTS]

# Output folder for results
OUTPUT_FOLDER = "results"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Number of concurrent trials = number of workers
NUM_WORKERS = len(WORKER_URLS)

# --- Cell: health check ---
# Verify every worker answers /health with a loaded model before tuning.
healthy_workers = []
for url in WORKER_URLS:
    try:
        health = requests.get(f"{url}/health", timeout=10).json()
        # .get() instead of [] so a malformed /health payload counts as
        # "not ready" rather than raising KeyError.
        if health.get('status') == 'ok' and health.get('model_loaded'):
            healthy_workers.append(url)
            print(f"✓ {url}: {health['status']} (GPU: {health.get('gpu_name', 'N/A')})")
        else:
            print(f"✗ {url}: not ready yet")
    except (requests.exceptions.RequestException, ValueError) as exc:
        # RequestException covers ConnectionError AND Timeout (a timeout=10 is
        # set above, so Timeout is reachable); ValueError covers a non-JSON
        # response body from a half-started container.
        print(f"✗ {url}: not reachable ({type(exc).__name__})")

if not healthy_workers:
    raise RuntimeError(
        "No healthy workers found. Start them with:\n"
        " cd src/paddle_ocr && docker compose -f docker-compose.workers.yml up"
    )

print(f"\n{len(healthy_workers)}/{len(WORKER_URLS)} workers ready for parallel tuning")
# Hyperparameter search space for Ray Tune / Optuna.
search_space = {
    # Whether to use document image orientation classification
    "use_doc_orientation_classify": tune.choice([True, False]),
    # Whether to use text image unwarping
    "use_doc_unwarping": tune.choice([True, False]),
    # Whether to use text line orientation classification
    "textline_orientation": tune.choice([True, False]),
    # Detection pixel threshold (pixels > threshold are considered text)
    "text_det_thresh": tune.uniform(0.0, 0.7),
    # Detection box threshold (average score within border)
    "text_det_box_thresh": tune.uniform(0.0, 0.7),
    # Text detection expansion coefficient
    # NOTE(review): tune.choice([0.0]) pins this parameter to a single value,
    # so it is never actually searched, and the correlation cell at the end
    # will yield NaN for this constant column. The trainable's fallback
    # default is 1.5 — confirm that pinning to 0.0 is intentional.
    "text_det_unclip_ratio": tune.choice([0.0]),
    # Text recognition threshold (filter low confidence results)
    "text_rec_score_thresh": tune.uniform(0.0, 0.7),
}
def trainable_paddle_ocr(config):
    """Evaluate one hyperparameter config by calling the PaddleOCR REST API.

    Reports the API's metrics (CER/WER/timing) to Ray Tune. On any request
    failure a worst-case record (CER=WER=1.0) is reported so the search
    continues instead of the trial crashing.
    """
    import random
    import requests
    from ray import train

    # Keep in sync with the docker-compose.workers.yml ports / config cell.
    WORKER_PORTS = [8001, 8002]
    # Finite timeout so a hung worker fails the trial instead of stalling
    # the whole search forever (timeout=None would wait indefinitely).
    REQUEST_TIMEOUT_S = 3600

    def _pick_worker_url():
        """Assign a worker round-robin by Ray trial index.

        random.choice could put both concurrent trials on the same worker,
        serializing them; trial ids look like "<experiment>_<index>", so the
        trailing index gives an even spread. Falls back to random selection
        if the trial id is unavailable or not in the expected form.
        """
        try:
            trial_id = train.get_context().get_trial_id()
            idx = int(trial_id.rsplit("_", 1)[-1]) % len(WORKER_PORTS)
        except Exception:
            idx = random.randrange(len(WORKER_PORTS))
        return f"http://localhost:{WORKER_PORTS[idx]}"

    api_url = _pick_worker_url()

    # Fallback defaults mirror the API's own defaults for unset keys.
    payload = {
        "pdf_folder": "/app/dataset",
        "use_doc_orientation_classify": config.get("use_doc_orientation_classify", False),
        "use_doc_unwarping": config.get("use_doc_unwarping", False),
        "textline_orientation": config.get("textline_orientation", True),
        "text_det_thresh": config.get("text_det_thresh", 0.0),
        "text_det_box_thresh": config.get("text_det_box_thresh", 0.0),
        "text_det_unclip_ratio": config.get("text_det_unclip_ratio", 1.5),
        "text_rec_score_thresh": config.get("text_rec_score_thresh", 0.0),
        "start_page": 5,
        "end_page": 10,
    }

    try:
        response = requests.post(f"{api_url}/evaluate", json=payload, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        metrics = response.json()
        metrics["worker"] = api_url
        train.report(metrics)
    except Exception as e:
        # Worst-case metrics keep the trial "successful" for Tune while the
        # truncated error string is preserved in the results dataframe.
        train.report({
            "CER": 1.0,
            "WER": 1.0,
            "TIME": 0.0,
            "PAGES": 0,
            "TIME_PER_PAGE": 0.0,  # float, for dtype consistency with success rows
            "worker": api_url,
            "ERROR": str(e)[:500]
        })
# --- Cell: start Ray ---
# ignore_reinit_error makes this cell safe to re-run in the same kernel.
ray.init(ignore_reinit_error=True)
print(f"Ray Tune ready (version: {ray.__version__})")

# --- Cell: run the tuner ---
# Optuna-driven search minimizing CER; concurrency capped at the number of
# REST workers so each in-flight trial has a worker to itself.
tuner = tune.Tuner(
    trainable_paddle_ocr,
    tune_config=tune.TuneConfig(
        metric="CER",
        mode="min",
        search_alg=OptunaSearch(),
        num_samples=64,
        max_concurrent_trials=NUM_WORKERS,  # Run trials in parallel across workers
    ),
    param_space=search_space,
)

results = tuner.fit()

# --- Cell: results dataframe ---
# One row per trial; metrics and config/* columns come from train.report().
df = results.get_dataframe()
df.describe()

# --- Cell: save results to CSV ---
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"raytune_paddle_rest_results_{timestamp}.csv"
filepath = os.path.join(OUTPUT_FOLDER, filename)

df.to_csv(filepath, index=False)
print(f"Results saved: {filepath}")

# --- Cell: best configuration ---
# Row with the lowest CER; failed trials report CER=1.0 so they never win.
best = df.loc[df["CER"].idxmin()]

print(f"Best CER: {best['CER']:.6f}")
print(f"Best WER: {best['WER']:.6f}")
print(f"\nOptimal Configuration:")
print(f" textline_orientation: {best['config/textline_orientation']}")
print(f" use_doc_orientation_classify: {best['config/use_doc_orientation_classify']}")
print(f" use_doc_unwarping: {best['config/use_doc_unwarping']}")
print(f" text_det_thresh: {best['config/text_det_thresh']:.4f}")
print(f" text_det_box_thresh: {best['config/text_det_box_thresh']:.4f}")
print(f" text_det_unclip_ratio: {best['config/text_det_unclip_ratio']}")
print(f" text_rec_score_thresh: {best['config/text_rec_score_thresh']:.4f}")

# --- Cell: correlation analysis ---
# Pearson correlation of each continuous parameter with the error metrics.
# NOTE(review): a parameter held constant by the search space (a single-value
# tune.choice) produces a zero-variance column and NaN correlation.
param_cols = [
    "config/text_det_thresh",
    "config/text_det_box_thresh",
    "config/text_det_unclip_ratio",
    "config/text_rec_score_thresh",
]

corr_cer = df[param_cols + ["CER"]].corr()["CER"].sort_values(ascending=False)
corr_wer = df[param_cols + ["WER"]].corr()["WER"].sort_values(ascending=False)

print("Correlation with CER:")
print(corr_cer)
print("\nCorrelation with WER:")
print(corr_wer)