Files
MastersThesis/src/paddle_ocr_raytune_rest.ipynb

87 lines
2.4 KiB
Plaintext
Raw Normal View History

2026-01-18 08:19:34 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "header",
"metadata": {},
"source": [
"# PaddleOCR Hyperparameter Optimization via REST API\n",
"\n",
2026-01-18 18:03:23 +01:00
"Uses Ray Tune + Optuna to find optimal PaddleOCR parameters.\n",
2026-01-18 08:19:34 +01:00
"\n",
"## Prerequisites\n",
"\n",
"```bash\n",
"cd src/paddle_ocr\n",
2026-01-18 18:03:23 +01:00
"docker compose -f docker-compose.workers.yml up # GPU workers on 8001-8003 (must match PORTS below)\n",
"# or: docker compose -f docker-compose.workers.yml --profile cpu up\n",
2026-01-18 08:19:34 +01:00
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
2026-01-18 18:03:23 +01:00
"id": "deps",
2026-01-18 08:19:34 +01:00
"metadata": {},
"outputs": [],
2026-01-18 18:43:16 +01:00
"source": "# Pin Ray version for API stability (tune.report takes dict, not kwargs in 2.x)\n%pip install -q \"ray[tune]==2.53.0\" optuna requests pandas"
2026-01-18 08:19:34 +01:00
},
{
"cell_type": "code",
"execution_count": null,
2026-01-18 18:03:23 +01:00
"id": "setup",
2026-01-18 08:19:34 +01:00
"metadata": {},
"outputs": [],
2026-01-18 18:43:16 +01:00
"source": "from raytune_ocr import (\n check_workers, create_trainable, run_tuner, analyze_results, correlation_analysis,\n paddle_ocr_payload, PADDLE_OCR_SEARCH_SPACE, PADDLE_OCR_CONFIG_KEYS,\n)\n\n# Worker ports (3 workers to avoid OOM)\nPORTS = [8001, 8002, 8003]\n\n# Check workers are running\nhealthy = check_workers(PORTS, \"PaddleOCR\")"
2026-01-18 08:19:34 +01:00
},
{
"cell_type": "code",
"execution_count": null,
2026-01-18 18:03:23 +01:00
"id": "tune",
2026-01-18 08:19:34 +01:00
"metadata": {},
"outputs": [],
2026-01-18 18:43:16 +01:00
"source": "# Create trainable and run tuning\ntrainable = create_trainable(PORTS, paddle_ocr_payload)\n\nresults = run_tuner(\n trainable=trainable,\n search_space=PADDLE_OCR_SEARCH_SPACE,\n num_samples=128,\n num_workers=len(healthy),\n)"
2026-01-18 08:19:34 +01:00
},
{
"cell_type": "code",
"execution_count": null,
2026-01-18 18:03:23 +01:00
"id": "analysis",
2026-01-18 08:19:34 +01:00
"metadata": {},
"outputs": [],
"source": [
2026-01-18 18:03:23 +01:00
"# Analyze results\n",
"df = analyze_results(\n",
" results,\n",
" prefix=\"raytune_paddle\",\n",
" config_keys=PADDLE_OCR_CONFIG_KEYS,\n",
")\n",
2026-01-18 08:19:34 +01:00
"\n",
2026-01-18 18:03:23 +01:00
"df.describe()"
2026-01-18 08:19:34 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "correlation",
"metadata": {},
"outputs": [],
"source": [
"# Correlation analysis\n",
2026-01-18 18:03:23 +01:00
"correlation_analysis(df, PADDLE_OCR_CONFIG_KEYS)"
2026-01-18 08:19:34 +01:00
]
}
],
"metadata": {
"kernelspec": {
2026-01-18 18:03:23 +01:00
"display_name": "Python 3",
2026-01-18 08:19:34 +01:00
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
2026-01-18 18:03:23 +01:00
"version": "3.10.0"
2026-01-18 08:19:34 +01:00
}
},
"nbformat": 4,
"nbformat_minor": 5
2026-01-18 18:43:16 +01:00
}