\n"
+ ],
+ "text/plain": [
+ "args: \u001b[1m[\u001b[0m\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\.venv\\\\Scripts\\\\python.exe'\u001b[0m, \n",
+ "\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\paddle_ocr_tuning.py'\u001b[0m, \u001b[32m'--pdf-folder'\u001b[0m, \n",
+ "\u001b[32m'c:\\\\Users\\\\sji\\\\Desktop\\\\MastersThesis\\\\instructions'\u001b[0m, \u001b[32m'--pages-per-pdf'\u001b[0m, \u001b[32m'1'\u001b[0m, \u001b[32m'--dpi'\u001b[0m, \u001b[32m'360'\u001b[0m, \n",
+ "\u001b[32m'--textline-orientation'\u001b[0m, \u001b[32m'True'\u001b[0m, \u001b[32m'--text-det-box-thresh'\u001b[0m, \u001b[32m'0.46611732611383844'\u001b[0m, \u001b[32m'--text-det-unclip-ratio'\u001b[0m, \n",
+ "\u001b[32m'1.3598680409827462'\u001b[0m, \u001b[32m'--text-rec-score-thresh'\u001b[0m, \u001b[32m'0.0'\u001b[0m, \u001b[32m'--line-tolerance'\u001b[0m, \u001b[32m'0.5'\u001b[0m, \u001b[32m'--min-box-score'\u001b[0m, \u001b[32m'0.6'\u001b[0m\u001b[1m]\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"import sys, subprocess\n",
"print(\"Notebook Python:\", sys.executable)\n",
"# test paddle ocr run with params\n",
- "test_proc = subprocess.run([sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"1\"], capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
+ "args = [sys.executable, \n",
+ " SCRIPT_ABS, \n",
+ " \"--pdf-folder\", PDF_FOLDER_ABS, \n",
+ " \"--pages-per-pdf\", \"1\",\n",
+ " \"--dpi\",\"360\" ,\n",
+ " \"--textline-orientation\",\"True\",\n",
+ " \"--text-det-box-thresh\",\"0.46611732611383844\",\n",
+ " \"--text-det-unclip-ratio\",\"1.3598680409827462\",\n",
+ " \"--text-rec-score-thresh\",\"0.0\",\n",
+ " \"--line-tolerance\", \"0.5\",\n",
+ " \"--min-box-score\",\"0.6\"]\n",
+ "test_proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
+ "if test_proc.returncode != 0:\n",
+ " print(test_proc.stderr)\n",
"last = test_proc.stdout.strip().splitlines()[-1]\n",
"\n",
"metrics = json.loads(last)\n",
"print(metrics)\n",
"\n",
- "print(f\"return code: {test_proc.returncode}\")\n"
+ "print(f\"return code: {test_proc.returncode}\")\n",
+ "print(f\"args: {args}\")"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "8df28468",
"metadata": {},
"outputs": [
@@ -650,7 +662,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2025-11-12 10:13:25,930\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n"
+ "c:\\Users\\sji\\Desktop\\MastersThesis\\.venv\\Lib\\site-packages\\ray\\tune\\impl\\tuner_internal.py:144: RayDeprecationWarning: The `RunConfig` class should be imported from `ray.tune` when passing it to the Tuner. Please update your imports. See this issue for more context and migration options: https://github.com/ray-project/ray/issues/49454. Disable these warnings by setting the environment variable: RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS=0\n",
+ " _log_deprecation_warning(\n",
+ "2025-11-12 22:31:01,166\tINFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949\n"
]
},
{
@@ -662,16 +676,16 @@
"
Tune Status
\n",
"
\n",
"\n",
- "
Current time:
2025-11-12 10:16:28
\n",
- "
Running for:
00:03:02.06
\n",
- "
Memory:
21.7/31.8 GiB
\n",
+ "
Current time:
2025-11-12 22:39:26
\n",
+ "
Running for:
00:08:25.78
\n",
+ "
Memory:
9.9/31.8 GiB
\n",
"\n",
"
\n",
" \n",
" \n",
"
\n",
"
System Info
\n",
- " Using AsyncHyperBand: num_stopped=0 Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None Logical resource usage: 8.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)\n",
+ " Using AsyncHyperBand: num_stopped=1 Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -0.062382927481937384 Logical resource usage: 1.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)\n",
"
\n",
" \n",
" \n",
@@ -680,19 +694,13 @@
"
Trial Status
\n",
"
\n",
"\n",
- "
Trial name
status
loc
dpi
line_tolerance
min_box_score
text_det_box_thresh
text_det_unclip_rati\n",
+ "
Trial name
status
loc
dpi
line_tolerance
min_box_score
text_det_box_thresh
text_det_unclip_rati\n",
"o
text_rec_score_thres\n",
- "h
textline_orientation
\n",
+ "h
textline_orientation
iter
total time (s)
CER
WER
TIME
\n",
"\n",
"\n",
- "
trainable_paddle_ocr_2c044_00000
RUNNING
127.0.0.1:16412
300
0.7
0
0.5844
1.68825
0
False
\n",
- "
trainable_paddle_ocr_2c044_00001
RUNNING
127.0.0.1:23300
300
0.6
0.5
0.56087
1.5857
0.4
False
\n",
- "
trainable_paddle_ocr_2c044_00002
RUNNING
127.0.0.1:15080
300
0.7
0.6
0.534888
1.27986
0.2
True
\n",
- "
trainable_paddle_ocr_2c044_00003
RUNNING
127.0.0.1:22208
300
0.6
0.6
0.570881
1.92797
0
True
\n",
- "
trainable_paddle_ocr_2c044_00004
RUNNING
127.0.0.1:6244
240
0.5
0
0.445475
1.70568
0
True
\n",
- "
trainable_paddle_ocr_2c044_00005
RUNNING
127.0.0.1:1252
300
0.7
0.5
0.402891
1.65377
0
False
\n",
- "
trainable_paddle_ocr_2c044_00006
RUNNING
127.0.0.1:4104
300
0.6
0.5
0.493143
1.26816
0.4
False
\n",
- "
trainable_paddle_ocr_2c044_00007
RUNNING
127.0.0.1:15552
300
0.5
0.6
0.660866
1.52281
0.4
True
\n",
+ "
trainable_paddle_ocr_3632f_00000
TERMINATED
127.0.0.1:22388
360
0.6
0.6
0.598139
1.595
0.2
True
1
500.4
0.0684595
0.414935
473.74
\n",
+ "
trainable_paddle_ocr_3632f_00001
TERMINATED
127.0.0.1:10796
300
0.6
0.5
0.418069
1.61857
0.2
True
1
465.474
0.0563063
0.285714
438.892
\n",
"\n",
"
\n",
" \n",
@@ -739,76 +747,551 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2025-11-12 10:13:25,974\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:25,980\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:25,985\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:25,989\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:25,993\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:25,997\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:26,002\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,007\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,014\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,017\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,021\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,023\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,027\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,030\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,033\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:26,035\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,013\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:33,016\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00000_0_dpi=300,line_tolerance=0.7000,min_box_score=0,text_det_box_thresh=0.5844,text_det_unclip_ratio=_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:33,162\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:33,164\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00002_2_dpi=300,line_tolerance=0.7000,min_box_score=0.6000,text_det_box_thresh=0.5349,text_det_unclip_r_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:33,179\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:33,183\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.5609,text_det_unclip_r_2025-11-12_10-13-25\n",
- "2025-11-12 10:13:33,296\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,303\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00004_4_dpi=240,line_tolerance=0.5000,min_box_score=0,text_det_box_thresh=0.4455,text_det_unclip_ratio=_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,322\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,325\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00005_5_dpi=300,line_tolerance=0.7000,min_box_score=0.5000,text_det_box_thresh=0.4029,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,339\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,341\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00006_6_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4931,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,349\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,352\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00003_3_dpi=300,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5709,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,388\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n",
- "2025-11-12 10:13:33,390\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_10-11-32_166827_18128\\artifacts\\2025-11-12_10-13-25\\trainable_paddle_ocr_2025-11-12_10-13-25\\driver_artifacts\\trainable_paddle_ocr_2c044_00007_7_dpi=300,line_tolerance=0.5000,min_box_score=0.6000,text_det_box_thresh=0.6609,text_det_unclip_r_2025-11-12_10-13-26\n"
+ "2025-11-12 22:31:01,216\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:31:01,216\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:31:01,265\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:31:01,265\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:31:06,561\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:31:06,563\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:31:06,605\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:31:06,605\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
Trial Progress
\n",
+ "
\n",
+ "\n",
+ "
Trial name
CER
PAGES
TIME
TIME_PER_PAGE
WER
\n",
+ "\n",
+ "\n",
+ "
trainable_paddle_ocr_3632f_00000
0.0684595
2
473.74
236.768
0.414935
\n",
+ "
trainable_paddle_ocr_3632f_00001
0.0563063
2
438.892
219.372
0.285714
\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2025-11-12 22:38:52,093\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00001_1_dpi=300,line_tolerance=0.6000,min_box_score=0.5000,text_det_box_thresh=0.4181,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:39:26,972\tWARNING trial.py:647 -- The path to the trial log directory is too long (max length: 260. Consider using `trial_dirname_creator` to shorten the path. Path: C:\\Users\\sji\\AppData\\Local\\Temp\\ray\\session_2025-11-12_22-29-00_496141_15712\\artifacts\\2025-11-12_22-31-01\\trainable_paddle_ocr_2025-11-12_22-31-01\\driver_artifacts\\trainable_paddle_ocr_3632f_00000_0_dpi=360,line_tolerance=0.6000,min_box_score=0.6000,text_det_box_thresh=0.5981,text_det_unclip_r_2025-11-12_22-31-01\n",
+ "2025-11-12 22:39:26,988\tINFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/sji/ray_results/trainable_paddle_ocr_2025-11-12_22-31-01' in 0.0087s.\n",
+ "2025-11-12 22:39:26,994\tINFO tune.py:1041 -- Total run time: 505.83 seconds (505.77 seconds for the tuning loop).\n"
]
}
],
"source": [
"def trainable_paddle_ocr(config):\n",
- " args = [sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"1\"]\n",
+ " args = [sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"2\"]\n",
" for k, v in config.items():\n",
" args += [f\"--{KEYMAP[k]}\", str(v)]\n",
- " proc = subprocess.run([sys.executable, SCRIPT_ABS, \"--pdf-folder\", PDF_FOLDER_ABS, \"--pages-per-pdf\", \"1\"], capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
+ " proc = subprocess.run(args, capture_output=True, text=True, cwd=SCRIPT_DIR)\n",
"\n",
" if proc.returncode != 0:\n",
- " tune.report(CER=1.0, WER=1.0, time=0.0, error=proc.stderr[:500])\n",
+ " tune.report(CER=1.0, WER=1.0, TIME=0.0, ERROR=proc.stderr[:500])\n",
" return\n",
" # última línea = JSON con métricas\n",
" last = proc.stdout.strip().splitlines()[-1]\n",
" \n",
" metrics = json.loads(last)\n",
- " tune.report(**metrics)\n",
+ " tune.report(metrics=metrics)\n",
"\n",
"scheduler = ASHAScheduler(grace_period=1, reduction_factor=2)\n",
"\n",
"tuner = tune.Tuner(\n",
" trainable_paddle_ocr,\n",
- " tune_config=tune.TuneConfig(metric=\"CER\", mode=\"min\", scheduler=scheduler, num_samples=8),\n",
- " param_space=search_space,\n",
- " run_config=air.RunConfig(\n",
- " log_to_file=False, # <- stream stdout/stderr to the notebook instead of files\n",
- " verbose=2 # 0=silent, 1=brief, 2=default, 3=debuggy\n",
- " ),\n",
+ " tune_config=tune.TuneConfig(metric=\"CER\", \n",
+ " mode=\"min\", \n",
+ " scheduler=scheduler, \n",
+ " num_samples=2, \n",
+ " max_concurrent_trials=4),\n",
+ " run_config=air.RunConfig(verbose=2, log_to_file=False),\n",
+ " param_space=search_space\n",
")\n",
"\n",
"results = tuner.fit()\n",
- "df = results.get_dataframe().sort_values(\"CER\", ascending=True)\n",
- "cols = [\"dpi\",\"textline_orientation\",\"text_det_box_thresh\",\"text_det_unclip_ratio\",\n",
- " \"text_rec_score_thresh\",\"line_tolerance\",\"pages_per_pdf\",\"lang\",\"CER\",\"WER\",\"time\"]\n",
- "print(df[cols].head(10))\n",
- "df.to_csv(\"raytune_paddle_subproc_results.csv\", index=False)\n",
- "print(\" Guardado: raytune_paddle_subproc_results.csv\")"
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "710a67ce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Trials ordered by CER, lowest first (sort_values is ascending by default)\n",
+ "df = results.get_dataframe().sort_values(by=\"CER\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "1ab345a3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "