Documentation review and data consistency.

2026-01-24 15:53:34 +01:00
parent 9ee2490097
commit 0089b34cb3
48 changed files with 1030 additions and 930 deletions
--- a/src/README.md
+++ b/src/README.md
@@ -95,6 +95,25 @@ Results are saved to `src/results/` as CSV files:
 - `raytune_doctr_results_<timestamp>.csv`
 - `raytune_easyocr_results_<timestamp>.csv`

+### Correlation Analysis
+
+Correlation tables used in the thesis are derived from the CSV results with a local script:
+
+```bash
+source .venv/bin/activate
+python tem/scripts/compute_correlations_all.py
+```
+
+Outputs are written to `src/results/correlations/`:
+- `paddle_correlations.csv`
+- `doctr_correlations.csv`
+- `easyocr_correlations.csv`
+
+These files are computed from the corresponding inputs:
+- `src/results/raytune_paddle_results_20260119_122609.csv`
+- `src/results/raytune_doctr_results_20260119_121445.csv`
+- `src/results/raytune_easyocr_results_20260119_120204.csv`
+
 ## Directory Structure

 ```
--- a/src/archived/dataset_manager.py
+++ b/src/archived/dataset_manager.py
--- a/src/archived/doctr_raytune_rest.ipynb
+++ b/src/archived/doctr_raytune_rest.ipynb
--- a/src/archived/easyocr_raytune_rest.ipynb
+++ b/src/archived/easyocr_raytune_rest.ipynb
--- a/src/archived/ocr_benchmark_notebook.ipynb
+++ b/src/archived/ocr_benchmark_notebook.ipynb
--- a/src/archived/paddle_ocr_fine_tune_unir.ipynb
+++ b/src/archived/paddle_ocr_fine_tune_unir.ipynb
--- a/src/archived/paddle_ocr_fine_tune_unir_raytune.ipynb
+++ b/src/archived/paddle_ocr_fine_tune_unir_raytune.ipynb
--- a/src/archived/paddle_ocr_raytune_rest.ipynb
+++ b/src/archived/paddle_ocr_raytune_rest.ipynb
--- a/src/archived/paddle_ocr_tuning.py
+++ b/src/archived/paddle_ocr_tuning.py
--- a/src/archived/raytune_ocr.py
+++ b/src/archived/raytune_ocr.py
--- a/src/archived/run_tuning.py
+++ b/src/archived/run_tuning.py
--- a/src/docker-compose.tuning.yml
+++ b/src/docker-compose.tuning.yml
@@ -1,82 +0,0 @@
-# docker-compose.tuning.yml - Ray Tune with all OCR services (PaddleOCR + DocTR)
-# Usage:
-#   docker compose -f docker-compose.tuning.yml up -d paddle-ocr-gpu doctr-gpu
-#   docker compose -f docker-compose.tuning.yml run raytune --service paddle --samples 64
-#   docker compose -f docker-compose.tuning.yml run raytune --service doctr --samples 64
-#   docker compose -f docker-compose.tuning.yml down
-#
-# Note: EasyOCR uses port 8002 (same as PaddleOCR). Use docker-compose.tuning.easyocr.yml separately.
-
-services:
-  raytune:
-    image: seryus.ddns.net/unir/raytune:latest
-    network_mode: host
-    shm_size: '5gb'
-    volumes:
-      - ./results:/app/results:rw
-    environment:
-      - PYTHONUNBUFFERED=1
-
-  paddle-ocr-gpu:
-    image: seryus.ddns.net/unir/paddle-ocr-gpu:latest
-    container_name: paddle-ocr-gpu-tuning
-    ports:
-      - "8002:8000"
-    volumes:
-      - ./dataset:/app/dataset:ro
-      - ./debugset:/app/debugset:rw
-      - paddlex-cache:/root/.paddlex
-    environment:
-      - PYTHONUNBUFFERED=1
-      - CUDA_VISIBLE_DEVICES=0
-      - PADDLE_DET_MODEL=PP-OCRv5_mobile_det
-      - PADDLE_REC_MODEL=PP-OCRv5_mobile_rec
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: 1
-              capabilities: [gpu]
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-      start_period: 60s
-
-  doctr-gpu:
-    image: seryus.ddns.net/unir/doctr-gpu:latest
-    container_name: doctr-gpu-tuning
-    ports:
-      - "8003:8000"
-    volumes:
-      - ./dataset:/app/dataset:ro
-      - ./debugset:/app/debugset:rw
-      - doctr-cache:/root/.cache/doctr
-    environment:
-      - PYTHONUNBUFFERED=1
-      - CUDA_VISIBLE_DEVICES=0
-      - DOCTR_DET_ARCH=db_resnet50
-      - DOCTR_RECO_ARCH=crnn_vgg16_bn
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: 1
-              capabilities: [gpu]
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-      start_period: 180s
-
-volumes:
-  paddlex-cache:
-    name: paddlex-model-cache
-  doctr-cache:
-    name: doctr-model-cache