easyocr doctr
Some checks failed
build_docker / build_easyocr (linux/amd64) (push) Has been cancelled
build_docker / build_easyocr (linux/arm64) (push) Has been cancelled
build_docker / build_doctr (linux/amd64) (push) Has been cancelled
build_docker / essential (push) Successful in 1s
build_docker / essential (pull_request) Successful in 1s
build_docker / build_gpu (linux/amd64) (push) Has been cancelled
build_docker / build_gpu (linux/arm64) (push) Has been cancelled
build_docker / manifest_cpu (push) Has been cancelled
build_docker / manifest_gpu (push) Has been cancelled
build_docker / build_cpu (linux/amd64) (push) Has been cancelled
build_docker / build_doctr (linux/arm64) (push) Has been cancelled
build_docker / manifest_easyocr (push) Has been cancelled
build_docker / manifest_doctr (push) Has been cancelled
build_docker / build_cpu (linux/arm64) (push) Has been cancelled
build_docker / build_cpu (linux/amd64) (pull_request) Successful in 4m56s
build_docker / build_gpu (linux/amd64) (pull_request) Has been cancelled
build_docker / build_gpu (linux/arm64) (pull_request) Has been cancelled
build_docker / manifest_cpu (pull_request) Has been cancelled
build_docker / manifest_gpu (pull_request) Has been cancelled
build_docker / build_easyocr (linux/amd64) (pull_request) Has been cancelled
build_docker / build_easyocr (linux/arm64) (pull_request) Has been cancelled
build_docker / build_doctr (linux/amd64) (pull_request) Has been cancelled
build_docker / build_doctr (linux/arm64) (pull_request) Has been cancelled
build_docker / manifest_easyocr (pull_request) Has been cancelled
build_docker / manifest_doctr (pull_request) Has been cancelled
build_docker / build_cpu (linux/arm64) (pull_request) Has been cancelled

This commit is contained in:
2026-01-18 06:47:01 +01:00
parent 38ba2d1f5a
commit 578689443d
14 changed files with 1473 additions and 211 deletions

View File

@@ -0,0 +1,49 @@
# Dockerfile - DocTR Tuning REST API
#
# Build:
# docker build -t doctr-api:latest .
#
# Run:
# docker run -p 8003:8000 -v ./dataset:/app/dataset doctr-api:latest
FROM python:3.11-slim
LABEL maintainer="Sergio Jimenez"
LABEL description="DocTR Tuning REST API"
WORKDIR /app
# Set environment variables
ENV PYTHONUNBUFFERED=1
# Default model architectures; override at run time with `docker run -e DOCTR_DET_ARCH=...`
ENV DOCTR_DET_ARCH=db_resnet50
ENV DOCTR_RECO_ARCH=crnn_vgg16_bn
# Install system dependencies for OpenCV and image processing
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    && rm -rf /var/lib/apt/lists/*
# Copy and install Python dependencies (separate layer so code changes don't re-install deps)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY doctr_tuning_rest.py .
COPY dataset_manager.py .
# Volume for dataset and model cache
# /root/.cache/doctr keeps downloaded pretrained weights across container restarts
VOLUME ["/app/dataset", "/root/.cache/doctr"]
# Expose API port
EXPOSE 8000
# Health check (longer start period for model download)
HEALTHCHECK --interval=30s --timeout=10s --start-period=180s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "doctr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,45 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
    """Pairs page images with their ground-truth transcription files.

    Expects ``root`` to contain subfolders, each holding an ``img/`` and a
    ``txt/`` directory; an image ``name.png`` is matched with ``txt/name.txt``.
    Images without a matching text file are skipped silently.
    """

    _IMAGE_EXTS = (".png", ".jpg", ".jpeg")

    def __init__(self, root):
        # (image_path, text_path) pairs in deterministic (sorted) order.
        self.samples = []
        for folder in sorted(os.listdir(root)):
            img_dir = os.path.join(root, folder, "img")
            txt_dir = os.path.join(root, folder, "txt")
            # Skip subfolders that do not follow the img/ + txt/ layout.
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue
            for fname in sorted(os.listdir(img_dir)):
                if not fname.lower().endswith(self._IMAGE_EXTS):
                    continue
                stem, _ = os.path.splitext(fname)
                txt_path = os.path.join(txt_dir, stem + ".txt")
                if os.path.exists(txt_path):
                    self.samples.append((os.path.join(img_dir, fname), txt_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path = self.samples[idx]
        # Ground truth is read as UTF-8 text; image normalized to RGB.
        with open(txt_path, "r", encoding="utf-8") as handle:
            text = handle.read()
        return Image.open(img_path).convert("RGB"), text

View File

@@ -0,0 +1,322 @@
# doctr_tuning_rest.py
# FastAPI REST service for DocTR hyperparameter evaluation
# Usage: uvicorn doctr_tuning_rest:app --host 0.0.0.0 --port 8000
import os
import re
import time
from typing import Optional
from contextlib import asynccontextmanager
import numpy as np
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from doctr.models import ocr_predictor
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
def get_gpu_info() -> dict:
    """Report CUDA availability and basic GPU memory stats via PyTorch.

    Always returns the same set of keys; on CUDA probe failure an extra
    ``gpu_error`` key is added with the exception text.
    """
    cuda = torch.cuda.is_available()
    info = {
        "cuda_available": cuda,
        "device": "cuda" if cuda else "cpu",
        "gpu_count": 0,
        "gpu_name": None,
        "gpu_memory_total": None,
        "gpu_memory_used": None,
    }
    if not cuda:
        return info
    try:
        count = torch.cuda.device_count()
        info["gpu_count"] = count
        if count > 0:
            props = torch.cuda.get_device_properties(0)
            info["gpu_name"] = torch.cuda.get_device_name(0)
            info["gpu_memory_total"] = f"{props.total_memory / (1024**3):.2f} GB"
            info["gpu_memory_used"] = f"{torch.cuda.memory_allocated(0) / (1024**3):.2f} GB"
    except Exception as e:
        # Keep the endpoint alive even if the CUDA runtime misbehaves.
        info["gpu_error"] = str(e)
    return info
# Model configuration via environment variables
DEFAULT_DET_ARCH = os.environ.get("DOCTR_DET_ARCH", "db_resnet50")
DEFAULT_RECO_ARCH = os.environ.get("DOCTR_RECO_ARCH", "crnn_vgg16_bn")
# Global state for model and dataset
class AppState:
    """Process-wide mutable state shared by all request handlers."""

    model: Optional[object] = None                 # DocTR ocr_predictor, set in lifespan()
    dataset: Optional[ImageTextDataset] = None     # lazily loaded on first /evaluate
    dataset_path: Optional[str] = None             # folder the dataset was loaded from
    det_arch: str = DEFAULT_DET_ARCH               # detection architecture name
    reco_arch: str = DEFAULT_RECO_ARCH             # recognition architecture name
    # Track current model config for cache invalidation
    current_config: Optional[dict] = None
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
# Single shared instance used by every endpoint.
state = AppState()
def create_model(
    assume_straight_pages: bool = True,
    straighten_pages: bool = False,
    preserve_aspect_ratio: bool = True,
    symmetric_pad: bool = True,
    disable_page_orientation: bool = False,
    disable_crop_orientation: bool = False,
) -> object:
    """Build an ocr_predictor from the global det/reco architecture settings.

    The orientation toggles are applied only when the installed DocTR
    version exposes those attributes; the model is moved to CUDA when
    the global device is "cuda".
    """
    predictor = ocr_predictor(
        det_arch=state.det_arch,
        reco_arch=state.reco_arch,
        pretrained=True,
        assume_straight_pages=assume_straight_pages,
        straighten_pages=straighten_pages,
        preserve_aspect_ratio=preserve_aspect_ratio,
        symmetric_pad=symmetric_pad,
    )
    # Newer DocTR releases expose these flags as attributes; set when present.
    for attr, value in (
        ("disable_page_orientation", disable_page_orientation),
        ("disable_crop_orientation", disable_crop_orientation),
    ):
        if hasattr(predictor, attr):
            setattr(predictor, attr, value)
    return predictor.cuda() if state.device == "cuda" else predictor
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load DocTR model at startup with default configuration.

    Prints GPU diagnostics, builds the default predictor, records its
    config so /evaluate can detect config changes, then releases the
    model and dataset on shutdown (after the yield).
    """
    gpu_info = get_gpu_info()
    print("=" * 50)
    print("GPU STATUS")
    print("=" * 50)
    print(f" CUDA available: {gpu_info['cuda_available']}")
    print(f" Device: {gpu_info['device']}")
    if gpu_info['cuda_available']:
        print(f" GPU count: {gpu_info['gpu_count']}")
        print(f" GPU name: {gpu_info['gpu_name']}")
        print(f" GPU memory total: {gpu_info['gpu_memory_total']}")
    print("=" * 50)
    print(f"Loading DocTR models...")
    print(f" Detection: {state.det_arch}")
    print(f" Recognition: {state.reco_arch}")
    # Load with default config
    state.model = create_model()
    # Must mirror create_model()'s defaults so the first /evaluate with
    # default flags does not trigger a needless reinitialization.
    state.current_config = {
        "assume_straight_pages": True,
        "straighten_pages": False,
        "preserve_aspect_ratio": True,
        "symmetric_pad": True,
        "disable_page_orientation": False,
        "disable_crop_orientation": False,
    }
    if gpu_info['cuda_available']:
        gpu_after = get_gpu_info()
        print(f" GPU memory after load: {gpu_after.get('gpu_memory_used', 'N/A')}")
    print("Model loaded successfully!")
    yield
    # Shutdown: drop references so memory can be reclaimed.
    state.model = None
    state.dataset = None
app = FastAPI(
title="DocTR Tuning API",
description="REST API for DocTR hyperparameter evaluation",
version="1.0.0",
lifespan=lifespan,
)
class EvaluateRequest(BaseModel):
    """Request schema with all tunable DocTR hyperparameters.

    Default start/end pages (5-10) sample a small slice of the dataset
    for quick tuning iterations; /evaluate_full overrides them.
    """
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
    # Processing flags (require model reinit)
    assume_straight_pages: bool = Field(True, description="Skip rotation handling for straight documents")
    straighten_pages: bool = Field(False, description="Pre-straighten pages before detection")
    preserve_aspect_ratio: bool = Field(True, description="Maintain document proportions during resize")
    symmetric_pad: bool = Field(True, description="Use symmetric padding when preserving aspect ratio")
    # Orientation flags
    disable_page_orientation: bool = Field(False, description="Skip page orientation classification")
    disable_crop_orientation: bool = Field(False, description="Skip crop orientation detection")
    # Output grouping
    resolve_lines: bool = Field(True, description="Group words into lines")
    resolve_blocks: bool = Field(False, description="Group lines into blocks")
    paragraph_break: float = Field(0.035, ge=0.0, le=1.0, description="Minimum space ratio separating paragraphs")
    # Page range
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output.

    Field names are uppercase on purpose to match the CLI tool's keys.
    """
    CER: float                          # mean character error rate over evaluated pages
    WER: float                          # mean word error rate over evaluated pages
    TIME: float                         # total wall-clock seconds for the whole run
    PAGES: int                          # number of pages actually evaluated
    TIME_PER_PAGE: float                # mean OCR seconds per page (excludes metric calc)
    model_reinitialized: bool = False   # True when config change forced a model rebuild
class HealthResponse(BaseModel):
    """Readiness/health payload; GPU fields mirror get_gpu_info()."""
    status: str                           # "ok" once the model is loaded, else "initializing"
    model_loaded: bool
    dataset_loaded: bool
    dataset_size: Optional[int] = None    # None until a dataset has been loaded
    det_arch: Optional[str] = None
    reco_arch: Optional[str] = None
    cuda_available: Optional[bool] = None
    device: Optional[str] = None
    gpu_name: Optional[str] = None
    gpu_memory_used: Optional[str] = None
    gpu_memory_total: Optional[str] = None
def doctr_result_to_text(result, resolve_lines: bool = True, resolve_blocks: bool = False) -> str:
    """Convert a DocTR result into whitespace-normalized plain text.

    Walks Document -> pages -> blocks -> lines -> words and joins every
    word with single spaces.

    Note: ``resolve_lines`` and ``resolve_blocks`` are accepted for API
    parity with EvaluateRequest but do not change the returned string:
    the output is collapsed to single-space-separated text. (The previous
    implementation appended an empty-string "paragraph separator" when
    resolve_blocks was set, but it was filtered out before joining and
    would have been collapsed by the whitespace normalization anyway —
    dead code, removed here.)
    """
    parts = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                line_text = " ".join(w.value for w in line.words)
                if line_text:
                    parts.append(line_text)
    # Collapse all runs of whitespace to single spaces.
    return re.sub(r"\s+", " ", " ".join(parts)).strip()
def evaluate_text(reference: str, prediction: str) -> dict:
    """Return jiwer word- and character-error rates under "WER"/"CER" keys."""
    metrics = {}
    metrics["WER"] = wer(reference, prediction)
    metrics["CER"] = cer(reference, prediction)
    return metrics
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready.

    Reports model/dataset load status plus a live GPU snapshot; used by
    the container HEALTHCHECK and by external tuning drivers.
    """
    gpu_info = get_gpu_info()
    return HealthResponse(
        status="ok" if state.model is not None else "initializing",
        model_loaded=state.model is not None,
        dataset_loaded=state.dataset is not None,
        dataset_size=len(state.dataset) if state.dataset else None,
        det_arch=state.det_arch,
        reco_arch=state.reco_arch,
        cuda_available=gpu_info.get("cuda_available"),
        device=gpu_info.get("device"),
        gpu_name=gpu_info.get("gpu_name"),
        gpu_memory_used=gpu_info.get("gpu_memory_used"),
        gpu_memory_total=gpu_info.get("gpu_memory_total"),
    )
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
    """
    Evaluate OCR with given hyperparameters.
    Returns CER, WER, and timing metrics.
    Note: Model will be reinitialized if processing flags change.
    """
    if state.model is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")
    # Load or reload dataset if path changed
    if state.dataset is None or state.dataset_path != request.pdf_folder:
        if not os.path.isdir(request.pdf_folder):
            raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
        state.dataset = ImageTextDataset(request.pdf_folder)
        state.dataset_path = request.pdf_folder
    if len(state.dataset) == 0:
        raise HTTPException(status_code=400, detail="Dataset is empty")
    # Check if model needs to be reinitialized (these flags are baked into
    # the predictor at construction time, so a rebuild is required).
    new_config = {
        "assume_straight_pages": request.assume_straight_pages,
        "straighten_pages": request.straighten_pages,
        "preserve_aspect_ratio": request.preserve_aspect_ratio,
        "symmetric_pad": request.symmetric_pad,
        "disable_page_orientation": request.disable_page_orientation,
        "disable_crop_orientation": request.disable_crop_orientation,
    }
    model_reinitialized = False
    if state.current_config != new_config:
        print(f"Model config changed, reinitializing...")
        state.model = create_model(**new_config)
        state.current_config = new_config
        model_reinitialized = True
    # Validate page range (end is clamped to the dataset size)
    start = request.start_page
    end = min(request.end_page, len(state.dataset))
    if start >= end:
        raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()
    for idx in range(start, end):
        img, ref = state.dataset[idx]
        arr = np.array(img)
        tp0 = time.time()
        # DocTR expects a list of images
        result = state.model([arr])
        pred = doctr_result_to_text(
            result,
            resolve_lines=request.resolve_lines,
            resolve_blocks=request.resolve_blocks,
        )
        # Per-page timing covers OCR + text assembly, not metric computation.
        time_per_page_list.append(float(time.time() - tp0))
        m = evaluate_text(ref, pred)
        cer_list.append(m["CER"])
        wer_list.append(m["WER"])
    # Empty metric lists can only happen defensively; 1.0 = worst-case error.
    return EvaluateResponse(
        CER=float(np.mean(cer_list)) if cer_list else 1.0,
        WER=float(np.mean(wer_list)) if wer_list else 1.0,
        TIME=float(time.time() - t0),
        PAGES=len(cer_list),
        TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
        model_reinitialized=model_reinitialized,
    )
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
    """Evaluate on ALL pages (ignores start_page/end_page)."""
    # 9999 is a sentinel; evaluate() clamps end_page to the dataset length.
    request.start_page, request.end_page = 0, 9999
    return evaluate(request)
if __name__ == "__main__":
    # Allow running the service directly without the uvicorn CLI.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,8 @@
python-doctr[torch]>=0.8.0
fastapi>=0.104.0
uvicorn>=0.24.0
pydantic>=2.0.0
jiwer>=3.0.0
numpy>=1.24.0
pillow>=10.0.0
torch>=2.0.0

View File

@@ -0,0 +1,48 @@
# Dockerfile - EasyOCR Tuning REST API
#
# Build:
# docker build -t easyocr-api:latest .
#
# Run:
# docker run -p 8002:8000 -v ./dataset:/app/dataset easyocr-api:latest
FROM python:3.11-slim
LABEL maintainer="Sergio Jimenez"
LABEL description="EasyOCR Tuning REST API"
WORKDIR /app
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV EASYOCR_LANGUAGES=es,en
# Install system dependencies for OpenCV and image processing
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender1 \
&& rm -rf /var/lib/apt/lists/*
# Copy and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY easyocr_tuning_rest.py .
COPY dataset_manager.py .
# Volume for dataset and model cache
VOLUME ["/app/dataset", "/root/.EasyOCR"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "easyocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,45 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
    """Pairs page images with their ground-truth transcription files.

    Expects ``root`` to contain subfolders, each holding an ``img/`` and a
    ``txt/`` directory; an image ``name.png`` is matched with ``txt/name.txt``.
    Images without a matching text file are skipped silently.
    """

    _IMAGE_EXTS = (".png", ".jpg", ".jpeg")

    def __init__(self, root):
        # (image_path, text_path) pairs in deterministic (sorted) order.
        self.samples = []
        for folder in sorted(os.listdir(root)):
            img_dir = os.path.join(root, folder, "img")
            txt_dir = os.path.join(root, folder, "txt")
            # Skip subfolders that do not follow the img/ + txt/ layout.
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue
            for fname in sorted(os.listdir(img_dir)):
                if not fname.lower().endswith(self._IMAGE_EXTS):
                    continue
                stem, _ = os.path.splitext(fname)
                txt_path = os.path.join(txt_dir, stem + ".txt")
                if os.path.exists(txt_path):
                    self.samples.append((os.path.join(img_dir, fname), txt_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path = self.samples[idx]
        # Ground truth is read as UTF-8 text; image normalized to RGB.
        with open(txt_path, "r", encoding="utf-8") as handle:
            text = handle.read()
        return Image.open(img_path).convert("RGB"), text

View File

@@ -0,0 +1,320 @@
# easyocr_tuning_rest.py
# FastAPI REST service for EasyOCR hyperparameter evaluation
# Usage: uvicorn easyocr_tuning_rest:app --host 0.0.0.0 --port 8000
import os
import re
import time
from typing import Optional, List
from contextlib import asynccontextmanager
import numpy as np
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import easyocr
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
def get_gpu_info() -> dict:
    """Report CUDA availability and basic GPU memory stats via PyTorch.

    Always returns the same set of keys; on CUDA probe failure an extra
    ``gpu_error`` key is added with the exception text.
    """
    cuda = torch.cuda.is_available()
    info = {
        "cuda_available": cuda,
        "device": "cuda" if cuda else "cpu",
        "gpu_count": 0,
        "gpu_name": None,
        "gpu_memory_total": None,
        "gpu_memory_used": None,
    }
    if not cuda:
        return info
    try:
        count = torch.cuda.device_count()
        info["gpu_count"] = count
        if count > 0:
            props = torch.cuda.get_device_properties(0)
            info["gpu_name"] = torch.cuda.get_device_name(0)
            info["gpu_memory_total"] = f"{props.total_memory / (1024**3):.2f} GB"
            info["gpu_memory_used"] = f"{torch.cuda.memory_allocated(0) / (1024**3):.2f} GB"
    except Exception as e:
        # Keep the endpoint alive even if the CUDA runtime misbehaves.
        info["gpu_error"] = str(e)
    return info
# Model configuration via environment variables
DEFAULT_LANGUAGES = os.environ.get("EASYOCR_LANGUAGES", "es,en").split(",")
# Global state for model and dataset
class AppState:
    """Process-wide mutable state shared by all request handlers."""

    reader: Optional[easyocr.Reader] = None        # set once in lifespan()
    dataset: Optional[ImageTextDataset] = None     # lazily loaded on first /evaluate
    dataset_path: Optional[str] = None             # folder the dataset was loaded from
    languages: List[str] = DEFAULT_LANGUAGES       # languages passed to easyocr.Reader
# Single shared instance used by every endpoint.
state = AppState()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load EasyOCR model at startup.

    Prints GPU diagnostics, constructs the Reader (downloading weights on
    first run), then releases the reader and dataset on shutdown.
    """
    gpu_info = get_gpu_info()
    print("=" * 50)
    print("GPU STATUS")
    print("=" * 50)
    print(f" CUDA available: {gpu_info['cuda_available']}")
    print(f" Device: {gpu_info['device']}")
    if gpu_info['cuda_available']:
        print(f" GPU count: {gpu_info['gpu_count']}")
        print(f" GPU name: {gpu_info['gpu_name']}")
        print(f" GPU memory total: {gpu_info['gpu_memory_total']}")
    print("=" * 50)
    print(f"Loading EasyOCR models...")
    print(f" Languages: {state.languages}")
    # gpu=True only when CUDA is actually available; EasyOCR falls back to CPU otherwise.
    state.reader = easyocr.Reader(
        state.languages,
        gpu=gpu_info['cuda_available'],
    )
    if gpu_info['cuda_available']:
        gpu_after = get_gpu_info()
        print(f" GPU memory after load: {gpu_after.get('gpu_memory_used', 'N/A')}")
    print("Model loaded successfully!")
    yield
    # Shutdown: drop references so memory can be reclaimed.
    state.reader = None
    state.dataset = None
app = FastAPI(
title="EasyOCR Tuning API",
description="REST API for EasyOCR hyperparameter evaluation",
version="1.0.0",
lifespan=lifespan,
)
class EvaluateRequest(BaseModel):
    """Request schema with all tunable EasyOCR hyperparameters.

    Default start/end pages (5-10) sample a small slice of the dataset
    for quick tuning iterations; /evaluate_full overrides them.
    """
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
    # Detection thresholds (CRAFT algorithm)
    text_threshold: float = Field(0.7, ge=0.0, le=1.0, description="Text confidence threshold")
    low_text: float = Field(0.4, ge=0.0, le=1.0, description="Text lower-bound score")
    link_threshold: float = Field(0.4, ge=0.0, le=1.0, description="Link confidence threshold")
    # Bounding box merging
    slope_ths: float = Field(0.1, ge=0.0, le=1.0, description="Maximum slope for box merging")
    ycenter_ths: float = Field(0.5, ge=0.0, le=2.0, description="Maximum vertical shift for merging")
    height_ths: float = Field(0.5, ge=0.0, le=2.0, description="Maximum height variance for merging")
    width_ths: float = Field(0.5, ge=0.0, le=2.0, description="Maximum horizontal distance for merging")
    add_margin: float = Field(0.1, ge=0.0, le=1.0, description="Bounding box extension margin")
    # Contrast handling
    contrast_ths: float = Field(0.1, ge=0.0, le=1.0, description="Contrast threshold for dual-pass")
    adjust_contrast: float = Field(0.5, ge=0.0, le=1.0, description="Target contrast adjustment level")
    # Decoder options
    decoder: str = Field("greedy", description="Decoder type: greedy, beamsearch, wordbeamsearch")
    # camelCase kept deliberately: it mirrors EasyOCR's readtext() keyword argument.
    beamWidth: int = Field(5, ge=1, le=20, description="Beam width for beam search decoders")
    # Other
    min_size: int = Field(10, ge=1, description="Minimum text box size in pixels")
    rotation_info: Optional[List[int]] = Field(None, description="Rotation angles to try: [90, 180, 270]")
    # Page range
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output.

    Field names are uppercase on purpose to match the CLI tool's keys.
    """
    CER: float            # mean character error rate over evaluated pages
    WER: float            # mean word error rate over evaluated pages
    TIME: float           # total wall-clock seconds for the whole run
    PAGES: int            # number of pages actually evaluated
    TIME_PER_PAGE: float  # mean OCR seconds per page (excludes metric calc)
class HealthResponse(BaseModel):
    """Readiness/health payload; GPU fields mirror get_gpu_info()."""
    status: str                           # "ok" once the reader is loaded, else "initializing"
    model_loaded: bool
    dataset_loaded: bool
    dataset_size: Optional[int] = None    # None until a dataset has been loaded
    languages: Optional[List[str]] = None
    cuda_available: Optional[bool] = None
    device: Optional[str] = None
    gpu_name: Optional[str] = None
    gpu_memory_used: Optional[str] = None
    gpu_memory_total: Optional[str] = None
def assemble_easyocr_result(result: list) -> str:
    """
    Reconstruct reading-order text from EasyOCR detections.

    EasyOCR returns ``[(bbox, text, confidence), ...]`` where bbox is
    ``[[x1,y1],[x2,y2],[x3,y3],[x4,y4]]``. Detections are sorted
    top-to-bottom, grouped into lines with an adaptive vertical
    tolerance, each line read left-to-right, and the result collapsed
    to single-space-separated text.
    """
    if not result:
        return ""

    def y_center(det):
        box = det[0]
        # Average of top-left and bottom-right y coordinates.
        return (box[0][1] + box[2][1]) / 2

    def x_left(det):
        return det[0][0][0]

    ordered = sorted(result, key=lambda d: (y_center(d), x_left(d)))
    if not ordered:
        return ""

    # Line tolerance adapts to the median box height (floor of 8 px).
    box_heights = [abs(d[0][2][1] - d[0][0][1]) for d in ordered]
    median_height = float(np.median(box_heights)) if box_heights else 20.0
    tolerance = max(8.0, 0.6 * median_height)

    assembled, current, prev_y = [], [], None
    for det in ordered:
        yc = y_center(det)
        if prev_y is None or abs(yc - prev_y) <= tolerance:
            current.append((x_left(det), det[1]))
        else:
            # Flush the finished line in left-to-right order.
            current.sort(key=lambda pair: pair[0])
            assembled.append(" ".join(word for _, word in current))
            current = [(x_left(det), det[1])]
        prev_y = yc
    if current:
        current.sort(key=lambda pair: pair[0])
        assembled.append(" ".join(word for _, word in current))

    return re.sub(r"\s+", " ", " ".join(assembled)).strip()
def evaluate_text(reference: str, prediction: str) -> dict:
    """Return jiwer word- and character-error rates under "WER"/"CER" keys."""
    metrics = {}
    metrics["WER"] = wer(reference, prediction)
    metrics["CER"] = cer(reference, prediction)
    return metrics
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready.

    Reports reader/dataset load status plus a live GPU snapshot; used by
    the container HEALTHCHECK and by external tuning drivers.
    """
    gpu_info = get_gpu_info()
    return HealthResponse(
        status="ok" if state.reader is not None else "initializing",
        model_loaded=state.reader is not None,
        dataset_loaded=state.dataset is not None,
        dataset_size=len(state.dataset) if state.dataset else None,
        languages=state.languages,
        cuda_available=gpu_info.get("cuda_available"),
        device=gpu_info.get("device"),
        gpu_name=gpu_info.get("gpu_name"),
        gpu_memory_used=gpu_info.get("gpu_memory_used"),
        gpu_memory_total=gpu_info.get("gpu_memory_total"),
    )
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
    """
    Evaluate OCR with given hyperparameters.
    Returns CER, WER, and timing metrics.
    """
    if state.reader is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")
    # Validate decoder (readtext would fail later with a less clear error)
    if request.decoder not in ["greedy", "beamsearch", "wordbeamsearch"]:
        raise HTTPException(status_code=400, detail=f"Invalid decoder: {request.decoder}")
    # Load or reload dataset if path changed
    if state.dataset is None or state.dataset_path != request.pdf_folder:
        if not os.path.isdir(request.pdf_folder):
            raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
        state.dataset = ImageTextDataset(request.pdf_folder)
        state.dataset_path = request.pdf_folder
    if len(state.dataset) == 0:
        raise HTTPException(status_code=400, detail="Dataset is empty")
    # Validate page range (end is clamped to the dataset size)
    start = request.start_page
    end = min(request.end_page, len(state.dataset))
    if start >= end:
        raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()
    for idx in range(start, end):
        img, ref = state.dataset[idx]
        arr = np.array(img)
        tp0 = time.time()
        # All tunable hyperparameters are forwarded straight to readtext().
        result = state.reader.readtext(
            arr,
            # Detection thresholds
            text_threshold=request.text_threshold,
            low_text=request.low_text,
            link_threshold=request.link_threshold,
            # Bounding box merging
            slope_ths=request.slope_ths,
            ycenter_ths=request.ycenter_ths,
            height_ths=request.height_ths,
            width_ths=request.width_ths,
            add_margin=request.add_margin,
            # Contrast
            contrast_ths=request.contrast_ths,
            adjust_contrast=request.adjust_contrast,
            # Decoder
            decoder=request.decoder,
            beamWidth=request.beamWidth,
            # Other
            min_size=request.min_size,
            rotation_info=request.rotation_info,
        )
        pred = assemble_easyocr_result(result)
        # Per-page timing covers OCR + text assembly, not metric computation.
        time_per_page_list.append(float(time.time() - tp0))
        m = evaluate_text(ref, pred)
        cer_list.append(m["CER"])
        wer_list.append(m["WER"])
    # Empty metric lists can only happen defensively; 1.0 = worst-case error.
    return EvaluateResponse(
        CER=float(np.mean(cer_list)) if cer_list else 1.0,
        WER=float(np.mean(wer_list)) if wer_list else 1.0,
        TIME=float(time.time() - t0),
        PAGES=len(cer_list),
        TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
    )
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
    """Evaluate on ALL pages (ignores start_page/end_page)."""
    # 9999 is a sentinel; evaluate() clamps end_page to the dataset length.
    request.start_page, request.end_page = 0, 9999
    return evaluate(request)
if __name__ == "__main__":
    # Allow running the service directly without the uvicorn CLI.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,8 @@
easyocr>=1.7.0
fastapi>=0.104.0
uvicorn>=0.24.0
pydantic>=2.0.0
jiwer>=3.0.0
numpy>=1.24.0
pillow>=10.0.0
torch>=2.0.0

View File

@@ -1,207 +0,0 @@
# benchmark.py - Compare CPU vs GPU performance for PaddleOCR REST API
# Usage: python benchmark.py
import requests
import time
import json
import sys
from datetime import datetime
# Containers under test; ports must match the docker-compose mappings.
CONTAINERS = {
    "GPU": {"url": "http://localhost:8000", "port": 8000},
    "CPU": {"url": "http://localhost:8002", "port": 8002},
}
# Path as seen INSIDE the containers (mounted volume), not on the host.
DATASET_PATH = "/app/dataset"
# Test configurations
TEST_CONFIGS = [
    {
        "name": "Baseline",
        "config": {
            "pdf_folder": DATASET_PATH,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "textline_orientation": False,
            "text_det_thresh": 0.0,
            "text_det_box_thresh": 0.0,
            "text_det_unclip_ratio": 1.5,
            "text_rec_score_thresh": 0.0,
            "start_page": 5,
            "end_page": 10,
        }
    },
    {
        # Threshold values presumably found by a prior hyperparameter search — confirm source.
        "name": "Optimized",
        "config": {
            "pdf_folder": DATASET_PATH,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "textline_orientation": True,
            "text_det_thresh": 0.4690,
            "text_det_box_thresh": 0.5412,
            "text_det_unclip_ratio": 0.0,
            "text_rec_score_thresh": 0.6350,
            "start_page": 5,
            "end_page": 10,
        }
    },
]
def check_health(url: str, timeout: int = 10) -> bool:
    """Check if API is healthy.

    Returns True only when GET {url}/health responds 200 AND reports
    model_loaded=True. Any network error is logged and treated as
    unhealthy (falls through to the final return False).
    """
    try:
        resp = requests.get(f"{url}/health", timeout=timeout)
        if resp.status_code == 200:
            data = resp.json()
            return data.get("model_loaded", False)
    except Exception as e:
        print(f" Health check failed: {e}")
    return False
def run_benchmark(url: str, config: dict, warmup: bool = False) -> dict:
    """POST the config to {url}/evaluate and return the parsed JSON result.

    Adds "total_request_time" (wall-clock seconds including network
    overhead) to the returned dict. Raises on HTTP errors.
    NOTE(review): `warmup` is currently unused inside this function —
    callers pass it only to document intent; confirm before removing.
    """
    eval_url = f"{url}/evaluate"
    start = time.time()
    # Generous timeout: a full evaluation run can take several minutes.
    resp = requests.post(eval_url, json=config, timeout=600)
    resp.raise_for_status()
    total_time = time.time() - start
    result = resp.json()
    result["total_request_time"] = total_time
    return result
def main():
    """Benchmark each container with every test config and print a summary.

    Flow: health-check both containers, run a 1-page warmup per container,
    run all TEST_CONFIGS, print per-test and speedup tables, and persist
    everything to benchmark_results.json.
    """
    results = {
        "timestamp": datetime.now().isoformat(),
        "containers": {},
    }
    print("=" * 60)
    print("PaddleOCR CPU vs GPU Benchmark")
    print("=" * 60)
    print()
    # Check container health
    print("Checking container health...")
    for name, info in CONTAINERS.items():
        healthy = check_health(info["url"])
        status = "✓ Ready" if healthy else "✗ Not Ready"
        print(f" {name} ({info['url']}): {status}")
        if not healthy:
            print(f" Skipping {name} - container not available")
            continue
    print()
    # Run benchmarks for each container
    for container_name, container_info in CONTAINERS.items():
        url = container_info["url"]
        # Re-checked here because the loop above only prints status.
        if not check_health(url):
            print(f"Skipping {container_name} - not healthy")
            continue
        print("=" * 60)
        print(f"Testing: {container_name} Container")
        print(f"URL: {url}")
        print("=" * 60)
        container_results = {
            "url": url,
            "tests": {},
        }
        # Warmup run (first run often slower due to model loading/caching)
        print("\n Warmup run...")
        try:
            warmup_config = TEST_CONFIGS[0]["config"].copy()
            warmup_config["start_page"] = 5
            warmup_config["end_page"] = 6  # Just 1 page for warmup
            run_benchmark(url, warmup_config, warmup=True)
            print(" Warmup complete.")
        except Exception as e:
            # Warmup failures are non-fatal; real runs still proceed.
            print(f" Warmup failed: {e}")
        # Run each test configuration
        for test in TEST_CONFIGS:
            test_name = test["name"]
            config = test["config"]
            print(f"\n Running: {test_name} Configuration")
            print(f" Pages: {config['start_page']} to {config['end_page']}")
            try:
                result = run_benchmark(url, config)
                container_results["tests"][test_name] = {
                    "CER": result["CER"],
                    "WER": result["WER"],
                    "PAGES": result["PAGES"],
                    "TIME_PER_PAGE": result["TIME_PER_PAGE"],
                    "TOTAL_TIME": result["total_request_time"],
                }
                print(f" CER: {result['CER']*100:.2f}%")
                print(f" WER: {result['WER']*100:.2f}%")
                print(f" Pages: {result['PAGES']}")
                print(f" Time/page: {result['TIME_PER_PAGE']:.3f}s")
                print(f" Total time: {result['total_request_time']:.2f}s")
            except Exception as e:
                # Record the failure so the summary/speedup sections can skip it.
                print(f" ERROR: {e}")
                container_results["tests"][test_name] = {"error": str(e)}
        results["containers"][container_name] = container_results
    # Print summary
    print("\n")
    print("=" * 60)
    print("BENCHMARK SUMMARY")
    print("=" * 60)
    # Table header
    print(f"\n{'Test':<12} {'Container':<8} {'CER %':<10} {'WER %':<10} {'Time/Page':<12} {'Total (s)':<10}")
    print("-" * 62)
    for test in TEST_CONFIGS:
        test_name = test["name"]
        for container_name in CONTAINERS.keys():
            if container_name in results["containers"]:
                tests = results["containers"][container_name].get("tests", {})
                if test_name in tests and "error" not in tests[test_name]:
                    t = tests[test_name]
                    print(f"{test_name:<12} {container_name:<8} {t['CER']*100:<10.2f} {t['WER']*100:<10.2f} {t['TIME_PER_PAGE']:<12.3f} {t['TOTAL_TIME']:<10.2f}")
    # Speed comparison
    print("\n" + "=" * 60)
    print("SPEED COMPARISON")
    print("=" * 60)
    for test in TEST_CONFIGS:
        test_name = test["name"]
        # Chained .get() calls yield {} when either container failed or was skipped.
        gpu_data = results["containers"].get("GPU", {}).get("tests", {}).get(test_name, {})
        cpu_data = results["containers"].get("CPU", {}).get("tests", {}).get(test_name, {})
        if gpu_data and cpu_data and "error" not in gpu_data and "error" not in cpu_data:
            speedup = cpu_data["TIME_PER_PAGE"] / gpu_data["TIME_PER_PAGE"]
            print(f"\n{test_name} Configuration:")
            print(f" GPU: {gpu_data['TIME_PER_PAGE']:.3f}s per page")
            print(f" CPU: {cpu_data['TIME_PER_PAGE']:.3f}s per page")
            print(f" GPU is {speedup:.2f}x faster than CPU")
    # Save results to JSON
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n\nResults saved to: {output_file}")
    return results
if __name__ == "__main__":
    # Run the benchmark when executed as a script.
    main()

View File

@@ -3,7 +3,7 @@
# CPU: docker compose up ocr-cpu
# GPU: docker compose up ocr-gpu
# Test: docker compose run --rm test
# Build: CUDA_ARCH=90 docker compose --profile build run --rm build-paddle
# Build: CUDA_ARCH=120 docker compose --profile build run --rm build-paddle
#
# Auto-detect CUDA arch before building:
# export CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
@@ -12,13 +12,13 @@
services:
# PaddlePaddle GPU wheel builder (ARM64 only, one-time build)
# Creates ./wheels/paddlepaddle_gpu-*.whl for ARM64 GPU support
# CUDA_ARCH env var controls target GPU architecture (default: 90 for Hopper)
# CUDA_ARCH env var controls target GPU architecture (default: 120 for Blackwell base)
build-paddle:
build:
context: .
dockerfile: Dockerfile.build-paddle
args:
CUDA_ARCH: ${CUDA_ARCH:-90}
CUDA_ARCH: ${CUDA_ARCH:-120}
volumes:
- ./wheels:/wheels
profiles:

View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Debug script for GPU OCR detection issues.
This script tests the raw inference output from PaddlePaddle detection models
to diagnose why detection might fail on certain GPU architectures (e.g., Blackwell/sm_121).
Usage:
docker exec paddle-ocr-gpu python /app/debug_gpu_detection.py [image_path]
Expected behavior:
- Working GPU: Output stats should show min close to 0, max close to 1, mean ~0.1-0.5
- Broken GPU: Output stats show constant values (e.g., min=max=mean=0.00001)
"""
import os
import sys
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
import numpy as np
import paddle
from PIL import Image
def check_gpu_status():
    """Print CUDA availability and, if present, the first GPU's properties."""
    banner = "=" * 60
    print(banner)
    print("GPU STATUS")
    print(banner)
    print(f"Device: {paddle.device.get_device()}")
    cuda_compiled = paddle.device.is_compiled_with_cuda()
    print(f"CUDA compiled: {cuda_compiled}")
    if cuda_compiled:
        n_gpus = paddle.device.cuda.device_count()
        print(f"GPU count: {n_gpus}")
        if n_gpus > 0:
            # Report hardware details of the first visible device only.
            props = paddle.device.cuda.get_device_properties(0)
            print(f"GPU name: {props.name}")
            print(f"Compute capability: {props.major}.{props.minor}")
            print(f"Total memory: {props.total_memory / (1024**3):.2f} GB")
    print()
def test_basic_ops():
    """Smoke-test tensor creation, Conv2D, and softmax on the default device."""
    banner = "=" * 60
    print(banner)
    print("BASIC GPU OPERATIONS")
    print(banner)
    # Plain tensor allocation — confirms which device tensors land on.
    sample = paddle.randn([2, 3])
    print(f"Tensor place: {sample.place}")
    # A small convolution exercises the cuDNN/conv kernels.
    image_batch = paddle.randn([1, 3, 64, 64])
    conv_layer = paddle.nn.Conv2D(3, 16, 3, padding=1)
    features = conv_layer(image_batch)
    print(f"Conv2d output shape: {features.shape}, place: {features.place}")
    # Softmax over the channel axis exercises elementwise/reduction kernels.
    probs = paddle.nn.functional.softmax(features, axis=1)
    print(f"Softmax output shape: {probs.shape}")
    print("Basic operations: OK")
    print()
def test_detection_model(image_path: str):
    """Run the raw PP-OCRv4 detection model on one image and analyze the
    output tensor's statistics to diagnose broken GPU inference.

    A healthy run shows min near 0, max near 1; a constant output (min ==
    max) indicates the GPU kernels produced garbage (e.g. compute
    capability mismatch).

    Args:
        image_path: Path to an input image readable by PIL.
    """
    print("=" * 60)
    print("DETECTION MODEL TEST")
    print("=" * 60)
    from paddle.inference import Config, create_predictor
    model_dir = '/root/.paddlex/official_models/PP-OCRv4_mobile_det'
    inference_file = f'{model_dir}/inference.json'
    params_file = f'{model_dir}/inference.pdiparams'
    if not os.path.exists(inference_file):
        print(f"Model not found at {model_dir}")
        print("Run PaddleOCR once to download models first.")
        return
    # Create the inference config; enable_use_gpu(1024, 0) = 1024 MB initial
    # GPU memory pool on device 0.
    config = Config()
    config.set_prog_file(inference_file)
    config.set_params_file(params_file)
    config.enable_use_gpu(1024, 0)
    print("Creating predictor...")
    predictor = create_predictor(config)
    # Get input/output tensor names declared by the exported model.
    input_names = predictor.get_input_names()
    output_names = predictor.get_output_names()
    print(f"Input names: {input_names}")
    print(f"Output names: {output_names}")
    # Load and preprocess image.
    # BUGFIX: force 3-channel RGB. A grayscale PNG decodes to a 2-D array,
    # which makes transpose(2, 0, 1) raise, and an RGBA PNG would feed
    # 4 channels into a model expecting 3.
    img = Image.open(image_path).convert('RGB')
    img = img.resize((640, 640))
    arr = np.array(img).astype('float32')
    arr = arr / 255.0
    arr = arr.transpose(2, 0, 1)[np.newaxis, ...]  # HWC -> NCHW with batch dim
    print(f"Input tensor shape: {arr.shape}")
    # Copy the host array into the model's input tensor.
    input_handle = predictor.get_input_handle(input_names[0])
    input_handle.reshape(arr.shape)
    input_handle.copy_from_cpu(arr)
    # Run prediction
    print("Running inference...")
    predictor.run()
    # Fetch the probability map back to the host for analysis.
    output_handle = predictor.get_output_handle(output_names[0])
    output = output_handle.copy_to_cpu()
    print()
    print("OUTPUT ANALYSIS:")
    print(f"  Shape: {output.shape}")
    print(f"  Min: {output.min():.6f}")
    print(f"  Max: {output.max():.6f}")
    print(f"  Mean: {output.mean():.6f}")
    print(f"  Std: {output.std():.6f}")
    print(f"  Has NaN: {np.isnan(output).any()}")
    print(f"  Has Inf: {np.isinf(output).any()}")
    # Diagnosis: classify the output stats into known failure modes.
    print()
    print("DIAGNOSIS:")
    if output.min() == output.max():
        print("  PROBLEM: Output is constant - model inference is broken!")
        print("  This typically indicates GPU compute capability mismatch.")
        print("  GB10 (sm_121) may need CUDA 13.0+ for native support.")
    elif output.max() < 0.01:
        print("  PROBLEM: Output values too low - detection will find nothing.")
    elif np.isnan(output).any() or np.isinf(output).any():
        print("  PROBLEM: Output contains NaN/Inf - numerical instability.")
    else:
        print("  OK: Output values look reasonable.")
        print(f"  Detection threshold typically 0.3-0.6, max output is {output.max():.3f}")
def test_paddleocr_output(image_path: str):
    """Run the full PaddleOCR pipeline on one image and report what it found.

    Args:
        image_path: Path to an input image readable by PIL.
    """
    print()
    print("=" * 60)
    print("PADDLEOCR PIPELINE TEST")
    print("=" * 60)
    from paddleocr import PaddleOCR
    # Use the same mobile det/rec models as the raw-inference test above.
    ocr = PaddleOCR(
        text_detection_model_name='PP-OCRv4_mobile_det',
        text_recognition_model_name='PP-OCRv4_mobile_rec',
    )
    pixels = np.array(Image.open(image_path))
    predictions = ocr.predict(pixels)
    page_result = predictions[0].json['res']
    dt_polys = page_result.get('dt_polys', [])
    rec_texts = page_result.get('rec_texts', [])
    print(f"Detection polygons: {len(dt_polys)}")
    print(f"Recognition texts: {len(rec_texts)}")
    if not rec_texts:
        print("No text detected!")
    else:
        print(f"Sample texts: {rec_texts[:5]}")
def main():
    """Resolve the test image path, then run all GPU diagnostics in order."""
    # A CLI argument overrides the default dataset page.
    if len(sys.argv) > 1:
        image_path = sys.argv[1]
    else:
        image_path = '/app/dataset/0/img/page_0001.png'
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        print("Usage: python debug_gpu_detection.py [image_path]")
        sys.exit(1)
    print(f"Testing with image: {image_path}")
    print()
    # Cheapest checks first so a hard GPU failure surfaces early.
    check_gpu_status()
    test_basic_ops()
    test_detection_model(image_path)
    test_paddleocr_output(image_path)
# Script entry point: run all GPU OCR diagnostics.
if __name__ == '__main__':
    main()

View File

@@ -56,7 +56,7 @@ def test_evaluate(url: str, config: dict) -> dict:
def main():
parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
parser.add_argument("--url", default="http://localhost:8000", help="API base URL")
parser.add_argument("--url", default="http://localhost:8001", help="API base URL")
parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
args = parser.parse_args()