easyocr doctr
Some checks failed
build_docker / build_easyocr (linux/amd64) (push) Has been cancelled
build_docker / build_easyocr (linux/arm64) (push) Has been cancelled
build_docker / build_doctr (linux/amd64) (push) Has been cancelled
build_docker / essential (push) Successful in 1s
build_docker / essential (pull_request) Successful in 1s
build_docker / build_gpu (linux/amd64) (push) Has been cancelled
build_docker / build_gpu (linux/arm64) (push) Has been cancelled
build_docker / manifest_cpu (push) Has been cancelled
build_docker / manifest_gpu (push) Has been cancelled
build_docker / build_cpu (linux/amd64) (push) Has been cancelled
build_docker / build_doctr (linux/arm64) (push) Has been cancelled
build_docker / manifest_easyocr (push) Has been cancelled
build_docker / manifest_doctr (push) Has been cancelled
build_docker / build_cpu (linux/arm64) (push) Has been cancelled
build_docker / build_cpu (linux/amd64) (pull_request) Successful in 4m56s
build_docker / build_gpu (linux/amd64) (pull_request) Has been cancelled
build_docker / build_gpu (linux/arm64) (pull_request) Has been cancelled
build_docker / manifest_cpu (pull_request) Has been cancelled
build_docker / manifest_gpu (pull_request) Has been cancelled
build_docker / build_easyocr (linux/amd64) (pull_request) Has been cancelled
build_docker / build_easyocr (linux/arm64) (pull_request) Has been cancelled
build_docker / build_doctr (linux/amd64) (pull_request) Has been cancelled
build_docker / build_doctr (linux/arm64) (pull_request) Has been cancelled
build_docker / manifest_easyocr (pull_request) Has been cancelled
build_docker / manifest_doctr (pull_request) Has been cancelled
build_docker / build_cpu (linux/arm64) (pull_request) Has been cancelled

This commit is contained in:
2026-01-18 06:47:01 +01:00
parent 38ba2d1f5a
commit 578689443d
14 changed files with 1473 additions and 211 deletions

View File

@@ -0,0 +1,49 @@
# Dockerfile - DocTR Tuning REST API
#
# Build:
# docker build -t doctr-api:latest .
#
# Run:
# docker run -p 8003:8000 -v ./dataset:/app/dataset doctr-api:latest
FROM python:3.11-slim
LABEL maintainer="Sergio Jimenez"
LABEL description="DocTR Tuning REST API"
WORKDIR /app
# Set environment variables
ENV PYTHONUNBUFFERED=1
# Default model architectures; override at run time with `docker run -e DOCTR_DET_ARCH=...`
ENV DOCTR_DET_ARCH=db_resnet50
ENV DOCTR_RECO_ARCH=crnn_vgg16_bn
# Install system dependencies for OpenCV and image processing
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    && rm -rf /var/lib/apt/lists/*
# Copy and install Python dependencies (separate layer so code changes don't re-install deps)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY doctr_tuning_rest.py .
COPY dataset_manager.py .
# Volume for dataset and model cache
# /root/.cache/doctr keeps downloaded pretrained weights across container restarts
VOLUME ["/app/dataset", "/root/.cache/doctr"]
# Expose API port
EXPOSE 8000
# Health check (longer start period for model download)
HEALTHCHECK --interval=30s --timeout=10s --start-period=180s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "doctr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,45 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
    """Pairs page images with their ground-truth transcription files.

    Expects ``root`` to contain subfolders, each holding an ``img/`` and a
    ``txt/`` directory; an image ``name.png`` is matched with ``txt/name.txt``.
    Images without a matching text file are skipped silently.
    """

    _IMAGE_EXTS = (".png", ".jpg", ".jpeg")

    def __init__(self, root):
        # (image_path, text_path) pairs in deterministic (sorted) order.
        self.samples = []
        for folder in sorted(os.listdir(root)):
            img_dir = os.path.join(root, folder, "img")
            txt_dir = os.path.join(root, folder, "txt")
            # Skip subfolders that do not follow the img/ + txt/ layout.
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue
            for fname in sorted(os.listdir(img_dir)):
                if not fname.lower().endswith(self._IMAGE_EXTS):
                    continue
                stem, _ = os.path.splitext(fname)
                txt_path = os.path.join(txt_dir, stem + ".txt")
                if os.path.exists(txt_path):
                    self.samples.append((os.path.join(img_dir, fname), txt_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path = self.samples[idx]
        # Ground truth is read as UTF-8 text; image normalized to RGB.
        with open(txt_path, "r", encoding="utf-8") as handle:
            text = handle.read()
        return Image.open(img_path).convert("RGB"), text

View File

@@ -0,0 +1,322 @@
# doctr_tuning_rest.py
# FastAPI REST service for DocTR hyperparameter evaluation
# Usage: uvicorn doctr_tuning_rest:app --host 0.0.0.0 --port 8000
import os
import re
import time
from typing import Optional
from contextlib import asynccontextmanager
import numpy as np
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from doctr.models import ocr_predictor
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
def get_gpu_info() -> dict:
    """Report CUDA availability and basic GPU memory stats via PyTorch.

    Always returns the same set of keys; on CUDA probe failure an extra
    ``gpu_error`` key is added with the exception text.
    """
    cuda = torch.cuda.is_available()
    info = {
        "cuda_available": cuda,
        "device": "cuda" if cuda else "cpu",
        "gpu_count": 0,
        "gpu_name": None,
        "gpu_memory_total": None,
        "gpu_memory_used": None,
    }
    if not cuda:
        return info
    try:
        count = torch.cuda.device_count()
        info["gpu_count"] = count
        if count > 0:
            props = torch.cuda.get_device_properties(0)
            info["gpu_name"] = torch.cuda.get_device_name(0)
            info["gpu_memory_total"] = f"{props.total_memory / (1024**3):.2f} GB"
            info["gpu_memory_used"] = f"{torch.cuda.memory_allocated(0) / (1024**3):.2f} GB"
    except Exception as e:
        # Keep the endpoint alive even if the CUDA runtime misbehaves.
        info["gpu_error"] = str(e)
    return info
# Model configuration via environment variables
DEFAULT_DET_ARCH = os.environ.get("DOCTR_DET_ARCH", "db_resnet50")
DEFAULT_RECO_ARCH = os.environ.get("DOCTR_RECO_ARCH", "crnn_vgg16_bn")
# Global state for model and dataset
class AppState:
    """Process-wide mutable state shared by all request handlers."""

    model: Optional[object] = None                 # DocTR ocr_predictor, set in lifespan()
    dataset: Optional[ImageTextDataset] = None     # lazily loaded on first /evaluate
    dataset_path: Optional[str] = None             # folder the dataset was loaded from
    det_arch: str = DEFAULT_DET_ARCH               # detection architecture name
    reco_arch: str = DEFAULT_RECO_ARCH             # recognition architecture name
    # Track current model config for cache invalidation
    current_config: Optional[dict] = None
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
# Single shared instance used by every endpoint.
state = AppState()
def create_model(
    assume_straight_pages: bool = True,
    straighten_pages: bool = False,
    preserve_aspect_ratio: bool = True,
    symmetric_pad: bool = True,
    disable_page_orientation: bool = False,
    disable_crop_orientation: bool = False,
) -> object:
    """Build an ocr_predictor from the global det/reco architecture settings.

    The orientation toggles are applied only when the installed DocTR
    version exposes those attributes; the model is moved to CUDA when
    the global device is "cuda".
    """
    predictor = ocr_predictor(
        det_arch=state.det_arch,
        reco_arch=state.reco_arch,
        pretrained=True,
        assume_straight_pages=assume_straight_pages,
        straighten_pages=straighten_pages,
        preserve_aspect_ratio=preserve_aspect_ratio,
        symmetric_pad=symmetric_pad,
    )
    # Newer DocTR releases expose these flags as attributes; set when present.
    for attr, value in (
        ("disable_page_orientation", disable_page_orientation),
        ("disable_crop_orientation", disable_crop_orientation),
    ):
        if hasattr(predictor, attr):
            setattr(predictor, attr, value)
    return predictor.cuda() if state.device == "cuda" else predictor
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load DocTR model at startup with default configuration.

    Prints GPU diagnostics, builds the default predictor, records its
    config so /evaluate can detect config changes, then releases the
    model and dataset on shutdown (after the yield).
    """
    gpu_info = get_gpu_info()
    print("=" * 50)
    print("GPU STATUS")
    print("=" * 50)
    print(f" CUDA available: {gpu_info['cuda_available']}")
    print(f" Device: {gpu_info['device']}")
    if gpu_info['cuda_available']:
        print(f" GPU count: {gpu_info['gpu_count']}")
        print(f" GPU name: {gpu_info['gpu_name']}")
        print(f" GPU memory total: {gpu_info['gpu_memory_total']}")
    print("=" * 50)
    print(f"Loading DocTR models...")
    print(f" Detection: {state.det_arch}")
    print(f" Recognition: {state.reco_arch}")
    # Load with default config
    state.model = create_model()
    # Must mirror create_model()'s defaults so the first /evaluate with
    # default flags does not trigger a needless reinitialization.
    state.current_config = {
        "assume_straight_pages": True,
        "straighten_pages": False,
        "preserve_aspect_ratio": True,
        "symmetric_pad": True,
        "disable_page_orientation": False,
        "disable_crop_orientation": False,
    }
    if gpu_info['cuda_available']:
        gpu_after = get_gpu_info()
        print(f" GPU memory after load: {gpu_after.get('gpu_memory_used', 'N/A')}")
    print("Model loaded successfully!")
    yield
    # Shutdown: drop references so memory can be reclaimed.
    state.model = None
    state.dataset = None
app = FastAPI(
title="DocTR Tuning API",
description="REST API for DocTR hyperparameter evaluation",
version="1.0.0",
lifespan=lifespan,
)
class EvaluateRequest(BaseModel):
    """Request schema with all tunable DocTR hyperparameters.

    Default start/end pages (5-10) sample a small slice of the dataset
    for quick tuning iterations; /evaluate_full overrides them.
    """
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
    # Processing flags (require model reinit)
    assume_straight_pages: bool = Field(True, description="Skip rotation handling for straight documents")
    straighten_pages: bool = Field(False, description="Pre-straighten pages before detection")
    preserve_aspect_ratio: bool = Field(True, description="Maintain document proportions during resize")
    symmetric_pad: bool = Field(True, description="Use symmetric padding when preserving aspect ratio")
    # Orientation flags
    disable_page_orientation: bool = Field(False, description="Skip page orientation classification")
    disable_crop_orientation: bool = Field(False, description="Skip crop orientation detection")
    # Output grouping
    resolve_lines: bool = Field(True, description="Group words into lines")
    resolve_blocks: bool = Field(False, description="Group lines into blocks")
    paragraph_break: float = Field(0.035, ge=0.0, le=1.0, description="Minimum space ratio separating paragraphs")
    # Page range
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output.

    Field names are uppercase on purpose to match the CLI tool's keys.
    """
    CER: float                          # mean character error rate over evaluated pages
    WER: float                          # mean word error rate over evaluated pages
    TIME: float                         # total wall-clock seconds for the whole run
    PAGES: int                          # number of pages actually evaluated
    TIME_PER_PAGE: float                # mean OCR seconds per page (excludes metric calc)
    model_reinitialized: bool = False   # True when config change forced a model rebuild
class HealthResponse(BaseModel):
    """Readiness/health payload; GPU fields mirror get_gpu_info()."""
    status: str                           # "ok" once the model is loaded, else "initializing"
    model_loaded: bool
    dataset_loaded: bool
    dataset_size: Optional[int] = None    # None until a dataset has been loaded
    det_arch: Optional[str] = None
    reco_arch: Optional[str] = None
    cuda_available: Optional[bool] = None
    device: Optional[str] = None
    gpu_name: Optional[str] = None
    gpu_memory_used: Optional[str] = None
    gpu_memory_total: Optional[str] = None
def doctr_result_to_text(result, resolve_lines: bool = True, resolve_blocks: bool = False) -> str:
    """Convert a DocTR result into whitespace-normalized plain text.

    Walks Document -> pages -> blocks -> lines -> words and joins every
    word with single spaces.

    Note: ``resolve_lines`` and ``resolve_blocks`` are accepted for API
    parity with EvaluateRequest but do not change the returned string:
    the output is collapsed to single-space-separated text. (The previous
    implementation appended an empty-string "paragraph separator" when
    resolve_blocks was set, but it was filtered out before joining and
    would have been collapsed by the whitespace normalization anyway —
    dead code, removed here.)
    """
    parts = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                line_text = " ".join(w.value for w in line.words)
                if line_text:
                    parts.append(line_text)
    # Collapse all runs of whitespace to single spaces.
    return re.sub(r"\s+", " ", " ".join(parts)).strip()
def evaluate_text(reference: str, prediction: str) -> dict:
    """Return jiwer word- and character-error rates under "WER"/"CER" keys."""
    metrics = {}
    metrics["WER"] = wer(reference, prediction)
    metrics["CER"] = cer(reference, prediction)
    return metrics
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready.

    Reports model/dataset load status plus a live GPU snapshot; used by
    the container HEALTHCHECK and by external tuning drivers.
    """
    gpu_info = get_gpu_info()
    return HealthResponse(
        status="ok" if state.model is not None else "initializing",
        model_loaded=state.model is not None,
        dataset_loaded=state.dataset is not None,
        dataset_size=len(state.dataset) if state.dataset else None,
        det_arch=state.det_arch,
        reco_arch=state.reco_arch,
        cuda_available=gpu_info.get("cuda_available"),
        device=gpu_info.get("device"),
        gpu_name=gpu_info.get("gpu_name"),
        gpu_memory_used=gpu_info.get("gpu_memory_used"),
        gpu_memory_total=gpu_info.get("gpu_memory_total"),
    )
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
    """
    Evaluate OCR with given hyperparameters.
    Returns CER, WER, and timing metrics.
    Note: Model will be reinitialized if processing flags change.
    """
    if state.model is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")
    # Load or reload dataset if path changed
    if state.dataset is None or state.dataset_path != request.pdf_folder:
        if not os.path.isdir(request.pdf_folder):
            raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
        state.dataset = ImageTextDataset(request.pdf_folder)
        state.dataset_path = request.pdf_folder
    if len(state.dataset) == 0:
        raise HTTPException(status_code=400, detail="Dataset is empty")
    # Check if model needs to be reinitialized (these flags are baked into
    # the predictor at construction time, so a rebuild is required).
    new_config = {
        "assume_straight_pages": request.assume_straight_pages,
        "straighten_pages": request.straighten_pages,
        "preserve_aspect_ratio": request.preserve_aspect_ratio,
        "symmetric_pad": request.symmetric_pad,
        "disable_page_orientation": request.disable_page_orientation,
        "disable_crop_orientation": request.disable_crop_orientation,
    }
    model_reinitialized = False
    if state.current_config != new_config:
        print(f"Model config changed, reinitializing...")
        state.model = create_model(**new_config)
        state.current_config = new_config
        model_reinitialized = True
    # Validate page range (end is clamped to the dataset size)
    start = request.start_page
    end = min(request.end_page, len(state.dataset))
    if start >= end:
        raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()
    for idx in range(start, end):
        img, ref = state.dataset[idx]
        arr = np.array(img)
        tp0 = time.time()
        # DocTR expects a list of images
        result = state.model([arr])
        pred = doctr_result_to_text(
            result,
            resolve_lines=request.resolve_lines,
            resolve_blocks=request.resolve_blocks,
        )
        # Per-page timing covers OCR + text assembly, not metric computation.
        time_per_page_list.append(float(time.time() - tp0))
        m = evaluate_text(ref, pred)
        cer_list.append(m["CER"])
        wer_list.append(m["WER"])
    # Empty metric lists can only happen defensively; 1.0 = worst-case error.
    return EvaluateResponse(
        CER=float(np.mean(cer_list)) if cer_list else 1.0,
        WER=float(np.mean(wer_list)) if wer_list else 1.0,
        TIME=float(time.time() - t0),
        PAGES=len(cer_list),
        TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
        model_reinitialized=model_reinitialized,
    )
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
    """Evaluate on ALL pages (ignores start_page/end_page)."""
    # 9999 is a sentinel; evaluate() clamps end_page to the dataset length.
    request.start_page, request.end_page = 0, 9999
    return evaluate(request)
if __name__ == "__main__":
    # Allow running the service directly without the uvicorn CLI.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,8 @@
python-doctr[torch]>=0.8.0
fastapi>=0.104.0
uvicorn>=0.24.0
pydantic>=2.0.0
jiwer>=3.0.0
numpy>=1.24.0
pillow>=10.0.0
torch>=2.0.0

View File

@@ -0,0 +1,48 @@
# Dockerfile - EasyOCR Tuning REST API
#
# Build:
# docker build -t easyocr-api:latest .
#
# Run:
# docker run -p 8002:8000 -v ./dataset:/app/dataset easyocr-api:latest
FROM python:3.11-slim
LABEL maintainer="Sergio Jimenez"
LABEL description="EasyOCR Tuning REST API"
WORKDIR /app
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV EASYOCR_LANGUAGES=es,en
# Install system dependencies for OpenCV and image processing
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender1 \
&& rm -rf /var/lib/apt/lists/*
# Copy and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY easyocr_tuning_rest.py .
COPY dataset_manager.py .
# Volume for dataset and model cache
VOLUME ["/app/dataset", "/root/.EasyOCR"]
# Expose API port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the API server
CMD ["uvicorn", "easyocr_tuning_rest:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,45 @@
# Imports
import os
from PIL import Image
class ImageTextDataset:
    """Pairs page images with their ground-truth transcription files.

    Expects ``root`` to contain subfolders, each holding an ``img/`` and a
    ``txt/`` directory; an image ``name.png`` is matched with ``txt/name.txt``.
    Images without a matching text file are skipped silently.
    """

    _IMAGE_EXTS = (".png", ".jpg", ".jpeg")

    def __init__(self, root):
        # (image_path, text_path) pairs in deterministic (sorted) order.
        self.samples = []
        for folder in sorted(os.listdir(root)):
            img_dir = os.path.join(root, folder, "img")
            txt_dir = os.path.join(root, folder, "txt")
            # Skip subfolders that do not follow the img/ + txt/ layout.
            if not (os.path.isdir(img_dir) and os.path.isdir(txt_dir)):
                continue
            for fname in sorted(os.listdir(img_dir)):
                if not fname.lower().endswith(self._IMAGE_EXTS):
                    continue
                stem, _ = os.path.splitext(fname)
                txt_path = os.path.join(txt_dir, stem + ".txt")
                if os.path.exists(txt_path):
                    self.samples.append((os.path.join(img_dir, fname), txt_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path = self.samples[idx]
        # Ground truth is read as UTF-8 text; image normalized to RGB.
        with open(txt_path, "r", encoding="utf-8") as handle:
            text = handle.read()
        return Image.open(img_path).convert("RGB"), text

View File

@@ -0,0 +1,320 @@
# easyocr_tuning_rest.py
# FastAPI REST service for EasyOCR hyperparameter evaluation
# Usage: uvicorn easyocr_tuning_rest:app --host 0.0.0.0 --port 8000
import os
import re
import time
from typing import Optional, List
from contextlib import asynccontextmanager
import numpy as np
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import easyocr
from jiwer import wer, cer
from dataset_manager import ImageTextDataset
def get_gpu_info() -> dict:
    """Report CUDA availability and basic GPU memory stats via PyTorch.

    Always returns the same set of keys; on CUDA probe failure an extra
    ``gpu_error`` key is added with the exception text.
    """
    cuda = torch.cuda.is_available()
    info = {
        "cuda_available": cuda,
        "device": "cuda" if cuda else "cpu",
        "gpu_count": 0,
        "gpu_name": None,
        "gpu_memory_total": None,
        "gpu_memory_used": None,
    }
    if not cuda:
        return info
    try:
        count = torch.cuda.device_count()
        info["gpu_count"] = count
        if count > 0:
            props = torch.cuda.get_device_properties(0)
            info["gpu_name"] = torch.cuda.get_device_name(0)
            info["gpu_memory_total"] = f"{props.total_memory / (1024**3):.2f} GB"
            info["gpu_memory_used"] = f"{torch.cuda.memory_allocated(0) / (1024**3):.2f} GB"
    except Exception as e:
        # Keep the endpoint alive even if the CUDA runtime misbehaves.
        info["gpu_error"] = str(e)
    return info
# Model configuration via environment variables
DEFAULT_LANGUAGES = os.environ.get("EASYOCR_LANGUAGES", "es,en").split(",")
# Global state for model and dataset
class AppState:
    """Process-wide mutable state shared by all request handlers."""

    reader: Optional[easyocr.Reader] = None        # set once in lifespan()
    dataset: Optional[ImageTextDataset] = None     # lazily loaded on first /evaluate
    dataset_path: Optional[str] = None             # folder the dataset was loaded from
    languages: List[str] = DEFAULT_LANGUAGES       # languages passed to easyocr.Reader
# Single shared instance used by every endpoint.
state = AppState()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load EasyOCR model at startup.

    Prints GPU diagnostics, constructs the Reader (downloading weights on
    first run), then releases the reader and dataset on shutdown.
    """
    gpu_info = get_gpu_info()
    print("=" * 50)
    print("GPU STATUS")
    print("=" * 50)
    print(f" CUDA available: {gpu_info['cuda_available']}")
    print(f" Device: {gpu_info['device']}")
    if gpu_info['cuda_available']:
        print(f" GPU count: {gpu_info['gpu_count']}")
        print(f" GPU name: {gpu_info['gpu_name']}")
        print(f" GPU memory total: {gpu_info['gpu_memory_total']}")
    print("=" * 50)
    print(f"Loading EasyOCR models...")
    print(f" Languages: {state.languages}")
    # gpu=True only when CUDA is actually available; EasyOCR falls back to CPU otherwise.
    state.reader = easyocr.Reader(
        state.languages,
        gpu=gpu_info['cuda_available'],
    )
    if gpu_info['cuda_available']:
        gpu_after = get_gpu_info()
        print(f" GPU memory after load: {gpu_after.get('gpu_memory_used', 'N/A')}")
    print("Model loaded successfully!")
    yield
    # Shutdown: drop references so memory can be reclaimed.
    state.reader = None
    state.dataset = None
app = FastAPI(
title="EasyOCR Tuning API",
description="REST API for EasyOCR hyperparameter evaluation",
version="1.0.0",
lifespan=lifespan,
)
class EvaluateRequest(BaseModel):
    """Request schema with all tunable EasyOCR hyperparameters.

    Default start/end pages (5-10) sample a small slice of the dataset
    for quick tuning iterations; /evaluate_full overrides them.
    """
    pdf_folder: str = Field("/app/dataset", description="Path to dataset folder")
    # Detection thresholds (CRAFT algorithm)
    text_threshold: float = Field(0.7, ge=0.0, le=1.0, description="Text confidence threshold")
    low_text: float = Field(0.4, ge=0.0, le=1.0, description="Text lower-bound score")
    link_threshold: float = Field(0.4, ge=0.0, le=1.0, description="Link confidence threshold")
    # Bounding box merging
    slope_ths: float = Field(0.1, ge=0.0, le=1.0, description="Maximum slope for box merging")
    ycenter_ths: float = Field(0.5, ge=0.0, le=2.0, description="Maximum vertical shift for merging")
    height_ths: float = Field(0.5, ge=0.0, le=2.0, description="Maximum height variance for merging")
    width_ths: float = Field(0.5, ge=0.0, le=2.0, description="Maximum horizontal distance for merging")
    add_margin: float = Field(0.1, ge=0.0, le=1.0, description="Bounding box extension margin")
    # Contrast handling
    contrast_ths: float = Field(0.1, ge=0.0, le=1.0, description="Contrast threshold for dual-pass")
    adjust_contrast: float = Field(0.5, ge=0.0, le=1.0, description="Target contrast adjustment level")
    # Decoder options
    decoder: str = Field("greedy", description="Decoder type: greedy, beamsearch, wordbeamsearch")
    # camelCase kept deliberately: it mirrors EasyOCR's readtext() keyword argument.
    beamWidth: int = Field(5, ge=1, le=20, description="Beam width for beam search decoders")
    # Other
    min_size: int = Field(10, ge=1, description="Minimum text box size in pixels")
    rotation_info: Optional[List[int]] = Field(None, description="Rotation angles to try: [90, 180, 270]")
    # Page range
    start_page: int = Field(5, ge=0, description="Start page index (inclusive)")
    end_page: int = Field(10, ge=1, description="End page index (exclusive)")
class EvaluateResponse(BaseModel):
    """Response schema matching CLI output.

    Field names are uppercase on purpose to match the CLI tool's keys.
    """
    CER: float            # mean character error rate over evaluated pages
    WER: float            # mean word error rate over evaluated pages
    TIME: float           # total wall-clock seconds for the whole run
    PAGES: int            # number of pages actually evaluated
    TIME_PER_PAGE: float  # mean OCR seconds per page (excludes metric calc)
class HealthResponse(BaseModel):
    """Readiness/health payload; GPU fields mirror get_gpu_info()."""
    status: str                           # "ok" once the reader is loaded, else "initializing"
    model_loaded: bool
    dataset_loaded: bool
    dataset_size: Optional[int] = None    # None until a dataset has been loaded
    languages: Optional[List[str]] = None
    cuda_available: Optional[bool] = None
    device: Optional[str] = None
    gpu_name: Optional[str] = None
    gpu_memory_used: Optional[str] = None
    gpu_memory_total: Optional[str] = None
def assemble_easyocr_result(result: list) -> str:
    """
    Reconstruct reading-order text from EasyOCR detections.

    EasyOCR returns ``[(bbox, text, confidence), ...]`` where bbox is
    ``[[x1,y1],[x2,y2],[x3,y3],[x4,y4]]``. Detections are sorted
    top-to-bottom, grouped into lines with an adaptive vertical
    tolerance, each line read left-to-right, and the result collapsed
    to single-space-separated text.
    """
    if not result:
        return ""

    def y_center(det):
        box = det[0]
        # Average of top-left and bottom-right y coordinates.
        return (box[0][1] + box[2][1]) / 2

    def x_left(det):
        return det[0][0][0]

    ordered = sorted(result, key=lambda d: (y_center(d), x_left(d)))
    if not ordered:
        return ""

    # Line tolerance adapts to the median box height (floor of 8 px).
    box_heights = [abs(d[0][2][1] - d[0][0][1]) for d in ordered]
    median_height = float(np.median(box_heights)) if box_heights else 20.0
    tolerance = max(8.0, 0.6 * median_height)

    assembled, current, prev_y = [], [], None
    for det in ordered:
        yc = y_center(det)
        if prev_y is None or abs(yc - prev_y) <= tolerance:
            current.append((x_left(det), det[1]))
        else:
            # Flush the finished line in left-to-right order.
            current.sort(key=lambda pair: pair[0])
            assembled.append(" ".join(word for _, word in current))
            current = [(x_left(det), det[1])]
        prev_y = yc
    if current:
        current.sort(key=lambda pair: pair[0])
        assembled.append(" ".join(word for _, word in current))

    return re.sub(r"\s+", " ", " ".join(assembled)).strip()
def evaluate_text(reference: str, prediction: str) -> dict:
    """Return jiwer word- and character-error rates under "WER"/"CER" keys."""
    metrics = {}
    metrics["WER"] = wer(reference, prediction)
    metrics["CER"] = cer(reference, prediction)
    return metrics
@app.get("/health", response_model=HealthResponse)
def health_check():
    """Check if the service is ready.

    Reports reader/dataset load status plus a live GPU snapshot; used by
    the container HEALTHCHECK and by external tuning drivers.
    """
    gpu_info = get_gpu_info()
    return HealthResponse(
        status="ok" if state.reader is not None else "initializing",
        model_loaded=state.reader is not None,
        dataset_loaded=state.dataset is not None,
        dataset_size=len(state.dataset) if state.dataset else None,
        languages=state.languages,
        cuda_available=gpu_info.get("cuda_available"),
        device=gpu_info.get("device"),
        gpu_name=gpu_info.get("gpu_name"),
        gpu_memory_used=gpu_info.get("gpu_memory_used"),
        gpu_memory_total=gpu_info.get("gpu_memory_total"),
    )
@app.post("/evaluate", response_model=EvaluateResponse)
def evaluate(request: EvaluateRequest):
    """
    Evaluate OCR with given hyperparameters.
    Returns CER, WER, and timing metrics.
    """
    if state.reader is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")
    # Validate decoder (readtext would fail later with a less clear error)
    if request.decoder not in ["greedy", "beamsearch", "wordbeamsearch"]:
        raise HTTPException(status_code=400, detail=f"Invalid decoder: {request.decoder}")
    # Load or reload dataset if path changed
    if state.dataset is None or state.dataset_path != request.pdf_folder:
        if not os.path.isdir(request.pdf_folder):
            raise HTTPException(status_code=400, detail=f"Dataset folder not found: {request.pdf_folder}")
        state.dataset = ImageTextDataset(request.pdf_folder)
        state.dataset_path = request.pdf_folder
    if len(state.dataset) == 0:
        raise HTTPException(status_code=400, detail="Dataset is empty")
    # Validate page range (end is clamped to the dataset size)
    start = request.start_page
    end = min(request.end_page, len(state.dataset))
    if start >= end:
        raise HTTPException(status_code=400, detail=f"Invalid page range: {start}-{end}")
    cer_list, wer_list = [], []
    time_per_page_list = []
    t0 = time.time()
    for idx in range(start, end):
        img, ref = state.dataset[idx]
        arr = np.array(img)
        tp0 = time.time()
        # All tunable hyperparameters are forwarded straight to readtext().
        result = state.reader.readtext(
            arr,
            # Detection thresholds
            text_threshold=request.text_threshold,
            low_text=request.low_text,
            link_threshold=request.link_threshold,
            # Bounding box merging
            slope_ths=request.slope_ths,
            ycenter_ths=request.ycenter_ths,
            height_ths=request.height_ths,
            width_ths=request.width_ths,
            add_margin=request.add_margin,
            # Contrast
            contrast_ths=request.contrast_ths,
            adjust_contrast=request.adjust_contrast,
            # Decoder
            decoder=request.decoder,
            beamWidth=request.beamWidth,
            # Other
            min_size=request.min_size,
            rotation_info=request.rotation_info,
        )
        pred = assemble_easyocr_result(result)
        # Per-page timing covers OCR + text assembly, not metric computation.
        time_per_page_list.append(float(time.time() - tp0))
        m = evaluate_text(ref, pred)
        cer_list.append(m["CER"])
        wer_list.append(m["WER"])
    # Empty metric lists can only happen defensively; 1.0 = worst-case error.
    return EvaluateResponse(
        CER=float(np.mean(cer_list)) if cer_list else 1.0,
        WER=float(np.mean(wer_list)) if wer_list else 1.0,
        TIME=float(time.time() - t0),
        PAGES=len(cer_list),
        TIME_PER_PAGE=float(np.mean(time_per_page_list)) if time_per_page_list else 0.0,
    )
@app.post("/evaluate_full", response_model=EvaluateResponse)
def evaluate_full(request: EvaluateRequest):
    """Evaluate on ALL pages (ignores start_page/end_page)."""
    # 9999 is a sentinel; evaluate() clamps end_page to the dataset length.
    request.start_page, request.end_page = 0, 9999
    return evaluate(request)
if __name__ == "__main__":
    # Allow running the service directly without the uvicorn CLI.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,8 @@
easyocr>=1.7.0
fastapi>=0.104.0
uvicorn>=0.24.0
pydantic>=2.0.0
jiwer>=3.0.0
numpy>=1.24.0
pillow>=10.0.0
torch>=2.0.0

View File

@@ -1,207 +0,0 @@
# benchmark.py - Compare CPU vs GPU performance for PaddleOCR REST API
# Usage: python benchmark.py
import requests
import time
import json
import sys
from datetime import datetime
# Containers under test; ports must match the docker-compose mappings.
CONTAINERS = {
    "GPU": {"url": "http://localhost:8000", "port": 8000},
    "CPU": {"url": "http://localhost:8002", "port": 8002},
}
# Path as seen INSIDE the containers (mounted volume), not on the host.
DATASET_PATH = "/app/dataset"
# Test configurations
TEST_CONFIGS = [
    {
        "name": "Baseline",
        "config": {
            "pdf_folder": DATASET_PATH,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "textline_orientation": False,
            "text_det_thresh": 0.0,
            "text_det_box_thresh": 0.0,
            "text_det_unclip_ratio": 1.5,
            "text_rec_score_thresh": 0.0,
            "start_page": 5,
            "end_page": 10,
        }
    },
    {
        # Threshold values presumably found by a prior hyperparameter search — confirm source.
        "name": "Optimized",
        "config": {
            "pdf_folder": DATASET_PATH,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "textline_orientation": True,
            "text_det_thresh": 0.4690,
            "text_det_box_thresh": 0.5412,
            "text_det_unclip_ratio": 0.0,
            "text_rec_score_thresh": 0.6350,
            "start_page": 5,
            "end_page": 10,
        }
    },
]
def check_health(url: str, timeout: int = 10) -> bool:
    """Check if API is healthy.

    Returns True only when GET {url}/health responds 200 AND reports
    model_loaded=True. Any network error is logged and treated as
    unhealthy (falls through to the final return False).
    """
    try:
        resp = requests.get(f"{url}/health", timeout=timeout)
        if resp.status_code == 200:
            data = resp.json()
            return data.get("model_loaded", False)
    except Exception as e:
        print(f" Health check failed: {e}")
    return False
def run_benchmark(url: str, config: dict, warmup: bool = False) -> dict:
    """POST the config to {url}/evaluate and return the parsed JSON result.

    Adds "total_request_time" (wall-clock seconds including network
    overhead) to the returned dict. Raises on HTTP errors.
    NOTE(review): `warmup` is currently unused inside this function —
    callers pass it only to document intent; confirm before removing.
    """
    eval_url = f"{url}/evaluate"
    start = time.time()
    # Generous timeout: a full evaluation run can take several minutes.
    resp = requests.post(eval_url, json=config, timeout=600)
    resp.raise_for_status()
    total_time = time.time() - start
    result = resp.json()
    result["total_request_time"] = total_time
    return result
def main():
    """Benchmark each container with every test config and print a summary.

    Flow: health-check both containers, run a 1-page warmup per container,
    run all TEST_CONFIGS, print per-test and speedup tables, and persist
    everything to benchmark_results.json.
    """
    results = {
        "timestamp": datetime.now().isoformat(),
        "containers": {},
    }
    print("=" * 60)
    print("PaddleOCR CPU vs GPU Benchmark")
    print("=" * 60)
    print()
    # Check container health
    print("Checking container health...")
    for name, info in CONTAINERS.items():
        healthy = check_health(info["url"])
        status = "✓ Ready" if healthy else "✗ Not Ready"
        print(f" {name} ({info['url']}): {status}")
        if not healthy:
            print(f" Skipping {name} - container not available")
            continue
    print()
    # Run benchmarks for each container
    for container_name, container_info in CONTAINERS.items():
        url = container_info["url"]
        # Re-checked here because the loop above only prints status.
        if not check_health(url):
            print(f"Skipping {container_name} - not healthy")
            continue
        print("=" * 60)
        print(f"Testing: {container_name} Container")
        print(f"URL: {url}")
        print("=" * 60)
        container_results = {
            "url": url,
            "tests": {},
        }
        # Warmup run (first run often slower due to model loading/caching)
        print("\n Warmup run...")
        try:
            warmup_config = TEST_CONFIGS[0]["config"].copy()
            warmup_config["start_page"] = 5
            warmup_config["end_page"] = 6  # Just 1 page for warmup
            run_benchmark(url, warmup_config, warmup=True)
            print(" Warmup complete.")
        except Exception as e:
            # Warmup failures are non-fatal; real runs still proceed.
            print(f" Warmup failed: {e}")
        # Run each test configuration
        for test in TEST_CONFIGS:
            test_name = test["name"]
            config = test["config"]
            print(f"\n Running: {test_name} Configuration")
            print(f" Pages: {config['start_page']} to {config['end_page']}")
            try:
                result = run_benchmark(url, config)
                container_results["tests"][test_name] = {
                    "CER": result["CER"],
                    "WER": result["WER"],
                    "PAGES": result["PAGES"],
                    "TIME_PER_PAGE": result["TIME_PER_PAGE"],
                    "TOTAL_TIME": result["total_request_time"],
                }
                print(f" CER: {result['CER']*100:.2f}%")
                print(f" WER: {result['WER']*100:.2f}%")
                print(f" Pages: {result['PAGES']}")
                print(f" Time/page: {result['TIME_PER_PAGE']:.3f}s")
                print(f" Total time: {result['total_request_time']:.2f}s")
            except Exception as e:
                # Record the failure so the summary/speedup sections can skip it.
                print(f" ERROR: {e}")
                container_results["tests"][test_name] = {"error": str(e)}
        results["containers"][container_name] = container_results
    # Print summary
    print("\n")
    print("=" * 60)
    print("BENCHMARK SUMMARY")
    print("=" * 60)
    # Table header
    print(f"\n{'Test':<12} {'Container':<8} {'CER %':<10} {'WER %':<10} {'Time/Page':<12} {'Total (s)':<10}")
    print("-" * 62)
    for test in TEST_CONFIGS:
        test_name = test["name"]
        for container_name in CONTAINERS.keys():
            if container_name in results["containers"]:
                tests = results["containers"][container_name].get("tests", {})
                if test_name in tests and "error" not in tests[test_name]:
                    t = tests[test_name]
                    print(f"{test_name:<12} {container_name:<8} {t['CER']*100:<10.2f} {t['WER']*100:<10.2f} {t['TIME_PER_PAGE']:<12.3f} {t['TOTAL_TIME']:<10.2f}")
    # Speed comparison
    print("\n" + "=" * 60)
    print("SPEED COMPARISON")
    print("=" * 60)
    for test in TEST_CONFIGS:
        test_name = test["name"]
        # Chained .get() calls yield {} when either container failed or was skipped.
        gpu_data = results["containers"].get("GPU", {}).get("tests", {}).get(test_name, {})
        cpu_data = results["containers"].get("CPU", {}).get("tests", {}).get(test_name, {})
        if gpu_data and cpu_data and "error" not in gpu_data and "error" not in cpu_data:
            speedup = cpu_data["TIME_PER_PAGE"] / gpu_data["TIME_PER_PAGE"]
            print(f"\n{test_name} Configuration:")
            print(f" GPU: {gpu_data['TIME_PER_PAGE']:.3f}s per page")
            print(f" CPU: {cpu_data['TIME_PER_PAGE']:.3f}s per page")
            print(f" GPU is {speedup:.2f}x faster than CPU")
    # Save results to JSON
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n\nResults saved to: {output_file}")
    return results
if __name__ == "__main__":
    # Run the benchmark when executed as a script.
    main()

View File

@@ -3,7 +3,7 @@
# CPU: docker compose up ocr-cpu
# GPU: docker compose up ocr-gpu
# Test: docker compose run --rm test
# Build: CUDA_ARCH=90 docker compose --profile build run --rm build-paddle
# Build: CUDA_ARCH=120 docker compose --profile build run --rm build-paddle
#
# Auto-detect CUDA arch before building:
# export CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
@@ -12,13 +12,13 @@
services:
# PaddlePaddle GPU wheel builder (ARM64 only, one-time build)
# Creates ./wheels/paddlepaddle_gpu-*.whl for ARM64 GPU support
# CUDA_ARCH env var controls target GPU architecture (default: 90 for Hopper)
# CUDA_ARCH env var controls target GPU architecture (default: 120 for Blackwell base)
build-paddle:
build:
context: .
dockerfile: Dockerfile.build-paddle
args:
CUDA_ARCH: ${CUDA_ARCH:-90}
CUDA_ARCH: ${CUDA_ARCH:-120}
volumes:
- ./wheels:/wheels
profiles:

View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Debug script for GPU OCR detection issues.
This script tests the raw inference output from PaddlePaddle detection models
to diagnose why detection might fail on certain GPU architectures (e.g., Blackwell/sm_121).
Usage:
docker exec paddle-ocr-gpu python /app/debug_gpu_detection.py [image_path]
Expected behavior:
- Working GPU: Output stats should show min close to 0, max close to 1, mean ~0.1-0.5
- Broken GPU: Output stats show constant values (e.g., min=max=mean=0.00001)
"""
import os
import sys
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
import numpy as np
import paddle
from PIL import Image
def check_gpu_status():
    """Print CUDA availability and, if present, the first GPU's properties."""
    banner = "=" * 60
    print(banner)
    print("GPU STATUS")
    print(banner)
    print(f"Device: {paddle.device.get_device()}")
    cuda_compiled = paddle.device.is_compiled_with_cuda()
    print(f"CUDA compiled: {cuda_compiled}")
    if cuda_compiled:
        n_gpus = paddle.device.cuda.device_count()
        print(f"GPU count: {n_gpus}")
        if n_gpus > 0:
            # Report hardware details of the first visible device only.
            props = paddle.device.cuda.get_device_properties(0)
            print(f"GPU name: {props.name}")
            print(f"Compute capability: {props.major}.{props.minor}")
            print(f"Total memory: {props.total_memory / (1024**3):.2f} GB")
    print()
def test_basic_ops():
    """Smoke-test tensor creation, Conv2D, and softmax on the default device."""
    banner = "=" * 60
    print(banner)
    print("BASIC GPU OPERATIONS")
    print(banner)
    # Plain tensor allocation — confirms which device tensors land on.
    sample = paddle.randn([2, 3])
    print(f"Tensor place: {sample.place}")
    # A small convolution exercises the cuDNN/conv kernels.
    image_batch = paddle.randn([1, 3, 64, 64])
    conv_layer = paddle.nn.Conv2D(3, 16, 3, padding=1)
    features = conv_layer(image_batch)
    print(f"Conv2d output shape: {features.shape}, place: {features.place}")
    # Softmax over the channel axis exercises elementwise/reduction kernels.
    probs = paddle.nn.functional.softmax(features, axis=1)
    print(f"Softmax output shape: {probs.shape}")
    print("Basic operations: OK")
    print()
def test_detection_model(image_path: str):
    """Run the raw PP-OCRv4 detection model on one image and analyze the
    output tensor's statistics to diagnose broken GPU inference.

    A healthy run shows min near 0, max near 1; a constant output (min ==
    max) indicates the GPU kernels produced garbage (e.g. compute
    capability mismatch).

    Args:
        image_path: Path to an input image readable by PIL.
    """
    print("=" * 60)
    print("DETECTION MODEL TEST")
    print("=" * 60)
    from paddle.inference import Config, create_predictor
    model_dir = '/root/.paddlex/official_models/PP-OCRv4_mobile_det'
    inference_file = f'{model_dir}/inference.json'
    params_file = f'{model_dir}/inference.pdiparams'
    if not os.path.exists(inference_file):
        print(f"Model not found at {model_dir}")
        print("Run PaddleOCR once to download models first.")
        return
    # Create the inference config; enable_use_gpu(1024, 0) = 1024 MB initial
    # GPU memory pool on device 0.
    config = Config()
    config.set_prog_file(inference_file)
    config.set_params_file(params_file)
    config.enable_use_gpu(1024, 0)
    print("Creating predictor...")
    predictor = create_predictor(config)
    # Get input/output tensor names declared by the exported model.
    input_names = predictor.get_input_names()
    output_names = predictor.get_output_names()
    print(f"Input names: {input_names}")
    print(f"Output names: {output_names}")
    # Load and preprocess image.
    # BUGFIX: force 3-channel RGB. A grayscale PNG decodes to a 2-D array,
    # which makes transpose(2, 0, 1) raise, and an RGBA PNG would feed
    # 4 channels into a model expecting 3.
    img = Image.open(image_path).convert('RGB')
    img = img.resize((640, 640))
    arr = np.array(img).astype('float32')
    arr = arr / 255.0
    arr = arr.transpose(2, 0, 1)[np.newaxis, ...]  # HWC -> NCHW with batch dim
    print(f"Input tensor shape: {arr.shape}")
    # Copy the host array into the model's input tensor.
    input_handle = predictor.get_input_handle(input_names[0])
    input_handle.reshape(arr.shape)
    input_handle.copy_from_cpu(arr)
    # Run prediction
    print("Running inference...")
    predictor.run()
    # Fetch the probability map back to the host for analysis.
    output_handle = predictor.get_output_handle(output_names[0])
    output = output_handle.copy_to_cpu()
    print()
    print("OUTPUT ANALYSIS:")
    print(f"  Shape: {output.shape}")
    print(f"  Min: {output.min():.6f}")
    print(f"  Max: {output.max():.6f}")
    print(f"  Mean: {output.mean():.6f}")
    print(f"  Std: {output.std():.6f}")
    print(f"  Has NaN: {np.isnan(output).any()}")
    print(f"  Has Inf: {np.isinf(output).any()}")
    # Diagnosis: classify the output stats into known failure modes.
    print()
    print("DIAGNOSIS:")
    if output.min() == output.max():
        print("  PROBLEM: Output is constant - model inference is broken!")
        print("  This typically indicates GPU compute capability mismatch.")
        print("  GB10 (sm_121) may need CUDA 13.0+ for native support.")
    elif output.max() < 0.01:
        print("  PROBLEM: Output values too low - detection will find nothing.")
    elif np.isnan(output).any() or np.isinf(output).any():
        print("  PROBLEM: Output contains NaN/Inf - numerical instability.")
    else:
        print("  OK: Output values look reasonable.")
        print(f"  Detection threshold typically 0.3-0.6, max output is {output.max():.3f}")
def test_paddleocr_output(image_path: str):
    """Run the full PaddleOCR pipeline on one image and report what it found.

    Args:
        image_path: Path to an input image readable by PIL.
    """
    print()
    print("=" * 60)
    print("PADDLEOCR PIPELINE TEST")
    print("=" * 60)
    from paddleocr import PaddleOCR
    # Use the same mobile det/rec models as the raw-inference test above.
    ocr = PaddleOCR(
        text_detection_model_name='PP-OCRv4_mobile_det',
        text_recognition_model_name='PP-OCRv4_mobile_rec',
    )
    pixels = np.array(Image.open(image_path))
    predictions = ocr.predict(pixels)
    page_result = predictions[0].json['res']
    dt_polys = page_result.get('dt_polys', [])
    rec_texts = page_result.get('rec_texts', [])
    print(f"Detection polygons: {len(dt_polys)}")
    print(f"Recognition texts: {len(rec_texts)}")
    if not rec_texts:
        print("No text detected!")
    else:
        print(f"Sample texts: {rec_texts[:5]}")
def main():
    """Resolve the test image path, then run all GPU diagnostics in order."""
    # A CLI argument overrides the default dataset page.
    if len(sys.argv) > 1:
        image_path = sys.argv[1]
    else:
        image_path = '/app/dataset/0/img/page_0001.png'
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        print("Usage: python debug_gpu_detection.py [image_path]")
        sys.exit(1)
    print(f"Testing with image: {image_path}")
    print()
    # Cheapest checks first so a hard GPU failure surfaces early.
    check_gpu_status()
    test_basic_ops()
    test_detection_model(image_path)
    test_paddleocr_output(image_path)
# Script entry point: run all GPU OCR diagnostics.
if __name__ == '__main__':
    main()

View File

@@ -56,7 +56,7 @@ def test_evaluate(url: str, config: dict) -> dict:
def main():
parser = argparse.ArgumentParser(description="Test PaddleOCR REST API")
parser.add_argument("--url", default="http://localhost:8000", help="API base URL")
parser.add_argument("--url", default="http://localhost:8001", help="API base URL")
parser.add_argument("--dataset", default="/app/dataset", help="Dataset path (inside container)")
parser.add_argument("--skip-health", action="store_true", help="Skip health check wait")
args = parser.parse_args()