From 04bbbebd5a6d8af6dc2dcf2afb89308ffc769ba5 Mon Sep 17 00:00:00 2001 From: Aaron Roberts Date: Mon, 29 Jun 2026 13:09:01 +0100 Subject: [PATCH] Remove Freeform and Find from UI. Allow Description to be added to Reviewed job --- .env.example | 13 + README.md | 45 ++- backend/database.py | 5 + backend/main.py | 488 +++++++++-------------- backend/requirements.txt | 1 + docker-compose.yml | 9 + frontend/src/App.jsx | 42 +- frontend/src/components/JobsPanel.jsx | 122 ++++-- frontend/src/components/ModeSelector.jsx | 67 +--- frontend/src/components/PDFProcessor.jsx | 5 +- 10 files changed, 394 insertions(+), 403 deletions(-) diff --git a/.env.example b/.env.example index 5f0300d..6dea19f 100644 --- a/.env.example +++ b/.env.example @@ -11,6 +11,19 @@ FRONTEND_PORT=3000 MODEL_NAME=deepseek-ai/DeepSeek-OCR HF_HOME=/models +# OCR model selection +# Register the local DeepSeek-OCR model (set to false for an Ollama-only deployment) +ENABLE_DEEPSEEK_LOCAL=true +# External Ollama host the backend should call (no trailing slash) +OLLAMA_BASE_URL=http://host.docker.internal:11434 +# Comma-separated Ollama vision model tags to surface in the UI. +# Pull these on the Ollama host first, e.g. `ollama pull glm-ocr`. 
+OLLAMA_MODELS=glm-ocr,llama3.2-vision,minicpm-v,qwen2.5vl +# Default model id selected in the UI (deepseek-local or ollama:<model-tag>) +DEFAULT_OCR_MODEL=deepseek-local +# Per-request timeout (seconds) for Ollama calls +OLLAMA_TIMEOUT=300 + # CORS Configuration (comma-separated origins, defaults to http://localhost:3000) CORS_ORIGINS=http://localhost:3000 diff --git a/README.md b/README.md index 9f2205e..c88ae5c 100644 --- a/README.md +++ b/README.md @@ -172,6 +172,13 @@ FRONTEND_PORT=3000 MODEL_NAME=deepseek-ai/DeepSeek-OCR HF_HOME=/models +# OCR model selection (DeepSeek + Ollama) +ENABLE_DEEPSEEK_LOCAL=true # register the local GPU model +OLLAMA_BASE_URL=http://host.docker.internal:11434 # external Ollama host +OLLAMA_MODELS=glm-ocr,llama3.2-vision,minicpm-v,qwen2.5vl +DEFAULT_OCR_MODEL=deepseek-local # deepseek-local or ollama:<model-tag> +OLLAMA_TIMEOUT=300 # per-request timeout (seconds) + # Upload Configuration MAX_UPLOAD_SIZE_MB=100 # Maximum file upload size @@ -186,13 +193,47 @@ CROP_MODE=true # Enable dynamic cropping for large images - `API_HOST`: Backend API host (default: 0.0.0.0) - `API_PORT`: Backend API port (default: 8000) - `FRONTEND_PORT`: Frontend port (default: 3000) -- `MODEL_NAME`: HuggingFace model identifier +- `MODEL_NAME`: HuggingFace model identifier for the local DeepSeek-OCR model - `HF_HOME`: Model cache directory +- `ENABLE_DEEPSEEK_LOCAL`: Register the local DeepSeek-OCR model (set `false` for an Ollama-only deployment with no GPU model loaded) +- `OLLAMA_BASE_URL`: URL of an external Ollama server the backend calls for non-DeepSeek models +- `OLLAMA_MODELS`: Comma-separated Ollama vision model tags to expose in the UI (pull them on the Ollama host first, e.g. 
`ollama pull glm-ocr`) +- `DEFAULT_OCR_MODEL`: Model id selected by default (`deepseek-local` or `ollama:<model-tag>`) +- `OLLAMA_TIMEOUT`: Per-request timeout in seconds for Ollama calls - `MAX_UPLOAD_SIZE_MB`: Maximum file upload size in megabytes - `BASE_SIZE`: Base image processing size (affects memory usage) - `IMAGE_SIZE`: Tile size for dynamic cropping - `CROP_MODE`: Enable/disable dynamic image cropping +### Choosing an OCR Model + +The **Model** selector (next to the Mode selector) chooses which backend runs the OCR: + +- **DeepSeek-OCR (local GPU)** — the default. Loaded lazily on first use. Supports + every mode including grounding/bounding-box modes (Find), plus the Advanced + Settings (base size, crop mode, etc.). +- **Ollama models** — any vision model pulled on your Ollama host and listed in + `OLLAMA_MODELS` (e.g. `glm-ocr`, `llama3.2-vision`). These run remotely on the + Ollama server. They return **plain text only**: bounding boxes are not produced, + so grounding modes (Find) and the DeepSeek-specific Advanced Settings are ignored + / disabled when an Ollama model is selected. + +Setup for Ollama models: + +```bash +# On the machine running Ollama +ollama pull glm-ocr +ollama pull llama3.2-vision + +# Point the backend at it (in .env), then restart +OLLAMA_BASE_URL=http://host.docker.internal:11434 +OLLAMA_MODELS=glm-ocr,llama3.2-vision +``` + +`GET /api/models` returns the registered models and their capabilities; the UI +populates the selector from it. The model used for each job is stored on the job +record (`ocr_model`) and shown in the Browse Jobs view. + ## Tech Stack ### Frontend @@ -377,6 +418,7 @@ For large images, the model uses dynamic cropping: **Parameters:** - `image` (file, required) - Image file to process (up to 100MB) +- `model` (string) - OCR model id from `GET /api/models` (default: registry default). Grounding/Advanced settings apply to DeepSeek only. 
- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform` - `prompt` (string) - Custom prompt for freeform mode - `grounding` (bool) - Enable bounding boxes (auto-enabled for find_ref) @@ -416,6 +458,7 @@ Process PDF documents with OCR and export to various formats. **Parameters:** - `pdf_file` (file, required) - PDF file to process (up to 100MB) +- `model` (string) - OCR model id from `GET /api/models` (default: registry default) - `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform` - `prompt` (string) - Custom prompt for freeform mode - `output_format` (string) - Output format: `markdown` | `html` | `docx` | `json` diff --git a/backend/database.py b/backend/database.py index 5c99e98..d668783 100644 --- a/backend/database.py +++ b/backend/database.py @@ -62,6 +62,11 @@ def init_db(): ALTER TABLE ocr_jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ """) + # Which OCR model produced this job (e.g. "deepseek-local", "ollama:glm-ocr") + cur.execute(""" + ALTER TABLE ocr_jobs + ADD COLUMN IF NOT EXISTS ocr_model TEXT + """) # Trigger function: stamp updated_at on every row update cur.execute(""" CREATE OR REPLACE FUNCTION set_updated_at() diff --git a/backend/main.py b/backend/main.py index d56efb1..c2e6967 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,8 +1,6 @@ import os -import re import uuid import tempfile -import shutil import base64 from typing import List, Dict, Any, Optional from contextlib import asynccontextmanager @@ -12,8 +10,6 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse, FileResponse from pydantic import BaseModel -import torch -from transformers import AutoModel, AutoTokenizer from PIL import Image import uvicorn from decouple import config as env_config @@ -28,20 +24,29 @@ from pdf_utils import ( ) from format_converter import DocumentConverter 
from database import init_db, get_db +from providers import ( + build_registry, + parse_detections, + clean_grounding_text, + ProviderError, + GROUNDING_MODES, +) OCR_IMAGES_DIR = env_config("OCR_IMAGES_DIR", default="/data/ocr_images") # ----------------------------- -# Lifespan context for model loading +# Lifespan context # ----------------------------- -model = None -tokenizer = None +# The model registry holds all available OCR providers. Local models (e.g. +# DeepSeek-OCR) are loaded lazily on first use so an Ollama-only deployment +# starts instantly and never touches the GPU. +registry = None @asynccontextmanager async def lifespan(app: FastAPI): - """Load model on startup, cleanup on shutdown""" - global model, tokenizer - + """Build the model registry on startup.""" + global registry + # Image storage directory os.makedirs(OCR_IMAGES_DIR, exist_ok=True) @@ -51,42 +56,11 @@ async def lifespan(app: FastAPI): except Exception as exc: print(f"Warning: database initialization failed: {exc}") - # Environment setup - os.environ.pop("TRANSFORMERS_CACHE", None) - MODEL_NAME = env_config("MODEL_NAME", default="deepseek-ai/DeepSeek-OCR") - HF_HOME = env_config("HF_HOME", default="/models") - os.makedirs(HF_HOME, exist_ok=True) - - # Load model - print(f"πŸš€ Loading {MODEL_NAME}...") - torch_dtype = torch.bfloat16 - - tokenizer = AutoTokenizer.from_pretrained( - MODEL_NAME, - trust_remote_code=True, - ) - - model = AutoModel.from_pretrained( - MODEL_NAME, - trust_remote_code=True, - use_safetensors=True, - attn_implementation="eager", - torch_dtype=torch_dtype, - ).eval().to("cuda") - - # Pad token setup - try: - if getattr(tokenizer, "pad_token_id", None) is None and getattr(tokenizer, "eos_token_id", None) is not None: - tokenizer.pad_token = tokenizer.eos_token - if getattr(model.config, "pad_token_id", None) is None and getattr(tokenizer, "pad_token_id", None) is not None: - model.config.pad_token_id = tokenizer.pad_token_id - except Exception: - pass - - 
print("βœ… Model loaded and ready!") - + # OCR model registry (providers load their models lazily) + registry = build_registry() + yield - + # Cleanup print("πŸ›‘ Shutting down...") @@ -112,155 +86,6 @@ app.add_middleware( allow_headers=["*"], ) -# ----------------------------- -# Prompt builder -# ----------------------------- -def build_prompt( - mode: str, - user_prompt: str, - grounding: bool, - find_term: Optional[str], - schema: Optional[str], - include_caption: bool, -) -> str: - """Build the prompt based on mode""" - parts: List[str] = [""] - mode_requires_grounding = mode in {"find_ref", "layout_map", "pii_redact"} - if grounding or mode_requires_grounding: - parts.append("<|grounding|>") - - instruction = "" - if mode == "plain_ocr": - instruction = "Free OCR." - elif mode == "markdown": - instruction = "Convert the document to markdown." - elif mode == "tables_csv": - instruction = ( - "Extract every table and output CSV only. " - "Use commas, minimal quoting. If multiple tables, separate with a line containing '---'." - ) - elif mode == "tables_md": - instruction = "Extract every table as GitHub-flavored Markdown tables. Output only the tables." - elif mode == "kv_json": - schema_text = schema.strip() if schema else "{}" - instruction = ( - "Extract key fields and return strict JSON only. " - f"Use this schema (fill the values): {schema_text}" - ) - elif mode == "figure_chart": - instruction = ( - "Parse the figure. First extract any numeric series as a two-column table (x,y). " - "Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary." - ) - elif mode == "find_ref": - key = (find_term or "").strip() or "Total" - instruction = f"Locate <|ref|>{key}<|/ref|> in the image." - elif mode == "layout_map": - instruction = ( - 'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],' - '"box":[x1,y1,x2,y2]}. Do not include any text content.' 
- ) - elif mode == "pii_redact": - instruction = ( - 'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. ' - 'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.' - ) - elif mode == "multilingual": - instruction = "Free OCR. Detect the language automatically and output in the same script." - elif mode == "describe": - instruction = "Describe this image. Focus on visible key elements." - elif mode == "freeform": - instruction = user_prompt.strip() if user_prompt else "OCR this image." - else: - instruction = "OCR this image." - - if include_caption and mode not in {"describe"}: - instruction = instruction + "\nThen add a one-paragraph description of the image." - - parts.append(instruction) - return "\n".join(parts) - -# ----------------------------- -# Grounding parser -# ----------------------------- -# Match a full detection block and capture the coordinates as the entire list expression -# Examples of captured coords (including outer brackets): -# - [[312, 339, 480, 681]] -# - [[504, 700, 625, 910], [771, 570, 996, 996]] -# - [[110, 310, 255, 800], [312, 343, 479, 680], ...] -# Using a greedy bracket capture ensures we include all inner lists up to the last ']' before -DET_BLOCK = re.compile( - r"<\|ref\|>(?P