Remove Freeform and Find from UI. Allow Description to be added to Reviewed job

This commit is contained in:
Aaron Roberts
2026-06-29 13:09:01 +01:00
parent 48f958de6c
commit 04bbbebd5a
10 changed files with 394 additions and 403 deletions

View File

@@ -11,6 +11,19 @@ FRONTEND_PORT=3000
MODEL_NAME=deepseek-ai/DeepSeek-OCR
HF_HOME=/models
# OCR model selection
# Register the local DeepSeek-OCR model (set to false for an Ollama-only deployment)
ENABLE_DEEPSEEK_LOCAL=true
# External Ollama host the backend should call (no trailing slash)
OLLAMA_BASE_URL=http://host.docker.internal:11434
# Comma-separated Ollama vision model tags to surface in the UI.
# Pull these on the Ollama host first, e.g. `ollama pull glm-ocr`.
OLLAMA_MODELS=glm-ocr,llama3.2-vision,minicpm-v,qwen2.5vl
# Default model id selected in the UI (deepseek-local or ollama:<tag>)
DEFAULT_OCR_MODEL=deepseek-local
# Per-request timeout (seconds) for Ollama calls
OLLAMA_TIMEOUT=300
# CORS Configuration (comma-separated origins, defaults to http://localhost:3000)
CORS_ORIGINS=http://localhost:3000

View File

@@ -172,6 +172,13 @@ FRONTEND_PORT=3000
MODEL_NAME=deepseek-ai/DeepSeek-OCR
HF_HOME=/models
# OCR model selection (DeepSeek + Ollama)
ENABLE_DEEPSEEK_LOCAL=true # register the local GPU model
OLLAMA_BASE_URL=http://host.docker.internal:11434 # external Ollama host
OLLAMA_MODELS=glm-ocr,llama3.2-vision,minicpm-v,qwen2.5vl
DEFAULT_OCR_MODEL=deepseek-local # deepseek-local or ollama:<tag>
OLLAMA_TIMEOUT=300 # per-request timeout (seconds)
# Upload Configuration
MAX_UPLOAD_SIZE_MB=100 # Maximum file upload size
@@ -186,13 +193,47 @@ CROP_MODE=true # Enable dynamic cropping for large images
- `API_HOST`: Backend API host (default: 0.0.0.0)
- `API_PORT`: Backend API port (default: 8000)
- `FRONTEND_PORT`: Frontend port (default: 3000)
- `MODEL_NAME`: HuggingFace model identifier
- `MODEL_NAME`: HuggingFace model identifier for the local DeepSeek-OCR model
- `HF_HOME`: Model cache directory
- `ENABLE_DEEPSEEK_LOCAL`: Register the local DeepSeek-OCR model (set `false` for an Ollama-only deployment with no GPU model loaded)
- `OLLAMA_BASE_URL`: URL of an external Ollama server the backend calls for non-DeepSeek models
- `OLLAMA_MODELS`: Comma-separated Ollama vision model tags to expose in the UI (pull them on the Ollama host first, e.g. `ollama pull glm-ocr`)
- `DEFAULT_OCR_MODEL`: Model id selected by default (`deepseek-local` or `ollama:<tag>`)
- `OLLAMA_TIMEOUT`: Per-request timeout in seconds for Ollama calls
- `MAX_UPLOAD_SIZE_MB`: Maximum file upload size in megabytes
- `BASE_SIZE`: Base image processing size (affects memory usage)
- `IMAGE_SIZE`: Tile size for dynamic cropping
- `CROP_MODE`: Enable/disable dynamic image cropping
### Choosing an OCR Model
The **Model** selector (next to the Mode selector) chooses which backend runs the OCR:
- **DeepSeek-OCR (local GPU)** — the default. Loaded lazily on first use. Supports
every mode including grounding/bounding-box modes (Find), plus the Advanced
Settings (base size, crop mode, etc.).
- **Ollama models** — any vision model pulled on your Ollama host and listed in
`OLLAMA_MODELS` (e.g. `glm-ocr`, `llama3.2-vision`). These run remotely on the
Ollama server. They return **plain text only**: bounding boxes are not produced,
so grounding modes (Find) are unavailable — the API rejects them with HTTP 400 for
models without grounding support — and the DeepSeek-specific Advanced Settings are
ignored when an Ollama model is selected.
Setup for Ollama models:
```bash
# On the machine running Ollama
ollama pull glm-ocr
ollama pull llama3.2-vision
# Point the backend at it (in .env), then restart
OLLAMA_BASE_URL=http://host.docker.internal:11434
OLLAMA_MODELS=glm-ocr,llama3.2-vision
```
`GET /api/models` returns the registered models and their capabilities; the UI
populates the selector from it. The model used for each job is stored on the job
record (`ocr_model`) and shown in the Browse Jobs view.
## Tech Stack
### Frontend
@@ -377,6 +418,7 @@ For large images, the model uses dynamic cropping:
**Parameters:**
- `image` (file, required) - Image file to process (up to 100MB)
- `model` (string) - OCR model id from `GET /api/models` (default: registry default). Grounding/Advanced settings apply to DeepSeek only.
- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
- `prompt` (string) - Custom prompt for freeform mode
- `grounding` (bool) - Enable bounding boxes (auto-enabled for find_ref)
@@ -416,6 +458,7 @@ Process PDF documents with OCR and export to various formats.
**Parameters:**
- `pdf_file` (file, required) - PDF file to process (up to 100MB)
- `model` (string) - OCR model id from `GET /api/models` (default: registry default)
- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
- `prompt` (string) - Custom prompt for freeform mode
- `output_format` (string) - Output format: `markdown` | `html` | `docx` | `json`

View File

@@ -62,6 +62,11 @@ def init_db():
ALTER TABLE ocr_jobs
ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ
""")
# Which OCR model produced this job (e.g. "deepseek-local", "ollama:glm-ocr")
cur.execute("""
ALTER TABLE ocr_jobs
ADD COLUMN IF NOT EXISTS ocr_model TEXT
""")
# Trigger function: stamp updated_at on every row update
cur.execute("""
CREATE OR REPLACE FUNCTION set_updated_at()

View File

@@ -1,8 +1,6 @@
import os
import re
import uuid
import tempfile
import shutil
import base64
from typing import List, Dict, Any, Optional
from contextlib import asynccontextmanager
@@ -12,8 +10,6 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
from pydantic import BaseModel
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import uvicorn
from decouple import config as env_config
@@ -28,19 +24,28 @@ from pdf_utils import (
)
from format_converter import DocumentConverter
from database import init_db, get_db
from providers import (
build_registry,
parse_detections,
clean_grounding_text,
ProviderError,
GROUNDING_MODES,
)
OCR_IMAGES_DIR = env_config("OCR_IMAGES_DIR", default="/data/ocr_images")
# -----------------------------
# Lifespan context for model loading
# Lifespan context
# -----------------------------
model = None
tokenizer = None
# The model registry holds all available OCR providers. Local models (e.g.
# DeepSeek-OCR) are loaded lazily on first use so an Ollama-only deployment
# starts instantly and never touches the GPU.
registry = None
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Load model on startup, cleanup on shutdown"""
global model, tokenizer
"""Build the model registry on startup."""
global registry
# Image storage directory
os.makedirs(OCR_IMAGES_DIR, exist_ok=True)
@@ -51,39 +56,8 @@ async def lifespan(app: FastAPI):
except Exception as exc:
print(f"Warning: database initialization failed: {exc}")
# Environment setup
os.environ.pop("TRANSFORMERS_CACHE", None)
MODEL_NAME = env_config("MODEL_NAME", default="deepseek-ai/DeepSeek-OCR")
HF_HOME = env_config("HF_HOME", default="/models")
os.makedirs(HF_HOME, exist_ok=True)
# Load model
print(f"🚀 Loading {MODEL_NAME}...")
torch_dtype = torch.bfloat16
tokenizer = AutoTokenizer.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
)
model = AutoModel.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
use_safetensors=True,
attn_implementation="eager",
torch_dtype=torch_dtype,
).eval().to("cuda")
# Pad token setup
try:
if getattr(tokenizer, "pad_token_id", None) is None and getattr(tokenizer, "eos_token_id", None) is not None:
tokenizer.pad_token = tokenizer.eos_token
if getattr(model.config, "pad_token_id", None) is None and getattr(tokenizer, "pad_token_id", None) is not None:
model.config.pad_token_id = tokenizer.pad_token_id
except Exception:
pass
print("✅ Model loaded and ready!")
# OCR model registry (providers load their models lazily)
registry = build_registry()
yield
@@ -112,155 +86,6 @@ app.add_middleware(
allow_headers=["*"],
)
# -----------------------------
# Prompt builder
# -----------------------------
def build_prompt(
    mode: str,
    user_prompt: str,
    grounding: bool,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """Build the newline-joined prompt sent to the OCR model.

    The prompt always starts with the ``<image>`` token, optionally followed by
    ``<|grounding|>``, then a mode-specific instruction.

    Args:
        mode: OCR mode name; unknown modes fall back to a generic OCR instruction.
        user_prompt: Free-text instruction, used only by ``freeform`` mode.
        grounding: Request grounding tokens even if the mode does not need them.
        find_term: Term to locate in ``find_ref`` mode (defaults to "Total").
        schema: JSON schema text for ``kv_json`` mode (defaults to "{}").
        include_caption: Append a request for a one-paragraph description
            (skipped for ``describe``, which already is a description).

    Returns:
        The prompt parts joined with newlines.
    """
    parts: List[str] = ["<image>"]

    # These modes only make sense with bounding boxes, so grounding is forced on.
    mode_requires_grounding = mode in {"find_ref", "layout_map", "pii_redact"}
    if grounding or mode_requires_grounding:
        parts.append("<|grounding|>")

    instruction = ""
    if mode == "plain_ocr":
        instruction = "Free OCR."
    elif mode == "markdown":
        instruction = "Convert the document to markdown."
    elif mode == "tables_csv":
        instruction = (
            "Extract every table and output CSV only. "
            "Use commas, minimal quoting. If multiple tables, separate with a line containing '---'."
        )
    elif mode == "tables_md":
        instruction = "Extract every table as GitHub-flavored Markdown tables. Output only the tables."
    elif mode == "kv_json":
        # Blank/whitespace-only schema falls back to "{}" (same guard style as find_ref)
        schema_text = (schema or "").strip() or "{}"
        instruction = (
            "Extract key fields and return strict JSON only. "
            f"Use this schema (fill the values): {schema_text}"
        )
    elif mode == "figure_chart":
        instruction = (
            "Parse the figure. First extract any numeric series as a two-column table (x,y). "
            "Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary."
        )
    elif mode == "find_ref":
        key = (find_term or "").strip() or "Total"
        instruction = f"Locate <|ref|>{key}<|/ref|> in the image."
    elif mode == "layout_map":
        instruction = (
            'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],'
            '"box":[x1,y1,x2,y2]}. Do not include any text content.'
        )
    elif mode == "pii_redact":
        instruction = (
            'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. '
            'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.'
        )
    elif mode == "multilingual":
        instruction = "Free OCR. Detect the language automatically and output in the same script."
    elif mode == "describe":
        instruction = "Describe this image. Focus on visible key elements."
    elif mode == "freeform":
        # A whitespace-only prompt must not produce an empty instruction
        instruction = (user_prompt or "").strip() or "OCR this image."
    else:
        instruction = "OCR this image."

    if include_caption and mode not in {"describe"}:
        instruction = instruction + "\nThen add a one-paragraph description of the image."

    parts.append(instruction)
    return "\n".join(parts)
# -----------------------------
# Grounding parser
# -----------------------------
# Match a full detection block and capture the coordinates as the entire list expression.
# Examples of captured coords (including outer brackets):
# - [[312, 339, 480, 681]]
# - [[504, 700, 625, 910], [771, 570, 996, 996]]
# - [[110, 310, 255, 800], [312, 343, 479, 680], ...]
# The bracket capture is lazy but anchored on the closing <|/det|> tag: it still spans
# every inner list of ONE block, yet stops at that block's own closing tag. A greedy
# capture would run on to the last <|/det|> in the text and merge separate detections
# into a single span that cannot be parsed.
DET_BLOCK = re.compile(
    r"<\|ref\|>(?P<label>.*?)<\|/ref\|>\s*<\|det\|>\s*(?P<coords>\[.*?\])\s*<\|/det\|>",
    re.DOTALL,
)


def clean_grounding_text(text: str) -> str:
    """Remove grounding tags from text for display, keeping the labels.

    Each ``<|ref|>label<|/ref|><|det|>[...]<|/det|>`` block is replaced by its
    label, standalone ``<|grounding|>`` tags are dropped, and the result is stripped.
    """
    # Reuse the shared pattern so cleaning and parsing always agree on block boundaries
    cleaned = DET_BLOCK.sub(lambda block: block.group("label"), text)
    # Also remove any standalone grounding tags
    cleaned = cleaned.replace("<|grounding|>", "")
    return cleaned.strip()


def parse_detections(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
    """Parse grounding boxes from text and scale from 0-999 normalized coords to image pixels.

    Handles both single and multiple bounding boxes, and any number of detection blocks:
    - Single: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2]]<|/det|>
    - Multiple: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2], [x1,y1,x2,y2], ...]<|/det|>

    Args:
        text: Raw model output; ``None`` or empty yields no boxes.
        image_width: Pixel width used to scale x coordinates.
        image_height: Pixel height used to scale y coordinates.

    Returns:
        A list of ``{"label": str, "box": [x1, y1, x2, y2]}`` dicts. Blocks whose
        coordinates cannot be parsed are logged and skipped.
    """
    import ast  # imported locally so the block does not rely on a module-level import

    boxes: List[Dict[str, Any]] = []

    for m in DET_BLOCK.finditer(text or ""):
        label = m.group("label").strip()
        coords_str = m.group("coords").strip()

        print(f"🔍 DEBUG: Found detection for '{label}'")
        print(f"📦 Raw coords string (with brackets): {coords_str}")

        try:
            # Parse the full bracket expression directly (handles single and multiple)
            parsed = ast.literal_eval(coords_str)

            # Normalize to a list of lists
            if (
                isinstance(parsed, list)
                and len(parsed) == 4
                and all(isinstance(n, (int, float)) for n in parsed)
            ):
                # Single box provided as [x1,y1,x2,y2]
                box_coords = [parsed]
                print("📦 Single box (flat list) detected")
            elif isinstance(parsed, list):
                box_coords = parsed
                print(f"📦 Boxes detected: {len(box_coords)}")
            else:
                raise ValueError("Unsupported coords structure")

            # Process each box
            for idx, box in enumerate(box_coords):
                if isinstance(box, (list, tuple)) and len(box) >= 4:
                    x1 = int(float(box[0]) / 999 * image_width)
                    y1 = int(float(box[1]) / 999 * image_height)
                    x2 = int(float(box[2]) / 999 * image_width)
                    y2 = int(float(box[3]) / 999 * image_height)
                    print(f" Box {idx+1}: {box} → [{x1}, {y1}, {x2}, {y2}]")
                    boxes.append({"label": label, "box": [x1, y1, x2, y2]})
                else:
                    print(f" ⚠️ Skipping invalid box: {box}")
        except Exception as e:
            # One malformed block must not discard the boxes from other blocks
            print(f"❌ Parsing failed: {e}")
            continue

    print(f"🎯 Total boxes parsed: {len(boxes)}")
    return boxes
# -----------------------------
# Routes
# -----------------------------
@@ -270,11 +95,38 @@ async def root():
@app.get("/health")
async def health():
return {"status": "healthy", "model_loaded": model is not None}
return {"status": "healthy", "models": registry.list_models() if registry else []}
@app.get("/api/models")
async def list_models():
    """Return the registered OCR models for the UI's model selector.

    Responds with 503 while the registry has not been built yet.
    """
    if registry is not None:
        return JSONResponse({"models": registry.list_models()})
    raise HTTPException(status_code=503, detail="Model registry not ready.")
def _resolve_provider(model_id: Optional[str], mode: str):
    """Return the provider for ``model_id``, rejecting capability mismatches.

    Raises:
        HTTPException: 503 if the registry is not built yet; 400 if the registry
            lookup raises ``ProviderError`` or if ``mode`` is a grounding mode and
            the provider's capabilities do not include grounding.
    """
    if registry is None:
        raise HTTPException(status_code=503, detail="Model registry not ready.")

    try:
        selected = registry.get(model_id)
    except ProviderError as lookup_error:
        raise HTTPException(status_code=400, detail=str(lookup_error))

    # Capabilities are only consulted when the requested mode actually needs grounding
    needs_grounding = mode in GROUNDING_MODES
    if needs_grounding and not selected.capabilities.get("grounding"):
        raise HTTPException(
            status_code=400,
            detail=f"Model '{selected.label}' does not support grounding modes (e.g. {mode}).",
        )

    return selected
@app.post("/api/ocr")
async def ocr_inference(
image: UploadFile = File(...),
model: Optional[str] = Form(None),
mode: str = Form("plain_ocr"),
prompt: str = Form(""),
grounding: bool = Form(False),
@@ -290,32 +142,18 @@ async def ocr_inference(
Perform OCR inference on uploaded image
- **image**: Image file to process
- **model**: OCR model id (see GET /api/models); defaults to the registry default
- **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
- **prompt**: Custom prompt for freeform mode
- **grounding**: Enable grounding boxes
- **grounding**: Enable grounding boxes (DeepSeek only)
- **include_caption**: Add image description
- **find_term**: Term to find (for find_ref mode)
- **schema**: JSON schema (for kv_json mode)
- **base_size**: Base processing size
- **image_size**: Image size parameter
- **crop_mode**: Enable crop mode
- **test_compress**: Test compression
- **base_size/image_size/crop_mode/test_compress**: DeepSeek processing options
"""
if model is None or tokenizer is None:
raise HTTPException(status_code=503, detail="Model not loaded yet")
# Build prompt
prompt_text = build_prompt(
mode=mode,
user_prompt=prompt,
grounding=grounding,
find_term=find_term,
schema=schema,
include_caption=include_caption,
)
provider = _resolve_provider(model, mode)
tmp_img = None
out_dir = None
try:
# Save uploaded file
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
@@ -330,42 +168,27 @@ async def ocr_inference(
except Exception:
orig_w = orig_h = None
out_dir = tempfile.mkdtemp(prefix="dsocr_")
# Run inference
res = model.infer(
tokenizer,
prompt=prompt_text,
image_file=tmp_img,
output_path=out_dir,
base_size=base_size,
image_size=image_size,
crop_mode=crop_mode,
save_results=False,
test_compress=test_compress,
eval_mode=True,
# Run inference through the selected provider
text = provider.run(
tmp_img,
mode=mode,
prompt=prompt,
grounding=grounding,
find_term=find_term,
schema=schema,
include_caption=include_caption,
options={
"base_size": base_size,
"image_size": image_size,
"crop_mode": crop_mode,
"test_compress": test_compress,
},
)
# Normalize response
if isinstance(res, str):
text = res.strip()
elif isinstance(res, dict) and "text" in res:
text = str(res["text"]).strip()
elif isinstance(res, (list, tuple)):
text = "\n".join(map(str, res)).strip()
else:
text = ""
# Fallback: check output file
if not text:
mmd = os.path.join(out_dir, "result.mmd")
if os.path.exists(mmd):
with open(mmd, "r", encoding="utf-8") as fh:
text = fh.read().strip()
if not text:
text = "No text returned by model."
# Parse grounding boxes with proper coordinate scaling
# Parse grounding boxes (no-op for providers/text without grounding tokens)
boxes = parse_detections(text, orig_w or 1, orig_h or 1) if ("<|det|>" in text or "<|ref|>" in text) else []
# Clean grounding tags from display text, but keep the labels
@@ -382,14 +205,21 @@ async def ocr_inference(
"boxes": boxes,
"image_dims": {"w": orig_w, "h": orig_h},
"metadata": {
"model": provider.id,
"model_label": provider.label,
"mode": mode,
"grounding": grounding or (mode in {"find_ref","layout_map","pii_redact"}),
"grounding": grounding or (mode in GROUNDING_MODES),
"base_size": base_size,
"image_size": image_size,
"crop_mode": crop_mode
}
})
except ProviderError as e:
print(f"OCR provider error: {e}")
raise HTTPException(status_code=502, detail=str(e))
except HTTPException:
raise
except Exception as e:
print(f"OCR inference error: {type(e).__name__}: {str(e)}")
raise HTTPException(status_code=500, detail="An internal error occurred during OCR processing.")
@@ -400,12 +230,11 @@ async def ocr_inference(
os.remove(tmp_img)
except Exception:
pass
if out_dir:
shutil.rmtree(out_dir, ignore_errors=True)
@app.post("/api/process-pdf")
async def process_pdf(
pdf_file: UploadFile = File(...),
model: Optional[str] = Form(None),
mode: str = Form("plain_ocr"),
prompt: str = Form(""),
output_format: str = Form("markdown"), # markdown, html, docx, json
@@ -432,8 +261,7 @@ async def process_pdf(
- **image_size**: Image size parameter
- **crop_mode**: Enable crop mode
"""
if model is None or tokenizer is None:
raise HTTPException(status_code=503, detail="Model not loaded yet")
provider = _resolve_provider(model, mode)
# Validate output format
if output_format not in ["markdown", "html", "docx", "json"]:
@@ -456,56 +284,32 @@ async def process_pdf(
for page_idx, img in enumerate(images):
print(f"🔍 Processing page {page_idx + 1}/{total_pages}...")
# Build prompt for this page
prompt_text = build_prompt(
mode=mode,
user_prompt=prompt,
grounding=grounding,
find_term=None,
schema=None,
include_caption=include_caption,
)
# Save image temporarily
tmp_img = None
out_dir = None
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
img.save(tmp, format="PNG")
tmp_img = tmp.name
orig_w, orig_h = img.size
out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_")
# Run inference
res = model.infer(
tokenizer,
prompt=prompt_text,
image_file=tmp_img,
output_path=out_dir,
base_size=base_size,
image_size=image_size,
crop_mode=crop_mode,
save_results=False,
test_compress=False,
eval_mode=True,
# Run inference through the selected provider
text = provider.run(
tmp_img,
mode=mode,
prompt=prompt,
grounding=grounding,
find_term=None,
schema=None,
include_caption=include_caption,
options={
"base_size": base_size,
"image_size": image_size,
"crop_mode": crop_mode,
"test_compress": False,
},
)
# Normalize response
if isinstance(res, str):
text = res.strip()
elif isinstance(res, dict) and "text" in res:
text = str(res["text"]).strip()
elif isinstance(res, (list, tuple)):
text = "\n".join(map(str, res)).strip()
else:
text = ""
if not text:
mmd = os.path.join(out_dir, "result.mmd")
if os.path.exists(mmd):
with open(mmd, "r", encoding="utf-8") as fh:
text = fh.read().strip()
if not text:
text = f"No text returned for page {page_idx + 1}."
@@ -550,8 +354,6 @@ async def process_pdf(
os.remove(tmp_img)
except Exception:
pass
if out_dir:
shutil.rmtree(out_dir, ignore_errors=True)
print(f"✅ Processed all {total_pages} pages")
@@ -562,6 +364,8 @@ async def process_pdf(
"total_pages": total_pages,
"pages": pages_content,
"metadata": {
"model": provider.id,
"model_label": provider.label,
"mode": mode,
"grounding": grounding,
"extract_images": extract_images,
@@ -590,6 +394,9 @@ async def process_pdf(
headers={"Content-Disposition": f"attachment; filename=ocr_result.docx"}
)
except ProviderError as e:
print(f"PDF provider error: {e}")
raise HTTPException(status_code=502, detail=str(e))
except Exception as e:
import traceback
print(f"Error processing PDF: {e}")
@@ -633,6 +440,7 @@ async def commit_job(
describe_text: str = Form(""),
freeform_text: str = Form(""),
mode: str = Form("plain_ocr"),
ocr_model: str = Form(""),
):
"""Commit an OCR job: save the image and insert a DB record."""
job_id = str(uuid.uuid4())
@@ -664,13 +472,14 @@ async def commit_job(
"""
INSERT INTO ocr_jobs
(id, author, book, chapter, page, image_path, original_filename,
ocr_text, describe_text, freeform_text, mode, status)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
ocr_text, describe_text, freeform_text, mode, ocr_model, status)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
RETURNING *
""",
(job_id, author or None, book or None, chapter or None,
page or None, image_path, original_filename,
ocr_text or None, describe_text or None, freeform_text or None, mode),
ocr_text or None, describe_text or None, freeform_text or None,
mode, ocr_model or None),
)
row = cur.fetchone()
except Exception as exc:
@@ -743,7 +552,7 @@ async def list_jobs(
cur.execute(
f"""
SELECT id, author, book, chapter, page, submitted_at, status,
reviewer_name, reviewed_at, mode, original_filename
reviewer_name, reviewed_at, mode, ocr_model, original_filename
FROM ocr_jobs {where}
ORDER BY submitted_at DESC
LIMIT %s OFFSET %s
@@ -945,6 +754,75 @@ async def set_job_status(job_id: str, body: StatusRequest):
return JSONResponse(_job_row_to_dict(row))
class JobDescribeRequest(BaseModel):
    # Optional OCR model id; None is handed to the provider lookup unchanged
    model: Optional[str] = None


@app.post("/api/jobs/{job_id}/describe")
async def describe_job(job_id: str, body: JobDescribeRequest):
    """Run Describe mode on a job's stored image and save the result to describe_text."""
    # Reject anything that is not a well-formed UUID before touching the database
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    # Fetch the path of the image stored for this job
    try:
        with get_db() as conn, conn.cursor() as cur:
            cur.execute("SELECT image_path FROM ocr_jobs WHERE id = %s", (job_id,))
            job_row = cur.fetchone()
    except Exception as exc:
        print(f"describe_job lookup DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not job_row:
        raise HTTPException(status_code=404, detail="Job not found.")

    stored_image = job_row["image_path"]
    if not (stored_image and os.path.isfile(stored_image)):
        raise HTTPException(status_code=404, detail="Image file not found on disk.")

    provider = _resolve_provider(body.model, "describe")

    # Fixed processing options for this endpoint (no per-request overrides)
    describe_options = {
        "base_size": 1024,
        "image_size": 640,
        "crop_mode": True,
        "test_compress": False,
    }
    try:
        raw_text = provider.run(
            stored_image,
            mode="describe",
            prompt="",
            grounding=False,
            find_term=None,
            schema=None,
            include_caption=False,
            options=describe_options,
        )
    except ProviderError as e:
        print(f"describe_job provider error: {e}")
        raise HTTPException(status_code=502, detail=str(e))
    except Exception as e:
        print(f"describe_job inference error: {type(e).__name__}: {e}")
        raise HTTPException(status_code=500, detail="An internal error occurred during description.")

    # Strip grounding markup only when the output actually contains it
    has_grounding_tags = "<|ref|>" in raw_text or "<|grounding|>" in raw_text
    description = clean_grounding_text(raw_text) if has_grounding_tags else raw_text

    # Store the generated description on the job row
    try:
        with get_db() as conn, conn.cursor() as cur:
            cur.execute(
                "UPDATE ocr_jobs SET describe_text = %s WHERE id = %s RETURNING *",
                (description, job_id),
            )
            saved_row = cur.fetchone()
    except Exception as exc:
        print(f"describe_job save DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    # No row back means the job was not found by the UPDATE
    if not saved_row:
        raise HTTPException(status_code=404, detail="Job not found.")

    return JSONResponse(_job_row_to_dict(saved_row))
@app.delete("/api/jobs/{job_id}")
async def delete_job(job_id: str):
"""Delete a job record and its stored image."""

View File

@@ -16,3 +16,4 @@ img2pdf>=0.5.0
python-docx>=1.1.0
markdown>=3.5.0
psycopg2-binary>=2.9.0
httpx>=0.27.0

View File

@@ -27,6 +27,15 @@ services:
MAX_UPLOAD_SIZE_MB: ${MAX_UPLOAD_SIZE_MB:-100}
DATABASE_URL: ${DATABASE_URL:-postgresql://ocr_user:ocr_password@postgres:5432/ocr_db}
OCR_IMAGES_DIR: ${OCR_IMAGES_DIR:-/data/ocr_images}
ENABLE_DEEPSEEK_LOCAL: ${ENABLE_DEEPSEEK_LOCAL:-true}
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://host.docker.internal:11434}
OLLAMA_MODELS: ${OLLAMA_MODELS:-}
DEFAULT_OCR_MODEL: ${DEFAULT_OCR_MODEL:-deepseek-local}
OLLAMA_TIMEOUT: ${OLLAMA_TIMEOUT:-300}
# Lets the container reach an Ollama server running on the Docker host
# (works out of the box on Docker Desktop; required for Linux engines).
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- ./models:/models
- ./ocr_images:/data/ocr_images

View File

@@ -1,5 +1,6 @@
import { useState, useCallback } from 'react'
import { useState, useCallback, useEffect } from 'react'
import { useSuggestions } from './hooks/useSuggestions'
import { useModels } from './hooks/useModels'
import { motion, AnimatePresence } from 'framer-motion'
import {
Sparkles, Zap, Loader2, Settings, Image as ImageIcon, FileText,
@@ -7,6 +8,7 @@ import {
} from 'lucide-react'
import ImageUpload from './components/ImageUpload'
import ModeSelector from './components/ModeSelector'
import ModelSelector from './components/ModelSelector'
import ResultPanel from './components/ResultPanel'
import AdvancedSettings from './components/AdvancedSettings'
import PDFProcessor from './components/PDFProcessor'
@@ -24,6 +26,8 @@ function App() {
const [view, setView] = useState('new_job')
// OCR state
const { models, loading: modelsLoading } = useModels()
const [model, setModel] = useState(null)
const [mode, setMode] = useState('plain_ocr')
const [fileType, setFileType] = useState('image')
const [image, setImage] = useState(null)
@@ -51,8 +55,15 @@ function App() {
const [commitResult, setCommitResult] = useState(null)
// Modes that produce editable text output and can be committed to the DB
const COMMITTABLE_MODES = new Set(['plain_ocr', 'describe', 'freeform'])
const MODE_LABELS = { plain_ocr: 'OCR Text', describe: 'Description', freeform: 'Freeform' }
const COMMITTABLE_MODES = new Set(['plain_ocr', 'describe'])
const MODE_LABELS = { plain_ocr: 'OCR Text', describe: 'Description' }
// Once the model list has loaded, select the model flagged as default
// (or the first one listed) unless a model is already selected
useEffect(() => {
  if (model || models.length === 0) return
  const preferred = models.find(m => m.default) || models[0]
  setModel(preferred.id)
}, [models, model])
// Show the full-screen result view once at least one committable mode has a result
const showResultView = view === 'new_job' && Object.keys(modeResults).length > 0
@@ -97,6 +108,7 @@ function App() {
try {
const formData = new FormData()
formData.append('image', image)
if (model) formData.append('model', model)
formData.append('mode', mode)
formData.append('prompt', prompt)
formData.append('grounding', mode === 'find_ref')
@@ -149,6 +161,7 @@ function App() {
formData.append('describe_text', editedResults.describe || '')
formData.append('freeform_text', editedResults.freeform || '')
formData.append('mode', mode)
if (model) formData.append('ocr_model', model)
const response = await axios.post(`${API_BASE}/jobs`, formData, {
headers: { 'Content-Type': 'multipart/form-data' },
@@ -159,7 +172,7 @@ function App() {
} finally {
setCommitLoading(false)
}
}, [image, editedResults, metadata, mode])
}, [image, editedResults, metadata, mode, model])
const handleCopy = useCallback(() => {
const text = (activeResultMode && editedResults[activeResultMode]) || result?.text
@@ -263,11 +276,12 @@ function App() {
>
{/* Run additional modes */}
<div className="glass p-4 rounded-2xl flex-shrink-0">
<ModeSelector
mode={mode} onModeChange={setMode}
prompt={prompt} onPromptChange={setPrompt}
findTerm={findTerm} onFindTermChange={setFindTerm}
/>
<div className="mb-3">
<ModelSelector
models={models} value={model} onChange={setModel} loading={modelsLoading}
/>
</div>
<ModeSelector mode={mode} onModeChange={setMode} />
<div className="flex items-center gap-3 mt-3">
<motion.button
onClick={handleSubmit}
@@ -462,12 +476,12 @@ function App() {
<MetadataForm metadata={metadata} onChange={setMetadata} suggestions={suggestions} />
<ModeSelector
mode={mode} onModeChange={setMode}
prompt={prompt} onPromptChange={setPrompt}
findTerm={findTerm} onFindTermChange={setFindTerm}
<ModelSelector
models={models} value={model} onChange={setModel} loading={modelsLoading}
/>
<ModeSelector mode={mode} onModeChange={setMode} />
<ImageUpload onImageSelect={handleImageSelect} preview={imagePreview} fileType={fileType} />
<motion.button
@@ -497,7 +511,7 @@ function App() {
{fileType === 'pdf' ? (
<PDFProcessor
pdfFile={image} mode={mode} prompt={prompt}
pdfFile={image} mode={mode} prompt={prompt} model={model}
advancedSettings={advancedSettings} includeCaption={includeCaption}
/>
) : (

View File

@@ -1,9 +1,10 @@
import { useState, useEffect, useCallback } from 'react'
import { useSuggestions } from '../hooks/useSuggestions'
import { useModels } from '../hooks/useModels'
import { motion, AnimatePresence } from 'framer-motion'
import {
Search, ChevronLeft, ChevronRight, CheckCircle2, Clock,
FileText, Loader2, Save, RefreshCw, Trash2,
FileText, Loader2, Save, RefreshCw, Trash2, Sparkles,
} from 'lucide-react'
import axios from 'axios'
@@ -32,10 +33,14 @@ function StatusBadge({ status }) {
// Full-screen Job Detail
// ─────────────────────────────────────────────────────────────
function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} }) {
const { models } = useModels()
const [job, setJob] = useState(null)
const [loading, setLoading] = useState(true)
const [error, setError] = useState(null)
const [describeModel, setDescribeModel] = useState('')
const [generatingDescribe, setGeneratingDescribe] = useState(false)
const [editedText, setEditedText] = useState('')
const [editDescribeText, setEditDescribeText] = useState('')
const [editFreeformText, setEditFreeformText] = useState('')
@@ -71,10 +76,9 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
setEditChapter(d.chapter || '')
setEditPage(d.page || '')
setReviewerName(d.reviewer_name || '')
// Default to first tab that has content
// Default to the OCR tab when there's OCR text, otherwise Description
if (d.reviewed_text || d.ocr_text) setActiveTab('ocr')
else if (d.describe_text) setActiveTab('describe')
else if (d.freeform_text) setActiveTab('freeform')
else setActiveTab('describe')
}
})
.catch(err => {
@@ -85,6 +89,32 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
return () => { cancelled = true }
}, [jobId])
// Default the Describe model to the job's original model (if it is still registered)
// or the registry default. Wait until BOTH the model list and the job have loaded:
// this effect only assigns once, so defaulting before the job arrives would lock in
// the registry default and the job's own model would never be picked.
useEffect(() => {
  if (describeModel || models.length === 0 || !job) return
  const registryDefault = models.find(m => m.default) || models[0]
  const jobModelIsRegistered = models.some(m => m.id === job.ocr_model)
  setDescribeModel(job.ocr_model && jobModelIsRegistered ? job.ocr_model : registryDefault.id)
}, [models, job, describeModel])
// Ask the backend to run Describe on this job's stored image, then sync the
// returned job into local state and notify the parent via onReviewed.
const handleGenerateDescribe = async () => {
  setGeneratingDescribe(true)
  setSaveResult(null)
  try {
    // An empty selection is sent as null
    const payload = { model: describeModel || null }
    const { data: updatedJob } = await axios.post(`${API_BASE}/jobs/${jobId}/describe`, payload)
    setJob(updatedJob)
    setEditDescribeText(updatedJob.describe_text || '')
    onReviewed(updatedJob)
  } catch (err) {
    // Prefer the API's error detail; fall back to the error's own message
    const detail = err.response?.data?.detail
    setSaveResult({ success: false, error: detail || err.message })
  } finally {
    setGeneratingDescribe(false)
  }
}
const handleSave = async () => {
if (!reviewerName.trim()) {
setSaveResult({ success: false, error: 'Reviewer name is required.' })
@@ -114,16 +144,24 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
}
const handleToggleStatus = async () => {
const next = isReviewed ? 'unreviewed' : 'reviewed'
if (next === 'reviewed' && !reviewerName.trim()) {
setSaveResult({ success: false, error: 'Reviewer name is required to mark reviewed.' })
// Marking reviewed accepts BOTH the reviewed document text and the description,
// so it goes through the full review save (not a status-only flip).
if (!isReviewed) {
setTogglingStatus(true)
try {
await handleSave()
} finally {
setTogglingStatus(false)
}
return
}
// Reverting to unreviewed preserves the saved reviewed text and description.
setTogglingStatus(true)
setSaveResult(null)
try {
const res = await axios.put(`${API_BASE}/jobs/${jobId}/status`, {
status: next,
status: 'unreviewed',
reviewer_name: reviewerName.trim() || null,
})
setJob(res.data)
@@ -259,8 +297,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
{(() => {
const tabs = [
job.ocr_text || job.reviewed_text ? { id: 'ocr', label: 'OCR Text' } : null,
job.describe_text != null ? { id: 'describe', label: 'Description' } : null,
job.freeform_text != null ? { id: 'freeform', label: 'Freeform' } : null,
{ id: 'describe', label: 'Description' },
].filter(Boolean)
return tabs.length > 1 ? (
<div className="flex gap-1 mb-3 flex-shrink-0">
@@ -282,7 +319,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
})()}
<p className="text-xs text-gray-400 mb-2 flex-shrink-0">
{{ ocr: isReviewed ? 'Reviewed Text' : 'OCR Text', describe: 'Description', freeform: 'Freeform' }[activeTab]}
{{ ocr: isReviewed ? 'Reviewed Text' : 'OCR Text', describe: 'Description' }[activeTab]}
<span className="text-purple-400 ml-1">(editable)</span>
</p>
@@ -307,20 +344,43 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
</>
)}
{activeTab === 'describe' && (
<textarea
value={editDescribeText}
onChange={e => setEditDescribeText(e.target.value)}
className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
placeholder="Description text..."
/>
)}
{activeTab === 'freeform' && (
<textarea
value={editFreeformText}
onChange={e => setEditFreeformText(e.target.value)}
className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
placeholder="Freeform result..."
/>
<>
<div className="flex items-center gap-2 mb-2 flex-shrink-0">
<select
value={describeModel}
onChange={e => setDescribeModel(e.target.value)}
disabled={generatingDescribe || models.length === 0}
className="bg-white/5 border border-white/10 rounded-lg px-2 py-1.5 text-xs text-gray-200 focus:outline-none focus:border-purple-500/50"
>
{models.length === 0 && <option value="">No models</option>}
{models.map(m => (
<option key={m.id} value={m.id}>{m.label}{m.default ? ' (default)' : ''}</option>
))}
</select>
<motion.button
onClick={handleGenerateDescribe}
disabled={generatingDescribe || !describeModel}
className={`flex items-center gap-1.5 px-3 py-1.5 rounded-lg text-xs font-medium transition-all ${
generatingDescribe || !describeModel
? 'opacity-50 cursor-not-allowed bg-white/5'
: 'bg-gradient-to-r from-violet-600 to-purple-600 hover:from-violet-500 hover:to-purple-500'
}`}
whileHover={!generatingDescribe && describeModel ? { scale: 1.02 } : {}}
whileTap={!generatingDescribe && describeModel ? { scale: 0.98 } : {}}
title="Run Describe on this job's image and save it"
>
{generatingDescribe
? <><Loader2 className="w-3.5 h-3.5 animate-spin" /> Generating</>
: <><Sparkles className="w-3.5 h-3.5" /> Generate Description</>}
</motion.button>
</div>
<textarea
value={editDescribeText}
onChange={e => setEditDescribeText(e.target.value)}
className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
placeholder="No description yet — pick a model and click Generate Description, or type one here."
/>
</>
)}
</div>
</div>
@@ -385,6 +445,12 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
</div>
</div>
{!isReviewed && (
<p className="text-xs text-gray-500 mt-2">
Marking reviewed accepts both the reviewed document text and the description.
</p>
)}
{saveResult && (
<motion.div
initial={{ opacity: 0, y: -4 }} animate={{ opacity: 1, y: 0 }}
@@ -405,6 +471,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
<span className="text-xs text-gray-500">Last reviewed: {new Date(job.reviewed_at).toLocaleString()}</span>
)}
{job.mode && <span className="text-xs text-gray-500">Mode: {job.mode}</span>}
{job.ocr_model && <span className="text-xs text-gray-500">Model: {job.ocr_model}</span>}
</div>
</div>
</>
@@ -573,7 +640,10 @@ export default function JobsPanel() {
{job.page && <span className="text-xs text-gray-500">p. {job.page}</span>}
</div>
{job.author && <p className="text-xs text-gray-400 mt-1">{job.author}</p>}
<p className="text-xs text-gray-600 mt-2 font-mono">{new Date(job.submitted_at).toLocaleDateString()}</p>
<div className="flex items-center justify-between mt-2">
<p className="text-xs text-gray-600 font-mono">{new Date(job.submitted_at).toLocaleDateString()}</p>
{job.ocr_model && <span className="text-[10px] text-gray-500 truncate ml-2">{job.ocr_model}</span>}
</div>
</motion.button>
))}
</AnimatePresence>

View File

@@ -1,29 +1,17 @@
import { motion } from 'framer-motion'
import { FileText, Eye, Search, Wand2 } from 'lucide-react'
import { FileText, Eye } from 'lucide-react'
const modes = [
{ id: 'plain_ocr', name: 'Plain OCR', icon: FileText, color: 'from-blue-500 to-cyan-500', desc: 'Extract raw text', needsInput: false },
{ id: 'describe', name: 'Describe', icon: Eye, color: 'from-violet-500 to-purple-500', desc: 'Image description', needsInput: false },
{ id: 'find_ref', name: 'Find', icon: Search, color: 'from-yellow-500 to-orange-500', desc: 'Locate specific terms', needsInput: 'findTerm' },
{ id: 'freeform', name: 'Freeform', icon: Wand2, color: 'from-fuchsia-500 to-pink-500', desc: 'Custom prompt', needsInput: 'prompt' },
{ id: 'plain_ocr', name: 'Plain OCR', icon: FileText, color: 'from-blue-500 to-cyan-500', desc: 'Extract raw text' },
{ id: 'describe', name: 'Describe', icon: Eye, color: 'from-violet-500 to-purple-500', desc: 'Image description' },
]
export default function ModeSelector({
mode,
onModeChange,
prompt,
onPromptChange,
findTerm,
onFindTermChange
}) {
const selectedMode = modes.find(m => m.id === mode)
const needsInput = selectedMode?.needsInput
export default function ModeSelector({ mode, onModeChange }) {
return (
<div className="glass p-4 rounded-2xl space-y-3">
<h3 className="text-sm font-semibold text-gray-200">Mode</h3>
<div className="grid grid-cols-4 gap-2">
<div className="grid grid-cols-2 gap-2">
{modes.map((m) => {
const Icon = m.icon
const isSelected = mode === m.id
@@ -32,6 +20,7 @@ export default function ModeSelector({
<motion.button
key={m.id}
onClick={() => onModeChange(m.id)}
title={m.desc}
className={`
relative p-2 rounded-xl text-center transition-all
${isSelected
@@ -68,38 +57,6 @@ export default function ModeSelector({
)
})}
</div>
{needsInput === 'findTerm' && (
<motion.div
initial={{ opacity: 0, height: 0 }}
animate={{ opacity: 1, height: 'auto' }}
exit={{ opacity: 0, height: 0 }}
>
<input
type="text"
value={findTerm}
onChange={(e) => onFindTermChange(e.target.value)}
placeholder="Enter term to find (e.g., Total, Invoice #)"
className="w-full bg-white/5 border border-white/10 rounded-xl px-3 py-2 text-sm focus:outline-none focus:border-purple-500 transition-colors"
/>
</motion.div>
)}
{needsInput === 'prompt' && (
<motion.div
initial={{ opacity: 0, height: 0 }}
animate={{ opacity: 1, height: 'auto' }}
exit={{ opacity: 0, height: 0 }}
>
<textarea
value={prompt}
onChange={(e) => onPromptChange(e.target.value)}
placeholder="Enter your custom prompt..."
className="w-full bg-white/5 border border-white/10 rounded-xl px-3 py-2 text-sm focus:outline-none focus:border-purple-500 transition-colors resize-none"
rows={2}
/>
</motion.div>
)}
</div>
)
}

View File

@@ -5,7 +5,7 @@ import axios from 'axios'
const API_BASE = import.meta.env.VITE_API_URL || '/api'
function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption }) {
function PDFProcessor({ pdfFile, mode, prompt, model, advancedSettings, includeCaption }) {
const [processing, setProcessing] = useState(false)
const [progress, setProgress] = useState(0)
const [result, setResult] = useState(null)
@@ -29,6 +29,7 @@ function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption
try {
const formData = new FormData()
formData.append('pdf_file', pdfFile)
if (model) formData.append('model', model)
formData.append('mode', mode)
formData.append('prompt', prompt)
formData.append('output_format', outputFormat)
@@ -80,7 +81,7 @@ function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption
} finally {
setProcessing(false)
}
}, [pdfFile, mode, prompt, outputFormat, includeCaption, advancedSettings])
}, [pdfFile, mode, prompt, model, outputFormat, includeCaption, advancedSettings])
const handleDownloadJSON = useCallback(() => {
if (!result || outputFormat !== 'json') return