Remove Freeform and Find from UI. Allow Description to be added to Reviewed job

2026-06-29 13:09:01 +01:00
parent 48f958de6c
commit 04bbbebd5a
10 changed files with 394 additions and 403 deletions
--- a/.env.example
+++ b/.env.example
@@ -11,6 +11,19 @@ FRONTEND_PORT=3000
 MODEL_NAME=deepseek-ai/DeepSeek-OCR
 HF_HOME=/models

+# OCR model selection
+# Register the local DeepSeek-OCR model (set to false for an Ollama-only deployment)
+ENABLE_DEEPSEEK_LOCAL=true
+# External Ollama host the backend should call (no trailing slash)
+OLLAMA_BASE_URL=http://host.docker.internal:11434
+# Comma-separated Ollama vision model tags to surface in the UI.
+# Pull these on the Ollama host first, e.g. `ollama pull glm-ocr`.
+OLLAMA_MODELS=glm-ocr,llama3.2-vision,minicpm-v,qwen2.5vl
+# Default model id selected in the UI (deepseek-local or ollama:<tag>)
+DEFAULT_OCR_MODEL=deepseek-local
+# Per-request timeout (seconds) for Ollama calls
+OLLAMA_TIMEOUT=300
+
 # CORS Configuration (comma-separated origins, defaults to http://localhost:3000)
 CORS_ORIGINS=http://localhost:3000

--- a/README.md
+++ b/README.md
@@ -172,6 +172,13 @@ FRONTEND_PORT=3000
 MODEL_NAME=deepseek-ai/DeepSeek-OCR
 HF_HOME=/models

+# OCR model selection (DeepSeek + Ollama)
+ENABLE_DEEPSEEK_LOCAL=true                          # register the local GPU model
+OLLAMA_BASE_URL=http://host.docker.internal:11434   # external Ollama host
+OLLAMA_MODELS=glm-ocr,llama3.2-vision,minicpm-v,qwen2.5vl
+DEFAULT_OCR_MODEL=deepseek-local                    # deepseek-local or ollama:<tag>
+OLLAMA_TIMEOUT=300                                  # per-request timeout (seconds)
+
 # Upload Configuration
 MAX_UPLOAD_SIZE_MB=100  # Maximum file upload size

@@ -186,13 +193,47 @@ CROP_MODE=true         # Enable dynamic cropping for large images
 - `API_HOST`: Backend API host (default: 0.0.0.0)
 - `API_PORT`: Backend API port (default: 8000)
 - `FRONTEND_PORT`: Frontend port (default: 3000)
- `MODEL_NAME`: HuggingFace model identifier
+- `MODEL_NAME`: HuggingFace model identifier for the local DeepSeek-OCR model
 - `HF_HOME`: Model cache directory
+- `ENABLE_DEEPSEEK_LOCAL`: Register the local DeepSeek-OCR model (set `false` for an Ollama-only deployment with no GPU model loaded)
+- `OLLAMA_BASE_URL`: Base URL (no trailing slash) of the external Ollama server the backend calls for non-DeepSeek models
+- `OLLAMA_MODELS`: Comma-separated Ollama vision model tags to expose in the UI (pull them on the Ollama host first, e.g. `ollama pull glm-ocr`)
+- `DEFAULT_OCR_MODEL`: Model id selected by default (`deepseek-local` or `ollama:<tag>`)
+- `OLLAMA_TIMEOUT`: Per-request timeout in seconds for Ollama calls
 - `MAX_UPLOAD_SIZE_MB`: Maximum file upload size in megabytes
 - `BASE_SIZE`: Base image processing size (affects memory usage)
 - `IMAGE_SIZE`: Tile size for dynamic cropping
 - `CROP_MODE`: Enable/disable dynamic image cropping

+### Choosing an OCR Model
+
+The **Model** selector (next to the Mode selector) chooses which backend runs the OCR:
+
+- **DeepSeek-OCR (local GPU)** — the default. Loaded lazily on first use. Supports
+  every mode including grounding/bounding-box modes (Find), plus the Advanced
+  Settings (base size, crop mode, etc.).
+- **Ollama models** — any vision model pulled on your Ollama host and listed in
+  `OLLAMA_MODELS` (e.g. `glm-ocr`, `llama3.2-vision`). These run remotely on the
+  Ollama server. They return **plain text only**: bounding boxes are not produced,
+  so grounding modes (Find) are unavailable and the DeepSeek-specific Advanced
+  Settings are ignored when an Ollama model is selected.
+
+Setup for Ollama models:
+
+```bash
+# On the machine running Ollama
+ollama pull glm-ocr
+ollama pull llama3.2-vision
+
+# Point the backend at it (in .env), then restart
+OLLAMA_BASE_URL=http://host.docker.internal:11434
+OLLAMA_MODELS=glm-ocr,llama3.2-vision
+```
+
+`GET /api/models` returns the registered models and their capabilities; the UI
+populates the selector from it. The model used for each job is stored on the job
+record (`ocr_model`) and shown in the Browse Jobs view.
+
 ## Tech Stack

 ### Frontend
@@ -377,6 +418,7 @@ For large images, the model uses dynamic cropping:

 **Parameters:**
 - `image` (file, required) - Image file to process (up to 100MB)
+- `model` (string) - OCR model id from `GET /api/models` (default: registry default). Grounding/Advanced settings apply to DeepSeek only.
 - `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
 - `prompt` (string) - Custom prompt for freeform mode
 - `grounding` (bool) - Enable bounding boxes (auto-enabled for find_ref)
@@ -416,6 +458,7 @@ Process PDF documents with OCR and export to various formats.

 **Parameters:**
 - `pdf_file` (file, required) - PDF file to process (up to 100MB)
+- `model` (string) - OCR model id from `GET /api/models` (default: registry default)
 - `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
 - `prompt` (string) - Custom prompt for freeform mode
 - `output_format` (string) - Output format: `markdown` | `html` | `docx` | `json`
--- a/backend/database.py
+++ b/backend/database.py
@@ -62,6 +62,11 @@ def init_db():
                ALTER TABLE ocr_jobs
                ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ
            """)
+            # Which OCR model produced this job (e.g. "deepseek-local", "ollama:glm-ocr")
+            cur.execute("""
+                ALTER TABLE ocr_jobs
+                ADD COLUMN IF NOT EXISTS ocr_model TEXT
+            """)
            # Trigger function: stamp updated_at on every row update
            cur.execute("""
                CREATE OR REPLACE FUNCTION set_updated_at()
--- a/backend/main.py
+++ b/backend/main.py
@@ -1,8 +1,6 @@
 import os
-import re
 import uuid
 import tempfile
-import shutil
 import base64
 from typing import List, Dict, Any, Optional
 from contextlib import asynccontextmanager
@@ -12,8 +10,6 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
 from pydantic import BaseModel
-import torch
-from transformers import AutoModel, AutoTokenizer
 from PIL import Image
 import uvicorn
 from decouple import config as env_config
@@ -28,20 +24,29 @@ from pdf_utils import (
 )
 from format_converter import DocumentConverter
 from database import init_db, get_db
+from providers import (
+    build_registry,
+    parse_detections,
+    clean_grounding_text,
+    ProviderError,
+    GROUNDING_MODES,
+)

 OCR_IMAGES_DIR = env_config("OCR_IMAGES_DIR", default="/data/ocr_images")

 # -----------------------------
-# Lifespan context for model loading
+# Lifespan context
 # -----------------------------
-model = None
-tokenizer = None
+# The model registry holds all available OCR providers. Local models (e.g.
+# DeepSeek-OCR) are loaded lazily on first use so an Ollama-only deployment
+# starts instantly and never touches the GPU.
+registry = None

@asynccontextmanager
 async def lifespan(app: FastAPI):
-    """Load model on startup, cleanup on shutdown"""
-    global model, tokenizer
-    
+    """Build the model registry on startup."""
+    global registry
+
    # Image storage directory
    os.makedirs(OCR_IMAGES_DIR, exist_ok=True)

@@ -51,42 +56,11 @@ async def lifespan(app: FastAPI):
    except Exception as exc:
        print(f"Warning: database initialization failed: {exc}")

-    # Environment setup
-    os.environ.pop("TRANSFORMERS_CACHE", None)
-    MODEL_NAME = env_config("MODEL_NAME", default="deepseek-ai/DeepSeek-OCR")
-    HF_HOME = env_config("HF_HOME", default="/models")
-    os.makedirs(HF_HOME, exist_ok=True)
-    
-    # Load model
-    print(f"🚀 Loading {MODEL_NAME}...")
-    torch_dtype = torch.bfloat16
-    
-    tokenizer = AutoTokenizer.from_pretrained(
-        MODEL_NAME,
-        trust_remote_code=True,
-    )
-    
-    model = AutoModel.from_pretrained(
-        MODEL_NAME,
-        trust_remote_code=True,
-        use_safetensors=True,
-        attn_implementation="eager",
-        torch_dtype=torch_dtype,
-    ).eval().to("cuda")
-    
-    # Pad token setup
-    try:
-        if getattr(tokenizer, "pad_token_id", None) is None and getattr(tokenizer, "eos_token_id", None) is not None:
-            tokenizer.pad_token = tokenizer.eos_token
-        if getattr(model.config, "pad_token_id", None) is None and getattr(tokenizer, "pad_token_id", None) is not None:
-            model.config.pad_token_id = tokenizer.pad_token_id
-    except Exception:
-        pass
-    
-    print("✅ Model loaded and ready!")
-    
+    # OCR model registry (providers load their models lazily)
+    registry = build_registry()
+
    yield
-    
+
    # Cleanup
    print("🛑 Shutting down...")

@@ -112,155 +86,6 @@ app.add_middleware(
    allow_headers=["*"],
 )

-# -----------------------------
-# Prompt builder
-# -----------------------------
-def build_prompt(
-    mode: str,
-    user_prompt: str,
-    grounding: bool,
-    find_term: Optional[str],
-    schema: Optional[str],
-    include_caption: bool,
-) -> str:
-    """Build the prompt based on mode"""
-    parts: List[str] = ["<image>"]
-    mode_requires_grounding = mode in {"find_ref", "layout_map", "pii_redact"}
-    if grounding or mode_requires_grounding:
-        parts.append("<|grounding|>")
-
-    instruction = ""
-    if mode == "plain_ocr":
-        instruction = "Free OCR."
-    elif mode == "markdown":
-        instruction = "Convert the document to markdown."
-    elif mode == "tables_csv":
-        instruction = (
-            "Extract every table and output CSV only. "
-            "Use commas, minimal quoting. If multiple tables, separate with a line containing '---'."
-        )
-    elif mode == "tables_md":
-        instruction = "Extract every table as GitHub-flavored Markdown tables. Output only the tables."
-    elif mode == "kv_json":
-        schema_text = schema.strip() if schema else "{}"
-        instruction = (
-            "Extract key fields and return strict JSON only. "
-            f"Use this schema (fill the values): {schema_text}"
-        )
-    elif mode == "figure_chart":
-        instruction = (
-            "Parse the figure. First extract any numeric series as a two-column table (x,y). "
-            "Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary."
-        )
-    elif mode == "find_ref":
-        key = (find_term or "").strip() or "Total"
-        instruction = f"Locate <|ref|>{key}<|/ref|> in the image."
-    elif mode == "layout_map":
-        instruction = (
-            'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],'
-            '"box":[x1,y1,x2,y2]}. Do not include any text content.'
-        )
-    elif mode == "pii_redact":
-        instruction = (
-            'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. '
-            'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.'
-        )
-    elif mode == "multilingual":
-        instruction = "Free OCR. Detect the language automatically and output in the same script."
-    elif mode == "describe":
-        instruction = "Describe this image. Focus on visible key elements."
-    elif mode == "freeform":
-        instruction = user_prompt.strip() if user_prompt else "OCR this image."
-    else:
-        instruction = "OCR this image."
-
-    if include_caption and mode not in {"describe"}:
-        instruction = instruction + "\nThen add a one-paragraph description of the image."
-
-    parts.append(instruction)
-    return "\n".join(parts)
-
-# -----------------------------
-# Grounding parser
-# -----------------------------
-# Match a full detection block and capture the coordinates as the entire list expression
-# Examples of captured coords (including outer brackets):
-#  - [[312, 339, 480, 681]]
-#  - [[504, 700, 625, 910], [771, 570, 996, 996]]
-#  - [[110, 310, 255, 800], [312, 343, 479, 680], ...]
-# Using a greedy bracket capture ensures we include all inner lists up to the last ']' before </|det|>
-DET_BLOCK = re.compile(
-    r"<\|ref\|>(?P<label>.*?)<\|/ref\|>\s*<\|det\|>\s*(?P<coords>\[.*\])\s*<\|/det\|>",
-    re.DOTALL,
-)
-
-def clean_grounding_text(text: str) -> str:
-    """Remove grounding tags from text for display, keeping labels"""
-    # Replace <|ref|>label<|/ref|><|det|>[...any nested lists...]<|/det|> with just the label
-    cleaned = re.sub(
-        r"<\|ref\|>(.*?)<\|/ref\|>\s*<\|det\|>\s*\[.*\]\s*<\|/det\|>",
-        r"\1",
-        text,
-        flags=re.DOTALL,
-    )
-    # Also remove any standalone grounding tags
-    cleaned = re.sub(r"<\|grounding\|>", "", cleaned)
-    return cleaned.strip()
-
-def parse_detections(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
-    """Parse grounding boxes from text and scale from 0-999 normalized coords to actual image dimensions
-    
-    Handles both single and multiple bounding boxes:
-    - Single: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2]]<|/det|>
-    - Multiple: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2], [x1,y1,x2,y2], ...]<|/det|>
-    """
-    boxes: List[Dict[str, Any]] = []
-    for m in DET_BLOCK.finditer(text or ""):
-        label = m.group("label").strip()
-        coords_str = m.group("coords").strip()
-
-        print(f"🔍 DEBUG: Found detection for '{label}'")
-        print(f"📦 Raw coords string (with brackets): {coords_str}")
-
-        try:
-            import ast
-
-            # Parse the full bracket expression directly (handles single and multiple)
-            parsed = ast.literal_eval(coords_str)
-
-            # Normalize to a list of lists
-            if (
-                isinstance(parsed, list)
-                and len(parsed) == 4
-                and all(isinstance(n, (int, float)) for n in parsed)
-            ):
-                # Single box provided as [x1,y1,x2,y2]
-                box_coords = [parsed]
-                print("📦 Single box (flat list) detected")
-            elif isinstance(parsed, list):
-                box_coords = parsed
-                print(f"📦 Boxes detected: {len(box_coords)}")
-            else:
-                raise ValueError("Unsupported coords structure")
-
-            # Process each box
-            for idx, box in enumerate(box_coords):
-                if isinstance(box, (list, tuple)) and len(box) >= 4:
-                    x1 = int(float(box[0]) / 999 * image_width)
-                    y1 = int(float(box[1]) / 999 * image_height)
-                    x2 = int(float(box[2]) / 999 * image_width)
-                    y2 = int(float(box[3]) / 999 * image_height)
-                    print(f"  Box {idx+1}: {box} → [{x1}, {y1}, {x2}, {y2}]")
-                    boxes.append({"label": label, "box": [x1, y1, x2, y2]})
-                else:
-                    print(f"  ⚠️ Skipping invalid box: {box}")
-        except Exception as e:
-            print(f"❌ Parsing failed: {e}")
-            continue
-    
-    print(f"🎯 Total boxes parsed: {len(boxes)}")
-    return boxes
-
 # -----------------------------
 # Routes
 # -----------------------------
@@ -270,11 +95,38 @@ async def root():

@app.get("/health")
 async def health():
-    return {"status": "healthy", "model_loaded": model is not None}
+    return {"status": "healthy", "models": registry.list_models() if registry else []}
+
+
+@app.get("/api/models")
+async def list_models():
+    """List the OCR models available for selection in the UI."""
+    if registry is None:
+        raise HTTPException(status_code=503, detail="Model registry not ready.")
+    return JSONResponse({"models": registry.list_models()})
+
+
+def _resolve_provider(model_id: Optional[str], mode: str):
+    """Look up the provider and reject capability mismatches (e.g. grounding)."""
+    if registry is None:
+        raise HTTPException(status_code=503, detail="Model registry not ready.")
+    try:
+        provider = registry.get(model_id)
+    except ProviderError as exc:
+        raise HTTPException(status_code=400, detail=str(exc))
+
+    if mode in GROUNDING_MODES and not provider.capabilities.get("grounding"):
+        raise HTTPException(
+            status_code=400,
+            detail=f"Model '{provider.label}' does not support grounding modes (e.g. {mode}).",
+        )
+    return provider
+

@app.post("/api/ocr")
 async def ocr_inference(
    image: UploadFile = File(...),
+    model: Optional[str] = Form(None),
    mode: str = Form("plain_ocr"),
    prompt: str = Form(""),
    grounding: bool = Form(False),
@@ -288,93 +140,64 @@ async def ocr_inference(
 ):
    """
    Perform OCR inference on uploaded image
-    
+
    - **image**: Image file to process
+    - **model**: OCR model id (see GET /api/models); defaults to the registry default
    - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
    - **prompt**: Custom prompt for freeform mode
-    - **grounding**: Enable grounding boxes
+    - **grounding**: Enable grounding boxes (DeepSeek only)
    - **include_caption**: Add image description
    - **find_term**: Term to find (for find_ref mode)
    - **schema**: JSON schema (for kv_json mode)
-    - **base_size**: Base processing size
-    - **image_size**: Image size parameter
-    - **crop_mode**: Enable crop mode
-    - **test_compress**: Test compression
+    - **base_size/image_size/crop_mode/test_compress**: DeepSeek processing options
    """
-    if model is None or tokenizer is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet")
-    
-    # Build prompt
-    prompt_text = build_prompt(
-        mode=mode,
-        user_prompt=prompt,
-        grounding=grounding,
-        find_term=find_term,
-        schema=schema,
-        include_caption=include_caption,
-    )
-    
+    provider = _resolve_provider(model, mode)
+
    tmp_img = None
-    out_dir = None
    try:
        # Save uploaded file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            content = await image.read()
            tmp.write(content)
            tmp_img = tmp.name
-        
+
        # Get original dimensions
        try:
            with Image.open(tmp_img) as im:
                orig_w, orig_h = im.size
        except Exception:
            orig_w = orig_h = None
-        
-        out_dir = tempfile.mkdtemp(prefix="dsocr_")
-        
-        # Run inference
-        res = model.infer(
-            tokenizer,
-            prompt=prompt_text,
-            image_file=tmp_img,
-            output_path=out_dir,
-            base_size=base_size,
-            image_size=image_size,
-            crop_mode=crop_mode,
-            save_results=False,
-            test_compress=test_compress,
-            eval_mode=True,
+
+        # Run inference through the selected provider
+        text = provider.run(
+            tmp_img,
+            mode=mode,
+            prompt=prompt,
+            grounding=grounding,
+            find_term=find_term,
+            schema=schema,
+            include_caption=include_caption,
+            options={
+                "base_size": base_size,
+                "image_size": image_size,
+                "crop_mode": crop_mode,
+                "test_compress": test_compress,
+            },
        )
-        
-        # Normalize response
-        if isinstance(res, str):
-            text = res.strip()
-        elif isinstance(res, dict) and "text" in res:
-            text = str(res["text"]).strip()
-        elif isinstance(res, (list, tuple)):
-            text = "\n".join(map(str, res)).strip()
-        else:
-            text = ""
-        
-        # Fallback: check output file
-        if not text:
-            mmd = os.path.join(out_dir, "result.mmd")
-            if os.path.exists(mmd):
-                with open(mmd, "r", encoding="utf-8") as fh:
-                    text = fh.read().strip()
+
        if not text:
            text = "No text returned by model."
-        
-        # Parse grounding boxes with proper coordinate scaling
+
+        # Parse grounding boxes (no-op for providers/text without grounding tokens)
        boxes = parse_detections(text, orig_w or 1, orig_h or 1) if ("<|det|>" in text or "<|ref|>" in text) else []
-        
+
        # Clean grounding tags from display text, but keep the labels
        display_text = clean_grounding_text(text) if ("<|ref|>" in text or "<|grounding|>" in text) else text
-        
+
        # If display text is empty after cleaning but we have boxes, show the labels
        if not display_text and boxes:
            display_text = ", ".join([b["label"] for b in boxes])
-        
+
        return JSONResponse({
            "success": True,
            "text": display_text,
@@ -382,30 +205,36 @@ async def ocr_inference(
            "boxes": boxes,
            "image_dims": {"w": orig_w, "h": orig_h},
            "metadata": {
+                "model": provider.id,
+                "model_label": provider.label,
                "mode": mode,
-                "grounding": grounding or (mode in {"find_ref","layout_map","pii_redact"}),
+                "grounding": grounding or (mode in GROUNDING_MODES),
                "base_size": base_size,
                "image_size": image_size,
                "crop_mode": crop_mode
            }
        })
-    
+
+    except ProviderError as e:
+        print(f"OCR provider error: {e}")
+        raise HTTPException(status_code=502, detail=str(e))
+    except HTTPException:
+        raise
    except Exception as e:
        print(f"OCR inference error: {type(e).__name__}: {str(e)}")
        raise HTTPException(status_code=500, detail="An internal error occurred during OCR processing.")
-    
+
    finally:
        if tmp_img:
            try:
                os.remove(tmp_img)
            except Exception:
                pass
-        if out_dir:
-            shutil.rmtree(out_dir, ignore_errors=True)

@app.post("/api/process-pdf")
 async def process_pdf(
    pdf_file: UploadFile = File(...),
+    model: Optional[str] = Form(None),
    mode: str = Form("plain_ocr"),
    prompt: str = Form(""),
    output_format: str = Form("markdown"),  # markdown, html, docx, json
@@ -432,8 +261,7 @@ async def process_pdf(
    - **image_size**: Image size parameter
    - **crop_mode**: Enable crop mode
    """
-    if model is None or tokenizer is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet")
+    provider = _resolve_provider(model, mode)

    # Validate output format
    if output_format not in ["markdown", "html", "docx", "json"]:
@@ -456,56 +284,32 @@ async def process_pdf(
        for page_idx, img in enumerate(images):
            print(f"🔍 Processing page {page_idx + 1}/{total_pages}...")

-            # Build prompt for this page
-            prompt_text = build_prompt(
-                mode=mode,
-                user_prompt=prompt,
-                grounding=grounding,
-                find_term=None,
-                schema=None,
-                include_caption=include_caption,
-            )
-
            # Save image temporarily
            tmp_img = None
-            out_dir = None
            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                    img.save(tmp, format="PNG")
                    tmp_img = tmp.name

                orig_w, orig_h = img.size
-                out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_")

-                # Run inference
-                res = model.infer(
-                    tokenizer,
-                    prompt=prompt_text,
-                    image_file=tmp_img,
-                    output_path=out_dir,
-                    base_size=base_size,
-                    image_size=image_size,
-                    crop_mode=crop_mode,
-                    save_results=False,
-                    test_compress=False,
-                    eval_mode=True,
+                # Run inference through the selected provider
+                text = provider.run(
+                    tmp_img,
+                    mode=mode,
+                    prompt=prompt,
+                    grounding=grounding,
+                    find_term=None,
+                    schema=None,
+                    include_caption=include_caption,
+                    options={
+                        "base_size": base_size,
+                        "image_size": image_size,
+                        "crop_mode": crop_mode,
+                        "test_compress": False,
+                    },
                )

-                # Normalize response
-                if isinstance(res, str):
-                    text = res.strip()
-                elif isinstance(res, dict) and "text" in res:
-                    text = str(res["text"]).strip()
-                elif isinstance(res, (list, tuple)):
-                    text = "\n".join(map(str, res)).strip()
-                else:
-                    text = ""
-
-                if not text:
-                    mmd = os.path.join(out_dir, "result.mmd")
-                    if os.path.exists(mmd):
-                        with open(mmd, "r", encoding="utf-8") as fh:
-                            text = fh.read().strip()
                if not text:
                    text = f"No text returned for page {page_idx + 1}."

@@ -550,8 +354,6 @@ async def process_pdf(
                        os.remove(tmp_img)
                    except Exception:
                        pass
-                if out_dir:
-                    shutil.rmtree(out_dir, ignore_errors=True)

        print(f"✅ Processed all {total_pages} pages")

@@ -562,6 +364,8 @@ async def process_pdf(
                "total_pages": total_pages,
                "pages": pages_content,
                "metadata": {
+                    "model": provider.id,
+                    "model_label": provider.label,
                    "mode": mode,
                    "grounding": grounding,
                    "extract_images": extract_images,
@@ -590,6 +394,9 @@ async def process_pdf(
                headers={"Content-Disposition": f"attachment; filename=ocr_result.docx"}
            )

+    except ProviderError as e:
+        print(f"PDF provider error: {e}")
+        raise HTTPException(status_code=502, detail=str(e))
    except Exception as e:
        import traceback
        print(f"Error processing PDF: {e}")
@@ -633,6 +440,7 @@ async def commit_job(
    describe_text: str = Form(""),
    freeform_text: str = Form(""),
    mode: str = Form("plain_ocr"),
+    ocr_model: str = Form(""),
 ):
    """Commit an OCR job: save the image and insert a DB record."""
    job_id = str(uuid.uuid4())
@@ -664,13 +472,14 @@ async def commit_job(
                    """
                    INSERT INTO ocr_jobs
                        (id, author, book, chapter, page, image_path, original_filename,
-                         ocr_text, describe_text, freeform_text, mode, status)
-                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
+                         ocr_text, describe_text, freeform_text, mode, ocr_model, status)
+                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
                    RETURNING *
                    """,
                    (job_id, author or None, book or None, chapter or None,
                     page or None, image_path, original_filename,
-                     ocr_text or None, describe_text or None, freeform_text or None, mode),
+                     ocr_text or None, describe_text or None, freeform_text or None,
+                     mode, ocr_model or None),
                )
                row = cur.fetchone()
    except Exception as exc:
@@ -743,7 +552,7 @@ async def list_jobs(
                cur.execute(
                    f"""
                    SELECT id, author, book, chapter, page, submitted_at, status,
-                           reviewer_name, reviewed_at, mode, original_filename
+                           reviewer_name, reviewed_at, mode, ocr_model, original_filename
                    FROM ocr_jobs {where}
                    ORDER BY submitted_at DESC
                    LIMIT %s OFFSET %s
@@ -945,6 +754,75 @@ async def set_job_status(job_id: str, body: StatusRequest):
    return JSONResponse(_job_row_to_dict(row))


+class JobDescribeRequest(BaseModel):
+    model: Optional[str] = None
+
+
+@app.post("/api/jobs/{job_id}/describe")
+async def describe_job(job_id: str, body: JobDescribeRequest):
+    """Run Describe mode on a job's stored image and save the result to describe_text."""
+    try:
+        uuid.UUID(job_id)
+    except ValueError:
+        raise HTTPException(status_code=400, detail="Invalid job ID.")
+
+    # Look up the stored image for this job
+    try:
+        with get_db() as conn:
+            with conn.cursor() as cur:
+                cur.execute("SELECT image_path FROM ocr_jobs WHERE id = %s", (job_id,))
+                row = cur.fetchone()
+    except Exception as exc:
+        print(f"describe_job lookup DB error: {exc}")
+        raise HTTPException(status_code=500, detail="Database error.")
+
+    if not row:
+        raise HTTPException(status_code=404, detail="Job not found.")
+    image_path = row["image_path"]
+    if not image_path or not os.path.isfile(image_path):
+        raise HTTPException(status_code=404, detail="Image file not found on disk.")
+
+    provider = _resolve_provider(body.model, "describe")
+
+    try:
+        text = provider.run(
+            image_path,
+            mode="describe",
+            prompt="",
+            grounding=False,
+            find_term=None,
+            schema=None,
+            include_caption=False,
+            options={"base_size": 1024, "image_size": 640, "crop_mode": True, "test_compress": False},
+        )
+    except ProviderError as e:
+        print(f"describe_job provider error: {e}")
+        raise HTTPException(status_code=502, detail=str(e))
+    except Exception as e:
+        print(f"describe_job inference error: {type(e).__name__}: {e}")
+        raise HTTPException(status_code=500, detail="An internal error occurred during description.")
+
+    display_text = clean_grounding_text(text) if ("<|ref|>" in text or "<|grounding|>" in text) else text
+
+    # Persist the generated description on the job
+    try:
+        with get_db() as conn:
+            with conn.cursor() as cur:
+                cur.execute(
+                    "UPDATE ocr_jobs SET describe_text = %s WHERE id = %s RETURNING *",
+                    (display_text, job_id),
+                )
+                updated = cur.fetchone()
+    except Exception as exc:
+        print(f"describe_job save DB error: {exc}")
+        raise HTTPException(status_code=500, detail="Database error.")
+
+    if not updated:
+        raise HTTPException(status_code=404, detail="Job not found.")
+
+    return JSONResponse(_job_row_to_dict(updated))
+
+
@app.delete("/api/jobs/{job_id}")
 async def delete_job(job_id: str):
    """Delete a job record and its stored image."""
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -16,3 +16,4 @@ img2pdf>=0.5.0
 python-docx>=1.1.0
 markdown>=3.5.0
 psycopg2-binary>=2.9.0
+httpx>=0.27.0
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -27,6 +27,15 @@ services:
      MAX_UPLOAD_SIZE_MB: ${MAX_UPLOAD_SIZE_MB:-100}
      DATABASE_URL: ${DATABASE_URL:-postgresql://ocr_user:ocr_password@postgres:5432/ocr_db}
      OCR_IMAGES_DIR: ${OCR_IMAGES_DIR:-/data/ocr_images}
+      ENABLE_DEEPSEEK_LOCAL: ${ENABLE_DEEPSEEK_LOCAL:-true}
+      OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://host.docker.internal:11434}
+      OLLAMA_MODELS: ${OLLAMA_MODELS:-}
+      DEFAULT_OCR_MODEL: ${DEFAULT_OCR_MODEL:-deepseek-local}
+      OLLAMA_TIMEOUT: ${OLLAMA_TIMEOUT:-300}
+    # Maps host.docker.internal to the Docker host so the container can reach an Ollama server there
+    # (Docker Desktop resolves that name on its own; Linux engines need this entry).
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
    volumes:
      - ./models:/models
      - ./ocr_images:/data/ocr_images
--- a/frontend/src/App.jsx
+++ b/frontend/src/App.jsx
@@ -1,5 +1,6 @@
-import { useState, useCallback } from 'react'
+import { useState, useCallback, useEffect } from 'react'
 import { useSuggestions } from './hooks/useSuggestions'
+import { useModels } from './hooks/useModels'
 import { motion, AnimatePresence } from 'framer-motion'
 import {
  Sparkles, Zap, Loader2, Settings, Image as ImageIcon, FileText,
@@ -7,6 +8,7 @@ import {
 } from 'lucide-react'
 import ImageUpload from './components/ImageUpload'
 import ModeSelector from './components/ModeSelector'
+import ModelSelector from './components/ModelSelector'
 import ResultPanel from './components/ResultPanel'
 import AdvancedSettings from './components/AdvancedSettings'
 import PDFProcessor from './components/PDFProcessor'
@@ -24,6 +26,8 @@ function App() {
  const [view, setView] = useState('new_job')

  // OCR state
+  const { models, loading: modelsLoading } = useModels()
+  const [model, setModel] = useState(null)
  const [mode, setMode] = useState('plain_ocr')
  const [fileType, setFileType] = useState('image')
  const [image, setImage] = useState(null)
@@ -51,8 +55,15 @@ function App() {
  const [commitResult, setCommitResult] = useState(null)

  // Modes that produce editable text output and can be committed to the DB
-  const COMMITTABLE_MODES = new Set(['plain_ocr', 'describe', 'freeform'])
-  const MODE_LABELS = { plain_ocr: 'OCR Text', describe: 'Description', freeform: 'Freeform' }
+  const COMMITTABLE_MODES = new Set(['plain_ocr', 'describe'])
+  const MODE_LABELS = { plain_ocr: 'OCR Text', describe: 'Description' }
+
+  // Once the model list has loaded, preselect the flagged default (or the first entry)
+  useEffect(() => {
+    if (model || models.length === 0) return
+    const preferred = models.find(entry => entry.default) || models[0]
+    setModel(preferred.id)
+  }, [models, model])

  // Show the full-screen result view once at least one committable mode has a result
  const showResultView = view === 'new_job' && Object.keys(modeResults).length > 0
@@ -97,6 +108,7 @@ function App() {
    try {
      const formData = new FormData()
      formData.append('image', image)
+      if (model) formData.append('model', model)
      formData.append('mode', mode)
      formData.append('prompt', prompt)
      formData.append('grounding', mode === 'find_ref')
@@ -149,6 +161,7 @@ function App() {
      formData.append('describe_text', editedResults.describe || '')
      formData.append('freeform_text', editedResults.freeform || '')
      formData.append('mode', mode)
+      if (model) formData.append('ocr_model', model)

      const response = await axios.post(`${API_BASE}/jobs`, formData, {
        headers: { 'Content-Type': 'multipart/form-data' },
@@ -159,7 +172,7 @@ function App() {
    } finally {
      setCommitLoading(false)
    }
-  }, [image, editedResults, metadata, mode])
+  }, [image, editedResults, metadata, mode, model])

  const handleCopy = useCallback(() => {
    const text = (activeResultMode && editedResults[activeResultMode]) || result?.text
@@ -263,11 +276,12 @@ function App() {
            >
              {/* Run additional modes */}
              <div className="glass p-4 rounded-2xl flex-shrink-0">
-                <ModeSelector
-                  mode={mode} onModeChange={setMode}
-                  prompt={prompt} onPromptChange={setPrompt}
-                  findTerm={findTerm} onFindTermChange={setFindTerm}
-                />
+                <div className="mb-3">
+                  <ModelSelector
+                    models={models} value={model} onChange={setModel} loading={modelsLoading}
+                  />
+                </div>
+                <ModeSelector mode={mode} onModeChange={setMode} />
                <div className="flex items-center gap-3 mt-3">
                  <motion.button
                    onClick={handleSubmit}
@@ -462,12 +476,12 @@ function App() {

                  <MetadataForm metadata={metadata} onChange={setMetadata} suggestions={suggestions} />

-                  <ModeSelector
-                    mode={mode} onModeChange={setMode}
-                    prompt={prompt} onPromptChange={setPrompt}
-                    findTerm={findTerm} onFindTermChange={setFindTerm}
+                  <ModelSelector
+                    models={models} value={model} onChange={setModel} loading={modelsLoading}
                  />

+                  <ModeSelector mode={mode} onModeChange={setMode} />
+
                  <ImageUpload onImageSelect={handleImageSelect} preview={imagePreview} fileType={fileType} />

                  <motion.button
@@ -497,7 +511,7 @@ function App() {

                  {fileType === 'pdf' ? (
                    <PDFProcessor
-                      pdfFile={image} mode={mode} prompt={prompt}
+                      pdfFile={image} mode={mode} prompt={prompt} model={model}
                      advancedSettings={advancedSettings} includeCaption={includeCaption}
                    />
                  ) : (
--- a/frontend/src/components/JobsPanel.jsx
+++ b/frontend/src/components/JobsPanel.jsx
@@ -1,9 +1,10 @@
 import { useState, useEffect, useCallback } from 'react'
 import { useSuggestions } from '../hooks/useSuggestions'
+import { useModels } from '../hooks/useModels'
 import { motion, AnimatePresence } from 'framer-motion'
 import {
  Search, ChevronLeft, ChevronRight, CheckCircle2, Clock,
-  FileText, Loader2, Save, RefreshCw, Trash2,
+  FileText, Loader2, Save, RefreshCw, Trash2, Sparkles,
 } from 'lucide-react'
 import axios from 'axios'

@@ -32,10 +33,14 @@ function StatusBadge({ status }) {
 // Full-screen Job Detail
 // ─────────────────────────────────────────────────────────────
 function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} }) {
+  const { models } = useModels()
  const [job, setJob] = useState(null)
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState(null)

+  const [describeModel, setDescribeModel] = useState('')
+  const [generatingDescribe, setGeneratingDescribe] = useState(false)
+
  const [editedText, setEditedText]         = useState('')
  const [editDescribeText, setEditDescribeText] = useState('')
  const [editFreeformText, setEditFreeformText] = useState('')
@@ -71,10 +76,9 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
          setEditChapter(d.chapter || '')
          setEditPage(d.page || '')
          setReviewerName(d.reviewer_name || '')
-          // Default to first tab that has content
+          // Default to the OCR tab when there's OCR text, otherwise Description
          if (d.reviewed_text || d.ocr_text) setActiveTab('ocr')
-          else if (d.describe_text) setActiveTab('describe')
-          else if (d.freeform_text) setActiveTab('freeform')
+          else setActiveTab('describe')
        }
      })
      .catch(err => {
@@ -85,6 +89,32 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
    return () => { cancelled = true }
  }, [jobId])

+  // Default the Describe model to the job's original model (if available) or the registry default.
+  // Wait until the job has loaded: choosing earlier would lock in the default and ignore job.ocr_model.
+  useEffect(() => {
+    if (describeModel || !job || models.length === 0) return
+    const def = models.find(m => m.default) || models[0]
+    const fromJob = job.ocr_model && models.some(m => m.id === job.ocr_model) ? job.ocr_model : null
+    setDescribeModel(fromJob || def.id)
+  }, [models, job, describeModel])
+
+  const handleGenerateDescribe = async () => {
+    // Ask the backend to run Describe on this job's image; the response is the updated job.
+    setGeneratingDescribe(true)
+    setSaveResult(null)
+    try {
+      const endpoint = `${API_BASE}/jobs/${jobId}/describe`
+      const { data: updatedJob } = await axios.post(endpoint, { model: describeModel || null })
+      setJob(updatedJob)
+      setEditDescribeText(updatedJob.describe_text || '')
+      onReviewed(updatedJob)
+    } catch (err) {
+      setSaveResult({ success: false, error: err.response?.data?.detail || err.message })
+    } finally {
+      setGeneratingDescribe(false)
+    }
+  }
+
  const handleSave = async () => {
    if (!reviewerName.trim()) {
      setSaveResult({ success: false, error: 'Reviewer name is required.' })
@@ -114,16 +144,24 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
  }

  const handleToggleStatus = async () => {
-    const next = isReviewed ? 'unreviewed' : 'reviewed'
-    if (next === 'reviewed' && !reviewerName.trim()) {
-      setSaveResult({ success: false, error: 'Reviewer name is required to mark reviewed.' })
+    // Marking reviewed accepts BOTH the reviewed document text and the description,
+    // so it goes through the full review save (not a status-only flip).
+    if (!isReviewed) {
+      setTogglingStatus(true)
+      try {
+        await handleSave()
+      } finally {
+        setTogglingStatus(false)
+      }
      return
    }
+
+    // Reverting to unreviewed preserves the saved reviewed text and description.
    setTogglingStatus(true)
    setSaveResult(null)
    try {
      const res = await axios.put(`${API_BASE}/jobs/${jobId}/status`, {
-        status: next,
+        status: 'unreviewed',
        reviewer_name: reviewerName.trim() || null,
      })
      setJob(res.data)
@@ -259,8 +297,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
              {(() => {
                const tabs = [
                  job.ocr_text || job.reviewed_text ? { id: 'ocr', label: 'OCR Text' } : null,
-                  job.describe_text != null ? { id: 'describe', label: 'Description' } : null,
-                  job.freeform_text != null ? { id: 'freeform', label: 'Freeform' } : null,
+                  { id: 'describe', label: 'Description' },
                ].filter(Boolean)
                return tabs.length > 1 ? (
                  <div className="flex gap-1 mb-3 flex-shrink-0">
@@ -282,7 +319,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
              })()}

              <p className="text-xs text-gray-400 mb-2 flex-shrink-0">
-                {{ ocr: isReviewed ? 'Reviewed Text' : 'OCR Text', describe: 'Description', freeform: 'Freeform' }[activeTab]}
+                {{ ocr: isReviewed ? 'Reviewed Text' : 'OCR Text', describe: 'Description' }[activeTab]}
                <span className="text-purple-400 ml-1">(editable)</span>
              </p>

@@ -307,20 +344,43 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
                </>
              )}
              {activeTab === 'describe' && (
-                <textarea
-                  value={editDescribeText}
-                  onChange={e => setEditDescribeText(e.target.value)}
-                  className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
-                  placeholder="Description text..."
-                />
-              )}
-              {activeTab === 'freeform' && (
-                <textarea
-                  value={editFreeformText}
-                  onChange={e => setEditFreeformText(e.target.value)}
-                  className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
-                  placeholder="Freeform result..."
-                />
+                <>
+                  <div className="flex items-center gap-2 mb-2 flex-shrink-0">
+                    <select
+                      value={describeModel}
+                      onChange={e => setDescribeModel(e.target.value)}
+                      disabled={generatingDescribe || models.length === 0}
+                      className="bg-white/5 border border-white/10 rounded-lg px-2 py-1.5 text-xs text-gray-200 focus:outline-none focus:border-purple-500/50"
+                    >
+                      {models.length === 0 && <option value="">No models</option>}
+                      {models.map(m => (
+                        <option key={m.id} value={m.id}>{m.label}{m.default ? ' (default)' : ''}</option>
+                      ))}
+                    </select>
+                    <motion.button
+                      onClick={handleGenerateDescribe}
+                      disabled={generatingDescribe || !describeModel}
+                      className={`flex items-center gap-1.5 px-3 py-1.5 rounded-lg text-xs font-medium transition-all ${
+                        generatingDescribe || !describeModel
+                          ? 'opacity-50 cursor-not-allowed bg-white/5'
+                          : 'bg-gradient-to-r from-violet-600 to-purple-600 hover:from-violet-500 hover:to-purple-500'
+                      }`}
+                      whileHover={!generatingDescribe && describeModel ? { scale: 1.02 } : {}}
+                      whileTap={!generatingDescribe && describeModel ? { scale: 0.98 } : {}}
+                      title="Run Describe on this job's image and save it"
+                    >
+                      {generatingDescribe
+                        ? <><Loader2 className="w-3.5 h-3.5 animate-spin" /> Generating…</>
+                        : <><Sparkles className="w-3.5 h-3.5" /> Generate Description</>}
+                    </motion.button>
+                  </div>
+                  <textarea
+                    value={editDescribeText}
+                    onChange={e => setEditDescribeText(e.target.value)}
+                    className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
+                    placeholder="No description yet — pick a model and click Generate Description, or type one here."
+                  />
+                </>
              )}
            </div>
          </div>
@@ -385,6 +445,12 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
              </div>
            </div>

+            {!isReviewed && (
+              <p className="text-xs text-gray-500 mt-2">
+                Marking reviewed accepts both the reviewed document text and the description.
+              </p>
+            )}
+
            {saveResult && (
              <motion.div
                initial={{ opacity: 0, y: -4 }} animate={{ opacity: 1, y: 0 }}
@@ -405,6 +471,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
                <span className="text-xs text-gray-500">Last reviewed: {new Date(job.reviewed_at).toLocaleString()}</span>
              )}
              {job.mode && <span className="text-xs text-gray-500">Mode: {job.mode}</span>}
+              {job.ocr_model && <span className="text-xs text-gray-500">Model: {job.ocr_model}</span>}
            </div>
          </div>
        </>
@@ -573,7 +640,10 @@ export default function JobsPanel() {
                {job.page && <span className="text-xs text-gray-500">p. {job.page}</span>}
              </div>
              {job.author && <p className="text-xs text-gray-400 mt-1">{job.author}</p>}
-              <p className="text-xs text-gray-600 mt-2 font-mono">{new Date(job.submitted_at).toLocaleDateString()}</p>
+              <div className="flex items-center justify-between mt-2">
+                <p className="text-xs text-gray-600 font-mono">{new Date(job.submitted_at).toLocaleDateString()}</p>
+                {job.ocr_model && <span className="text-[10px] text-gray-500 truncate ml-2">{job.ocr_model}</span>}
+              </div>
            </motion.button>
          ))}
        </AnimatePresence>
--- a/frontend/src/components/ModeSelector.jsx
+++ b/frontend/src/components/ModeSelector.jsx
@@ -1,41 +1,30 @@
 import { motion } from 'framer-motion'
-import { FileText, Eye, Search, Wand2 } from 'lucide-react'
+import { FileText, Eye } from 'lucide-react'

 const modes = [
-  { id: 'plain_ocr', name: 'Plain OCR', icon: FileText, color: 'from-blue-500 to-cyan-500', desc: 'Extract raw text', needsInput: false },
-  { id: 'describe', name: 'Describe', icon: Eye, color: 'from-violet-500 to-purple-500', desc: 'Image description', needsInput: false },
-  { id: 'find_ref', name: 'Find', icon: Search, color: 'from-yellow-500 to-orange-500', desc: 'Locate specific terms', needsInput: 'findTerm' },
-  { id: 'freeform', name: 'Freeform', icon: Wand2, color: 'from-fuchsia-500 to-pink-500', desc: 'Custom prompt', needsInput: 'prompt' },
+  { id: 'plain_ocr', name: 'Plain OCR', icon: FileText, color: 'from-blue-500 to-cyan-500', desc: 'Extract raw text' },
+  { id: 'describe', name: 'Describe', icon: Eye, color: 'from-violet-500 to-purple-500', desc: 'Image description' },
 ]

-export default function ModeSelector({ 
-  mode, 
-  onModeChange, 
-  prompt, 
-  onPromptChange,
-  findTerm,
-  onFindTermChange
-}) {
-  const selectedMode = modes.find(m => m.id === mode)
-  const needsInput = selectedMode?.needsInput
-
+export default function ModeSelector({ mode, onModeChange }) {
  return (
    <div className="glass p-4 rounded-2xl space-y-3">
      <h3 className="text-sm font-semibold text-gray-200">Mode</h3>

-      <div className="grid grid-cols-4 gap-2">
+      <div className="grid grid-cols-2 gap-2">
        {modes.map((m) => {
          const Icon = m.icon
          const isSelected = mode === m.id
-          
+
          return (
            <motion.button
              key={m.id}
              onClick={() => onModeChange(m.id)}
+              title={m.desc}
              className={`
                relative p-2 rounded-xl text-center transition-all
-                ${isSelected 
-                  ? 'glass border-white/20 shadow-lg' 
+                ${isSelected
+                  ? 'glass border-white/20 shadow-lg'
                  : 'bg-white/5 border border-white/10 hover:border-white/20'
                }
              `}
@@ -49,12 +38,12 @@ export default function ModeSelector({
                  transition={{ type: "spring", bounce: 0.2, duration: 0.6 }}
                />
              )}
-              
+
              <div className="relative space-y-1">
                <div className={`
                  w-8 h-8 mx-auto rounded-lg flex items-center justify-center
-                  ${isSelected 
-                    ? `bg-gradient-to-br ${m.color}` 
+                  ${isSelected
+                    ? `bg-gradient-to-br ${m.color}`
                    : 'bg-white/10'
                  }
                `}>
@@ -68,38 +57,6 @@ export default function ModeSelector({
          )
        })}
      </div>
-
-      {needsInput === 'findTerm' && (
-        <motion.div
-          initial={{ opacity: 0, height: 0 }}
-          animate={{ opacity: 1, height: 'auto' }}
-          exit={{ opacity: 0, height: 0 }}
-        >
-          <input
-            type="text"
-            value={findTerm}
-            onChange={(e) => onFindTermChange(e.target.value)}
-            placeholder="Enter term to find (e.g., Total, Invoice #)"
-            className="w-full bg-white/5 border border-white/10 rounded-xl px-3 py-2 text-sm focus:outline-none focus:border-purple-500 transition-colors"
-          />
-        </motion.div>
-      )}
-
-      {needsInput === 'prompt' && (
-        <motion.div
-          initial={{ opacity: 0, height: 0 }}
-          animate={{ opacity: 1, height: 'auto' }}
-          exit={{ opacity: 0, height: 0 }}
-        >
-          <textarea
-            value={prompt}
-            onChange={(e) => onPromptChange(e.target.value)}
-            placeholder="Enter your custom prompt..."
-            className="w-full bg-white/5 border border-white/10 rounded-xl px-3 py-2 text-sm focus:outline-none focus:border-purple-500 transition-colors resize-none"
-            rows={2}
-          />
-        </motion.div>
-      )}
    </div>
  )
 }
--- a/frontend/src/components/PDFProcessor.jsx
+++ b/frontend/src/components/PDFProcessor.jsx
@@ -5,7 +5,7 @@ import axios from 'axios'

 const API_BASE = import.meta.env.VITE_API_URL || '/api'

-function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption }) {
+function PDFProcessor({ pdfFile, mode, prompt, model, advancedSettings, includeCaption }) {
  const [processing, setProcessing] = useState(false)
  const [progress, setProgress] = useState(0)
  const [result, setResult] = useState(null)
@@ -29,6 +29,7 @@ function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption
    try {
      const formData = new FormData()
      formData.append('pdf_file', pdfFile)
+      if (model) formData.append('model', model)
      formData.append('mode', mode)
      formData.append('prompt', prompt)
      formData.append('output_format', outputFormat)
@@ -80,7 +81,7 @@ function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption
    } finally {
      setProcessing(false)
    }
-  }, [pdfFile, mode, prompt, outputFormat, includeCaption, advancedSettings])
+  }, [pdfFile, mode, prompt, model, outputFormat, includeCaption, advancedSettings])

  const handleDownloadJSON = useCallback(() => {
    if (!result || outputFormat !== 'json') return