Add job tracking with PostgreSQL, image storage, and review workflow
- Add PostgreSQL service to docker-compose with health check and postgres_data volume
- Mount ./ocr_images as a bind mount for persistent image storage
- Add backend/database.py with schema init and get_db() context manager
- Add 5 new API endpoints: POST /api/jobs, GET /api/jobs (search), GET /api/jobs/{id},
GET /api/jobs/{id}/image, PUT /api/jobs/{id}/review
- Jobs are saved with author/book/chapter/page metadata, auto UUID, and submitted_at timestamp
- Jobs start as 'unreviewed'; review captures edited text, reviewer name, and reviewed_at
- Add MetadataForm.jsx (author/book/chapter/page inputs) to the New Job panel
- Add JobsPanel.jsx with search/filter, paginated list, and detail pane with review form
- Add "Commit Job" button to ResultPanel (plain_ocr mode only) with success/error feedback
- Add "New Job" / "Browse Jobs" navigation to the app header
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
71
backend/database.py
Normal file
71
backend/database.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import os
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from contextlib import contextmanager
|
||||
from decouple import config as env_config
|
||||
|
||||
# Connection string used for every database connection opened by this module.
# Read from the DATABASE_URL setting; the fallback uses host "postgres" with
# fixed credentials.
# NOTE(review): the fallback presumably matches the docker-compose service —
# confirm, and set DATABASE_URL explicitly in any non-local deployment.
DATABASE_URL = env_config(
    "DATABASE_URL",
    default="postgresql://ocr_user:ocr_password@postgres:5432/ocr_db",
)
|
||||
|
||||
|
||||
def _get_conn():
    """Open and return a new psycopg2 connection to ``DATABASE_URL``.

    The connection is created with ``RealDictCursor`` as its cursor factory,
    so rows fetched through its cursors can be read by column name.
    Committing and closing the connection is left to the caller.
    """
    dict_cursor = psycopg2.extras.RealDictCursor
    return psycopg2.connect(DATABASE_URL, cursor_factory=dict_cursor)
|
||||
|
||||
|
||||
def init_db():
    """Create the ``ocr_jobs`` table and its indexes when they are missing.

    Every statement uses ``IF NOT EXISTS`` and all of them run on a single
    connection that is committed once at the end.

    Raises:
        Exception: any connection or SQL error is printed and re-raised; the
            transaction is rolled back first if a connection had been opened.
    """
    ddl_statements = (
        """
            CREATE TABLE IF NOT EXISTS ocr_jobs (
                id                UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                author            TEXT,
                book              TEXT,
                chapter           TEXT,
                page              TEXT,
                submitted_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
                image_path        TEXT NOT NULL,
                original_filename TEXT,
                ocr_text          TEXT,
                status            TEXT NOT NULL DEFAULT 'unreviewed',
                reviewed_text     TEXT,
                reviewer_name     TEXT,
                reviewed_at       TIMESTAMPTZ,
                mode              TEXT
            )
        """,
        # Ordinary indexes on the status column and on submission time
        # (newest first); these are not full-text indexes.
        """
            CREATE INDEX IF NOT EXISTS ocr_jobs_status_idx ON ocr_jobs(status)
        """,
        """
            CREATE INDEX IF NOT EXISTS ocr_jobs_submitted_at_idx ON ocr_jobs(submitted_at DESC)
        """,
    )

    connection = None
    try:
        connection = _get_conn()
        with connection.cursor() as cursor:
            for statement in ddl_statements:
                cursor.execute(statement)
        connection.commit()
        print("Database initialized.")
    except Exception as exc:
        print(f"Database init failed: {exc}")
        # connection stays None when _get_conn() itself failed
        if connection:
            connection.rollback()
        raise
    finally:
        if connection:
            connection.close()
|
||||
|
||||
|
||||
@contextmanager
def get_db():
    """Provide a database connection for the duration of a ``with`` block.

    A fresh connection is opened on entry. When the block finishes without
    raising, the transaction is committed. If the block (or the commit
    itself) raises an ``Exception``, the transaction is rolled back and the
    exception propagates. The connection is closed in every case.

    Yields:
        The open connection returned by ``_get_conn()``.
    """
    connection = _get_conn()
    try:
        yield connection
        # Kept inside the try so a failing commit also triggers the rollback.
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        connection.close()
|
||||
251
backend/main.py
251
backend/main.py
@@ -1,14 +1,17 @@
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
import tempfile
|
||||
import shutil
|
||||
import base64
|
||||
from typing import List, Dict, Any, Optional
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
|
||||
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
|
||||
from pydantic import BaseModel
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
from PIL import Image
|
||||
@@ -24,6 +27,9 @@ from pdf_utils import (
|
||||
clean_markdown_content
|
||||
)
|
||||
from format_converter import DocumentConverter
|
||||
from database import init_db, get_db
|
||||
|
||||
# Directory that holds the image files saved for committed jobs; read from
# the OCR_IMAGES_DIR setting with a fixed fallback path.
OCR_IMAGES_DIR = env_config(
    "OCR_IMAGES_DIR",
    default="/data/ocr_images",
)
|
||||
|
||||
# -----------------------------
|
||||
# Lifespan context for model loading
|
||||
@@ -36,6 +42,15 @@ async def lifespan(app: FastAPI):
|
||||
"""Load model on startup, cleanup on shutdown"""
|
||||
global model, tokenizer
|
||||
|
||||
# Image storage directory
|
||||
os.makedirs(OCR_IMAGES_DIR, exist_ok=True)
|
||||
|
||||
# Database
|
||||
try:
|
||||
init_db()
|
||||
except Exception as exc:
|
||||
print(f"Warning: database initialization failed: {exc}")
|
||||
|
||||
# Environment setup
|
||||
os.environ.pop("TRANSFORMERS_CACHE", None)
|
||||
MODEL_NAME = env_config("MODEL_NAME", default="deepseek-ai/DeepSeek-OCR")
|
||||
@@ -581,6 +596,238 @@ async def process_pdf(
|
||||
print(traceback.format_exc())
|
||||
raise HTTPException(status_code=500, detail="An internal error occurred during PDF processing.")
|
||||
|
||||
# -----------------------------
|
||||
# Job management routes
|
||||
# -----------------------------
|
||||
|
||||
class ReviewRequest(BaseModel):
    """Request body for ``PUT /api/jobs/{job_id}/review``."""

    # Text to store in the job's reviewed_text column.
    reviewed_text: str
    # Name to store in the job's reviewer_name column.
    reviewer_name: str
|
||||
|
||||
|
||||
def _job_row_to_dict(row) -> Dict[str, Any]:
    """Convert a DB row to a plain dict whose values are JSON-serialisable.

    Args:
        row: a mapping of column name to value (e.g. a ``RealDictRow``).

    Returns:
        A new dict with the same keys. ``datetime`` values become ISO-8601
        strings and ``uuid.UUID`` values (including subclasses) become their
        canonical string form; every other value is passed through
        unchanged. The input row is not modified.
    """
    def _serialise(value: Any) -> Any:
        if isinstance(value, datetime):
            return value.isoformat()
        # isinstance instead of comparing the class name: it also accepts
        # UUID subclasses and ignores unrelated classes that are named "UUID".
        if isinstance(value, uuid.UUID):
            return str(value)
        return value

    return {key: _serialise(value) for key, value in dict(row).items()}
|
||||
|
||||
|
||||
def _discard_file(path: str) -> None:
    """Best-effort removal of an image file left behind by a failed commit."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Nothing was written, so there is nothing to clean up.
        pass
    except OSError as exc:
        print(f"Could not remove orphaned image {path}: {exc}")


@app.post("/api/jobs")
async def commit_job(
    image: UploadFile = File(...),
    author: str = Form(""),
    book: str = Form(""),
    chapter: str = Form(""),
    page: str = Form(""),
    ocr_text: str = Form(""),
    mode: str = Form("plain_ocr"),
):
    """Commit an OCR job: save the image and insert a DB record.

    The upload is written to ``OCR_IMAGES_DIR`` under a freshly generated
    UUID, then a row is inserted into ``ocr_jobs`` with status
    ``'unreviewed'``. Empty ``author``/``book``/``chapter``/``page``/
    ``ocr_text`` form values are stored as NULL.

    Args:
        image: the uploaded page image.
        author, book, chapter, page: optional free-text metadata.
        ocr_text: the OCR result to store with the job.
        mode: the OCR mode recorded for the job.

    Returns:
        ``JSONResponse`` with status 201 containing the inserted row.

    Raises:
        HTTPException: 500 if the image cannot be written or the insert
            fails. In both cases any file already written is removed.
    """
    job_id = str(uuid.uuid4())

    content_type_exts = {
        "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg",
        "image/webp": ".webp", "image/gif": ".gif", "image/bmp": ".bmp",
    }
    allowed_exts = set(content_type_exts.values()) | {".jpeg", ".tif", ".tiff"}

    # The filename is client-controlled. Only keep its extension when it is a
    # known image type: the stored file is later served by path, and an
    # arbitrary extension (e.g. ".html") would decide the served media type.
    original_filename = image.filename or "image"
    ext = os.path.splitext(original_filename)[1].lower()
    if ext not in allowed_exts:
        ct = (image.content_type or "").lower()
        ext = content_type_exts.get(ct, ".png")

    image_path = os.path.join(OCR_IMAGES_DIR, f"{job_id}{ext}")

    try:
        content = await image.read()
        with open(image_path, "wb") as f:
            f.write(content)
    except Exception as exc:
        # A failed write may leave a truncated file behind.
        _discard_file(image_path)
        print(f"Job commit image save error: {exc}")
        raise HTTPException(status_code=500, detail="Failed to save image file.") from exc

    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    INSERT INTO ocr_jobs
                        (id, author, book, chapter, page, image_path, original_filename,
                         ocr_text, mode, status)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
                    RETURNING *
                    """,
                    (job_id, author or None, book or None, chapter or None,
                     page or None, image_path, original_filename,
                     ocr_text or None, mode),
                )
                row = cur.fetchone()
    except Exception as exc:
        # Clean up saved image if DB insert fails
        _discard_file(image_path)
        print(f"Job commit DB error: {exc}")
        raise HTTPException(status_code=500, detail="Failed to save job to database.") from exc

    return JSONResponse(_job_row_to_dict(row), status_code=201)
|
||||
|
||||
|
||||
def _like_pattern(term: str) -> str:
    """Return a "contains *term*" ILIKE pattern with wildcards escaped.

    Backslash is the default LIKE escape character in PostgreSQL, so it is
    escaped first, followed by the ``%`` and ``_`` wildcards.
    """
    escaped = term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return f"%{escaped}%"


@app.get("/api/jobs")
async def list_jobs(
    search: Optional[str] = Query(None, description="General text search across all fields"),
    author: Optional[str] = Query(None),
    book: Optional[str] = Query(None),
    chapter: Optional[str] = Query(None),
    status: Optional[str] = Query(None, description="unreviewed | reviewed"),
    limit: int = Query(20, ge=1, le=200),
    offset: int = Query(0, ge=0),
):
    """Search and list jobs. All filters are optional and combinable.

    Args:
        search: case-insensitive substring matched against author, book,
            chapter, page, ocr_text and reviewer_name.
        author, book, chapter: case-insensitive substring filters on the
            column of the same name.
        status: exact match on the status column.
        limit, offset: pagination window.

    Text filters are matched literally: ``%``, ``_`` and ``\\`` in the input
    are escaped rather than treated as wildcards.

    Returns:
        ``JSONResponse`` with ``total`` (matching rows before pagination),
        ``limit``, ``offset`` and ``jobs`` (summary columns only, newest
        submission first).

    Raises:
        HTTPException: 500 on any database error.
    """
    conditions: List[str] = []
    params: List[Any] = []

    # NOTE(review): reviewed_text is not part of the general search even
    # though the parameter description says "all fields" — confirm intent.
    search_columns = ("author", "book", "chapter", "page", "ocr_text", "reviewer_name")
    if search:
        conditions.append(
            "(" + " OR ".join(f"{column} ILIKE %s" for column in search_columns) + ")"
        )
        params.extend([_like_pattern(search)] * len(search_columns))

    # Column names below are fixed literals; only the values are user input
    # and those are always passed as bound parameters.
    for column, value in (("author", author), ("book", book), ("chapter", chapter)):
        if value:
            conditions.append(f"{column} ILIKE %s")
            params.append(_like_pattern(value))

    if status:
        conditions.append("status = %s")
        params.append(status)

    where = ("WHERE " + " AND ".join(conditions)) if conditions else ""

    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    f"SELECT COUNT(*) AS total FROM ocr_jobs {where}",
                    params,
                )
                total = cur.fetchone()["total"]

                cur.execute(
                    f"""
                    SELECT id, author, book, chapter, page, submitted_at, status,
                           reviewer_name, reviewed_at, mode, original_filename
                    FROM ocr_jobs {where}
                    ORDER BY submitted_at DESC
                    LIMIT %s OFFSET %s
                    """,
                    params + [limit, offset],
                )
                rows = [_job_row_to_dict(r) for r in cur.fetchall()]
    except Exception as exc:
        print(f"list_jobs DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.") from exc

    return JSONResponse({"total": total, "limit": limit, "offset": offset, "jobs": rows})
|
||||
|
||||
|
||||
@app.get("/api/jobs/{job_id}")
async def get_job(job_id: str):
    """Return every column of one job as JSON.

    Raises:
        HTTPException: 400 when ``job_id`` is not a valid UUID string, 500
            on a database error, 404 when no row has that id.
    """
    # Reject malformed ids before touching the database.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn, conn.cursor() as cur:
            cur.execute("SELECT * FROM ocr_jobs WHERE id = %s", (job_id,))
            record = cur.fetchone()
    except Exception as exc:
        print(f"get_job DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if record:
        return JSONResponse(_job_row_to_dict(record))
    raise HTTPException(status_code=404, detail="Job not found.")
|
||||
|
||||
|
||||
@app.get("/api/jobs/{job_id}/image")
async def get_job_image(job_id: str):
    """Send back the image file recorded for a job.

    The file location is read from the job's ``image_path`` column.

    Raises:
        HTTPException: 400 when ``job_id`` is not a valid UUID string, 500
            on a database error, 404 when the job does not exist or its
            image file is no longer present on disk.
    """
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn, conn.cursor() as cur:
            cur.execute("SELECT image_path FROM ocr_jobs WHERE id = %s", (job_id,))
            record = cur.fetchone()
    except Exception as exc:
        print(f"get_job_image DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not record:
        raise HTTPException(status_code=404, detail="Job not found.")

    stored_path = record["image_path"]
    if os.path.isfile(stored_path):
        return FileResponse(stored_path)
    # The row exists but the file it points at is gone.
    raise HTTPException(status_code=404, detail="Image file not found on disk.")
|
||||
|
||||
|
||||
@app.put("/api/jobs/{job_id}/review")
async def review_job(job_id: str, body: ReviewRequest):
    """Record a review for a job and return the updated row.

    Sets ``status`` to ``'reviewed'``, stores the submitted text and
    reviewer name, and stamps ``reviewed_at`` with the database's ``NOW()``.

    Raises:
        HTTPException: 400 when ``job_id`` is not a valid UUID string, 500
            on a database error, 404 when no row has that id.
    """
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    update_sql = """
        UPDATE ocr_jobs
        SET status = 'reviewed',
            reviewed_text = %s,
            reviewer_name = %s,
            reviewed_at = NOW()
        WHERE id = %s
        RETURNING *
    """
    update_args = (body.reviewed_text, body.reviewer_name, job_id)

    try:
        with get_db() as conn, conn.cursor() as cur:
            cur.execute(update_sql, update_args)
            # RETURNING yields no row when the id matched nothing.
            updated = cur.fetchone()
    except Exception as exc:
        print(f"review_job DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if updated:
        return JSONResponse(_job_row_to_dict(updated))
    raise HTTPException(status_code=404, detail="Job not found.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
host = env_config("API_HOST", default="0.0.0.0")
|
||||
port = env_config("API_PORT", default=8000, cast=int)
|
||||
|
||||
@@ -15,3 +15,4 @@ PyMuPDF>=1.23.0
|
||||
img2pdf>=0.5.0
|
||||
python-docx>=1.1.0
|
||||
markdown>=3.5.0
|
||||
psycopg2-binary>=2.9.0
|
||||
|
||||
Reference in New Issue
Block a user