Add PDF processing and multi-format document conversion

Features added:
- PDF to image conversion with configurable DPI
- Multi-page PDF processing with OCR
- Export to Markdown, HTML, DOCX, and JSON formats
- Automatic image extraction from PDFs
- Formula and formatting preservation
- Real-time progress tracking for multi-page documents

Backend changes:
- New /api/process-pdf endpoint for PDF processing
- pdf_utils.py: PDF conversion and image extraction utilities
- format_converter.py: Document format conversion (MD, HTML, DOCX)
- Updated dependencies: PyMuPDF, img2pdf, python-docx, markdown

Frontend changes:
- File type toggle (Image OCR / PDF Processing)
- PDFProcessor component with format selection
- Updated ImageUpload to support both images and PDFs
- Progress bars for multi-page processing
- Download options for converted documents

Documentation:
- Updated README with PDF processing features
- Added API documentation for /api/process-pdf endpoint
- Added format conversion examples
This commit is contained in:
Claude
2025-11-15 14:25:09 +00:00
parent 5ba45f7db2
commit e578276d3e
8 changed files with 1220 additions and 65 deletions

View File

@@ -2,18 +2,29 @@ import os
import re
import tempfile
import shutil
import base64
from typing import List, Dict, Any, Optional
from contextlib import asynccontextmanager
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import JSONResponse, StreamingResponse
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import uvicorn
from decouple import config as env_config
# Import PDF and document conversion utilities
from pdf_utils import (
pdf_to_images_high_quality,
images_to_pdf,
extract_ref_patterns,
crop_images_from_refs,
clean_markdown_content
)
from format_converter import DocumentConverter
# -----------------------------
# Lifespan context for model loading
# -----------------------------
@@ -373,6 +384,199 @@ async def ocr_inference(
if out_dir:
shutil.rmtree(out_dir, ignore_errors=True)
def _pdf_page_text(res, out_dir, page_number):
    """Normalize the return value of ``model.infer`` into a non-empty string.

    Accepts a plain string, a dict carrying a ``"text"`` key, or a list/tuple
    of fragments. If none of those yields text, falls back to a ``result.mmd``
    file in *out_dir*, and finally to a per-page placeholder message.
    """
    if isinstance(res, str):
        text = res.strip()
    elif isinstance(res, dict) and "text" in res:
        text = str(res["text"]).strip()
    elif isinstance(res, (list, tuple)):
        text = "\n".join(map(str, res)).strip()
    else:
        text = ""

    if not text:
        mmd = os.path.join(out_dir, "result.mmd")
        if os.path.exists(mmd):
            with open(mmd, "r", encoding="utf-8") as fh:
                text = fh.read().strip()

    if not text:
        text = f"No text returned for page {page_number}."
    return text


def _pdf_encode_jpeg_b64(image):
    """Return *image* encoded as a base64 JPEG string (quality 95).

    Encoding happens in memory, so no temporary file or file descriptor can
    be left behind when saving fails.
    """
    import io  # local import: only needed for this in-memory encode

    with io.BytesIO() as buffer:
        image.save(buffer, format="JPEG", quality=95)
        return base64.b64encode(buffer.getvalue()).decode("utf-8")


def _pdf_ocr_page(img, page_number, prompt_text, *, extract_images,
                  base_size, image_size, crop_mode):
    """Run OCR on one rendered PDF page and return its result dict.

    The page image is written to a temporary PNG because ``model.infer`` is
    given a file path; the PNG and the scratch output directory are removed
    again whether or not inference succeeds.
    """
    tmp_img = None
    out_dir = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            # Record the path before saving so the file is still cleaned up
            # if the save itself raises.
            tmp_img = tmp.name
            img.save(tmp, format="PNG")

        orig_w, orig_h = img.size
        out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_")

        # NOTE(review): this is a synchronous call made from an async
        # endpoint, so the event loop is blocked while it runs.
        res = model.infer(
            tokenizer,
            prompt=prompt_text,
            image_file=tmp_img,
            output_path=out_dir,
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
            save_results=False,
            test_compress=False,
            eval_mode=True,
        )

        text = _pdf_page_text(res, out_dir, page_number)

        # Extract images if requested
        page_images = []
        if extract_images:
            matches, matches_image, matches_other = extract_ref_patterns(text)
            if matches_image:
                page_images = [
                    _pdf_encode_jpeg_b64(cropped_img)
                    for cropped_img in crop_images_from_refs(img, matches)
                ]

            # Clean the text and add image placeholders. The placeholders are
            # built in one pass so they appear in ascending order
            # ([IMAGE_0], [IMAGE_1], ...); prepending them one at a time
            # would leave them reversed.
            text = clean_markdown_content(text, matches_image, matches_other)
            placeholders = "".join(
                f"[IMAGE_{img_idx}]\n" for img_idx in range(len(page_images))
            )
            text = placeholders + text

        # NOTE(review): boxes and 'raw_text' are derived from the text *after*
        # clean_markdown_content has run; if that helper strips grounding
        # tags, boxes will be empty whenever extract_images is on. Confirm
        # this is intended.
        has_ref = "<|ref|>" in text
        boxes = (
            parse_detections(text, orig_w, orig_h)
            if ("<|det|>" in text or has_ref)
            else []
        )
        display_text = (
            clean_grounding_text(text)
            if (has_ref or "<|grounding|>" in text)
            else text
        )

        return {
            'page_number': page_number,
            'text': display_text,
            'raw_text': text,
            'boxes': boxes,
            'images': page_images,
            'image_dims': {'w': orig_w, 'h': orig_h}
        }
    finally:
        # Best-effort cleanup: a failure to delete scratch files must not
        # mask the real result or error.
        if tmp_img:
            try:
                os.remove(tmp_img)
            except OSError:
                pass
        if out_dir:
            shutil.rmtree(out_dir, ignore_errors=True)


@app.post("/api/process-pdf")
async def process_pdf(
    pdf_file: UploadFile = File(...),
    mode: str = Form("plain_ocr"),
    prompt: str = Form(""),
    output_format: str = Form("markdown"),  # markdown, html, docx, json
    grounding: bool = Form(False),
    include_caption: bool = Form(False),
    extract_images: bool = Form(True),
    dpi: int = Form(144),
    base_size: int = Form(1024),
    image_size: int = Form(640),
    crop_mode: bool = Form(True),
):
    """
    Process PDF document with OCR and convert to various formats

    - **pdf_file**: PDF file to process
    - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
    - **prompt**: Custom prompt for freeform mode
    - **output_format**: Output format (markdown, html, docx, json)
    - **grounding**: Enable grounding boxes
    - **include_caption**: Add image descriptions
    - **extract_images**: Extract images from PDF
    - **dpi**: PDF rendering resolution (default: 144)
    - **base_size**: Base processing size
    - **image_size**: Image size parameter
    - **crop_mode**: Enable crop mode

    Returns a JSON body for `json`, otherwise a file download in the
    requested format. Responds 503 while the model is not loaded, 400 for an
    invalid output format, a non-positive dpi or an empty upload, and 500 for
    any processing failure.
    """
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Validate output format
    if output_format not in ["markdown", "html", "docx", "json"]:
        raise HTTPException(status_code=400, detail="Invalid output format. Must be: markdown, html, docx, or json")

    # Reject a nonsensical resolution up front instead of failing later with
    # an opaque 500 from the renderer.
    if dpi <= 0:
        raise HTTPException(status_code=400, detail="dpi must be a positive integer")

    try:
        # Read PDF file
        pdf_bytes = await pdf_file.read()
        if not pdf_bytes:
            raise HTTPException(status_code=400, detail="Uploaded PDF file is empty")

        # Convert PDF to images
        print(f"📄 Converting PDF to images (DPI: {dpi})...")
        images = pdf_to_images_high_quality(pdf_bytes, dpi=dpi)
        total_pages = len(images)
        print(f"✅ Converted {total_pages} pages")

        converter = DocumentConverter()

        # The prompt depends only on request parameters, so build it once
        # rather than once per page.
        prompt_text = build_prompt(
            mode=mode,
            user_prompt=prompt,
            grounding=grounding,
            find_term=None,
            schema=None,
            include_caption=include_caption,
        )

        # Process each page
        pages_content = []
        for page_number, img in enumerate(images, start=1):
            print(f"🔍 Processing page {page_number}/{total_pages}...")
            pages_content.append(
                _pdf_ocr_page(
                    img,
                    page_number,
                    prompt_text,
                    extract_images=extract_images,
                    base_size=base_size,
                    image_size=image_size,
                    crop_mode=crop_mode,
                )
            )

        print(f"✅ Processed all {total_pages} pages")

        # Convert to requested format
        if output_format == "json":
            return JSONResponse({
                "success": True,
                "total_pages": total_pages,
                "pages": pages_content,
                "metadata": {
                    "mode": mode,
                    "grounding": grounding,
                    "extract_images": extract_images,
                    "dpi": dpi
                }
            })

        if output_format == "markdown":
            md_content = converter.to_markdown(pages_content, include_images=extract_images)
            return StreamingResponse(
                iter([md_content.encode('utf-8')]),
                media_type="text/markdown",
                headers={"Content-Disposition": "attachment; filename=ocr_result.md"}
            )

        if output_format == "html":
            html_content = converter.to_html(pages_content, include_images=extract_images)
            return StreamingResponse(
                iter([html_content.encode('utf-8')]),
                media_type="text/html",
                headers={"Content-Disposition": "attachment; filename=ocr_result.html"}
            )

        # Only "docx" remains: output_format was validated above.
        docx_buffer = converter.to_docx(pages_content, include_images=extract_images)
        return StreamingResponse(
            docx_buffer,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            headers={"Content-Disposition": "attachment; filename=ocr_result.docx"}
        )

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must not be rewrapped
        # as a 500 by the catch-all below.
        raise
    except Exception as e:
        import traceback
        print(f"❌ Error processing PDF: {e}")
        print(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"{type(e).__name__}: {str(e)}") from e
if __name__ == "__main__":
host = env_config("API_HOST", default="0.0.0.0")
port = env_config("API_PORT", default=8000, cast=int)