Add PDF processing and multi-format document conversion

Features added:
- PDF to image conversion with configurable DPI
- Multi-page PDF processing with OCR
- Export to Markdown, HTML, DOCX, and JSON formats
- Automatic image extraction from PDFs
- Formula and formatting preservation
- Real-time progress tracking for multi-page documents

Backend changes:
- New /api/process-pdf endpoint for PDF processing
- pdf_utils.py: PDF conversion and image extraction utilities
- format_converter.py: Document format conversion (MD, HTML, DOCX)
- Updated dependencies: PyMuPDF, img2pdf, python-docx, markdown

Frontend changes:
- File type toggle (Image OCR / PDF Processing)
- PDFProcessor component with format selection
- Updated ImageUpload to support both images and PDFs
- Progress bars for multi-page processing
- Download options for converted documents

Documentation:
- Updated README with PDF processing features
- Added API documentation for /api/process-pdf endpoint
- Added format conversion examples
2025-11-15 14:25:09 +00:00
parent 5ba45f7db2
commit e578276d3e
8 changed files with 1220 additions and 65 deletions
--- a/backend/main.py
+++ b/backend/main.py
@@ -2,18 +2,29 @@ import os
 import re
 import tempfile
 import shutil
+import base64
 from typing import List, Dict, Any, Optional
 from contextlib import asynccontextmanager

 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, StreamingResponse
 import torch
 from transformers import AutoModel, AutoTokenizer
 from PIL import Image
 import uvicorn
 from decouple import config as env_config

+# Import PDF and document conversion utilities
+from pdf_utils import (
+    pdf_to_images_high_quality,
+    images_to_pdf,
+    extract_ref_patterns,
+    crop_images_from_refs,
+    clean_markdown_content
+)
+from format_converter import DocumentConverter
+
 # -----------------------------
 # Lifespan context for model loading
 # -----------------------------
@@ -373,6 +384,199 @@ async def ocr_inference(
        if out_dir:
            shutil.rmtree(out_dir, ignore_errors=True)

+@app.post("/api/process-pdf")
+async def process_pdf(
+    pdf_file: UploadFile = File(...),
+    mode: str = Form("plain_ocr"),
+    prompt: str = Form(""),
+    output_format: str = Form("markdown"),  # markdown, html, docx, json
+    grounding: bool = Form(False),
+    include_caption: bool = Form(False),
+    extract_images: bool = Form(True),
+    dpi: int = Form(144),
+    base_size: int = Form(1024),
+    image_size: int = Form(640),
+    crop_mode: bool = Form(True),
+):
+    """
+    Process PDF document with OCR and convert to various formats
+
+    - **pdf_file**: PDF file to process
+    - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
+    - **prompt**: Custom prompt for freeform mode
+    - **output_format**: Output format (markdown, html, docx, json)
+    - **grounding**: Enable grounding boxes
+    - **include_caption**: Add image descriptions
+    - **extract_images**: Extract images from PDF
+    - **dpi**: PDF rendering resolution (default: 144)
+    - **base_size**: Base processing size
+    - **image_size**: Image size parameter
+    - **crop_mode**: Enable crop mode
+
+    Returns a JSON payload with per-page results when **output_format** is
+    `json`; otherwise a downloadable Markdown, HTML or DOCX document.
+
+    Errors: 503 when the model is not loaded, 400 for an invalid output
+    format, a non-positive DPI or an empty upload, 500 for any other failure.
+    """
+    # Local import: `io` is only needed for in-memory JPEG encoding below.
+    import io
+
+    if model is None or tokenizer is None:
+        raise HTTPException(status_code=503, detail="Model not loaded yet")
+
+    # Validate output format
+    if output_format not in ("markdown", "html", "docx", "json"):
+        raise HTTPException(status_code=400, detail="Invalid output format. Must be: markdown, html, docx, or json")
+
+    # Reject a nonsensical resolution up front instead of surfacing it as a 500
+    if dpi <= 0:
+        raise HTTPException(status_code=400, detail="dpi must be a positive integer")
+
+    try:
+        # Read PDF file
+        pdf_bytes = await pdf_file.read()
+        if not pdf_bytes:
+            raise HTTPException(status_code=400, detail="Uploaded PDF file is empty")
+
+        # Convert PDF to images
+        print(f"📄 Converting PDF to images (DPI: {dpi})...")
+        images = pdf_to_images_high_quality(pdf_bytes, dpi=dpi)
+        total_pages = len(images)
+        print(f"✅ Converted {total_pages} pages")
+
+        pages_content = []
+        converter = DocumentConverter()
+
+        # The prompt depends only on request parameters, so build it once
+        # rather than once per page.
+        prompt_text = build_prompt(
+            mode=mode,
+            user_prompt=prompt,
+            grounding=grounding,
+            find_term=None,
+            schema=None,
+            include_caption=include_caption,
+        )
+
+        # Process each page
+        for page_idx, img in enumerate(images):
+            print(f"🔍 Processing page {page_idx + 1}/{total_pages}...")
+
+            tmp_img = None
+            out_dir = None
+            try:
+                # Save the page image temporarily; the model reads it by path.
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
+                    # Record the path before saving so the `finally` block
+                    # still removes the file if `img.save` raises.
+                    tmp_img = tmp.name
+                    img.save(tmp, format="PNG")
+
+                orig_w, orig_h = img.size
+                out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_")
+
+                # Run inference
+                res = model.infer(
+                    tokenizer,
+                    prompt=prompt_text,
+                    image_file=tmp_img,
+                    output_path=out_dir,
+                    base_size=base_size,
+                    image_size=image_size,
+                    crop_mode=crop_mode,
+                    save_results=False,
+                    test_compress=False,
+                    eval_mode=True,
+                )
+
+                # Normalize response
+                if isinstance(res, str):
+                    text = res.strip()
+                elif isinstance(res, dict) and "text" in res:
+                    text = str(res["text"]).strip()
+                elif isinstance(res, (list, tuple)):
+                    text = "\n".join(map(str, res)).strip()
+                else:
+                    text = ""
+
+                # Fall back to the result file in the output directory, if any
+                if not text:
+                    mmd = os.path.join(out_dir, "result.mmd")
+                    if os.path.exists(mmd):
+                        with open(mmd, "r", encoding="utf-8") as fh:
+                            text = fh.read().strip()
+                if not text:
+                    text = f"No text returned for page {page_idx + 1}."
+
+                # Extract images if requested
+                page_images = []
+                if extract_images:
+                    matches, matches_image, matches_other = extract_ref_patterns(text)
+                    if matches_image:
+                        cropped = crop_images_from_refs(img, matches)
+                        for cropped_img in cropped:
+                            # Encode in memory: no temp file, so no file handle
+                            # is left open and nothing remains on disk.
+                            img_buffer = io.BytesIO()
+                            cropped_img.save(img_buffer, format="JPEG", quality=95)
+                            page_images.append(
+                                base64.b64encode(img_buffer.getvalue()).decode('utf-8')
+                            )
+
+                        # Clean the text and add image placeholders
+                        text = clean_markdown_content(text, matches_image, matches_other)
+                        # NOTE(review): each placeholder is prepended, so they
+                        # end up in descending index order at the top of the
+                        # page text. Kept as-is; confirm against the converter.
+                        for img_idx in range(len(page_images)):
+                            text = f"[IMAGE_{img_idx}]\n" + text
+
+                # Parse grounding boxes
+                # NOTE(review): this runs on the text after the cleaning step
+                # above; confirm the grounding tags survive that step.
+                boxes = parse_detections(text, orig_w, orig_h) if ("<|det|>" in text or "<|ref|>" in text) else []
+
+                # Clean grounding tags from display text
+                display_text = clean_grounding_text(text) if ("<|ref|>" in text or "<|grounding|>" in text) else text
+
+                pages_content.append({
+                    'page_number': page_idx + 1,
+                    'text': display_text,
+                    'raw_text': text,
+                    'boxes': boxes,
+                    'images': page_images,
+                    'image_dims': {'w': orig_w, 'h': orig_h}
+                })
+
+            finally:
+                # Best-effort cleanup: a failed delete must not mask the result
+                # (or the original error) of processing this page.
+                if tmp_img:
+                    try:
+                        os.remove(tmp_img)
+                    except OSError:
+                        pass
+                if out_dir:
+                    shutil.rmtree(out_dir, ignore_errors=True)
+
+        print(f"✅ Processed all {total_pages} pages")
+
+        # Convert to requested format
+        if output_format == "json":
+            return JSONResponse({
+                "success": True,
+                "total_pages": total_pages,
+                "pages": pages_content,
+                "metadata": {
+                    "mode": mode,
+                    "grounding": grounding,
+                    "extract_images": extract_images,
+                    "dpi": dpi
+                }
+            })
+        elif output_format == "markdown":
+            md_content = converter.to_markdown(pages_content, include_images=extract_images)
+            return StreamingResponse(
+                iter([md_content.encode('utf-8')]),
+                media_type="text/markdown",
+                headers={"Content-Disposition": "attachment; filename=ocr_result.md"}
+            )
+        elif output_format == "html":
+            html_content = converter.to_html(pages_content, include_images=extract_images)
+            return StreamingResponse(
+                iter([html_content.encode('utf-8')]),
+                media_type="text/html",
+                headers={"Content-Disposition": "attachment; filename=ocr_result.html"}
+            )
+        elif output_format == "docx":
+            docx_buffer = converter.to_docx(pages_content, include_images=extract_images)
+            return StreamingResponse(
+                docx_buffer,
+                media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                headers={"Content-Disposition": "attachment; filename=ocr_result.docx"}
+            )
+
+    except HTTPException:
+        # Deliberate HTTP errors (e.g. the 400 above) must keep their status
+        # code instead of being rewrapped as a 500 below.
+        raise
+    except Exception as e:
+        import traceback
+        print(f"❌ Error processing PDF: {e}")
+        print(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"{type(e).__name__}: {str(e)}")
+
 if __name__ == "__main__":
    host = env_config("API_HOST", default="0.0.0.0")
    port = env_config("API_PORT", default=8000, cast=int)