From e578276d3e15b0bb784efb3eac9c0573d7633629 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 15 Nov 2025 14:25:09 +0000 Subject: [PATCH] Add PDF processing and multi-format document conversion Features added: - PDF to image conversion with configurable DPI - Multi-page PDF processing with OCR - Export to Markdown, HTML, DOCX, and JSON formats - Automatic image extraction from PDFs - Formula and formatting preservation - Real-time progress tracking for multi-page documents Backend changes: - New /api/process-pdf endpoint for PDF processing - pdf_utils.py: PDF conversion and image extraction utilities - format_converter.py: Document format conversion (MD, HTML, DOCX) - Updated dependencies: PyMuPDF, img2pdf, python-docx, markdown Frontend changes: - File type toggle (Image OCR / PDF Processing) - PDFProcessor component with format selection - Updated ImageUpload to support both images and PDFs - Progress bars for multi-page processing - Download options for converted documents Documentation: - Updated README with PDF processing features - Added API documentation for /api/process-pdf endpoint - Added format conversion examples --- README.md | 102 ++++++- backend/format_converter.py | 326 +++++++++++++++++++++++ backend/main.py | 206 +++++++++++++- backend/pdf_utils.py | 214 +++++++++++++++ backend/requirements.txt | 4 + frontend/src/App.jsx | 146 +++++++--- frontend/src/components/ImageUpload.jsx | 54 +++- frontend/src/components/PDFProcessor.jsx | 233 ++++++++++++++++ 8 files changed, 1220 insertions(+), 65 deletions(-) create mode 100644 backend/format_converter.py create mode 100644 backend/pdf_utils.py create mode 100644 frontend/src/components/PDFProcessor.jsx diff --git a/README.md b/README.md index 28cf4f5..fb3bac5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,15 @@ Modern OCR web application powered by DeepSeek-OCR with a stunning React fronten ![DeepSeek OCR in Action](assets/multi-bird.png) -> **Recent Updates (v2.1.1)** +> **Recent Updates (v2.2.0)** 
+> - 🎉 **NEW: PDF Processing** - Upload PDFs and extract text from all pages +> - 🎉 **NEW: Multi-Format Export** - Convert to Markdown, HTML, DOCX, or JSON +> - 🎉 **NEW: Automatic Image Extraction** - Extract and preserve images from PDFs +> - 🎉 **NEW: Progress Tracking** - Real-time progress for multi-page documents +> - ✅ Dual mode: Image OCR + PDF Processing with format conversion +> - ✅ Enhanced document processing with formula and formatting preservation +> +> **Previous Updates (v2.1.1)** > - ✅ Fixed image removal button - now properly clears and allows re-upload > - ✅ Fixed multiple bounding boxes parsing - handles `[[x1,y1,x2,y2], [x1,y1,x2,y2]]` format > - ✅ Simplified to 4 core working modes for better stability @@ -39,22 +47,32 @@ Modern OCR web application powered by DeepSeek-OCR with a stunning React fronten ## Features -### 4 Core OCR Modes +### Dual Processing Modes +#### 📸 **Image OCR** (4 Core Modes) - **Plain OCR** - Raw text extraction from any image - **Describe** - Generate intelligent image descriptions - **Find** - Locate specific terms with visual bounding boxes - **Freeform** - Custom prompts for specialized tasks +#### 📄 **PDF Processing** (NEW!) 
+- **Multi-Page Processing** - Process entire PDF documents page by page +- **Format Conversion** - Export to Markdown, HTML, DOCX, or JSON +- **Image Extraction** - Automatically extract and preserve embedded images +- **Formula Preservation** - Maintain mathematical formulas and special formatting +- **Progress Tracking** - Real-time progress updates for large documents + ### UI Features - 🎨 Glass morphism design with animated gradients -- 🎯 Drag & drop file upload (up to 100MB by default) -- 🗑️ Easy image removal and re-upload +- 🎯 Drag & drop file upload (Images up to 10MB, PDFs up to 100MB) +- 🔄 Easy file removal and re-upload - 📦 Grounding box visualization with proper coordinate scaling - ✨ Smooth animations (Framer Motion) -- 📋 Copy/Download results +- 📋 Copy/Download results in multiple formats - 🎛️ Advanced settings dropdown - 📝 HTML and Markdown rendering for formatted output - 🔍 Multiple bounding box support (handles multiple instances of found terms) +- 📊 Progress bars for multi-page PDF processing +- 💾 Direct download for converted documents (MD, HTML, DOCX) ## Configuration @@ -106,19 +124,26 @@ CROP_MODE=true # Enable dynamic cropping for large images ``` deepseek-ocr/ -├── backend/ # FastAPI backend -│ ├── main.py +├── backend/ # FastAPI backend +│ ├── main.py # Main API with OCR and PDF endpoints +│ ├── pdf_utils.py # PDF processing utilities (NEW) +│ ├── format_converter.py # Document format conversion (NEW) │ ├── requirements.txt │ └── Dockerfile -├── frontend/ # React frontend +├── frontend/ # React frontend │ ├── src/ │ │ ├── components/ -│ │ ├── App.jsx +│ │ │ ├── ImageUpload.jsx # File upload (images & PDFs) +│ │ │ ├── PDFProcessor.jsx # PDF processing UI (NEW) +│ │ │ ├── ModeSelector.jsx +│ │ │ ├── ResultPanel.jsx +│ │ │ └── AdvancedSettings.jsx +│ │ ├── App.jsx # Main app with dual mode support │ │ └── main.jsx │ ├── package.json │ ├── nginx.conf │ └── Dockerfile -├── models/ # Model cache +├── models/ # Model cache └── docker-compose.yml 
``` @@ -288,6 +313,63 @@ For large images, the model uses dynamic cropping: - **Supports multiple boxes**: When finding multiple instances, format is `[[x1,y1,x2,y2], [x1,y1,x2,y2], ...]` - Frontend automatically displays all boxes overlaid on the image with unique colors +### POST /api/process-pdf (NEW!) + +Process PDF documents with OCR and export to various formats. + +**Parameters:** +- `pdf_file` (file, required) - PDF file to process (up to 100MB) +- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform` +- `prompt` (string) - Custom prompt for freeform mode +- `output_format` (string) - Output format: `markdown` | `html` | `docx` | `json` +- `grounding` (bool) - Enable bounding boxes (default: false) +- `include_caption` (bool) - Add image descriptions (default: false) +- `extract_images` (bool) - Extract embedded images from PDF (default: true) +- `dpi` (int) - PDF rendering resolution (default: 144) +- `base_size` (int) - Base processing size (default: 1024) +- `image_size` (int) - Tile size for cropping (default: 640) +- `crop_mode` (bool) - Enable dynamic cropping (default: true) + +**Response Formats:** + +**JSON Format** (`output_format=json`): +```json +{ + "success": true, + "total_pages": 5, + "pages": [ + { + "page_number": 1, + "text": "Extracted and cleaned text...", + "raw_text": "Raw model output with tags...", + "boxes": [{"label": "field", "box": [x1, y1, x2, y2]}], + "images": ["base64_encoded_image_data..."], + "image_dims": {"w": 1920, "h": 1080} + } + ], + "metadata": { + "mode": "plain_ocr", + "grounding": false, + "extract_images": true, + "dpi": 144 + } +} +``` + +**File Downloads** (`output_format=markdown|html|docx`): +- Returns the document as a downloadable file +- Markdown: `.md` file with preserved formatting +- HTML: `.html` file with embedded styling and images +- DOCX: `.docx` Word document with tables and formatting + +**Features:** +- 📄 Multi-page processing with progress tracking +- 🖼️ Automatic 
image extraction and embedding +- 📐 Formula and formatting preservation +- 🎨 Styled HTML output with tables and code blocks +- 📝 Clean Markdown with proper structure +- 📋 Professional DOCX with headings and tables + ## Examples Here are some example images showcasing different OCR capabilities: diff --git a/backend/format_converter.py b/backend/format_converter.py new file mode 100644 index 0000000..76ea492 --- /dev/null +++ b/backend/format_converter.py @@ -0,0 +1,326 @@ +""" +Document Format Conversion Utilities +Handles conversion to Markdown, HTML, DOCX while preserving formatting +""" + +import re +from typing import List, Dict, Any +from io import BytesIO +from docx import Document +from docx.shared import Pt, Inches, RGBColor +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT +import markdown +import base64 +from PIL import Image + + +class DocumentConverter: + """Handles conversion of OCR results to various document formats""" + + def __init__(self): + self.page_separator = '<--- Page Split --->' + + def to_markdown(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> str: + """ + Convert OCR results to Markdown format + + Args: + pages_content: List of page dictionaries with text and metadata + include_images: Whether to include image references + + Returns: + Markdown formatted string + """ + md_content = [] + + for idx, page in enumerate(pages_content): + # Add page header + md_content.append(f"# Page {idx + 1}\n") + + text = page.get('text', '') + + # Process and clean the text + if include_images and 'images' in page: + # Replace image placeholders with actual markdown image syntax + for img_idx, img_data in enumerate(page.get('images', [])): + placeholder = f"[IMAGE_{img_idx}]" + img_ref = f"![Image {img_idx + 1}](data:image/jpeg;base64,{img_data})" + text = text.replace(placeholder, img_ref) + + md_content.append(text) + md_content.append("\n\n---\n\n") # Page separator + + return "\n".join(md_content) + + def to_html(self, 
pages_content: List[Dict[str, Any]], include_images: bool = True) -> str: + """ + Convert OCR results to HTML format + + Args: + pages_content: List of page dictionaries with text and metadata + include_images: Whether to include images + + Returns: + HTML formatted string + """ + html_parts = [] + + # HTML header + html_parts.append(""" + + + + + + OCR Results + + + +

DeepSeek OCR Results

+""") + + # Process each page + for idx, page in enumerate(pages_content): + html_parts.append(f'
') + html_parts.append(f' ') + + text = page.get('text', '') + + # Handle images if present + if include_images and 'images' in page: + for img_idx, img_data in enumerate(page.get('images', [])): + placeholder = f"[IMAGE_{img_idx}]" + img_tag = f'Image {img_idx + 1}' + text = text.replace(placeholder, img_tag) + + # Convert markdown to HTML if the text appears to be markdown + if self._is_markdown(text): + html_content = markdown.markdown(text, extensions=['tables', 'fenced_code']) + else: + # Otherwise, preserve the HTML or wrap in paragraph + html_content = text if '<' in text else f'

{text.replace(chr(10), "
")}

' + + html_parts.append(f' {html_content}') + html_parts.append('
') + + # HTML footer + html_parts.append(""" + + +""") + + return "\n".join(html_parts) + + def to_docx(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> BytesIO: + """ + Convert OCR results to DOCX format + + Args: + pages_content: List of page dictionaries with text and metadata + include_images: Whether to include images + + Returns: + BytesIO object containing the DOCX file + """ + doc = Document() + + # Set default font + style = doc.styles['Normal'] + font = style.font + font.name = 'Calibri' + font.size = Pt(11) + + # Add title + title = doc.add_heading('DeepSeek OCR Results', 0) + title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + + # Process each page + for idx, page in enumerate(pages_content): + # Add page heading + page_heading = doc.add_heading(f'Page {idx + 1}', level=1) + page_heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT + + text = page.get('text', '') + + # Handle images + if include_images and 'images' in page: + for img_idx, img_data in enumerate(page.get('images', [])): + placeholder = f"[IMAGE_{img_idx}]" + + # Add image to document + try: + img_bytes = base64.b64decode(img_data) + img_stream = BytesIO(img_bytes) + doc.add_picture(img_stream, width=Inches(5)) + text = text.replace(placeholder, '') + except Exception as e: + print(f"Error adding image to DOCX: {e}") + + # Process text content + self._add_formatted_text_to_doc(doc, text) + + # Add page break (except for last page) + if idx < len(pages_content) - 1: + doc.add_page_break() + + # Save to BytesIO + docx_buffer = BytesIO() + doc.save(docx_buffer) + docx_buffer.seek(0) + + return docx_buffer + + def _is_markdown(self, text: str) -> bool: + """Check if text appears to be markdown formatted""" + markdown_patterns = [ + r'^#+\s', # Headers + r'\*\*.*\*\*', # Bold + r'\*.*\*', # Italic + r'^\*\s', # Lists + r'^\d+\.\s', # Numbered lists + r'\[.*\]\(.*\)', # Links + r'```', # Code blocks + ] + + for pattern in markdown_patterns: + if re.search(pattern, text, 
re.MULTILINE): + return True + return False + + def _add_formatted_text_to_doc(self, doc: Document, text: str): + """ + Add formatted text to document, preserving structure + + Blocks are separated by blank lines; each block becomes a heading, + a table, a monospaced code paragraph, or a regular paragraph. + + Args: + doc: Document object + text: Text to add + """ + # Split into paragraphs + paragraphs = text.split('\n\n') + + for para in paragraphs: + if not para.strip(): + continue + + # Check for headers; slice off only the leading marker, because + # str.replace() would also delete every later '# ' inside the heading text + if para.startswith('# '): + doc.add_heading(para[2:], level=1) + elif para.startswith('## '): + doc.add_heading(para[3:], level=2) + elif para.startswith('### '): + doc.add_heading(para[4:], level=3) + # Check for tables (simple detection) + elif '|' in para and para.count('|') > 2: + self._add_table_to_doc(doc, para) + # Check for code blocks + elif para.startswith('```'): + code_text = para.strip('```').strip() + p = doc.add_paragraph() + run = p.add_run(code_text) + run.font.name = 'Courier New' + run.font.size = Pt(10) + else: + # Regular paragraph + doc.add_paragraph(para.strip()) + + def _add_table_to_doc(self, doc: Document, table_text: str): + """ + Add a table to the document from markdown-style table text + + Args: + doc: Document object + table_text: Table in markdown format + """ + rows = [row.strip() for row in table_text.split('\n') if row.strip()] + + # Filter out separator rows + data_rows = [row for row in rows if not re.match(r'^[\|\s\-:]+$', row)] + + if not data_rows: + return + + # Parse table data + table_data = [] + for row in data_rows: + # Drop only the outer border pipes so that empty interior cells + # keep their column position instead of shifting later cells left + cells = [cell.strip() for cell in re.sub(r'^\||\|$', '', row).split('|')] + if any(cells): + table_data.append(cells) + + if not table_data: + return + + # Create table + max_cols = max(len(row) for row in table_data) + table = doc.add_table(rows=len(table_data), cols=max_cols) + table.style = 'Light Grid Accent 1' + + # Populate table + for i, row_data in enumerate(table_data): + row = table.rows[i] + for j, cell_text in enumerate(row_data): + if j < 
len(row.cells): + row.cells[j].text = cell_text + + # Make header row bold + if i == 0: + for paragraph in row.cells[j].paragraphs: + for run in paragraph.runs: + run.font.bold = True diff --git a/backend/main.py b/backend/main.py index 945e693..de1cf1f 100644 --- a/backend/main.py +++ b/backend/main.py @@ -2,18 +2,29 @@ import os import re import tempfile import shutil +import base64 from typing import List, Dict, Any, Optional from contextlib import asynccontextmanager from fastapi import FastAPI, File, UploadFile, Form, HTTPException from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, StreamingResponse import torch from transformers import AutoModel, AutoTokenizer from PIL import Image import uvicorn from decouple import config as env_config +# Import PDF and document conversion utilities +from pdf_utils import ( + pdf_to_images_high_quality, + images_to_pdf, + extract_ref_patterns, + crop_images_from_refs, + clean_markdown_content +) +from format_converter import DocumentConverter + # ----------------------------- # Lifespan context for model loading # ----------------------------- @@ -373,6 +384,199 @@ async def ocr_inference( if out_dir: shutil.rmtree(out_dir, ignore_errors=True) +@app.post("/api/process-pdf") +async def process_pdf( + pdf_file: UploadFile = File(...), + mode: str = Form("plain_ocr"), + prompt: str = Form(""), + output_format: str = Form("markdown"), # markdown, html, docx, json + grounding: bool = Form(False), + include_caption: bool = Form(False), + extract_images: bool = Form(True), + dpi: int = Form(144), + base_size: int = Form(1024), + image_size: int = Form(640), + crop_mode: bool = Form(True), +): + """ + Process PDF document with OCR and convert to various formats + + - **pdf_file**: PDF file to process + - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.) 
+ - **prompt**: Custom prompt for freeform mode + - **output_format**: Output format (markdown, html, docx, json) + - **grounding**: Enable grounding boxes + - **include_caption**: Add image descriptions + - **extract_images**: Extract images from PDF + - **dpi**: PDF rendering resolution (default: 144) + - **base_size**: Base processing size + - **image_size**: Image size parameter + - **crop_mode**: Enable crop mode + """ + if model is None or tokenizer is None: + raise HTTPException(status_code=503, detail="Model not loaded yet") + + # Validate output format + if output_format not in ["markdown", "html", "docx", "json"]: + raise HTTPException(status_code=400, detail="Invalid output format. Must be: markdown, html, docx, or json") + + try: + # Read PDF file + pdf_bytes = await pdf_file.read() + + # Convert PDF to images + print(f"📄 Converting PDF to images (DPI: {dpi})...") + images = pdf_to_images_high_quality(pdf_bytes, dpi=dpi) + total_pages = len(images) + print(f"✅ Converted {total_pages} pages") + + # Process each page + pages_content = [] + converter = DocumentConverter() + + for page_idx, img in enumerate(images): + print(f"🔍 Processing page {page_idx + 1}/{total_pages}...") + + # Build prompt for this page + prompt_text = build_prompt( + mode=mode, + user_prompt=prompt, + grounding=grounding, + find_term=None, + schema=None, + include_caption=include_caption, + ) + + # Save image temporarily + tmp_img = None + out_dir = None + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp: + img.save(tmp, format="PNG") + tmp_img = tmp.name + + orig_w, orig_h = img.size + out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_") + + # Run inference + res = model.infer( + tokenizer, + prompt=prompt_text, + image_file=tmp_img, + output_path=out_dir, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + save_results=False, + test_compress=False, + eval_mode=True, + ) + + # Normalize response + if isinstance(res, str): + text = 
res.strip() + elif isinstance(res, dict) and "text" in res: + text = str(res["text"]).strip() + elif isinstance(res, (list, tuple)): + text = "\n".join(map(str, res)).strip() + else: + text = "" + + if not text: + mmd = os.path.join(out_dir, "result.mmd") + if os.path.exists(mmd): + with open(mmd, "r", encoding="utf-8") as fh: + text = fh.read().strip() + if not text: + text = f"No text returned for page {page_idx + 1}." + + # Parse grounding boxes from the untouched model output: the image + # extraction step below strips every <|ref|>/<|det|> span, which would + # otherwise always leave boxes empty when extract_images is enabled + raw_text = text + boxes = parse_detections(raw_text, orig_w, orig_h) if ("<|det|>" in raw_text or "<|ref|>" in raw_text) else [] + + # Extract images if requested + page_images = [] + if extract_images: + matches, matches_image, matches_other = extract_ref_patterns(text) + if matches_image: + cropped = crop_images_from_refs(img, matches) + for cropped_img in cropped: + # Convert to base64 + img_buffer = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") + # Only the path is needed; close the handle so it is not leaked + img_buffer.close() + cropped_img.save(img_buffer.name, format="JPEG", quality=95) + with open(img_buffer.name, "rb") as f: + img_b64 = base64.b64encode(f.read()).decode('utf-8') + page_images.append(img_b64) + os.remove(img_buffer.name) + + # Clean the text and add image placeholders + text = clean_markdown_content(text, matches_image, matches_other) + # Prepend in reverse so the placeholders end up in ascending order + for img_idx in reversed(range(len(page_images))): + text = f"[IMAGE_{img_idx}]\n" + text + + # Clean grounding tags from display text + display_text = clean_grounding_text(text) if ("<|ref|>" in text or "<|grounding|>" in text) else text + + pages_content.append({ + 'page_number': page_idx + 1, + 'text': display_text, + 'raw_text': raw_text, + 'boxes': boxes, + 'images': page_images, + 'image_dims': {'w': orig_w, 'h': orig_h} + }) + + finally: + if tmp_img: + try: + os.remove(tmp_img) + except Exception: + pass + if out_dir: + shutil.rmtree(out_dir, ignore_errors=True) + + print(f"✅ Processed all {total_pages} pages") + + # Convert to requested format + if output_format == "json": + return JSONResponse({ + "success": True, + "total_pages": total_pages, + "pages": pages_content, 
+ "metadata": { + "mode": mode, + "grounding": grounding, + "extract_images": extract_images, + "dpi": dpi + } + }) + elif output_format == "markdown": + md_content = converter.to_markdown(pages_content, include_images=extract_images) + return StreamingResponse( + iter([md_content.encode('utf-8')]), + media_type="text/markdown", + headers={"Content-Disposition": f"attachment; filename=ocr_result.md"} + ) + elif output_format == "html": + html_content = converter.to_html(pages_content, include_images=extract_images) + return StreamingResponse( + iter([html_content.encode('utf-8')]), + media_type="text/html", + headers={"Content-Disposition": f"attachment; filename=ocr_result.html"} + ) + elif output_format == "docx": + docx_buffer = converter.to_docx(pages_content, include_images=extract_images) + return StreamingResponse( + docx_buffer, + media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + headers={"Content-Disposition": f"attachment; filename=ocr_result.docx"} + ) + + except Exception as e: + import traceback + print(f"❌ Error processing PDF: {e}") + print(traceback.format_exc()) + raise HTTPException(status_code=500, detail=f"{type(e).__name__}: {str(e)}") + if __name__ == "__main__": host = env_config("API_HOST", default="0.0.0.0") port = env_config("API_PORT", default=8000, cast=int) diff --git a/backend/pdf_utils.py b/backend/pdf_utils.py new file mode 100644 index 0000000..0b9cace --- /dev/null +++ b/backend/pdf_utils.py @@ -0,0 +1,214 @@ +""" +PDF Processing Utilities for DeepSeek OCR +Handles PDF to image conversion and batch processing +""" + +import io +import re +from typing import List, Tuple, Dict, Any +import fitz # PyMuPDF +import img2pdf +from PIL import Image +import numpy as np + + +def pdf_to_images_high_quality(pdf_bytes: bytes, dpi: int = 144) -> List[Image.Image]: + """ + Convert PDF pages to high-quality PIL images + + Args: + pdf_bytes: PDF file as bytes + dpi: Resolution for rendering (default: 144) + + 
Returns: + List of PIL Image objects, one per page + """ + images = [] + + # Open PDF from bytes + pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf") + + # Calculate zoom factor from DPI + zoom = dpi / 72.0 + matrix = fitz.Matrix(zoom, zoom) + + # Process each page + for page_num in range(pdf_document.page_count): + page = pdf_document[page_num] + + # Render page to pixmap + pixmap = page.get_pixmap(matrix=matrix, alpha=False) + + # Allow large images + Image.MAX_IMAGE_PIXELS = None + + # Convert to PIL Image + img_data = pixmap.tobytes("png") + img = Image.open(io.BytesIO(img_data)) + + # Ensure RGB mode + if img.mode in ('RGBA', 'LA'): + background = Image.new('RGB', img.size, (255, 255, 255)) + background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None) + img = background + elif img.mode != 'RGB': + img = img.convert('RGB') + + images.append(img) + + pdf_document.close() + return images + + +def images_to_pdf(pil_images: List[Image.Image]) -> bytes: + """ + Convert list of PIL images to PDF bytes + + Args: + pil_images: List of PIL Image objects + + Returns: + PDF file as bytes + """ + if not pil_images: + return b'' + + image_bytes_list = [] + + for img in pil_images: + # Ensure RGB mode + if img.mode != 'RGB': + img = img.convert('RGB') + + # Convert to JPEG bytes + img_buffer = io.BytesIO() + img.save(img_buffer, format='JPEG', quality=95) + img_bytes = img_buffer.getvalue() + image_bytes_list.append(img_bytes) + + # Convert to PDF + pdf_bytes = img2pdf.convert(image_bytes_list) + return pdf_bytes + + +def extract_ref_patterns(text: str) -> Tuple[List[Tuple], List[str], List[str]]: + """ + Extract reference patterns from OCR output + + Args: + text: OCR output text with reference tags + + Returns: + Tuple of (all_matches, image_matches, other_matches) + """ + pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)' + matches = re.findall(pattern, text, re.DOTALL) + + matches_image = [] + matches_other = [] + + for match in 
matches: + if '<|ref|>image<|/ref|>' in match[0]: + matches_image.append(match[0]) + else: + matches_other.append(match[0]) + + return matches, matches_image, matches_other + + +def parse_coordinates(ref_text: Tuple, image_width: int, image_height: int) -> Dict[str, Any]: + """ + Parse coordinates from reference text + + Args: + ref_text: Tuple of (full_match, label, coordinates) + image_width: Image width in pixels + image_height: Image height in pixels + + Returns: + Dictionary with label and scaled coordinates, or None when the + coordinate text cannot be parsed + """ + # Local import keeps this fix self-contained within the function + import ast + + try: + label_type = ref_text[1] + # The coordinate text comes from model output over an uploaded PDF, + # so it is untrusted: literal_eval accepts only Python literals and + # cannot execute code the way eval() can + cor_list = ast.literal_eval(ref_text[2]) + + # Scale coordinates from 0-999 to actual pixels + scaled_boxes = [] + for points in cor_list: + x1, y1, x2, y2 = points + scaled_box = [ + int(x1 / 999 * image_width), + int(y1 / 999 * image_height), + int(x2 / 999 * image_width), + int(y2 / 999 * image_height) + ] + scaled_boxes.append(scaled_box) + + return { + 'label': label_type, + 'boxes': scaled_boxes + } + except Exception as e: + print(f"Error parsing coordinates: {e}") + return None + + +def crop_images_from_refs(image: Image.Image, refs: List[Tuple]) -> List[Image.Image]: + """ + Crop images based on reference bounding boxes + + Args: + image: Source PIL Image + refs: List of reference tuples + + Returns: + List of cropped PIL Images + """ + cropped_images = [] + image_width, image_height = image.size + + for ref in refs: + coord_data = parse_coordinates(ref, image_width, image_height) + if coord_data and coord_data['label'] == 'image': + for box in coord_data['boxes']: + x1, y1, x2, y2 = box + try: + cropped = image.crop((x1, y1, x2, y2)) + cropped_images.append(cropped) + except Exception as e: + print(f"Error cropping image: {e}") + continue + + return cropped_images + + +def clean_markdown_content(content: str, image_refs: List[str], other_refs: List[str]) -> str: + """ + Clean markdown content by removing reference tags + + Args: + content: Raw OCR output with tags + image_refs: List of image reference tags + 
other_refs: List of other reference tags + + Returns: + Cleaned markdown content + """ + cleaned = content + + # Remove image reference tags (will be replaced with markdown images) + for ref in image_refs: + cleaned = cleaned.replace(ref, '') + + # Remove other reference tags and clean up formatting + for ref in other_refs: + cleaned = cleaned.replace(ref, '') + + # Clean up LaTeX and formatting + cleaned = (cleaned + .replace('\\coloneqq', ':=') + .replace('\\eqqcolon', '=:') + .replace('\n\n\n\n', '\n\n') + .replace('\n\n\n', '\n\n')) + + return cleaned diff --git a/backend/requirements.txt b/backend/requirements.txt index 9792f9b..49259d1 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -11,3 +11,7 @@ pillow safetensors torch python-decouple>=3.8 +PyMuPDF>=1.23.0 +img2pdf>=0.5.0 +python-docx>=1.1.0 +markdown>=3.5.0 diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index c0cd6a9..891f5dd 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -1,16 +1,18 @@ import { useState, useCallback } from 'react' import { motion, AnimatePresence } from 'framer-motion' -import { Sparkles, Zap, Loader2, Settings } from 'lucide-react' +import { Sparkles, Zap, Loader2, Settings, Image as ImageIcon, FileText } from 'lucide-react' import ImageUpload from './components/ImageUpload' import ModeSelector from './components/ModeSelector' import ResultPanel from './components/ResultPanel' import AdvancedSettings from './components/AdvancedSettings' +import PDFProcessor from './components/PDFProcessor' import axios from 'axios' const API_BASE = import.meta.env.VITE_API_URL || '/api' function App() { const [mode, setMode] = useState('plain_ocr') + const [fileType, setFileType] = useState('image') // 'image' or 'pdf' const [image, setImage] = useState(null) const [imagePreview, setImagePreview] = useState(null) const [result, setResult] = useState(null) @@ -29,11 +31,23 @@ function App() { test_compress: false }) + const handleFileTypeChange = 
useCallback((newType) => { + // Clear current file when switching types + setImage(null) + if (imagePreview) { + URL.revokeObjectURL(imagePreview) + } + setImagePreview(null) + setError(null) + setResult(null) + setFileType(newType) + }, [imagePreview]) + const handleImageSelect = useCallback((file) => { if (file === null) { // Clear everything when removing image setImage(null) - if (imagePreview) { + if (imagePreview && fileType === 'image') { URL.revokeObjectURL(imagePreview) } setImagePreview(null) @@ -41,11 +55,16 @@ function App() { setResult(null) } else { setImage(file) - setImagePreview(URL.createObjectURL(file)) + // Only create preview URL for images, not PDFs + if (fileType === 'image') { + setImagePreview(URL.createObjectURL(file)) + } else { + setImagePreview(file) // Just store the file for PDFs + } setError(null) setResult(null) } - }, [imagePreview]) + }, [imagePreview, fileType]) const handleSubmit = async () => { if (!image) { @@ -177,9 +196,41 @@ function App() { transition={{ delay: 0.1 }} className="space-y-6" > + {/* File Type Toggle */} +
+
+ handleFileTypeChange('image')} + className={`p-3 rounded-xl text-sm font-medium transition-all flex items-center justify-center gap-2 ${ + fileType === 'image' + ? 'bg-gradient-to-r from-purple-600 to-cyan-600 text-white' + : 'glass text-gray-400 hover:bg-white/5' + }`} + whileHover={{ scale: 1.02 }} + whileTap={{ scale: 0.98 }} + > + + Image OCR + + handleFileTypeChange('pdf')} + className={`p-3 rounded-xl text-sm font-medium transition-all flex items-center justify-center gap-2 ${ + fileType === 'pdf' + ? 'bg-gradient-to-r from-purple-600 to-cyan-600 text-white' + : 'glass text-gray-400 hover:bg-white/5' + }`} + whileHover={{ scale: 1.02 }} + whileTap={{ scale: 0.98 }} + > + + PDF Processing + +
+
+ {/* Mode Selector with integrated inputs */} - - {/* Image Upload */} - {/* Advanced Settings Toggle */} @@ -226,40 +278,52 @@ function App() { )} - {/* Action Button */} - -
-
- {loading ? ( - <> - - Processing Magic... - - ) : ( - <> - - Analyze Image - - )} -
- + {/* Action Button / PDF Processor */} + {fileType === 'pdf' ? ( + + ) : ( + <> + +
+
+ {loading ? ( + <> + + Processing Magic... + + ) : ( + <> + + Analyze Image + + )} +
+ - {error && ( - -

{error}

-
+ {error && ( + +

{error}

+
+ )} + )} diff --git a/frontend/src/components/ImageUpload.jsx b/frontend/src/components/ImageUpload.jsx index 8af732e..fdf6918 100644 --- a/frontend/src/components/ImageUpload.jsx +++ b/frontend/src/components/ImageUpload.jsx @@ -1,18 +1,22 @@ import { useCallback } from 'react' import { motion } from 'framer-motion' import { useDropzone } from 'react-dropzone' -import { Upload, Image as ImageIcon, X } from 'lucide-react' +import { Upload, Image as ImageIcon, X, FileText } from 'lucide-react' -export default function ImageUpload({ onImageSelect, preview }) { +export default function ImageUpload({ onImageSelect, preview, fileType = 'image' }) { const onDrop = useCallback((acceptedFiles) => { if (acceptedFiles?.[0]) { onImageSelect(acceptedFiles[0]) } }, [onImageSelect]) + const isPDF = fileType === 'pdf' + const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, - accept: { + accept: isPDF ? { + 'application/pdf': ['.pdf'] + } : { 'image/*': ['.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp'] }, multiple: false @@ -21,8 +25,14 @@ export default function ImageUpload({ onImageSelect, preview }) { return (
-

Upload Image

- +

+ {isPDF ? 'Upload PDF' : 'Upload Image'} +

+ {isPDF ? ( + + ) : ( + + )}
{!preview ? ( @@ -59,10 +69,18 @@ export default function ImageUpload({ onImageSelect, preview }) {

- {isDragActive ? 'Drop it like it\'s hot! 🔥' : 'Drag & drop your image'} + {isDragActive + ? 'Drop it like it\'s hot! 🔥' + : isPDF + ? 'Drag & drop your PDF' + : 'Drag & drop your image' + }

- or click to browse • PNG, JPG, WEBP up to 10MB + {isPDF + ? 'or click to browse • PDF files up to 100MB' + : 'or click to browse • PNG, JPG, WEBP up to 10MB' + }

@@ -73,11 +91,21 @@ export default function ImageUpload({ onImageSelect, preview }) { animate={{ opacity: 1, scale: 1 }} className="relative group rounded-2xl overflow-hidden" > - Preview + {isPDF ? ( +
+
+ +

PDF Ready

+

{preview?.name || 'Document loaded'}

+
+
+ ) : ( + Preview + )}
{ @@ -87,7 +115,7 @@ export default function ImageUpload({ onImageSelect, preview }) { className="bg-red-500/90 backdrop-blur-sm px-3 py-2 rounded-full opacity-100 hover:bg-red-600 transition-colors flex items-center gap-2 shadow-lg" whileHover={{ scale: 1.05 }} whileTap={{ scale: 0.95 }} - title="Remove image" + title={isPDF ? "Remove PDF" : "Remove image"} > Remove diff --git a/frontend/src/components/PDFProcessor.jsx b/frontend/src/components/PDFProcessor.jsx new file mode 100644 index 0000000..bb6458a --- /dev/null +++ b/frontend/src/components/PDFProcessor.jsx @@ -0,0 +1,233 @@ +import { useState, useCallback } from 'react' +import { motion, AnimatePresence } from 'framer-motion' +import { FileText, Download, Loader2, CheckCircle2, AlertCircle } from 'lucide-react' +import axios from 'axios' + +const API_BASE = import.meta.env.VITE_API_URL || '/api' + +function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption }) { + const [processing, setProcessing] = useState(false) + const [progress, setProgress] = useState(0) + const [result, setResult] = useState(null) + const [error, setError] = useState(null) + const [outputFormat, setOutputFormat] = useState('markdown') + + const formats = [ + { value: 'markdown', label: 'Markdown', ext: 'md', icon: '📝' }, + { value: 'html', label: 'HTML', ext: 'html', icon: '🌐' }, + { value: 'docx', label: 'Word', ext: 'docx', icon: '📄' }, + { value: 'json', label: 'JSON', ext: 'json', icon: '📊' } + ] + + const handleProcess = useCallback(async () => { + if (!pdfFile) return + + setProcessing(true) + setError(null) + setProgress(0) + + try { + const formData = new FormData() + formData.append('pdf_file', pdfFile) + formData.append('mode', mode) + formData.append('prompt', prompt) + formData.append('output_format', outputFormat) + formData.append('grounding', mode === 'find_ref') + formData.append('include_caption', includeCaption) + formData.append('extract_images', true) + formData.append('dpi', 144) + 
formData.append('base_size', advancedSettings.base_size) + formData.append('image_size', advancedSettings.image_size) + formData.append('crop_mode', advancedSettings.crop_mode) + + const response = await axios.post(`${API_BASE}/process-pdf`, formData, { + headers: { + 'Content-Type': 'multipart/form-data', + }, + responseType: outputFormat === 'json' ? 'json' : 'blob', + onUploadProgress: (progressEvent) => { + const percentCompleted = Math.round((progressEvent.loaded * 100) / progressEvent.total) + setProgress(percentCompleted) + } + }) + + if (outputFormat === 'json') { + setResult(response.data) + } else { + // For file downloads (markdown, html, docx) + const format = formats.find(f => f.value === outputFormat) + const blob = new Blob([response.data], { + type: response.headers['content-type'] + }) + const url = URL.createObjectURL(blob) + const a = document.createElement('a') + a.href = url + a.download = `ocr_result.${format.ext}` + a.click() + URL.revokeObjectURL(url) + + setResult({ + success: true, + message: `Document downloaded as ${format.label}`, + format: outputFormat + }) + } + + setProgress(100) + } catch (err) { + console.error('PDF processing error:', err) + setError(err.response?.data?.detail || err.message || 'Failed to process PDF') + } finally { + setProcessing(false) + } + }, [pdfFile, mode, prompt, outputFormat, includeCaption, advancedSettings]) + + const handleDownloadJSON = useCallback(() => { + if (!result || outputFormat !== 'json') return + + const blob = new Blob([JSON.stringify(result, null, 2)], { type: 'application/json' }) + const url = URL.createObjectURL(blob) + const a = document.createElement('a') + a.href = url + a.download = 'ocr_result.json' + a.click() + URL.revokeObjectURL(url) + }, [result, outputFormat]) + + return ( +
+ {/* Format Selector */} +
+ +
+ {formats.map((format) => ( + setOutputFormat(format.value)} + className={`p-3 rounded-xl text-sm font-medium transition-all ${ + outputFormat === format.value + ? 'bg-gradient-to-r from-purple-600 to-cyan-600 text-white' + : 'glass text-gray-400 hover:bg-white/5' + }`} + whileHover={{ scale: 1.02 }} + whileTap={{ scale: 0.98 }} + > + {format.icon} + {format.label} + + ))} +
+
+ + {/* Process Button */} + +
+
+ {processing ? ( + <> + + Processing PDF... + + ) : ( + <> + + Process PDF + + )} +
+ + + {/* Progress Bar */} + + {processing && progress > 0 && ( + +
+ Processing... + {progress}% +
+
+ +
+
+ )} +
+ + {/* Error Display */} + + {error && ( + + +
+

Processing Failed

+

{error}

+
+
+ )} +
+ + {/* Success Display */} + + {result && !error && ( + +
+ +
+

+ {result.message || 'PDF processed successfully!'} +

+ {outputFormat === 'json' && result.pages && ( +
+

+ Processed {result.total_pages} page{result.total_pages > 1 ? 's' : ''} +

+ + + Download JSON + +
+ )} +
+
+
+ )} +
+
+ ) +} + +export default PDFProcessor