Add PDF processing and multi-format document conversion

Features added:
- PDF to image conversion with configurable DPI
- Multi-page PDF processing with OCR
- Export to Markdown, HTML, DOCX, and JSON formats
- Automatic image extraction from PDFs
- Formula and formatting preservation
- Real-time progress tracking for multi-page documents

Backend changes:
- New /api/process-pdf endpoint for PDF processing
- pdf_utils.py: PDF conversion and image extraction utilities
- format_converter.py: Document format conversion (MD, HTML, DOCX)
- Updated dependencies: PyMuPDF, img2pdf, python-docx, markdown

Frontend changes:
- File type toggle (Image OCR / PDF Processing)
- PDFProcessor component with format selection
- Updated ImageUpload to support both images and PDFs
- Progress bars for multi-page processing
- Download options for converted documents

Documentation:
- Updated README with PDF processing features
- Added API documentation for /api/process-pdf endpoint
- Added format conversion examples
2025-11-15 14:25:09 +00:00
parent 5ba45f7db2
commit e578276d3e
8 changed files with 1220 additions and 65 deletions
--- a/backend/pdf_utils.py
+++ b/backend/pdf_utils.py
@@ -0,0 +1,214 @@
+"""
+PDF Processing Utilities for DeepSeek OCR
+Handles PDF to image conversion and batch processing
+"""
+
+import ast
+import io
+import re
+from typing import List, Tuple, Dict, Any, Optional
+
+import fitz  # PyMuPDF
+import img2pdf
+from PIL import Image
+import numpy as np
+
+
+def pdf_to_images_high_quality(pdf_bytes: bytes, dpi: int = 144) -> List[Image.Image]:
+    """
+    Convert PDF pages to high-quality PIL images
+
+    Each page is rendered with PyMuPDF at the requested resolution,
+    round-tripped through PNG into a PIL image and normalised to RGB.
+    The PDF document is closed even if rendering a page raises.
+
+    Args:
+        pdf_bytes: PDF file as bytes
+        dpi: Resolution for rendering (default: 144)
+
+    Returns:
+        List of PIL Image objects in RGB mode, one per page
+    """
+    images = []
+
+    # The zoom factor is the requested DPI relative to a 72 DPI baseline
+    zoom = dpi / 72.0
+    matrix = fitz.Matrix(zoom, zoom)
+
+    # Open PDF from bytes
+    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
+    try:
+        # Allow large images.
+        # NOTE(review): this is a process-wide Pillow setting and removes
+        # the pixel-count limit for every later Image.open call as well.
+        # It is assigned once here rather than once per page.
+        Image.MAX_IMAGE_PIXELS = None
+
+        # Process each page
+        for page_num in range(pdf_document.page_count):
+            page = pdf_document[page_num]
+
+            # Render page to pixmap (no alpha channel requested)
+            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
+
+            # Convert to PIL Image
+            img_data = pixmap.tobytes("png")
+            img = Image.open(io.BytesIO(img_data))
+
+            # Ensure RGB mode
+            if img.mode in ('RGBA', 'LA'):
+                # Flatten transparency onto a white background, using the
+                # alpha band (always the last band) as the paste mask for
+                # both RGBA and LA images
+                background = Image.new('RGB', img.size, (255, 255, 255))
+                background.paste(img, mask=img.split()[-1])
+                img = background
+            elif img.mode != 'RGB':
+                img = img.convert('RGB')
+
+            images.append(img)
+    finally:
+        # Release the document handle on success and on failure
+        pdf_document.close()
+
+    return images
+
+
+def images_to_pdf(pil_images: List[Image.Image]) -> bytes:
+    """
+    Build a single PDF from a sequence of PIL images
+
+    Every image is encoded as a JPEG (quality 95) and the encoded pages
+    are passed to img2pdf in their original order.
+
+    Args:
+        pil_images: List of PIL Image objects
+
+    Returns:
+        PDF file as bytes, or b'' when no images are supplied
+    """
+    # Nothing to convert
+    if not pil_images:
+        return b''
+
+    def _as_jpeg(picture):
+        # Normalise to RGB before encoding the page as JPEG
+        rgb = picture if picture.mode == 'RGB' else picture.convert('RGB')
+        buffer = io.BytesIO()
+        rgb.save(buffer, format='JPEG', quality=95)
+        return buffer.getvalue()
+
+    # One JPEG byte string per page, combined in a single img2pdf call
+    return img2pdf.convert([_as_jpeg(picture) for picture in pil_images])
+
+
+def extract_ref_patterns(text: str) -> Tuple[List[Tuple], List[str], List[str]]:
+    """
+    Find every <|ref|>...<|/ref|><|det|>...<|/det|> tag pair in OCR output
+
+    Args:
+        text: OCR output text with reference tags
+
+    Returns:
+        Tuple of (all_matches, image_matches, other_matches). all_matches
+        holds (full_match, label, coordinates) tuples; the other two lists
+        hold only the full matched strings, split by whether the match
+        contains the image reference tag.
+    """
+    image_marker = '<|ref|>image<|/ref|>'
+    tag_regex = re.compile(
+        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
+        re.DOTALL,
+    )
+    found = tag_regex.findall(text)
+
+    # Only the full tag text (first group) is needed for the two partitions
+    full_tags = [groups[0] for groups in found]
+    image_tags = [tag for tag in full_tags if image_marker in tag]
+    other_tags = [tag for tag in full_tags if image_marker not in tag]
+
+    return found, image_tags, other_tags
+
+
+def parse_coordinates(ref_text: Tuple, image_width: int, image_height: int) -> Optional[Dict[str, Any]]:
+    """
+    Parse coordinates from reference text
+
+    The coordinate string is expected to be a literal list of
+    [x1, y1, x2, y2] boxes on a 0-999 grid; each box is scaled to pixel
+    coordinates of the given image size.
+
+    Args:
+        ref_text: Tuple of (full_match, label, coordinates)
+        image_width: Image width in pixels
+        image_height: Image height in pixels
+
+    Returns:
+        Dictionary with 'label' and 'boxes' (scaled coordinates), or None
+        when the reference cannot be parsed
+    """
+    try:
+        label_type = ref_text[1]
+
+        # SECURITY: the coordinate text comes from OCR output, so it must
+        # not be passed to eval(). ast.literal_eval accepts only Python
+        # literals (lists, numbers, ...) and raises on anything else, which
+        # the handler below turns into a None result.
+        cor_list = ast.literal_eval(ref_text[2])
+
+        # Scale coordinates from 0-999 to actual pixels
+        scaled_boxes = [
+            [
+                int(x1 / 999 * image_width),
+                int(y1 / 999 * image_height),
+                int(x2 / 999 * image_width),
+                int(y2 / 999 * image_height),
+            ]
+            for x1, y1, x2, y2 in cor_list
+        ]
+
+        return {
+            'label': label_type,
+            'boxes': scaled_boxes
+        }
+    except Exception as e:
+        # Best-effort parsing: malformed references are reported and skipped
+        print(f"Error parsing coordinates: {e}")
+        return None
+
+
+def crop_images_from_refs(image: Image.Image, refs: List[Tuple]) -> List[Image.Image]:
+    """
+    Cut out the regions of an image that are referenced as 'image' boxes
+
+    Each reference is parsed with parse_coordinates against the size of
+    the source image. References that fail to parse, or whose label is
+    not 'image', are skipped. A box whose crop raises is reported and
+    skipped as well.
+
+    Args:
+        image: Source PIL Image
+        refs: List of reference tuples
+
+    Returns:
+        List of cropped PIL Images
+    """
+    page_width, page_height = image.size
+    crops = []
+
+    for ref in refs:
+        parsed = parse_coordinates(ref, page_width, page_height)
+        # Skip references that did not parse or are not image regions
+        if not parsed or parsed['label'] != 'image':
+            continue
+
+        for left, top, right, bottom in parsed['boxes']:
+            try:
+                crops.append(image.crop((left, top, right, bottom)))
+            except Exception as e:
+                print(f"Error cropping image: {e}")
+
+    return crops
+
+
+def clean_markdown_content(content: str, image_refs: List[str], other_refs: List[str]) -> str:
+    """
+    Clean markdown content by removing reference tags
+
+    Besides stripping the tags, two LaTeX macros are rewritten
+    (\\coloneqq -> :=, \\eqqcolon -> =:) and every run of three or more
+    newlines is collapsed to exactly one blank line.
+
+    Args:
+        content: Raw OCR output with tags
+        image_refs: List of image reference tags
+        other_refs: List of other reference tags
+
+    Returns:
+        Cleaned markdown content
+    """
+    cleaned = content
+
+    # Remove image reference tags (will be replaced with markdown images)
+    for ref in image_refs:
+        cleaned = cleaned.replace(ref, '')
+
+    # Remove other reference tags
+    for ref in other_refs:
+        cleaned = cleaned.replace(ref, '')
+
+    # Clean up LaTeX macros
+    cleaned = (cleaned
+               .replace('\\coloneqq', ':=')
+               .replace('\\eqqcolon', '=:'))
+
+    # Collapse any run of 3+ newlines in one pass. Chained str.replace
+    # calls for 4 and 3 newlines leave longer runs only partly collapsed
+    # (six newlines would come out as three).
+    return re.sub(r'\n{3,}', '\n\n', cleaned)