Fix RCE vulnerability and harden security

- Replace eval() with ast.literal_eval() in pdf_utils.py to fix
  unauthenticated remote code execution via crafted PDF uploads
  (reported by OX Security)
- Sanitize HTML output with DOMPurify to prevent XSS
- Restrict CORS origins (configurable via CORS_ORIGINS env var)
- Suppress raw exception details in API error responses
- Cap Image.MAX_IMAGE_PIXELS to prevent decompression bomb DoS
- Add security regression test suite

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Ray Dumasia
2026-03-31 09:01:52 +01:00
parent e24f064042
commit 3dac0741b1
6 changed files with 169 additions and 9 deletions

View File

@@ -3,6 +3,7 @@ PDF Processing Utilities for DeepSeek OCR
Handles PDF to image conversion and batch processing
"""
+import ast
import io
import re
from typing import List, Tuple, Dict, Any
@@ -39,8 +40,8 @@ def pdf_to_images_high_quality(pdf_bytes: bytes, dpi: int = 144) -> List[Image.I
# Render page to pixmap
pixmap = page.get_pixmap(matrix=matrix, alpha=False)
-# Allow large images
-Image.MAX_IMAGE_PIXELS = None
+# Allow reasonably large images (200 megapixels) but not decompression bombs
+Image.MAX_IMAGE_PIXELS = 200_000_000
# Convert to PIL Image
img_data = pixmap.tobytes("png")
@@ -130,7 +131,7 @@ def parse_coordinates(ref_text: Tuple, image_width: int, image_height: int) -> D
"""
try:
label_type = ref_text[1]
-cor_list = eval(ref_text[2])
+cor_list = ast.literal_eval(ref_text[2])
# Scale coordinates from 0-999 to actual pixels
scaled_boxes = []