# rw-deepseek-ocr/backend/test_security.py

"""
Security regression tests for the eval() RCE vulnerability (OX Security disclosure).

The vulnerability allowed arbitrary code execution via crafted OCR output
that was passed to eval() in parse_coordinates(). The fix uses ast.literal_eval()
which only allows literal data structures.

This test is self-contained and does not require backend dependencies.

Run: python test_security.py
"""

import ast


def parse_coordinates(ref_text, image_width, image_height):
    """
    Turn a reference match into a label plus scaled bounding boxes.

    Local stand-in for the patched pdf_utils.parse_coordinates: the coordinate
    string is decoded with ast.literal_eval() rather than eval(), so only
    Python literals are accepted.

    Args:
        ref_text: Indexable whose item 1 is the label and whose item 2 is the
            coordinate list written as a literal string.
        image_width: Factor applied to the x values after dividing by 999.
        image_height: Factor applied to the y values after dividing by 999.

    Returns:
        A dict with 'label' and 'boxes' (a list of [x1, y1, x2, y2] ints), or
        None when any exception is raised while parsing or scaling.
    """
    try:
        label = ref_text[1]
        raw_boxes = ast.literal_eval(ref_text[2])

        # Each value is divided by 999, multiplied by the matching image
        # dimension and truncated by int().
        boxes = [
            [
                int(left / 999 * image_width),
                int(top / 999 * image_height),
                int(right / 999 * image_width),
                int(bottom / 999 * image_height),
            ]
            for left, top, right, bottom in raw_boxes
        ]
    except Exception as err:
        # Any failure (bad literal, wrong shape, wrong types) is reported and
        # signalled to the caller as None.
        print(f"  [Blocked] {type(err).__name__}: {err}")
        return None

    return {'label': label, 'boxes': boxes}


def test_legitimate_coordinates():
    """Verify that normal coordinate parsing still works.

    Checks the label and the exact scaled box, not just the number of boxes,
    so a regression in the scaling arithmetic is caught as well.
    """
    ref_text = ("full_match", "text", "[[312, 339, 480, 681]]")
    result = parse_coordinates(ref_text, 1000, 1000)

    assert result is not None, "Legitimate coordinates should parse successfully"
    assert result['label'] == 'text'
    assert len(result['boxes']) == 1
    # int(v / 999 * 1000) truncates back to v for each of these inputs.
    assert result['boxes'] == [[312, 339, 480, 681]], "Scaled box values are wrong"
    print("PASS: Legitimate coordinates parse correctly")


def test_multiple_boxes():
    """Verify multiple bounding boxes still work.

    Checks the label and the exact scaled boxes (in order), not just the
    number of boxes returned.
    """
    ref_text = ("full_match", "image", "[[100, 200, 300, 400], [500, 600, 700, 800]]")
    result = parse_coordinates(ref_text, 1000, 1000)

    assert result is not None, "Multiple boxes should parse successfully"
    assert result['label'] == 'image'
    assert len(result['boxes']) == 2
    # int(v / 999 * 1000) truncates back to v for each of these inputs.
    assert result['boxes'] == [
        [100, 200, 300, 400],
        [500, 600, 700, 800],
    ], "Scaled box values are wrong"
    print("PASS: Multiple bounding boxes parse correctly")


def test_rce_blocked_import_os():
    """The original exploit: __import__('os').system('...') must be blocked.

    A None result alone does not prove the payload was not executed: with
    eval() the call would run, return an int, and the later iteration error
    would still produce None. A second payload of the same shape therefore
    sets an environment variable as a canary, and the test asserts that the
    canary was never set.
    """
    import os  # local import: only needed to observe the canary

    canary = "RCE_CANARY_IMPORT_OS"
    payloads = [
        # Payload from the original disclosure.
        "__import__('os').system('echo HACKED')",
        # Same shape, but with a side effect that is observable in-process.
        "__import__('os').environ.__setitem__('RCE_CANARY_IMPORT_OS', '1')",
    ]

    os.environ.pop(canary, None)
    try:
        for malicious in payloads:
            ref_text = ("full_match", "exploit", malicious)
            result = parse_coordinates(ref_text, 1000, 1000)
            assert result is None, "Code execution payload should be rejected"
        assert canary not in os.environ, "Payload was executed before being rejected"
    finally:
        # Never leak the canary into the environment of later tests.
        os.environ.pop(canary, None)
    print("PASS: __import__('os').system() payload is blocked")


def test_rce_blocked_exec():
    """exec() based payloads must be blocked.

    A None result alone does not prove the payload was not executed: with
    eval() the exec() call would run, return None, and the later iteration
    error would still produce None. A second payload of the same shape
    therefore sets an environment variable as a canary, and the test asserts
    that the canary was never set.
    """
    import os  # local import: only needed to observe the canary

    canary = "RCE_CANARY_EXEC"
    payloads = [
        # Payload as originally tested.
        "exec('import os; os.system(\"echo HACKED\")')",
        # Same shape, but with a side effect that is observable in-process.
        "exec('import os; os.environ[\"RCE_CANARY_EXEC\"] = \"1\"')",
    ]

    os.environ.pop(canary, None)
    try:
        for malicious in payloads:
            ref_text = ("full_match", "exploit", malicious)
            result = parse_coordinates(ref_text, 1000, 1000)
            assert result is None, "exec() payload should be rejected"
        assert canary not in os.environ, "Payload was executed before being rejected"
    finally:
        # Never leak the canary into the environment of later tests.
        os.environ.pop(canary, None)
    print("PASS: exec() payload is blocked")


def test_rce_blocked_eval():
    """Nested eval() payloads must be blocked.

    A None result alone does not prove the payload was not executed: with
    eval() the nested call would run, return a string, and the later
    unpacking error would still produce None. A second payload of the same
    shape therefore sets an environment variable as a canary, and the test
    asserts that the canary was never set.
    """
    import os  # local import: only needed to observe the canary

    canary = "RCE_CANARY_EVAL"
    payloads = [
        # Payload as originally tested.
        "eval('__import__(\"os\").popen(\"id\").read()')",
        # Same shape, but with a side effect that is observable in-process.
        "eval('__import__(\"os\").environ.__setitem__(\"RCE_CANARY_EVAL\", \"1\")')",
    ]

    os.environ.pop(canary, None)
    try:
        for malicious in payloads:
            ref_text = ("full_match", "exploit", malicious)
            result = parse_coordinates(ref_text, 1000, 1000)
            assert result is None, "Nested eval() payload should be rejected"
        assert canary not in os.environ, "Payload was executed before being rejected"
    finally:
        # Never leak the canary into the environment of later tests.
        os.environ.pop(canary, None)
    print("PASS: Nested eval() payload is blocked")


def test_rce_blocked_lambda():
    """Lambda-based payloads must be blocked.

    A None result alone does not prove the payload was not executed: with
    eval() the lambda would be called, return an int, and the later iteration
    error would still produce None. A second payload of the same shape
    therefore sets an environment variable as a canary, and the test asserts
    that the canary was never set.
    """
    import os  # local import: only needed to observe the canary

    canary = "RCE_CANARY_LAMBDA"
    payloads = [
        # Payload as originally tested.
        "(lambda: __import__('os').system('echo HACKED'))()",
        # Same shape, but with a side effect that is observable in-process.
        "(lambda: __import__('os').environ.__setitem__('RCE_CANARY_LAMBDA', '1'))()",
    ]

    os.environ.pop(canary, None)
    try:
        for malicious in payloads:
            ref_text = ("full_match", "exploit", malicious)
            result = parse_coordinates(ref_text, 1000, 1000)
            assert result is None, "Lambda payload should be rejected"
        assert canary not in os.environ, "Payload was executed before being rejected"
    finally:
        # Never leak the canary into the environment of later tests.
        os.environ.pop(canary, None)
    print("PASS: Lambda payload is blocked")


def test_rce_blocked_comprehension():
    """List comprehension code execution must be blocked.

    A None result alone does not prove the payload was not executed: with
    eval() the comprehension would run, yield a list of ints, and the later
    unpacking error would still produce None. A second payload of the same
    shape therefore sets an environment variable as a canary, and the test
    asserts that the canary was never set.
    """
    import os  # local import: only needed to observe the canary

    canary = "RCE_CANARY_COMPREHENSION"
    payloads = [
        # Payload as originally tested.
        "[__import__('os').system('echo HACKED') for x in [1]]",
        # Same shape, but with a side effect that is observable in-process.
        "[__import__('os').environ.__setitem__('RCE_CANARY_COMPREHENSION', '1') for x in [1]]",
    ]

    os.environ.pop(canary, None)
    try:
        for malicious in payloads:
            ref_text = ("full_match", "exploit", malicious)
            result = parse_coordinates(ref_text, 1000, 1000)
            assert result is None, "List comprehension payload should be rejected"
        assert canary not in os.environ, "Payload was executed before being rejected"
    finally:
        # Never leak the canary into the environment of later tests.
        os.environ.pop(canary, None)
    print("PASS: List comprehension payload is blocked")


if __name__ == "__main__":
    # Minimal runner so the suite can be executed with plain
    # `python test_security.py`, without a test framework.
    print("=" * 60)
    print("Security Regression Tests (OX Security RCE disclosure)")
    print("=" * 60)
    print()

    tests = [
        test_legitimate_coordinates,
        test_multiple_boxes,
        test_rce_blocked_import_os,
        test_rce_blocked_exec,
        test_rce_blocked_eval,
        test_rce_blocked_lambda,
        test_rce_blocked_comprehension,
    ]

    passed = 0
    failed = 0
    for test in tests:
        try:
            test()
            passed += 1
        except AssertionError as e:
            # A failed expectation inside the test.
            print(f"FAIL: {test.__name__}: {e}")
            failed += 1
        except Exception as e:
            # The test itself crashed; count it as a failure and keep going.
            print(f"ERROR: {test.__name__}: {e}")
            failed += 1

    print()
    print(f"Results: {passed} passed, {failed} failed out of {len(tests)} tests")
    if failed == 0:
        print("All security tests passed - RCE vulnerability is patched.")
    else:
        print("WARNING: Some tests failed!")
        # Exit with a non-zero status so shells and CI can detect the failure.
        raise SystemExit(1)