-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPDF_Checker.py
More file actions
129 lines (107 loc) · 4.23 KB
/
Copy pathPDF_Checker.py
File metadata and controls
129 lines (107 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""PDF Post-Redaction Verification Tool.
Performs deep forensic analysis on redacted PDFs to detect remaining
traces of sensitive data in text layers, metadata, and binary data.
"""
import fitz
import os
import logging
from typing import Optional
logger = logging.getLogger(__name__)
def inspect_pdf(file_path: str, target_word: str) -> None:
    """Inspect a PDF for remaining traces of sensitive data.

    Runs two independent scans and then prints a verdict:

    1. Metadata and text-layer analysis through PyMuPDF (``fitz``).
    2. Raw byte search of the file for UTF-8 and UTF-16 encodings of
       the word.
    3. Verdict: VULNERABLE if any trace was found, INCONCLUSIVE if no
       trace was found but a scan phase failed, SECURE otherwise.

    Args:
        file_path: Path to the PDF to inspect; must end in ``.pdf``.
        target_word: Word to search for (case-insensitive).

    Returns:
        None. Results are printed to stdout; rejected arguments are also
        reported through the module logger.
    """
    # Input validation
    if not file_path or not isinstance(file_path, str):
        print("[-] Error: file_path must be a non-empty string")
        logger.error("inspect_pdf called with invalid file_path")
        return
    if not target_word or not isinstance(target_word, str):
        print("[-] Error: target_word must be a non-empty string")
        logger.error("inspect_pdf called with invalid target_word")
        return
    if not file_path.lower().endswith('.pdf'):
        print(f"[-] Error: File must be a PDF: {file_path}")
        logger.error(f"inspect_pdf called with non-PDF file: {file_path}")
        return
    if not os.path.exists(file_path):
        print(f"[-] Error: The file '{file_path}' does not exist.")
        # Logged like the other rejected-argument branches.
        logger.error(f"inspect_pdf called with missing file: {file_path}")
        return

    print(f"--- STARTING DEEP INSPECTION: {os.path.basename(file_path)} ---")
    print(f"[*] Target to search: '{target_word}'\n")
    word_lower = target_word.lower()
    alerts = 0
    # A phase that crashed proves nothing about the file, so failures are
    # counted and keep the verdict from claiming the PDF is clean.
    failed_phases = 0

    # PHASE 1: Structural and Text Analysis (Decompressed)
    print("[*] Scanning Text and Metadata layers...")
    try:
        doc = fitz.open(file_path)
        try:
            # 1A. Metadata Check (Author, Title, Hidden tags)
            # NOTE(review): PyMuPDF documents metadata as possibly None
            # for locked documents - hence the `or {}`.
            for key, value in (doc.metadata or {}).items():
                if value and word_lower in value.lower():
                    print(
                        f" [!] THREAT DETECTED: Trace found in metadata [{key} -> {value}]"
                    )
                    alerts += 1
            # 1B. Text Layer Check
            for page_number, page in enumerate(doc, start=1):
                if word_lower in page.get_text().lower():
                    print(
                        f" [!] THREAT DETECTED: The word still exists as hidden text on Page {page_number}"
                    )
                    alerts += 1
        finally:
            # Release the document even when a page fails to parse.
            doc.close()
    except Exception as e:
        print(f"[-] PyMuPDF read error: {e}")
        failed_phases += 1

    # PHASE 2: Raw Binary Analysis (Data Carving)
    print("[*] Scanning hex/binary...")
    try:
        if _raw_bytes_contain(file_path, _byte_needles(target_word)):
            print(
                " [!] THREAT DETECTED: String trace found in the raw source code of the file."
            )
            alerts += 1
    except (OSError, UnicodeError) as e:
        print(f"[-] Binary read error: {e}")
        failed_phases += 1

    # --- FINAL VERDICT ---
    print("\n" + "=" * 50)
    if alerts:
        print(f"[X] VERDICT: VULNERABLE. Found {alerts} traces of sensitive data.")
        if failed_phases:
            print(f" Note: {failed_phases} scan phase(s) failed, so more traces may exist.")
    elif failed_phases:
        print(f"[?] VERDICT: INCONCLUSIVE. {failed_phases} scan phase(s) failed.")
        print(" No trace was found, but the PDF could not be fully inspected.")
    else:
        print("[✓] VERDICT: SECURE. The PDF is clinically clean.")
        print(" Data has been annihilated and is unrecoverable.")
    print("=" * 50)


def _byte_needles(target_word: str) -> list:
    """Return the byte patterns that represent ``target_word`` in a raw file.

    Each case variant of the word is encoded as UTF-8, UTF-16-BE and
    UTF-16-LE, then passed through ``bytes.lower()`` so it is normalised
    exactly like the file window it will be compared against.
    """
    variants = {target_word, target_word.lower(), target_word.upper()}
    needles = {
        variant.encode(encoding).lower()
        for variant in variants
        for encoding in ("utf-8", "utf-16-be", "utf-16-le")
    }
    return sorted(needles)


def _raw_bytes_contain(file_path: str, needles: list, chunk_size: int = 1024 * 1024) -> bool:
    """Report whether any of ``needles`` occurs in the file's raw bytes.

    The file is streamed in ``chunk_size`` pieces rather than loaded whole,
    so large files do not exhaust memory. The comparison is ASCII
    case-insensitive. ``needles`` must be non-empty and already lowered
    with ``bytes.lower()``.
    """
    # Carry the last (longest needle - 1) bytes into the next window so a
    # match that straddles a chunk boundary is still seen.
    overlap = max(len(needle) for needle in needles) - 1
    tail = b""
    with open(file_path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return False
            window = tail + chunk.lower()
            if any(needle in window for needle in needles):
                return True
            # Slice the whole window, not just the chunk: a short read
            # must not drop bytes carried over from earlier reads.
            tail = window[-overlap:] if overlap else b""
if __name__ == "__main__":
    # Local import: only the script entry point needs sys.
    import sys

    # Defaults used when no command-line arguments are given.
    # INSERT YOUR REAL DATA HERE:
    file_name = "name_of_your_nullified_file.pdf"
    word_to_search = "redacted_word"

    # Optional overrides, so the script need not be edited for every run:
    #   python PDF_Checker.py <file.pdf> <word>
    if len(sys.argv) > 1:
        file_name = sys.argv[1]
    if len(sys.argv) > 2:
        word_to_search = sys.argv[2]

    # Start the inspection
    inspect_pdf(file_name, word_to_search)

    # Block the terminal so you can read the output
    try:
        input("\nPress ENTER to close the terminal...")
    except EOFError:
        # stdin is closed or non-interactive (e.g. piped): nothing to wait for.
        pass