Skip to content

Commit 46319a5

Browse files
author
test
committed
Add grammar security audit and enhance vendor script
- vendor-grammar.sh: copy extra headers (.h, .inc) and common/ subdirs from grammar src/ directories. Needed for Astro (tag.h), PureScript/Typst (unicode.h), VHDL (.h/.inc files), F# (common/scanner.h).
- audit-grammar-security.sh: pre-vendoring scanner for dangerous patterns in vendored grammar C files. Checks for dangerous includes (sys/*, unistd.h, dlfcn.h), dangerous calls (system, exec, popen, fopen, socket, getenv, fork, dlopen), and suspicious patterns (constructor attributes, inline assembly, base64 blobs). Each grammar is reported as PASS, WARN, or BLOCK.
1 parent 240b277 commit 46319a5

2 files changed

Lines changed: 116 additions & 0 deletions

File tree

scripts/audit-grammar-security.sh

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env bash
set -euo pipefail

# audit-grammar-security.sh — Scan vendored tree-sitter grammar sources
# (scanner.c / scanner.cc plus the *.h and *.inc files next to them) for
# dangerous patterns that could indicate malicious code.
#
# Grammars are third-party C source compiled into our binary. parser.c files
# are auto-generated (low risk), but scanner.c files are hand-written C with
# access to memory allocation and input text.
#
# Usage:   scripts/audit-grammar-security.sh [directory]
# Default: internal/cbm/vendored/grammars/
# Exit:    0 when no grammar is blocked; 1 when the directory is missing or
#          at least one grammar is blocked.

# Repository root = parent of the directory holding this script.
# CDPATH is cleared for the lookup: with CDPATH set, `cd` on a relative path
# may print the directory it resolved, and that output would be captured into
# ROOT together with the `pwd` result.
ROOT="$(CDPATH='' cd -- "$(dirname -- "$0")/.." && pwd)"
GRAMMAR_DIR="${1:-$ROOT/internal/cbm/vendored/grammars}"

if [ ! -d "$GRAMMAR_DIR" ]; then
  echo "ERROR: directory not found: $GRAMMAR_DIR" >&2
  exit 1
fi

echo "=== Grammar Security Audit ==="
echo "Scanning: $GRAMMAR_DIR"
echo ""

# Running tallies, in grammars (not individual findings).
TOTAL=0
WARNED=0
BLOCKED=0

# All patterns below are POSIX extended regular expressions (grep -E).
# Whitespace is spelled [[:space:]] rather than \s so the patterns do not
# depend on GNU regex extensions.

# Safe includes that scanners legitimately use.
# Reference list only: no check in this script consults it.
# shellcheck disable=SC2034
SAFE_INCLUDES='string\.h|stdint\.h|stdbool\.h|stdlib\.h|wctype\.h|stdio\.h|stddef\.h|limits\.h|assert\.h|ctype\.h|wchar\.h|math\.h|stdalign\.h|stdarg\.h|float\.h|inttypes\.h'

# Dangerous includes (matched as the start of the header name)
DANGER_INCLUDES='unistd\.h|sys/|netdb\.h|dlfcn\.h|signal\.h|spawn\.h|pthread\.h|fcntl\.h|dirent\.h|termios\.h|arpa/|netinet/'

# Dangerous function calls: a listed name followed by "(".
# The names are intentionally NOT anchored on the left, so prefixed variants
# such as vfork(, _popen( or secure_getenv( are flagged as well.
DANGER_CALLS='(system|exec[lvpe]+|popen|fopen|fwrite|socket|getenv|fork|dlopen|connect|bind|listen|accept|sendto|recvfrom|mmap|mprotect)[[:space:]]*\('

# Suspicious patterns:
#   - constructor/destructor attributes anywhere inside the attribute list,
#     including the __constructor__ / __destructor__ spellings;
#   - inline assembly: asm(...), optionally with volatile/inline/goto
#     qualifiers before the parenthesis, plus the __asm__ and __asm keywords.
SUSPICIOUS='__attribute__[[:space:]]*\([[:space:]]*\([^)]*(constructor|destructor)|asm([[:space:]]+(volatile|__volatile__|inline|goto))*[[:space:]]*\(|__asm__|__asm([^[:alnum:]_]|$)'

#######################################
# Append one report entry per line of FILE that matches PATTERN.
# Globals:
#   issues    (appended to) newline-separated findings for the current grammar
#   has_block (set to 1)    when SEVERITY is BLOCK and at least one line matched
# Arguments:
#   $1 - severity tag written into the entry: WARN or BLOCK
#   $2 - human-readable label, e.g. "dangerous call"
#   $3 - extended regular expression handed to grep -E
#   $4 - file to scan
#######################################
record_matches() {
  local severity=$1 label=$2 pattern=$3 file=$4
  local hit
  # grep exits non-zero when nothing matches; that is the normal case here,
  # so its status (and stderr) is deliberately ignored.
  while IFS= read -r hit; do
    # Real newlines are stored so the report can be printed verbatim; the
    # matched source text routinely contains backslashes (\n, \t, ...) that
    # must not be interpreted when the report is emitted.
    issues+=" ${severity} ${file##*/}: ${label}: ${hit}"$'\n'
    if [ "$severity" = "BLOCK" ]; then
      has_block=1
    fi
  done < <(grep -nE -- "$pattern" "$file" 2>/dev/null || true)
}

# Audit every immediate subdirectory of GRAMMAR_DIR as one grammar.
# NOTE(review): only files directly inside each grammar directory are
# examined; subdirectories (for example common/) are not descended into.
for grammar_dir in "$GRAMMAR_DIR"/*/; do
  # With no subdirectories the glob stays a literal "*/"; do not count it.
  [ -d "$grammar_dir" ] || continue

  name=$(basename "$grammar_dir")
  TOTAL=$((TOTAL + 1))
  issues=""
  has_block=0

  for src in "$grammar_dir"scanner.c "$grammar_dir"scanner.cc "$grammar_dir"*.h "$grammar_dir"*.inc; do
    # Unmatched globs and absent scanner files fall out here.
    [ -f "$src" ] || continue

    # Dangerous includes: both <...> and "..." forms, with optional
    # whitespace after '#'.
    record_matches WARN "dangerous include" \
      "#[[:space:]]*include[[:space:]]*[<\"]($DANGER_INCLUDES)" "$src"

    # Dangerous calls
    record_matches BLOCK "dangerous call" "$DANGER_CALLS" "$src"

    # Suspicious patterns
    record_matches BLOCK "suspicious pattern" "$SUSPICIOUS" "$src"

    # Base64-like encoded strings: a quoted run of 60+ base64-alphabet
    # characters, optionally '='-padded.
    record_matches WARN "possible encoded data" \
      '"[A-Za-z0-9+/]{60,}={0,2}"' "$src"
  done

  if [ -n "$issues" ]; then
    # Severity comes from the flag, not from searching the report text, so a
    # file name or source line that merely contains "BLOCK" cannot escalate
    # a warning.
    if [ "$has_block" -eq 1 ]; then
      echo "BLOCK $name:"
      BLOCKED=$((BLOCKED + 1))
    else
      echo "WARN $name:"
      WARNED=$((WARNED + 1))
    fi
    printf '%s\n' "$issues"
  fi
done

# Final report: one framed table with the per-grammar tallies, followed by a
# verdict line. Only blocked grammars make the audit fail; warnings do not.
rule="────────────────────────────────────────────"
clean_count=$((TOTAL - WARNED - BLOCKED))

printf '%s\n' \
  "$rule" \
  " Scanned: $TOTAL grammars" \
  " Clean: $clean_count" \
  " Warned: $WARNED" \
  " Blocked: $BLOCKED" \
  "$rule"

if ((BLOCKED > 0)); then
  printf '\n%s\n' "FAILED: $BLOCKED grammar(s) have dangerous patterns. Review before vendoring."
  exit 1
fi

printf '\n%s\n' "PASSED: No dangerous patterns found."
exit 0

scripts/vendor-grammar.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,17 @@ if [ -d "$SRC_DIR/tree_sitter" ]; then
4848
cp "$SRC_DIR/tree_sitter/"*.h "$GRAMMAR_DIR/tree_sitter/" 2>/dev/null || true
4949
fi
5050

# Some scanners #include helper files that sit next to them in src/; bring
# those along (.h and .inc).
# Examples: tag.h (Vue/Svelte/Astro), unicode.h (PureScript/Typst),
# TokenTree.h/.inc (VHDL)
for f in "$SRC_DIR"/*.h "$SRC_DIR"/*.inc; do
  # A glob with no match stays literal and is not a regular file.
  if [ -f "$f" ]; then
    cp "$f" "$GRAMMAR_DIR/"
  fi
done

# A common/ subdirectory, when present, is copied as a whole
# (e.g., the F# scanner uses common/scanner.h).
[ ! -d "$SRC_DIR/common" ] || cp -r "$SRC_DIR/common" "$GRAMMAR_DIR/"

5162
# Copy LICENSE file from upstream repo
5263
REPO_ROOT="$TMPDIR/repo"
5364
if [ -n "$SUBDIR" ]; then

0 commit comments

Comments
 (0)