From b5975d6c33a8ebce42f3447324f3e38e1d92f47e Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Thu, 23 Apr 2026 00:00:19 +0000 Subject: [PATCH 1/2] fix(security): tighten hookinstaller perms + crawler URL scheme check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 3 open code-scanning alerts: - CodeQL go/incomplete-url-scheme-check (high): crawler's denylist only covered mailto:/javascript: — data:, vbscript:, tel:, file:, blob: could slip through. Replaced with an http(s)-only allow-list on the parsed URL scheme (case-insensitive). Added table-driven tests. - SonarCloud go:S2612 (x2): hookinstaller wrote config with 0o644 and hook scripts with 0o755 — both world-readable. Tightened to 0o600 / 0o700 since these files live in the user's own ~/.claude dir and only the owner needs access. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/crawler/crawler.go | 12 ++++++++-- internal/crawler/crawler_test.go | 34 +++++++++++++++++++++++++++++ internal/hookinstaller/installer.go | 14 ++++++------ 3 files changed, 51 insertions(+), 9 deletions(-) create mode 100644 internal/crawler/crawler_test.go diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go index 785b097..bb04cdd 100644 --- a/internal/crawler/crawler.go +++ b/internal/crawler/crawler.go @@ -274,8 +274,7 @@ func extractLinks(client *http.Client, pageURL string, base *url.URL) []string { } func resolveURL(base, href string) string { - if strings.HasPrefix(href, "#") || strings.HasPrefix(href, "mailto:") || - strings.HasPrefix(href, "javascript:") { + if strings.HasPrefix(href, "#") { return "" } b, err := url.Parse(base) @@ -286,6 +285,15 @@ func resolveURL(base, href string) string { if err != nil { return "" } + // Reject any href with an explicit scheme other than http/https. + // This allow-list covers mailto:, javascript:, data:, vbscript:, + // tel:, file:, blob:, and anything else we don't crawl. + if h.Scheme != "" { + s := strings.ToLower(h.Scheme) + if s != "http" && s != "https" { + return "" + } + } resolved := b.ResolveReference(h) resolved.Fragment = "" resolved.RawQuery = "" diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go new file mode 100644 index 0000000..6b614fd --- /dev/null +++ b/internal/crawler/crawler_test.go @@ -0,0 +1,34 @@ +package crawler + +import "testing" + +func TestResolveURL_SchemeAllowList(t *testing.T) { + const base = "https://example.com/docs/" + cases := []struct { + name string + href string + want string + }{ + {"relative path", "guide.html", "https://example.com/docs/guide.html"}, + {"absolute http", "http://example.com/x", "http://example.com/x"}, + {"absolute https", "https://example.com/x", "https://example.com/x"}, + {"fragment only", "#anchor", ""}, + {"mailto", "mailto:a@b.c", ""}, + {"javascript", "javascript:alert(1)", ""}, + {"javascript case", "JavaScript:alert(1)", ""}, + {"data uri", "data:text/html,", ""}, + {"vbscript", "vbscript:msgbox(1)", ""}, + {"tel", "tel:+15555555555", ""}, + {"file", "file:///etc/passwd", ""}, + {"blob", "blob:https://example.com/abc", ""}, + {"ftp", "ftp://example.com/file", ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := resolveURL(base, tc.href) + if got != tc.want { + t.Errorf("resolveURL(%q, %q) = %q, want %q", base, tc.href, got, tc.want) + } + }) + } +} diff --git a/internal/hookinstaller/installer.go b/internal/hookinstaller/installer.go index 2bc3714..23aca52 100644 --- a/internal/hookinstaller/installer.go +++ b/internal/hookinstaller/installer.go @@ -131,8 +131,8 @@ func writeJSONMapAtomic(path string, data map[string]json.RawMessage) error { cleanup() return fmt.Errorf("close temp: %w", err) } - // 0o644 — standard config perms. Writable by user only. - if err := os.Chmod(tmpPath, 0o644); err != nil { + // 0o600 — user-only read/write. No group/world bits. + if err := os.Chmod(tmpPath, 0o600); err != nil { cleanup() return fmt.Errorf("chmod temp: %w", err) } @@ -163,19 +163,19 @@ func validateHookPath(p string) error { return nil } -// ExtractHookScript writes the embedded hook.sh to dest with 0o755 perms. +// ExtractHookScript writes the embedded hook.sh to dest with 0o700 perms. // Creates the parent directory if missing. Overwrites existing files — // callers that want to preserve a user-modified hook should check first. func ExtractHookScript(dest string) error { - if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil { + if err := os.MkdirAll(filepath.Dir(dest), 0o700); err != nil { return fmt.Errorf("mkdir %s: %w", filepath.Dir(dest), err) } - if err := os.WriteFile(dest, HookScript, 0o755); err != nil { + if err := os.WriteFile(dest, HookScript, 0o700); err != nil { return fmt.Errorf("write %s: %w", dest, err) } // WriteFile masks perms through umask on create; reset explicitly so - // the resulting file is reliably executable. - if err := os.Chmod(dest, 0o755); err != nil { + // the resulting file is reliably executable for the owner only. + if err := os.Chmod(dest, 0o700); err != nil { return fmt.Errorf("chmod %s: %w", dest, err) } return nil From 64f260bbde2125abed93ba9b931071f329f8d006 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Thu, 23 Apr 2026 00:08:01 +0000 Subject: [PATCH 2/2] ci: add tier-1 fuzz smoke targets (crawler + chunker) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - FuzzResolveURL — found and fixed a second edge case in resolveURL: non-http(s) base URLs could escape the scheme allow-list via ResolveReference. Added belt-and-braces scheme check on the resolved URL. 1.76M executions clean at 30s/target. - FuzzChunker — exercises text splitter on UTF-8 edges, zero bytes, CRLF, large inputs, and random (size, overlap) pairs. - .github/workflows/fuzz.yml — runs each target for 30s on every PR + main push. Scorecard's Fuzzing check detects Go native fuzz targets. Not a replacement for continuous fuzzing; see OSS-Fuzz tier-2 for that. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/fuzz.yml | 40 ++++++++++++++++ internal/chunker/chunker_fuzz_test.go | 40 ++++++++++++++++ internal/crawler/crawler.go | 6 +++ internal/crawler/crawler_fuzz_test.go | 48 +++++++++++++++++++ .../fuzz/FuzzResolveURL/e1f049c94ccc2753 | 3 ++ 5 files changed, 137 insertions(+) create mode 100644 .github/workflows/fuzz.yml create mode 100644 internal/chunker/chunker_fuzz_test.go create mode 100644 internal/crawler/crawler_fuzz_test.go create mode 100644 internal/crawler/testdata/fuzz/FuzzResolveURL/e1f049c94ccc2753 diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml new file mode 100644 index 0000000..75ec537 --- /dev/null +++ b/.github/workflows/fuzz.yml @@ -0,0 +1,40 @@ +name: fuzz (smoke) + +# Short, bounded fuzz smoke test on every PR + push. Not a replacement +# for continuous fuzzing (OSS-Fuzz Tier 2) — just catches obvious +# regressions before merge. +on: + push: + branches: [main] + pull_request: + +permissions: read-all + +jobs: + fuzz: + name: go fuzz smoke + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 + with: + go-version-file: go.mod + + - name: fuzz targets (30s each) + run: | + set -eu + targets=( + "./internal/crawler::FuzzResolveURL" + "./internal/chunker::FuzzChunker" + ) + for entry in "${targets[@]}"; do + pkg="${entry%%::*}" + fn="${entry##*::}" + echo "::group::fuzz $pkg $fn" + CGO_ENABLED=1 go test -tags sqlite_fts5 \ + -run=^$ -fuzz="^${fn}$" -fuzztime=30s "$pkg" + echo "::endgroup::" + done diff --git a/internal/chunker/chunker_fuzz_test.go b/internal/chunker/chunker_fuzz_test.go new file mode 100644 index 0000000..c4fc356 --- /dev/null +++ b/internal/chunker/chunker_fuzz_test.go @@ -0,0 +1,40 @@ +package chunker + +import ( + "strings" + "testing" +) + +func FuzzChunker(f *testing.F) { + seeds := []string{ + "", + "hello world", + "one two three four five", + "paragraph one.\n\nparagraph two.", + strings.Repeat("a", 2048), + "\x00\x00\x00", + "unicode: 你好 世界 𝕌𝕟𝕚𝕔𝕠𝕕𝕖", + strings.Repeat("word ", 1024), + "line\nline\nline\nline", + "mixed\r\nwindows\r\nendings", + } + for _, s := range seeds { + f.Add(s, 256, 32) + } + + f.Fuzz(func(t *testing.T, text string, size, overlap int) { + if size <= 0 || size > 4096 || overlap < 0 || overlap >= size { + t.Skip() + } + c := New(size, overlap) + chunks := c.Split(text) + for i, ch := range chunks { + if ch.Index != i { + t.Fatalf("chunk %d has wrong Index=%d", i, ch.Index) + } + if ch.Tokens < 0 { + t.Fatalf("chunk %d negative token count: %d", i, ch.Tokens) + } + } + }) +} diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go index bb04cdd..d46529b 100644 --- a/internal/crawler/crawler.go +++ b/internal/crawler/crawler.go @@ -295,6 +295,12 @@ func resolveURL(base, href string) string { } } resolved := b.ResolveReference(h) + // Belt-and-braces: also reject if the base URL had a non-http(s) + // scheme. The crawler only receives http(s) base URLs in production, + // but fuzzing proved resolveURL itself must enforce the invariant. + if rs := strings.ToLower(resolved.Scheme); rs != "http" && rs != "https" { + return "" + } resolved.Fragment = "" resolved.RawQuery = "" return resolved.String() diff --git a/internal/crawler/crawler_fuzz_test.go b/internal/crawler/crawler_fuzz_test.go new file mode 100644 index 0000000..0ad44ae --- /dev/null +++ b/internal/crawler/crawler_fuzz_test.go @@ -0,0 +1,48 @@ +package crawler + +import ( + "net/url" + "strings" + "testing" +) + +func FuzzResolveURL(f *testing.F) { + seeds := []struct { + base, href string + }{ + {"https://example.com/", ""}, + {"https://example.com/docs/", "guide.html"}, + {"https://example.com/", "#anchor"}, + {"https://example.com/", "mailto:a@b.c"}, + {"https://example.com/", "javascript:alert(1)"}, + {"https://example.com/", "JavaScript:alert(1)"}, + {"https://example.com/", "data:text/html,"}, + {"https://example.com/", "vbscript:msgbox(1)"}, + {"https://example.com/", "tel:+15555555555"}, + {"https://example.com/", "file:///etc/passwd"}, + {"https://example.com/", "//evil.com/x"}, + {"https://example.com/", "http://example.com/%"}, + {"https://example.com/", strings.Repeat("a", 4096)}, + } + for _, s := range seeds { + f.Add(s.base, s.href) + } + + f.Fuzz(func(t *testing.T, base, href string) { + got := resolveURL(base, href) + if got == "" { + return + } + // Any non-empty result MUST be a parseable http/https URL. + u, err := url.Parse(got) + if err != nil { + t.Fatalf("resolveURL returned unparseable URL: base=%q href=%q got=%q err=%v", + base, href, got, err) + } + scheme := strings.ToLower(u.Scheme) + if scheme != "http" && scheme != "https" { + t.Fatalf("resolveURL returned non-http(s) scheme: base=%q href=%q got=%q scheme=%q", + base, href, got, scheme) + } + }) +} diff --git a/internal/crawler/testdata/fuzz/FuzzResolveURL/e1f049c94ccc2753 b/internal/crawler/testdata/fuzz/FuzzResolveURL/e1f049c94ccc2753 new file mode 100644 index 0000000..37ad38b --- /dev/null +++ b/internal/crawler/testdata/fuzz/FuzzResolveURL/e1f049c94ccc2753 @@ -0,0 +1,3 @@ +go test fuzz v1 +string("aaaaa:0000") +string("0")