diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
new file mode 100644
index 0000000..75ec537
--- /dev/null
+++ b/.github/workflows/fuzz.yml
@@ -0,0 +1,40 @@
+name: fuzz (smoke)
+
+# Short, bounded fuzz smoke test on every PR + push. Not a replacement
+# for continuous fuzzing (OSS-Fuzz Tier 2) — just catches obvious
+# regressions before merge.
+on:
+  push:
+    branches: [main] # pushes are only fuzzed on main
+  pull_request: # no branch filter is configured for pull requests
+
+permissions: read-all # workflow-level default for the job token
+
+jobs:
+  fuzz:
+    name: go fuzz smoke
+    runs-on: ubuntu-latest
+    permissions: # job-level block; only contents: read is requested here
+      contents: read
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+
+      - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6
+        with:
+          go-version-file: go.mod # Go version is taken from go.mod rather than pinned here
+
+      - name: fuzz targets (30s each) # each targets entry below is "<package>::<fuzz function>"
+        run: |
+          set -eu
+          targets=(
+            "./internal/crawler::FuzzResolveURL"
+            "./internal/chunker::FuzzChunker"
+          )
+          for entry in "${targets[@]}"; do
+            pkg="${entry%%::*}"
+            fn="${entry##*::}"
+            echo "::group::fuzz $pkg $fn"
+            CGO_ENABLED=1 go test -tags sqlite_fts5 \
+              -run=^$ -fuzz="^${fn}$" -fuzztime=30s "$pkg"
+            echo "::endgroup::"
+          done
diff --git a/internal/chunker/chunker_fuzz_test.go b/internal/chunker/chunker_fuzz_test.go
new file mode 100644
index 0000000..c4fc356
--- /dev/null
+++ b/internal/chunker/chunker_fuzz_test.go
@@ -0,0 +1,40 @@
+package chunker
+
+import (
+	"strings"
+	"testing"
+)
+
+func FuzzChunker(f *testing.F) { // FuzzChunker checks that Split yields chunks with sequential Index values and non-negative Tokens.
+	seeds := []string{
+		"",
+		"hello world",
+		"one two three four five",
+		"paragraph one.\n\nparagraph two.",
+		strings.Repeat("a", 2048),
+		"\x00\x00\x00",
+		"unicode: 你好 世界 𝕌𝕟𝕚𝕔𝕠𝕕𝕖",
+		strings.Repeat("word ", 1024),
+		"line\nline\nline\nline",
+		"mixed\r\nwindows\r\nendings",
+	}
+	for _, s := range seeds {
+		f.Add(s, 256, 32) // every seed text starts from size=256, overlap=32
+	}
+
+	f.Fuzz(func(t *testing.T, text string, size, overlap int) {
+		if size <= 0 || size > 4096 || overlap < 0 || overlap >= size { // only exercise 0 < size <= 4096 with 0 <= overlap < size
+			t.Skip()
+		}
+		c := New(size, overlap)
+		chunks := c.Split(text)
+		for i, ch := range chunks {
+			if ch.Index != i { // Index must equal the chunk's position in the returned slice
+				t.Fatalf("chunk %d has wrong Index=%d", i, ch.Index)
+			}
+			if ch.Tokens < 0 {
+				t.Fatalf("chunk %d negative token count: %d", i, ch.Tokens)
+			}
+		}
+	})
+}
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index bb04cdd..d46529b 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -295,6 +295,12 @@ func resolveURL(base, href string) string {
 		}
 	}
 	resolved := b.ResolveReference(h)
+	// Belt-and-braces: also reject if the base URL had a non-http(s)
+	// scheme. The crawler only receives http(s) base URLs in production,
+	// but fuzzing proved resolveURL itself must enforce the invariant.
+	if rs := strings.ToLower(resolved.Scheme); rs != "http" && rs != "https" {
+		return ""
+	}
 	resolved.Fragment = ""
 	resolved.RawQuery = ""
 	return resolved.String()
diff --git a/internal/crawler/crawler_fuzz_test.go b/internal/crawler/crawler_fuzz_test.go
new file mode 100644
index 0000000..0ad44ae
--- /dev/null
+++ b/internal/crawler/crawler_fuzz_test.go
@@ -0,0 +1,48 @@
+package crawler
+
+import (
+	"net/url"
+	"strings"
+	"testing"
+)
+
+func FuzzResolveURL(f *testing.F) { // FuzzResolveURL asserts that every non-empty resolveURL result parses and carries an http or https scheme.
+	seeds := []struct {
+		base, href string
+	}{
+		{"https://example.com/", ""},
+		{"https://example.com/docs/", "guide.html"},
+		{"https://example.com/", "#anchor"},
+		{"https://example.com/", "mailto:a@b.c"},
+		{"https://example.com/", "javascript:alert(1)"},
+		{"https://example.com/", "JavaScript:alert(1)"},
+		{"https://example.com/", "data:text/html,"},
+		{"https://example.com/", "vbscript:msgbox(1)"},
+		{"https://example.com/", "tel:+15555555555"},
+		{"https://example.com/", "file:///etc/passwd"},
+		{"https://example.com/", "//evil.com/x"},
+		{"https://example.com/", "http://example.com/%"},
+		{"https://example.com/", strings.Repeat("a", 4096)},
+	}
+	for _, s := range seeds {
+		f.Add(s.base, s.href)
+	}
+
+	f.Fuzz(func(t *testing.T, base, href string) {
+		got := resolveURL(base, href)
+		if got == "" { // an empty result is always acceptable; there is nothing further to validate
+			return
+		}
+		// Any non-empty result MUST be a parseable http/https URL.
+		u, err := url.Parse(got)
+		if err != nil {
+			t.Fatalf("resolveURL returned unparseable URL: base=%q href=%q got=%q err=%v",
+				base, href, got, err)
+		}
+		scheme := strings.ToLower(u.Scheme) // compare the scheme case-insensitively
+		if scheme != "http" && scheme != "https" {
+			t.Fatalf("resolveURL returned non-http(s) scheme: base=%q href=%q got=%q scheme=%q",
+				base, href, got, scheme)
+		}
+	})
+}
diff --git a/internal/crawler/testdata/fuzz/FuzzResolveURL/e1f049c94ccc2753 b/internal/crawler/testdata/fuzz/FuzzResolveURL/e1f049c94ccc2753
new file mode 100644
index 0000000..37ad38b
--- /dev/null
+++ b/internal/crawler/testdata/fuzz/FuzzResolveURL/e1f049c94ccc2753
@@ -0,0 +1,3 @@
+go test fuzz v1
+string("aaaaa:0000")
+string("0")