Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .github/workflows/fuzz.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
name: fuzz (smoke)

# Short, bounded fuzz smoke test on every PR + push. Not a replacement
# for continuous fuzzing (OSS-Fuzz Tier 2) — just catches obvious
# regressions before merge.
on:
  push:
    branches: [main]
  pull_request:

# Workflow-wide default: read-only token. The job narrows it further below.
permissions: read-all

jobs:
  fuzz:
    name: go fuzz smoke
    runs-on: ubuntu-latest
    # -fuzztime only bounds the fuzzing phase of each target. Cap the whole
    # job so a wedged build or worker cannot hold a runner until the
    # platform default timeout.
    timeout-minutes: 15
    permissions:
      contents: read
    steps:
      # Actions are pinned to full commit SHAs; the trailing comment records
      # the release tag the SHA corresponds to.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6

      - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6
        with:
          go-version-file: go.mod

      - name: fuzz targets (30s each)
        # The script uses bash arrays and parameter expansion, so request
        # bash explicitly instead of relying on the runner default.
        shell: bash
        run: |
          set -eu
          # Each entry is "<package path>::<fuzz function name>".
          targets=(
            "./internal/crawler::FuzzResolveURL"
            "./internal/chunker::FuzzChunker"
          )
          for entry in "${targets[@]}"; do
            pkg="${entry%%::*}"
            fn="${entry##*::}"
            echo "::group::fuzz $pkg $fn"
            # -run=^$ skips regular tests; -fuzz is anchored so exactly one
            # fuzz target in the package is selected.
            CGO_ENABLED=1 go test -tags sqlite_fts5 \
              -run=^$ -fuzz="^${fn}$" -fuzztime=30s "$pkg"
            echo "::endgroup::"
          done
40 changes: 40 additions & 0 deletions internal/chunker/chunker_fuzz_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package chunker

import (
"strings"
"testing"
)

// FuzzChunker feeds arbitrary text and (size, overlap) pairs to the chunker
// and checks two structural invariants on every returned chunk: its Index
// equals its position in the result, and its Tokens count is not negative.
//
// Inputs with size outside 1..4096, or overlap outside 0..size-1, are
// skipped rather than treated as failures.
func FuzzChunker(f *testing.F) {
	// Every seed text is registered with the same size/overlap pair.
	const (
		seedSize    = 256
		seedOverlap = 32
	)

	for _, seedText := range []string{
		"",
		"hello world",
		"one two three four five",
		"paragraph one.\n\nparagraph two.",
		strings.Repeat("a", 2048),
		"\x00\x00\x00",
		"unicode: 你好 世界 𝕌𝕟𝕚𝕔𝕠𝕕𝕖",
		strings.Repeat("word ", 1024),
		"line\nline\nline\nline",
		"mixed\r\nwindows\r\nendings",
	} {
		f.Add(seedText, seedSize, seedOverlap)
	}

	f.Fuzz(func(t *testing.T, text string, size, overlap int) {
		sizeOK := size > 0 && size <= 4096
		overlapOK := overlap >= 0 && overlap < size
		if !sizeOK || !overlapOK {
			t.Skip()
		}

		splitter := New(size, overlap)
		for pos, piece := range splitter.Split(text) {
			switch {
			case piece.Index != pos:
				t.Fatalf("chunk %d has wrong Index=%d", pos, piece.Index)
			case piece.Tokens < 0:
				t.Fatalf("chunk %d negative token count: %d", pos, piece.Tokens)
			}
		}
	})
}
6 changes: 6 additions & 0 deletions internal/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,12 @@ func resolveURL(base, href string) string {
}
}
resolved := b.ResolveReference(h)
// Belt-and-braces: also reject if the base URL had a non-http(s)
// scheme. The crawler only receives http(s) base URLs in production,
// but fuzzing proved resolveURL itself must enforce the invariant.
if rs := strings.ToLower(resolved.Scheme); rs != "http" && rs != "https" {
return ""
}
resolved.Fragment = ""
resolved.RawQuery = ""
return resolved.String()
Expand Down
48 changes: 48 additions & 0 deletions internal/crawler/crawler_fuzz_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package crawler

import (
"net/url"
"strings"
"testing"
)

// FuzzResolveURL checks the output contract of resolveURL: for any
// (base, href) pair the result is either the empty string or a string that
// url.Parse accepts and whose scheme is http or https.
//
// The seed corpus covers relative links, fragment-only links, non-web href
// schemes (mailto, javascript, data, vbscript, tel, file), protocol-relative
// links, a malformed percent escape, a very long href, and base URLs whose
// own scheme is not http(s).
func FuzzResolveURL(f *testing.F) {
	seeds := []struct {
		base, href string
	}{
		{"https://example.com/", ""},
		{"https://example.com/docs/", "guide.html"},
		{"https://example.com/", "#anchor"},
		{"https://example.com/", "mailto:a@b.c"},
		{"https://example.com/", "javascript:alert(1)"},
		{"https://example.com/", "JavaScript:alert(1)"},
		{"https://example.com/", "data:text/html,<script>alert(1)</script>"},
		{"https://example.com/", "vbscript:msgbox(1)"},
		{"https://example.com/", "tel:+15555555555"},
		{"https://example.com/", "file:///etc/passwd"},
		{"https://example.com/", "//evil.com/x"},
		{"https://example.com/", "http://example.com/%"},
		{"https://example.com/", strings.Repeat("a", 4096)},
		// Non-http(s) base with a relative href: the resolved URL inherits
		// the base scheme, so the scheme check on href alone is not enough.
		// Keeping these in code exercises that class on every plain
		// `go test` run.
		{"aaaaa:0000", "0"},
		{"ftp://example.com/", "guide.html"},
		{"file:///etc/", "passwd"},
	}
	for _, s := range seeds {
		f.Add(s.base, s.href)
	}

	f.Fuzz(func(t *testing.T, base, href string) {
		got := resolveURL(base, href)
		if got == "" {
			// Empty means "rejected"; nothing further to verify.
			return
		}
		// Any non-empty result MUST be a parseable http/https URL.
		u, err := url.Parse(got)
		if err != nil {
			t.Fatalf("resolveURL returned unparseable URL: base=%q href=%q got=%q err=%v",
				base, href, got, err)
		}
		scheme := strings.ToLower(u.Scheme)
		if scheme != "http" && scheme != "https" {
			t.Fatalf("resolveURL returned non-http(s) scheme: base=%q href=%q got=%q scheme=%q",
				base, href, got, scheme)
		}
	})
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
go test fuzz v1
string("aaaaa:0000")
string("0")
Loading