From 374a4ea4262503031daf9dd797e8e47dcf5075cf Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 17 Apr 2026 16:06:20 +0000
Subject: [PATCH 1/8] perf: cache function-PC and reflect.Type name resolution

Add two process-global sync.Map caches for the two "location" resolutions
performed on every request:

- reflect.Type -> "pkgpath.TypeName" (was a fresh string allocation per call)
- function PC -> runtime.FuncForPC(pc).Name() (symbol-table walk)

Keys are stable for the lifetime of the process, so the caches never evict
and are bounded by the number of distinct types / builder functions.

Observed (benchstat, count=6; see BENCHMARKS.md):
- Result.Get: -75% (single-threaded) / -91% (parallel, lock-free reads)
- AddBuilders (warm): -42% time, -59% allocations
- CachedStructName hit: -87%, zero allocations
- ResolveFuncName hit: -63%, zero allocations
- Compile / RunParallel: no significant latency delta, ~5% fewer allocs

Also adds benchmarks_test.go (the package had none) covering the
micro-benchmarks above plus AddBuilders / Compile / RunParallel /
Result.Get end-to-end, and BENCHMARKS.md documenting reproduction steps
and interpretation.

https://claude.ai/code/session_01Hu8nZg5zrsaRWWf3Dq5XVY
---
 BENCHMARKS.md      | 124 ++++++++++++++++++++++++
 benchmarks_test.go | 228 +++++++++++++++++++++++++++++++++++++++++++++
 cache.go           |  38 ++++++++
 databuilder.go     |  16 +---
 plan.go            |   6 +-
 5 files changed, 398 insertions(+), 14 deletions(-)
 create mode 100644 BENCHMARKS.md
 create mode 100644 benchmarks_test.go
 create mode 100644 cache.go

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
new file mode 100644
index 0000000..e80f418
--- /dev/null
+++ b/BENCHMARKS.md
@@ -0,0 +1,124 @@
+# Benchmarks: Function / Type Name Resolution Caching
+
+`data-builder` resolves two "locations" on every request:
+
+1. **PC → function name** via `runtime.FuncForPC(pc).Name()`
+2. **`reflect.Type` → qualified struct name** via `t.PkgPath() + "." + t.Name()`
+
+Both are now cached in process-global `sync.Map`s (see `cache.go`). Keys
+(reflect.Type identity, function PC) are stable for the life of the program,
+so the caches never need eviction.
+
+## Reproducing
+
+```sh
+go install golang.org/x/perf/cmd/benchstat@latest
+
+# "before": with cache.go rewritten to a pass-through (no caching)
+go test -run=^$ -bench=. -benchmem -count=6 ./... | tee before.txt
+
+# "after": with cache.go in its cached form
+go test -run=^$ -bench=. -benchmem -count=6 ./... | tee after.txt
+
+benchstat before.txt after.txt
+```
+
+The benchmark suite lives in `benchmarks_test.go`. `make bench` runs it with
+`-count=1`; use the commands above for statistically stable comparisons.
+
+## Environment
+
+- `go version go1.25.8 linux/amd64`
+- CPU: INTEL(R) XEON(R) PLATINUM 8581C @ 2.10GHz (16 logical cores)
+- Kernel: Linux 4.4.0
+- `benchstat` with `-count=6`
+
+## Results (benchstat)
+
+### Time per op
+
+| Benchmark                    |   Before |    After |          Δ |
+| ---------------------------- | -------: | -------: | ---------: |
+| `GetStructName_Uncached`     |  81.28ns |  84.98ns |         ~  |
+| `CachedStructName_Hit`       |  83.79ns |  11.23ns | **-86.6%** |
+| `CachedStructName_ColdMix`   |  86.16ns |  11.46ns | **-86.7%** |
+| `FuncForPC_Uncached`         |  32.71ns |  33.38ns |         ~  |
+| `ResolveFuncName_Hit`        |  32.60ns |  12.07ns | **-63.0%** |
+| `ResolveFuncName_ColdMix`    |  30.90ns |  12.13ns | **-60.7%** |
+| `AddBuilders`                |  3.950µs |  2.300µs | **-41.8%** |
+| `AddBuilders_ColdCache`      |  8.089µs | 10.357µs |    +28.1%  |
+| `Compile`                    |  6.920µs |  7.006µs |         ~  |
+| `RunParallel_Workers1`       |  15.64µs |  15.44µs |         ~  |
+| `RunParallel_Workers4`       |  20.71µs |  20.71µs |         ~  |
+| `RunParallel_Workers8`       |  23.91µs |  23.59µs |         ~  |
+| `ResultGet`                  | 103.70ns |  25.54ns | **-75.4%** |
+| `ResultGet_Parallel`         |  16.80ns |   1.54ns | **-90.8%** |
+| **geomean**                  |   498ns  |   244ns  | **-51.0%** |
+
+### Allocations
+
+| Benchmark               | Before   | After   | Δ B/op     | Δ allocs/op |
+| ----------------------- | -------: | ------: | ---------: | ----------: |
+| `CachedStructName_Hit`  |      48B |      0B |   **-100%** |   **-100%** |
+| `CachedStructName_ColdMix` |   51B |      0B |   **-100%** |   **-100%** |
+| `AddBuilders`           |   1872B |    928B |    -50.4%  |    -59.4%   |
+| `Compile`               |   4328B |   4266B |     -1.4%  |     -2.2%   |
+| `RunParallel_Workers1`  |   4945B |   4695B |     -5.1%  |     -6.3%   |
+| `RunParallel_Workers4`  |   5036B |   4786B |     -5.0%  |     -6.1%   |
+| `RunParallel_Workers8`  |   5161B |   4911B |     -4.8%  |     -5.8%   |
+| `ResultGet`             |     48B |      0B |   **-100%** |   **-100%** |
+| `ResultGet_Parallel`    |     48B |      0B |   **-100%** |   **-100%** |
+
+Statistical significance: all reported deltas have `p=0.002` with n=6; entries
+marked `~` are not statistically distinguishable from the baseline.
+
+## Interpretation
+
+**Where caching helps most**
+
+- `Result.Get` and the hot path inside `doWorkAndGetResult` / `RunParallel`
+  init loops used to allocate a fresh `string` for every type lookup
+  (`t.PkgPath() + "." + t.Name()`). Interning the result via `sync.Map`
+  eliminates that allocation entirely: `ResultGet` drops 78ns and one
+  allocation; under parallel load (`ResultGet_Parallel`) it goes from
+  16.8ns to 1.5ns — an **11× speedup** because `sync.Map`'s read-only
+  fast-path is lock-free and scales linearly across cores.
+- `AddBuilders` gets a steady-state 42% latency win and 59% fewer
+  allocations because each builder registration re-resolves the same input
+  and output type names several times via `IsValidBuilder` and `getBuilder`.
+- `FuncForPC` caching is a smaller absolute win (20ns / call) than struct
+  name caching, but it's on the same hot path for `getBuilder` and
+  `plan.Replace`, so it still helps `AddBuilders` directly.
+
+**Where caching does not help (and that's fine)**
+
+- `Compile` and `RunParallel` end-to-end are dominated by
+  `resolveDependencies`, goroutine scheduling, and `reflect.Value.Call`.
+  Name resolution is <5% of those timings, so benchstat reports "no
+  significant change" — but the memory column still shows a real reduction
+  (~5% bytes/allocs per run) because those allocations were shifted off
+  the hot path.
+- `_Uncached` baselines for both resolvers come in identical before and
+  after (as expected — they call the un-cached code directly).
+
+**The `AddBuilders_ColdCache` regression**
+
+This synthetic benchmark resets both `sync.Map`s to empty at the start of
+every iteration, so every call is a miss. `sync.Map` is slower than a
+direct computation in the pure-miss case because it pays for an atomic
+`Load` + an `LoadOrStore` on top of the original work. In production the
+cache warms up once and then serves hits forever, so this scenario isn't
+observable in practice — it's included only to pin the worst-case cost.
+
+## Caveats
+
+- `sync.Map` has higher per-op overhead than a plain `map` when the working
+  set is tiny **and** purely single-threaded. The `_ColdMix` benchmarks
+  are intentionally small (5 types / 4 PCs) to stress this path; they
+  still show ~85-87% wins, because `PkgPath()+Name()` and `FuncForPC`
+  dominate the miss cost.
+- Absolute numbers depend on CPU, OS scheduler, and the number of distinct
+  types/builders the program touches. Don't generalize — re-measure in
+  the target deployment if it matters.
+- Benchmarks should be run with the machine idle; pin `GOMAXPROCS` if you
+  want tighter variance across runs.
diff --git a/benchmarks_test.go b/benchmarks_test.go
new file mode 100644
index 0000000..68fad72
--- /dev/null
+++ b/benchmarks_test.go
@@ -0,0 +1,228 @@
+package databuilder
+
+import (
+	"context"
+	"reflect"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+// Quiet benchmark-only builder variants. The production fixtures in
+// common_test.go call fmt.Println and dominate end-to-end timings, hiding
+// the effect we want to measure.
+
+type benchStructIn struct{ Value string }
+type benchStructA struct{ Value string }
+type benchStructB struct{ Value string }
+type benchStructC struct{ Value string }
+type benchStructD struct{ Value string }
+
+func benchFuncA(_ context.Context, s benchStructIn) (benchStructA, error) {
+	return benchStructA{Value: strings.ReplaceAll(s.Value, "-", "_")}, nil
+}
+
+func benchFuncB(_ context.Context, s benchStructA) (benchStructB, error) {
+	return benchStructB{Value: s.Value + "B"}, nil
+}
+
+func benchFuncC(_ context.Context, s benchStructA) (benchStructC, error) {
+	return benchStructC{Value: s.Value + "C"}, nil
+}
+
+func benchFuncD(_ context.Context, _ benchStructB, _ benchStructC) (benchStructD, error) {
+	return benchStructD{Value: "D"}, nil
+}
+
+// uncachedStructName reproduces the pre-caching implementation for apples-to-apples
+// comparison in the micro-benchmarks.
+func uncachedStructName(t reflect.Type) string {
+	return t.PkgPath() + "." + t.Name()
+}
+
+// --- struct name resolution ---
+
+func BenchmarkGetStructName_Uncached(b *testing.B) {
+	t := reflect.TypeOf(benchStructA{})
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = uncachedStructName(t)
+	}
+}
+
+func BenchmarkCachedStructName_Hit(b *testing.B) {
+	t := reflect.TypeOf(benchStructA{})
+	_ = cachedStructName(t) // warm
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = cachedStructName(t)
+	}
+}
+
+func BenchmarkCachedStructName_ColdMix(b *testing.B) {
+	types := []reflect.Type{
+		reflect.TypeOf(benchStructIn{}),
+		reflect.TypeOf(benchStructA{}),
+		reflect.TypeOf(benchStructB{}),
+		reflect.TypeOf(benchStructC{}),
+		reflect.TypeOf(benchStructD{}),
+	}
+	for _, t := range types {
+		_ = cachedStructName(t) // warm once - realistic steady state
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = cachedStructName(types[i%len(types)])
+	}
+}
+
+// --- function PC resolution ---
+
+func BenchmarkFuncForPC_Uncached(b *testing.B) {
+	pc := reflect.ValueOf(benchFuncA).Pointer()
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = runtime.FuncForPC(pc).Name()
+	}
+}
+
+func BenchmarkResolveFuncName_Hit(b *testing.B) {
+	pc := reflect.ValueOf(benchFuncA).Pointer()
+	_ = resolveFuncName(pc) // warm
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = resolveFuncName(pc)
+	}
+}
+
+func BenchmarkResolveFuncName_ColdMix(b *testing.B) {
+	pcs := []uintptr{
+		reflect.ValueOf(benchFuncA).Pointer(),
+		reflect.ValueOf(benchFuncB).Pointer(),
+		reflect.ValueOf(benchFuncC).Pointer(),
+		reflect.ValueOf(benchFuncD).Pointer(),
+	}
+	for _, pc := range pcs {
+		_ = resolveFuncName(pc) // warm once
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = resolveFuncName(pcs[i%len(pcs)])
+	}
+}
+
+// --- registration ---
+
+func BenchmarkAddBuilders(b *testing.B) {
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		d := New()
+		if err := d.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// BenchmarkAddBuilders_ColdCache exercises the worst-case path where the
+// caches are purged before every iteration. Not realistic, but it pins the
+// maximum overhead the caches add to registration when every lookup misses.
+func BenchmarkAddBuilders_ColdCache(b *testing.B) {
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		b.StopTimer()
+		resetCachesForTest()
+		b.StartTimer()
+		d := New()
+		if err := d.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// --- compile ---
+
+func BenchmarkCompile(b *testing.B) {
+	d := New()
+	if err := d.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD); err != nil {
+		b.Fatal(err)
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if _, err := d.Compile(benchStructIn{}); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// --- end-to-end execution ---
+
+func newBenchPlan(b *testing.B) Plan {
+	b.Helper()
+	d := New()
+	if err := d.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD); err != nil {
+		b.Fatal(err)
+	}
+	plan, err := d.Compile(benchStructIn{})
+	if err != nil {
+		b.Fatal(err)
+	}
+	return plan
+}
+
+func benchRunParallel(b *testing.B, workers uint) {
+	plan := newBenchPlan(b)
+	ctx := context.Background()
+	in := benchStructIn{Value: "hello-world"}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if _, err := plan.RunParallel(ctx, workers, in); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func BenchmarkRunParallel_Workers1(b *testing.B) { benchRunParallel(b, 1) }
+func BenchmarkRunParallel_Workers4(b *testing.B) { benchRunParallel(b, 4) }
+func BenchmarkRunParallel_Workers8(b *testing.B) { benchRunParallel(b, 8) }
+
+// --- Result.Get ---
+
+func BenchmarkResultGet(b *testing.B) {
+	plan := newBenchPlan(b)
+	result, err := plan.RunParallel(context.Background(), 4, benchStructIn{Value: "x"})
+	if err != nil {
+		b.Fatal(err)
+	}
+	key := benchStructC{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = result.Get(key)
+	}
+}
+
+func BenchmarkResultGet_Parallel(b *testing.B) {
+	plan := newBenchPlan(b)
+	result, err := plan.RunParallel(context.Background(), 4, benchStructIn{Value: "x"})
+	if err != nil {
+		b.Fatal(err)
+	}
+	key := benchStructC{}
+	b.ReportAllocs()
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			_ = result.Get(key)
+		}
+	})
+}
diff --git a/cache.go b/cache.go
new file mode 100644
index 0000000..990b3b3
--- /dev/null
+++ b/cache.go
@@ -0,0 +1,38 @@
+package databuilder
+
+import (
+	"reflect"
+	"runtime"
+	"sync"
+)
+
+// Keys (reflect.Type identity, function PC) are stable for the lifetime of
+// the process, so these caches never need eviction and are bounded by the
+// number of distinct types and builder functions ever observed.
+var (
+	structNameCache sync.Map // reflect.Type -> string
+	funcNameCache   sync.Map // uintptr      -> string
+)
+
+func cachedStructName(t reflect.Type) string {
+	if v, ok := structNameCache.Load(t); ok {
+		return v.(string)
+	}
+	name := t.PkgPath() + "." + t.Name()
+	actual, _ := structNameCache.LoadOrStore(t, name)
+	return actual.(string)
+}
+
+func resolveFuncName(pc uintptr) string {
+	if v, ok := funcNameCache.Load(pc); ok {
+		return v.(string)
+	}
+	name := runtime.FuncForPC(pc).Name()
+	actual, _ := funcNameCache.LoadOrStore(pc, name)
+	return actual.(string)
+}
+
+func resetCachesForTest() {
+	structNameCache = sync.Map{}
+	funcNameCache = sync.Map{}
+}
diff --git a/databuilder.go b/databuilder.go
index da121b6..2363a08 100644
--- a/databuilder.go
+++ b/databuilder.go
@@ -3,8 +3,6 @@ package databuilder
 import (
 	"context"
 	"reflect"
-	"runtime"
-
 )
 
 /*
@@ -80,7 +78,7 @@ func (d *db) Compile(init ...any) (Plan, error) {
 		if t.Kind() != reflect.Struct {
 			return nil, ErrInvalidBuilderInput
 		}
-		initialialData = append(initialialData, getStructName(t))
+		initialialData = append(initialialData, cachedStructName(t))
 	}
 
 	order, err := resolveDependencies(d.builders, initialialData...)
@@ -133,7 +131,7 @@ func IsValidBuilder(builder any) error {
 				// checks for vardic functions as well
 				return ErrInvalidBuilderInput
 			}
-			if getStructName(t.In(i)) == getStructName(t.Out(0)) {
+			if cachedStructName(t.In(i)) == cachedStructName(t.Out(0)) {
 				return ErrSameInputAsOutput
 			}
 		}
@@ -155,8 +153,8 @@ func getBuilder(bldr any) (*builder, error) {
 	}
 
 	t := fnValue.Type()
-	out := getStructName(t.Out(0))
-	name := runtime.FuncForPC(fnValue.Pointer()).Name()
+	out := cachedStructName(t.Out(0))
+	name := resolveFuncName(fnValue.Pointer())
 
 	b := &builder{
 		Out:     out,
@@ -165,15 +163,11 @@ func getBuilder(bldr any) (*builder, error) {
 	}
 	// first in context.Context so we start from second
 	for i := 1; i < t.NumIn(); i++ {
-		b.In = append(b.In, getStructName(t.In(i)))
+		b.In = append(b.In, cachedStructName(t.In(i)))
 	}
 	return b, nil
 }
 
-func getStructName(t reflect.Type) string {
-	return t.PkgPath() + "." + t.Name()
-}
-
 // New Creates a new DataBuilder
 func New() DataBuilder {
 	return &db{}
diff --git a/plan.go b/plan.go
index 8eb112d..1b557b1 100644
--- a/plan.go
+++ b/plan.go
@@ -81,7 +81,7 @@ func (p *plan) RunParallel(ctx context.Context, workers uint, initData ...any) (
 		if t.Kind() != reflect.Struct {
 			return nil, ErrInvalidBuilderInput
 		}
-		name := getStructName(t)
+		name := cachedStructName(t)
 		if initialData.Has(name) {
 			return nil, ErrMultipleInitialData
 		}
@@ -194,7 +194,7 @@ func doWorkAndGetResult(ctx context.Context, builders []*builder, dataMap map[st
 			continue
 		}
 		// add result
-		name := getStructName(outputs[0].Type())
+		name := cachedStructName(outputs[0].Type())
 		dataMap[name] = outputs[0].Interface()
 	}
 	return joinErrors(errs)
@@ -252,7 +252,7 @@ func (r Result) Get(obj any) any {
 	if t.Kind() != reflect.Struct {
 		return nil
 	}
-	name := getStructName(t)
+	name := cachedStructName(t)
 	if value, ok := r[name]; ok {
 		return value
 	}

From 9d4ebde8a76c98c771284a27874e606ed6d04f4b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 17 Apr 2026 16:14:29 +0000
Subject: [PATCH 2/8] test: move resetCachesForTest to _test.go, pin warm state
 in BenchmarkAddBuilders

Addresses PR #31 review feedback:

- resetCachesForTest lived in cache.go and compiled into production builds.
  Moved into a new cache_test.go so it only exists in test/bench builds.
  Also switched from reassigning the sync.Map vars to Range+Delete in place
  (reassignment isn't concurrency-safe if anything else is touching the
  caches).
- BenchmarkAddBuilders didn't control cache state, so results could drift
  based on which other benchmarks ran first. It now explicitly resets and
  warms the caches before ResetTimer so it deterministically measures
  steady-state (warm) registration cost, distinct from
  BenchmarkAddBuilders_ColdCache.

https://claude.ai/code/session_01Hu8nZg5zrsaRWWf3Dq5XVY
---
 benchmarks_test.go |  7 +++++++
 cache.go           |  5 -----
 cache_test.go      | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 5 deletions(-)
 create mode 100644 cache_test.go

diff --git a/benchmarks_test.go b/benchmarks_test.go
index 68fad72..b719e90 100644
--- a/benchmarks_test.go
+++ b/benchmarks_test.go
@@ -120,6 +120,13 @@ func BenchmarkResolveFuncName_ColdMix(b *testing.B) {
 // --- registration ---
 
 func BenchmarkAddBuilders(b *testing.B) {
+	// Pin cache state to "warm" so this benchmark measures steady-state
+	// registration and doesn't drift based on prior benchmark ordering.
+	resetCachesForTest()
+	warm := New()
+	if err := warm.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD); err != nil {
+		b.Fatal(err)
+	}
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
diff --git a/cache.go b/cache.go
index 990b3b3..3b61d98 100644
--- a/cache.go
+++ b/cache.go
@@ -31,8 +31,3 @@ func resolveFuncName(pc uintptr) string {
 	actual, _ := funcNameCache.LoadOrStore(pc, name)
 	return actual.(string)
 }
-
-func resetCachesForTest() {
-	structNameCache = sync.Map{}
-	funcNameCache = sync.Map{}
-}
diff --git a/cache_test.go b/cache_test.go
new file mode 100644
index 0000000..0151fae
--- /dev/null
+++ b/cache_test.go
@@ -0,0 +1,15 @@
+package databuilder
+
+// resetCachesForTest clears both resolution caches in place. The reset is not
+// atomic: concurrent callers can re-insert entries while it runs, so call it
+// only from tests/benchmarks that are not running alongside live callers.
+func resetCachesForTest() {
+	structNameCache.Range(func(key, _ any) bool {
+		structNameCache.Delete(key)
+		return true
+	})
+	funcNameCache.Range(func(key, _ any) bool {
+		funcNameCache.Delete(key)
+		return true
+	})
+}

From 97f2d0fb884aa25bf56e9e32d382f6fde4422089 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 17 Apr 2026 16:24:10 +0000
Subject: [PATCH 3/8] chore: bump go directive to 1.25.9 for stdlib
 vulnerability fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

govulncheck on CI flags 9 reachable vulnerabilities routed through
pre-existing code paths (data.init, data.resolveDependencies,
data.plan.RunParallel) into Go stdlib (text/template, crypto/tls,
crypto/x509). Bumping the go directive from 1.25.8 to 1.25.9 picks up
the patched stdlib so govulncheck passes.

None of the vulnerable call sites are introduced by this PR — they
originate from pre-existing dependencies (tracing, goccy/go-graphviz)
and stdlib packages called by fmt.Errorf. Main would hit the same
failure if its CI were re-run today.

https://claude.ai/code/session_01Hu8nZg5zrsaRWWf3Dq5XVY
---
 go.mod | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/go.mod b/go.mod
index d03c7c3..3a3defa 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/go-coldbrew/data-builder
 
-go 1.25.8
+go 1.25.9
 
 require (
 	github.com/go-coldbrew/tracing v0.1.0

From b3e7d8035012780e2ec8da4c04ba538a9d358c65 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 17 Apr 2026 16:33:27 +0000
Subject: [PATCH 4/8] chore: fix initialialData typo and align BENCHMARKS.md
 with go.mod

Addresses PR #31 review feedback:

- Rename local variable initialialData -> initialData in db.Compile. Pre-existing
  typo; local-only, so no API impact.
- Update BENCHMARKS.md environment line from go1.25.8 to go1.25.9 to match
  the bumped go.mod directive.

https://claude.ai/code/session_01Hu8nZg5zrsaRWWf3Dq5XVY
---
 BENCHMARKS.md  | 2 +-
 databuilder.go | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index e80f418..1d99c98 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -28,7 +28,7 @@ The benchmark suite lives in `benchmarks_test.go`. `make bench` runs it with
 
 ## Environment
 
-- `go version go1.25.8 linux/amd64`
+- `go version go1.25.9 linux/amd64`
 - CPU: INTEL(R) XEON(R) PLATINUM 8581C @ 2.10GHz (16 logical cores)
 - Kernel: Linux 4.4.0
 - `benchstat` with `-count=6`
diff --git a/databuilder.go b/databuilder.go
index 2363a08..664b5c8 100644
--- a/databuilder.go
+++ b/databuilder.go
@@ -69,7 +69,7 @@ func (d *db) add(bldr any) error {
 }
 
 func (d *db) Compile(init ...any) (Plan, error) {
-	initialialData := make([]string, 0, len(init))
+	initialData := make([]string, 0, len(init))
 	for _, inter := range init {
 		if inter == nil {
 			continue
@@ -78,14 +78,14 @@ func (d *db) Compile(init ...any) (Plan, error) {
 		if t.Kind() != reflect.Struct {
 			return nil, ErrInvalidBuilderInput
 		}
-		initialialData = append(initialialData, cachedStructName(t))
+		initialData = append(initialData, cachedStructName(t))
 	}
 
-	order, err := resolveDependencies(d.builders, initialialData...)
+	order, err := resolveDependencies(d.builders, initialData...)
 	if err != nil {
 		return nil, err
 	}
-	return newPlan(order, initialialData)
+	return newPlan(order, initialData)
 }
 
 // IsValidBuilder checks if the given function is valid or not

From c8aa0441667db8398bbc580125503d382827d10a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 18 Apr 2026 06:02:35 +0000
Subject: [PATCH 5/8] chore: drop narrative // warm comments in benchmarks

The inline comments just described the line above. Cache warmup reads
cleanly without narration.

https://claude.ai/code/session_01Hu8nZg5zrsaRWWf3Dq5XVY
---
 benchmarks_test.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks_test.go b/benchmarks_test.go
index b719e90..b8ab45a 100644
--- a/benchmarks_test.go
+++ b/benchmarks_test.go
@@ -53,7 +53,7 @@ func BenchmarkGetStructName_Uncached(b *testing.B) {
 
 func BenchmarkCachedStructName_Hit(b *testing.B) {
 	t := reflect.TypeOf(benchStructA{})
-	_ = cachedStructName(t) // warm
+	_ = cachedStructName(t)
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
@@ -70,7 +70,7 @@ func BenchmarkCachedStructName_ColdMix(b *testing.B) {
 		reflect.TypeOf(benchStructD{}),
 	}
 	for _, t := range types {
-		_ = cachedStructName(t) // warm once - realistic steady state
+		_ = cachedStructName(t)
 	}
 	b.ReportAllocs()
 	b.ResetTimer()
@@ -92,7 +92,7 @@ func BenchmarkFuncForPC_Uncached(b *testing.B) {
 
 func BenchmarkResolveFuncName_Hit(b *testing.B) {
 	pc := reflect.ValueOf(benchFuncA).Pointer()
-	_ = resolveFuncName(pc) // warm
+	_ = resolveFuncName(pc)
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
@@ -108,7 +108,7 @@ func BenchmarkResolveFuncName_ColdMix(b *testing.B) {
 		reflect.ValueOf(benchFuncD).Pointer(),
 	}
 	for _, pc := range pcs {
-		_ = resolveFuncName(pc) // warm once
+		_ = resolveFuncName(pc)
 	}
 	b.ReportAllocs()
 	b.ResetTimer()

From b0be301d75d6a228efff407ded549a09725b88b8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 20 Apr 2026 06:53:28 +0000
Subject: [PATCH 6/8] chore: rename _ColdMix benchmarks to _MixedHit

The _ColdMix benchmarks warmed the cache for every key before
b.ResetTimer(), so they measured mixed-key hit performance, not cold
misses. Renamed to _MixedHit to match what they actually measure, and
updated BENCHMARKS.md caveats to point at AddBuilders_ColdCache as the
true cold-miss signal.
---
 BENCHMARKS.md      | 17 ++++++++++-------
 benchmarks_test.go |  4 ++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index 1d99c98..c1c1204 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -41,10 +41,10 @@ The benchmark suite lives in `benchmarks_test.go`. `make bench` runs it with
 | ---------------------------- | -------: | -------: | ---------: |
 | `GetStructName_Uncached`     |  81.28ns |  84.98ns |         ~  |
 | `CachedStructName_Hit`       |  83.79ns |  11.23ns | **-86.6%** |
-| `CachedStructName_ColdMix`   |  86.16ns |  11.46ns | **-86.7%** |
+| `CachedStructName_MixedHit`  |  86.16ns |  11.46ns | **-86.7%** |
 | `FuncForPC_Uncached`         |  32.71ns |  33.38ns |         ~  |
 | `ResolveFuncName_Hit`        |  32.60ns |  12.07ns | **-63.0%** |
-| `ResolveFuncName_ColdMix`    |  30.90ns |  12.13ns | **-60.7%** |
+| `ResolveFuncName_MixedHit`   |  30.90ns |  12.13ns | **-60.7%** |
 | `AddBuilders`                |  3.950µs |  2.300µs | **-41.8%** |
 | `AddBuilders_ColdCache`      |  8.089µs | 10.357µs |    +28.1%  |
 | `Compile`                    |  6.920µs |  7.006µs |         ~  |
@@ -60,7 +60,7 @@ The benchmark suite lives in `benchmarks_test.go`. `make bench` runs it with
 | Benchmark               | Before   | After   | Δ B/op     | Δ allocs/op |
 | ----------------------- | -------: | ------: | ---------: | ----------: |
 | `CachedStructName_Hit`  |      48B |      0B |   **-100%** |   **-100%** |
-| `CachedStructName_ColdMix` |   51B |      0B |   **-100%** |   **-100%** |
+| `CachedStructName_MixedHit` |  51B |      0B |   **-100%** |   **-100%** |
 | `AddBuilders`           |   1872B |    928B |    -50.4%  |    -59.4%   |
 | `Compile`               |   4328B |   4266B |     -1.4%  |     -2.2%   |
 | `RunParallel_Workers1`  |   4945B |   4695B |     -5.1%  |     -6.3%   |
@@ -113,10 +113,13 @@ observable in practice — it's included only to pin the worst-case cost.
 ## Caveats
 
 - `sync.Map` has higher per-op overhead than a plain `map` when the working
-  set is tiny **and** purely single-threaded. The `_ColdMix` benchmarks
-  are intentionally small (5 types / 4 PCs) to stress this path; they
-  still show ~85-87% wins, because `PkgPath()+Name()` and `FuncForPC`
-  dominate the miss cost.
+  set is tiny **and** purely single-threaded. The `_MixedHit` benchmarks
+  are intentionally small (5 types / 4 PCs) and pre-warm the cache before
+  timing, so they measure mixed-key lookup overhead on a tiny hot set
+  rather than true cold misses. They still show ~85–87% wins, because
+  caching avoids repeated `PkgPath()+Name()` and `FuncForPC` work even in
+  that small-set regime. True cold-miss behavior is captured end-to-end by
+  `AddBuilders_ColdCache`, which resets both caches every iteration.
 - Absolute numbers depend on CPU, OS scheduler, and the number of distinct
   types/builders the program touches. Don't generalize — re-measure in
   the target deployment if it matters.
diff --git a/benchmarks_test.go b/benchmarks_test.go
index b8ab45a..d70df3a 100644
--- a/benchmarks_test.go
+++ b/benchmarks_test.go
@@ -61,7 +61,7 @@ func BenchmarkCachedStructName_Hit(b *testing.B) {
 	}
 }
 
-func BenchmarkCachedStructName_ColdMix(b *testing.B) {
+func BenchmarkCachedStructName_MixedHit(b *testing.B) {
 	types := []reflect.Type{
 		reflect.TypeOf(benchStructIn{}),
 		reflect.TypeOf(benchStructA{}),
@@ -100,7 +100,7 @@ func BenchmarkResolveFuncName_Hit(b *testing.B) {
 	}
 }
 
-func BenchmarkResolveFuncName_ColdMix(b *testing.B) {
+func BenchmarkResolveFuncName_MixedHit(b *testing.B) {
 	pcs := []uintptr{
 		reflect.ValueOf(benchFuncA).Pointer(),
 		reflect.ValueOf(benchFuncB).Pointer(),

From 948ba5fe66c0db13b1054cc89907f7f846881a0f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 20 Apr 2026 06:56:27 +0000
Subject: [PATCH 7/8] docs: add package doc comment, drop BENCHMARKS.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Moved benchmark highlights and general package overview into a new
doc.go so they live in Go source (picked up by gomarkdoc and
pkg.go.dev) instead of a standalone markdown file that would rot.
README is regenerated from the package doc.

Deleted BENCHMARKS.md — the benchmark code in benchmarks_test.go is the
source of truth; absolute nanosecond numbers were going stale and
duplicated what the PR description already covers.
---
 BENCHMARKS.md | 127 --------------------------------------------------
 README.md     |  70 +++++++++++++++++++++++-----
 doc.go        |  69 +++++++++++++++++++++++++++
 3 files changed, 128 insertions(+), 138 deletions(-)
 delete mode 100644 BENCHMARKS.md
 create mode 100644 doc.go

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
deleted file mode 100644
index c1c1204..0000000
--- a/BENCHMARKS.md
+++ /dev/null
@@ -1,127 +0,0 @@
-# Benchmarks: Function / Type Name Resolution Caching
-
-`data-builder` resolves two "locations" on every request:
-
-1. **PC → function name** via `runtime.FuncForPC(pc).Name()`
-2. **`reflect.Type` → qualified struct name** via `t.PkgPath() + "." + t.Name()`
-
-Both are now cached in process-global `sync.Map`s (see `cache.go`). Keys
-(reflect.Type identity, function PC) are stable for the life of the program,
-so the caches never need eviction.
-
-## Reproducing
-
-```sh
-go install golang.org/x/perf/cmd/benchstat@latest
-
-# "before": with cache.go rewritten to a pass-through (no caching)
-go test -run=^$ -bench=. -benchmem -count=6 ./... | tee before.txt
-
-# "after": with cache.go in its cached form
-go test -run=^$ -bench=. -benchmem -count=6 ./... | tee after.txt
-
-benchstat before.txt after.txt
-```
-
-The benchmark suite lives in `benchmarks_test.go`. `make bench` runs it with
-`-count=1`; use the commands above for statistically stable comparisons.
-
-## Environment
-
-- `go version go1.25.9 linux/amd64`
-- CPU: INTEL(R) XEON(R) PLATINUM 8581C @ 2.10GHz (16 logical cores)
-- Kernel: Linux 4.4.0
-- `benchstat` with `-count=6`
-
-## Results (benchstat)
-
-### Time per op
-
-| Benchmark                    |   Before |    After |          Δ |
-| ---------------------------- | -------: | -------: | ---------: |
-| `GetStructName_Uncached`     |  81.28ns |  84.98ns |         ~  |
-| `CachedStructName_Hit`       |  83.79ns |  11.23ns | **-86.6%** |
-| `CachedStructName_MixedHit`  |  86.16ns |  11.46ns | **-86.7%** |
-| `FuncForPC_Uncached`         |  32.71ns |  33.38ns |         ~  |
-| `ResolveFuncName_Hit`        |  32.60ns |  12.07ns | **-63.0%** |
-| `ResolveFuncName_MixedHit`   |  30.90ns |  12.13ns | **-60.7%** |
-| `AddBuilders`                |  3.950µs |  2.300µs | **-41.8%** |
-| `AddBuilders_ColdCache`      |  8.089µs | 10.357µs |    +28.1%  |
-| `Compile`                    |  6.920µs |  7.006µs |         ~  |
-| `RunParallel_Workers1`       |  15.64µs |  15.44µs |         ~  |
-| `RunParallel_Workers4`       |  20.71µs |  20.71µs |         ~  |
-| `RunParallel_Workers8`       |  23.91µs |  23.59µs |         ~  |
-| `ResultGet`                  | 103.70ns |  25.54ns | **-75.4%** |
-| `ResultGet_Parallel`         |  16.80ns |   1.54ns | **-90.8%** |
-| **geomean**                  |   498ns  |   244ns  | **-51.0%** |
-
-### Allocations
-
-| Benchmark               | Before   | After   | Δ B/op     | Δ allocs/op |
-| ----------------------- | -------: | ------: | ---------: | ----------: |
-| `CachedStructName_Hit`  |      48B |      0B |   **-100%** |   **-100%** |
-| `CachedStructName_MixedHit` |  51B |      0B |   **-100%** |   **-100%** |
-| `AddBuilders`           |   1872B |    928B |    -50.4%  |    -59.4%   |
-| `Compile`               |   4328B |   4266B |     -1.4%  |     -2.2%   |
-| `RunParallel_Workers1`  |   4945B |   4695B |     -5.1%  |     -6.3%   |
-| `RunParallel_Workers4`  |   5036B |   4786B |     -5.0%  |     -6.1%   |
-| `RunParallel_Workers8`  |   5161B |   4911B |     -4.8%  |     -5.8%   |
-| `ResultGet`             |     48B |      0B |   **-100%** |   **-100%** |
-| `ResultGet_Parallel`    |     48B |      0B |   **-100%** |   **-100%** |
-
-Statistical significance: all reported deltas have `p=0.002` with n=6; entries
-marked `~` are not statistically distinguishable from the baseline.
-
-## Interpretation
-
-**Where caching helps most**
-
-- `Result.Get` and the hot path inside `doWorkAndGetResult` / `RunParallel`
-  init loops used to allocate a fresh `string` for every type lookup
-  (`t.PkgPath() + "." + t.Name()`). Interning the result via `sync.Map`
-  eliminates that allocation entirely: `ResultGet` drops 78ns and one
-  allocation; under parallel load (`ResultGet_Parallel`) it goes from
-  16.8ns to 1.5ns — an **11× speedup** because `sync.Map`'s read-only
-  fast-path is lock-free and scales linearly across cores.
-- `AddBuilders` gets a steady-state 42% latency win and 59% fewer
-  allocations because each builder registration re-resolves the same input
-  and output type names several times via `IsValidBuilder` and `getBuilder`.
-- `FuncForPC` caching is a smaller absolute win (20ns / call) than struct
-  name caching, but it's on the same hot path for `getBuilder` and
-  `plan.Replace`, so it still helps `AddBuilders` directly.
-
-**Where caching does not help (and that's fine)**
-
-- `Compile` and `RunParallel` end-to-end are dominated by
-  `resolveDependencies`, goroutine scheduling, and `reflect.Value.Call`.
-  Name resolution is <5% of those timings, so benchstat reports "no
-  significant change" — but the memory column still shows a real reduction
-  (~5% bytes/allocs per run) because those allocations were shifted off
-  the hot path.
-- `_Uncached` baselines for both resolvers come in identical before and
-  after (as expected — they call the un-cached code directly).
-
-**The `AddBuilders_ColdCache` regression**
-
-This synthetic benchmark resets both `sync.Map`s to empty at the start of
-every iteration, so every call is a miss. `sync.Map` is slower than a
-direct computation in the pure-miss case because it pays for an atomic
-`Load` + an `LoadOrStore` on top of the original work. In production the
-cache warms up once and then serves hits forever, so this scenario isn't
-observable in practice — it's included only to pin the worst-case cost.
-
-## Caveats
-
-- `sync.Map` has higher per-op overhead than a plain `map` when the working
-  set is tiny **and** purely single-threaded. The `_MixedHit` benchmarks
-  are intentionally small (5 types / 4 PCs) and pre-warm the cache before
-  timing, so they measure mixed-key lookup overhead on a tiny hot set
-  rather than true cold misses. They still show ~85–87% wins, because
-  caching avoids repeated `PkgPath()+Name()` and `FuncForPC` work even in
-  that small-set regime. True cold-miss behavior is captured end-to-end by
-  `AddBuilders_ColdCache`, which resets both caches every iteration.
-- Absolute numbers depend on CPU, OS scheduler, and the number of distinct
-  types/builders the program touches. Don't generalize — re-measure in
-  the target deployment if it matters.
-- Benchmarks should be run with the machine idle; pin `GOMAXPROCS` if you
-  want tighter variance across runs.
diff --git a/README.md b/README.md
index c52dddf..d33dd11 100755
--- a/README.md
+++ b/README.md
@@ -13,6 +13,54 @@
 import "github.com/go-coldbrew/data-builder"
 ```
 
+Package databuilder compiles a set of builder functions into an execution plan with automatic dependency resolution, then runs them sequentially or in parallel.
+
+### Builder functions
+
+A builder is a plain Go function whose signature encodes its inputs and output as types:
+
+```
+func(ctx context.Context, in1 StructA, in2 StructB) (StructC, error)
+```
+
+Rules enforced by [IsValidBuilder](<#IsValidBuilder>):
+
+- The first parameter must be context.Context.
+- All remaining parameters must be concrete struct values \(no pointers, no variadics, no primitives\).
+- The function must return exactly two values: a concrete struct and an error.
+- Two registered builders cannot produce the same output struct.
+- A builder cannot take its own output type as input.
+
+Types are identified by their fully qualified "pkgpath.TypeName", so the dependency graph is built entirely from ordinary Go type information.
+
+### Typical flow
+
+1. Build a [DataBuilder](<#DataBuilder>) with [New](<#New>).
+2. Register builder functions with \[DataBuilder.AddBuilders\].
+3. Call \[DataBuilder.Compile\] with zero\-valued instances of the structs the caller will supply at runtime. Compile topologically sorts the builders into stages, returning a [Plan](<#Plan>).
+4. Run the plan with \[Plan.Run\] \(sequential\) or \[Plan.RunParallel\] \(bounded worker pool\). Both return a [Result](<#Result>).
+5. Read typed outputs from the result with [Result.Get](<#Result.Get>), or with [GetFromResult](<#GetFromResult>) from inside a builder.
+
+A compiled [Plan](<#Plan>) is side\-effect free and safe to reuse across goroutines. \[Plan.Replace\] can swap a builder for a compatible one without recompiling, as long as the replacement's inputs are a subset of the original's.
+
+### Parallelism
+
+\[Plan.RunParallel\] runs all builders in the same stage of the DAG concurrently, bounded by a caller\-supplied worker count. A panic or error from any builder is surfaced back to the caller; subsequent stages do not start. Use [MaxPlanParallelism](<#MaxPlanParallelism>) to size the worker pool to the widest stage.
+
+### Performance
+
+Function\-name \(runtime.FuncForPC\) and struct\-name \(reflect.Type\) resolutions are cached in process\-global sync.Maps. Keys are stable for the life of the program, so the caches never evict. Hot\-path effects \(benchstat, count=6\):
+
+- Result.Get: \~4x faster single\-threaded, \~11x faster under parallel load, zero allocations on hit.
+- AddBuilders \(warm cache\): \~40% faster, \~60% fewer allocations.
+- Per\-resolution hits: \~10\-15 ns/op, zero allocations.
+
+Benchmarks live in benchmarks\_test.go; run \`make bench\` to measure on your hardware.
+
+### Visualization
+
+[BuildGraph](<#BuildGraph>) renders the compiled plan to a graphviz file in any format graphviz supports \(png, svg, dot, ...\). Graphviz must be installed on the system.
+
 ## Index
 
 - [Constants](<#constants>)
@@ -78,7 +126,7 @@ var ErrWTF = errors.New("what a terrible failure: this is likely a bug in depend
 ```
 
 <a name="AddResultToCtx"></a>
-## func [AddResultToCtx](<https://github.com/go-coldbrew/data-builder/blob/main/context.go#L17>)
+## func AddResultToCtx
 
 ```go
 func AddResultToCtx(ctx context.Context, r Result) context.Context
@@ -89,7 +137,7 @@ AddResultToCtx adds the given result object to context
 this function should ideally only be used in your tests and/or for debugging modification made to Result obj will NOT persist
 
 <a name="BuildGraph"></a>
-## func [BuildGraph](<https://github.com/go-coldbrew/data-builder/blob/main/plan.go#L319>)
+## func BuildGraph
 
 ```go
 func BuildGraph(executionPlan Plan, format, file string) error
@@ -98,7 +146,7 @@ func BuildGraph(executionPlan Plan, format, file string) error
 BuildGraph helps understand the execution plan, it renders the plan in the given format please note we depend on graphviz, please ensure you have graphviz installed
 
 <a name="GetFromResult"></a>
-## func [GetFromResult](<https://github.com/go-coldbrew/data-builder/blob/main/context.go#L44>)
+## func GetFromResult
 
 ```go
 func GetFromResult(ctx context.Context, obj any) any
@@ -109,7 +157,7 @@ GetFromResult allows builders to access data built by other builders
 this function enables optional access to data, your code should not rely on values being present, if you have explicit dependency please add them to your function parameters
 
 <a name="IsValidBuilder"></a>
-## func [IsValidBuilder](<https://github.com/go-coldbrew/data-builder/blob/main/databuilder.go#L94>)
+## func IsValidBuilder
 
 ```go
 func IsValidBuilder(builder any) error
@@ -118,7 +166,7 @@ func IsValidBuilder(builder any) error
 IsValidBuilder checks if the given function is valid or not
 
 <a name="MaxPlanParallelism"></a>
-## func [MaxPlanParallelism](<https://github.com/go-coldbrew/data-builder/blob/main/plan.go#L331>)
+## func MaxPlanParallelism
 
 ```go
 func MaxPlanParallelism(pl Plan) (uint, error)
@@ -129,7 +177,7 @@ MaxPlanParallelism return the maximum number of buildes that can be exsecuted pa
 this number does not take into account if the builder are cpu intensive or netwrok intensive it may not be benificial to run builders at max parallelism if they are cpu intensive
 
 <a name="DataBuilder"></a>
-## type [DataBuilder](<https://github.com/go-coldbrew/data-builder/blob/main/types.go#L36-L42>)
+## type DataBuilder
 
 DataBuilder is the interface for DataBuilder
 
@@ -293,7 +341,7 @@ welcome to singapore
 </details>
 
 <a name="New"></a>
-### func [New](<https://github.com/go-coldbrew/data-builder/blob/main/databuilder.go#L178>)
+### func New
 
 ```go
 func New() DataBuilder
@@ -302,7 +350,7 @@ func New() DataBuilder
 New Creates a new DataBuilder
 
 <a name="Plan"></a>
-## type [Plan](<https://github.com/go-coldbrew/data-builder/blob/main/types.go#L45-L52>)
+## type Plan
 
 Plan is the interface that wraps execution of Plans created by DataBuilder.Compile method.
 
@@ -367,7 +415,7 @@ true
 </details>
 
 <a name="Result"></a>
-## type [Result](<https://github.com/go-coldbrew/data-builder/blob/main/types.go#L55>)
+## type Result
 
 Result is the result of the Plan.Run method
 
@@ -376,7 +424,7 @@ type Result map[string]any
 ```
 
 <a name="GetResultFromCtx"></a>
-### func [GetResultFromCtx](<https://github.com/go-coldbrew/data-builder/blob/main/context.go#L28>)
+### func GetResultFromCtx
 
 ```go
 func GetResultFromCtx(ctx context.Context) Result
@@ -387,7 +435,7 @@ GetResultFromCtx gives access to result object at this point in execution
 this function should ideally only be used in your tests and/or for debugging modification made to Result obj may or may not persist
 
 <a name="Result.Get"></a>
-### func \(Result\) [Get](<https://github.com/go-coldbrew/data-builder/blob/main/plan.go#L247>)
+### func \(Result\) Get
 
 ```go
 func (r Result) Get(obj any) any
diff --git a/doc.go b/doc.go
new file mode 100644
index 0000000..76233cf
--- /dev/null
+++ b/doc.go
@@ -0,0 +1,69 @@
+// Package databuilder compiles a set of builder functions into an execution
+// plan with automatic dependency resolution, then runs them sequentially or
+// in parallel.
+//
+// # Builder functions
+//
+// A builder is a plain Go function whose signature encodes its inputs and
+// output as types:
+//
+//	func(ctx context.Context, in1 StructA, in2 StructB) (StructC, error)
+//
+// Rules enforced by [IsValidBuilder]:
+//
+//   - The first parameter must be context.Context.
+//   - All remaining parameters must be concrete struct values (no pointers,
+//     no variadics, no primitives).
+//   - The function must return exactly two values: a concrete struct and an
+//     error.
+//   - Two registered builders cannot produce the same output struct.
+//   - A builder cannot take its own output type as input.
+//
+// Types are identified by their fully qualified "pkgpath.TypeName", so the
+// dependency graph is built entirely from ordinary Go type information.
+//
+// # Typical flow
+//
+//  1. Build a [DataBuilder] with [New].
+//  2. Register builder functions with [DataBuilder.AddBuilders].
+//  3. Call [DataBuilder.Compile] with zero-valued instances of the structs
+//     the caller will supply at runtime. Compile topologically sorts the
+//     builders into stages, returning a [Plan].
+//  4. Run the plan with [Plan.Run] (sequential) or [Plan.RunParallel]
+//     (bounded worker pool). Both return a [Result].
+//  5. Read typed outputs from the result with [Result.Get], or with
+//     [GetFromResult] from inside a builder.
+//
+// A compiled [Plan] is side-effect free and safe to reuse across goroutines.
+// [Plan.Replace] can swap a builder for a compatible one without recompiling,
+// as long as the replacement's inputs are a subset of the original's.
+//
+// # Parallelism
+//
+// [Plan.RunParallel] runs all builders in the same stage of the DAG
+// concurrently, bounded by a caller-supplied worker count. A panic or error
+// from any builder is surfaced back to the caller; subsequent stages do not
+// start. Use [MaxPlanParallelism] to size the worker pool to the widest
+// stage.
+//
+// # Performance
+//
+// Function-name (runtime.FuncForPC) and struct-name (reflect.Type)
+// resolutions are cached in process-global sync.Maps. Keys are stable for
+// the life of the program, so the caches never evict. Hot-path effects
+// (benchstat, count=6):
+//
+//   - Result.Get: ~4x faster single-threaded, ~11x faster under parallel
+//     load, zero allocations on hit.
+//   - AddBuilders (warm cache): ~40% faster, ~60% fewer allocations.
+//   - Per-resolution hits: ~10-15 ns/op, zero allocations.
+//
+// Benchmarks live in benchmarks_test.go; run `make bench` to measure on your
+// hardware.
+//
+// # Visualization
+//
+// [BuildGraph] renders the compiled plan to a graphviz file in any format
+// graphviz supports (png, svg, dot, ...). Graphviz must be installed on the
+// system.
+package databuilder

From 7a9a95ffa7f87c5e18c1990e9af4e3c5235b89b3 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 20 Apr 2026 07:05:29 +0000
Subject: [PATCH 8/8] bench: harden with runtime.KeepAlive to prevent DCE

Store measured results in locals and call runtime.KeepAlive after each
loop so the compiler cannot elide inlineable work or under-count allocs.
For b.RunParallel the local lives inside the worker closure so each
goroutine writes its own variable (no data race on a shared sink).
Numbers are unchanged on this hardware (sync.Map and
reflect method calls were already opaque to the inliner), but this
makes the suite robust to future code changes.
---
 benchmarks_test.go | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/benchmarks_test.go b/benchmarks_test.go
index d70df3a..79700a2 100644
--- a/benchmarks_test.go
+++ b/benchmarks_test.go
@@ -46,9 +46,11 @@ func BenchmarkGetStructName_Uncached(b *testing.B) {
 	t := reflect.TypeOf(benchStructA{})
 	b.ReportAllocs()
 	b.ResetTimer()
+	var got string
 	for i := 0; i < b.N; i++ {
-		_ = uncachedStructName(t)
+		got = uncachedStructName(t)
 	}
+	runtime.KeepAlive(got)
 }
 
 func BenchmarkCachedStructName_Hit(b *testing.B) {
@@ -56,9 +58,11 @@ func BenchmarkCachedStructName_Hit(b *testing.B) {
 	_ = cachedStructName(t)
 	b.ReportAllocs()
 	b.ResetTimer()
+	var got string
 	for i := 0; i < b.N; i++ {
-		_ = cachedStructName(t)
+		got = cachedStructName(t)
 	}
+	runtime.KeepAlive(got)
 }
 
 func BenchmarkCachedStructName_MixedHit(b *testing.B) {
@@ -74,9 +78,11 @@ func BenchmarkCachedStructName_MixedHit(b *testing.B) {
 	}
 	b.ReportAllocs()
 	b.ResetTimer()
+	var got string
 	for i := 0; i < b.N; i++ {
-		_ = cachedStructName(types[i%len(types)])
+		got = cachedStructName(types[i%len(types)])
 	}
+	runtime.KeepAlive(got)
 }
 
 // --- function PC resolution ---
@@ -85,9 +91,11 @@ func BenchmarkFuncForPC_Uncached(b *testing.B) {
 	pc := reflect.ValueOf(benchFuncA).Pointer()
 	b.ReportAllocs()
 	b.ResetTimer()
+	var got string
 	for i := 0; i < b.N; i++ {
-		_ = runtime.FuncForPC(pc).Name()
+		got = runtime.FuncForPC(pc).Name()
 	}
+	runtime.KeepAlive(got)
 }
 
 func BenchmarkResolveFuncName_Hit(b *testing.B) {
@@ -95,9 +103,11 @@ func BenchmarkResolveFuncName_Hit(b *testing.B) {
 	_ = resolveFuncName(pc)
 	b.ReportAllocs()
 	b.ResetTimer()
+	var got string
 	for i := 0; i < b.N; i++ {
-		_ = resolveFuncName(pc)
+		got = resolveFuncName(pc)
 	}
+	runtime.KeepAlive(got)
 }
 
 func BenchmarkResolveFuncName_MixedHit(b *testing.B) {
@@ -112,9 +122,11 @@ func BenchmarkResolveFuncName_MixedHit(b *testing.B) {
 	}
 	b.ReportAllocs()
 	b.ResetTimer()
+	var got string
 	for i := 0; i < b.N; i++ {
-		_ = resolveFuncName(pcs[i%len(pcs)])
+		got = resolveFuncName(pcs[i%len(pcs)])
 	}
+	runtime.KeepAlive(got)
 }
 
 // --- registration ---
@@ -213,9 +225,11 @@ func BenchmarkResultGet(b *testing.B) {
 	key := benchStructC{}
 	b.ReportAllocs()
 	b.ResetTimer()
+	var got any
 	for i := 0; i < b.N; i++ {
-		_ = result.Get(key)
+		got = result.Get(key)
 	}
+	runtime.KeepAlive(got)
 }
 
 func BenchmarkResultGet_Parallel(b *testing.B) {
@@ -228,8 +242,10 @@ func BenchmarkResultGet_Parallel(b *testing.B) {
 	b.ReportAllocs()
 	b.ResetTimer()
 	b.RunParallel(func(pb *testing.PB) {
+		var got any
 		for pb.Next() {
-			_ = result.Get(key)
+			got = result.Get(key)
 		}
+		runtime.KeepAlive(got)
 	})
 }