From de73b4613e1bf875998a5bbcd71c612525dd80df Mon Sep 17 00:00:00 2001 From: Darron Froese Date: Mon, 15 Jun 2026 22:41:43 -0600 Subject: [PATCH] fix(web): derive share topics from evidence tags Replace public chat share topic generation with an evidence-weighted ranking built from research-pack user tags. Prioritize tags from sources cited in the answer, then explicit citations, then coverage tags, and run all labels through the configured category vocabulary. This removes the old keyword/source-type guessing that produced generic chips such as research, media, software, GitHub, or YouTube. Public shares now show up to four specific topics and may show fewer or none when the pack has no useful tags. Risk: topic quality now depends more directly on source tagging and vocabulary coverage. Follow up by watching real shared pages for missing or over-specific topics and tuning the vocabulary/drop list as needed. Add regression coverage for citation weighting, coverage fallback, vocabulary cleanup, and public-page chip rendering. Regenerated UI build artifacts are included with the commit. --- CHANGELOG.md | 10 +++ web/chat_shares.go | 191 +++++++++++++++++++++++++++++++--------- web/chat_shares_test.go | 105 +++++++++++++++++++++- 3 files changed, 261 insertions(+), 45 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e00f28d..2dd85fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ development date for the change set. ## Recent Improvements +### Public Chat Share Topics (2026-06-15) + +- **Fixed**: Public chat share topic chips now come from weighted research-pack + evidence tags, prioritizing answer-cited sources and applying the configured + category vocabulary instead of using generic keyword/source-type guesses. +- **Hardening**: Share topic derivation now suppresses generic labels such as + `research`, `media`, `software`, and source-platform noise, with regression + coverage for citation weighting, coverage fallback, and vocabulary cleanup. +- **Location**: `web/chat_shares.go`, `web/chat_shares_test.go` + ### Open Knowledge Format Export (2026-06-14) - **OKF export**: Added `dbrain okf export` and `dbrain okf validate` for diff --git a/web/chat_shares.go b/web/chat_shares.go index 2db632d..a007824 100644 --- a/web/chat_shares.go +++ b/web/chat_shares.go @@ -12,6 +12,7 @@ import ( "strings" "time" + "github.com/darron/dbrain/internal/categoryvocab" "github.com/darron/dbrain/internal/store" "github.com/yuin/goldmark" "github.com/yuin/goldmark/extension" @@ -20,8 +21,9 @@ import ( ) const ( - localShareOwnerProvider = "local" - localShareOwnerSubject = "local" + localShareOwnerProvider = "local" + localShareOwnerSubject = "local" + maxPublicShareCategories = 4 ) var ( @@ -84,7 +86,8 @@ func (s *server) handleChatShareCreate(w http.ResponseWriter, r *http.Request) { return } - input := buildPublicChatShareInput(owner, req.Turn) + vocab, _ := categoryvocab.Load(s.cfg.CategoriesPath) + input := buildPublicChatShareInput(owner, req.Turn, vocab) share, err := s.store.SavePublicChatShare(r.Context(), input) if err != nil { writeError(w, http.StatusInternalServerError, err) @@ -175,12 +178,12 @@ func (s *server) chatShareOwner(r *http.Request) (chatShareOwner, bool) { }, true } -func buildPublicChatShareInput(owner chatShareOwner, turn ChatTranscriptTurn) store.PublicChatShareInput { +func buildPublicChatShareInput(owner chatShareOwner, turn ChatTranscriptTurn, vocab categoryvocab.Vocab) store.PublicChatShareInput { keyURLs := sourceKeyURLMap(turn) content := sanitizeSharedChatContent(turn.Answer, keyURLs) originalURLs := mergeShareURLs(collectOriginalURLs(turn), extractExternalURLs(content)) summary := summarizeSharedContent(content) - categories := categorizeSharedContent(content, turn) + categories := categorizeSharedContent(content, turn, vocab) title := shareTitle(turn.Question, summary) metadata := publicChatShareMetadata{ Question: sanitizeSharedChatContent(turn.Question, keyURLs), @@ -473,63 +476,165 @@ func summarizeSharedContent(content string) string { return strings.TrimSpace(string(runes[:cut])) + "..." } -func categorizeSharedContent(content string, turn ChatTranscriptTurn) []string { - lower := strings.ToLower(content + " " + turn.Question) - score := map[string]int{} - keywords := map[string][]string{ - "ai": {"agent", "model", "llm", "prompt", "inference", "retrieval", "embedding"}, - "software": {"code", "api", "database", "sqlite", "server", "github", "deploy", "bug", "test"}, - "infrastructure": {"tailscale", "kubernetes", "docker", "cloudflare", "s3", "r2", "oauth", "auth"}, - "media": {"video", "audio", "ocr", "transcript", "image", "youtube"}, - "research": {"evidence", "source", "citation", "summary", "study", "article"}, - "security": {"token", "secret", "vulnerability", "exploit", "malware", "phishing"}, - } - for category, terms := range keywords { - for _, term := range terms { - if strings.Contains(lower, term) { - score[category]++ +func categorizeSharedContent(_ string, turn ChatTranscriptTurn, vocab categoryvocab.Vocab) []string { + type rankedCategory struct { + category string + score int + first int + } + + scores := map[string]rankedCategory{} + sequence := 0 + answerCited := sourceKeysInShareText(turn.Answer) + citationKeys := sourceKeysFromShareCitations(turn) + primaryEvidence := map[string]struct{}{} + + addTag := func(raw string, weight int) { + for _, token := range vocab.ApplyToTokens([]string{raw}) { + if !usefulShareCategory(token) { + continue } + current, ok := scores[token] + if !ok { + current = rankedCategory{category: token, first: sequence} + sequence++ + } + current.score += weight + scores[token] = current + } + } + addCSV := func(raw string, weight int) { + for _, token := range strings.Split(raw, ",") { + addTag(token, weight) } } + evidenceWeight := func(sourceKey string, base int) int { + sourceKey = strings.TrimSpace(sourceKey) + if sourceKey == "" { + return base + } + if _, ok := answerCited[sourceKey]; ok { + return 100 + } + if _, ok := citationKeys[sourceKey]; ok { + return 60 + } + return base + } + for _, evidence := range turn.ResearchPack.Evidence { - switch strings.TrimSpace(evidence.SourceType) { - case "github_star", "github": - score["software"]++ - case "youtube_watch_later", "youtube_liked", "youtube": - score["media"]++ - case "web", "feed_entry": - score["research"]++ + key := strings.TrimSpace(evidence.SourceKey) + if key != "" { + primaryEvidence[key] = struct{}{} } + addCSV(evidence.UserTags, evidenceWeight(key, 25)) } - type ranked struct { - category string - score int + for _, evidence := range turn.ResearchPack.ExactTagEvidence { + key := strings.TrimSpace(evidence.SourceKey) + if _, ok := primaryEvidence[key]; ok { + continue + } + addCSV(evidence.UserTags, evidenceWeight(key, 18)) + } + for _, bucket := range turn.ResearchPack.Coverage.TopUserTags { + weight := bucket.Count * 8 + if weight <= 0 { + weight = 8 + } + if weight > 32 { + weight = 32 + } + addTag(bucket.Key, weight) + } + if len(scores) == 0 { + for _, tag := range turn.ResearchPack.QueryPlan.TagQueries { + addTag(tag, 6) + } + addTag(turn.ResearchPack.Topic, 6) } - var rankings []ranked - for category, value := range score { - if value > 0 { - rankings = append(rankings, ranked{category: category, score: value}) + + rankings := make([]rankedCategory, 0, len(scores)) + for _, ranking := range scores { + if ranking.score > 0 { + rankings = append(rankings, ranking) } } sort.Slice(rankings, func(i, j int) bool { - if rankings[i].score == rankings[j].score { - return rankings[i].category < rankings[j].category + if rankings[i].score != rankings[j].score { + return rankings[i].score > rankings[j].score + } + if rankings[i].first != rankings[j].first { + return rankings[i].first < rankings[j].first } - return rankings[i].score > rankings[j].score + return rankings[i].category < rankings[j].category }) - categories := make([]string, 0, min(3, len(rankings))) + + categories := make([]string, 0, min(maxPublicShareCategories, len(rankings))) for _, ranking := range rankings { categories = append(categories, ranking.category) - if len(categories) == 3 { + if len(categories) == maxPublicShareCategories { break } } - if len(categories) == 0 { - categories = []string{"general"} - } return categories } +func sourceKeysInShareText(text string) map[string]struct{} { + keys := map[string]struct{}{} + for _, key := range shareSourceKeyPattern.FindAllString(text, -1) { + key = strings.TrimSpace(key) + if key != "" { + keys[key] = struct{}{} + } + } + return keys +} + +func sourceKeysFromShareCitations(turn ChatTranscriptTurn) map[string]struct{} { + keys := map[string]struct{}{} + for _, citation := range turn.Citations { + key := strings.TrimSpace(citation.SourceKey) + if key != "" { + keys[key] = struct{}{} + } + } + return keys +} + +func usefulShareCategory(token string) bool { + token = categoryvocab.Normalize(token) + if token == "" { + return false + } + if _, ok := genericShareCategories[token]; ok { + return false + } + return true +} + +var genericShareCategories = map[string]struct{}{ + "ai": {}, + "article": {}, + "articles": {}, + "citation": {}, + "citations": {}, + "evidence": {}, + "general": {}, + "github": {}, + "infrastructure": {}, + "media": {}, + "research": {}, + "security": {}, + "software": {}, + "source": {}, + "sources": {}, + "summary": {}, + "summaries": {}, + "web": {}, + "x": {}, + "youtube": {}, +} + func shareTitle(question string, summary string) string { title := strings.Join(strings.Fields(sanitizeSharedChatContent(question, nil)), " ") if title == "" { @@ -818,7 +923,7 @@ var publicShareTemplate = template.Must(template.New("public-share").Parse(`

{{.Title}}

{{if .Categories}} -
+
{{range .Categories}}{{.}}{{end}}
{{end}} diff --git a/web/chat_shares_test.go b/web/chat_shares_test.go index 1ffaa47..d8b9f94 100644 --- a/web/chat_shares_test.go +++ b/web/chat_shares_test.go @@ -11,6 +11,7 @@ import ( "github.com/darron/dbrain/internal/ask" "github.com/darron/dbrain/internal/brainresearch" + "github.com/darron/dbrain/internal/categoryvocab" "github.com/darron/dbrain/internal/store" ) @@ -56,6 +57,7 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { NotePath: "sources/test-agent-memory.md", Summary: "Summary about durable retrieval.", SourceType: "web", + UserTags: "agent-memory, durable-retrieval, research", }, }, }, @@ -79,9 +81,12 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { if createResponse.URL == "" || !strings.HasPrefix(createResponse.URL, "/share/") || createResponse.Slug == "" { t.Fatalf("unexpected create response: %+v", createResponse) } - if len(createResponse.Categories) == 0 || createResponse.Summary == "" { + if createResponse.Summary == "" { t.Fatalf("expected summary/categories in response: %+v", createResponse) } + if got, want := strings.Join(createResponse.Categories, ","), "agent-memory,durable-retrieval"; got != want { + t.Fatalf("expected evidence-derived share categories %q, got %q", want, got) + } list := httptest.NewRecorder() handler.ServeHTTP(list, httptest.NewRequest(http.MethodGet, "/api/chat/shares", nil)) @@ -102,7 +107,7 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { t.Fatalf("expected public page 200, got %d: %s", public.Code, public.Body.String()) } page := public.Body.String() - for _, want := range []string{"https://example.com/agent-memory", ">https://example.com/agent-memory: Summary about durable retrieval.", "Agent memory systems", "Original URLs", "Summary about durable retrieval.", "

Markdown Heading

", "bold", "code", "href=\"https://example.com/with-backtick\"", "href=\"https://example.com/with-colon\"", "href=\"https://example.com/bracketed\">example.com", "href=\"https://www.example.org/angled\">example.org", "href=\"https://example.net/code-source\">example.net"} { + for _, want := range []string{"https://example.com/agent-memory", ">https://example.com/agent-memory: Summary about durable retrieval.", "Agent memory systems", "Original URLs", "Summary about durable retrieval.", "agent-memory", "durable-retrieval", "

Markdown Heading

", "bold", "code", "href=\"https://example.com/with-backtick\"", "href=\"https://example.com/with-colon\"", "href=\"https://example.com/bracketed\">example.com", "href=\"https://www.example.org/angled\">example.org", "href=\"https://example.net/code-source\">example.net"} { if !strings.Contains(page, want) { t.Fatalf("expected public page to contain %q:\n%s", want, page) } @@ -122,6 +127,7 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { "<https://", "[", + "research", } { if strings.Contains(page, forbidden) { t.Fatalf("public page leaked %q:\n%s", forbidden, page) @@ -147,6 +153,92 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { } } +func TestCategorizeSharedContentWeightsAnswerCitedEvidenceTags(t *testing.T) { + turn := ChatTranscriptTurn{ + Question: "barcelona toronto police", + Answer: "The key incident involved Toronto police in Barcelona [src:cited]. This answer mentions github, youtube, evidence, source, citation, and software.", + Citations: []brainresearch.Citation{ + {SourceKey: "src:cited", URL: "https://example.com/cited"}, + {SourceKey: "src:included", URL: "https://example.com/included"}, + }, + ResearchPack: brainresearch.Pack{ + Evidence: []ask.Evidence{ + { + SourceKey: "src:included", + SourceType: "github", + UserTags: "software, infrastructure, included-only", + }, + { + SourceKey: "src:cited", + SourceType: "web", + UserTags: "toronto-police, barcelona, criminal-law, research", + }, + }, + }, + } + + got := categorizeSharedContent("old keyword fallback should not matter", turn, categoryvocab.Vocab{}) + want := "toronto-police,barcelona,criminal-law,included-only" + if strings.Join(got, ",") != want { + t.Fatalf("categorizeSharedContent() = %q, want %q", strings.Join(got, ","), want) + } + for _, forbidden := range []string{"research", "software", "media", "infrastructure"} { + if containsString(got, forbidden) { + t.Fatalf("share categories should not include generic %q: %#v", forbidden, got) + } + } +} + +func TestCategorizeSharedContentFallsBackToCoverageTopUserTags(t *testing.T) { + turn := ChatTranscriptTurn{ + Question: "summarize ai harness", + Answer: "No evidence rows carried tags.", + ResearchPack: brainresearch.Pack{ + Coverage: brainresearch.Coverage{ + TopUserTags: []brainresearch.Bucket{ + {Key: "research", Count: 50}, + {Key: "large-language-models", Count: 4}, + {Key: "ai-agents", Count: 3}, + }, + }, + }, + } + + got := categorizeSharedContent("", turn, categoryvocab.Vocab{}) + want := "large-language-models,ai-agents" + if strings.Join(got, ",") != want { + t.Fatalf("categorizeSharedContent() = %q, want %q", strings.Join(got, ","), want) + } +} + +func TestCategorizeSharedContentAppliesCategoryVocabulary(t *testing.T) { + vocab, err := categoryvocab.Parse([]byte(strings.Join([]string{ + "aliases:", + " llm: large-language-models", + " ai-agent: ai-agents", + "drop:", + " - github-repository", + }, "\n"))) + if err != nil { + t.Fatalf("parse vocab: %v", err) + } + turn := ChatTranscriptTurn{ + Answer: "Evidence citation [src:vocab].", + ResearchPack: brainresearch.Pack{ + Evidence: []ask.Evidence{{ + SourceKey: "src:vocab", + UserTags: "LLM, AI Agent, github-repository", + }}, + }, + } + + got := categorizeSharedContent("", turn, vocab) + want := "large-language-models,ai-agents" + if strings.Join(got, ",") != want { + t.Fatalf("categorizeSharedContent() = %q, want %q", strings.Join(got, ","), want) + } +} + func TestChatShareRejectsVerificationFailedTurn(t *testing.T) { cfg, st := openTestStore(t) handler, err := NewHandler(cfg, st) @@ -168,6 +260,15 @@ func TestChatShareRejectsVerificationFailedTurn(t *testing.T) { } } +func containsString(values []string, needle string) bool { + for _, value := range values { + if value == needle { + return true + } + } + return false +} + func TestPublicExternalURLCleansEncodedBackticksAndPunctuation(t *testing.T) { tests := []struct { name string