diff --git a/CHANGELOG.md b/CHANGELOG.md index e00f28d..2dd85fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ development date for the change set. ## Recent Improvements +### Public Chat Share Topics (2026-06-15) + +- **Fixed**: Public chat share topic chips now come from weighted research-pack + evidence tags, prioritizing answer-cited sources and applying the configured + category vocabulary instead of using generic keyword/source-type guesses. +- **Hardening**: Share topic derivation now suppresses generic labels such as + `research`, `media`, `software`, and source-platform noise, with regression + coverage for citation weighting, coverage fallback, and vocabulary cleanup. +- **Location**: `web/chat_shares.go`, `web/chat_shares_test.go` + ### Open Knowledge Format Export (2026-06-14) - **OKF export**: Added `dbrain okf export` and `dbrain okf validate` for diff --git a/web/chat_shares.go b/web/chat_shares.go index 2db632d..a007824 100644 --- a/web/chat_shares.go +++ b/web/chat_shares.go @@ -12,6 +12,7 @@ import ( "strings" "time" + "github.com/darron/dbrain/internal/categoryvocab" "github.com/darron/dbrain/internal/store" "github.com/yuin/goldmark" "github.com/yuin/goldmark/extension" @@ -20,8 +21,9 @@ import ( ) const ( - localShareOwnerProvider = "local" - localShareOwnerSubject = "local" + localShareOwnerProvider = "local" + localShareOwnerSubject = "local" + maxPublicShareCategories = 4 ) var ( @@ -84,7 +86,8 @@ func (s *server) handleChatShareCreate(w http.ResponseWriter, r *http.Request) { return } - input := buildPublicChatShareInput(owner, req.Turn) + vocab, _ := categoryvocab.Load(s.cfg.CategoriesPath) + input := buildPublicChatShareInput(owner, req.Turn, vocab) share, err := s.store.SavePublicChatShare(r.Context(), input) if err != nil { writeError(w, http.StatusInternalServerError, err) @@ -175,12 +178,12 @@ func (s *server) chatShareOwner(r *http.Request) (chatShareOwner, bool) { }, true } -func buildPublicChatShareInput(owner chatShareOwner, turn ChatTranscriptTurn) store.PublicChatShareInput { +func buildPublicChatShareInput(owner chatShareOwner, turn ChatTranscriptTurn, vocab categoryvocab.Vocab) store.PublicChatShareInput { keyURLs := sourceKeyURLMap(turn) content := sanitizeSharedChatContent(turn.Answer, keyURLs) originalURLs := mergeShareURLs(collectOriginalURLs(turn), extractExternalURLs(content)) summary := summarizeSharedContent(content) - categories := categorizeSharedContent(content, turn) + categories := categorizeSharedContent(content, turn, vocab) title := shareTitle(turn.Question, summary) metadata := publicChatShareMetadata{ Question: sanitizeSharedChatContent(turn.Question, keyURLs), @@ -473,63 +476,165 @@ func summarizeSharedContent(content string) string { return strings.TrimSpace(string(runes[:cut])) + "..." } -func categorizeSharedContent(content string, turn ChatTranscriptTurn) []string { - lower := strings.ToLower(content + " " + turn.Question) - score := map[string]int{} - keywords := map[string][]string{ - "ai": {"agent", "model", "llm", "prompt", "inference", "retrieval", "embedding"}, - "software": {"code", "api", "database", "sqlite", "server", "github", "deploy", "bug", "test"}, - "infrastructure": {"tailscale", "kubernetes", "docker", "cloudflare", "s3", "r2", "oauth", "auth"}, - "media": {"video", "audio", "ocr", "transcript", "image", "youtube"}, - "research": {"evidence", "source", "citation", "summary", "study", "article"}, - "security": {"token", "secret", "vulnerability", "exploit", "malware", "phishing"}, - } - for category, terms := range keywords { - for _, term := range terms { - if strings.Contains(lower, term) { - score[category]++ +func categorizeSharedContent(_ string, turn ChatTranscriptTurn, vocab categoryvocab.Vocab) []string { + type rankedCategory struct { + category string + score int + first int + } + + scores := map[string]rankedCategory{} + sequence := 0 + answerCited := sourceKeysInShareText(turn.Answer) + citationKeys := sourceKeysFromShareCitations(turn) + primaryEvidence := map[string]struct{}{} + + addTag := func(raw string, weight int) { + for _, token := range vocab.ApplyToTokens([]string{raw}) { + if !usefulShareCategory(token) { + continue } + current, ok := scores[token] + if !ok { + current = rankedCategory{category: token, first: sequence} + sequence++ + } + current.score += weight + scores[token] = current + } + } + addCSV := func(raw string, weight int) { + for _, token := range strings.Split(raw, ",") { + addTag(token, weight) } } + evidenceWeight := func(sourceKey string, base int) int { + sourceKey = strings.TrimSpace(sourceKey) + if sourceKey == "" { + return base + } + if _, ok := answerCited[sourceKey]; ok { + return 100 + } + if _, ok := citationKeys[sourceKey]; ok { + return 60 + } + return base + } + for _, evidence := range turn.ResearchPack.Evidence { - switch strings.TrimSpace(evidence.SourceType) { - case "github_star", "github": - score["software"]++ - case "youtube_watch_later", "youtube_liked", "youtube": - score["media"]++ - case "web", "feed_entry": - score["research"]++ + key := strings.TrimSpace(evidence.SourceKey) + if key != "" { + primaryEvidence[key] = struct{}{} } + addCSV(evidence.UserTags, evidenceWeight(key, 25)) } - type ranked struct { - category string - score int + for _, evidence := range turn.ResearchPack.ExactTagEvidence { + key := strings.TrimSpace(evidence.SourceKey) + if _, ok := primaryEvidence[key]; ok { + continue + } + addCSV(evidence.UserTags, evidenceWeight(key, 18)) + } + for _, bucket := range turn.ResearchPack.Coverage.TopUserTags { + weight := bucket.Count * 8 + if weight <= 0 { + weight = 8 + } + if weight > 32 { + weight = 32 + } + addTag(bucket.Key, weight) + } + if len(scores) == 0 { + for _, tag := range turn.ResearchPack.QueryPlan.TagQueries { + addTag(tag, 6) + } + addTag(turn.ResearchPack.Topic, 6) } - var rankings []ranked - for category, value := range score { - if value > 0 { - rankings = append(rankings, ranked{category: category, score: value}) + + rankings := make([]rankedCategory, 0, len(scores)) + for _, ranking := range scores { + if ranking.score > 0 { + rankings = append(rankings, ranking) } } sort.Slice(rankings, func(i, j int) bool { - if rankings[i].score == rankings[j].score { - return rankings[i].category < rankings[j].category + if rankings[i].score != rankings[j].score { + return rankings[i].score > rankings[j].score + } + if rankings[i].first != rankings[j].first { + return rankings[i].first < rankings[j].first } - return rankings[i].score > rankings[j].score + return rankings[i].category < rankings[j].category }) - categories := make([]string, 0, min(3, len(rankings))) + + categories := make([]string, 0, min(maxPublicShareCategories, len(rankings))) for _, ranking := range rankings { categories = append(categories, ranking.category) - if len(categories) == 3 { + if len(categories) == maxPublicShareCategories { break } } - if len(categories) == 0 { - categories = []string{"general"} - } return categories } +func sourceKeysInShareText(text string) map[string]struct{} { + keys := map[string]struct{}{} + for _, key := range shareSourceKeyPattern.FindAllString(text, -1) { + key = strings.TrimSpace(key) + if key != "" { + keys[key] = struct{}{} + } + } + return keys +} + +func sourceKeysFromShareCitations(turn ChatTranscriptTurn) map[string]struct{} { + keys := map[string]struct{}{} + for _, citation := range turn.Citations { + key := strings.TrimSpace(citation.SourceKey) + if key != "" { + keys[key] = struct{}{} + } + } + return keys +} + +func usefulShareCategory(token string) bool { + token = categoryvocab.Normalize(token) + if token == "" { + return false + } + if _, ok := genericShareCategories[token]; ok { + return false + } + return true +} + +var genericShareCategories = map[string]struct{}{ + "ai": {}, + "article": {}, + "articles": {}, + "citation": {}, + "citations": {}, + "evidence": {}, + "general": {}, + "github": {}, + "infrastructure": {}, + "media": {}, + "research": {}, + "security": {}, + "software": {}, + "source": {}, + "sources": {}, + "summary": {}, + "summaries": {}, + "web": {}, + "x": {}, + "youtube": {}, +} + func shareTitle(question string, summary string) string { title := strings.Join(strings.Fields(sanitizeSharedChatContent(question, nil)), " ") if title == "" { @@ -818,7 +923,7 @@ var publicShareTemplate = template.Must(template.New("public-share").Parse(`

{{.Title}}

{{if .Categories}} -
+
{{range .Categories}}{{.}}{{end}}
{{end}} diff --git a/web/chat_shares_test.go b/web/chat_shares_test.go index 1ffaa47..d8b9f94 100644 --- a/web/chat_shares_test.go +++ b/web/chat_shares_test.go @@ -11,6 +11,7 @@ import ( "github.com/darron/dbrain/internal/ask" "github.com/darron/dbrain/internal/brainresearch" + "github.com/darron/dbrain/internal/categoryvocab" "github.com/darron/dbrain/internal/store" ) @@ -56,6 +57,7 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { NotePath: "sources/test-agent-memory.md", Summary: "Summary about durable retrieval.", SourceType: "web", + UserTags: "agent-memory, durable-retrieval, research", }, }, }, @@ -79,9 +81,12 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { if createResponse.URL == "" || !strings.HasPrefix(createResponse.URL, "/share/") || createResponse.Slug == "" { t.Fatalf("unexpected create response: %+v", createResponse) } - if len(createResponse.Categories) == 0 || createResponse.Summary == "" { + if createResponse.Summary == "" { t.Fatalf("expected summary/categories in response: %+v", createResponse) } + if got, want := strings.Join(createResponse.Categories, ","), "agent-memory,durable-retrieval"; got != want { + t.Fatalf("expected evidence-derived share categories %q, got %q", want, got) + } list := httptest.NewRecorder() handler.ServeHTTP(list, httptest.NewRequest(http.MethodGet, "/api/chat/shares", nil)) @@ -102,7 +107,7 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { t.Fatalf("expected public page 200, got %d: %s", public.Code, public.Body.String()) } page := public.Body.String() - for _, want := range []string{"https://example.com/agent-memory", ">https://example.com/agent-memory: Summary about durable retrieval.", "Agent memory systems", "Original URLs", "Summary about durable retrieval.", "

Markdown Heading

", "bold", "code", "href=\"https://example.com/with-backtick\"", "href=\"https://example.com/with-colon\"", "href=\"https://example.com/bracketed\">example.com", "href=\"https://www.example.org/angled\">example.org", "href=\"https://example.net/code-source\">example.net"} { + for _, want := range []string{"https://example.com/agent-memory", ">https://example.com/agent-memory: Summary about durable retrieval.", "Agent memory systems", "Original URLs", "Summary about durable retrieval.", "agent-memory", "durable-retrieval", "

Markdown Heading

", "bold", "code", "href=\"https://example.com/with-backtick\"", "href=\"https://example.com/with-colon\"", "href=\"https://example.com/bracketed\">example.com", "href=\"https://www.example.org/angled\">example.org", "href=\"https://example.net/code-source\">example.net"} { if !strings.Contains(page, want) { t.Fatalf("expected public page to contain %q:\n%s", want, page) } @@ -122,6 +127,7 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { "<https://", "[", + "research", } { if strings.Contains(page, forbidden) { t.Fatalf("public page leaked %q:\n%s", forbidden, page) @@ -147,6 +153,92 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) { } } +func TestCategorizeSharedContentWeightsAnswerCitedEvidenceTags(t *testing.T) { + turn := ChatTranscriptTurn{ + Question: "barcelona toronto police", + Answer: "The key incident involved Toronto police in Barcelona [src:cited]. This answer mentions github, youtube, evidence, source, citation, and software.", + Citations: []brainresearch.Citation{ + {SourceKey: "src:cited", URL: "https://example.com/cited"}, + {SourceKey: "src:included", URL: "https://example.com/included"}, + }, + ResearchPack: brainresearch.Pack{ + Evidence: []ask.Evidence{ + { + SourceKey: "src:included", + SourceType: "github", + UserTags: "software, infrastructure, included-only", + }, + { + SourceKey: "src:cited", + SourceType: "web", + UserTags: "toronto-police, barcelona, criminal-law, research", + }, + }, + }, + } + + got := categorizeSharedContent("old keyword fallback should not matter", turn, categoryvocab.Vocab{}) + want := "toronto-police,barcelona,criminal-law,included-only" + if strings.Join(got, ",") != want { + t.Fatalf("categorizeSharedContent() = %q, want %q", strings.Join(got, ","), want) + } + for _, forbidden := range []string{"research", "software", "media", "infrastructure"} { + if containsString(got, forbidden) { + t.Fatalf("share categories should not include generic %q: %#v", forbidden, got) + } + } +} + +func TestCategorizeSharedContentFallsBackToCoverageTopUserTags(t *testing.T) { + turn := ChatTranscriptTurn{ + Question: "summarize ai harness", + Answer: "No evidence rows carried tags.", + ResearchPack: brainresearch.Pack{ + Coverage: brainresearch.Coverage{ + TopUserTags: []brainresearch.Bucket{ + {Key: "research", Count: 50}, + {Key: "large-language-models", Count: 4}, + {Key: "ai-agents", Count: 3}, + }, + }, + }, + } + + got := categorizeSharedContent("", turn, categoryvocab.Vocab{}) + want := "large-language-models,ai-agents" + if strings.Join(got, ",") != want { + t.Fatalf("categorizeSharedContent() = %q, want %q", strings.Join(got, ","), want) + } +} + +func TestCategorizeSharedContentAppliesCategoryVocabulary(t *testing.T) { + vocab, err := categoryvocab.Parse([]byte(strings.Join([]string{ + "aliases:", + " llm: large-language-models", + " ai-agent: ai-agents", + "drop:", + " - github-repository", + }, "\n"))) + if err != nil { + t.Fatalf("parse vocab: %v", err) + } + turn := ChatTranscriptTurn{ + Answer: "Evidence citation [src:vocab].", + ResearchPack: brainresearch.Pack{ + Evidence: []ask.Evidence{{ + SourceKey: "src:vocab", + UserTags: "LLM, AI Agent, github-repository", + }}, + }, + } + + got := categorizeSharedContent("", turn, vocab) + want := "large-language-models,ai-agents" + if strings.Join(got, ",") != want { + t.Fatalf("categorizeSharedContent() = %q, want %q", strings.Join(got, ","), want) + } +} + func TestChatShareRejectsVerificationFailedTurn(t *testing.T) { cfg, st := openTestStore(t) handler, err := NewHandler(cfg, st) @@ -168,6 +260,15 @@ func TestChatShareRejectsVerificationFailedTurn(t *testing.T) { } } +func containsString(values []string, needle string) bool { + for _, value := range values { + if value == needle { + return true + } + } + return false +} + func TestPublicExternalURLCleansEncodedBackticksAndPunctuation(t *testing.T) { tests := []struct { name string