diff --git a/CHANGELOG.md b/CHANGELOG.md index e00f28d..2dd85fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ development date for the change set. ## Recent Improvements +### Public Chat Share Topics (2026-06-15) + +- **Fixed**: Public chat share topic chips now come from weighted research-pack + evidence tags, prioritizing answer-cited sources and applying the configured + category vocabulary instead of using generic keyword/source-type guesses. +- **Hardening**: Share topic derivation now suppresses generic labels such as + `research`, `media`, `software`, and source-platform noise, with regression + coverage for citation weighting, coverage fallback, and vocabulary cleanup. +- **Location**: `web/chat_shares.go`, `web/chat_shares_test.go` + ### Open Knowledge Format Export (2026-06-14) - **OKF export**: Added `dbrain okf export` and `dbrain okf validate` for diff --git a/web/chat_shares.go b/web/chat_shares.go index 2db632d..a007824 100644 --- a/web/chat_shares.go +++ b/web/chat_shares.go @@ -12,6 +12,7 @@ import ( "strings" "time" + "github.com/darron/dbrain/internal/categoryvocab" "github.com/darron/dbrain/internal/store" "github.com/yuin/goldmark" "github.com/yuin/goldmark/extension" @@ -20,8 +21,9 @@ import ( ) const ( - localShareOwnerProvider = "local" - localShareOwnerSubject = "local" + localShareOwnerProvider = "local" + localShareOwnerSubject = "local" + maxPublicShareCategories = 4 ) var ( @@ -84,7 +86,8 @@ func (s *server) handleChatShareCreate(w http.ResponseWriter, r *http.Request) { return } - input := buildPublicChatShareInput(owner, req.Turn) + vocab, _ := categoryvocab.Load(s.cfg.CategoriesPath) + input := buildPublicChatShareInput(owner, req.Turn, vocab) share, err := s.store.SavePublicChatShare(r.Context(), input) if err != nil { writeError(w, http.StatusInternalServerError, err) @@ -175,12 +178,12 @@ func (s *server) chatShareOwner(r *http.Request) (chatShareOwner, bool) { }, true } -func buildPublicChatShareInput(owner 
chatShareOwner, turn ChatTranscriptTurn) store.PublicChatShareInput { +func buildPublicChatShareInput(owner chatShareOwner, turn ChatTranscriptTurn, vocab categoryvocab.Vocab) store.PublicChatShareInput { keyURLs := sourceKeyURLMap(turn) content := sanitizeSharedChatContent(turn.Answer, keyURLs) originalURLs := mergeShareURLs(collectOriginalURLs(turn), extractExternalURLs(content)) summary := summarizeSharedContent(content) - categories := categorizeSharedContent(content, turn) + categories := categorizeSharedContent(content, turn, vocab) title := shareTitle(turn.Question, summary) metadata := publicChatShareMetadata{ Question: sanitizeSharedChatContent(turn.Question, keyURLs), @@ -473,63 +476,165 @@ func summarizeSharedContent(content string) string { return strings.TrimSpace(string(runes[:cut])) + "..." } -func categorizeSharedContent(content string, turn ChatTranscriptTurn) []string { - lower := strings.ToLower(content + " " + turn.Question) - score := map[string]int{} - keywords := map[string][]string{ - "ai": {"agent", "model", "llm", "prompt", "inference", "retrieval", "embedding"}, - "software": {"code", "api", "database", "sqlite", "server", "github", "deploy", "bug", "test"}, - "infrastructure": {"tailscale", "kubernetes", "docker", "cloudflare", "s3", "r2", "oauth", "auth"}, - "media": {"video", "audio", "ocr", "transcript", "image", "youtube"}, - "research": {"evidence", "source", "citation", "summary", "study", "article"}, - "security": {"token", "secret", "vulnerability", "exploit", "malware", "phishing"}, - } - for category, terms := range keywords { - for _, term := range terms { - if strings.Contains(lower, term) { - score[category]++ +func categorizeSharedContent(_ string, turn ChatTranscriptTurn, vocab categoryvocab.Vocab) []string { + type rankedCategory struct { + category string + score int + first int + } + + scores := map[string]rankedCategory{} + sequence := 0 + answerCited := sourceKeysInShareText(turn.Answer) + citationKeys := 
sourceKeysFromShareCitations(turn) + primaryEvidence := map[string]struct{}{} + + addTag := func(raw string, weight int) { + for _, token := range vocab.ApplyToTokens([]string{raw}) { + if !usefulShareCategory(token) { + continue } + current, ok := scores[token] + if !ok { + current = rankedCategory{category: token, first: sequence} + sequence++ + } + current.score += weight + scores[token] = current + } + } + addCSV := func(raw string, weight int) { + for _, token := range strings.Split(raw, ",") { + addTag(token, weight) } } + evidenceWeight := func(sourceKey string, base int) int { + sourceKey = strings.TrimSpace(sourceKey) + if sourceKey == "" { + return base + } + if _, ok := answerCited[sourceKey]; ok { + return 100 + } + if _, ok := citationKeys[sourceKey]; ok { + return 60 + } + return base + } + for _, evidence := range turn.ResearchPack.Evidence { - switch strings.TrimSpace(evidence.SourceType) { - case "github_star", "github": - score["software"]++ - case "youtube_watch_later", "youtube_liked", "youtube": - score["media"]++ - case "web", "feed_entry": - score["research"]++ + key := strings.TrimSpace(evidence.SourceKey) + if key != "" { + primaryEvidence[key] = struct{}{} } + addCSV(evidence.UserTags, evidenceWeight(key, 25)) } - type ranked struct { - category string - score int + for _, evidence := range turn.ResearchPack.ExactTagEvidence { + key := strings.TrimSpace(evidence.SourceKey) + if _, ok := primaryEvidence[key]; ok { + continue + } + addCSV(evidence.UserTags, evidenceWeight(key, 18)) + } + for _, bucket := range turn.ResearchPack.Coverage.TopUserTags { + weight := bucket.Count * 8 + if weight <= 0 { + weight = 8 + } + if weight > 32 { + weight = 32 + } + addTag(bucket.Key, weight) + } + if len(scores) == 0 { + for _, tag := range turn.ResearchPack.QueryPlan.TagQueries { + addTag(tag, 6) + } + addTag(turn.ResearchPack.Topic, 6) } - var rankings []ranked - for category, value := range score { - if value > 0 { - rankings = append(rankings, 
ranked{category: category, score: value}) + + rankings := make([]rankedCategory, 0, len(scores)) + for _, ranking := range scores { + if ranking.score > 0 { + rankings = append(rankings, ranking) } } sort.Slice(rankings, func(i, j int) bool { - if rankings[i].score == rankings[j].score { - return rankings[i].category < rankings[j].category + if rankings[i].score != rankings[j].score { + return rankings[i].score > rankings[j].score + } + if rankings[i].first != rankings[j].first { + return rankings[i].first < rankings[j].first } - return rankings[i].score > rankings[j].score + return rankings[i].category < rankings[j].category }) - categories := make([]string, 0, min(3, len(rankings))) + + categories := make([]string, 0, min(maxPublicShareCategories, len(rankings))) for _, ranking := range rankings { categories = append(categories, ranking.category) - if len(categories) == 3 { + if len(categories) == maxPublicShareCategories { break } } - if len(categories) == 0 { - categories = []string{"general"} - } return categories } +func sourceKeysInShareText(text string) map[string]struct{} { + keys := map[string]struct{}{} + for _, key := range shareSourceKeyPattern.FindAllString(text, -1) { + key = strings.TrimSpace(key) + if key != "" { + keys[key] = struct{}{} + } + } + return keys +} + +func sourceKeysFromShareCitations(turn ChatTranscriptTurn) map[string]struct{} { + keys := map[string]struct{}{} + for _, citation := range turn.Citations { + key := strings.TrimSpace(citation.SourceKey) + if key != "" { + keys[key] = struct{}{} + } + } + return keys +} + +func usefulShareCategory(token string) bool { + token = categoryvocab.Normalize(token) + if token == "" { + return false + } + if _, ok := genericShareCategories[token]; ok { + return false + } + return true +} + +var genericShareCategories = map[string]struct{}{ + "ai": {}, + "article": {}, + "articles": {}, + "citation": {}, + "citations": {}, + "evidence": {}, + "general": {}, + "github": {}, + "infrastructure": {}, 
+ "media": {}, + "research": {}, + "security": {}, + "software": {}, + "source": {}, + "sources": {}, + "summary": {}, + "summaries": {}, + "web": {}, + "x": {}, + "youtube": {}, +} + func shareTitle(question string, summary string) string { title := strings.Join(strings.Fields(sanitizeSharedChatContent(question, nil)), " ") if title == "" { @@ -818,7 +923,7 @@ var publicShareTemplate = template.Must(template.New("public-share").Parse(`
code", "href=\"https://example.com/with-backtick\"", "href=\"https://example.com/with-colon\"", "href=\"https://example.com/bracketed\">example.com", "href=\"https://www.example.org/angled\">example.org", "href=\"https://example.net/code-source\">example.net"} {
+ for _, want := range []string{"https://example.com/agent-memory", ">https://example.com/agent-memory: Summary about durable retrieval.", "Agent memory systems", "Original URLs", "Summary about durable retrieval.", "agent-memory", "durable-retrieval", "code", "href=\"https://example.com/with-backtick\"", "href=\"https://example.com/with-colon\"", "href=\"https://example.com/bracketed\">example.com", "href=\"https://www.example.org/angled\">example.org", "href=\"https://example.net/code-source\">example.net"} {
if !strings.Contains(page, want) {
t.Fatalf("expected public page to contain %q:\n%s", want, page)
}
@@ -122,6 +127,7 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) {
"<https://",
"[",
+ "research",
} {
if strings.Contains(page, forbidden) {
t.Fatalf("public page leaked %q:\n%s", forbidden, page)
@@ -147,6 +153,92 @@ func TestChatShareCreateListAndPublicPageRedactsInternals(t *testing.T) {
}
}
+// TestCategorizeSharedContentWeightsAnswerCitedEvidenceTags checks that tags
+// from the evidence whose source key appears in the answer text are ranked
+// ahead of tags from evidence that is only listed in the citations, and that
+// generic labels are never returned as share categories.
+func TestCategorizeSharedContentWeightsAnswerCitedEvidenceTags(t *testing.T) {
+	// The citation-only source is placed first in the pack so the expected
+	// ordering cannot be explained by evidence order alone.
+	citationOnly := ask.Evidence{
+		SourceKey:  "src:included",
+		SourceType: "github",
+		UserTags:   "software, infrastructure, included-only",
+	}
+	citedInAnswer := ask.Evidence{
+		SourceKey:  "src:cited",
+		SourceType: "web",
+		UserTags:   "toronto-police, barcelona, criminal-law, research",
+	}
+	turn := ChatTranscriptTurn{
+		Question: "barcelona toronto police",
+		Answer:   "The key incident involved Toronto police in Barcelona [src:cited]. This answer mentions github, youtube, evidence, source, citation, and software.",
+		Citations: []brainresearch.Citation{
+			{SourceKey: "src:cited", URL: "https://example.com/cited"},
+			{SourceKey: "src:included", URL: "https://example.com/included"},
+		},
+		ResearchPack: brainresearch.Pack{
+			Evidence: []ask.Evidence{citationOnly, citedInAnswer},
+		},
+	}
+
+	got := categorizeSharedContent("old keyword fallback should not matter", turn, categoryvocab.Vocab{})
+
+	const want = "toronto-police,barcelona,criminal-law,included-only"
+	if joined := strings.Join(got, ","); joined != want {
+		t.Fatalf("categorizeSharedContent() = %q, want %q", joined, want)
+	}
+	// Explicit guard so a future change to want cannot let generic labels back in.
+	generic := []string{"research", "software", "media", "infrastructure"}
+	for _, forbidden := range generic {
+		if containsString(got, forbidden) {
+			t.Fatalf("share categories should not include generic %q: %#v", forbidden, got)
+		}
+	}
+}
+
+// TestCategorizeSharedContentFallsBackToCoverageTopUserTags checks that, when
+// the research pack carries no evidence rows, categories are taken from the
+// coverage top-user-tag buckets and the generic "research" bucket is left out
+// even though it has the largest count.
+func TestCategorizeSharedContentFallsBackToCoverageTopUserTags(t *testing.T) {
+	topTags := []brainresearch.Bucket{
+		{Key: "research", Count: 50},
+		{Key: "large-language-models", Count: 4},
+		{Key: "ai-agents", Count: 3},
+	}
+	turn := ChatTranscriptTurn{
+		Question: "summarize ai harness",
+		Answer:   "No evidence rows carried tags.",
+		ResearchPack: brainresearch.Pack{
+			Coverage: brainresearch.Coverage{TopUserTags: topTags},
+		},
+	}
+
+	const want = "large-language-models,ai-agents"
+	got := strings.Join(categorizeSharedContent("", turn, categoryvocab.Vocab{}), ",")
+	if got != want {
+		t.Fatalf("categorizeSharedContent() = %q, want %q", got, want)
+	}
+}
+
+// TestCategorizeSharedContentAppliesCategoryVocabulary checks that evidence
+// tags are run through the supplied vocabulary: the expected output requires
+// aliased tags to come back under their canonical names and the tag on the
+// drop list to be absent.
+func TestCategorizeSharedContentAppliesCategoryVocabulary(t *testing.T) {
+	rules := []string{
+		"aliases:",
+		" llm: large-language-models",
+		" ai-agent: ai-agents",
+		"drop:",
+		" - github-repository",
+	}
+	vocab, err := categoryvocab.Parse([]byte(strings.Join(rules, "\n")))
+	if err != nil {
+		t.Fatalf("parse vocab: %v", err)
+	}
+
+	evidence := ask.Evidence{
+		SourceKey: "src:vocab",
+		UserTags:  "LLM, AI Agent, github-repository",
+	}
+	turn := ChatTranscriptTurn{
+		Answer:       "Evidence citation [src:vocab].",
+		ResearchPack: brainresearch.Pack{Evidence: []ask.Evidence{evidence}},
+	}
+
+	const want = "large-language-models,ai-agents"
+	got := strings.Join(categorizeSharedContent("", turn, vocab), ",")
+	if got != want {
+		t.Fatalf("categorizeSharedContent() = %q, want %q", got, want)
+	}
+}
+
func TestChatShareRejectsVerificationFailedTurn(t *testing.T) {
cfg, st := openTestStore(t)
handler, err := NewHandler(cfg, st)
@@ -168,6 +260,15 @@ func TestChatShareRejectsVerificationFailedTurn(t *testing.T) {
}
}
+// containsString reports whether needle is equal to any element of values.
+// A nil or empty slice never contains the needle.
+func containsString(values []string, needle string) bool {
+	for i := range values {
+		if values[i] == needle {
+			return true
+		}
+	}
+	return false
+}
+
func TestPublicExternalURLCleansEncodedBackticksAndPunctuation(t *testing.T) {
tests := []struct {
name string