From efecd755463e09c7705a1f6939b13f8a0d058098 Mon Sep 17 00:00:00 2001 From: Alec Thomas Date: Wed, 24 Jun 2026 13:00:33 +1000 Subject: [PATCH] feat: match git branch in breadcrumb agent detection --- README.md | 4 +- src/breadcrumbs.rs | 303 +++++++++++++++++++++++++++++++++------------ src/git.rs | 65 ++++++++++ 3 files changed, 294 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index a7c401f..0bcaa70 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ It finds agents in four ways: 1. It checks for agent-specific environment variables. 2. It walks its own process ancestry, under the assumption that the git commit was initiated by an agent. 3. It walks up the process tree and checks all descendants of siblings at each level, looking for agents working in the same repository. -4. It checks agent-specific state files ("breadcrumbs") to determine if an agent was recently active in this repo (e.g. `~/.claude/projects/`, `~/.codex/sessions/`, `~/.pi/agent/sessions/`). +4. It checks agent-specific state files ("breadcrumbs") to determine if an agent was recently active in this repo (e.g. `~/.claude/projects/`, `~/.codex/sessions/`, `~/.pi/agent/sessions/`). When both the session and the commit record a branch, they must match; this avoids misattributing a commit to an agent that was last used on a different branch. At most one agent is attributed per commit: the first one found, in the order above. If an agent is found, it will append the following git trailer to the git commit: @@ -59,7 +59,7 @@ ln -s /usr/local/bin/aittributor .git/hooks/prepare-commit-msg ## Known limitations -**Process detection is not always possible.** Agents may exit before the commit runs, or use process names that don't match (e.g. Electron-based desktop apps). When process scanning fails, aittributor falls back to agent session history, checking state files for recent activity in the same repo. 
This fallback only works for agents that write scannable state files (currently Claude, Codex, Copilot CLI, and Pi). Some agents like OpenCode store sessions in SQLite, which is not yet supported by the breadcrumb scanner, and it cannot distinguish between an agent that wrote the code being committed and one that was only used for research. The result is a bias toward over-attribution, which is a deliberate tradeoff as undercounting real AI usage is harder to correct after the fact than occasional overcounting. +**Process detection is not always possible.** Agents may exit before the commit runs, or use process names that don't match (e.g. Electron-based desktop apps). When process scanning fails, aittributor falls back to agent session history, checking state files for recent activity in the same repo. This fallback only works for agents that write scannable state files (currently Claude, Codex, Copilot CLI, and Pi). Some agents like OpenCode store sessions in SQLite, which is not yet supported by the breadcrumb scanner. To reduce false positives, the breadcrumb scanner matches the session's git branch against the commit's branch when both are recorded; note that Codex captures the branch only at session start, so it can still misattribute if you remain in one Codex session across a branch switch. Even so, the fallback cannot fully distinguish between an agent that wrote the code being committed and one that was only used for research, so the result is a bias toward over-attribution. This is a deliberate tradeoff, as undercounting real AI usage is harder to correct after the fact than occasional overcounting. **Agent-initiated commits are the most reliable.** Attribution is most accurate when the agent itself runs `git commit`. Manual commits while an agent session is open (or recently closed) are the main source of attribution that may not reflect actual code contribution. 
diff --git a/src/breadcrumbs.rs b/src/breadcrumbs.rs index dd005ad..eb40989 100644 --- a/src/breadcrumbs.rs +++ b/src/breadcrumbs.rs @@ -4,12 +4,32 @@ use std::path::Path; use std::time::SystemTime; use crate::agent::{Agent, KNOWN_AGENTS}; +use crate::git; const CUTOFF_SECS: u64 = 2 * 60 * 60; // 2 hours as a rough approximation /// Maximum number of lines to read from a session file when looking for "cwd". const MAX_LINES_TO_SCAN: usize = 5; +/// Shared inputs for a breadcrumb scan: which repo we're matching, how recent a +/// session must be, and the branch the commit is happening on (if known). +struct ScanContext<'a> { + repo_path: &'a Path, + cutoff: SystemTime, + current_branch: Option<&'a str>, +} + +/// Outcome of scanning a breadcrumb directory for a single agent. +enum SessionScan { + /// A recent session whose cwd and branch both matched the commit. + Matched, + /// A recent session matched the repo but was recorded on a different + /// branch, so it was skipped. Carries that session's branch for debug. + BranchMismatch(String), + /// No recent session for this repo. + None, +} + fn home_dir() -> Option<String> { std::env::var("HOME").ok() } @@ -24,25 +44,49 @@ fn has_extension(path: &Path, ext: &str) -> bool { path.extension().and_then(|e| e.to_str()) == Some(ext) } -fn extract_cwd_from_json(line: &str) -> Option<&str> { - // Simple string extraction: find "cwd":"" - let marker = "\"cwd\":\""; - let start = line.find(marker)? + marker.len(); +/// Extract a JSON string value for `key` from a line via a simple substring +/// scan: find `"<key>":"` and read until the next `"`. This avoids a full JSON +/// parse, which matters because session lines can be very large. +fn extract_json_string<'a>(line: &'a str, key: &str) -> Option<&'a str> { + let marker = format!("\"{}\":\"", key); + let start = line.find(&marker)? 
+ marker.len(); let rest = &line[start..]; let end = rest.find('"')?; Some(&rest[..end]) } +fn extract_cwd_from_json(line: &str) -> Option<&str> { + extract_json_string(line, "cwd") +} + +/// Extract the branch a session was recorded on. Codex stores it as `branch` +/// (inside its `git` object) and Claude as `gitBranch`; we try both. +fn extract_branch_from_json(line: &str) -> Option<&str> { + extract_json_string(line, "gitBranch").or_else(|| extract_json_string(line, "branch")) +} + fn cwd_matches_repo(cwd: &str, repo_path: &Path) -> bool { Path::new(cwd).starts_with(repo_path) } -/// Read the first few lines of a file looking for a "cwd" field that -/// matches the repo path. Returns true on match. -fn file_has_matching_cwd(path: &Path, repo_path: &Path) -> bool { +/// Decide whether a session's branch is compatible with the commit's branch. +/// +/// We only reject when *both* branches are known and differ; if either side is +/// unknown we fall back to cwd-only matching to avoid false negatives. +fn branch_matches(session_branch: Option<&str>, current_branch: Option<&str>) -> bool { + match (session_branch, current_branch) { + (Some(session), Some(current)) => session == current, + _ => true, + } +} + +/// Scan the first few lines of one session file. The `cwd` and branch fields +/// live on the same line (Codex's `session_meta`, each of Claude's messages), +/// so we evaluate both as soon as we find the line carrying `cwd`. 
+fn scan_file(path: &Path, ctx: &ScanContext) -> SessionScan { let file = match fs::File::open(path) { Ok(f) => f, - Err(_) => return false, + Err(_) => return SessionScan::None, }; let reader = std::io::BufReader::new(file); @@ -51,18 +95,29 @@ fn file_has_matching_cwd(path: &Path, repo_path: &Path) -> bool { Ok(l) => l, Err(_) => break, }; - if let Some(cwd) = extract_cwd_from_json(&line) { - return cwd_matches_repo(cwd, repo_path); + let Some(cwd) = extract_cwd_from_json(&line) else { + continue; + }; + if !cwd_matches_repo(cwd, ctx.repo_path) { + return SessionScan::None; } + let session_branch = extract_branch_from_json(&line); + if branch_matches(session_branch, ctx.current_branch) { + return SessionScan::Matched; + } + return SessionScan::BranchMismatch(session_branch.unwrap_or_default().to_string()); } - false + SessionScan::None } -/// Walk nested subdirectories (any depth) looking for recent files whose -/// first few lines contain a "cwd" field matching the repo path. -fn find_session_file_with_cwd(dir: &Path, ext: &str, repo_path: &Path, cutoff: SystemTime) -> bool { +/// Walk nested subdirectories (any depth) looking for a recent session file +/// whose `cwd` matches the repo. A full (cwd + branch) match wins immediately; +/// otherwise we remember any branch-mismatched session so the caller can +/// explain why the agent was skipped. 
+fn scan_breadcrumb_dir(dir: &Path, ext: &str, ctx: &ScanContext) -> SessionScan { let mut dirs_to_visit = vec![dir.to_path_buf()]; + let mut branch_mismatch: Option<String> = None; while let Some(current) = dirs_to_visit.pop() { let entries = match fs::read_dir(&current) { @@ -75,25 +130,24 @@ fn find_session_file_with_cwd(dir: &Path, ext: &str, repo_path: &Path, cutoff: S dirs_to_visit.push(path); continue; } - if !has_extension(&path, ext) || !is_recent(&path, cutoff) { + if !has_extension(&path, ext) || !is_recent(&path, ctx.cutoff) { continue; } - if file_has_matching_cwd(&path, repo_path) { - return true; + match scan_file(&path, ctx) { + SessionScan::Matched => return SessionScan::Matched, + SessionScan::BranchMismatch(branch) => branch_mismatch = Some(branch), + SessionScan::None => {} } } } - false + match branch_mismatch { + Some(branch) => SessionScan::BranchMismatch(branch), + None => SessionScan::None, + } } -fn check_source( - agent: &'static Agent, - repo_path: &Path, - cutoff: SystemTime, - log: &mut Vec<String>, - debug: bool, -) -> bool { +fn check_source(agent: &'static Agent, ctx: &ScanContext, log: &mut Vec<String>, debug: bool) -> bool { let breadcrumb_dir = match agent.breadcrumb_dir { Some(d) => d, None => return false, @@ -112,29 +166,42 @@ fn check_source( return false; } - let matched = find_session_file_with_cwd(&base, breadcrumb_ext, repo_path, cutoff); + let scan = scan_breadcrumb_dir(&base, breadcrumb_ext, ctx); if debug { - if matched { - log.push(format!(" found {} ({})", agent.email, base.display())); - } else { - log.push(format!(" scanned {} (no recent session in repo)", base.display())); + match &scan { + SessionScan::Matched => log.push(format!(" found {} ({})", agent.email, base.display())), + SessionScan::BranchMismatch(branch) => log.push(format!( + " scanned {} (recent session on branch '{}', current '{}') — skipped", + base.display(), + branch, + ctx.current_branch.unwrap_or("unknown") + )), + SessionScan::None => log.push(format!(" scanned {} (no 
recent session in repo)", base.display())), } } - matched + matches!(scan, SessionScan::Matched) } pub fn detect_agents_from_breadcrumbs(repo_path: &Path, log: &mut Vec<String>, debug: bool) -> Vec<&'static Agent> { - let cutoff = SystemTime::now() - std::time::Duration::from_secs(CUTOFF_SECS); + let current_branch = git::current_branch(repo_path); + let ctx = ScanContext { + repo_path, + cutoff: SystemTime::now() - std::time::Duration::from_secs(CUTOFF_SECS), + current_branch: current_branch.as_deref(), + }; let mut agents = Vec::new(); if debug { - log.push("strategy: breadcrumb session files".to_string()); + match ctx.current_branch { + Some(branch) => log.push(format!("strategy: breadcrumb session files (branch: {})", branch)), + None => log.push("strategy: breadcrumb session files (branch: unknown)".to_string()), + } } for agent in KNOWN_AGENTS { - if check_source(agent, repo_path, cutoff, log, debug) { + if check_source(agent, &ctx, log, debug) { agents.push(agent); } } @@ -183,95 +250,147 @@ mod tests { assert!(agents.is_empty()); } + /// Builds a `ScanContext` with a generous recency window for tests. + fn test_ctx<'a>(repo: &'a Path, branch: Option<&'a str>) -> ScanContext<'a> { + ScanContext { + repo_path: repo, + cutoff: SystemTime::now() - std::time::Duration::from_secs(10), + current_branch: branch, + } + } + + #[test] + fn test_extract_branch_from_json() { + // Codex: branch lives inside the git object. + let codex = r#"{"cwd":"/r","git":{"branch":"feature-x"}}"#; + assert_eq!(extract_branch_from_json(codex), Some("feature-x")); + // Claude: camelCase gitBranch. + let claude = r#"{"cwd":"/r","gitBranch":"main"}"#; + assert_eq!(extract_branch_from_json(claude), Some("main")); + // Absent. 
+ assert_eq!(extract_branch_from_json(r#"{"cwd":"/r"}"#), None); + } + #[test] - fn test_file_has_matching_cwd_on_line_1() { + fn test_branch_matches_only_rejects_when_both_known_and_differ() { + assert!(branch_matches(Some("main"), Some("main"))); + assert!(!branch_matches(Some("main"), Some("feature"))); + // Unknown on either side falls back to a match (cwd-only behaviour). + assert!(branch_matches(None, Some("main"))); + assert!(branch_matches(Some("main"), None)); + assert!(branch_matches(None, None)); + } + + #[test] + fn test_scan_file_cwd_on_line_1() { let dir = tempfile::TempDir::new().unwrap(); let path = dir.path().join("session.jsonl"); let mut f = fs::File::create(&path).unwrap(); writeln!(f, r#"{{"type":"session_meta","cwd":"/Users/foo/myrepo"}}"#).unwrap(); - assert!(file_has_matching_cwd(&path, Path::new("/Users/foo/myrepo"))); - assert!(!file_has_matching_cwd(&path, Path::new("/Users/bar/other"))); + assert!(matches!( + scan_file(&path, &test_ctx(Path::new("/Users/foo/myrepo"), None)), + SessionScan::Matched + )); + assert!(matches!( + scan_file(&path, &test_ctx(Path::new("/Users/bar/other"), None)), + SessionScan::None + )); } #[test] - fn test_file_has_matching_cwd_on_line_2() { + fn test_scan_file_cwd_on_line_2() { let dir = tempfile::TempDir::new().unwrap(); let path = dir.path().join("session.jsonl"); let mut f = fs::File::create(&path).unwrap(); writeln!(f, r#"{{"type":"file-history-snapshot","messageId":"abc"}}"#).unwrap(); writeln!(f, r#"{{"type":"user","cwd":"/Users/foo/myrepo"}}"#).unwrap(); - assert!(file_has_matching_cwd(&path, Path::new("/Users/foo/myrepo"))); - assert!(!file_has_matching_cwd(&path, Path::new("/Users/bar/other"))); + assert!(matches!( + scan_file(&path, &test_ctx(Path::new("/Users/foo/myrepo"), None)), + SessionScan::Matched + )); } #[test] - fn test_file_has_matching_cwd_no_cwd_field() { + fn test_scan_file_no_cwd_field() { let dir = tempfile::TempDir::new().unwrap(); let path = dir.path().join("session.jsonl"); let 
mut f = fs::File::create(&path).unwrap(); writeln!(f, r#"{{"type":"something","data":"value"}}"#).unwrap(); writeln!(f, r#"{{"type":"other","data":"value"}}"#).unwrap(); - assert!(!file_has_matching_cwd(&path, Path::new("/Users/foo/myrepo"))); + assert!(matches!( + scan_file(&path, &test_ctx(Path::new("/Users/foo/myrepo"), None)), + SessionScan::None + )); } #[test] - fn test_find_session_file_with_cwd() { + fn test_scan_file_branch_match_and_mismatch() { let dir = tempfile::TempDir::new().unwrap(); - let cutoff = SystemTime::now() - std::time::Duration::from_secs(10); + let path = dir.path().join("session.jsonl"); + let mut f = fs::File::create(&path).unwrap(); + writeln!( + f, + r#"{{"type":"session_meta","cwd":"/Users/foo/myrepo","git":{{"branch":"feature"}}}}"# + ) + .unwrap(); + let repo = Path::new("/Users/foo/myrepo"); + + assert!(matches!( + scan_file(&path, &test_ctx(repo, Some("feature"))), + SessionScan::Matched + )); + assert!(matches!( + scan_file(&path, &test_ctx(repo, Some("main"))), + SessionScan::BranchMismatch(b) if b == "feature" + )); + // Unknown current branch falls back to cwd-only matching. 
+ assert!(matches!(scan_file(&path, &test_ctx(repo, None)), SessionScan::Matched)); + } - // Create nested date dirs + #[test] + fn test_scan_breadcrumb_dir_matches() { + let dir = tempfile::TempDir::new().unwrap(); let day_dir = dir.path().join("2025").join("06").join("15"); fs::create_dir_all(&day_dir).unwrap(); - - // Write a session file with cwd let mut f = fs::File::create(day_dir.join("session.jsonl")).unwrap(); writeln!(f, r#"{{"type":"session_meta","cwd":"/Users/foo/myrepo"}}"#).unwrap(); - // Matching repo - assert!(find_session_file_with_cwd( - dir.path(), - "jsonl", - Path::new("/Users/foo/myrepo"), - cutoff + assert!(matches!( + scan_breadcrumb_dir(dir.path(), "jsonl", &test_ctx(Path::new("/Users/foo/myrepo"), None)), + SessionScan::Matched )); - - // Non-matching repo - assert!(!find_session_file_with_cwd( - dir.path(), - "jsonl", - Path::new("/Users/bar/other"), - cutoff + assert!(matches!( + scan_breadcrumb_dir(dir.path(), "jsonl", &test_ctx(Path::new("/Users/bar/other"), None)), + SessionScan::None )); } #[test] - fn test_find_session_file_with_cwd_rejects_sibling_prefix_repo() { + fn test_scan_breadcrumb_dir_rejects_sibling_prefix_repo() { let dir = tempfile::TempDir::new().unwrap(); - let cutoff = SystemTime::now() - std::time::Duration::from_secs(10); let day_dir = dir.path().join("2025").join("06").join("15"); fs::create_dir_all(&day_dir).unwrap(); - let mut f = fs::File::create(day_dir.join("session.jsonl")).unwrap(); writeln!(f, r#"{{"type":"session_meta","cwd":"/Users/foo/aittributor2"}}"#).unwrap(); - assert!(!find_session_file_with_cwd( - dir.path(), - "jsonl", - Path::new("/Users/foo/aittributor"), - cutoff + assert!(matches!( + scan_breadcrumb_dir( + dir.path(), + "jsonl", + &test_ctx(Path::new("/Users/foo/aittributor"), None) + ), + SessionScan::None )); } #[test] - fn test_find_session_file_with_cwd_matches_monorepo_sibling_subdir() { + fn test_scan_breadcrumb_dir_matches_monorepo_sibling_subdir() { let dir = 
tempfile::TempDir::new().unwrap(); - let cutoff = SystemTime::now() - std::time::Duration::from_secs(10); let day_dir = dir.path().join("2025").join("06").join("15"); fs::create_dir_all(&day_dir).unwrap(); - let mut f = fs::File::create(day_dir.join("session.jsonl")).unwrap(); writeln!( f, @@ -280,11 +399,43 @@ mod tests { .unwrap(); // Commit can run from another folder in the same repo; we match by git root. - assert!(find_session_file_with_cwd( - dir.path(), - "jsonl", - Path::new("/Users/foo/monorepo"), - cutoff + assert!(matches!( + scan_breadcrumb_dir(dir.path(), "jsonl", &test_ctx(Path::new("/Users/foo/monorepo"), None)), + SessionScan::Matched + )); + } + + #[test] + fn test_scan_breadcrumb_dir_prefers_branch_match_over_mismatch() { + let dir = tempfile::TempDir::new().unwrap(); + let day_dir = dir.path().join("2025").join("06").join("15"); + fs::create_dir_all(&day_dir).unwrap(); + let repo = "/Users/foo/myrepo"; + + let mut wrong = fs::File::create(day_dir.join("wrong-branch.jsonl")).unwrap(); + writeln!(wrong, r#"{{"cwd":"{repo}","git":{{"branch":"old"}}}}"#).unwrap(); + let mut right = fs::File::create(day_dir.join("right-branch.jsonl")).unwrap(); + writeln!(right, r#"{{"cwd":"{repo}","git":{{"branch":"current"}}}}"#).unwrap(); + + assert!(matches!( + scan_breadcrumb_dir(dir.path(), "jsonl", &test_ctx(Path::new(repo), Some("current"))), + SessionScan::Matched + )); + } + + #[test] + fn test_scan_breadcrumb_dir_reports_branch_mismatch_when_no_match() { + let dir = tempfile::TempDir::new().unwrap(); + let day_dir = dir.path().join("2025").join("06").join("15"); + fs::create_dir_all(&day_dir).unwrap(); + let repo = "/Users/foo/myrepo"; + + let mut f = fs::File::create(day_dir.join("session.jsonl")).unwrap(); + writeln!(f, r#"{{"cwd":"{repo}","git":{{"branch":"old"}}}}"#).unwrap(); + + assert!(matches!( + scan_breadcrumb_dir(dir.path(), "jsonl", &test_ctx(Path::new(repo), Some("current"))), + SessionScan::BranchMismatch(b) if b == "old" )); } } diff --git 
a/src/git.rs b/src/git.rs index 6d48e59..a421c5e 100644 --- a/src/git.rs +++ b/src/git.rs @@ -3,6 +3,28 @@ use std::path::{Path, PathBuf}; use crate::agent::Agent; +/// Returns the current branch name for the repo, or `None` when it can't be +/// determined (e.g. detached HEAD, or `git` isn't available). +/// +/// Uses `git symbolic-ref --short -q HEAD`, which prints the branch and exits +/// zero on a normal checkout, and prints nothing / exits non-zero on a detached +/// HEAD. We treat both failure and empty output as "unknown". +pub fn current_branch(repo_path: &Path) -> Option<String> { + let output = std::process::Command::new("git") + .arg("-C") + .arg(repo_path) + .args(["symbolic-ref", "--short", "-q", "HEAD"]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let branch = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if branch.is_empty() { None } else { Some(branch) } +} + pub fn find_git_root(start_path: &Path) -> Option<PathBuf> { let mut current = start_path.to_path_buf(); @@ -65,3 +87,46 @@ pub fn append_trailers(commit_msg_file: &PathBuf, agent: &Agent, debug: bool) -> Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + #[test] + fn test_current_branch_reads_checked_out_branch() { + let dir = TempDir::new().unwrap(); + let path = dir.path(); + + // `git init` followed by pointing HEAD at a branch gives a deterministic + // branch name without needing any commits or user git config. 
+ assert!( + Command::new("git") + .arg("-C") + .arg(path) + .arg("init") + .arg("-q") + .status() + .unwrap() + .success() + ); + assert!( + Command::new("git") + .arg("-C") + .arg(path) + .args(["symbolic-ref", "HEAD", "refs/heads/feature-x"]) + .status() + .unwrap() + .success() + ); + + assert_eq!(current_branch(path).as_deref(), Some("feature-x")); + } + + #[test] + fn test_current_branch_none_outside_repo() { + let dir = TempDir::new().unwrap(); + assert_eq!(current_branch(dir.path()), None); + } +}