Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0a9980f
add specificity utility
May 19, 2026
e99177a
refactor example, add compute_bitmap
May 20, 2026
ce447a3
commit to switch
May 20, 2026
e039051
work out kinks in OrderedFloat
May 21, 2026
8b47aef
undo change in docstring
May 21, 2026
962fdf6
Potential fix for pull request finding
magdalendobson May 22, 2026
07cb5bd
Potential fix for pull request finding
magdalendobson May 22, 2026
80574ca
fix label-filter accelerator doc id mapping for inverted index
Copilot May 22, 2026
ecc3895
fix: use document ids for numeric btree accelerator postings
Copilot May 22, 2026
4fe2935
fix: guard compute_specificities against empty base labels
Copilot May 22, 2026
2161a1d
Avoid cloning/silencing errors in query accelerator build
Copilot May 22, 2026
164f4b9
change format
May 22, 2026
cce1a8a
fmt
May 22, 2026
8c20dc9
Merge branch 'main' of github.com:microsoft/DiskANN into users/magdal…
May 22, 2026
8f22beb
Revert "Avoid cloning/silencing errors in query accelerator build"
May 22, 2026
f09f9b8
Revert "fmt"
May 22, 2026
1696ee3
Reapply "Avoid cloning/silencing errors in query accelerator build"
May 22, 2026
d631aae
small changes
May 22, 2026
38688be
Revert "small changes"
May 22, 2026
9dcca30
fmt
May 22, 2026
7553b99
fix clippy, fmt
May 22, 2026
9676dac
update groundtruth calculation to use fast bitmap computation
May 22, 2026
17d069e
reduce repeated code, reduce instances of pub
May 22, 2026
287d438
remove roaring
May 22, 2026
43aefb3
remove crate
May 22, 2026
50f274b
Revert "remove crate"
May 22, 2026
1f774bb
remove crate
May 22, 2026
07b2d2b
move to diskann-tools
May 22, 2026
0713244
remove from toml file
May 22, 2026
c205a87
add check that i64 -> f64 conversion is lossless
May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions diskann-label-filter/src/utils/flatten_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ impl Document {
}

/// Configurable version that uses FlattenConfig
///
/// Delegates to `flatten_json_pointers_map_with_config`, passing this document's `label`
/// together with the caller-supplied `config`, and returns the resulting `Attributes`.
///
/// For example, with config.separator=".": {"details": {"color": "red"}} becomes {".details.color": "red"}
/// For example, with config.separator="/": {"details": {"color": "red"}} becomes {"/details/color": "red"}
pub fn flatten_metadata_with_config(&self, config: &FlattenConfig) -> Attributes {
flatten_json_pointers_map_with_config(&self.label, config)
}
Expand Down Expand Up @@ -191,7 +191,7 @@ pub fn flatten_json_pointers(value: &Value) -> AttributesVec {
/// Configurable version that uses FlattenConfig
///
/// Example:
/// With config.separator="/": {"a": {"b": [1, 2]}} -> [ ("/a/b/0", 1), ("/a/b/1", 2) ]
/// With config.separator=".": {"a": {"b": [1, 2]}} -> [ (".a.b.0", 1), (".a.b.1", 2) ]
pub fn flatten_json_pointers_with_config(value: &Value, config: &FlattenConfig) -> AttributesVec {
let mut out = Vec::new();
flatten_json_pointer_inner(value, &config.root_prefix, &mut out, &config.separator);
Expand Down
1 change: 1 addition & 0 deletions diskann-tools/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ bit-set.workspace = true
anyhow.workspace = true
itertools.workspace = true
diskann-label-filter.workspace = true
serde_json.workspace = true

[dev-dependencies]
rstest.workspace = true
Expand Down
105 changes: 105 additions & 0 deletions diskann-tools/src/bin/compute_specificities.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT license.
*/

use diskann_label_filter::{read_and_parse_queries, read_baselabels};
use diskann_tools::utils::compute_bitmap::compute_query_bitmaps;
use std::env;
use std::fs::File;
use std::io::Write;
use std::process;

fn main() {
let args: Vec<String> = env::args().collect();
if args.len() != 3 && args.len() != 4 {
eprintln!(
"Usage: {} <base_label_file> <query_label_file> [specificity_output_file]",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would prefer input intake with argparse. This is error prone.

args[0]
);
process::exit(1);
}
let base_label_file = &args[1];
let query_label_file = &args[2];
let output_file = if args.len() == 4 {
Some(&args[3])
} else {
None
};

let base_labels = match read_baselabels(base_label_file) {
Ok(labels) => labels,
Err(e) => {
eprintln!("Error reading base labels: {}", e);
process::exit(1);
}
};

let total_base = base_labels.len() as u64;
if total_base == 0 {
eprintln!("Base labels are empty: cannot compute specificities.");
process::exit(1);
}

let query_labels = match read_and_parse_queries(query_label_file) {
Ok(queries) => queries,
Err(e) => {
eprintln!("Error reading query labels: {}", e);
process::exit(1);
}
};

let start = std::time::Instant::now();
let bitmaps = match compute_query_bitmaps(base_labels, query_labels) {
Ok(b) => b,
Err(e) => {
eprintln!("Error computing bitmaps: {}", e);
process::exit(1);
}
};
let elapsed = start.elapsed();
println!("Computing bitmap took {:.3?} seconds", elapsed);

let mut specificities: Vec<f64> = bitmaps
.iter()
.map(|bitmap| {
let count = bitmap.len();
count as f64 / total_base as f64
})
.collect();
Comment thread
magdalendobson marked this conversation as resolved.

if let Some(path) = output_file {
let mut file = match File::create(path) {
Ok(f) => f,
Err(e) => {
eprintln!("Failed to create output file {}: {}", path, e);
process::exit(1);
}
};
for spec in &specificities {
if let Err(e) = writeln!(file, "{:.6}", spec) {
eprintln!("Failed to write to output file: {}", e);
process::exit(1);
}
}
println!("Specificities written to {}", path);
}

if !specificities.is_empty() {
specificities.sort_by(|a, b| a.partial_cmp(b).unwrap());
let min = specificities[0];
let max = specificities[specificities.len() - 1];
let median = if specificities.len().is_multiple_of(2) {
let mid = specificities.len() / 2;
(specificities[mid - 1] + specificities[mid]) / 2.0
} else {
specificities[specificities.len() / 2]
};
let avg = specificities.iter().sum::<f64>() / specificities.len() as f64;
println!("\nSpecificity stats:");
println!(" average: {:.6}", avg);
println!(" median: {:.6}", median);
println!(" min: {:.6}", min);
println!(" max: {:.6}", max);
}
}
Loading
Loading