diff --git a/engine/clients/client_factory.py b/engine/clients/client_factory.py
index c5f47521c..be5d06b25 100644
--- a/engine/clients/client_factory.py
+++ b/engine/clients/client_factory.py
@@ -12,6 +12,7 @@
     ElasticSearcher,
     ElasticUploader,
 )
+from engine.clients.logosdb import LogosDBConfigurator, LogosDBSearcher, LogosDBUploader
 from engine.clients.milvus import MilvusConfigurator, MilvusSearcher, MilvusUploader
 from engine.clients.opensearch import (
     OpenSearchConfigurator,
@@ -42,6 +43,7 @@
 )
 
 ENGINE_CONFIGURATORS = {
+    "logosdb": LogosDBConfigurator,
     "qdrant": QdrantConfigurator,
     "qdrant_native": QdrantNativeConfigurator,
     "qdrant_hybrid": QdrantHybridConfigurator,
@@ -54,6 +56,7 @@
 }
 
 ENGINE_UPLOADERS = {
+    "logosdb": LogosDBUploader,
     "qdrant": QdrantUploader,
     "qdrant_native": QdrantNativeUploader,
     "qdrant_hybrid": QdrantHybridUploader,
@@ -66,6 +69,7 @@
 }
 
 ENGINE_SEARCHERS = {
+    "logosdb": LogosDBSearcher,
     "qdrant": QdrantSearcher,
     "qdrant_native": QdrantNativeSearcher,
     "qdrant_hybrid": QdrantHybridSearcher,
diff --git a/engine/clients/logosdb/__init__.py b/engine/clients/logosdb/__init__.py
new file mode 100644
index 000000000..604a59a49
--- /dev/null
+++ b/engine/clients/logosdb/__init__.py
@@ -0,0 +1,9 @@
+from engine.clients.logosdb.configure import LogosDBConfigurator
+from engine.clients.logosdb.search import LogosDBSearcher
+from engine.clients.logosdb.upload import LogosDBUploader
+
+__all__ = [
+    "LogosDBConfigurator",
+    "LogosDBSearcher",
+    "LogosDBUploader",
+]
diff --git a/engine/clients/logosdb/configure.py b/engine/clients/logosdb/configure.py
new file mode 100644
--- /dev/null
+++ b/engine/clients/logosdb/configure.py
@@ -0,0 +1,65 @@
+import json
+import os
+import shutil
+
+from benchmark.dataset import Dataset
+from engine.base_client.configure import BaseConfigurator
+from engine.base_client.distances import Distance
+
+# Integer metric codes handed to logosdb.DB(distance=...); each trailing
+# comment names the logosdb constant the code is meant to mirror.
+DISTANCE_MAP = {
+    Distance.COSINE: 1,  # logosdb.DIST_COSINE
+    Distance.DOT: 0,  # logosdb.DIST_IP
+    Distance.L2: 2,  # logosdb.DIST_L2
+}
+
+DEFAULT_PATH = "/tmp/logosdb_vdb_bench"
+
+
+class LogosDBConfigurator(BaseConfigurator):
+    """Prepare the on-disk LogosDB store that a benchmark run works against."""
+
+    def __init__(self, host, collection_params: dict, connection_params: dict):
+        super().__init__(host, collection_params, connection_params)
+        self.path = connection_params.get("path", DEFAULT_PATH)
+
+    def clean(self):
+        """Delete the store at ``self.path`` and its ``.meta.json`` sidecar."""
+        # rmtree() only accepts directories, so a store that is a plain file
+        # has to go through os.remove() instead.
+        if os.path.isdir(self.path):
+            shutil.rmtree(self.path)
+        elif os.path.exists(self.path):
+            os.remove(self.path)
+        meta = self.path + ".meta.json"
+        if os.path.exists(meta):
+            os.remove(meta)
+
+    def recreate(self, dataset: Dataset, collection_params):
+        """Open a database for ``dataset`` and record the settings used.
+
+        The dimension, metric code and capacity are written to
+        ``<path>.meta.json``; the uploader and searcher read that file to
+        reopen the database with the same settings.
+
+        Returns an empty dict.
+        """
+        import logosdb
+
+        dim = dataset.config.vector_size
+        # NOTE(review): a metric missing from DISTANCE_MAP silently falls back
+        # to cosine -- confirm that is intended rather than an error.
+        dist = DISTANCE_MAP.get(dataset.config.distance, logosdb.DIST_COSINE)
+        max_elements = collection_params.get("max_elements", 2_000_000)
+
+        db = logosdb.DB(self.path, dim=dim, distance=dist, max_elements=max_elements)
+        # Drop this handle right away; later stages open their own.
+        del db
+
+        with open(self.path + ".meta.json", "w", encoding="utf-8") as f:
+            json.dump(
+                {"dim": dim, "distance": int(dist), "max_elements": max_elements}, f
+            )
+
+        return {}
diff --git a/engine/clients/logosdb/search.py b/engine/clients/logosdb/search.py
new file mode 100644
--- /dev/null
+++ b/engine/clients/logosdb/search.py
@@ -0,0 +1,50 @@
+import json
+from typing import List, Tuple
+
+import numpy as np
+
+from dataset_reader.base_reader import Query
+from engine.base_client.search import BaseSearcher
+
+DEFAULT_PATH = "/tmp/logosdb_vdb_bench"
+
+
+class LogosDBSearcher(BaseSearcher):
+    """Run benchmark queries against a LogosDB database opened from disk."""
+
+    client = None
+
+    @classmethod
+    def init_client(
+        cls, host: str, distance, connection_params: dict, search_params: dict
+    ):
+        """Open the database using the settings stored in ``<path>.meta.json``.
+
+        ``host``, ``distance`` and ``search_params`` are not used.
+        """
+        import logosdb
+
+        path = connection_params.get("path", DEFAULT_PATH)
+        with open(path + ".meta.json", encoding="utf-8") as f:
+            meta = json.load(f)
+        cls.client = logosdb.DB(
+            path,
+            dim=meta["dim"],
+            distance=meta["distance"],
+            max_elements=meta.get("max_elements", 2_000_000),
+        )
+
+    @classmethod
+    def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]:
+        """Return ``(record id, score)`` pairs for the hits of one query."""
+        q = np.array(query.vector, dtype=np.float32)
+        hits = cls.client.search(q, top_k=top)
+        # The uploader stores each record id as the entry's text, so parse
+        # it back to int here.
+        return [(int(h.text), h.score) for h in hits]
+
+    @classmethod
+    def delete_client(cls):
+        """Drop the reference to the open database handle."""
+        # Rebinding to None releases this class's reference to the handle.
+        cls.client = None
diff --git a/engine/clients/logosdb/upload.py b/engine/clients/logosdb/upload.py
new file mode 100644
--- /dev/null
+++ b/engine/clients/logosdb/upload.py
@@ -0,0 +1,57 @@
+import json
+from typing import List
+
+import numpy as np
+
+from dataset_reader.base_reader import Record
+from engine.base_client.upload import BaseUploader
+
+DEFAULT_PATH = "/tmp/logosdb_vdb_bench"
+
+
+class LogosDBUploader(BaseUploader):
+    """Load benchmark records into a LogosDB database opened from disk."""
+
+    client = None
+    upload_params = {}
+
+    @classmethod
+    def init_client(cls, host, distance, connection_params: dict, upload_params: dict):
+        """Open the database using the settings stored in ``<path>.meta.json``.
+
+        ``host`` and ``distance`` are not used; ``upload_params`` is stored
+        on the class.
+        """
+        import logosdb
+
+        path = connection_params.get("path", DEFAULT_PATH)
+        with open(path + ".meta.json", encoding="utf-8") as f:
+            meta = json.load(f)
+        cls.client = logosdb.DB(
+            path,
+            dim=meta["dim"],
+            distance=meta["distance"],
+            max_elements=meta.get("max_elements", 2_000_000),
+        )
+        cls.upload_params = upload_params
+
+    @classmethod
+    def upload_batch(cls, batch: List[Record]):
+        """Insert ``batch``, storing each record id as the entry's text."""
+        if not batch:
+            # An empty batch would yield a 1-D array with no vector dimension.
+            return
+        vectors = np.array([r.vector for r in batch], dtype=np.float32)
+        texts = [str(r.id) for r in batch]
+        cls.client.put_batch(vectors, texts=texts)
+
+    @classmethod
+    def post_upload(cls, distance):
+        """No post-processing is performed; return an empty dict."""
+        return {}
+
+    @classmethod
+    def delete_client(cls):
+        """Drop the reference to the open database handle."""
+        # Rebinding to None releases this class's reference to the handle.
+        cls.client = None
diff --git a/experiments/configurations/logosdb.json b/experiments/configurations/logosdb.json
new file mode 100644
index 000000000..3c5d0e366
--- /dev/null
+++ b/experiments/configurations/logosdb.json
@@ -0,0 +1,22 @@
+[
+  {
+    "name": "logosdb-m16-ef200",
+    "engine": "logosdb",
+    "connection_params": {
+      "path": "/tmp/logosdb_vdb_bench"
+    },
+    "collection_params": {
+      "max_elements": 2000000
+    },
+    "upload_params": {
+      "parallel": 1,
+      "batch_size": 1000
+    },
+    "search_params": [
+      {
+        "parallel": 1,
+        "top": 10
+      }
+    ]
+  }
+]
diff --git a/pyproject.toml b/pyproject.toml
index 79e4c17ad..a2d693e58 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ opensearch-py = "^2.3.2"
 tqdm = "^4.66.1"
 psycopg = {extras = ["binary"], version = "^3.1.17"}
 pgvector = "^0.2.4"
+logosdb = ">=0.9.0"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^2.20.0"