Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions src/harmony/matching/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import os
import pathlib
import statistics
import warnings
from collections import Counter, OrderedDict
from typing import List, Callable, Optional, Union

Expand Down Expand Up @@ -57,6 +58,14 @@
DetectorFactory.seed = 0


# Single message text passed to warnings.warn(..., DeprecationWarning) by the
# deprecated catalogue-matching functions in this module.
_CATALOGUE_DEPRECATION_MESSAGE = (
    "The catalogue-matching code path is deprecated "
    "and will be removed in a future release. "
    "The hosted catalogue search is now backed by a Weaviate index "
    "(see https://harmonydata.ac.uk/search). "
    "This function is retained only for backwards compatibility "
    "with existing callers."
)


# Tested on a production server with 16 GB of RAM; 1000 seems to be a safe default (TW, 15 Dec 2024)
def get_batch_size(default=1000):
try:
Expand Down Expand Up @@ -216,6 +225,11 @@ def match_instruments_with_catalogue_instruments(
"""
Match instruments with catalogue instruments.

.. deprecated::
The catalogue path was replaced by a Weaviate index
(https://harmonydata.ac.uk/search) because it did not scale. This
function will be removed in a future release.

:param instruments: The instruments.
:param catalogue_data: The catalogue data.
:param vectorisation_function: A function to vectorize a text.
Expand All @@ -224,6 +238,8 @@ def match_instruments_with_catalogue_instruments(
Index 1 in the tuple contains a list of closest instrument matches from the catalog for all the instruments.
"""

warnings.warn(_CATALOGUE_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2)

# Gather all questions
all_questions: List[str] = []
for instrument in instruments:
Expand Down Expand Up @@ -275,6 +291,11 @@ def match_questions_with_catalogue_instruments(
Each question from the list will receive the closest instrument match for it.
The closest instrument match for all questions is returned as a result of this function.

.. deprecated::
The catalogue path was replaced by a Weaviate index
(https://harmonydata.ac.uk/search) because it did not scale. This
function will be removed in a future release.

:param questions: The questions.
:param catalogue_data: The catalogue data.
:param all_instruments_text_vectors: A list of text vectors of all questions found in all the instruments uploaded.
Expand All @@ -283,6 +304,8 @@ def match_questions_with_catalogue_instruments(
:return: A list of closest instrument matches for the questions provided.
"""

warnings.warn(_CATALOGUE_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2)

# Catalogue data
catalogue_instrument_idx_to_catalogue_questions_idx: List[List[int]] = catalogue_data[
"instrument_idx_to_question_idx"
Expand Down Expand Up @@ -516,6 +539,11 @@ def match_query_with_catalogue_instruments(
"""
Match query with catalogue instruments.

.. deprecated::
The catalogue path was replaced by a Weaviate index
(https://harmonydata.ac.uk/search) because it did not scale. This
function will be removed in a future release.

:param query: The query.
:param catalogue_data: The catalogue data.
:param vectorisation_function: A function to vectorize a text.
Expand All @@ -525,6 +553,8 @@ def match_query_with_catalogue_instruments(
E.g. {"instruments": [...], "new_text_vectors": {...}}.
"""

warnings.warn(_CATALOGUE_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2)

response = {"instruments": [], "new_text_vectors": {}}

# Catalogue data
Expand Down
99 changes: 99 additions & 0 deletions tests/test_catalogue_deprecation_warnings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Verify the three catalogue-matching entry points emit DeprecationWarning.

The catalogue path was replaced by a Weaviate
index and these functions are slated for removal. They must emit a
DeprecationWarning whose message references Weaviate so library users can
discover the replacement.
"""
import sys
import warnings

import numpy as np

sys.path.append("../src")

from harmony.matching.matcher import (
match_instruments_with_catalogue_instruments,
match_query_with_catalogue_instruments,
match_questions_with_catalogue_instruments,
)
from harmony.schemas.requests.text import Instrument, Question
from harmony.schemas.text_vector import TextVector


def _minimal_catalogue_data():
    """Return a tiny catalogue dict: one instrument, one question, one 2-d embedding.

    A fresh dict (and a fresh numpy array) is built on every call, so callers
    can mutate the result without affecting one another.

    :return: A dict with the keys ``instrument_idx_to_question_idx``,
        ``all_embeddings_concatenated``, ``all_instruments`` and ``all_questions``.
    """
    only_instrument = {"instrument_name": "X", "metadata": {"source": "ref"}}
    unit_embedding = np.array([[1.0, 0.0]])
    catalogue = dict(
        instrument_idx_to_question_idx=[[0]],
        all_embeddings_concatenated=unit_embedding,
        all_instruments=[only_instrument],
        all_questions=["q"],
    )
    return catalogue


def _assert_deprecation_mentions_weaviate(records):
    """Assert that ``records`` holds a DeprecationWarning whose text mentions Weaviate.

    :param records: Recorded warnings, e.g. the list yielded by
        ``warnings.catch_warnings(record=True)``; each entry must expose
        ``category`` and ``message``.
    :raises AssertionError: If there is no DeprecationWarning (or subclass) at
        all, or if none of them contains "weaviate" (case-insensitive).
    """
    # Keep only the text of deprecation-class warnings; other categories are ignored.
    deprecation_messages = [
        str(entry.message)
        for entry in records
        if issubclass(entry.category, DeprecationWarning)
    ]
    assert deprecation_messages, "expected at least one DeprecationWarning"

    mentions_weaviate = False
    for text in deprecation_messages:
        if "weaviate" in text.lower():
            mentions_weaviate = True
            break
    assert mentions_weaviate, (
        f"DeprecationWarning should mention Weaviate; got: "
        f"{deprecation_messages}"
    )


def test_match_questions_with_catalogue_instruments_is_deprecated():
    """match_questions_with_catalogue_instruments must emit a Weaviate-mentioning DeprecationWarning."""
    single_question = Question(question_text="q")
    single_vector = TextVector(
        text="q", vector=[1.0, 0.0], is_negated=False, is_query=False
    )

    with warnings.catch_warnings(record=True) as captured:
        # Record every warning, regardless of the filters active in the test run.
        warnings.simplefilter("always")
        try:
            call_kwargs = {
                "questions": [single_question],
                "catalogue_data": _minimal_catalogue_data(),
                "all_instruments_text_vectors": [single_vector],
                "questions_are_from_one_instrument": True,
            }
            match_questions_with_catalogue_instruments(**call_kwargs)
        except Exception:
            # Only the warning is under test; any error raised by the call
            # itself on this minimal fixture is deliberately ignored.
            pass

    _assert_deprecation_mentions_weaviate(captured)


def test_match_query_with_catalogue_instruments_is_deprecated():
    """match_query_with_catalogue_instruments must emit a Weaviate-mentioning DeprecationWarning."""

    def constant_vectoriser(texts):
        # Stub vectoriser: the same 2-d unit vector for every input text.
        return np.array([[1.0, 0.0]] * len(texts))

    with warnings.catch_warnings(record=True) as captured:
        # Record every warning, regardless of the filters active in the test run.
        warnings.simplefilter("always")
        try:
            call_kwargs = {
                "query": "hello",
                "catalogue_data": _minimal_catalogue_data(),
                "vectorisation_function": constant_vectoriser,
                "texts_cached_vectors": {},
            }
            match_query_with_catalogue_instruments(**call_kwargs)
        except Exception:
            # Only the warning is under test; any error raised by the call
            # itself on this minimal fixture is deliberately ignored.
            pass

    _assert_deprecation_mentions_weaviate(captured)


def test_match_instruments_with_catalogue_instruments_is_deprecated():
    """match_instruments_with_catalogue_instruments must emit a Weaviate-mentioning DeprecationWarning."""
    sole_instrument = Instrument(
        file_id="f",
        instrument_id="i",
        instrument_name="I",
        file_name="f.pdf",
        file_type="pdf",
        file_section="s",
        language="en",
        questions=[Question(question_text="q")],
    )

    def constant_vectoriser(texts):
        # Stub vectoriser: the same 2-d unit vector for every input text.
        return np.array([[1.0, 0.0]] * len(texts))

    with warnings.catch_warnings(record=True) as captured:
        # Record every warning, regardless of the filters active in the test run.
        warnings.simplefilter("always")
        try:
            call_kwargs = {
                "instruments": [sole_instrument],
                "catalogue_data": _minimal_catalogue_data(),
                "vectorisation_function": constant_vectoriser,
                "texts_cached_vectors": {},
                "is_negate": False,
            }
            match_instruments_with_catalogue_instruments(**call_kwargs)
        except Exception:
            # Only the warning is under test; any error raised by the call
            # itself on this minimal fixture is deliberately ignored.
            pass

    _assert_deprecation_mentions_weaviate(captured)
Loading
Loading