From a43a9f4a17af455cdbd263090dc0b9e22e08f712 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Tue, 26 May 2026 18:21:43 +0200 Subject: [PATCH 1/3] PropMatch Aligner extension: adding feature for LLM/RAG alignment technique --- examples/property_alignment/propmatch_llm.py | 142 +++++++++++++++++++ examples/property_alignment/propmatch_rag.py | 96 +++++++++++++ ontoaligner/aligner/llm/dataset.py | 105 ++++++++++++++ ontoaligner/aligner/rag/dataset.py | 113 ++++++++++++++- ontoaligner/aligner/rag/rag.py | 2 +- ontoaligner/encoder/llm.py | 89 ++++++++++++ ontoaligner/encoder/rag.py | 42 +++++- 7 files changed, 586 insertions(+), 3 deletions(-) create mode 100644 examples/property_alignment/propmatch_llm.py create mode 100644 examples/property_alignment/propmatch_rag.py diff --git a/examples/property_alignment/propmatch_llm.py b/examples/property_alignment/propmatch_llm.py new file mode 100644 index 0000000..32163ff --- /dev/null +++ b/examples/property_alignment/propmatch_llm.py @@ -0,0 +1,142 @@ +import json +from torch.utils.data import DataLoader +from tqdm import tqdm +from sklearn.linear_model import LogisticRegression + +from ontoaligner.ontology import PropertyOMDataset +from ontoaligner.encoder import PropMatchEncoder +from ontoaligner.aligner import AutoModelDecoderLLM +from ontoaligner.aligner import PropertyFullTextLLMDataset + +from ontoaligner.postprocess import TFIDFLabelMapper, llm_postprocessor +from ontoaligner.utils import metrics, xmlify + +# --------------------------------------------------------- +# Step 1: Initialize the property ontology matching task +# --------------------------------------------------------- +task = PropertyOMDataset() + +print("Property Matching Task:", task) + +# --------------------------------------------------------- +# Step 2: Collect source ontology, target ontology, and references +# --------------------------------------------------------- +dataset = task.collect( + 
source_ontology_path="../assets/MI-MatOnto/mi_ontology.xml", + target_ontology_path="../assets/MI-MatOnto/matonto_ontology.xml", + reference_matching_path="../assets/MI-MatOnto/property_matchings.xml", +) + +# --------------------------------------------------------- +# Step 3: Encode properties +# --------------------------------------------------------- +# PropMatchEncoder should produce property dictionaries containing: +# iri, label, domain_text, range_text, inverse_of, inverse_label +# +# These fields are used by PropertyFullTextLLMDataset. +encoder_model = PropMatchEncoder() + +source_onto, target_onto = encoder_model( + source=dataset["source"], + target=dataset["target"], +) +# --------------------------------------------------------- +# Step 4: Prepare property LLM dataset +# --------------------------------------------------------- +llm_dataset = PropertyFullTextLLMDataset( + source_onto=source_onto, + target_onto=target_onto, +) +print("Number of property pairs:", len(llm_dataset)) + +# --------------------------------------------------------- +# Step 5: Create DataLoader +# --------------------------------------------------------- +dataloader = DataLoader( + llm_dataset, + batch_size=128, + shuffle=False, + collate_fn=llm_dataset.collate_fn, +) + +# --------------------------------------------------------- +# Step 6: Initialize LLM model +# --------------------------------------------------------- +model = AutoModelDecoderLLM( + device="cpu", # Use "cpu" if GPU is not available + max_length=300, + max_new_tokens=10, +) + +# --------------------------------------------------------- +# Step 7: Load LLM +# --------------------------------------------------------- +model.load( + path="Qwen/Qwen2.5-0.5B-Instruct" +) + +# --------------------------------------------------------- +# Step 8: Generate LLM predictions +# --------------------------------------------------------- +predictions = [] + +for batch in tqdm(dataloader): + prompts = batch["prompts"] + sequences =
model.generate(prompts) + predictions.extend(sequences) + +print("Number of predictions:", len(predictions)) + + +# --------------------------------------------------------- +# Step 9: Map LLM outputs to yes/no +# --------------------------------------------------------- +label_dict = { + "yes": ["yes", "correct", "true", "positive", "valid"], + "no": ["no", "incorrect", "false", "negative", "invalid"], +} + +mapper = TFIDFLabelMapper( + classifier=LogisticRegression(), + ngram_range=(1, 1), + label_dict=label_dict, +) + +# --------------------------------------------------------- +# Step 10: Post-process LLM predictions +# --------------------------------------------------------- +# llm_postprocessor keeps predicted "yes" pairs as final matchings. +matchings = llm_postprocessor( + predicts=predictions, + mapper=mapper, + dataset=llm_dataset, +) + +# --------------------------------------------------------- +# Step 11: Evaluate property matchings +# --------------------------------------------------------- +evaluation = metrics.evaluation_report( + predicts=matchings, + references=dataset["reference"], +) +print("Property LLM Matching Evaluation Report:") +print(json.dumps(evaluation, indent=4)) + +# --------------------------------------------------------- +# Step 12: Save XML matchings +# --------------------------------------------------------- +xml_str = xmlify.xml_alignment_generator(matchings=matchings) +xml_output_file = "property_llm_matchings.xml" +with open(xml_output_file, "w", encoding="utf-8") as xml_file: + xml_file.write(xml_str) +print(f"Saved property LLM matchings XML to: {xml_output_file}") + +# --------------------------------------------------------- +# Step 13: Save JSON matchings +# --------------------------------------------------------- +json_output_file = "property_llm_matchings.json" + +with open(json_output_file, "w", encoding="utf-8") as json_file: + json.dump(matchings, json_file, indent=4, ensure_ascii=False) + +print(f"Saved 
property LLM matchings JSON to: {json_output_file}") \ No newline at end of file diff --git a/examples/property_alignment/propmatch_rag.py b/examples/property_alignment/propmatch_rag.py new file mode 100644 index 0000000..bb2537e --- /dev/null +++ b/examples/property_alignment/propmatch_rag.py @@ -0,0 +1,96 @@ +import json + +from ontoaligner.ontology import PropertyOMDataset +from ontoaligner.utils import metrics, xmlify +from ontoaligner.aligner import FalconLLMBERTRetrieverRAG +from ontoaligner.encoder import PropertyFullTextRAGEncoder +from ontoaligner.postprocess import rag_hybrid_postprocessor + +# Step 1: Initialize the property ontology matching task +task = PropertyOMDataset() +print("Property Matching Task:", task) + +# Step 2: Collect source ontology, target ontology, and reference property alignments +dataset = task.collect( + source_ontology_path="../assets/MI-MatOnto/mi_ontology.xml", + target_ontology_path="../assets/MI-MatOnto/matonto_ontology.xml", + reference_matching_path="../assets/MI-MatOnto/property_matchings.xml", +) + +# Step 3: Initialize the property RAG encoder +# This encoder should use: +# retrieval_encoder = PropMatchEncoder +# llm_encoder = "PropertyFullTextRAGDataset" +encoder_model = PropertyFullTextRAGEncoder() + +# Step 4: Encode the property ontologies +encoded_ontology = encoder_model( + source=dataset["source"], + target=dataset["target"], + reference=dataset["reference"], +) + +# Step 5: Define model configuration +config = { + "retriever_config": { + "device": "cpu", + "top_k": 5, + "threshold": 0.1, + }, + "llm_config": { + "device": "cpu", + "max_length": 300, + "max_new_tokens": 5, + "huggingface_access_token": "", + "device_map": "auto", + "batch_size": 8, + "answer_set": { + "yes": ["yes", "correct", "true", "positive", "valid"], + "no": ["no", "incorrect", "false", "negative", "invalid"], + }, + }, +} + +# Step 6: Initialize the normal RAG model +model = FalconLLMBERTRetrieverRAG(**config) + +# Step 7: Load small LLM 
and retriever model +model.load( + llm_path="Qwen/Qwen2.5-0.5B-Instruct", + ir_path="all-MiniLM-L6-v2", +) + +# Step 8: Generate property matching predictions +predicts = model.generate(input_data=encoded_ontology) + + +# Step 9: Apply hybrid postprocessing +hybrid_matchings, hybrid_configs = rag_hybrid_postprocessor( + predicts=predicts, + ir_score_threshold=0.4, + llm_confidence_th=0.5, +) + +# Step 10: Evaluate property matchings +evaluation = metrics.evaluation_report( + predicts=hybrid_matchings, + references=dataset["reference"], +) + +print("Property Hybrid Matching Evaluation Report:") +print(json.dumps(evaluation, indent=4)) + +# Step 11: Print hybrid postprocessing configuration +print("Property Hybrid Matching Obtained Configuration:") +print(hybrid_configs) + +# Step 12: Convert final property matchings to XML +xml_str = xmlify.xml_alignment_generator(matchings=hybrid_matchings) + +# Step 13: Save XML output +output_file_path = "property_matchings.xml" + +with open(output_file_path, "w", encoding="utf-8") as xml_file: + xml_file.write(xml_str) + +print(f"Saved property matchings to: {output_file_path}") \ No newline at end of file diff --git a/ontoaligner/aligner/llm/dataset.py b/ontoaligner/aligner/llm/dataset.py index 38d8abb..6a7dbf2 100644 --- a/ontoaligner/aligner/llm/dataset.py +++ b/ontoaligner/aligner/llm/dataset.py @@ -117,3 +117,108 @@ def fill_one_sample(self, input_data: Any) -> str: .replace("{target_children}", target_children) ) return template + +class PropertyLLMDataset(LLMDataset): + prompt = """Determine whether the following two ontology properties represent the same semantic relation. Respond with "yes" or "no" only. 
+### Property 1: +{source} +### Property 2: +{target} +### Your Answer:""" + + def fill_one_sample(self, input_data: Any) -> str: + template = self.prompt + + source = self.preprocess(input_data["source"].get("label", "")) + target = self.preprocess(input_data["target"].get("label", "")) + + template = ( + template.replace("{source}", source) + .replace("{target}", target) + ) + + return template + +class PropertyFullTextLLMDataset(LLMDataset): + prompt = """Determine whether the following two ontology properties represent the same semantic relation. Respond with "yes" or "no" only. +### Property 1: +{source} +**Domain**: {source_domain} +**Range**: {source_range} +**Inverse**: {source_inverse} + +### Property 2: +{target} +**Domain**: {target_domain} +**Range**: {target_range} +**Inverse**: {target_inverse} + +### Your Answer:""" + + def fill_one_sample(self, input_data: Any) -> str: + template = self.prompt + + source = self.preprocess(input_data["source"].get("label", "")) + target = self.preprocess(input_data["target"].get("label", "")) + + source_domain = ( + " ".join(input_data["source"].get("domain_text", [])) + if len(input_data["source"].get("domain_text", [])) > 0 + else "" + ) + + target_domain = ( + " ".join(input_data["target"].get("domain_text", [])) + if len(input_data["target"].get("domain_text", [])) > 0 + else "" + ) + + source_range = ( + " ".join(input_data["source"].get("range_text", [])) + if len(input_data["source"].get("range_text", [])) > 0 + else "" + ) + + target_range = ( + " ".join(input_data["target"].get("range_text", [])) + if len(input_data["target"].get("range_text", [])) > 0 + else "" + ) + + source_inverse = "" + if input_data["source"].get("inverse_of"): + source_inverse = ( + " ".join(input_data["source"].get("inverse_label", [])) + if len(input_data["source"].get("inverse_label", [])) > 0 + else "" + ) + + target_inverse = "" + if input_data["target"].get("inverse_of"): + target_inverse = ( + " 
".join(input_data["target"].get("inverse_label", [])) + if len(input_data["target"].get("inverse_label", [])) > 0 + else "" + ) + + source_domain = self.preprocess(source_domain) + target_domain = self.preprocess(target_domain) + + source_range = self.preprocess(source_range) + target_range = self.preprocess(target_range) + + source_inverse = self.preprocess(source_inverse) + target_inverse = self.preprocess(target_inverse) + + template = ( + template.replace("{source}", source) + .replace("{target}", target) + .replace("{source_domain}", source_domain) + .replace("{target_domain}", target_domain) + .replace("{source_range}", source_range) + .replace("{target_range}", target_range) + .replace("{source_inverse}", source_inverse) + .replace("{target_inverse}", target_inverse) + ) + + return template \ No newline at end of file diff --git a/ontoaligner/aligner/rag/dataset.py b/ontoaligner/aligner/rag/dataset.py index 135a745..d929155 100644 --- a/ontoaligner/aligner/rag/dataset.py +++ b/ontoaligner/aligner/rag/dataset.py @@ -26,7 +26,6 @@ from torch.utils.data import Dataset - class RAGDataset(Dataset): """ A base dataset class for handling real-world entity classification tasks. This class preprocesses data and formats it into @@ -229,3 +228,115 @@ def fill_one_sample(self, input_data: Any) -> str: .replace("{target_children}", target_children) ) return template + +class PropertyRAGDataset(RAGDataset): + """ + A subclass of RAGDataset used for ontology property matching using only property labels. + """ + + prompt = """Classify if two ontology properties represent the same semantic relation or not (answer only yes or no). 
+### First property: +{source} +### Second property: +{target} +### Answer:""" + + def fill_one_sample(self, input_data: Any) -> str: + template = self.prompt + + source = self.preprocess(input_data["source"]["label"]) + target = self.preprocess(input_data["target"]["label"]) + + template = ( + template.replace("{source}", source) + .replace("{target}", target) + ) + + return template + +class PropertyFullTextRAGDataset(RAGDataset): + """ + A subclass of RAGDataset used for ontology property matching using property label, + domain, range, and inverse property. + """ + + prompt = """Classify if two ontology properties represent the same semantic relation or not (answer only yes or no). +### First property: +{source} +Domain: {source_domain} +Range: {source_range} +Inverse: {source_inverse} + +### Second property: +{target} +Domain: {target_domain} +Range: {target_range} +Inverse: {target_inverse} + +### Answer:""" + + def fill_one_sample(self, input_data: Any) -> str: + template = self.prompt + + source = self.preprocess(input_data["source"]["label"]) + target = self.preprocess(input_data["target"]["label"]) + + source_domain = ( + " ".join(input_data["source"]["domain_text"]) + if len(input_data["source"]["domain_text"]) > 0 + else "" + ) + + target_domain = ( + " ".join(input_data["target"]["domain_text"]) + if len(input_data["target"]["domain_text"]) > 0 + else "" + ) + + source_range = ( + " ".join(input_data["source"]["range_text"]) + if len(input_data["source"]["range_text"]) > 0 + else "" + ) + + target_range = ( + " ".join(input_data["target"]["range_text"]) + if len(input_data["target"]["range_text"]) > 0 + else "" + ) + + source_inverse = "" + if input_data["source"]["inverse_of"]: + source_inverse = ( + " ".join(input_data["source"]["inverse_label"]) + if len(input_data["source"]["inverse_label"]) > 0 + else "" + ) + + target_inverse = "" + if input_data["target"]["inverse_of"]: + target_inverse = ( + " ".join(input_data["target"]["inverse_label"]) + if 
len(input_data["target"]["inverse_label"]) > 0 + else "" + ) + + source_domain = self.preprocess(source_domain) + target_domain = self.preprocess(target_domain) + source_range = self.preprocess(source_range) + target_range = self.preprocess(target_range) + source_inverse = self.preprocess(source_inverse) + target_inverse = self.preprocess(target_inverse) + + template = ( + template.replace("{source}", source) + .replace("{target}", target) + .replace("{source_domain}", source_domain) + .replace("{target_domain}", target_domain) + .replace("{source_range}", source_range) + .replace("{target_range}", target_range) + .replace("{source_inverse}", source_inverse) + .replace("{target_inverse}", target_inverse) + ) + + return template \ No newline at end of file diff --git a/ontoaligner/aligner/rag/rag.py b/ontoaligner/aligner/rag/rag.py index 33aea82..78ae2ca 100644 --- a/ontoaligner/aligner/rag/rag.py +++ b/ontoaligner/aligner/rag/rag.py @@ -235,7 +235,7 @@ def __init__(self, retriever = None, llm = None, retriever_config=None, llm_conf self.Retrieval = retriever(**self.kwargs["retriever_config"]) if not llm: try: - self.Retrieval = self.LLM(**self.kwargs["llm_config"]) + self.LLM = self.LLM(**self.kwargs["llm_config"]) except Exception as error: raise ValueError(f"{error}\n LLM model must be provided.") else: diff --git a/ontoaligner/encoder/llm.py b/ontoaligner/encoder/llm.py index 388e02d..60dbaad 100644 --- a/ontoaligner/encoder/llm.py +++ b/ontoaligner/encoder/llm.py @@ -157,3 +157,92 @@ def get_owl_items(self, owl: Dict) -> Any: """ parents = ", ".join([parent["label"] for parent in owl["parents"]]) return {"iri": owl["iri"], "concept": owl["label"], "parents": str(parents)} + +class PropertyLLMEncoder(LLMEncoder): + """ + Encodes OWL/RDF items that represent properties. + + This class inherits from the `LLMEncoder` class and is designed to encode OWL/RDF property items. + The `get_owl_items` method retrieves the IRI, label, and definition of the property. 
+ + Attributes: + items_in_owl (str): Specifies the type of OWL items being encoded, in this case, a Property. + """ + items_in_owl: str = "(Property)" + + def get_owl_items(self, prop: Dict) -> Any: + """ + Extracts the IRI, label, and definition of a property from the given OWL item. + + Parameters: + owl (Dict): A dictionary representing an OWL/RDF property item, expected to contain + 'iri', 'label', and optionally 'definition' keys. + + Returns: + Dict: A dictionary containing the IRI, label, definition, and combined text of the property. + """ + label = prop.get("label", "") + + combined_text = label + + return { + "iri": prop["iri"], + "label": label, + "text": combined_text, + } + +class PropertyFullTextLLMEncoder(LLMEncoder): + """ + Encodes OWL/RDF items that represent properties with domain, range, inverse property, and definition. + + This class inherits from the `LLMEncoder` class and is designed to encode OWL/RDF property items. + The `get_owl_items` method retrieves the IRI, label, definition, domain, range, and inverse property information. + + Attributes: + items_in_owl (str): Specifies the type of OWL items being encoded, in this case, + a Property with Definition, Domain, Range, and Inverse. 
+ """ + items_in_owl: str = "(Property, Domain, Range, Inverse)" + + def get_owl_items(self, prop: Dict) -> Any: + label = prop.get("label", "") + + domain_text = ( + " ".join(prop.get("domain_text", [])) + if len(prop.get("domain_text", [])) > 0 + else "" + ) + + range_text = ( + " ".join(prop.get("range_text", [])) + if len(prop.get("range_text", [])) > 0 + else "" + ) + + inverse_text = "" + if prop.get("inverse_of"): + inverse_text = ( + " ".join(prop.get("inverse_label", [])) + if len(prop.get("inverse_label", [])) > 0 + else "" + ) + + combined_text = label + + if domain_text: + combined_text += " " + domain_text + + if range_text: + combined_text += " " + range_text + + if inverse_text: + combined_text += " inverse: " + inverse_text + + return { + "iri": prop["iri"], + "label": label, + "domain": domain_text, + "range": range_text, + "inverse": inverse_text, + "text": combined_text, + } \ No newline at end of file diff --git a/ontoaligner/encoder/rag.py b/ontoaligner/encoder/rag.py index 4c42c37..7ea7075 100644 --- a/ontoaligner/encoder/rag.py +++ b/ontoaligner/encoder/rag.py @@ -26,7 +26,7 @@ from ..base import BaseEncoder from .lightweight import ConceptLightweightEncoder - +from .property import PropertyEncoder, PropMatchEncoder class RAGEncoder(BaseEncoder): """ @@ -287,3 +287,43 @@ def __str__(self): dict: A dictionary with the encoder name and items in OWL. """ return f"OLaLaEncoder{self.items_in_owl}" + +class PropertyRAGEncoder(RAGEncoder): + """ + Encodes OWL/RDF items representing a Property using retrieval-based and language model encoders. + + This class extends the `RAGEncoder` class and is specialized in encoding OWL/RDF items that consist of + a Property. The retrieval encoder uses the `PropertyEncoder` class to retrieve the necessary property items, + while the language model encoder is set to "PropertyRAGDataset". + + Attributes: + items_in_owl (str): Specifies the type of OWL items being encoded, in this case, a Property. 
+ retrieval_encoder (Any): The retrieval encoder used for fetching OWL/RDF property items, + set to `PropertyEncoder`. + llm_encoder (str): The language model encoder used, set to "PropertyRAGDataset". + """ + items_in_owl: str = "(Property)" + retrieval_encoder: Any = PropertyEncoder + llm_encoder: str = "PropertyRAGDataset" + + +class PropertyRAGEncoder(RAGEncoder): + """ + Encodes OWL/RDF items representing a Property with its Domain, Range, and Inverse property using + retrieval-based and language model encoders. + + This class extends the `RAGEncoder` class and is specialized in encoding OWL/RDF items that consist of + a Property, its Domain, Range, and Inverse property information. The retrieval encoder uses the + `PropMatchEncoder` class to retrieve the necessary property items, while the language model encoder is + set to "PropertyFullTextRAGDataset". + + Attributes: + items_in_owl (str): Specifies the type of OWL items being encoded, in this case, + a Property with Domain, Range, and Inverse property. + retrieval_encoder (Any): The retrieval encoder used for fetching OWL/RDF property items, + set to `PropMatchEncoder`. + llm_encoder (str): The language model encoder used, set to "PropertyFullTextRAGDataset". 
+ """ + items_in_owl: str = "(Property, Domain, Range, Inverse)" + retrieval_encoder: Any = PropMatchEncoder + llm_encoder: str = "PropertyFullTextRAGDataset" \ No newline at end of file From ebd4d250378a8d29e16edc84d187e984f278bf31 Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Sat, 30 May 2026 00:24:55 +0200 Subject: [PATCH 2/3] restructured code --- ontoaligner/aligner/llm/dataset.py | 105 -------- ontoaligner/aligner/llm/models.py | 2 +- ontoaligner/aligner/propmatch/__init__.py | 2 + ontoaligner/aligner/propmatch/llm.py | 285 +++++++++++++++++++++ ontoaligner/aligner/propmatch/rag.py | 298 ++++++++++++++++++++++ ontoaligner/aligner/rag/dataset.py | 112 -------- ontoaligner/aligner/rag/rag.py | 2 +- ontoaligner/encoder/llm.py | 89 ------- ontoaligner/encoder/property.py | 133 +++++++++- ontoaligner/encoder/rag.py | 41 --- 10 files changed, 719 insertions(+), 350 deletions(-) create mode 100644 ontoaligner/aligner/propmatch/llm.py create mode 100644 ontoaligner/aligner/propmatch/rag.py diff --git a/ontoaligner/aligner/llm/dataset.py b/ontoaligner/aligner/llm/dataset.py index 6a7dbf2..38d8abb 100644 --- a/ontoaligner/aligner/llm/dataset.py +++ b/ontoaligner/aligner/llm/dataset.py @@ -117,108 +117,3 @@ def fill_one_sample(self, input_data: Any) -> str: .replace("{target_children}", target_children) ) return template - -class PropertyLLMDataset(LLMDataset): - prompt = """Determine whether the following two ontology properties represent the same semantic relation. Respond with "yes" or "no" only. 
-### Property 1: -{source} -### Property 2: -{target} -### Your Answer:""" - - def fill_one_sample(self, input_data: Any) -> str: - template = self.prompt - - source = self.preprocess(input_data["source"].get("label", "")) - target = self.preprocess(input_data["target"].get("label", "")) - - template = ( - template.replace("{source}", source) - .replace("{target}", target) - ) - - return template - -class PropertyFullTextLLMDataset(LLMDataset): - prompt = """Determine whether the following two ontology properties represent the same semantic relation. Respond with "yes" or "no" only. -### Property 1: -{source} -**Domain**: {source_domain} -**Range**: {source_range} -**Inverse**: {source_inverse} - -### Property 2: -{target} -**Domain**: {target_domain} -**Range**: {target_range} -**Inverse**: {target_inverse} - -### Your Answer:""" - - def fill_one_sample(self, input_data: Any) -> str: - template = self.prompt - - source = self.preprocess(input_data["source"].get("label", "")) - target = self.preprocess(input_data["target"].get("label", "")) - - source_domain = ( - " ".join(input_data["source"].get("domain_text", [])) - if len(input_data["source"].get("domain_text", [])) > 0 - else "" - ) - - target_domain = ( - " ".join(input_data["target"].get("domain_text", [])) - if len(input_data["target"].get("domain_text", [])) > 0 - else "" - ) - - source_range = ( - " ".join(input_data["source"].get("range_text", [])) - if len(input_data["source"].get("range_text", [])) > 0 - else "" - ) - - target_range = ( - " ".join(input_data["target"].get("range_text", [])) - if len(input_data["target"].get("range_text", [])) > 0 - else "" - ) - - source_inverse = "" - if input_data["source"].get("inverse_of"): - source_inverse = ( - " ".join(input_data["source"].get("inverse_label", [])) - if len(input_data["source"].get("inverse_label", [])) > 0 - else "" - ) - - target_inverse = "" - if input_data["target"].get("inverse_of"): - target_inverse = ( - " 
".join(input_data["target"].get("inverse_label", [])) - if len(input_data["target"].get("inverse_label", [])) > 0 - else "" - ) - - source_domain = self.preprocess(source_domain) - target_domain = self.preprocess(target_domain) - - source_range = self.preprocess(source_range) - target_range = self.preprocess(target_range) - - source_inverse = self.preprocess(source_inverse) - target_inverse = self.preprocess(target_inverse) - - template = ( - template.replace("{source}", source) - .replace("{target}", target) - .replace("{source_domain}", source_domain) - .replace("{target_domain}", target_domain) - .replace("{source_range}", source_range) - .replace("{target_range}", target_range) - .replace("{source_inverse}", source_inverse) - .replace("{target_inverse}", target_inverse) - ) - - return template \ No newline at end of file diff --git a/ontoaligner/aligner/llm/models.py b/ontoaligner/aligner/llm/models.py index bde3e3f..61eff58 100644 --- a/ontoaligner/aligner/llm/models.py +++ b/ontoaligner/aligner/llm/models.py @@ -20,7 +20,7 @@ from transformers import (AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration) from .llm import EncoderDecoderLLMArch, DecoderLLMArch, OpenAILLMArch - +from ..propmatch import PropertyLLMDataset, PropertyFullTextLLMDataset class FlanT5LEncoderDecoderLM(EncoderDecoderLLMArch): """ diff --git a/ontoaligner/aligner/propmatch/__init__.py b/ontoaligner/aligner/propmatch/__init__.py index 0d49dab..9822f86 100644 --- a/ontoaligner/aligner/propmatch/__init__.py +++ b/ontoaligner/aligner/propmatch/__init__.py @@ -12,3 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from .propmatch import * # NOQA +from .rag import * +from .llm import * \ No newline at end of file diff --git a/ontoaligner/aligner/propmatch/llm.py b/ontoaligner/aligner/propmatch/llm.py new file mode 100644 index 0000000..f9c7ad1 --- /dev/null +++ b/ontoaligner/aligner/propmatch/llm.py @@ -0,0 +1,285 @@ +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script defines custom dataset classes for property-level language model (LLM) ontology matching tasks. +These datasets preprocess source and target ontology properties and format them into structured prompts for a language model, +with variations on how much property information is included, such as labels only or full metadata with domain, range, and inverse-property context. + +Classes: + - BasePropertyLLMDataset: The base class for creating property-level LLM datasets from source and target ontology properties. + - PropertyLLMDataset: A subclass of BasePropertyLLMDataset that creates prompts using only source and target property labels. + - PropertyFullTextLLMDataset: A subclass of BasePropertyLLMDataset that creates prompts using labels, domain, range, and inverse-property information. +""" + +from typing import Any, Dict, List + +from torch.utils.data import Dataset + + +class BasePropertyLLMDataset(Dataset): + """ + Base dataset class for property-level LLM ontology matching. 
+ + This class creates all possible source-target property pairs from two ontology + property collections. It also provides shared helper methods for text + preprocessing, field extraction, sample formatting, and batch collation. + + Attributes: + prompt: Prompt template used by subclasses. + data: List of source-target property-pair dictionaries. + len: Number of source-target property pairs in the dataset. + """ + + prompt: str = None + + def __init__(self, source_onto: Any, target_onto: Any) -> None: + """ + Initialize the dataset from source and target ontology properties. + + Args: + source_onto: Iterable containing source ontology property dictionaries. + target_onto: Iterable containing target ontology property dictionaries. + """ + self.data = [] + + for source in source_onto: + for target in target_onto: + self.data.append({ + "source": source, + "target": target, + }) + + self.len = len(self.data) + + def preprocess(self, text: Any) -> str: + """ + Normalize text before inserting it into a prompt. + + The method converts the input value to a string, replaces underscores with + spaces, lowercases the text, and converts None values to an empty string. + + Args: + text: Text or value to normalize. + + Returns: + Normalized text as a string. + """ + if text is None: + return "" + + text = str(text) + text = text.replace("_", " ") + text = text.lower() + return text + + def join_text_list(self, value: Any) -> str: + """ + Convert a text value or list of text values into a single string. + + Args: + value: A text value, list of text values, or None. + + Returns: + A single string. Lists are joined with spaces, and None is converted + to an empty string. + """ + if value is None: + return "" + + if isinstance(value, list): + return " ".join(str(v) for v in value) + + return str(value) + + def get_text(self, item: Dict, key: str) -> str: + """ + Extract and normalize a text field from an ontology property item. + + Args: + item: Ontology property dictionary. 
+ key: Dictionary key to extract. + + Returns: + Normalized text for the requested field. + """ + return self.preprocess(self.join_text_list(item.get(key, ""))) + + def __getitem__(self, index: int) -> Dict: + """ + Return one formatted dataset sample. + + Args: + index: Index of the source-target property pair. + + Returns: + Dictionary containing the generated prompt and the corresponding + source and target property IRIs. + """ + sample = self.data[index] + + return { + "prompts": self.fill_one_sample(sample), + "iris": [ + sample["source"]["iri"], + sample["target"]["iri"], + ], + } + + def __len__(self) -> int: + """ + Return the number of source-target property pairs. + + Returns: + Dataset length. + """ + return self.len + + def fill_one_sample(self, input_data: Any) -> str: + """ + Convert one source-target property pair into a prompt. + + Subclasses must override this method to define the specific prompt format. + + Args: + input_data: Dictionary containing source and target property data. + + Raises: + NotImplementedError: This method must be implemented by subclasses. + """ + raise NotImplementedError + + def collate_fn(self, batchs: List[Dict]) -> Dict: + """ + Collate multiple dataset samples into a batch. + + Args: + batchs: List of samples returned by __getitem__. + + Returns: + Dictionary containing batched prompts and source-target IRI pairs. + """ + batchs_clear = { + "prompts": [], + "iris": [], + } + + for batch in batchs: + batchs_clear["prompts"].append(batch["prompts"]) + batchs_clear["iris"].append(batch["iris"]) + + return batchs_clear + + +class PropertyLLMDataset(BasePropertyLLMDataset): + """ + Dataset class for label-only property-level LLM ontology matching. + + This class creates prompts that compare two ontology properties using only + their labels and asks the model whether they represent the same semantic relation. + """ + + prompt = """Determine whether the following two ontology properties represent the same semantic relation. 
Respond with "yes" or "no" only. +### Property 1: +{source} +### Property 2: +{target} +### Your Answer:""" + + def fill_one_sample(self, input_data: Any) -> str: + """ + Build a label-only prompt for one source-target property pair. + + Args: + input_data: Dictionary containing source and target property dictionaries. + + Returns: + Formatted prompt string for the property pair. + """ + source = self.preprocess(input_data["source"].get("label", "")) + target = self.preprocess(input_data["target"].get("label", "")) + + return ( + self.prompt + .replace("{source}", source) + .replace("{target}", target) + ) + + +class PropertyFullTextLLMDataset(BasePropertyLLMDataset): + """ + Dataset class for metadata-rich property-level LLM ontology matching. + + This class creates prompts that compare two ontology properties using their + labels, domain information, range information, and inverse-property labels + when available. + """ + + prompt = """Determine whether the following two ontology properties represent the same semantic relation. Respond with "yes" or "no" only. +### Property 1: +{source} +**Domain**: {source_domain} +**Range**: {source_range} +**Inverse**: {source_inverse} + +### Property 2: +{target} +**Domain**: {target_domain} +**Range**: {target_range} +**Inverse**: {target_inverse} + +### Your Answer:""" + + def fill_one_sample(self, input_data: Any) -> str: + """ + Build a metadata-rich prompt for one source-target property pair. + + Args: + input_data: Dictionary containing source and target property dictionaries, + including optional domain, range, and inverse-property metadata. + + Returns: + Formatted prompt string for the property pair. 
+ """ + source_item = input_data["source"] + target_item = input_data["target"] + + source = self.preprocess(source_item.get("label", "")) + target = self.preprocess(target_item.get("label", "")) + + source_domain = self.get_text(source_item, "domain_text") + target_domain = self.get_text(target_item, "domain_text") + + source_range = self.get_text(source_item, "range_text") + target_range = self.get_text(target_item, "range_text") + + source_inverse = "" + if source_item.get("inverse_of"): + source_inverse = self.get_text(source_item, "inverse_label") + + target_inverse = "" + if target_item.get("inverse_of"): + target_inverse = self.get_text(target_item, "inverse_label") + + return ( + self.prompt + .replace("{source}", source) + .replace("{target}", target) + .replace("{source_domain}", source_domain) + .replace("{target_domain}", target_domain) + .replace("{source_range}", source_range) + .replace("{target_range}", target_range) + .replace("{source_inverse}", source_inverse) + .replace("{target_inverse}", target_inverse) + ) \ No newline at end of file diff --git a/ontoaligner/aligner/propmatch/rag.py b/ontoaligner/aligner/propmatch/rag.py new file mode 100644 index 0000000..cf304f0 --- /dev/null +++ b/ontoaligner/aligner/propmatch/rag.py @@ -0,0 +1,298 @@ +# Copyright 2025 Scientific Knowledge Organization (SciKnowOrg) Research Group. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This script defines custom dataset classes for property-level retrieval-augmented generation (RAG) ontology matching tasks. +These datasets preprocess source and target ontology properties and format them into structured prompts for a language model, +with variations on how much property information is included, such as labels only or full metadata with domain, range, and inverse-property context. + +Classes: + - BasePropertyRAGDataset: The base class for creating property-level RAG datasets from source-target property pairs. + - PropertyRAGDataset: A subclass of BasePropertyRAGDataset that creates prompts using only source and target property labels. + - PropertyFullTextRAGDataset: A subclass of BasePropertyRAGDataset that creates prompts using labels, domain, range, and inverse-property information. +""" + +from typing import Any, Dict, List + +from torch.utils.data import Dataset + +class BasePropertyRAGDataset(Dataset): + """Base dataset for property-level RAG ontology matching. + + This class prepares source-target property pairs and provides shared helper + methods for text normalization, field extraction, batching, and indexing. + Subclasses must implement :meth:`fill_one_sample` to convert each property + pair into a model prompt. + + Attributes: + prompt: Prompt template used by subclasses. + data: List of source-target property-pair dictionaries. + len: Number of property pairs in the dataset. + """ + + prompt: str = None + + def __init__( + self, + data: Any = None, + source_onto: Any = None, + target_onto: Any = None, + ) -> None: + """Initialize the dataset from explicit pairs or ontology collections. + + Args: + data: Optional precomputed list of source-target property pairs. + source_onto: Optional iterable of source ontology properties. + target_onto: Optional iterable of target ontology properties. + + Raises: + ValueError: If neither ``data`` nor both ``source_onto`` and + ``target_onto`` are provided. 
+ """ + if data is not None: + self.data = data + + elif source_onto is not None and target_onto is not None: + self.data = [] + + for source in source_onto: + for target in target_onto: + self.data.append( + { + "source": source, + "target": target, + } + ) + + else: + raise ValueError( + "BasePropertyRAGDataset requires either data=... or source_onto=... and target_onto=..." + ) + + self.len = len(self.data) + + def preprocess(self, text: Any) -> str: + """Normalize text for prompt construction. + + The method converts input values to strings, replaces underscores with + spaces, lowercases the text, and converts ``None`` values to an empty + string. + + Args: + text: Input text or value to normalize. + + Returns: + A normalized string. + """ + if text is None: + return "" + + text = str(text) + text = text.replace("_", " ") + text = text.lower() + return text + + def join_text_list(self, value: Any) -> str: + """Convert a scalar or list-like text field into a string. + + Args: + value: A text value, list of text values, or ``None``. + + Returns: + A single string representation of the value. Lists are joined with + spaces, and ``None`` is converted to an empty string. + """ + if value is None: + return "" + + if isinstance(value, list): + return " ".join(str(v) for v in value) + + return str(value) + + def get_text(self, item: Dict, key: str) -> str: + """Extract and normalize a text field from an ontology item. + + Args: + item: Ontology property dictionary. + key: Field name to extract from the dictionary. + + Returns: + The normalized field value. + """ + return self.preprocess(self.join_text_list(item.get(key, ""))) + + def __getitem__(self, index: int) -> Dict: + """Return a single prompt-ready dataset item. + + Args: + index: Index of the source-target property pair. + + Returns: + A dictionary containing the generated prompt and the corresponding + source and target IRIs. 
+ """ + sample = self.data[index] + + return { + "prompts": self.fill_one_sample(sample), + "iris": [ + sample["source"]["iri"], + sample["target"]["iri"], + ], + } + + def __len__(self) -> int: + """Return the number of property pairs in the dataset. + + Returns: + Dataset length. + """ + return self.len + + def fill_one_sample(self, input_data: Any) -> str: + """Convert a source-target property pair into a prompt. + + Subclasses must override this method with a concrete prompt generation + strategy. + + Args: + input_data: Source-target property-pair dictionary. + + Raises: + NotImplementedError: Always raised by the base implementation. + """ + raise NotImplementedError + + def collate_fn(self, batchs: List[Dict]) -> Dict: + """Collate dataset samples into a batch dictionary. + + Args: + batchs: List of dataset samples returned by :meth:`__getitem__`. + + Returns: + A dictionary containing batched prompts and source-target IRI pairs. + """ + batchs_clear = { + "prompts": [], + "iris": [], + } + + for batch in batchs: + batchs_clear["prompts"].append(batch["prompts"]) + batchs_clear["iris"].append(batch["iris"]) + + return batchs_clear + + +class PropertyRAGDataset(BasePropertyRAGDataset): + """Prompt dataset using only source and target property labels. + + This dataset creates binary yes/no prompts that ask whether two ontology + properties represent the same semantic relation using only their labels. + """ + + prompt = """Determine whether the following two ontology properties represent the same semantic relation. Respond with "yes" or "no" only. + +### Property 1: +{source} + +### Property 2: +{target} + +### Your Answer:""" + + def fill_one_sample(self, input_data: Any) -> str: + """Build a label-only matching prompt for one property pair. + + Args: + input_data: Dictionary containing ``source`` and ``target`` property + dictionaries. + + Returns: + A formatted prompt string for the property pair. 
+ """ + source = self.preprocess(input_data["source"].get("label", "")) + target = self.preprocess(input_data["target"].get("label", "")) + + return ( + self.prompt + .replace("{source}", source) + .replace("{target}", target) + ) + + +class PropertyFullTextRAGDataset(BasePropertyRAGDataset): + """Prompt dataset using labels and additional property metadata. + + This dataset creates binary yes/no prompts using property labels together + with domain, range, and inverse-property information when available. + """ + + prompt = """Determine whether the following two ontology properties represent the same semantic relation. Respond with "yes" or "no" only. + +### Property 1: +{source} +**Domain**: {source_domain} +**Range**: {source_range} +**Inverse**: {source_inverse} + +### Property 2: +{target} +**Domain**: {target_domain} +**Range**: {target_range} +**Inverse**: {target_inverse} + +### Your Answer:""" + + def fill_one_sample(self, input_data: Any) -> str: + """Build a metadata-rich matching prompt for one property pair. + + Args: + input_data: Dictionary containing ``source`` and ``target`` property + dictionaries with optional domain, range, and inverse metadata. + + Returns: + A formatted prompt string for the property pair. 
+ """ + source_item = input_data["source"] + target_item = input_data["target"] + + source = self.preprocess(source_item.get("label", "")) + target = self.preprocess(target_item.get("label", "")) + + source_domain = self.get_text(source_item, "domain_text") + target_domain = self.get_text(target_item, "domain_text") + + source_range = self.get_text(source_item, "range_text") + target_range = self.get_text(target_item, "range_text") + + source_inverse = "" + if source_item.get("inverse_of"): + source_inverse = self.get_text(source_item, "inverse_label") + + target_inverse = "" + if target_item.get("inverse_of"): + target_inverse = self.get_text(target_item, "inverse_label") + + return ( + self.prompt + .replace("{source}", source) + .replace("{target}", target) + .replace("{source_domain}", source_domain) + .replace("{target_domain}", target_domain) + .replace("{source_range}", source_range) + .replace("{target_range}", target_range) + .replace("{source_inverse}", source_inverse) + .replace("{target_inverse}", target_inverse) + ) \ No newline at end of file diff --git a/ontoaligner/aligner/rag/dataset.py b/ontoaligner/aligner/rag/dataset.py index d929155..788377a 100644 --- a/ontoaligner/aligner/rag/dataset.py +++ b/ontoaligner/aligner/rag/dataset.py @@ -228,115 +228,3 @@ def fill_one_sample(self, input_data: Any) -> str: .replace("{target_children}", target_children) ) return template - -class PropertyRAGDataset(RAGDataset): - """ - A subclass of RAGDataset used for ontology property matching using only property labels. - """ - - prompt = """Classify if two ontology properties represent the same semantic relation or not (answer only yes or no). 
-### First property: -{source} -### Second property: -{target} -### Answer:""" - - def fill_one_sample(self, input_data: Any) -> str: - template = self.prompt - - source = self.preprocess(input_data["source"]["label"]) - target = self.preprocess(input_data["target"]["label"]) - - template = ( - template.replace("{source}", source) - .replace("{target}", target) - ) - - return template - -class PropertyFullTextRAGDataset(RAGDataset): - """ - A subclass of RAGDataset used for ontology property matching using property label, - domain, range, and inverse property. - """ - - prompt = """Classify if two ontology properties represent the same semantic relation or not (answer only yes or no). -### First property: -{source} -Domain: {source_domain} -Range: {source_range} -Inverse: {source_inverse} - -### Second property: -{target} -Domain: {target_domain} -Range: {target_range} -Inverse: {target_inverse} - -### Answer:""" - - def fill_one_sample(self, input_data: Any) -> str: - template = self.prompt - - source = self.preprocess(input_data["source"]["label"]) - target = self.preprocess(input_data["target"]["label"]) - - source_domain = ( - " ".join(input_data["source"]["domain_text"]) - if len(input_data["source"]["domain_text"]) > 0 - else "" - ) - - target_domain = ( - " ".join(input_data["target"]["domain_text"]) - if len(input_data["target"]["domain_text"]) > 0 - else "" - ) - - source_range = ( - " ".join(input_data["source"]["range_text"]) - if len(input_data["source"]["range_text"]) > 0 - else "" - ) - - target_range = ( - " ".join(input_data["target"]["range_text"]) - if len(input_data["target"]["range_text"]) > 0 - else "" - ) - - source_inverse = "" - if input_data["source"]["inverse_of"]: - source_inverse = ( - " ".join(input_data["source"]["inverse_label"]) - if len(input_data["source"]["inverse_label"]) > 0 - else "" - ) - - target_inverse = "" - if input_data["target"]["inverse_of"]: - target_inverse = ( - " ".join(input_data["target"]["inverse_label"]) - if 
len(input_data["target"]["inverse_label"]) > 0 - else "" - ) - - source_domain = self.preprocess(source_domain) - target_domain = self.preprocess(target_domain) - source_range = self.preprocess(source_range) - target_range = self.preprocess(target_range) - source_inverse = self.preprocess(source_inverse) - target_inverse = self.preprocess(target_inverse) - - template = ( - template.replace("{source}", source) - .replace("{target}", target) - .replace("{source_domain}", source_domain) - .replace("{target_domain}", target_domain) - .replace("{source_range}", source_range) - .replace("{target_range}", target_range) - .replace("{source_inverse}", source_inverse) - .replace("{target_inverse}", target_inverse) - ) - - return template \ No newline at end of file diff --git a/ontoaligner/aligner/rag/rag.py b/ontoaligner/aligner/rag/rag.py index 78ae2ca..bfc79c8 100644 --- a/ontoaligner/aligner/rag/rag.py +++ b/ontoaligner/aligner/rag/rag.py @@ -40,7 +40,7 @@ from ..llm import DecoderLLMArch, OpenAILLMArch from .dataset import * # NOQA from ...postprocess import process - +from ..propmatch import PropertyRAGDataset, PropertyFullTextRAGDataset class RAGBasedDecoderLLMArch(DecoderLLMArch): """ diff --git a/ontoaligner/encoder/llm.py b/ontoaligner/encoder/llm.py index 60dbaad..388e02d 100644 --- a/ontoaligner/encoder/llm.py +++ b/ontoaligner/encoder/llm.py @@ -157,92 +157,3 @@ def get_owl_items(self, owl: Dict) -> Any: """ parents = ", ".join([parent["label"] for parent in owl["parents"]]) return {"iri": owl["iri"], "concept": owl["label"], "parents": str(parents)} - -class PropertyLLMEncoder(LLMEncoder): - """ - Encodes OWL/RDF items that represent properties. - - This class inherits from the `LLMEncoder` class and is designed to encode OWL/RDF property items. - The `get_owl_items` method retrieves the IRI, label, and definition of the property. - - Attributes: - items_in_owl (str): Specifies the type of OWL items being encoded, in this case, a Property. 
- """ - items_in_owl: str = "(Property)" - - def get_owl_items(self, prop: Dict) -> Any: - """ - Extracts the IRI, label, and definition of a property from the given OWL item. - - Parameters: - owl (Dict): A dictionary representing an OWL/RDF property item, expected to contain - 'iri', 'label', and optionally 'definition' keys. - - Returns: - Dict: A dictionary containing the IRI, label, definition, and combined text of the property. - """ - label = prop.get("label", "") - - combined_text = label - - return { - "iri": prop["iri"], - "label": label, - "text": combined_text, - } - -class PropertyFullTextLLMEncoder(LLMEncoder): - """ - Encodes OWL/RDF items that represent properties with domain, range, inverse property, and definition. - - This class inherits from the `LLMEncoder` class and is designed to encode OWL/RDF property items. - The `get_owl_items` method retrieves the IRI, label, definition, domain, range, and inverse property information. - - Attributes: - items_in_owl (str): Specifies the type of OWL items being encoded, in this case, - a Property with Definition, Domain, Range, and Inverse. 
- """ - items_in_owl: str = "(Property, Domain, Range, Inverse)" - - def get_owl_items(self, prop: Dict) -> Any: - label = prop.get("label", "") - - domain_text = ( - " ".join(prop.get("domain_text", [])) - if len(prop.get("domain_text", [])) > 0 - else "" - ) - - range_text = ( - " ".join(prop.get("range_text", [])) - if len(prop.get("range_text", [])) > 0 - else "" - ) - - inverse_text = "" - if prop.get("inverse_of"): - inverse_text = ( - " ".join(prop.get("inverse_label", [])) - if len(prop.get("inverse_label", [])) > 0 - else "" - ) - - combined_text = label - - if domain_text: - combined_text += " " + domain_text - - if range_text: - combined_text += " " + range_text - - if inverse_text: - combined_text += " inverse: " + inverse_text - - return { - "iri": prop["iri"], - "label": label, - "domain": domain_text, - "range": range_text, - "inverse": inverse_text, - "text": combined_text, - } \ No newline at end of file diff --git a/ontoaligner/encoder/property.py b/ontoaligner/encoder/property.py index 6cac3e8..32935a7 100644 --- a/ontoaligner/encoder/property.py +++ b/ontoaligner/encoder/property.py @@ -21,7 +21,8 @@ from typing import Any, Dict from ..base import BaseEncoder - +from .llm import LLMEncoder +from .rag import RAGEncoder class PropertyEncoder(BaseEncoder): """ @@ -155,3 +156,133 @@ def get_encoder_info(self) -> str: def __str__(self): """Returns a string representation of the encoder.""" return {"PropMatchEncoder": self.items_in_owl} + +class PropertyRAGEncoder(RAGEncoder): + """ + Encodes OWL/RDF items representing a Property using retrieval-based and language model encoders. + + This class extends the `RAGEncoder` class and is specialized in encoding OWL/RDF items that consist of + a Property. The retrieval encoder uses the `PropertyEncoder` class to retrieve the necessary property items, + while the language model encoder is set to "PropertyRAGDataset". 
+ + Attributes: + items_in_owl (str): Specifies the type of OWL items being encoded, in this case, a Property. + retrieval_encoder (Any): The retrieval encoder used for fetching OWL/RDF property items, + set to `PropertyEncoder`. + llm_encoder (str): The language model encoder used, set to "PropertyRAGDataset". + """ + items_in_owl: str = "(Property)" + retrieval_encoder: Any = PropertyEncoder + llm_encoder: str = "PropertyRAGDataset" + + +class PropertyFullTextRAGEncoder(RAGEncoder): + """ + Encodes OWL/RDF items representing a Property with its Domain, Range, and Inverse property using + retrieval-based and language model encoders. + + This class extends the `RAGEncoder` class and is specialized in encoding OWL/RDF items that consist of + a Property, its Domain, Range, and Inverse property information. The retrieval encoder uses the + `PropMatchEncoder` class to retrieve the necessary property items, while the language model encoder is + set to "PropertyFullTextRAGDataset". + + Attributes: + items_in_owl (str): Specifies the type of OWL items being encoded, in this case, + a Property with Domain, Range, and Inverse property. + retrieval_encoder (Any): The retrieval encoder used for fetching OWL/RDF property items, + set to `PropMatchEncoder`. + llm_encoder (str): The language model encoder used, set to "PropertyFullTextRAGDataset". + """ + items_in_owl: str = "(Property, Domain, Range, Inverse)" + retrieval_encoder: Any = PropMatchEncoder + llm_encoder: str = "PropertyFullTextRAGDataset" + + +class PropertyLLMEncoder(LLMEncoder): + """ + Encodes OWL/RDF items that represent properties. + + This class inherits from the `LLMEncoder` class and is designed to encode OWL/RDF property items. + The `get_owl_items` method retrieves the IRI, label, and definition of the property. + + Attributes: + items_in_owl (str): Specifies the type of OWL items being encoded, in this case, a Property. 
+ """ + items_in_owl: str = "(Property)" + + def get_owl_items(self, prop: Dict) -> Any: + """ + Extracts the IRI, label, and definition of a property from the given OWL item. + + Parameters: + owl (Dict): A dictionary representing an OWL/RDF property item, expected to contain + 'iri', 'label', and optionally 'definition' keys. + + Returns: + Dict: A dictionary containing the IRI, label, definition, and combined text of the property. + """ + label = prop.get("label", "") + + combined_text = label + + return { + "iri": prop["iri"], + "label": label, + "text": combined_text, + } + +class PropertyFullTextLLMEncoder(LLMEncoder): + """ + Encodes OWL/RDF items that represent properties with domain, range, inverse property, and definition. + + This class inherits from the `LLMEncoder` class and is designed to encode OWL/RDF property items. + The `get_owl_items` method retrieves the IRI, label, definition, domain, range, and inverse property information. + + Attributes: + items_in_owl (str): Specifies the type of OWL items being encoded, in this case, + a Property with Definition, Domain, Range, and Inverse. 
+ """ + items_in_owl: str = "(Property, Domain, Range, Inverse)" + + def get_owl_items(self, prop: Dict) -> Any: + label = prop.get("label", "") + + domain_text = ( + " ".join(prop.get("domain_text", [])) + if len(prop.get("domain_text", [])) > 0 + else "" + ) + + range_text = ( + " ".join(prop.get("range_text", [])) + if len(prop.get("range_text", [])) > 0 + else "" + ) + + inverse_text = "" + if prop.get("inverse_of"): + inverse_text = ( + " ".join(prop.get("inverse_label", [])) + if len(prop.get("inverse_label", [])) > 0 + else "" + ) + + combined_text = label + + if domain_text: + combined_text += " " + domain_text + + if range_text: + combined_text += " " + range_text + + if inverse_text: + combined_text += " inverse: " + inverse_text + + return { + "iri": prop["iri"], + "label": label, + "domain": domain_text, + "range": range_text, + "inverse": inverse_text, + "text": combined_text, + } \ No newline at end of file diff --git a/ontoaligner/encoder/rag.py b/ontoaligner/encoder/rag.py index 7ea7075..899b776 100644 --- a/ontoaligner/encoder/rag.py +++ b/ontoaligner/encoder/rag.py @@ -26,7 +26,6 @@ from ..base import BaseEncoder from .lightweight import ConceptLightweightEncoder -from .property import PropertyEncoder, PropMatchEncoder class RAGEncoder(BaseEncoder): """ @@ -287,43 +286,3 @@ def __str__(self): dict: A dictionary with the encoder name and items in OWL. """ return f"OLaLaEncoder{self.items_in_owl}" - -class PropertyRAGEncoder(RAGEncoder): - """ - Encodes OWL/RDF items representing a Property using retrieval-based and language model encoders. - - This class extends the `RAGEncoder` class and is specialized in encoding OWL/RDF items that consist of - a Property. The retrieval encoder uses the `PropertyEncoder` class to retrieve the necessary property items, - while the language model encoder is set to "PropertyRAGDataset". - - Attributes: - items_in_owl (str): Specifies the type of OWL items being encoded, in this case, a Property. 
- retrieval_encoder (Any): The retrieval encoder used for fetching OWL/RDF property items, - set to `PropertyEncoder`. - llm_encoder (str): The language model encoder used, set to "PropertyRAGDataset". - """ - items_in_owl: str = "(Property)" - retrieval_encoder: Any = PropertyEncoder - llm_encoder: str = "PropertyRAGDataset" - - -class PropertyRAGEncoder(RAGEncoder): - """ - Encodes OWL/RDF items representing a Property with its Domain, Range, and Inverse property using - retrieval-based and language model encoders. - - This class extends the `RAGEncoder` class and is specialized in encoding OWL/RDF items that consist of - a Property, its Domain, Range, and Inverse property information. The retrieval encoder uses the - `PropMatchEncoder` class to retrieve the necessary property items, while the language model encoder is - set to "PropertyFullTextRAGDataset". - - Attributes: - items_in_owl (str): Specifies the type of OWL items being encoded, in this case, - a Property with Domain, Range, and Inverse property. - retrieval_encoder (Any): The retrieval encoder used for fetching OWL/RDF property items, - set to `PropMatchEncoder`. - llm_encoder (str): The language model encoder used, set to "PropertyFullTextRAGDataset". 
- """ - items_in_owl: str = "(Property, Domain, Range, Inverse)" - retrieval_encoder: Any = PropMatchEncoder - llm_encoder: str = "PropertyFullTextRAGDataset" \ No newline at end of file From 6d7e5f20c32f9c3e021781d51b7f2658ed744bfc Mon Sep 17 00:00:00 2001 From: KrishnaRani Date: Sat, 30 May 2026 00:28:59 +0200 Subject: [PATCH 3/3] propMatch Aligner extension --- ontoaligner/aligner/rag/dataset.py | 1 + ontoaligner/encoder/rag.py | 1 + 2 files changed, 2 insertions(+) diff --git a/ontoaligner/aligner/rag/dataset.py b/ontoaligner/aligner/rag/dataset.py index 788377a..135a745 100644 --- a/ontoaligner/aligner/rag/dataset.py +++ b/ontoaligner/aligner/rag/dataset.py @@ -26,6 +26,7 @@ from torch.utils.data import Dataset + class RAGDataset(Dataset): """ A base dataset class for handling real-world entity classification tasks. This class preprocesses data and formats it into diff --git a/ontoaligner/encoder/rag.py b/ontoaligner/encoder/rag.py index 899b776..4c42c37 100644 --- a/ontoaligner/encoder/rag.py +++ b/ontoaligner/encoder/rag.py @@ -27,6 +27,7 @@ from ..base import BaseEncoder from .lightweight import ConceptLightweightEncoder + class RAGEncoder(BaseEncoder): """ A retrieval-augmented generation (RAG) encoder for ontology mapping.