Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions examples/property_alignment/propmatch_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import json
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

from ontoaligner.ontology import PropertyOMDataset
from ontoaligner.encoder import PropMatchEncoder
from ontoaligner.aligner import AutoModelDecoderLLM
from ontoaligner.aligner import PropertyFullTextLLMDataset

from ontoaligner.postprocess import TFIDFLabelMapper, llm_postprocessor
from ontoaligner.utils import metrics, xmlify

# =========================================================
# Property alignment with a decoder-only LLM.
# Pipeline: collect -> encode -> build prompts -> generate ->
# map answers to yes/no -> post-process -> evaluate -> export.
# =========================================================

# === 1. Create the property ontology matching task ===
om_task = PropertyOMDataset()
print("Property Matching Task:", om_task)

# === 2. Read the source/target ontologies and the reference alignment ===
# Paths are relative to the directory the script is launched from.
ontology_files = {
    "source_ontology_path": "../assets/MI-MatOnto/mi_ontology.xml",
    "target_ontology_path": "../assets/MI-MatOnto/matonto_ontology.xml",
    "reference_matching_path": "../assets/MI-MatOnto/property_matchings.xml",
}
om_data = om_task.collect(**ontology_files)

# === 3. Encode the properties of both ontologies ===
# PropMatchEncoder is expected to emit one dictionary per property with the
# fields iri, label, domain, range and inverse; PropertyFullTextLLMDataset
# consumes those fields in the next step.
property_encoder = PropMatchEncoder()
encoded_source, encoded_target = property_encoder(
    source=om_data["source"],
    target=om_data["target"],
)

# === 4. Build the LLM dataset of property pairs ===
pair_dataset = PropertyFullTextLLMDataset(
    source_onto=encoded_source,
    target_onto=encoded_target,
)
print("Number of property pairs:", len(pair_dataset))

# === 5. Batch the pairs ===
# shuffle=False keeps the iteration order fixed; the dataset supplies its own
# collate function.
pair_loader = DataLoader(
    pair_dataset,
    batch_size=128,
    shuffle=False,
    collate_fn=pair_dataset.collate_fn,
)

# === 6. Configure the LLM wrapper ===
llm = AutoModelDecoderLLM(
    device="cpu",  # "cpu" is the setting to use when no GPU is available
    max_length=300,
    max_new_tokens=10,
)

# === 7. Load the model weights ===
llm.load(path="Qwen/Qwen2.5-0.5B-Instruct")

# === 8. Run generation over every batch and flatten the outputs ===
llm_outputs = [
    generated
    for batch in tqdm(pair_loader)
    for generated in llm.generate(batch["prompts"])
]
print("Number of predictions:", len(llm_outputs))

# === 9. Build the mapper that turns free-text answers into yes/no ===
answer_vocabulary = {
    "yes": ["yes", "correct", "true", "positive", "valid"],
    "no": ["no", "incorrect", "false", "negative", "invalid"],
}
yes_no_mapper = TFIDFLabelMapper(
    classifier=LogisticRegression(),
    ngram_range=(1, 1),
    label_dict=answer_vocabulary,
)

# === 10. Post-process the raw LLM answers ===
# llm_postprocessor keeps the pairs predicted as "yes" as the final matchings.
final_matchings = llm_postprocessor(
    predicts=llm_outputs,
    mapper=yes_no_mapper,
    dataset=pair_dataset,
)

# === 11. Score the matchings against the reference alignment ===
report = metrics.evaluation_report(
    predicts=final_matchings,
    references=om_data["reference"],
)
print("Property LLM Matching Evaluation Report:")
print(json.dumps(report, indent=4))

# === 12. Export the matchings as alignment XML ===
xml_path = "property_llm_matchings.xml"
alignment_xml = xmlify.xml_alignment_generator(matchings=final_matchings)
with open(xml_path, "w", encoding="utf-8") as xml_handle:
    xml_handle.write(alignment_xml)
print(f"Saved property LLM matchings XML to: {xml_path}")

# === 13. Export the matchings as JSON ===
json_path = "property_llm_matchings.json"
with open(json_path, "w", encoding="utf-8") as json_handle:
    json.dump(final_matchings, json_handle, indent=4, ensure_ascii=False)
print(f"Saved property LLM matchings JSON to: {json_path}")
96 changes: 96 additions & 0 deletions examples/property_alignment/propmatch_rag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json

from ontoaligner.ontology import PropertyOMDataset
from ontoaligner.utils import metrics, xmlify
from ontoaligner.aligner import FalconLLMBERTRetrieverRAG
from ontoaligner.encoder import PropertyFullTextRAGEncoder
from ontoaligner.postprocess import rag_hybrid_postprocessor

# --- 1. Create the property ontology matching task ---
om_task = PropertyOMDataset()
print("Property Matching Task:", om_task)

# --- 2. Read both ontologies and the reference property alignments ---
# Paths are relative to the directory the script is launched from.
om_data = om_task.collect(
    source_ontology_path="../assets/MI-MatOnto/mi_ontology.xml",
    target_ontology_path="../assets/MI-MatOnto/matonto_ontology.xml",
    reference_matching_path="../assets/MI-MatOnto/property_matchings.xml",
)

# --- 3. Create the property RAG encoder ---
# The encoder is expected to be configured with:
#   retrieval_encoder = PropMatchEncoder
#   llm_encoder = "PropertyFullTextRAGDataset"
rag_encoder = PropertyFullTextRAGEncoder()

# --- 4. Encode source, target and reference for the RAG model ---
rag_inputs = rag_encoder(
    source=om_data["source"],
    target=om_data["target"],
    reference=om_data["reference"],
)

# --- 5. Model configuration, one dictionary per component ---
retriever_settings = {
    "device": "cpu",
    "top_k": 5,
    "threshold": 0.1,
}
llm_settings = {
    "device": "cpu",
    "max_length": 300,
    "max_new_tokens": 5,
    "huggingface_access_token": "",
    "device_map": "auto",
    "batch_size": 8,
    # Words accepted as a positive / negative answer from the LLM.
    "answer_set": {
        "yes": ["yes", "correct", "true", "positive", "valid"],
        "no": ["no", "incorrect", "false", "negative", "invalid"],
    },
}

# --- 6. Build the (non-property-specific) RAG model ---
rag_model = FalconLLMBERTRetrieverRAG(
    retriever_config=retriever_settings,
    llm_config=llm_settings,
)

# --- 7. Load a small LLM together with the retriever model ---
rag_model.load(
    llm_path="Qwen/Qwen2.5-0.5B-Instruct",
    ir_path="all-MiniLM-L6-v2",
)

# --- 8. Produce property matching predictions ---
raw_predictions = rag_model.generate(input_data=rag_inputs)

# --- 9. Hybrid post-processing of retriever and LLM outputs ---
hybrid_matchings, hybrid_settings = rag_hybrid_postprocessor(
    predicts=raw_predictions,
    ir_score_threshold=0.4,
    llm_confidence_th=0.5,
)

# --- 10. Score the matchings against the reference alignment ---
report = metrics.evaluation_report(
    predicts=hybrid_matchings,
    references=om_data["reference"],
)
print("Property Hybrid Matching Evaluation Report:")
print(json.dumps(report, indent=4))

# --- 11. Show the configuration returned by the hybrid post-processor ---
print("Property Hybrid Matching Obtained Configuration:")
print(hybrid_settings)

# --- 12/13. Serialise the final matchings to XML and write them to disk ---
xml_path = "property_matchings.xml"
alignment_xml = xmlify.xml_alignment_generator(matchings=hybrid_matchings)
with open(xml_path, "w", encoding="utf-8") as xml_handle:
    xml_handle.write(alignment_xml)

print(f"Saved property matchings to: {xml_path}")
2 changes: 1 addition & 1 deletion ontoaligner/aligner/llm/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from transformers import (AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration)

from .llm import EncoderDecoderLLMArch, DecoderLLMArch, OpenAILLMArch

from ..propmatch import PropertyLLMDataset, PropertyFullTextLLMDataset

class FlanT5LEncoderDecoderLM(EncoderDecoderLLMArch):
"""
Expand Down
2 changes: 2 additions & 0 deletions ontoaligner/aligner/propmatch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .propmatch import * # NOQA
from .rag import *  # NOQA
from .llm import *  # NOQA
Loading