From 971575286f51cb514b73f97fb44ae278ef3e6397 Mon Sep 17 00:00:00 2001 From: Roque Lopez Date: Mon, 13 Jan 2025 17:26:53 -0500 Subject: [PATCH] feat: Add Magneto --- bdikit/download.py | 1 + .../one2one/matcher_factory.py | 19 +++ bdikit/schema_matching/topk/magneto.py | 132 ++++++++++++++++++ .../schema_matching/topk/matcher_factory.py | 20 +++ requirements.txt | 3 +- 5 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 bdikit/schema_matching/topk/magneto.py diff --git a/bdikit/download.py b/bdikit/download.py index 9bcc5d8..d677b9d 100644 --- a/bdikit/download.py +++ b/bdikit/download.py @@ -15,6 +15,7 @@ BUILTIN_MODELS_BOX_URL = { "cl-reducer-v0.1": "https://nyu.box.com/shared/static/hc4qxzbuxz0uoynfwy4pe2yxo5ch6xgm.pt", "bdi-cl-v0.2": "https://nyu.box.com/shared/static/1vdc28kzbpoj6ey95bksaww541p9gj31.pt", + "magneto-gdc-v0.1": "https://nyu.box.com/shared/static/140g2rq1izc1wqs1ssrml6jzag3qa0mu.pth", } BDIKIT_EMBEDDINGS_CACHE_DIR = os.path.join(BDIKIT_CACHE_DIR, "embeddings") diff --git a/bdikit/schema_matching/one2one/matcher_factory.py b/bdikit/schema_matching/one2one/matcher_factory.py index 30e44d6..df29853 100644 --- a/bdikit/schema_matching/one2one/matcher_factory.py +++ b/bdikit/schema_matching/one2one/matcher_factory.py @@ -38,6 +38,25 @@ class SchemaMatchers(Enum): "max_val_sim", "bdikit.schema_matching.one2one.maxvalsim.MaxValSimSchemaMatcher", ) + MAGNETO = ( + "magneto", + "bdikit.schema_matching.topk.magneto.Magneto", + ) + + MAGNETO_FT = ( + "magneto_ft", + "bdikit.schema_matching.topk.magneto.MagnetoFT", + ) + + MAGNETO_GPT = ( + "magneto_gpt", + "bdikit.schema_matching.topk.magneto.MagnetoGPT", + ) + + MAGNETO_FTGPT = ( + "magneto_ftgpt", + "bdikit.schema_matching.topk.magneto.MagnetoFTGPT", + ) def __init__(self, matcher_name: str, matcher_path: str): self.matcher_name = matcher_name diff --git a/bdikit/schema_matching/topk/magneto.py b/bdikit/schema_matching/topk/magneto.py new file mode 100644 index 0000000..8b59284 --- 
/dev/null
+++ b/bdikit/schema_matching/topk/magneto.py
@@ -0,0 +1,169 @@
+import pandas as pd
+from typing import Any, Dict, List, Optional, Tuple
+from magneto import Magneto as Magneto_Lib
+from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
+from bdikit.download import get_cached_model_or_download
+from bdikit.schema_matching.topk.base import (
+    ColumnScore,
+    TopkMatching,
+    BaseTopkSchemaMatcher,
+)
+
+DEFAULT_MAGNETO_MODEL = "magneto-gdc-v0.1"
+
+
+class MagnetoBase(BaseSchemaMatcher, BaseTopkSchemaMatcher):
+    """Adapter exposing the Magneto library through both matcher interfaces.
+
+    ``map`` yields the single best target column per source column, while
+    ``get_recommendations`` yields a ranked candidate list per source column.
+    """
+
+    def __init__(self, kwargs: Optional[Dict[str, Any]] = None):
+        """Create the wrapped Magneto instance.
+
+        Args:
+            kwargs: Keyword arguments forwarded to the Magneto constructor.
+                ``None`` (the default) forwards no arguments.
+        """
+        self.magneto = Magneto_Lib(**(kwargs or {}))
+
+    def _ranked_matches(
+        self, source: pd.DataFrame, target: pd.DataFrame, top_k: int
+    ) -> Dict[str, List[Tuple[str, float]]]:
+        """Run Magneto and group its matches by source column.
+
+        Returns:
+            A dict mapping each matched source column to its
+            ``(target_column, score)`` pairs, highest score first.
+        """
+        # Magneto does not provide a method to set topk
+        self.magneto.params["topk"] = top_k
+        raw_matches = self.magneto.get_matches(source, target)
+
+        # Each key is a (source, target) pair whose members carry the column
+        # name at index 1; distinct names avoid shadowing the DataFrame args.
+        grouped: Dict[str, List[Tuple[str, float]]] = {}
+        for (source_key, target_key), score in raw_matches.items():
+            grouped.setdefault(source_key[1], []).append((target_key[1], score))
+
+        for candidates in grouped.values():
+            candidates.sort(key=lambda pair: pair[1], reverse=True)
+        return grouped
+
+    def map(
+        self,
+        source: pd.DataFrame,
+        target: pd.DataFrame,
+    ) -> Dict[str, str]:
+        """Return the best-scoring target column for each source column."""
+        # There is an issue in Magneto to get the top-1 match, so get top 2
+        # and then keep only the best candidate.
+        ranked = self._ranked_matches(source, target, top_k=2)
+        return {column: candidates[0][0] for column, candidates in ranked.items()}
+
+    def get_recommendations(
+        self, source: pd.DataFrame, target: pd.DataFrame, top_k: int
+    ) -> List[TopkMatching]:
+        """Return up to ``top_k`` ranked target columns per source column."""
+        ranked = self._ranked_matches(source, target, top_k)
+
+        top_k_results = []
+        for column, candidates in ranked.items():
+            # Cap locally rather than trusting Magneto to honor the limit.
+            top_k_columns = [
+                ColumnScore(name, score) for name, score in candidates[:top_k]
+            ]
+            top_k_results.append(
+                {
+                    "source_column": [column] * len(top_k_columns),
+                    "top_k_columns": top_k_columns,
+                }
+            )
+
+        return top_k_results
+
+
+class Magneto(MagnetoBase):
+    """Magneto with the library's default configuration."""
+
+    def __init__(self):
+        super().__init__()
+
+
+class MagnetoFT(MagnetoBase):
+    """Magneto configured with an explicit embedding model."""
+
+    def __init__(
+        self,
+        encoding_mode: str = "header_values_verbose",
+        model_name: Optional[str] = None,
+        model_path: Optional[str] = None,
+    ):
+        """See ``_resolve_embedding_model`` for how the model is selected."""
+        embedding_model = _resolve_embedding_model(model_name, model_path)
+        kwargs = {"encoding_mode": encoding_mode, "embedding_model": embedding_model}
+        super().__init__(kwargs)
+
+
+class MagnetoGPT(MagnetoBase):
+    """Magneto with the GPT reranker enabled and the bp reranker disabled."""
+
+    def __init__(self):
+        kwargs = {"use_bp_reranker": False, "use_gpt_reranker": True}
+        super().__init__(kwargs)
+
+
+class MagnetoFTGPT(MagnetoBase):
+    """Magneto with an explicit embedding model and the GPT reranker."""
+
+    def __init__(
+        self,
+        encoding_mode: str = "header_values_verbose",
+        model_name: Optional[str] = None,
+        model_path: Optional[str] = None,
+    ):
+        """See ``_resolve_embedding_model`` for how the model is selected."""
+        embedding_model = _resolve_embedding_model(model_name, model_path)
+        kwargs = {
+            "encoding_mode": encoding_mode,
+            "embedding_model": embedding_model,
+            "use_bp_reranker": False,
+            "use_gpt_reranker": True,
+        }
+        super().__init__(kwargs)
+
+
+def _resolve_embedding_model(model_name: Optional[str], model_path: Optional[str]):
+    """Select the embedding model, falling back to ``DEFAULT_MAGNETO_MODEL``.
+
+    The default is applied here instead of in the constructor signatures:
+    with a defaulted ``model_name``, passing only ``model_path`` would always
+    trip the mutual-exclusion check in ``check_magneto_model``.
+    """
+    if not model_name and not model_path:
+        model_name = DEFAULT_MAGNETO_MODEL
+    return check_magneto_model(model_name, model_path)
+
+
+def check_magneto_model(model_name: Optional[str], model_path: Optional[str]):
+    """Validate the model selection and return the model to load.
+
+    Returns ``model_path`` unchanged when given; otherwise the result of
+    ``get_cached_model_or_download(model_name)``.
+
+    Raises:
+        ValueError: If both arguments are provided, or neither is.
+    """
+    if model_name and model_path:
+        raise ValueError(
+            "Only one of model_name or model_path should be provided "
+            "(they are mutually exclusive)"
+        )
+
+    if model_path:
+        return model_path
+    elif model_name:
+        return get_cached_model_or_download(model_name)
+    else:
+        raise ValueError("Either model_name or model_path must be provided")
diff --git
a/bdikit/schema_matching/topk/matcher_factory.py b/bdikit/schema_matching/topk/matcher_factory.py index e256787..1d8caf0 100644 --- a/bdikit/schema_matching/topk/matcher_factory.py +++ b/bdikit/schema_matching/topk/matcher_factory.py @@ -10,6 +10,26 @@ class TopkMatchers(Enum): "bdikit.schema_matching.topk.contrastivelearning.CLTopkSchemaMatcher", ) + MAGNETO = ( + "magneto", + "bdikit.schema_matching.topk.magneto.Magneto", + ) + + MAGNETO_FT = ( + "magneto_ft", + "bdikit.schema_matching.topk.magneto.MagnetoFT", + ) + + MAGNETO_GPT = ( + "magneto_gpt", + "bdikit.schema_matching.topk.magneto.MagnetoGPT", + ) + + MAGNETO_FTGPT = ( + "magneto_ftgpt", + "bdikit.schema_matching.topk.magneto.MagnetoFTGPT", + ) + def __init__(self, matcher_name: str, matcher_path: str): self.matcher_name = matcher_name self.matcher_path = matcher_path diff --git a/requirements.txt b/requirements.txt index c41ffba..2408f2d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ requests scipy<1.13 matplotlib<3.9 panel!=1.4.3 -nltk>=3.9.1 \ No newline at end of file +nltk>=3.9.1 +magneto-python \ No newline at end of file