From a8790bb6d945ebb5b859de42b9bc9fed63f39e71 Mon Sep 17 00:00:00 2001
From: Roque Lopez
Date: Mon, 13 Jan 2025 17:26:53 -0500
Subject: [PATCH] feat: Add Magneto

---
Review notes (ignored by git am, not part of the commit message):
 * magneto.py was amended in review: model_path can now be passed on its own
   (model_name defaults to None and the built-in model is used only when
   neither argument is given), and MagnetoBase.map no longer rebuilds
   raw_matches keys from hard-coded "source"/"target" table names.
 * The blob id on the magneto.py "index" line is the pre-review one.
 * The download.py hunk carries two lines of leading context instead of three.

 bdikit/download.py                            |   1 +
 bdikit/schema_matching/one2one/magneto.py     | 130 ++++++++++++++++++
 .../one2one/matcher_factory.py                |  20 +++
 requirements.txt                              |   3 +-
 4 files changed, 153 insertions(+), 1 deletion(-)
 create mode 100644 bdikit/schema_matching/one2one/magneto.py

diff --git a/bdikit/download.py b/bdikit/download.py
index 9bcc5d8..ee6371b 100644
--- a/bdikit/download.py
+++ b/bdikit/download.py
@@ -16,5 +16,6 @@ BUILTIN_MODELS_BOX_URL = {
     "cl-reducer-v0.1": "https://nyu.box.com/shared/static/hc4qxzbuxz0uoynfwy4pe2yxo5ch6xgm.pt",
     "bdi-cl-v0.2": "https://nyu.box.com/shared/static/1vdc28kzbpoj6ey95bksaww541p9gj31.pt",
+    "magneto-v0.1": "https://nyu.box.com/shared/static/140g2rq1izc1wqs1ssrml6jzag3qa0mu.pth",
 }
 
 BDIKIT_EMBEDDINGS_CACHE_DIR = os.path.join(BDIKIT_CACHE_DIR, "embeddings")
diff --git a/bdikit/schema_matching/one2one/magneto.py b/bdikit/schema_matching/one2one/magneto.py
new file mode 100644
index 0000000..3d99e2e
--- /dev/null
+++ b/bdikit/schema_matching/one2one/magneto.py
@@ -0,0 +1,130 @@
+import pandas as pd
+from typing import Any, Dict, Optional
+from magneto import Magneto as Magneto_Lib
+from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
+from bdikit.download import get_cached_model_or_download
+
+DEFAULT_MAGNETO_MODEL = "magneto-v0.1"
+
+
+class MagnetoBase(BaseSchemaMatcher):
+    """Schema matcher that delegates column matching to the Magneto library."""
+
+    def __init__(self, kwargs: Optional[Dict[str, Any]] = None):
+        """Build the wrapped Magneto instance.
+
+        Args:
+            kwargs: Keyword arguments forwarded to the Magneto constructor;
+                ``None`` is treated as no arguments.
+        """
+        if kwargs is None:
+            kwargs = {}
+        self.magneto = Magneto_Lib(**kwargs)
+
+    def map(
+        self,
+        source: pd.DataFrame,
+        target: pd.DataFrame,
+    ):
+        """Return a dict mapping each source column to its best target column.
+
+        Magneto's matches are keyed by ``(source_key, target_key)`` pairs whose
+        second element is the column name; the value is the match score. Only
+        a strictly higher score replaces an earlier candidate.
+        """
+        raw_matches = self.magneto.get_matches(source, target)
+        result = {}
+        # Remember the winning score per source column instead of rebuilding a
+        # raw_matches key from hard-coded "source"/"target" table names.
+        best_scores = {}
+
+        for (source_key, target_key), score in raw_matches.items():
+            source_column = source_key[1]
+            if source_column not in result or best_scores[source_column] < score:
+                result[source_column] = target_key[1]
+                best_scores[source_column] = score
+
+        return result
+
+
+class Magneto(MagnetoBase):
+    """Magneto built with no constructor arguments (library defaults)."""
+
+    def __init__(self):
+        super().__init__()
+
+
+class MagnetoFT(MagnetoBase):
+    """Magneto configured with an encoding mode and an embedding model."""
+
+    def __init__(
+        self,
+        encoding_mode: str = "header_values_verbose",
+        model_name: Optional[str] = None,
+        model_path: Optional[str] = None,
+    ):
+        embedding_model = _resolve_embedding_model(model_name, model_path)
+        kwargs = {"encoding_mode": encoding_mode, "embedding_model": embedding_model}
+        super().__init__(kwargs)
+
+
+class MagnetoGPT(MagnetoBase):
+    """Magneto with ``use_gpt_reranker`` on and ``use_bp_reranker`` off."""
+
+    def __init__(self):
+        kwargs = {"use_bp_reranker": False, "use_gpt_reranker": True}
+        super().__init__(kwargs)
+
+
+class MagnetoFTGPT(MagnetoBase):
+    """Combination of the MagnetoFT and MagnetoGPT configurations."""
+
+    def __init__(
+        self,
+        encoding_mode: str = "header_values_verbose",
+        model_name: Optional[str] = None,
+        model_path: Optional[str] = None,
+    ):
+        embedding_model = _resolve_embedding_model(model_name, model_path)
+        kwargs = {
+            "encoding_mode": encoding_mode,
+            "embedding_model": embedding_model,
+            "use_bp_reranker": False,
+            "use_gpt_reranker": True,
+        }
+        super().__init__(kwargs)
+
+
+def check_magneto_model(model_name: str, model_path: str):
+    """Return the embedding model location for exactly one given argument.
+
+    Returns ``model_path`` as-is, or the result of
+    ``get_cached_model_or_download(model_name)``.
+
+    Raises:
+        ValueError: If both arguments are set, or neither is.
+    """
+    if model_name and model_path:
+        raise ValueError(
+            "Only one of model_name or model_path should be provided "
+            "(they are mutually exclusive)"
+        )
+
+    if model_path:
+        return model_path
+    elif model_name:
+        return get_cached_model_or_download(model_name)
+    else:
+        raise ValueError("Either model_name or model_path must be provided")
+
+
+def _resolve_embedding_model(model_name: Optional[str], model_path: Optional[str]):
+    """Apply DEFAULT_MAGNETO_MODEL when no model is given, then validate.
+
+    The default is filled in here rather than in the constructor signatures so
+    that passing ``model_path`` alone does not collide with a defaulted
+    ``model_name`` in check_magneto_model.
+    """
+    if model_name is None and model_path is None:
+        model_name = DEFAULT_MAGNETO_MODEL
+    return check_magneto_model(model_name, model_path)
diff --git a/bdikit/schema_matching/one2one/matcher_factory.py b/bdikit/schema_matching/one2one/matcher_factory.py
index 30e44d6..8c080e0 100644
--- a/bdikit/schema_matching/one2one/matcher_factory.py
+++ b/bdikit/schema_matching/one2one/matcher_factory.py
@@ -39,6 +39,26 @@ class SchemaMatchers(Enum):
         "bdikit.schema_matching.one2one.maxvalsim.MaxValSimSchemaMatcher",
     )
 
+    MAGNETO = (
+        "magneto",
+        "bdikit.schema_matching.one2one.magneto.Magneto",
+    )
+
+    MAGNETO_FT = (
+        "magneto_ft",
+        "bdikit.schema_matching.one2one.magneto.MagnetoFT",
+    )
+
+    MAGNETO_GPT = (
+        "magneto_gpt",
+        "bdikit.schema_matching.one2one.magneto.MagnetoGPT",
+    )
+
+    MAGNETO_FTGPT = (
+        "magneto_ftgpt",
+        "bdikit.schema_matching.one2one.magneto.MagnetoFTGPT",
+    )
+
     def __init__(self, matcher_name: str, matcher_path: str):
         self.matcher_name = matcher_name
         self.matcher_path = matcher_path
diff --git a/requirements.txt b/requirements.txt
index c41ffba..2408f2d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,5 @@ requests
 scipy<1.13
 matplotlib<3.9
 panel!=1.4.3
-nltk>=3.9.1
\ No newline at end of file
+nltk>=3.9.1
+magneto-python
\ No newline at end of file