diff --git a/docs/mmteb/points/888.jsonl b/docs/mmteb/points/888.jsonl new file mode 100644 index 0000000000..223cd50f62 --- /dev/null +++ b/docs/mmteb/points/888.jsonl @@ -0,0 +1,4 @@ +{"GitHub": "KennethEnevoldsen", "Bug fixes": 8} +{"GitHub": "Muennighoff", "Review PR": 2} +{"GitHub": "orionw", "Review PR": 2} +{"GitHub": "imenelydiaker", "Review PR": 2} \ No newline at end of file diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index b029727fe7..910efb6bd6 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -39,7 +39,10 @@ def evaluate(self, model, split, **kwargs) -> dict[HFSubset, ScoresDict]: scores = {} if self.parallel_subsets: scores["default"] = self._evaluate_subset( - model, self.dataset[split], parallel=True, **kwargs + model, + self.dataset[split], # type: ignore + parallel=True, + **kwargs, ) else: for hf_subet in hf_subsets: @@ -52,7 +55,10 @@ def evaluate(self, model, split, **kwargs) -> dict[HFSubset, ScoresDict]: else: data_split = self.dataset[hf_subet][split] scores[hf_subet] = self._evaluate_subset( - model, data_split, subsets=["sentence1", "sentence2"], **kwargs + model, + data_split, # type: ignore + subsets=["sentence1", "sentence2"], + **kwargs, ) return scores @@ -60,7 +66,9 @@ def evaluate(self, model, split, **kwargs) -> dict[HFSubset, ScoresDict]: def _evaluate_subset( self, model, data_split: Dataset, parallel=False, **kwargs ) -> ScoresDict: - evaluator = BitextMiningEvaluator(data_split, **kwargs) + evaluator = BitextMiningEvaluator( + data_split, task_name=self.metadata.name, **kwargs + ) metrics = evaluator(model) if parallel: for v in metrics.values(): diff --git a/mteb/abstasks/AbsTaskClassification.py b/mteb/abstasks/AbsTaskClassification.py index b2b813964e..6b86fa979b 100644 --- a/mteb/abstasks/AbsTaskClassification.py +++ b/mteb/abstasks/AbsTaskClassification.py @@ -118,6 +118,7 @@ def _evaluate_subset( y_sampled, eval_split["text"], eval_split["label"], + task_name=self.metadata.name, **params, ) elif self.method == "kNN-pytorch": @@ -126,6 +127,7 @@ def _evaluate_subset( y_sampled, eval_split["text"], eval_split["label"], + task_name=self.metadata.name, **params, ) elif self.method == "logReg": @@ -134,6 +136,7 @@ def _evaluate_subset( y_sampled, eval_split["text"], eval_split["label"], + task_name=self.metadata.name, **params, ) else: diff --git a/mteb/abstasks/AbsTaskClustering.py b/mteb/abstasks/AbsTaskClustering.py index 2512b90f16..aaed06ff6c 100644 --- a/mteb/abstasks/AbsTaskClustering.py +++ b/mteb/abstasks/AbsTaskClustering.py @@ -38,6 +38,7 @@ def _evaluate_subset( evaluator = ClusteringEvaluator( cluster_set["sentences"], # type: ignore cluster_set["labels"], # type: ignore + task_name=self.metadata.name, **kwargs, ) metrics = evaluator(model) diff --git a/mteb/abstasks/AbsTaskClusteringFast.py b/mteb/abstasks/AbsTaskClusteringFast.py index de5d9feb4e..a5142f3081 100644 --- a/mteb/abstasks/AbsTaskClusteringFast.py +++ b/mteb/abstasks/AbsTaskClusteringFast.py @@ -12,6 +12,7 @@ from datasets import Dataset, DatasetDict from sklearn.metrics.cluster import v_measure_score +from ..evaluation.evaluators.model_encode import model_encode from ..MTEBResults import HFSubset from .AbsTask import AbsTask @@ -116,13 +117,16 @@ def _evaluate_subset( example_indices = rng_state.sample( range(len(dataset)), k=self.max_documents_to_embed ) - downsampled_dataset = dataset.select(example_indices) + downsampled_dataset = dataset.select(example_indices) # type: ignore else: downsampled_dataset = dataset - logger.info(f"Encoding {len(downsampled_dataset)} sentences...") + embeddings = model_encode( + downsampled_dataset["sentences"], # type: ignore + model=model, + task_name=self.metadata.name, + ) - embeddings = model.encode(downsampled_dataset["sentences"]) labels = [] for label in downsampled_dataset["labels"]: if not isinstance(label, list): diff --git a/mteb/abstasks/AbsTaskInstructionRetrieval.py b/mteb/abstasks/AbsTaskInstructionRetrieval.py index dbcead59c4..4371049ffc 100644 --- a/mteb/abstasks/AbsTaskInstructionRetrieval.py +++ b/mteb/abstasks/AbsTaskInstructionRetrieval.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import json import logging import os @@ -22,10 +24,10 @@ class HFDataLoaderInstructions(HFDataLoader): def __init__( self, - hf_repo: str = None, - hf_repo_qrels: str = None, - data_folder: str = None, - prefix: str = None, + hf_repo: str | None = None, + hf_repo_qrels: str | None = None, + data_folder: str | None = None, + prefix: str | None = None, corpus_file: str = "corpus.jsonl", query_file: str = "queries.jsonl", qrels_folder: str = "qrels", @@ -323,7 +325,9 @@ def load_data(self, **kwargs): self.data_loaded = True def evaluate(self, model, split="test", **kwargs): - retriever = InstructionRetrievalEvaluator(model, **kwargs) + retriever = InstructionRetrievalEvaluator( + model=model, task_name=self.metadata.name, **kwargs + ) scores_og = {} scores_changed = {} diff --git a/mteb/abstasks/AbsTaskMultilabelClassification.py b/mteb/abstasks/AbsTaskMultilabelClassification.py index 49db456774..3d6bbff959 100644 --- a/mteb/abstasks/AbsTaskMultilabelClassification.py +++ b/mteb/abstasks/AbsTaskMultilabelClassification.py @@ -12,6 +12,7 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import MultiLabelBinarizer +from ..evaluation.evaluators.model_encode import model_encode from ..MTEBResults import HFSubset, ScoresDict from .AbsTask import AbsTask @@ -122,8 +123,12 @@ def _evaluate_subset( # Encode all unique sentences at the indices unique_train_indices = list(set(itertools.chain.from_iterable(train_samples))) unique_train_sentences = train_split.select(unique_train_indices)["text"] + + _unique_train_embeddings = model_encode( + unique_train_sentences, model=model, task_name=self.metadata.name + ) unique_train_embeddings = dict( - zip(unique_train_indices, model.encode(unique_train_sentences)) + zip(unique_train_indices, _unique_train_embeddings) ) test_text = eval_split["text"] binarizer = MultiLabelBinarizer() @@ -136,7 +141,8 @@ def _evaluate_subset( ) except ValueError: logger.warning("Couldn't subsample, continuing with the entire test set.") - X_test = model.encode(test_text) + + X_test = model_encode(test_text, model=model, task_name=self.metadata.name) for i_experiment, sample_indices in enumerate(train_samples): logger.info( "=" * 10 diff --git a/mteb/abstasks/AbsTaskPairClassification.py b/mteb/abstasks/AbsTaskPairClassification.py index 8b3be7f3f8..085ba0469a 100644 --- a/mteb/abstasks/AbsTaskPairClassification.py +++ b/mteb/abstasks/AbsTaskPairClassification.py @@ -41,7 +41,11 @@ def _evaluate_subset( "sentence_transformers.evaluation.PairClassificationEvaluator" ).setLevel(logging.WARN) evaluator = PairClassificationEvaluator( - data_split["sent1"], data_split["sent2"], data_split["labels"], **kwargs + data_split["sent1"], + data_split["sent2"], + data_split["labels"], + task_name=self.metadata.name, + **kwargs, ) scores = evaluator.compute_metrics(model) diff --git a/mteb/abstasks/AbsTaskReranking.py b/mteb/abstasks/AbsTaskReranking.py index f69cd576ce..92e446f88e 100644 --- a/mteb/abstasks/AbsTaskReranking.py +++ b/mteb/abstasks/AbsTaskReranking.py @@ -29,7 +29,9 @@ def _evaluate_subset( data_split: Dataset, **kwargs: Any, ) -> ScoresDict: - evaluator = RerankingEvaluator(data_split, **kwargs) + evaluator = RerankingEvaluator( + data_split, task_name=self.metadata.name, **kwargs + ) scores = evaluator(model) self._add_main_score(scores) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index b446779fc9..d42e124566 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -245,8 +245,10 @@ def load_data(self, **kwargs): self.data_loaded = True - def evaluate(self, model, split="test", **kwargs): - retriever = RetrievalEvaluator(model, **kwargs) + def evaluate(self, model, split: str = "test", **kwargs): + retriever = RetrievalEvaluator( + retriever=model, task_name=self.metadata.name, **kwargs + ) scores = {} hf_subsets = ( diff --git a/mteb/abstasks/AbsTaskSTS.py b/mteb/abstasks/AbsTaskSTS.py index 0f658438cd..e3595d4391 100644 --- a/mteb/abstasks/AbsTaskSTS.py +++ b/mteb/abstasks/AbsTaskSTS.py @@ -38,6 +38,7 @@ def normalize(x): data_split["sentence1"], data_split["sentence2"], normalized_scores, + task_name=self.metadata.name, **kwargs, ) scores = evaluator(model) diff --git a/mteb/abstasks/AbsTaskSummarization.py b/mteb/abstasks/AbsTaskSummarization.py index 99a1018efa..12c7f91be0 100644 --- a/mteb/abstasks/AbsTaskSummarization.py +++ b/mteb/abstasks/AbsTaskSummarization.py @@ -46,6 +46,7 @@ def _evaluate_subset(self, model, data_split, **kwargs) -> ScoresDict: human_summaries=data_split["human_summaries"], texts=data_split["text"], gold_scores=normalized_scores, + task_name=self.metadata.name, **kwargs, ) scores = evaluator(model) diff --git a/mteb/encoder_interface.py b/mteb/encoder_interface.py index 8440da6a01..edca380fb9 100644 --- a/mteb/encoder_interface.py +++ b/mteb/encoder_interface.py @@ -1,23 +1,26 @@ from __future__ import annotations -from typing import Any, Protocol, runtime_checkable +from typing import Any, Dict, List, Protocol, Sequence, Union, runtime_checkable import numpy as np import torch +Corpus = Union[List[Dict[str, str]], Dict[str, List[str]]] + @runtime_checkable class Encoder(Protocol): - """The interface for an encoder in MTEB.""" + """The interface for an encoder in MTEB. In general we try to keep this interface aligned with sentence-transformers.""" def encode( - self, sentences: list[str], prompt: str, **kwargs: Any + self, sentences: Sequence[str], *, prompt_name: str | None = None, **kwargs: Any ) -> torch.Tensor | np.ndarray: """Encodes the given sentences using the encoder. Args: sentences: The sentences to encode. - prompt: The prompt to use. Useful for prompt-based models. + prompt_name: The name of the prompt. This will just be the name of the task. Sentence-transformers uses this to + determine which prompt to use from a specified dictionary. **kwargs: Additional arguments to pass to the encoder. Returns: @@ -28,16 +31,17 @@ def encode( @runtime_checkable class EncoderWithQueryCorpusEncode(Encoder, Protocol): - """The interface for an encoder that supports encoding queries and a corpus.""" + """The optional interface for an encoder that supports encoding queries and a corpus.""" def encode_queries( - self, queries: list[str], prompt: str, **kwargs: Any + self, queries: Sequence[str], *, prompt_name: str | None = None, **kwargs: Any ) -> torch.Tensor | np.ndarray: """Encodes the given queries using the encoder. Args: queries: The queries to encode. - prompt: The prompt to use. Useful for prompt-based models. + prompt_name: The name of the prompt. This will just be the name of the task. Sentence-transformers uses this to + determine which prompt to use from a specified dictionary. **kwargs: Additional arguments to pass to the encoder. Returns: @@ -46,13 +50,14 @@ def encode_queries( ... def encode_corpus( - self, corpus: list[str], prompt: str, **kwargs: Any + self, corpus: Corpus, *, prompt_name: str | None = None, **kwargs: Any ) -> torch.Tensor | np.ndarray: """Encodes the given corpus using the encoder. Args: corpus: The corpus to encode. - prompt: The prompt to use. Useful for prompt-based models. + prompt_name: The name of the prompt. This will just be the name of the task. Sentence-transformers uses this to + determine which prompt to use from a specified dictionary. **kwargs: Additional arguments to pass to the encoder. Returns: diff --git a/mteb/evaluation/evaluators/BitextMiningEvaluator.py b/mteb/evaluation/evaluators/BitextMiningEvaluator.py index 25ac0dd652..c017869154 100644 --- a/mteb/evaluation/evaluators/BitextMiningEvaluator.py +++ b/mteb/evaluation/evaluators/BitextMiningEvaluator.py @@ -2,19 +2,21 @@ import logging -import numpy as np import torch import tqdm from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score +from mteb.encoder_interface import Encoder + from .Evaluator import Evaluator +from .model_encode import model_encode from .utils import cos_sim logger = logging.getLogger(__name__) class BitextMiningEvaluator(Evaluator): - def __init__(self, sentences, batch_size=32, limit=None, subsets=None, **kwargs): + def __init__(self, sentences, task_name: str | None = None, subsets=None, **kwargs): super().__init__(**kwargs) # By default, all the columns in sentences will serve for evaluation # Specifying a 'subsets' attribute will limit to certain columns @@ -29,22 +31,21 @@ def __init__(self, sentences, batch_size=32, limit=None, subsets=None, **kwargs) if "gold" not in sentences else sentences["gold"] ) + self.task_name = task_name - self.batch_size = batch_size - - def __call__(self, model): + def __call__(self, model: Encoder): scores = self.compute_metrics(model) return scores - def compute_metrics(self, model): + def compute_metrics(self, model: Encoder): # Compute embeddings logger.info(f"Encoding {self.n_subsets}x{self.n} sentences") embeddings = {} for sub in tqdm.tqdm( self.subsets, desc=f"Encoding {self.n_subsets}x{self.n} sentences" ): - embeddings[sub] = np.asarray( - model.encode(self.sentences[sub], batch_size=self.batch_size) + embeddings[sub] = model_encode( + self.sentences[sub], model=model, task_name=self.task_name ) if set(self.subsets) == {"sentence1", "sentence2"}: # Case of a single pair @@ -84,12 +85,12 @@ def _compute_metrics( scores = { "precision": precision_score( - labels, predictions, zero_division=0.0, average="weighted" + labels, predictions, zero_division=0, average="weighted" ), "recall": recall_score( - labels, predictions, zero_division=0.0, average="weighted" + labels, predictions, zero_division=0, average="weighted" ), - "f1": f1_score(labels, predictions, zero_division=0.0, average="weighted"), + "f1": f1_score(labels, predictions, zero_division=0, average="weighted"), "accuracy": accuracy_score(labels, predictions), } return scores @@ -98,20 +99,24 @@ def _similarity_search( self, query_embeddings, corpus_embeddings, - query_chunk_size=100, - corpus_chunk_size=500000, - top_k=10, + query_chunk_size: int = 100, + corpus_chunk_size: int = 500000, + top_k: int = 10, score_function=cos_sim, ): """This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings. It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries. - :param query_embeddings: A 2 dimensional tensor with the query embeddings. - :param corpus_embeddings: A 2 dimensional tensor with the corpus embeddings. - :param query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory. - :param corpus_chunk_size: Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory. - :param top_k: Retrieve top k matching entries. - :param score_function: Function for computing scores. By default, cosine similarity. - :return: Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores. + + Args: + query_embeddings: A 2 dimensional tensor with the query embeddings. + corpus_embeddings: A 2 dimensional tensor with the corpus embeddings. + query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory. + corpus_chunk_size: Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory. + top_k: Retrieve top k matching entries. + score_function: Function for computing scores. By default, cosine similarity. + + Returns: + Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores. """ query_embeddings = torch.from_numpy(query_embeddings) corpus_embeddings = torch.from_numpy(corpus_embeddings) diff --git a/mteb/evaluation/evaluators/ClassificationEvaluator.py b/mteb/evaluation/evaluators/ClassificationEvaluator.py index 2d68d1cf56..6ee9655463 100644 --- a/mteb/evaluation/evaluators/ClassificationEvaluator.py +++ b/mteb/evaluation/evaluators/ClassificationEvaluator.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from typing import Any import numpy as np import torch @@ -14,6 +15,9 @@ from sklearn.neighbors import KNeighborsClassifier from torch import Tensor +from mteb.encoder_interface import Encoder +from mteb.evaluation.evaluators.model_encode import model_encode + from .Evaluator import Evaluator logger = logging.getLogger(__name__) @@ -30,9 +34,10 @@ def __init__( y_train, embeddings_test, y_test, - k=1, - batch_size=32, - limit=None, + task_name: str | None, + k: int = 1, + batch_size: int = 32, + limit: int | None = None, **kwargs, ): super().__init__(**kwargs) @@ -47,10 +52,10 @@ def __init__( self.y_test = y_test self.batch_size = batch_size - self.k = k + self.task_name = task_name - def __call__(self, model, test_cache=None): + def __call__(self, model: Encoder, test_cache=None): scores = {} max_accuracy = 0 max_f1 = 0 @@ -86,9 +91,10 @@ def __init__( y_train, sentences_test, y_test, - k=1, - batch_size=32, - limit=None, + task_name: str, + k: int = 1, + batch_size: int = 32, + limit: int | None = None, **kwargs, ): super().__init__(**kwargs) @@ -102,8 +108,8 @@ def __init__( self.sentences_test = sentences_test self.y_test = y_test + self.task_name = task_name self.batch_size = batch_size - self.k = k def __call__(self, model, test_cache=None): @@ -111,12 +117,18 @@ def __call__(self, model, test_cache=None): max_accuracy = 0 max_f1 = 0 max_ap = 0 - X_train = np.asarray( - model.encode(self.sentences_train, batch_size=self.batch_size) + X_train = model_encode( + self.sentences_train, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) if test_cache is None: - X_test = np.asarray( - model.encode(self.sentences_test, batch_size=self.batch_size) + X_test = model_encode( + self.sentences_test, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) test_cache = X_test else: @@ -150,10 +162,11 @@ def __init__( y_train, sentences_test, y_test, - k=1, - batch_size=32, - limit=None, - **kwargs, + task_name: str, + k: int = 1, + batch_size: int = 32, + limit: int | None = None, + **kwargs: Any, ): super().__init__(**kwargs) if limit is not None: @@ -167,21 +180,28 @@ def __init__( self.sentences_test = sentences_test self.y_test = y_test + self.task_name = task_name self.batch_size = batch_size - self.k = k - def __call__(self, model, test_cache=None): + def __call__(self, model: Encoder, test_cache=None): scores = {} max_accuracy = 0 max_f1 = 0 max_ap = 0 - X_train = np.asarray( - model.encode(self.sentences_train, batch_size=self.batch_size) + X_train = model_encode( + self.sentences_train, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) + if test_cache is None: - X_test = np.asarray( - model.encode(self.sentences_test, batch_size=self.batch_size) + X_test = model_encode( + self.sentences_test, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) test_cache = X_test else: @@ -243,7 +263,9 @@ def _cos_sim(a: Tensor, b: Tensor): @staticmethod def _euclidean_dist(a: Tensor, b: Tensor): """Computes the euclidean distance euclidean_dist(a[i], b[j]) for all i and j. - :return: Matrix with res[i][j] = euclidean_dist(a[i], b[j]) + + Returns: + Matrix with res[i][j] = euclidean_dist(a[i], b[j]) """ if not isinstance(a, torch.Tensor): a = torch.tensor(a) @@ -262,7 +284,9 @@ def _euclidean_dist(a: Tensor, b: Tensor): @staticmethod def _dot_score(a: Tensor, b: Tensor): """Computes the dot-product dot_prod(a[i], b[j]) for all i and j. - :return: Matrix with res[i][j] = dot_prod(a[i], b[j]) + + Returns: + Matrix with res[i][j] = dot_prod(a[i], b[j]) """ if not isinstance(a, torch.Tensor): a = torch.tensor(a) @@ -286,9 +310,10 @@ def __init__( y_train, sentences_test, y_test, - max_iter=100, - batch_size=32, - limit=None, + task_name: str, + max_iter: int = 100, + batch_size: int = 32, + limit: int | None = None, **kwargs, ): super().__init__(**kwargs) @@ -304,6 +329,7 @@ def __init__( self.max_iter = max_iter self.batch_size = batch_size + self.task_name = task_name def __call__(self, model, test_cache=None): scores = {} @@ -313,14 +339,18 @@ def __call__(self, model, test_cache=None): max_iter=self.max_iter, verbose=1 if logger.isEnabledFor(logging.DEBUG) else 0, ) - logger.info(f"Encoding {len(self.sentences_train)} training sentences...") - X_train = np.asarray( - model.encode(self.sentences_train, batch_size=self.batch_size) + X_train = model_encode( + self.sentences_train, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) - logger.info(f"Encoding {len(self.sentences_test)} test sentences...") if test_cache is None: - X_test = np.asarray( - model.encode(self.sentences_test, batch_size=self.batch_size) + X_test = model_encode( + self.sentences_test, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) test_cache = X_test else: @@ -329,14 +359,15 @@ def __call__(self, model, test_cache=None): clf.fit(X_train, self.y_train) logger.info("Evaluating...") y_pred = clf.predict(X_test) - accuracy = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average="macro") - scores["accuracy"] = accuracy - scores["f1"] = f1 + scores["accuracy"] = accuracy_score(self.y_test, y_pred) + scores["f1"] = f1_score(self.y_test, y_pred, average="macro") + scores["f1_weighted"] = f1_score(self.y_test, y_pred, average="weighted") # if binary classification if len(np.unique(self.y_train)) == 2: - ap = average_precision_score(self.y_test, y_pred) - scores["ap"] = ap + scores["ap"] = average_precision_score(self.y_test, y_pred, average="macro") + scores["ap_weighted"] = average_precision_score( + self.y_test, y_pred, average="weighted" + ) return scores, test_cache diff --git a/mteb/evaluation/evaluators/ClusteringEvaluator.py b/mteb/evaluation/evaluators/ClusteringEvaluator.py index bbf2aadeaa..5de1533a2e 100644 --- a/mteb/evaluation/evaluators/ClusteringEvaluator.py +++ b/mteb/evaluation/evaluators/ClusteringEvaluator.py @@ -2,11 +2,14 @@ import logging -import numpy as np import sklearn import sklearn.cluster +from sklearn import metrics + +from mteb.encoder_interface import Encoder from .Evaluator import Evaluator +from .model_encode import model_encode logger = logging.getLogger(__name__) @@ -16,9 +19,10 @@ def __init__( self, sentences, labels, - clustering_batch_size=500, - batch_size=32, - limit=None, + task_name: str | None = None, + clustering_batch_size: int = 500, + batch_size: int = 32, + limit: int | None = None, **kwargs, ): super().__init__(**kwargs) @@ -29,11 +33,14 @@ def __init__( self.labels = labels self.clustering_batch_size = clustering_batch_size self.batch_size = batch_size - - def __call__(self, model): - logger.info(f"Encoding {len(self.sentences)} sentences...") - corpus_embeddings = np.asarray( - model.encode(self.sentences, batch_size=self.batch_size) + self.task_name = task_name + + def __call__(self, model: Encoder): + corpus_embeddings = model_encode( + self.sentences, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) logger.info("Fitting Mini-Batch K-Means model...") @@ -46,8 +53,6 @@ def __call__(self, model): cluster_assignment = clustering_model.labels_ logger.info("Evaluating...") - v_measure = sklearn.metrics.cluster.v_measure_score( - self.labels, cluster_assignment - ) + v_measure = metrics.cluster.v_measure_score(self.labels, cluster_assignment) return {"v_measure": v_measure} diff --git a/mteb/evaluation/evaluators/Evaluator.py b/mteb/evaluation/evaluators/Evaluator.py index 6ab6631d91..466b5615d7 100644 --- a/mteb/evaluation/evaluators/Evaluator.py +++ b/mteb/evaluation/evaluators/Evaluator.py @@ -2,17 +2,20 @@ import random from abc import ABC, abstractmethod +from typing import Any import numpy as np import torch +from mteb.encoder_interface import Encoder + class Evaluator(ABC): """Base class for all evaluators Extend this class and implement __call__ for custom evaluators. """ - def __init__(self, seed=42, **kwargs): + def __init__(self, seed: int = 42, **kwargs: Any): self.seed = seed random.seed(self.seed) np.random.seed(self.seed) @@ -20,7 +23,7 @@ def __init__(self, seed=42, **kwargs): torch.cuda.manual_seed_all(self.seed) @abstractmethod - def __call__(self, model): + def __call__(self, model: Encoder): """This is called during training to evaluate the model. It returns scores. diff --git a/mteb/evaluation/evaluators/PairClassificationEvaluator.py b/mteb/evaluation/evaluators/PairClassificationEvaluator.py index 8c18e6be9c..69d3d9c189 100644 --- a/mteb/evaluation/evaluators/PairClassificationEvaluator.py +++ b/mteb/evaluation/evaluators/PairClassificationEvaluator.py @@ -10,6 +10,9 @@ paired_manhattan_distances, ) +from mteb.encoder_interface import Encoder +from mteb.evaluation.evaluators.model_encode import model_encode + from .Evaluator import Evaluator logger = logging.getLogger(__name__) @@ -22,16 +25,25 @@ class PairClassificationEvaluator(Evaluator): The returned score is the accuracy with a specified metric. The results are written in a CSV. If a CSV already exists, then values are appended. The labels need to be 0 for dissimilar pairs and 1 for similar pairs. - :param sentences1: The first column of sentences - :param sentences2: The second column of sentences - :param labels: labels[i] is the label for the pair (sentences1[i], sentences2[i]). Must be 0 or 1 - :param name: Name for the output - :param batch_size: Batch size used to compute embeddings - :param write_csv: Write results to a CSV file + + Args: + sentences1: The first column of sentences + sentences2: The second column of sentences + labels: labels[i] is the label for the pair (sentences1[i], sentences2[i]). Must be 0 or 1 + name: Name for the output + batch_size: Batch size used to compute embeddings + write_csv: Write results to a CSV file """ def __init__( - self, sentences1, sentences2, labels, batch_size=32, limit=None, **kwargs + self, + sentences1, + sentences2, + labels, + task_name: str | None = None, + batch_size: int = 32, + limit: int | None = None, + **kwargs, ): super().__init__(**kwargs) if limit: @@ -42,13 +54,14 @@ def __init__( self.sentences2 = sentences2 self.labels = labels self.batch_size = batch_size + self.task_name = task_name assert len(self.sentences1) == len(self.sentences2) assert len(self.sentences1) == len(self.labels) for label in labels: assert label == 0 or label == 1 - def __call__(self, model): + def __call__(self, model: Encoder): scores = self.compute_metrics(model) # Main score is the max of Average Precision (AP) @@ -56,15 +69,23 @@ def __call__(self, model): scores["main_score"] = main_score return scores - def compute_metrics(self, model): + def compute_metrics(self, model: Encoder): sentences = list(set(self.sentences1 + self.sentences2)) - logger.info(f"Encoding {len(sentences)} sentences...") - embeddings = np.asarray(model.encode(sentences, batch_size=self.batch_size)) + + total_sents = len(self.sentences1) + len(self.sentences2) + n_duplicates = total_sents - len(sentences) + if n_duplicates: + logger.warning( + f"Found {n_duplicates}/{total_sents} duplicates in the input data. Only encoding unique sentences." + ) + embeddings = model_encode( + sentences, model=model, task_name=self.task_name, batch_size=self.batch_size + ) emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)} embeddings1 = [emb_dict[sent] for sent in self.sentences1] embeddings2 = [emb_dict[sent] for sent in self.sentences2] - logger.info("Computing similarity distances...") + logger.info("Computing similarity distances.") cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2) manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2) euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 53dd0fb8a1..2552aecedb 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -1,15 +1,17 @@ from __future__ import annotations import logging -from typing import Dict, List +from functools import partial +from typing import Callable, Dict, List import numpy as np import torch import tqdm from sklearn.metrics import average_precision_score -from ...encoder_interface import EncoderWithQueryCorpusEncode +from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode from .Evaluator import Evaluator +from .model_encode import model_encode from .utils import confidence_scores, cos_sim, nAUC logger = logging.getLogger(__name__) @@ -29,6 +31,7 @@ class RerankingEvaluator(Evaluator): def __init__( self, samples, + task_name: str | None, mrr_at_k: int = 10, name: str = "", similarity_fct=cos_sim, @@ -46,6 +49,7 @@ def __init__( self.similarity_fct = similarity_fct self.batch_size = batch_size self.use_batched_encoding = use_batched_encoding + self.task_name = task_name if isinstance(self.samples, dict): self.samples = list(self.samples.values()) @@ -68,7 +72,7 @@ def compute_metrics(self, model): else self.compute_metrics_individual(model) ) - def compute_metrics_batched(self, model): + def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode): """Computes the metrices in a batched way, by batching all queries and all documents together """ @@ -81,12 +85,12 @@ def compute_metrics_batched(self, model): encode_queries_func = ( model.encode_queries if isinstance(model, EncoderWithQueryCorpusEncode) - else model.encode + else partial(model_encode, model=model) ) encode_corpus_func = ( model.encode_corpus if isinstance(model, EncoderWithQueryCorpusEncode) - else model.encode + else partial(model_encode, model=model) ) logger.info("Encoding queries...") @@ -94,6 +98,7 @@ def compute_metrics_batched(self, model): all_query_embs = np.asarray( encode_queries_func( [sample["query"] for sample in self.samples], + task_name=self.task_name, batch_size=self.batch_size, ) ) @@ -103,7 +108,10 @@ def compute_metrics_batched(self, model): q for sample in self.samples for q in sample["query"] ] all_query_embs = self._encode_unique_texts( - all_query_flattened, encode_corpus_func + all_query_flattened, + encode_queries_func, + task_name=self.task_name, + batch_size=self.batch_size, ) else: raise ValueError( @@ -116,7 +124,12 @@ def compute_metrics_batched(self, model): all_docs.extend(sample["positive"]) all_docs.extend(sample["negative"]) - all_docs_embs = self._encode_unique_texts(all_docs, encode_corpus_func) + all_docs_embs = self._encode_unique_texts( + all_docs, + encode_corpus_func, + task_name=self.task_name, + batch_size=self.batch_size, + ) # Compute scores and confidence scores logger.info("Evaluating...") @@ -189,9 +202,15 @@ def compute_metrics_individual(self, model): # .encoding interface requires List[str] as input query = [query] query_emb = np.asarray( - encode_queries_func(query, batch_size=self.batch_size) + encode_queries_func( + query, task_name=self.task_name, batch_size=self.batch_size + ) + ) + docs_emb = np.asarray( + encode_corpus_func( + docs, task_name=self.task_name, batch_size=self.batch_size + ) ) - docs_emb = np.asarray(encode_corpus_func(docs, batch_size=self.batch_size)) sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) scores = self._compute_metrics_instance(sim_scores, is_relevant) @@ -210,7 +229,13 @@ def compute_metrics_individual(self, model): return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} - def _encode_unique_texts(self, all_texts, encode_queries_func): + @staticmethod + def _encode_unique_texts( + all_texts: list[str], + encode_fn: Callable, + task_name: str, + batch_size: int, + ): index_map, all_unique_texts, all_texts_indexes = {}, [], [] for text in all_texts: text_hash = hash(text) @@ -219,10 +244,10 @@ def _encode_unique_texts(self, all_texts, encode_queries_func): all_unique_texts.append(text) all_texts_indexes.append(index_map[text_hash]) logger.warning( - f"A total on {len(all_texts) - len(all_unique_texts)} duplicate texts were found during encoding. Only encoding unique text and duplicating embeddings across." + f"A total on {len(all_texts) - len(all_unique_texts)}/{len(all_texts)} duplicate texts were found during encoding. Only encoding unique text and duplicating embeddings across." ) all_unique_texts_embs = np.asarray( - encode_queries_func(all_unique_texts, batch_size=self.batch_size) + encode_fn(all_unique_texts, task_name=task_name, batch_size=batch_size) ) return all_unique_texts_embs[all_texts_indexes] diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 2f2426e2d3..ecb51c180b 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -14,7 +14,10 @@ from sentence_transformers import CrossEncoder, SentenceTransformer from sentence_transformers.models import Transformer, WordEmbeddings +from mteb.encoder_interface import EncoderWithQueryCorpusEncode + from .Evaluator import Evaluator +from .model_encode import model_encode from .utils import ( confidence_scores, convert_conv_history_to_query, @@ -35,10 +38,10 @@ class DenseRetrievalExactSearch: def __init__( self, - model, + model: EncoderWithQueryCorpusEncode, batch_size: int = 128, corpus_chunk_size: int = 50000, - previous_results: str = None, + previous_results: str | None = None, **kwargs, ): # Model is class that provides encode_corpus() and encode_queries() @@ -71,6 +74,7 @@ def search( queries: dict[str, Union[str, List[str]]], top_k: int, score_function: str, + task_name: str, return_sorted: bool = False, **kwargs, ) -> dict[str, dict[str, float]]: @@ -79,18 +83,17 @@ def search( # Returns a ranked list with the corpus ids if score_function not in self.score_functions: raise ValueError( - "score function: {} must be either (cos_sim) for cosine similarity or (dot) for dot product".format( - score_function - ) + f"score function: {score_function} must be either (cos_sim) for cosine similarity or (dot) for dot product" ) - logger.info("Encoding Queries...") + logger.info("Encoding Queries.") query_ids = list(queries.keys()) self.results = {qid: {} for qid in query_ids} queries = [queries[qid] for qid in queries] if isinstance(queries[0], list): query_embeddings = self.model.encode_conversations( queries, + task_name=task_name, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_tensor=self.convert_to_tensor, @@ -99,6 +102,7 @@ def search( else: query_embeddings = self.model.encode_queries( queries, + task_name=task_name, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_tensor=self.convert_to_tensor, @@ -141,7 +145,8 @@ def search( else: # Encode chunk of corpus sub_corpus_embeddings = self.model.encode_corpus( - corpus[corpus_start_idx:corpus_end_idx], + corpus[corpus_start_idx:corpus_end_idx], # type: ignore + task_name=task_name, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_tensor=self.convert_to_tensor, @@ -303,13 +308,17 @@ def predict(self, queries, passages, **kwargs): "You must implement a predict method for your reranker model" ) - def encode_conversations(self, conversations: List[List[str]], **kwargs): + def encode_conversations( + self, conversations: List[List[str]], task_name: str, **kwargs + ): if callable(getattr(self.model, "encode_conversations", None)): - return self.model.encode_conversations(conversations, **kwargs) + return self.model.encode_conversations( + conversations, task_name=task_name, **kwargs + ) # otherwise fallback to default implementation # TODO: add a warning here queries = self.convert_conv_history_to_query(conversations) - return self.encode_queries(queries, **kwargs) + return self.encode_queries(queries, task_name=task_name, **kwargs) def convert_conv_history_to_query(self, conversations: List[List[str]]) -> str: if callable(getattr(self.model, "convert_conv_history_to_query", None)): @@ -328,7 +337,9 @@ def __init__(self, model, **kwargs): self.save_corpus_embeddings = kwargs.get("save_corpus_embeddings", False) self.corpus_embeddings = {} - def encode_queries(self, queries: List[str], batch_size: int, **kwargs): + def encode_queries( + self, queries: List[str], *, task_name: str, batch_size: int, **kwargs + ): if self.use_sbert_model: if isinstance(self.model._first_module(), Transformer): logger.info( @@ -352,9 +363,17 @@ def encode_queries(self, queries: List[str], batch_size: int, **kwargs): # can't just delete, cuz assign by reference on kwargs new_kwargs = kwargs - return self.model.encode(queries, batch_size=batch_size, **new_kwargs) + return model_encode( + queries, + model=self.model, + task_name=task_name, + batch_size=batch_size, + **new_kwargs, + ) - def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs): + def encode_corpus( + self, corpus: List[Dict[str, str]], task_name: str, batch_size: int, **kwargs + ): if ( "qid" in kwargs and self.save_corpus_embeddings @@ -385,27 +404,39 @@ def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs) # can't just delete, cuz assign by reference on kwargs new_kwargs = kwargs - corpus_embeddings = self.model.encode( - sentences, batch_size=batch_size, **new_kwargs + corpus_embeddings = model_encode( + sentences, + model=self.model, + task_name=task_name, + batch_size=batch_size, + **new_kwargs, ) + if self.save_corpus_embeddings and "qid" in kwargs: - if isinstance(corpus_embeddings, torch.tensor): - corpus_embeddings = corpus_embeddings.cpu().detach() self.corpus_embeddings[kwargs["qid"]] = corpus_embeddings return corpus_embeddings - def encode(self, sentences: List[str], **kwargs): - return self.model.encode(sentences, **kwargs) + def encode(self, sentences: List[str], task_name: str, **kwargs): + return self.encode_queries(sentences, task_name=task_name, **kwargs) def encode_conversations( - self, conversations: List[List[str]], batch_size: int, **kwargs + self, + conversations: List[List[str]], + *, + batch_size: int, + task_name: str, + **kwargs, ): if callable(getattr(self.model, "encode_conversations", None)): - return self.model.encode_conversations(conversations, **kwargs) + return self.model.encode_conversations( + conversations, task_name=task_name, **kwargs + ) # otherwise fallback to default implementation # TODO: add a warning here queries = self.convert_conv_history_to_query(conversations) - return self.encode_queries(queries, batch_size=batch_size, **kwargs) + return self.encode_queries( + queries, batch_size=batch_size, task_name=task_name, **kwargs + ) def convert_conv_history_to_query(self, conversations: List[List[str]]) -> str: if callable(getattr(self.model, "convert_conv_history_to_query", None)): @@ -433,6 +464,7 @@ class RetrievalEvaluator(Evaluator): def __init__( self, retriever=None, + task_name: str | None = None, k_values: List[int] = [1, 3, 5, 10, 20, 100, 1000], score_function: str = "cos_sim", **kwargs, @@ -451,12 +483,16 @@ def __init__( ) self.retriever = DenseRetrievalExactSearch(retriever, **kwargs) else: + logger.info( + "The model does not have the optional encode_queries and encode_corpus functions. Wrapping it in DRESModel." + ) self.retriever = DenseRetrievalExactSearch(DRESModel(retriever), **kwargs) self.k_values = k_values self.top_k = ( max(k_values) if "top_k" not in kwargs else kwargs["top_k"] ) # can lower it if reranking self.score_function = score_function + self.task_name = task_name def __call__( self, @@ -470,7 +506,11 @@ def __call__( return self.retriever.search_cross_encoder(corpus, queries, self.top_k) else: return self.retriever.search( - corpus, queries, self.top_k, self.score_function + corpus, + queries, + self.top_k, + self.score_function, + task_name=self.task_name, ) @staticmethod diff --git a/mteb/evaluation/evaluators/STSEvaluator.py b/mteb/evaluation/evaluators/STSEvaluator.py index 0fdc4ecc28..b8994d6ed2 100644 --- a/mteb/evaluation/evaluators/STSEvaluator.py +++ b/mteb/evaluation/evaluators/STSEvaluator.py @@ -2,7 +2,6 @@ import logging -import numpy as np from scipy.stats import pearsonr, spearmanr from sklearn.metrics.pairwise import ( paired_cosine_distances, @@ -11,13 +10,21 @@ ) from .Evaluator import Evaluator +from .model_encode import model_encode logger = logging.getLogger(__name__) class STSEvaluator(Evaluator): def __init__( - self, sentences1, sentences2, gold_scores, batch_size=64, limit=None, **kwargs + self, + sentences1, + sentences2, + gold_scores, + task_name: str | None = None, + batch_size: int = 64, + limit: int | None = None, + **kwargs, ): super().__init__(**kwargs) if limit is not None: @@ -28,15 +35,20 @@ def __init__( self.sentences2 = sentences2 self.gold_scores = gold_scores self.batch_size = batch_size + self.task_name = task_name def __call__(self, model): - logger.info(f"Encoding {len(self.sentences1)} sentences1...") - embeddings1 = np.asarray( - model.encode(self.sentences1, batch_size=self.batch_size) + embeddings1 = model_encode( + self.sentences1, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) - logger.info(f"Encoding {len(self.sentences2)} sentences2...") - embeddings2 = np.asarray( - model.encode(self.sentences2, batch_size=self.batch_size) + embeddings2 = model_encode( + self.sentences2, + model=model, + task_name=self.task_name, + batch_size=self.batch_size, ) logger.info("Evaluating...") diff --git a/mteb/evaluation/evaluators/SummarizationEvaluator.py b/mteb/evaluation/evaluators/SummarizationEvaluator.py index a779ec51c7..7010913287 100644 --- a/mteb/evaluation/evaluators/SummarizationEvaluator.py +++ b/mteb/evaluation/evaluators/SummarizationEvaluator.py @@ -7,7 +7,10 @@ import tqdm from scipy.stats import pearsonr, spearmanr +from mteb.encoder_interface import Encoder + from .Evaluator import Evaluator +from .model_encode import model_encode from .utils import cos_sim, dot_score logger = logging.getLogger(__name__) @@ -16,12 +19,13 @@ class SummarizationEvaluator(Evaluator): def __init__( self, + task_name: str | None, human_summaries=None, machine_summaries=None, texts=None, gold_scores=None, - limit=None, - batch_size=32, + limit: int | None = None, + batch_size: int = 32, **kwargs, ): # human_summaries shape: (None, num_human_summaries) @@ -39,8 +43,9 @@ def __init__( self.texts = texts self.gold_scores = gold_scores self.batch_size = batch_size + self.task_name = task_name - def __call__(self, model): + def __call__(self, model: Encoder): cosine_spearman_scores = [] cosine_pearson_scores = [] dot_spearman_scores = [] @@ -52,22 +57,27 @@ def __call__(self, model): len(machine_summaries) for machine_summaries in self.machine_summaries ] - logger.info(f"Encoding {sum(human_lens)} human summaries...") - embs_human_summaries_all = model.encode( + logger.info("Encoding human summaries...") + embs_human_summaries_all = model_encode( [ summary for human_summaries in self.human_summaries for summary in human_summaries ], + model=model, + task_name=self.task_name, batch_size=self.batch_size, ) - logger.info(f"Encoding {sum(machine_lens)} machine summaries...") - embs_machine_summaries_all = model.encode( + + logger.info("Encoding machine summaries...") + embs_machine_summaries_all = model_encode( [ summary for machine_summaries in self.machine_summaries for summary in machine_summaries ], + model=model, + task_name=self.task_name, batch_size=self.batch_size, ) diff --git a/mteb/evaluation/evaluators/model_encode.py b/mteb/evaluation/evaluators/model_encode.py new file mode 100644 index 0000000000..e17f99f42b --- /dev/null +++ b/mteb/evaluation/evaluators/model_encode.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import logging +from typing import Sequence + +import numpy as np +import torch + +from mteb.encoder_interface import Encoder + +logger = logging.getLogger(__name__) + + +def model_encode( + sentences: Sequence[str], *, model: Encoder, task_name: str | None, **kwargs +) -> np.ndarray: + kwargs["prompt_name"] = task_name + if hasattr(model, "prompts") and task_name not in model.prompts: # type: ignore + logger.info( + f"Prompt {task_name} not found in model prompts. Removing prompt_name argument." + ) + kwargs.pop("prompt_name") + + logger.info(f"Encoding {len(sentences)} sentences.") + + embeddings = model.encode(sentences, **kwargs) + if isinstance(embeddings, torch.Tensor): + embeddings = embeddings.cpu().detach() + + return np.asarray(embeddings) diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index b3d8717a4b..84e61e1f47 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -9,6 +9,7 @@ from mteb.model_meta import ModelMeta from mteb.models import ( e5_models, + e5_instruct, gritlm, openai_models, sentence_transformers_models, @@ -54,7 +55,7 @@ def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta: A model metadata object """ if model_name in models: - if not models[model_name].revision == revision: + if revision and (not models[model_name].revision == revision): raise ValueError(f"Model {revision} not found for model {model_name}") return models[model_name] else: # assume it is a sentence-transformers model @@ -106,6 +107,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe model_modules = [ e5_models, + e5_instruct, gritlm, openai_models, sentence_transformers_models, @@ -113,7 +115,6 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe ] models = {} - for module in model_modules: for mdl in vars(module).values(): if isinstance(mdl, ModelMeta): diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py new file mode 100644 index 0000000000..8d51839769 --- /dev/null +++ b/mteb/models/e5_instruct.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import logging +from itertools import islice +from typing import Any, Iterable, Literal, Optional, Sequence, TypeVar + +import numpy as np +import torch +from torch import Tensor +from tqdm import tqdm +from transformers import AutoModel, AutoTokenizer, BatchEncoding +from transformers.modeling_outputs import ModelOutput + +from mteb.encoder_interface import Encoder +from mteb.model_meta import ModelMeta + +from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES +from .instructions import task_to_instruction + +logger = logging.getLogger(__name__) + +T = TypeVar("T") +EncodeTypes = Literal["query", "passage"] + +MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"] + + +def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]: + """batched('ABCDEFG', 3) --> ABC DEF G""" # noqa + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch + + +class E5InstructWrapper(Encoder): + def __init__( + self, + model_name: str, + revision: str, + max_length: int, + max_batch_size: Optional[int] = None, + **kwargs: Any, + ): + logger.info("Started loading e5 instruct model") + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, revision=revision, **kwargs + ) + self.model = AutoModel.from_pretrained(model_name, **kwargs) + self.max_length = max_length + self.max_batch_size = max_batch_size + + def preprocess( + self, sentences: Sequence[str], instruction: str, encode_type: EncodeTypes + ) -> BatchEncoding: + if encode_type == "query": + sentences = [ + f"Instruction: {instruction}\nQuery: {sentence}" + for sentence in sentences + ] + + batch_dict = self.tokenizer( + sentences, # type: ignore + max_length=512, + padding=True, + truncation=True, + return_tensors="pt", + ) + + return batch_dict.to(self.model.device) + + def get_embedding_from_output( + self, output: ModelOutput, batch_dict: BatchEncoding + ) -> torch.Tensor: + return self.average_pool(output.last_hidden_state, batch_dict["attention_mask"]) # type: ignore + + @staticmethod + def average_pool( + last_hidden_states: torch.Tensor, attention_mask: torch.Tensor + ) -> Tensor: + last_hidden = last_hidden_states.masked_fill( + ~attention_mask[..., None].bool(), 0.0 + ) + return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + + def encode( + self, + sentences: list[str], + *, + prompt_name: str | None = None, + batch_size: int = 32, + encode_type: EncodeTypes = "query", + **kwargs: Any, # noqa + ) -> np.ndarray: + if self.max_batch_size and batch_size > self.max_batch_size: + batch_size = self.max_batch_size + batched_embeddings = [] + if prompt_name is not None: + instruction = task_to_instruction(prompt_name) + else: + instruction = "" + for batch in tqdm(batched(sentences, batch_size)): + with torch.inference_mode(): + batch_dict = self.preprocess( + batch, instruction=instruction, encode_type=encode_type + ) + outputs = self.model(**batch_dict) + embeddings = self.get_embedding_from_output(outputs, batch_dict) + batched_embeddings.append(embeddings.detach().cpu()) + + return torch.cat(batched_embeddings).to("cpu").detach().numpy() + + def encode_corpus( + self, + corpus: list[dict[str, str]] | dict[str, list[str]] | list[str], + **kwargs: Any, + ) -> np.ndarray: + sep = " " + if isinstance(corpus, dict): + sentences = [ + (corpus["title"][i] + sep + corpus["text"][i]).strip() + if "title" in corpus + else corpus["text"][i].strip() # type: ignore + for i in range(len(corpus["text"])) # type: ignore + ] + else: + if isinstance(corpus[0], str): + sentences = corpus + else: + sentences = [ + (doc["title"] + sep + doc["text"]).strip() + if "title" in doc + else doc["text"].strip() + for doc in corpus + ] + return self.encode(sentences, encode_type="passage", **kwargs) + + def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: + return self.encode(queries, encode_type="query", **kwargs) + + +class E5MistralWrapper(E5InstructWrapper): + def __init__( + self, + revision: str, + max_batch_size: int = 4, + torch_dtype=torch.float16, + **kwargs, + ): + super().__init__( + "intfloat/e5-mistral-7b-instruct", + revision=revision, + max_length=4096, + max_batch_size=max_batch_size, + torch_dtype=torch_dtype, + ) + + @staticmethod + def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: + left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] + if left_padding: + return last_hidden_states[:, -1] + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_states.shape[0] + return last_hidden_states[ + torch.arange(batch_size, device=last_hidden_states.device), + sequence_lengths, + ] + + def get_embbeding_from_output( + self, output: ModelOutput, batch_dict: BatchEncoding + ) -> torch.Tensor: + return self.last_token_pool( + output.last_hidden_state, # type: ignore + batch_dict["attention_mask"], # type: ignore + ) + + def preprocess( + self, sentences: Sequence[str], instruction: str, encode_type: EncodeTypes + ) -> BatchEncoding: + if encode_type == "query": + sentences = [ + f"Instruction: {instruction}\nQuery: {sentence}" + for sentence in sentences + ] + batch_dict: BatchEncoding = self.tokenizer( + sentences, # type: ignore + max_length=self.max_length - 1, + return_attention_mask=False, + padding=False, + truncation=True, + ) + # append eos_token_id to every input_ids + batch_dict["input_ids"] = [ + [*input_ids, self.tokenizer.eos_token_id] + for input_ids in batch_dict["input_ids"] # type: ignore + ] + batch_dict = self.tokenizer.pad( + batch_dict, padding=True, return_attention_mask=True, return_tensors="pt" + ) + + return batch_dict.to(self.model.device) + + +e5_instruct = ModelMeta( + loader=lambda: E5InstructWrapper( + "intfloat/multilingual-e5-large-instruct", + revision="baa7be480a7de1539afce709c8f13f833a510e0a", + max_length=512, + ), + name="intfloat/multilingual-e5-large-instruct", + languages=XLMR_LANGUAGES, + open_source=True, + revision="baa7be480a7de1539afce709c8f13f833a510e0a", + release_date=E5_PAPER_RELEASE_DATE, +) + +e5_mistral = ModelMeta( + loader=lambda: E5MistralWrapper( + revision="07163b72af1488142a360786df853f237b1a3ca1", + max_batch_size=4, + torch_dtype=torch.float16, + ), + name="intfloat/e5-mistral-7b-instruct", + languages=XLMR_LANGUAGES, + open_source=True, + revision="07163b72af1488142a360786df853f237b1a3ca1", + release_date=E5_PAPER_RELEASE_DATE, +) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 8e360e3ba7..fec334709c 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -9,8 +9,8 @@ from mteb.model_meta import ModelMeta from mteb.models.text_formatting_utils import corpus_to_texts -e5_paper_release_date = "2024-02-08" -xlmr_languages = [ +E5_PAPER_RELEASE_DATE = "2024-02-08" +XLMR_LANGUAGES = [ "afr_Latn", "amh_Latn", "ara_Latn", @@ -153,27 +153,27 @@ def encode_corpus( e5_mult_small = ModelMeta( loader=partial(E5Wrapper, model_name="intfloat/multilingual-e5-small"), # type: ignore name="intfloat/multilingual-e5-small", - languages=xlmr_languages, + languages=XLMR_LANGUAGES, open_source=True, revision="e4ce9877abf3edfe10b0d82785e83bdcb973e22e", - release_date=e5_paper_release_date, + release_date=E5_PAPER_RELEASE_DATE, ) e5_mult_base = ModelMeta( loader=partial(E5Wrapper, model_name="intfloat/multilingual-e5-base"), # type: ignore name="intfloat/multilingual-e5-base", - languages=xlmr_languages, + languages=XLMR_LANGUAGES, open_source=True, revision="d13f1b27baf31030b7fd040960d60d909913633f", - release_date=e5_paper_release_date, + release_date=E5_PAPER_RELEASE_DATE, ) e5_mult_large = ModelMeta( loader=partial(E5Wrapper, model_name="intfloat/multilingual-e5-large"), # type: ignore name="intfloat/multilingual-e5-large", - languages=xlmr_languages, + languages=XLMR_LANGUAGES, open_source=True, revision="4dc6d853a804b9c8886ede6dda8a073b7dc08a81", - release_date=e5_paper_release_date, + release_date=E5_PAPER_RELEASE_DATE, ) diff --git a/mteb/models/instructions.py b/mteb/models/instructions.py new file mode 100644 index 0000000000..1a78801bf1 --- /dev/null +++ b/mteb/models/instructions.py @@ -0,0 +1,136 @@ +"""This specifies the default instructions for tasks within MTEB. These are optional to use and some models might want to use their own instructions.""" + +import mteb + +# prompt are derived from: +# scandinavian embedding benchmark: https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_models/e5_instruct_models.py +# e5 documentation: https://github.com/microsoft/unilm/blob/9c0f1ff7ca53431fe47d2637dfe253643d94185b/e5/utils.py#L106 +DEFAULT_PROMPTS = { + "STS": "Retrieve semantically similar text", + "Summarization": "Given a news summary, retrieve other semantically similar summaries", + "BitextMining": "Retrieve parallel sentences.", + "Classification": "Classify user passages", + "Clustering": "Identify categories in user passages", + "Reranking": "Retrieve text based on user query.", + "Retrieval": "Retrieve text based on user query.", + "InstructionRetrieval": "Retrieve text based on user query.", + "PairClassification": "Retrieve text that are semantically similar to the given text", +} + + +# This list is NOT comprehensive even for the tasks within MTEB +# TODO: We should probably move this prompt to the task object +TASKNAME2INSTRUCTIONS = { + # BitextMining + "BornholmBitextMining": "Retrieve parallel sentences in Danish and Bornholmsk", + "NorwegianCourtsBitextMining ": "Retrieve parallel sentences in Norwegian Bokmål and Nynorsk", + # Classification + "AngryTweetsClassification": "Classify Danish tweets by sentiment. (positive, negative, neutral)", + "DKHateClassification": "Classify Danish tweets based on offensiveness (offensive, not offensive)", + "DanishPoliticalCommentsClassification": "Classify Danish political comments for sentiment", + "DalajClassification": "Classify texts based on linguistic acceptability in Swedish", + "LccSentimentClassification": "Classify texts based on sentiment", + "NordicLangClassification": "Classify texts based on language", + "MassiveIntentClassification": "Given a user utterance as query, find the user intents", + "Massive Scenario": "Given a user utterance as query, find the user scenarios", + "NoRecClassification": "Classify Norwegian reviews by sentiment", + "SweRecClassification": "Classify Swedish reviews by sentiment", + "Norwegian parliament": "Classify parliament speeches in Norwegian based on political affiliation", + "ScalaClassification": "Classify passages in Scandinavian Languages based on linguistic acceptability", + "AmazonCounterfactualClassification": "Classify a given Amazon customer review text as either counterfactual or not-counterfactual", + "AmazonPolarityClassification": "Classify Amazon reviews into positive or negative sentiment", + "AmazonReviewsClassification": "Classify the given Amazon review into its appropriate rating category", + "Banking77Classification": "Given a online banking query, find the corresponding intents", + "EmotionClassification": "Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise", + "ImdbClassification": "Classify the sentiment expressed in the given movie review text from the IMDB dataset", + "MassiveScenarioClassification": "Given a user utterance as query, find the user scenarios", + "MTOPDomainClassification": "Classify the intent domain of the given utterance in task-oriented conversation", + "MTOPIntentClassification": "Classify the intent of the given utterance in task-oriented conversation", + "ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic", + "TweetSentimentExtractionClassification": "Classify the sentiment of a given tweet as either positive, negative, or neutral", + "TNews": "Classify the fine-grained category of the given news title", + "IFlyTek": "Given an App description text, find the appropriate fine-grained category", + "MultilingualSentiment": "Classify sentiment of the customer review into positive, neutral, or negative", + "JDReview": "Classify the customer review for iPhone on e-commerce platform into positive or negative", + "OnlineShopping": "Classify the customer review for online shopping into positive or negative", + "Waimai": "Classify the customer review from a food takeaway platform into positive or negative", + # Clustering + "VGHierarchicalClusteringP2P": "Identify the categories (e.g. sports) of given articles in Norwegian", + "VGHierarchicalClusteringS2S": "Identify the categories (e.g. sports) of given articles in Norwegian", + "SNLHierarchicalClusteringP2P": "Identify categories in a Norwegian lexicon", + "SNLHierarchicalClusteringS2S": "Identify categories in a Norwegian lexicon", + "SwednClusteringP2P": "Identify news categories in Swedish passages", + "SwednClusteringS2S": "Identify news categories in Swedish passages", + "ArxivClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts", + "ArxivClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles", + "BiorxivClusteringP2P": "Identify the main category of Biorxiv papers based on the titles and abstracts", + "BiorxivClusteringS2S": "Identify the main category of Biorxiv papers based on the titles", + "MedrxivClusteringP2P": "Identify the main category of Medrxiv papers based on the titles and abstracts", + "MedrxivClusteringS2S": "Identify the main category of Medrxiv papers based on the titles", + "RedditClustering": "Identify the topic or theme of Reddit posts based on the titles", + "RedditClusteringP2P": "Identify the topic or theme of Reddit posts based on the titles and posts", + "StackExchangeClustering": "Identify the topic or theme of StackExchange posts based on the titles", + "StackExchangeClusteringP2P": "Identify the topic or theme of StackExchange posts based on the given paragraphs", + "TwentyNewsgroupsClustering": "Identify the topic or theme of the given news articles", + "CLSClusteringS2S": "Identify the main category of scholar papers based on the titles", + "CLSClusteringP2P": "Identify the main category of scholar papers based on the titles and abstracts", + "ThuNewsClusteringS2S": "Identify the topic or theme of the given news articles based on the titles", + "ThuNewsClusteringP2P": "Identify the topic or theme of the given news articles based on the titles and contents", + # Reranking and pair classification + "AskUbuntuDupQuestions": "Retrieve duplicate questions from AskUbuntu forum", + "MindSmallReranking": "Retrieve relevant news articles based on user browsing history", + "SciDocsRR": "Given a title of a scientific paper, retrieve the titles of other relevant papers", + "StackOverflowDupQuestions": "Retrieve duplicate questions from StackOverflow forum", + "SprintDuplicateQuestions": "Retrieve duplicate questions from Sprint forum", + "TwitterSemEval2015": "Retrieve tweets that are semantically similar to the given tweet", + "TwitterURLCorpus": "Retrieve tweets that are semantically similar to the given tweet", + "T2Reranking": "Given a Chinese search query, retrieve web passages that answer the question", + "MMarcoReranking": "Given a Chinese search query, retrieve web passages that answer the question", + "CMedQAv1": "Given a Chinese community medical question, retrieve replies that best answer the question", + "CMedQAv2": "Given a Chinese community medical question, retrieve replies that best answer the question", + "Ocnli": "Retrieve semantically similar text.", + "Cmnli": "Retrieve semantically similar text.", + # Retrieval + "TwitterHjerneRetrieval": "Retrieve answers to questions asked in Danish tweets", + "SwednRetrieval": "Given a Swedish news headline retrieve summaries or news articles", + "TV2Nordretrieval": "Given a summary of a Danish news article retrieve the corresponding news article", + "DanFEVER": "Given a claim in Danish, retrieve documents that support the claim", + "SNLRetrieval": "Given a lexicon headline in Norwegian, retrieve its article", + "NorQuadRetrieval": "Given a question in Norwegian, retrieve the answer from Wikipedia articles", + "SweFaqRetrieval": "Retrieve answers given questions in Swedish", + "ArguAna": "Given a claim, find documents that refute the claim", + "ClimateFEVER": "Given a claim about climate change, retrieve documents that support or refute the claim", + "DBPedia": "Given a query, retrieve relevant entity descriptions from DBPedia", + "FEVER": "Given a claim, retrieve documents that support or refute the claim", + "FiQA2018": "Given a financial question, retrieve user replies that best answer the question", + "HotpotQA": "Given a multi-hop question, retrieve documents that can help answer the question", + "MSMARCO": "Given a web search query, retrieve relevant passages that answer the query", + "NFCorpus": "Given a question, retrieve relevant documents that best answer the question", + "NQ": "Given a question, retrieve Wikipedia passages that answer the question", + "QuoraRetrieval": "Given a question, retrieve questions that are semantically equivalent to the given question", + "SCIDOCS": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper", + "SciFact": "Given a scientific claim, retrieve documents that support or refute the claim", + "Touche2020": "Given a question, retrieve detailed and persuasive arguments that answer the question", + "TRECCOVID": "Given a query on COVID-19, retrieve documents that answer the query", + "T2Retrieval": "Given a Chinese search query, retrieve web passages that answer the question", + "MMarcoRetrieval": "Given a web search query, retrieve relevant passages that answer the query", + "DuRetrieval": "Given a Chinese search query, retrieve web passages that answer the question", + "CovidRetrieval": "Given a question on COVID-19, retrieve news articles that answer the question", + "CmedqaRetrieval": "Given a Chinese community medical question, retrieve replies that best answer the question", + "EcomRetrieval": "Given a user query from an e-commerce website, retrieve description sentences of relevant products", + "MedicalRetrieval": "Given a medical question, retrieve user replies that best answer the question", + "VideoRetrieval": "Given a video search query, retrieve the titles of relevant videos", +} + + +def task_to_instruction(task_name: str) -> str: + if task_name in TASKNAME2INSTRUCTIONS: + return TASKNAME2INSTRUCTIONS[task_name] + + task = mteb.get_task(task_name) + meta = task.metadata + + if meta.type in DEFAULT_PROMPTS: + return DEFAULT_PROMPTS[meta.type] + + return "" diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index 3c4459ffb9..4e65d857ee 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from functools import partial from typing import Any import numpy as np @@ -11,6 +12,7 @@ from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode from mteb.evaluation.evaluators import RerankingEvaluator +from mteb.evaluation.evaluators.model_encode import model_encode from mteb.evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator from mteb.evaluation.evaluators.utils import cos_sim from mteb.MTEBResults import ScoresDict @@ -89,7 +91,9 @@ def _evaluate_subset( data_split: Dataset, **kwargs: Any, ) -> ScoresDict: - evaluator = MIRACLRerankingEvaluator(data_split, **kwargs) + evaluator = MIRACLRerankingEvaluator( + samples=data_split, task_name=self.metadata.name, **kwargs + ) scores = evaluator(model) self._add_main_score(scores) @@ -106,6 +110,7 @@ class MIRACLRerankingEvaluator(RerankingEvaluator): def __init__( self, samples: list[dict], + task_name: str, mrr_at_k: int = 10, name: str = "", similarity_fct=cos_sim, @@ -115,17 +120,15 @@ def __init__( k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000], **kwargs, ): - """Args: - k_values: ranking cutoff threshold when applicable - """ super().__init__( samples, - mrr_at_k, - name, - similarity_fct, - batch_size, - use_batched_encoding, - limit, + task_name=task_name, + mrr_at_k=mrr_at_k, + name=name, + similarity_fct=similarity_fct, + batch_size=batch_size, + use_batched_encoding=use_batched_encoding, + limit=limit, **kwargs, ) self.k_values = k_values @@ -166,12 +169,12 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) encode_queries_func = ( model.encode_queries if isinstance(model, EncoderWithQueryCorpusEncode) - else model.encode + else partial(model_encode, model=model) ) encode_corpus_func = ( model.encode_corpus if isinstance(model, EncoderWithQueryCorpusEncode) - else model.encode + else partial(model_encode, model=model) ) logger.info("Encoding queries...") @@ -180,6 +183,7 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) encode_queries_func( [sample["query"] for sample in self.samples], batch_size=self.batch_size, + task_name=self.task_name, ) ) elif isinstance(self.samples[0]["query"], list): @@ -188,7 +192,11 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) q for sample in self.samples for q in sample["query"] ] all_query_embs = np.asarray( - encode_queries_func(all_query_flattened, batch_size=self.batch_size) + encode_queries_func( + all_query_flattened, + batch_size=self.batch_size, + task_name=self.task_name, + ) ) else: raise ValueError( @@ -201,7 +209,9 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) all_docs.extend(sample["candidates"]) all_docs_embs = np.asarray( - encode_corpus_func(all_docs, batch_size=self.batch_size) + encode_corpus_func( + all_docs, batch_size=self.batch_size, task_name=self.task_name + ) ) # Compute scores @@ -246,10 +256,14 @@ def compute_metrics_individual(self, model): # using encode_queries and encode_corpus functions if they exists, # which can be defined by users to add different instructions for query and passage conveniently encode_queries_func = ( - model.encode_queries if hasattr(model, "encode_queries") else model.encode + model.encode_queries + if hasattr(model, "encode_queries") + else partial(model_encode, model=model) ) encode_corpus_func = ( - model.encode_corpus if hasattr(model, "encode_corpus") else model.encode + model.encode_corpus + if hasattr(model, "encode_corpus") + else partial(model_encode, model=model) ) results, qrels = {}, {} @@ -261,10 +275,14 @@ def compute_metrics_individual(self, model): if isinstance(query, str): # .encoding interface requires List[str] as input query_emb = np.asarray( - encode_queries_func([query], batch_size=self.batch_size) + encode_queries_func( + [query], batch_size=self.batch_size, task_name=self.task_name + ) ) docs_emb = np.asarray( - encode_corpus_func(docs, batch_size=self.batch_size) + encode_corpus_func( + docs, batch_size=self.batch_size, task_name=self.task_name + ) ) fake_qid = str(i) diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/Banking77Classification.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/Banking77Classification.json new file mode 100644 index 0000000000..3ddd13c015 --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/Banking77Classification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "0fd18e25b25c072e09e0d92ab615fda904d66300", + "evaluation_time": 495.5568108558655, + "kg_co2_emissions": null, + "mteb_version": "1.12.18", + "scores": { + "test": [ + { + "accuracy": 0.8568831168831169, + "f1": 0.8564393054232017, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.8568831168831169, + "scores_per_experiment": [ + { + "accuracy": 0.8561688311688311, + "f1": 0.8561640056638862 + }, + { + "accuracy": 0.8626623376623377, + "f1": 0.8625616812044904 + }, + { + "accuracy": 0.8587662337662337, + "f1": 0.858853977879566 + }, + { + "accuracy": 0.8542207792207792, + "f1": 0.8530235815568621 + }, + { + "accuracy": 0.8542207792207792, + "f1": 0.8531653263702142 + }, + { + "accuracy": 0.8542207792207792, + "f1": 0.8538604779527291 + }, + { + "accuracy": 0.8594155844155844, + "f1": 0.8596115961739491 + }, + { + "accuracy": 0.85, + "f1": 0.8489869165538313 + }, + { + "accuracy": 0.8538961038961039, + "f1": 0.8535117411025733 + }, + { + "accuracy": 0.8652597402597403, + "f1": 0.8646537497739164 + } + ] + } + ] + }, + "task_name": "Banking77Classification" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/BornholmBitextMining.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/BornholmBitextMining.json new file mode 100644 index 0000000000..a258b485ca --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/BornholmBitextMining.json @@ -0,0 +1,22 @@ +{ + "dataset_revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552", + "evaluation_time": 53.00084090232849, + "kg_co2_emissions": null, + "mteb_version": "1.12.18", + "scores": { + "test": [ + { + "accuracy": 0.622, + "f1": 0.5522333333333334, + "hf_subset": "default", + "languages": [ + "dan-Latn" + ], + "main_score": 0.5522333333333334, + "precision": 0.5230857142857143, + "recall": 0.622 + } + ] + }, + "task_name": "BornholmBitextMining" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/MalteseNewsClassification.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/MalteseNewsClassification.json new file mode 100644 index 0000000000..a7576c2a12 --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/MalteseNewsClassification.json @@ -0,0 +1,73 @@ +{ + "dataset_revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4", + "evaluation_time": 1856.504804134369, + "kg_co2_emissions": null, + "mteb_version": "1.12.19", + "scores": { + "test": [ + { + "accuracy": 0.25280801044841095, + "f1": 0.24889369214167711, + "hf_subset": "default", + "languages": [ + "mlt-Latn" + ], + "lrap": 0.355959001653179, + "main_score": 0.25280801044841095, + "scores_per_experiment": [ + { + "accuracy": 0.23073574227252938, + "f1": 0.23187180801670923, + "lrap": 0.33005540047972465 + }, + { + "accuracy": 0.2520679146713104, + "f1": 0.25373802065298356, + "lrap": 0.35149534345735095 + }, + { + "accuracy": 0.22072268175881585, + "f1": 0.24272385518667572, + "lrap": 0.32003060257623694 + }, + { + "accuracy": 0.2477144101001306, + "f1": 0.26016557291089626, + "lrap": 0.3581173653614535 + }, + { + "accuracy": 0.22551153678711364, + "f1": 0.24842014532409157, + "lrap": 0.3325565315372844 + }, + { + "accuracy": 0.305180670439704, + "f1": 0.23566755002595474, + "lrap": 0.4099870248491046 + }, + { + "accuracy": 0.2929908576404005, + "f1": 0.26205861820501747, + "lrap": 0.4110625965666929 + }, + { + "accuracy": 0.24031345232912493, + "f1": 0.22251843494270826, + "lrap": 0.33190670695791175 + }, + { + "accuracy": 0.27340008707009145, + "f1": 0.27476032268856, + "lrap": 0.3775704883607624 + }, + { + "accuracy": 0.239442751414889, + "f1": 0.2570125934631742, + "lrap": 0.3368079563852675 + } + ] + } + ] + }, + "task_name": "MalteseNewsClassification" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/NFCorpus.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/NFCorpus.json new file mode 100644 index 0000000000..bc48762e01 --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/NFCorpus.json @@ -0,0 +1,158 @@ +{ + "dataset_revision": "ec0fa4fe99da2ff19ca1214b7966684033a58814", + "evaluation_time": 1830.9513890743256, + "kg_co2_emissions": null, + "mteb_version": "1.12.19", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.33759, + "map_at_1": 0.05733, + "map_at_10": 0.12373, + "map_at_100": 0.15949, + "map_at_1000": 0.17423, + "map_at_20": 0.13883, + "map_at_3": 0.09195, + "map_at_5": 0.10634, + "mrr_at_1": 0.4458204334365325, + "mrr_at_10": 0.525482824708831, + "mrr_at_100": 0.5331231387892924, + "mrr_at_1000": 0.5335208890387875, + "mrr_at_20": 0.5300903238227712, + "mrr_at_3": 0.5030959752321983, + "mrr_at_5": 0.5173374613003097, + "nauc_map_at_1000_diff1": 0.22000919986338666, + "nauc_map_at_1000_max": 0.2818621523093725, + "nauc_map_at_1000_std": 0.1687614529460899, + "nauc_map_at_100_diff1": 0.2291050000585145, + "nauc_map_at_100_max": 0.27736969184829113, + "nauc_map_at_100_std": 0.13456743110734984, + "nauc_map_at_10_diff1": 0.25480763195449, + "nauc_map_at_10_max": 0.21946820430262828, + "nauc_map_at_10_std": 0.027875765224249168, + "nauc_map_at_1_diff1": 0.3970364766678549, + "nauc_map_at_1_max": 0.11237996782144487, + "nauc_map_at_1_std": -0.06180968417144867, + "nauc_map_at_20_diff1": 0.23849341261254972, + "nauc_map_at_20_max": 0.24952827203846273, + "nauc_map_at_20_std": 0.07012693074422964, + "nauc_map_at_3_diff1": 0.29677628886897245, + "nauc_map_at_3_max": 0.14401201036943195, + "nauc_map_at_3_std": -0.053834947434673625, + "nauc_map_at_5_diff1": 0.27449434600357103, + "nauc_map_at_5_max": 0.17542065740450516, + "nauc_map_at_5_std": -0.022841319948502678, + "nauc_mrr_at_1000_diff1": 0.30593647648814115, + "nauc_mrr_at_1000_max": 0.4235696162825443, + "nauc_mrr_at_1000_std": 0.2894877502080374, + "nauc_mrr_at_100_diff1": 0.3060503742900363, + "nauc_mrr_at_100_max": 0.4238093431888815, + "nauc_mrr_at_100_std": 0.28979665731418836, + "nauc_mrr_at_10_diff1": 0.3062205157322983, + "nauc_mrr_at_10_max": 0.42053905333323877, + "nauc_mrr_at_10_std": 0.2870372217734654, + "nauc_mrr_at_1_diff1": 0.3217843623759516, + "nauc_mrr_at_1_max": 0.39071449716012446, + "nauc_mrr_at_1_std": 0.2633060671303849, + "nauc_mrr_at_20_diff1": 0.30475527892769305, + "nauc_mrr_at_20_max": 0.42293104362472544, + "nauc_mrr_at_20_std": 0.28914027495149763, + "nauc_mrr_at_3_diff1": 0.30671211911662816, + "nauc_mrr_at_3_max": 0.40928745402245664, + "nauc_mrr_at_3_std": 0.2629673153833384, + "nauc_mrr_at_5_diff1": 0.31179020370461996, + "nauc_mrr_at_5_max": 0.4209728797156386, + "nauc_mrr_at_5_std": 0.2802389716746124, + "nauc_ndcg_at_1000_diff1": 0.2235013833369612, + "nauc_ndcg_at_1000_max": 0.431313195335335, + "nauc_ndcg_at_1000_std": 0.3590222384212166, + "nauc_ndcg_at_100_diff1": 0.21665778892502022, + "nauc_ndcg_at_100_max": 0.3666101771382062, + "nauc_ndcg_at_100_std": 0.28545278075936975, + "nauc_ndcg_at_10_diff1": 0.20210991819537377, + "nauc_ndcg_at_10_max": 0.3475073553993102, + "nauc_ndcg_at_10_std": 0.2675720462123475, + "nauc_ndcg_at_1_diff1": 0.33776529472089306, + "nauc_ndcg_at_1_max": 0.3784456197875559, + "nauc_ndcg_at_1_std": 0.2607114493318912, + "nauc_ndcg_at_20_diff1": 0.19360451855954672, + "nauc_ndcg_at_20_max": 0.3363203021023225, + "nauc_ndcg_at_20_std": 0.2765474813917874, + "nauc_ndcg_at_3_diff1": 0.22996736709873364, + "nauc_ndcg_at_3_max": 0.343571543156797, + "nauc_ndcg_at_3_std": 0.22369376712842995, + "nauc_ndcg_at_5_diff1": 0.21316061734744693, + "nauc_ndcg_at_5_max": 0.34611913168490926, + "nauc_ndcg_at_5_std": 0.2422989696702848, + "nauc_precision_at_1000_diff1": -0.11695396648431491, + "nauc_precision_at_1000_max": 0.008493561384507457, + "nauc_precision_at_1000_std": 0.31329385834328394, + "nauc_precision_at_100_diff1": -0.06138460433619485, + "nauc_precision_at_100_max": 0.1476619973302544, + "nauc_precision_at_100_std": 0.4033664292691477, + "nauc_precision_at_10_diff1": 0.042703570565127344, + "nauc_precision_at_10_max": 0.3509306226650643, + "nauc_precision_at_10_std": 0.34840168666793614, + "nauc_precision_at_1_diff1": 0.3300992378088183, + "nauc_precision_at_1_max": 0.3978224494867221, + "nauc_precision_at_1_std": 0.24811023000096288, + "nauc_precision_at_20_diff1": -0.00938717678173266, + "nauc_precision_at_20_max": 0.29061859881747243, + "nauc_precision_at_20_std": 0.390997604981726, + "nauc_precision_at_3_diff1": 0.1315182868079486, + "nauc_precision_at_3_max": 0.355558565080408, + "nauc_precision_at_3_std": 0.24223710472856888, + "nauc_precision_at_5_diff1": 0.08489061320715555, + "nauc_precision_at_5_max": 0.3643952213066352, + "nauc_precision_at_5_std": 0.29513655613220935, + "nauc_recall_at_1000_diff1": 0.09895576965795408, + "nauc_recall_at_1000_max": 0.23336915954771714, + "nauc_recall_at_1000_std": 0.247569599375039, + "nauc_recall_at_100_diff1": 0.16923639937820362, + "nauc_recall_at_100_max": 0.23822415943347106, + "nauc_recall_at_100_std": 0.16592888919010765, + "nauc_recall_at_10_diff1": 0.23932736119148648, + "nauc_recall_at_10_max": 0.21445653724531652, + "nauc_recall_at_10_std": 0.041552441035636635, + "nauc_recall_at_1_diff1": 0.3970364766678549, + "nauc_recall_at_1_max": 0.11237996782144487, + "nauc_recall_at_1_std": -0.06180968417144867, + "nauc_recall_at_20_diff1": 0.19864451293218446, + "nauc_recall_at_20_max": 0.23546941735435595, + "nauc_recall_at_20_std": 0.08226150988525655, + "nauc_recall_at_3_diff1": 0.2951367606167307, + "nauc_recall_at_3_max": 0.13373400191846002, + "nauc_recall_at_3_std": -0.0662853786741525, + "nauc_recall_at_5_diff1": 0.2696589975005167, + "nauc_recall_at_5_max": 0.16792601837813337, + "nauc_recall_at_5_std": -0.01913038469848657, + "ndcg_at_1": 0.4257, + "ndcg_at_10": 0.33759, + "ndcg_at_100": 0.31846, + "ndcg_at_1000": 0.40594, + "ndcg_at_20": 0.31892, + "ndcg_at_3": 0.3851, + "ndcg_at_5": 0.36392, + "precision_at_1": 0.44272, + "precision_at_10": 0.25077, + "precision_at_100": 0.08303, + "precision_at_1000": 0.02124, + "precision_at_20": 0.19071, + "precision_at_3": 0.36017, + "precision_at_5": 0.31331, + "recall_at_1": 0.05733, + "recall_at_10": 0.16438, + "recall_at_100": 0.33548, + "recall_at_1000": 0.65005, + "recall_at_20": 0.20529, + "recall_at_3": 0.10332, + "recall_at_5": 0.12999 + } + ] + }, + "task_name": "NFCorpus" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/STS12.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/STS12.json new file mode 100644 index 0000000000..3138ea60c8 --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/STS12.json @@ -0,0 +1,30 @@ +{ + "dataset_revision": "a0d554a64d88156834ff5ae9920b964011b16384", + "evaluation_time": 241.31703805923462, + "kg_co2_emissions": null, + "mteb_version": "1.12.19", + "scores": { + "test": [ + { + "cos_sim": { + "pearson": 0.8761423959946872, + "spearman": 0.8159986918042205 + }, + "euclidean": { + "pearson": 0.8448855362564247, + "spearman": 0.8108163112123713 + }, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.8159986918042205, + "manhattan": { + "pearson": 0.8444615039105288, + "spearman": 0.8100715964917419 + } + } + ] + }, + "task_name": "STS12" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SciDocsRR.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SciDocsRR.json new file mode 100644 index 0000000000..c73e0e1785 --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SciDocsRR.json @@ -0,0 +1,26 @@ +{ + "dataset_revision": "d3c5e1fc0b855ab6097bf1cda04dd73947d7caab", + "evaluation_time": 8973.335940122604, + "kg_co2_emissions": null, + "mteb_version": "1.12.19", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.845770953876529, + "map": 0.845770953876529, + "mrr": 0.9549352789548868, + "nAUC_map_diff1": 0.058213679202273994, + "nAUC_map_max": 0.5379359729288017, + "nAUC_map_std": 0.6421650595861734, + "nAUC_mrr_diff1": 0.49519666367691556, + "nAUC_mrr_max": 0.8115703003643855, + "nAUC_mrr_std": 0.7149086687989342 + } + ] + }, + "task_name": "SciDocsRR" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SprintDuplicateQuestions.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SprintDuplicateQuestions.json new file mode 100644 index 0000000000..ea737f251b --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SprintDuplicateQuestions.json @@ -0,0 +1,109 @@ +{ + "dataset_revision": "d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46", + "evaluation_time": 643.6962480545044, + "kg_co2_emissions": null, + "mteb_version": "1.12.19", + "scores": { + "test": [ + { + "cos_sim": { + "accuracy": 0.9964752475247525, + "accuracy_threshold": 0.9500249028205872, + "ap": 0.9061927927549405, + "f1": 0.8145833333333332, + "f1_threshold": 0.9467196464538574, + "precision": 0.85, + "recall": 0.782 + }, + "dot": { + "accuracy": 0.9904950495049505, + "accuracy_threshold": 584.1082763671875, + "ap": 0.2509248614749473, + "f1": 0.320855614973262, + "f1_threshold": 545.7283935546875, + "precision": 0.3122043519394513, + "recall": 0.33 + }, + "euclidean": { + "accuracy": 0.9963861386138614, + "accuracy_threshold": 7.489836692810059, + "ap": 0.9019357237203477, + "f1": 0.8066350710900474, + "f1_threshold": 8.185409545898438, + "precision": 0.7666666666666667, + "recall": 0.851 + }, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.9061927927549405, + "manhattan": { + "accuracy": 0.9964059405940594, + "accuracy_threshold": 191.58888244628906, + "ap": 0.9026010070230925, + "f1": 0.8081395348837209, + "f1_threshold": 205.01075744628906, + "precision": 0.7838345864661654, + "recall": 0.834 + }, + "max": { + "accuracy": 0.9964752475247525, + "ap": 0.9061927927549405, + "f1": 0.8145833333333332 + } + } + ], + "validation": [ + { + "cos_sim": { + "accuracy": 0.9965841584158416, + "accuracy_threshold": 0.9377014636993408, + "ap": 0.905061888311704, + "f1": 0.8322981366459627, + "f1_threshold": 0.935670018196106, + "precision": 0.7968892955169259, + "recall": 0.871 + }, + "dot": { + "accuracy": 0.9904950495049505, + "accuracy_threshold": 601.9107666015625, + "ap": 0.2320520242735479, + "f1": 0.2885964912280702, + "f1_threshold": 555.5357666015625, + "precision": 0.25703125, + "recall": 0.329 + }, + "euclidean": { + "accuracy": 0.9963762376237624, + "accuracy_threshold": 8.274085998535156, + "ap": 0.8975203371632946, + "f1": 0.8224479922593131, + "f1_threshold": 8.528027534484863, + "precision": 0.7966260543580131, + "recall": 0.85 + }, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.905061888311704, + "manhattan": { + "accuracy": 0.9964356435643564, + "accuracy_threshold": 207.76455688476562, + "ap": 0.899080186157363, + "f1": 0.8232491662696524, + "f1_threshold": 218.84339904785156, + "precision": 0.7861692447679709, + "recall": 0.864 + }, + "max": { + "accuracy": 0.9965841584158416, + "ap": 0.905061888311704, + "f1": 0.8322981366459627 + } + } + ] + }, + "task_name": "SprintDuplicateQuestions" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SummEval.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SummEval.json new file mode 100644 index 0000000000..f75f6c5a2a --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/SummEval.json @@ -0,0 +1,26 @@ +{ + "dataset_revision": "cda12ad7615edc362dbf25a00fdd61d3b1eaf93c", + "evaluation_time": 342.8376178741455, + "kg_co2_emissions": null, + "mteb_version": "1.12.19", + "scores": { + "test": [ + { + "cos_sim": { + "pearson": 0.3075575301462621, + "spearman": 0.3069119423167846 + }, + "dot": { + "pearson": 0.12092690449262697, + "spearman": 0.13855042553837368 + }, + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.3069119423167846 + } + ] + }, + "task_name": "SummEval" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/TwentyNewsgroupsClustering.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/TwentyNewsgroupsClustering.json new file mode 100644 index 0000000000..30ce7a843f --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/TwentyNewsgroupsClustering.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "6125ec4e24fa026cec8a478383ee943acfbd5449", + "evaluation_time": 3385.347732782364, + "kg_co2_emissions": null, + "mteb_version": "1.12.18", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.5102756919608628, + "v_measure": 0.5102756919608628, + "v_measure_std": 0.011076139337750578, + "v_measures": [ + 0.5357812410289583, + 0.5215538479830223, + 0.49993297563312133, + 0.5192276076086956, + 0.5033929619262995, + 0.5001161353801409, + 0.5086401918254301, + 0.5035760839313547, + 0.5078820170319454, + 0.5026538572596595 + ] + } + ] + }, + "task_name": "TwentyNewsgroupsClustering" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/TwentyNewsgroupsClustering.v2.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/TwentyNewsgroupsClustering.v2.json new file mode 100644 index 0000000000..aa3ba1d08b --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/TwentyNewsgroupsClustering.v2.json @@ -0,0 +1,33 @@ +{ + "dataset_revision": "6125ec4e24fa026cec8a478383ee943acfbd5449", + "evaluation_time": 68.14173698425293, + "kg_co2_emissions": null, + "mteb_version": "1.12.18", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "eng-Latn" + ], + "main_score": 0.5285711335186855, + "v_measure": 0.5285711335186855, + "v_measures": { + "Level 0": [ + 0.5208466236270913, + 0.5415470379924898, + 0.5200706169746894, + 0.5302034420570773, + 0.519173866027448, + 0.5425077476797463, + 0.5258772657115914, + 0.5335588084835677, + 0.5368118881898065, + 0.5151140384433467 + ] + } + } + ] + }, + "task_name": "TwentyNewsgroupsClustering.v2" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/model_meta.json b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/model_meta.json new file mode 100644 index 0000000000..1daa74bc98 --- /dev/null +++ b/results/intfloat__multilingual-e5-large-instruct/baa7be480a7de1539afce709c8f13f833a510e0a/model_meta.json @@ -0,0 +1 @@ +{"name": "intfloat/multilingual-e5-large-instruct", "revision": "baa7be480a7de1539afce709c8f13f833a510e0a", "release_date": "2024-02-08", "languages": ["afr_Latn", "amh_Latn", "ara_Latn", "asm_Latn", "aze_Latn", "bel_Latn", "bul_Latn", "ben_Latn", "ben_Beng", "bre_Latn", "bos_Latn", "cat_Latn", "ces_Latn", "cym_Latn", "dan_Latn", "deu_Latn", "ell_Latn", "eng_Latn", "epo_Latn", "spa_Latn", "est_Latn", "eus_Latn", "fas_Latn", "fin_Latn", "fra_Latn", "fry_Latn", "gle_Latn", "gla_Latn", "glg_Latn", "guj_Latn", "hau_Latn", "heb_Latn", "hin_Latn", "hin_Deva", "hrv_Latn", "hun_Latn", "hye_Latn", "ind_Latn", "isl_Latn", "ita_Latn", "jpn_Latn", "jav_Latn", "kat_Latn", "kaz_Latn", "khm_Latn", "kan_Latn", "kor_Latn", "kur_Latn", "kir_Latn", "lat_Latn", "lao_Latn", "lit_Latn", "lav_Latn", "mlg_Latn", "mkd_Latn", "mal_Latn", "mon_Latn", "mar_Latn", "msa_Latn", "mya_Latn", "nep_Latn", "nld_Latn", "nob_Latn", "orm_Latn", "ori_Latn", "pan_Latn", "pol_Latn", "pus_Latn", "por_Latn", "ron_Latn", "rus_Latn", "san_Latn", "snd_Latn", "sin_Latn", "slk_Latn", "slv_Latn", "som_Latn", "sqi_Latn", "srp_Latn", "sun_Latn", "swe_Latn", "swa_Latn", "tam_Latn", "tam_Taml", "tel_Latn", "tel_Telu", "tha_Latn", "tgl_Latn", "tur_Latn", "uig_Latn", "ukr_Latn", "urd_Latn", "urd_Arab", "uzb_Latn", "vie_Latn", "xho_Latn", "yid_Latn", "zho_Hant", "zho_Hans"], "n_parameters": null, "memory_usage": null, "max_tokens": null, "embed_dim": null, "license": null, "open_source": true, "framework": [], "loader": ""} \ No newline at end of file diff --git a/tests/test_ClusteringEvaluator.py b/tests/test_ClusteringEvaluator.py index 0769a088cd..ca21b1b0bb 100644 --- a/tests/test_ClusteringEvaluator.py +++ b/tests/test_ClusteringEvaluator.py @@ -10,12 +10,19 @@ class TestClusteringEvaluator: def test_clustering_v_measure(self): class Model: - def encode(self, sentences: List[str], batch_size=32) -> np.ndarray: + def encode( + self, + sentences: List[str], + prompt_name: str | None = None, + batch_size=32, + ) -> np.ndarray: return np.eye(len(sentences)) model = Model() sentences = ["dog walked home", "cat walked home", "robot walked to the park"] - clusterer = ClusteringEvaluator(sentences=sentences, labels=[1, 2, 3]) + clusterer = ClusteringEvaluator( + sentences=sentences, labels=[1, 2, 3], task_name="test" + ) result = clusterer(model) assert result == {"v_measure": 1.0} diff --git a/tests/test_InstructionRetrievalEvaluator.py b/tests/test_InstructionRetrievalEvaluator.py index d40c0b83b9..8595378ef6 100644 --- a/tests/test_InstructionRetrievalEvaluator.py +++ b/tests/test_InstructionRetrievalEvaluator.py @@ -10,7 +10,9 @@ def setup_method(self): setup_method is invoked for every test method of a class. """ # checks that it loads - self.evaluator = InstructionRetrievalEvaluator.InstructionRetrievalEvaluator() + self.evaluator = InstructionRetrievalEvaluator.InstructionRetrievalEvaluator( + task_name="test" + ) def test_p_mrr(self): changed_qrels = { diff --git a/tests/test_RerankingEvaluator.py b/tests/test_RerankingEvaluator.py index 14ff723f62..614aa17a97 100644 --- a/tests/test_RerankingEvaluator.py +++ b/tests/test_RerankingEvaluator.py @@ -13,7 +13,7 @@ def setup_method(self): setup_method is invoked for every test method of a class. """ - self.evaluator = RerankingEvaluator([]) + self.evaluator = RerankingEvaluator([], task_name="test") def test_mrr_at_k(self): is_relevant = [1, 1, 1, 0, 0, 0, 0, 0, 0] diff --git a/tests/test_mteb.py b/tests/test_mteb.py index 3db6962730..bec59e565b 100644 --- a/tests/test_mteb.py +++ b/tests/test_mteb.py @@ -3,11 +3,14 @@ import logging from typing import Union +import numpy as np import pytest from sentence_transformers import SentenceTransformer +import mteb from mteb import MTEB from mteb.abstasks import AbsTask +from mteb.encoder_interface import Encoder from mteb.tasks.BitextMining.dan.BornholmskBitextMining import BornholmBitextMining logging.basicConfig(level=logging.INFO) @@ -53,6 +56,49 @@ def test_mteb_task(task: Union[str, AbsTask], model_name: str): eval.run(model, output_folder="tests/results", overwrite_results=True) +@pytest.mark.parametrize( + "task_name", + [ + "BornholmBitextMining", + "TwentyNewsgroupsClustering", + "TwentyNewsgroupsClustering.v2", + "Banking77Classification", + "SciDocsRR", + "SprintDuplicateQuestions", + "NFCorpus", + "MalteseNewsClassification", + "STS12", + "SummEval", + ], +) +def test_mteb_with_instructions(task_name: str): + """Test that all tasks correctly pass down the task_name to the encoder which supports it, and that the encoder which does not support it does not + receive it. + """ + + class EncoderWithInstructions(Encoder): + def encode(self, sentences, prompt_name: str | None = None, **kwargs): + assert prompt_name == task_name + return np.zeros((len(sentences), 10)) + + class EncoderWithoutInstructions(SentenceTransformer): + def encode(self, sentences, **kwargs): + assert "prompt_name" not in kwargs + return super().encode(sentences, **kwargs) + + tasks = mteb.get_tasks(tasks=[task_name]) + + eval = mteb.MTEB(tasks=tasks) + + # Test that the task_name is passed down to the encoder + model = EncoderWithInstructions() + eval.run(model, output_folder="tests/results", overwrite_results=True) + # Test that the task_name is not passed down to the encoder + model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency") + assert model.prompts == {}, "The encoder should not have any prompts" + eval.run(model, output_folder="tests/results", overwrite_results=True) + + def test_all_tasks_fetch(): """Test that all tasks can be fetched""" MTEB.mteb_tasks()