Commit

Merge branch 'main' into compare-scores-clustering-fast
isaac-chung committed Jun 17, 2024
2 parents 241f6c8 + e53821e commit 9e163a3
Showing 186 changed files with 36,369 additions and 696 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -139,4 +139,5 @@ tests/results
tmp.py

# sandbox
sb.ipynb
sb.ipynb
tests/create_meta/model_card.md
Binary file added docs/images/mmteb_overview_wide.png
7,358 changes: 7,358 additions & 0 deletions docs/images/mmteb_overview_wide.svg
Binary file added docs/images/mmteb_overview_wide_centered.png
7,358 changes: 7,358 additions & 0 deletions docs/images/mmteb_overview_wide_centered.svg
7,363 changes: 7,363 additions & 0 deletions docs/images/mteb_overview.svg
2 changes: 2 additions & 0 deletions docs/mmteb/points.md
@@ -92,3 +92,5 @@ Please also add your first name and last name as you want them to appear in
| ManuelFay | Manuel | Faysse | manuel.faysse@centralesupelec.fr | ~Manuel_Faysse1 | CentraleSupélec & Illuin Technology |
| hgissbkh | Hippolyte | Gisserot-Boukhlef | hippolyte.gisserot-boukhlef@centralesupelec.fr | ~Hippolyte_Gisserot-Boukhlef1 | CentraleSupélec & Artefact Research Center |
| sted97 | Simone | Tedeschi | tedeschi@diag.uniroma1.it | ~Simone_Tedeschi1 | Sapienza University of Rome |
| gentaiscool | Genta Indra | Winata | genta.winata@capitalone.com | ~Genta_Indra_Winata1 | Capital One |
| henilp105 | Henil | Panchal | henilp105@gmail.com | ~Henil_Shalin_Panchal1 | Nirma University |
2 changes: 2 additions & 0 deletions docs/mmteb/points/808.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "awinml", "New dataset": 8}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
4 changes: 4 additions & 0 deletions docs/mmteb/points/888.jsonl
@@ -0,0 +1,4 @@
{"GitHub": "KennethEnevoldsen", "Bug fixes": 8}
{"GitHub": "Muennighoff", "Review PR": 2}
{"GitHub": "orionw", "Review PR": 2}
{"GitHub": "imenelydiaker", "Review PR": 2}
3 changes: 3 additions & 0 deletions docs/mmteb/points/898.jsonl
@@ -0,0 +1,3 @@
{"GitHub": "akshita-sukhlecha", "Bug fixes": 2}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
{"GitHub": "imenelydiaker", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/914.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "gentaiscool", "New dataset": 26}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/915.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "gentaiscool", "New dataset": 18}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/917.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "akshita-sukhlecha", "New dataset": 34}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/922.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "gentaiscool", "New dataset": 38}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
5 changes: 5 additions & 0 deletions docs/mmteb/points/923.jsonl
@@ -0,0 +1,5 @@
{"GitHub": "MariyaTikhonova", "Dataset annotations": 1}
{"GitHub": "anpalmak2003", "Dataset annotations": 1}
{"GitHub": "ab1992ao", "Dataset annotations": 1}
{"GitHub": "Alenush", "Dataset annotations": 1}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/927.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "gentaiscool", "New dataset": 18}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/928.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "gentaiscool", "New dataset": 2}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/936.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "henilp105", "Dataset annotations": 9}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
46 changes: 24 additions & 22 deletions docs/mmteb/points_table.md

Large diffs are not rendered by default.

467 changes: 250 additions & 217 deletions docs/tasks.md

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions mteb/MTEBResults.py
@@ -209,13 +209,19 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> MTEBResults:
main_score = task.metadata.main_score
for split, split_score in scores.items():
for hf_subset, hf_subset_scores in split_score.items():
if task.metadata.type == "STS":
for name, prev_name in [
("cosine", "cos_sim"),
("manhattan", "manhattan"),
("euclidean", "euclidean"),
]:
prev_name_scores = hf_subset_scores.pop(prev_name)
for k, v in prev_name_scores.items():
hf_subset_scores[f"{name}_{k}"] = v

if "main_score" not in hf_subset_scores:
if main_score in hf_subset_scores:
hf_subset_scores["main_score"] = hf_subset_scores[main_score]
elif main_score == "cosine_spearman":
hf_subset_scores["main_score"] = hf_subset_scores["cos_sim"][
"spearman"
]
else:
logger.warning(f"Main score {main_score} not found in scores")
hf_subset_scores["main_score"] = None
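
For context, the hunk above migrates STS scores written before v1.11.0 from nested per-distance dicts (`cos_sim`, `manhattan`, `euclidean`) to flat, prefixed keys, so `main_score` can then be looked up directly. A self-contained sketch of the renaming, on illustrative values:

```python
# Old-style STS scores (values are illustrative).
scores = {
    "cos_sim": {"pearson": 0.81, "spearman": 0.79},
    "manhattan": {"pearson": 0.80, "spearman": 0.78},
    "euclidean": {"pearson": 0.80, "spearman": 0.78},
}

# Flatten each nested dict into "<name>_<metric>" keys, renaming cos_sim -> cosine.
for name, prev_name in [("cosine", "cos_sim"), ("manhattan", "manhattan"), ("euclidean", "euclidean")]:
    for k, v in scores.pop(prev_name).items():
        scores[f"{name}_{k}"] = v

assert scores["cosine_spearman"] == 0.79  # main_score now resolves directly
```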
14 changes: 11 additions & 3 deletions mteb/abstasks/AbsTaskBitextMining.py
@@ -39,7 +39,10 @@ def evaluate(self, model, split, **kwargs) -> dict[HFSubset, ScoresDict]:
scores = {}
if self.parallel_subsets:
scores["default"] = self._evaluate_subset(
model, self.dataset[split], parallel=True, **kwargs
model,
self.dataset[split], # type: ignore
parallel=True,
**kwargs,
)
else:
for hf_subset in hf_subsets:
@@ -52,15 +55,20 @@ def evaluate(self, model, split, **kwargs) -> dict[HFSubset, ScoresDict]:
else:
data_split = self.dataset[hf_subset][split]
scores[hf_subset] = self._evaluate_subset(
model, data_split, subsets=["sentence1", "sentence2"], **kwargs
model,
data_split, # type: ignore
subsets=["sentence1", "sentence2"],
**kwargs,
)

return scores

def _evaluate_subset(
self, model, data_split: Dataset, parallel=False, **kwargs
) -> ScoresDict:
evaluator = BitextMiningEvaluator(data_split, **kwargs)
evaluator = BitextMiningEvaluator(
data_split, task_name=self.metadata.name, **kwargs
)
metrics = evaluator(model)
if parallel:
for v in metrics.values():
3 changes: 3 additions & 0 deletions mteb/abstasks/AbsTaskClassification.py
@@ -118,6 +118,7 @@ def _evaluate_subset(
y_sampled,
eval_split["text"],
eval_split["label"],
task_name=self.metadata.name,
**params,
)
elif self.method == "kNN-pytorch":
@@ -126,6 +127,7 @@
y_sampled,
eval_split["text"],
eval_split["label"],
task_name=self.metadata.name,
**params,
)
elif self.method == "logReg":
@@ -134,6 +136,7 @@
y_sampled,
eval_split["text"],
eval_split["label"],
task_name=self.metadata.name,
**params,
)
else:
1 change: 1 addition & 0 deletions mteb/abstasks/AbsTaskClustering.py
@@ -38,6 +38,7 @@ def _evaluate_subset(
evaluator = ClusteringEvaluator(
cluster_set["sentences"], # type: ignore
cluster_set["labels"], # type: ignore
task_name=self.metadata.name,
**kwargs,
)
metrics = evaluator(model)
8 changes: 6 additions & 2 deletions mteb/abstasks/AbsTaskClusteringFast.py
@@ -12,6 +12,7 @@
from datasets import Dataset, DatasetDict
from sklearn.metrics.cluster import v_measure_score

from ..evaluation.evaluators.model_encode import model_encode
from ..MTEBResults import HFSubset
from .AbsTask import AbsTask

@@ -124,9 +125,12 @@ def _evaluate_subset(
)
downsampled_dataset = dataset.select(example_indices)

logger.info(f"Encoding {len(downsampled_dataset)} sentences...")
embeddings = model_encode(
downsampled_dataset["sentences"], # type: ignore
model=model,
task_name=self.metadata.name,
)

embeddings = model.encode(downsampled_dataset["sentences"])
labels = []
for label in downsampled_dataset["labels"]:
if not isinstance(label, list):
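
Several hunks in this commit replace direct `model.encode(...)` calls with a shared `model_encode` helper that also receives the task name. A hedged sketch of what such a helper might look like; the actual implementation in `mteb/evaluation/evaluators/model_encode.py` may differ:

```python
import logging
import numpy as np

logger = logging.getLogger(__name__)

def model_encode(sentences: list[str], *, model, task_name: str, **kwargs) -> np.ndarray:
    """Encode sentences, forwarding the task name to models that accept it (sketch)."""
    logger.info(f"Encoding {len(sentences)} sentences for task {task_name}...")
    try:
        # Hypothetical feature detection: some models use task_name to pick a prompt.
        embeddings = model.encode(sentences, task_name=task_name, **kwargs)
    except TypeError:
        embeddings = model.encode(sentences, **kwargs)
    return np.asarray(embeddings)
```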
14 changes: 9 additions & 5 deletions mteb/abstasks/AbsTaskInstructionRetrieval.py
@@ -1,3 +1,5 @@
from __future__ import annotations

import json
import logging
import os
@@ -22,10 +24,10 @@
class HFDataLoaderInstructions(HFDataLoader):
def __init__(
self,
hf_repo: str = None,
hf_repo_qrels: str = None,
data_folder: str = None,
prefix: str = None,
hf_repo: str | None = None,
hf_repo_qrels: str | None = None,
data_folder: str | None = None,
prefix: str | None = None,
corpus_file: str = "corpus.jsonl",
query_file: str = "queries.jsonl",
qrels_folder: str = "qrels",
@@ -323,7 +325,9 @@ def load_data(self, **kwargs):
self.data_loaded = True

def evaluate(self, model, split="test", **kwargs):
retriever = InstructionRetrievalEvaluator(model, **kwargs)
retriever = InstructionRetrievalEvaluator(
model=model, task_name=self.metadata.name, **kwargs
)

scores_og = {}
scores_changed = {}
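
The loader signature above replaces implicit-Optional defaults (`hf_repo: str = None`) with explicit `str | None` unions. The `from __future__ import annotations` line added at the top of the file keeps that syntax valid on Python versions before 3.10, since annotations are then stored as strings rather than evaluated at runtime. A minimal illustration:

```python
from __future__ import annotations  # `str | None` in annotations parses even pre-3.10

def load(hf_repo: str | None = None, data_folder: str | None = None) -> None:
    # The explicit union makes the "may be None" contract visible to type checkers.
    if hf_repo is None and data_folder is None:
        raise ValueError("Provide either hf_repo or data_folder.")
```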
10 changes: 8 additions & 2 deletions mteb/abstasks/AbsTaskMultilabelClassification.py
@@ -12,6 +12,7 @@
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer

from ..evaluation.evaluators.model_encode import model_encode
from ..MTEBResults import HFSubset, ScoresDict
from .AbsTask import AbsTask

@@ -122,8 +123,12 @@ def _evaluate_subset(
# Encode all unique sentences at the indices
unique_train_indices = list(set(itertools.chain.from_iterable(train_samples)))
unique_train_sentences = train_split.select(unique_train_indices)["text"]

_unique_train_embeddings = model_encode(
unique_train_sentences, model=model, task_name=self.metadata.name
)
unique_train_embeddings = dict(
zip(unique_train_indices, model.encode(unique_train_sentences))
zip(unique_train_indices, _unique_train_embeddings)
)
test_text = eval_split["text"]
binarizer = MultiLabelBinarizer()
@@ -136,7 +141,8 @@
)
except ValueError:
logger.warning("Couldn't subsample, continuing with the entire test set.")
X_test = model.encode(test_text)

X_test = model_encode(test_text, model=model, task_name=self.metadata.name)
for i_experiment, sample_indices in enumerate(train_samples):
logger.info(
"=" * 10
6 changes: 5 additions & 1 deletion mteb/abstasks/AbsTaskPairClassification.py
@@ -41,7 +41,11 @@ def _evaluate_subset(
"sentence_transformers.evaluation.PairClassificationEvaluator"
).setLevel(logging.WARN)
evaluator = PairClassificationEvaluator(
data_split["sent1"], data_split["sent2"], data_split["labels"], **kwargs
data_split["sent1"],
data_split["sent2"],
data_split["labels"],
task_name=self.metadata.name,
**kwargs,
)
scores = evaluator.compute_metrics(model)

4 changes: 3 additions & 1 deletion mteb/abstasks/AbsTaskReranking.py
@@ -29,7 +29,9 @@ def _evaluate_subset(
data_split: Dataset,
**kwargs: Any,
) -> ScoresDict:
evaluator = RerankingEvaluator(data_split, **kwargs)
evaluator = RerankingEvaluator(
data_split, task_name=self.metadata.name, **kwargs
)
scores = evaluator(model)

self._add_main_score(scores)
6 changes: 4 additions & 2 deletions mteb/abstasks/AbsTaskRetrieval.py
@@ -245,8 +245,10 @@ def load_data(self, **kwargs):

self.data_loaded = True

def evaluate(self, model, split="test", **kwargs):
retriever = RetrievalEvaluator(model, **kwargs)
def evaluate(self, model, split: str = "test", **kwargs):
retriever = RetrievalEvaluator(
retriever=model, task_name=self.metadata.name, **kwargs
)

scores = {}
hf_subsets = (
6 changes: 2 additions & 4 deletions mteb/abstasks/AbsTaskSTS.py
@@ -38,6 +38,7 @@ def normalize(x):
data_split["sentence1"],
data_split["sentence2"],
normalized_scores,
task_name=self.metadata.name,
**kwargs,
)
scores = evaluator(model)
@@ -46,7 +47,4 @@ def normalize(x):
return scores

def _add_main_score(self, scores: ScoresDict) -> None:
m_score = self.metadata.main_score
dist, metric = m_score.split("_")
dist_mapping = {"cosine": "cos_sim"}
scores["main_score"] = scores[dist_mapping.get(dist, dist)][metric]
scores["main_score"] = scores[self.metadata.main_score]
6 changes: 2 additions & 4 deletions mteb/abstasks/AbsTaskSummarization.py
@@ -46,14 +46,12 @@ def _evaluate_subset(self, model, data_split, **kwargs) -> ScoresDict:
human_summaries=data_split["human_summaries"],
texts=data_split["text"],
gold_scores=normalized_scores,
task_name=self.metadata.name,
**kwargs,
)
scores = evaluator(model)
self._add_main_score(scores)
return scores

def _add_main_score(self, scores: ScoresDict) -> None:
m_score = self.metadata.main_score
dist, metric = m_score.split("_")
dist_mapping = {"cosine": "cos_sim"}
scores["main_score"] = scores[dist_mapping.get(dist, dist)][metric]
scores["main_score"] = scores[self.metadata.main_score]
2 changes: 2 additions & 0 deletions mteb/abstasks/TaskMetadata.py
@@ -35,6 +35,8 @@
"Cross-Lingual Semantic Discrimination",
"Textual Entailment",
"Counterfactual Detection",
"Emotion classification",
"Reasoning as Retrieval",
]

TASK_DOMAIN = Literal[
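
The `TaskMetadata` hunk extends the `TASK_TYPE` literal with two new categories, so tasks declaring them pass metadata validation. A trimmed illustration of the pattern:

```python
from typing import Literal

# Trimmed; the real TASK_TYPE literal lists many more categories.
TASK_TYPE = Literal["STS", "Emotion classification", "Reasoning as Retrieval"]

def check_type(task_type: TASK_TYPE) -> None: ...

check_type("Reasoning as Retrieval")  # type-checks only after the addition above
```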
21 changes: 16 additions & 5 deletions mteb/cli.py
@@ -245,26 +245,37 @@ def create_meta(args: argparse.Namespace) -> None:
]

task_results = [MTEBResults.from_disk(path) for path in json_files]
task_results = sorted(task_results, key=lambda x: x.task_name)

yaml_results = []
for task_result in task_results:
task = mteb.get_task(task_result.task_name)

for split, hf_subset_scores in task_result.scores.items():
for hf_subset_score in hf_subset_scores:
metrics = [
{
"type": k,
"value": v,
}
for k, v in hf_subset_score.items()
if isinstance(v, (int, float))
]
if task.metadata.main_score not in hf_subset_score:
raise ValueError(
f"Main score {task.metadata.main_score} not found in metrics or is not a number."
)

yaml_result = {
"task": {"type": task.metadata.type},
"dataset": {
"type": task.metadata.dataset["path"],
"name": f"MTEB {task.metadata.name}",
"name": f"MTEB {task.metadata.name} ({hf_subset_score['hf_subset']})",
"config": hf_subset_score["hf_subset"],
"split": split,
"revision": task_result.dataset_revision,
},
"metric": {
"type": task.metadata.main_score,
"value": hf_subset_score["main_score"],
},
"metrics": metrics,
}
yaml_results.append(yaml_result)

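
The `create_meta` changes above sort results by task name, record every numeric metric rather than only the main score, fail fast when the main score is missing, and disambiguate dataset names with the HF subset. A sketch of one resulting `model-index` entry, with illustrative values and a placeholder revision:

```python
# Illustrative yaml_result entry (field layout follows the hunk; values are made up).
yaml_result = {
    "task": {"type": "STS"},
    "dataset": {
        "type": "mteb/sts12-sts",        # HF dataset path from task metadata
        "name": "MTEB STS12 (default)",  # task name plus hf_subset
        "config": "default",
        "split": "test",
        "revision": "<dataset-revision-sha>",
    },
    "metrics": [
        {"type": "cosine_pearson", "value": 0.81},
        {"type": "cosine_spearman", "value": 0.79},
        {"type": "main_score", "value": 0.79},
    ],
}
```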