Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Compare Cluster and ClusterFast scores and speedup #892

Merged
merged 33 commits into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
3e94983
first go at getting spearman corr for e5-base
isaac-chung Jun 7, 2024
c64c144
add back large
isaac-chung Jun 7, 2024
4153981
small and large results
isaac-chung Jun 7, 2024
b8a5be4
v3 means downsampling by stratified subsampling + bootstrap to k=max_…
isaac-chung Jun 8, 2024
6295dcc
v3-1 means swapping values of max_documents_per_cluster and max_docum…
isaac-chung Jun 8, 2024
3a0d9c4
v3-2 means increasing max_documents_per_cluster to 65536
isaac-chung Jun 8, 2024
3c19d0f
task-wise comparison
isaac-chung Jun 9, 2024
974db74
use recommended syntax
isaac-chung Jun 9, 2024
a504e7c
add back no-op changes
isaac-chung Jun 9, 2024
f69e316
add back no-op changes
isaac-chung Jun 9, 2024
7ca40c9
option c is now v2; remove all v3 variants; add back level 0 in resul…
isaac-chung Jun 9, 2024
2c70b7c
paraphrase-multilingual-MiniLM-L12-v2 results
isaac-chung Jun 9, 2024
a376b9e
lint script
isaac-chung Jun 9, 2024
a990b78
cluster without fast should not have levels
isaac-chung Jun 10, 2024
3a06557
spearman on significant rank
isaac-chung Jun 10, 2024
61ff34a
add more small model results
isaac-chung Jun 11, 2024
cfb2f95
2x max_documents_to_embed to 4096
isaac-chung Jun 13, 2024
f2acd07
max_documents_to_embed=8192
isaac-chung Jun 13, 2024
dfee8a5
t
KennethEnevoldsen Jun 13, 2024
1d5e57e
Added plots
KennethEnevoldsen Jun 13, 2024
6ce12f6
format
KennethEnevoldsen Jun 13, 2024
3e69880
use 32k samples for bigger cluster datasets
isaac-chung Jun 13, 2024
2424735
use 4% n_samples and update task metadata
isaac-chung Jun 14, 2024
a9ead3e
make lint
isaac-chung Jun 14, 2024
59917c3
Merge branch 'main' into compare-scores-clustering-fast
isaac-chung Jun 14, 2024
dd683b8
tests passing
isaac-chung Jun 14, 2024
66ba122
make lint
isaac-chung Jun 14, 2024
d2ea2e8
add paraphrase-multilingual-mpnet-base-v2 and e5-large-v2 results
isaac-chung Jun 14, 2024
241f6c8
add e5_eng_base_v2,labse,mxbai_embed_large_v1,bge_base_en_v1.5
isaac-chung Jun 17, 2024
9e163a3
Merge branch 'main' into compare-scores-clustering-fast
isaac-chung Jun 17, 2024
ccaa151
move plot scripts to mmteb scripts repo
isaac-chung Jun 17, 2024
3a7e923
replace use_dataset_as_is with max_document_to_embed and add descript…
isaac-chung Jun 18, 2024
cc58d85
lint
isaac-chung Jun 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 41 additions & 12 deletions mteb/abstasks/AbsTaskClusteringFast.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,13 @@ class AbsTaskClusteringFast(AbsTask):
The similarity then is calculated using the V-measure metric, which is invariant to the permutation of the labels.
This approach is then repeated K times.

If the clustering is hieararchical, and more than one label is specified in order for each observation,
There are two ways to specify how a dataset is downsampled:
- max_document_to_embed (int): defaults to None.
- max_fraction_of_documents_to_embed (float): defaults to 0.04 (4%).
If both parameters are set to None, no downsampling is done in self._evaluate_subset().
Only one of these two parameters may be set (non-None) at a time.

If the clustering is hierarchical, and more than one label is specified in order for each observation,
V-measures are calculated in the outlined way on each of the levels separately.

self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset.
Expand All @@ -91,8 +97,9 @@ class AbsTaskClusteringFast(AbsTask):
labels: list[str] | list[list[str]]
"""

max_documents_to_embed = 16_384
max_documents_per_cluster = 2048
max_fraction_of_documents_to_embed = 0.04
isaac-chung marked this conversation as resolved.
Show resolved Hide resolved
max_document_to_embed = None
max_documents_per_cluster = 16_384
n_clusters = 10
k_mean_batch_size = 512
max_depth = None
Expand All @@ -113,13 +120,29 @@ def _evaluate_subset(
) -> dict[str, float | dict[str, list[float]]]:
rng_state = random.Random(self.seed)

if len(dataset) > self.max_documents_to_embed:
example_indices = rng_state.sample(
range(len(dataset)), k=self.max_documents_to_embed
if (
self.max_document_to_embed is not None
and self.max_fraction_of_documents_to_embed is not None
):
raise Exception(
"Both max_document_to_embed and max_fraction_of_documents_to_embed are set. Please only set one."
)
downsampled_dataset = dataset.select(example_indices) # type: ignore
else:

if (
self.max_document_to_embed is None
and self.max_fraction_of_documents_to_embed is None
):
downsampled_dataset = dataset
else:
max_documents_to_embed = self.max_document_to_embed
if self.max_fraction_of_documents_to_embed is not None:
max_documents_to_embed = int(
self.max_fraction_of_documents_to_embed * len(dataset)
)
example_indices = rng_state.sample(
range(len(dataset)), k=max_documents_to_embed
)
downsampled_dataset = dataset.select(example_indices)

embeddings = model_encode(
downsampled_dataset["sentences"], # type: ignore
Expand All @@ -133,7 +156,7 @@ def _evaluate_subset(
label = [label]
labels.append(label)

v_measures = evaluate_clustering_bootstrapped(
all_v_scores = evaluate_clustering_bootstrapped(
embeddings,
labels,
n_clusters=self.n_clusters,
Expand All @@ -142,9 +165,15 @@ def _evaluate_subset(
max_depth=self.max_depth,
rng_state=rng_state,
)
all_v_scores = itertools.chain.from_iterable(v_measures.values())
mean_v_measure = np.mean(list(all_v_scores))
scores = {"v_measures": v_measures, "v_measure": float(mean_v_measure)}
v_measures = list(itertools.chain.from_iterable(all_v_scores.values()))

mean_v_measure = np.mean(v_measures)
v_std = np.std(v_measures)
scores = {
"v_measures": all_v_scores,
"v_measure": float(mean_v_measure),
"v_measure_std": v_std,
}
self._add_main_score(scores)
return scores

Expand Down
4 changes: 2 additions & 2 deletions mteb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ def add_run_parser(subparsers) -> None:
parser.add_argument(
"--output_folder",
type=str,
default=None,
help="Output directory for results. Will default to results/{model_name} if not set.",
default="results",
help="Output directory for results. Will default to `results` if not set.",
)
parser.add_argument(
"-v", "--verbosity", type=int, default=2, help="Verbosity level"
Expand Down
2 changes: 1 addition & 1 deletion mteb/model_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class ModelMeta(BaseModel):
def to_dict(self):
dict_repr = self.model_dump()
loader = dict_repr.pop("loader", None)
dict_repr["loader"] = get_loader_name(loader)
dict_repr["loader"] = loader.func.__name__ if loader is not None else None
return dict_repr

def load_model(self, **kwargs: Any) -> Encoder | EncoderWithQueryCorpusEncode:
Expand Down
18 changes: 13 additions & 5 deletions mteb/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode
from mteb.model_meta import ModelMeta
from mteb.models import (
bge_models,
e5_instruct,
e5_models,
gritlm,
mxbai_models,
openai_models,
sentence_transformers_models,
voyage_models,
Expand Down Expand Up @@ -55,8 +57,10 @@ def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta:
A model metadata object
"""
if model_name in models:
if revision and (not models[model_name].revision == revision):
raise ValueError(f"Model {revision} not found for model {model_name}")
if not models[model_name].revision == revision:
raise ValueError(
f"Model revision {revision} not found for model {model_name}"
)
return models[model_name]
else: # assume it is a sentence-transformers model
logger.info(
Expand All @@ -65,7 +69,9 @@ def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta:
logger.info(
f"Attempting to extract metadata by loading the model ({model_name}) using sentence-transformers."
)
model = SentenceTransformer(model_name, revision=revision)
model = SentenceTransformer(
model_name, revision=revision, trust_remote_code=True
)
meta = model_meta_from_sentence_transformers(model)

meta.revision = revision
Expand Down Expand Up @@ -108,11 +114,13 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe

model_modules = [
e5_models,
e5_instruct,
gritlm,
openai_models,
e5_instruct,
sentence_transformers_models,
openai_models,
voyage_models,
bge_models,
mxbai_models,
]
models = {}

Expand Down
58 changes: 58 additions & 0 deletions mteb/models/bge_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from __future__ import annotations

from functools import partial
from typing import Any

import torch
from sentence_transformers import SentenceTransformer

from mteb.model_meta import ModelMeta
from mteb.models.text_formatting_utils import corpus_to_texts


class BGEWrapper:
    """SentenceTransformer wrapper for the BGE embedding models.

    Follows the usage described on the Hugging Face model card: queries are
    prefixed with a retrieval instruction before encoding, while corpus
    documents and plain sentences are encoded as-is.
    """

    def __init__(self, model_name: str, **kwargs: Any):
        # NOTE(review): extra **kwargs are accepted but not forwarded to
        # SentenceTransformer — confirm this is intentional.
        self.model_name = model_name
        self.mdl = SentenceTransformer(model_name)

    def to(self, device: torch.device) -> None:
        """Move the underlying model to the given device."""
        self.mdl.to(device)

    def encode(  # type: ignore
        self,
        sentences: list[str],
        *,
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Encode plain sentences without any prompt."""
        return self.mdl.encode(sentences, batch_size=batch_size, **kwargs)

    def encode_queries(self, queries: list[str], batch_size: int = 32, **kwargs: Any):
        """Encode queries, prepending the BGE retrieval instruction."""
        prompt = "Represent this sentence for searching relevant passages: "
        prompted = [prompt + query for query in queries]
        return self.mdl.encode(prompted, batch_size=batch_size, **kwargs)

    def encode_corpus(
        self,
        corpus: list[dict[str, str]] | dict[str, list[str]],
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Flatten the corpus into raw texts and encode them."""
        texts = corpus_to_texts(corpus)
        return self.mdl.encode(texts, batch_size=batch_size, **kwargs)


# Registration entry for BAAI/bge-base-en-v1.5. The loader is a partial so the
# wrapper (and the underlying SentenceTransformer) is only instantiated when
# the model is actually loaded, not at import time.
bge_base_en_v1_5 = ModelMeta(
    loader=partial(BGEWrapper, model_name="BAAI/bge-base-en-v1.5"),  # type: ignore
    name="BAAI/bge-base-en-v1.5",
    languages=["eng_Latn"],
    open_source=True,
    revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a",  # pinned HF commit
    release_date="2023-09-11",  # initial commit of hf model.
)
36 changes: 36 additions & 0 deletions mteb/models/e5_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,39 @@ def encode_corpus(
revision="4dc6d853a804b9c8886ede6dda8a073b7dc08a81",
release_date=E5_PAPER_RELEASE_DATE,
)

# Registration entries for the English E5 models. Each loader is a partial so
# the wrapper is only instantiated when the model is actually loaded; revisions
# pin a specific Hugging Face commit for reproducibility.
e5_eng_small_v2 = ModelMeta(
    loader=partial(E5Wrapper, model_name="intfloat/e5-small-v2"),  # type: ignore
    name="intfloat/e5-small-v2",
    languages=["eng_Latn"],
    open_source=True,
    revision="dca8b1a9dae0d4575df2bf423a5edb485a431236",
    release_date=E5_PAPER_RELEASE_DATE,
)

e5_eng_small = ModelMeta(
    loader=partial(E5Wrapper, model_name="intfloat/e5-small"),  # type: ignore
    name="intfloat/e5-small",
    languages=["eng_Latn"],
    open_source=True,
    revision="e272f3049e853b47cb5ca3952268c6662abda68f",
    release_date=E5_PAPER_RELEASE_DATE,
)

e5_eng_base_v2 = ModelMeta(
    loader=partial(E5Wrapper, model_name="intfloat/e5-base-v2"),  # type: ignore
    name="intfloat/e5-base-v2",
    languages=["eng_Latn"],
    open_source=True,
    revision="1c644c92ad3ba1efdad3f1451a637716616a20e8",
    release_date=E5_PAPER_RELEASE_DATE,
)

e5_eng_large_v2 = ModelMeta(
    loader=partial(E5Wrapper, model_name="intfloat/e5-large-v2"),  # type: ignore
    name="intfloat/e5-large-v2",
    languages=["eng_Latn"],
    open_source=True,
    revision="b322e09026e4ea05f42beadf4d661fb4e101d311",
    release_date=E5_PAPER_RELEASE_DATE,
)
58 changes: 58 additions & 0 deletions mteb/models/mxbai_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from __future__ import annotations

from functools import partial
from typing import Any

import torch
from sentence_transformers import SentenceTransformer

from mteb.model_meta import ModelMeta
from mteb.models.text_formatting_utils import corpus_to_texts


class MxbaiWrapper:
    """SentenceTransformer wrapper for the mxbai embedding models.

    Follows the usage described on the Hugging Face model card: queries are
    prefixed with a retrieval instruction before encoding, while corpus
    documents and plain sentences are encoded as-is.
    """

    def __init__(self, model_name: str, **kwargs: Any):
        # NOTE(review): extra **kwargs are accepted but not forwarded to
        # SentenceTransformer — confirm this is intentional.
        self.model_name = model_name
        self.mdl = SentenceTransformer(model_name)

    def to(self, device: torch.device) -> None:
        """Move the underlying model to the given device."""
        self.mdl.to(device)

    def encode(  # type: ignore
        self,
        sentences: list[str],
        *,
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Encode plain sentences without any prompt."""
        return self.mdl.encode(sentences, batch_size=batch_size, **kwargs)

    def encode_queries(self, queries: list[str], batch_size: int = 32, **kwargs: Any):
        """Encode queries, prepending the mxbai retrieval instruction."""
        prompt = "Represent this sentence for searching relevant passages: "
        prompted = [prompt + query for query in queries]
        return self.mdl.encode(prompted, batch_size=batch_size, **kwargs)

    def encode_corpus(
        self,
        corpus: list[dict[str, str]] | dict[str, list[str]],
        batch_size: int = 32,
        **kwargs: Any,
    ):
        """Flatten the corpus into raw texts and encode them."""
        texts = corpus_to_texts(corpus)
        return self.mdl.encode(texts, batch_size=batch_size, **kwargs)


# Registration entry for mixedbread-ai/mxbai-embed-large-v1. The loader is a
# partial so the wrapper is only instantiated when the model is actually
# loaded, not at import time.
mxbai_embed_large_v1 = ModelMeta(
    loader=partial(MxbaiWrapper, model_name="mixedbread-ai/mxbai-embed-large-v1"),  # type: ignore
    name="mixedbread-ai/mxbai-embed-large-v1",
    languages=["eng_Latn"],
    open_source=True,
    revision="990580e27d329c7408b3741ecff85876e128e203",  # pinned HF commit
    release_date="2024-03-07",  # initial commit of hf model.
)
Loading
Loading