From eda46de55318600d2e936f52adf1c25039d0f2c5 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Wed, 12 Jun 2024 17:13:29 +0100 Subject: [PATCH 01/14] start merge --- .../evaluators/RerankingEvaluator.py | 64 ++++++++++++++++--- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 53dd0fb8a1..8b7ca80f4b 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -8,7 +8,7 @@ import tqdm from sklearn.metrics import average_precision_score -from ...encoder_interface import EncoderWithQueryCorpusEncode +from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode from .Evaluator import Evaluator from .utils import confidence_scores, cos_sim, nAUC @@ -28,13 +28,15 @@ class RerankingEvaluator(Evaluator): def __init__( self, - samples, + samples: list[dict], mrr_at_k: int = 10, name: str = "", similarity_fct=cos_sim, batch_size: int = 512, use_batched_encoding: bool = True, limit: int | None = None, + k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000], + evaluator_type: str = "standard" **kwargs, ): super().__init__(**kwargs) @@ -46,6 +48,8 @@ def __init__( self.similarity_fct = similarity_fct self.batch_size = batch_size self.use_batched_encoding = use_batched_encoding + self.k_values = k_values + self.evaluator_type = evaluator_type if isinstance(self.samples, dict): self.samples = list(self.samples.values()) @@ -68,13 +72,11 @@ def compute_metrics(self, model): else self.compute_metrics_individual(model) ) - def compute_metrics_batched(self, model): + def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode): """Computes the metrices in a batched way, by batching all queries and all documents together """ - all_mrr_scores = [] - all_ap_scores = [] - all_conf_scores = [] + # using encode_queries and encode_corpus functions if they exists, # which can be defined by users to add different instructions for query and passage conveniently @@ -102,14 +104,29 @@ def compute_metrics_batched(self, model): all_query_flattened = [ q for sample in self.samples for q in sample["query"] ] - all_query_embs = self._encode_unique_texts( - all_query_flattened, encode_corpus_func - ) + if self.evaluator_type == "standard": + all_query_embs = self._encode_unique_texts( + all_query_flattened, encode_corpus_func + ) + elif self.evaluator_type == "miracl": + all_query_embs = np.asarray( + encode_queries_func(all_query_flattened, batch_size=self.batch_size) + ) else: raise ValueError( f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}" ) - + + if self.evaluator_type == "standard": + results = self.encode_candidates(all_query_embs) + elif self.evaluator_type == "miracl": + results = self.rerank_candidates(all_query_embs) + return results + + def encode_candidates(self, all_query_embs): + all_mrr_scores = [] + all_ap_scores = [] + all_conf_scores = [] logger.info("Encoding candidates...") all_docs = [] for sample in self.samples: @@ -209,6 +226,33 @@ def compute_metrics_individual(self, model): naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} + + def rerank( + self, query_emb: torch.Tensor, docs_emb: torch.Tensor + ) -> dict[str, float]: + """Rerank documents (docs_emb) given the query (query_emb) + + Args: + query_emb: Query embedding of shape `(num_queries, hidden_size)`) + if 
`num_queries` > 0: we take the closest document to any of the queries + docs_emb: Candidates documents embeddings of shape `(num_pos+num_neg, hidden_size)`) + + Returns: + similarity_scores: + """ + if not query_emb.shape[0]: + raise ValueError("Empty query embedding") + + if not docs_emb.shape[0]: + return {"empty-docid": 0} + + pred_scores = self.similarity_fct(query_emb, docs_emb) + if len(pred_scores.shape) > 1: + pred_scores = torch.amax(pred_scores, dim=0) + + return { + str(i): score.detach().numpy().item() for i, score in enumerate(pred_scores) + } def _encode_unique_texts(self, all_texts, encode_queries_func): index_map, all_unique_texts, all_texts_indexes = {}, [], [] From 615dbbb5de2809ce38c1472deff6d3f765aca08d Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Wed, 12 Jun 2024 17:52:37 +0100 Subject: [PATCH 02/14] removing redundancy --- .../evaluators/RerankingEvaluator.py | 179 ++++++++++++------ 1 file changed, 123 insertions(+), 56 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 8b7ca80f4b..6d70b806d4 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -8,6 +8,8 @@ import tqdm from sklearn.metrics import average_precision_score +from mteb.evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator + from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode from .Evaluator import Evaluator from .utils import confidence_scores, cos_sim, nAUC @@ -76,8 +78,6 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) """Computes the metrices in a batched way, by batching all queries and all documents together """ - - # using encode_queries and encode_corpus functions if they exists, # which can be defined by users to add different instructions for query and passage conveniently encode_queries_func = ( @@ -118,22 +118,59 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) ) if self.evaluator_type == "standard": - results = self.encode_candidates(all_query_embs) + results = self.encode_candidates(all_query_embs,encode_corpus_func,True) + elif self.evaluator_type == "miracl": + results = self.encode_candidates_miracl(all_query_embs, encode_corpus_func) + return results + + def compute_metrics_individual(self, model): + """Embeds every (query, positive, negative) tuple individually. + Is slower than the batched version, but saves memory as only the + embeddings for one tuple are needed. 
Useful when you have + a really large test set + """ + + + # using encode_queries and encode_corpus functions if they exists, + # which can be defined by users to add different instructions for query and passage conveniently + encode_queries_func = ( + model.encode_queries if hasattr(model, "encode_queries") else model.encode + ) + encode_corpus_func = ( + model.encode_corpus if hasattr(model, "encode_corpus") else model.encode + ) + if self.evaluator_type == "standard": + results = self.encode_candidates(encode_queries_func, encode_corpus_func,False,encode_corpus_func=encode_corpus_func) elif self.evaluator_type == "miracl": - results = self.rerank_candidates(all_query_embs) + results = self.encode_candidates_miracl_individual(encode_queries_func,encode_corpus_func) return results - def encode_candidates(self, all_query_embs): + def encode_candidates(self, all_query_embs,encode_corpus_func,batched,encode_queries_func=None): all_mrr_scores = [] all_ap_scores = [] all_conf_scores = [] logger.info("Encoding candidates...") + if batched: + self.encode_candidates_batched(all_query_embs, encode_corpus_func,all_mrr_scores, all_ap_scores, all_conf_scores) + else: + self.encode_candidates_individual(encode_queries_func, encode_corpus_func,all_mrr_scores, all_ap_scores, all_conf_scores) + mean_ap = np.mean(all_ap_scores) + mean_mrr = np.mean(all_mrr_scores) + + # Compute nAUCs + naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map") + naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") + + return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} + + + def encode_candidates_batched(self, all_query_embs, encode_corpus_func,all_mrr_scores, all_ap_scores, all_conf_scores): all_docs = [] for sample in self.samples: all_docs.extend(sample["positive"]) all_docs.extend(sample["negative"]) - all_docs_embs = self._encode_unique_texts(all_docs, encode_corpus_func) + all_docs_embs = self._encode_unique_texts(all_docs, encode_corpus_func,) # Compute scores and confidence scores logger.info("Evaluating...") @@ -152,45 +189,11 @@ def encode_candidates(self, all_query_embs): if num_pos == 0 or num_neg == 0: continue - is_relevant = [True] * num_pos + [False] * num_neg - - sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) - scores = self._compute_metrics_instance(sim_scores, is_relevant) - conf_scores = self.conf_scores(sim_scores.tolist()) - - all_mrr_scores.append(scores["mrr"]) - all_ap_scores.append(scores["ap"]) - all_conf_scores.append(conf_scores) - - mean_ap = np.mean(all_ap_scores) - mean_mrr = np.mean(all_mrr_scores) - - # Compute nAUCs - naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map") - naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") - - return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} - - def compute_metrics_individual(self, model): - """Embeds every (query, positive, negative) tuple individually. - Is slower than the batched version, but saves memory as only the - embeddings for one tuple are needed. 
Useful when you have - a really large test set - """ - all_mrr_scores = [] - all_ap_scores = [] - all_conf_scores = [] - - # using encode_queries and encode_corpus functions if they exists, - # which can be defined by users to add different instructions for query and passage conveniently - encode_queries_func = ( - model.encode_queries if hasattr(model, "encode_queries") else model.encode - ) - encode_corpus_func = ( - model.encode_corpus if hasattr(model, "encode_corpus") else model.encode - ) - + self.apply_sim_scores(query_emb, docs_emb, is_relevant, all_mrr_scores, all_ap_scores, all_conf_scores) + + + def encode_candidates_individual(self, encode_queries_func, encode_corpus_func,all_mrr_scores, all_ap_scores, all_conf_scores): for instance in tqdm.tqdm(self.samples, desc="Samples"): query = instance["query"] positive = list(instance["positive"]) @@ -209,24 +212,88 @@ def compute_metrics_individual(self, model): encode_queries_func(query, batch_size=self.batch_size) ) docs_emb = np.asarray(encode_corpus_func(docs, batch_size=self.batch_size)) + self.apply_sim_scores(query_emb, docs_emb, is_relevant, all_mrr_scores, all_ap_scores, all_conf_scores) + + def apply_sim_scores(self,query_emb, docs_emb, is_relevant, all_mrr_scores, all_ap_scores, all_conf_scores): + sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) + scores = self._compute_metrics_instance(sim_scores, is_relevant) + conf_scores = self.conf_scores(sim_scores.tolist()) + + all_mrr_scores.append(scores["mrr"]) + all_ap_scores.append(scores["ap"]) + all_conf_scores.append(conf_scores) + + def encode_candidates_miracl(self, all_query_embs, encode_corpus_func): + all_docs = [] + for sample in self.samples: + all_docs.extend(sample["candidates"]) - sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) - scores = self._compute_metrics_instance(sim_scores, is_relevant) - conf_scores = self.conf_scores(sim_scores.tolist()) + all_docs_embs = np.asarray( + encode_corpus_func(all_docs, batch_size=self.batch_size) + ) - all_mrr_scores.append(scores["mrr"]) - all_ap_scores.append(scores["ap"]) - all_conf_scores.append(conf_scores) + # Compute scores + logger.info("Evaluating...") + query_idx, docs_idx = 0, 0 + results, qrels = {}, {} + for instance in self.samples: + num_subqueries = ( + len(instance["query"]) if isinstance(instance["query"], list) else 1 + ) + query_emb = all_query_embs[query_idx : query_idx + num_subqueries] + query_idx += num_subqueries - mean_ap = np.mean(all_ap_scores) - mean_mrr = np.mean(all_mrr_scores) + positive = instance["positive"] + docs = instance["candidates"] + num_doc = len(docs) + docs_emb = all_docs_embs[docs_idx : docs_idx + num_doc] + docs_idx += num_doc - # Compute nAUCs - naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map") - naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") + fake_qid = str(query_idx) + results[fake_qid] = self.rerank(query_emb, docs_emb) + qrels[fake_qid] = { + str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) + } - return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} + scores_miracl = self.collect_miracl_results(results, qrels) + return scores_miracl + + def encode_candidates_miracl_individual(self, encode_queries_func, encode_corpus_func): + results, qrels = {}, {} + for i, instance in enumerate(tqdm.tqdm(self.samples, desc="Samples")): + query = instance["query"] + positive = set(instance["positive"]) + docs = list(instance["candidates"]) + + if isinstance(query, str): + # 
.encoding interface requires List[str] as input + query_emb = np.asarray( + encode_queries_func([query], batch_size=self.batch_size) + ) + docs_emb = np.asarray( + encode_corpus_func(docs, batch_size=self.batch_size) + ) + + fake_qid = str(i) + results[fake_qid] = self.rerank(query_emb, docs_emb) + qrels[fake_qid] = { + str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) + } + + scores_miracl = self.collect_miracl_results(results, qrels) + return scores_miracl + def collect_miracl_results(self, results, qrels): + ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( + qrels=qrels, + results=results, + k_values=self.k_values, + ignore_identical_ids=False, + ) + scores = {**ndcg, **_map, **recall, **precision, **naucs} + scores_miracl = {f"{k}(MIRACL)": v for k, v in scores.items()} + return scores_miracl + def rerank( self, query_emb: torch.Tensor, docs_emb: torch.Tensor ) -> dict[str, float]: From 0df380966dac568942b8b826dfe1acb377216628 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Wed, 12 Jun 2024 17:56:44 +0100 Subject: [PATCH 03/14] removing MIRACLevaluator --- .../Reranking/multilingual/MIRACLReranking.py | 188 +----------------- 1 file changed, 1 insertion(+), 187 deletions(-) diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index 3c4459ffb9..e1e9e6827f 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -89,196 +89,10 @@ def _evaluate_subset( data_split: Dataset, **kwargs: Any, ) -> ScoresDict: - evaluator = MIRACLRerankingEvaluator(data_split, **kwargs) + evaluator = RerankingEvaluator(data_split, evaluator_type='miracl',**kwargs) scores = evaluator(model) self._add_main_score(scores) return scores -class MIRACLRerankingEvaluator(RerankingEvaluator): - """This class evaluates a SentenceTransformer model for the task of re-ranking. - MIRACLRerankingEvaluator differs from RerankingEvaluator in two ways: - 1. it uses the pytrec_eval via RetrievalEvaluator instead of the metrics provided by sklearn; - 2. 
it reranks the top-k `candidates` from previous-stage retrieval which may not include all ground-truth `positive` documents - """ - - def __init__( - self, - samples: list[dict], - mrr_at_k: int = 10, - name: str = "", - similarity_fct=cos_sim, - batch_size: int = 512, - use_batched_encoding: bool = True, - limit: int | None = None, - k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000], - **kwargs, - ): - """Args: - k_values: ranking cutoff threshold when applicable - """ - super().__init__( - samples, - mrr_at_k, - name, - similarity_fct, - batch_size, - use_batched_encoding, - limit, - **kwargs, - ) - self.k_values = k_values - - def rerank( - self, query_emb: torch.Tensor, docs_emb: torch.Tensor - ) -> dict[str, float]: - """Rerank documents (docs_emb) given the query (query_emb) - - Args: - query_emb: Query embedding of shape `(num_queries, hidden_size)`) - if `num_queries` > 0: we take the closest document to any of the queries - docs_emb: Candidates documents embeddings of shape `(num_pos+num_neg, hidden_size)`) - - Returns: - similarity_scores: - """ - if not query_emb.shape[0]: - raise ValueError("Empty query embedding") - - if not docs_emb.shape[0]: - return {"empty-docid": 0} - - pred_scores = self.similarity_fct(query_emb, docs_emb) - if len(pred_scores.shape) > 1: - pred_scores = torch.amax(pred_scores, dim=0) - - return { - str(i): score.detach().numpy().item() for i, score in enumerate(pred_scores) - } - - def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode): - """Computes the metrices in a batched way, by batching all queries and - all documents together - """ - # using encode_queries and encode_corpus functions if they exists, - # which can be defined by users to add different instructions for query and passage conveniently - encode_queries_func = ( - model.encode_queries - if isinstance(model, EncoderWithQueryCorpusEncode) - else model.encode - ) - encode_corpus_func = ( - model.encode_corpus - if isinstance(model, EncoderWithQueryCorpusEncode) - else model.encode - ) - - logger.info("Encoding queries...") - if isinstance(self.samples[0]["query"], str): - all_query_embs = np.asarray( - encode_queries_func( - [sample["query"] for sample in self.samples], - batch_size=self.batch_size, - ) - ) - elif isinstance(self.samples[0]["query"], list): - # In case the query is a list of strings, we get the most similar embedding to any of the queries - all_query_flattened = [ - q for sample in self.samples for q in sample["query"] - ] - all_query_embs = np.asarray( - encode_queries_func(all_query_flattened, batch_size=self.batch_size) - ) - else: - raise ValueError( - f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}" - ) - - logger.info("Encoding candidates...") - all_docs = [] - for sample in self.samples: - all_docs.extend(sample["candidates"]) - - all_docs_embs = np.asarray( - encode_corpus_func(all_docs, batch_size=self.batch_size) - ) - - # Compute scores - logger.info("Evaluating...") - query_idx, docs_idx = 0, 0 - results, qrels = {}, {} - for instance in self.samples: - num_subqueries = ( - len(instance["query"]) if isinstance(instance["query"], list) else 1 - ) - query_emb = all_query_embs[query_idx : query_idx + num_subqueries] - query_idx += num_subqueries - - positive = instance["positive"] - docs = instance["candidates"] - num_doc = len(docs) - docs_emb = all_docs_embs[docs_idx : docs_idx + num_doc] - docs_idx += num_doc - - fake_qid = str(query_idx) - results[fake_qid] = self.rerank(query_emb, docs_emb) - 
qrels[fake_qid] = { - str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) - } - - ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( - qrels=qrels, - results=results, - k_values=self.k_values, - ignore_identical_ids=False, - ) - scores = {**ndcg, **_map, **recall, **precision, **naucs} - scores_miracl = {f"{k}(MIRACL)": v for k, v in scores.items()} - return scores_miracl - - def compute_metrics_individual(self, model): - """Embeds every (query, positive, negative) tuple individually. - Is slower than the batched version, but saves memory as only the - embeddings for one tuple are needed. Useful when you have - a really large test set - """ - # using encode_queries and encode_corpus functions if they exists, - # which can be defined by users to add different instructions for query and passage conveniently - encode_queries_func = ( - model.encode_queries if hasattr(model, "encode_queries") else model.encode - ) - encode_corpus_func = ( - model.encode_corpus if hasattr(model, "encode_corpus") else model.encode - ) - - results, qrels = {}, {} - for i, instance in enumerate(tqdm.tqdm(self.samples, desc="Samples")): - query = instance["query"] - positive = set(instance["positive"]) - docs = list(instance["candidates"]) - - if isinstance(query, str): - # .encoding interface requires List[str] as input - query_emb = np.asarray( - encode_queries_func([query], batch_size=self.batch_size) - ) - docs_emb = np.asarray( - encode_corpus_func(docs, batch_size=self.batch_size) - ) - - fake_qid = str(i) - results[fake_qid] = self.rerank(query_emb, docs_emb) - qrels[fake_qid] = { - str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) - } - - ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( - qrels=qrels, - results=results, - k_values=self.k_values, - ignore_identical_ids=False, - ) - scores = {**ndcg, **_map, **recall, **precision, **naucs} - scores_miracl = {f"{k}(MIRACL)": v for k, v in scores.items()} - return scores_miracl From 0f35c8daa91b06daab24d3ee6cfc6a1b1225b560 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Wed, 12 Jun 2024 18:06:24 +0100 Subject: [PATCH 04/14] add linting --- .../evaluators/RerankingEvaluator.py | 104 ++++++++++++++---- .../Reranking/multilingual/MIRACLReranking.py | 9 +- 2 files changed, 82 insertions(+), 31 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 6d70b806d4..d65899a79d 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -38,7 +38,7 @@ def __init__( use_batched_encoding: bool = True, limit: int | None = None, k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000], - evaluator_type: str = "standard" + evaluator_type: str = "standard", **kwargs, ): super().__init__(**kwargs) @@ -116,21 +116,19 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) raise ValueError( f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}" ) - + if self.evaluator_type == "standard": - results = self.encode_candidates(all_query_embs,encode_corpus_func,True) + results = self.encode_candidates(all_query_embs, encode_corpus_func, True) elif self.evaluator_type == "miracl": results = self.encode_candidates_miracl(all_query_embs, encode_corpus_func) return results - + def compute_metrics_individual(self, model): """Embeds every (query, positive, negative) tuple individually. 
Is slower than the batched version, but saves memory as only the embeddings for one tuple are needed. Useful when you have a really large test set """ - - # using encode_queries and encode_corpus functions if they exists, # which can be defined by users to add different instructions for query and passage conveniently encode_queries_func = ( @@ -140,20 +138,41 @@ def compute_metrics_individual(self, model): model.encode_corpus if hasattr(model, "encode_corpus") else model.encode ) if self.evaluator_type == "standard": - results = self.encode_candidates(encode_queries_func, encode_corpus_func,False,encode_corpus_func=encode_corpus_func) + results = self.encode_candidates( + encode_queries_func, + encode_corpus_func, + False, + encode_corpus_func=encode_corpus_func, + ) elif self.evaluator_type == "miracl": - results = self.encode_candidates_miracl_individual(encode_queries_func,encode_corpus_func) + results = self.encode_candidates_miracl_individual( + encode_queries_func, encode_corpus_func + ) return results - def encode_candidates(self, all_query_embs,encode_corpus_func,batched,encode_queries_func=None): + def encode_candidates( + self, all_query_embs, encode_corpus_func, batched, encode_queries_func=None + ): all_mrr_scores = [] all_ap_scores = [] all_conf_scores = [] logger.info("Encoding candidates...") if batched: - self.encode_candidates_batched(all_query_embs, encode_corpus_func,all_mrr_scores, all_ap_scores, all_conf_scores) + self.encode_candidates_batched( + all_query_embs, + encode_corpus_func, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ) else: - self.encode_candidates_individual(encode_queries_func, encode_corpus_func,all_mrr_scores, all_ap_scores, all_conf_scores) + self.encode_candidates_individual( + encode_queries_func, + encode_corpus_func, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ) mean_ap = np.mean(all_ap_scores) mean_mrr = np.mean(all_mrr_scores) @@ -163,14 +182,23 @@ def encode_candidates(self, all_query_embs,encode_corpus_func,batched,encode_que return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} - - def encode_candidates_batched(self, all_query_embs, encode_corpus_func,all_mrr_scores, all_ap_scores, all_conf_scores): + def encode_candidates_batched( + self, + all_query_embs, + encode_corpus_func, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ): all_docs = [] for sample in self.samples: all_docs.extend(sample["positive"]) all_docs.extend(sample["negative"]) - all_docs_embs = self._encode_unique_texts(all_docs, encode_corpus_func,) + all_docs_embs = self._encode_unique_texts( + all_docs, + encode_corpus_func, + ) # Compute scores and confidence scores logger.info("Evaluating...") @@ -190,10 +218,23 @@ def encode_candidates_batched(self, all_query_embs, encode_corpus_func,all_mrr_s if num_pos == 0 or num_neg == 0: continue is_relevant = [True] * num_pos + [False] * num_neg - self.apply_sim_scores(query_emb, docs_emb, is_relevant, all_mrr_scores, all_ap_scores, all_conf_scores) - - - def encode_candidates_individual(self, encode_queries_func, encode_corpus_func,all_mrr_scores, all_ap_scores, all_conf_scores): + self.apply_sim_scores( + query_emb, + docs_emb, + is_relevant, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ) + + def encode_candidates_individual( + self, + encode_queries_func, + encode_corpus_func, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ): for instance in tqdm.tqdm(self.samples, desc="Samples"): query = instance["query"] positive = list(instance["positive"]) @@ -212,9 +253,24 
@@ def encode_candidates_individual(self, encode_queries_func, encode_corpus_func,a encode_queries_func(query, batch_size=self.batch_size) ) docs_emb = np.asarray(encode_corpus_func(docs, batch_size=self.batch_size)) - self.apply_sim_scores(query_emb, docs_emb, is_relevant, all_mrr_scores, all_ap_scores, all_conf_scores) + self.apply_sim_scores( + query_emb, + docs_emb, + is_relevant, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ) - def apply_sim_scores(self,query_emb, docs_emb, is_relevant, all_mrr_scores, all_ap_scores, all_conf_scores): + def apply_sim_scores( + self, + query_emb, + docs_emb, + is_relevant, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ): sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) scores = self._compute_metrics_instance(sim_scores, is_relevant) conf_scores = self.conf_scores(sim_scores.tolist()) @@ -222,7 +278,7 @@ def apply_sim_scores(self,query_emb, docs_emb, is_relevant, all_mrr_scores, all_ all_mrr_scores.append(scores["mrr"]) all_ap_scores.append(scores["ap"]) all_conf_scores.append(conf_scores) - + def encode_candidates_miracl(self, all_query_embs, encode_corpus_func): all_docs = [] for sample in self.samples: @@ -258,7 +314,9 @@ def encode_candidates_miracl(self, all_query_embs, encode_corpus_func): scores_miracl = self.collect_miracl_results(results, qrels) return scores_miracl - def encode_candidates_miracl_individual(self, encode_queries_func, encode_corpus_func): + def encode_candidates_miracl_individual( + self, encode_queries_func, encode_corpus_func + ): results, qrels = {}, {} for i, instance in enumerate(tqdm.tqdm(self.samples, desc="Samples")): query = instance["query"] @@ -282,7 +340,7 @@ def encode_candidates_miracl_individual(self, encode_queries_func, encode_corpus scores_miracl = self.collect_miracl_results(results, qrels) return scores_miracl - + def collect_miracl_results(self, results, qrels): ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( qrels=qrels, diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index e1e9e6827f..f5eb69a032 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -3,16 +3,11 @@ import logging from typing import Any -import numpy as np -import torch -import tqdm from datasets import Dataset from mteb.abstasks.TaskMetadata import TaskMetadata from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode from mteb.evaluation.evaluators import RerankingEvaluator -from mteb.evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator -from mteb.evaluation.evaluators.utils import cos_sim from mteb.MTEBResults import ScoresDict from ....abstasks import MultilingualTask @@ -89,10 +84,8 @@ def _evaluate_subset( data_split: Dataset, **kwargs: Any, ) -> ScoresDict: - evaluator = RerankingEvaluator(data_split, evaluator_type='miracl',**kwargs) + evaluator = RerankingEvaluator(data_split, evaluator_type="miracl", **kwargs) scores = evaluator(model) self._add_main_score(scores) return scores - - From f5c980a6f23bb8cc1686bdb7e41b06513ed1c144 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Wed, 12 Jun 2024 18:30:09 +0100 Subject: [PATCH 05/14] clean up method names --- .../evaluators/RerankingEvaluator.py | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 
d65899a79d..16c883c229 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -118,9 +118,9 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) ) if self.evaluator_type == "standard": - results = self.encode_candidates(all_query_embs, encode_corpus_func, True) + results = self._encode_candidates(all_query_embs, encode_corpus_func, True) elif self.evaluator_type == "miracl": - results = self.encode_candidates_miracl(all_query_embs, encode_corpus_func) + results = self._encode_candidates_miracl(all_query_embs, encode_corpus_func) return results def compute_metrics_individual(self, model): @@ -138,40 +138,40 @@ def compute_metrics_individual(self, model): model.encode_corpus if hasattr(model, "encode_corpus") else model.encode ) if self.evaluator_type == "standard": - results = self.encode_candidates( - encode_queries_func, - encode_corpus_func, - False, + results = self._encode_candidates( encode_corpus_func=encode_corpus_func, + encode_queries_func=encode_queries_func, + batched=False, ) elif self.evaluator_type == "miracl": - results = self.encode_candidates_miracl_individual( - encode_queries_func, encode_corpus_func + results = self._encode_candidates_miracl_individual( + encode_queries_func=encode_queries_func, + encode_corpus_func=encode_corpus_func, ) return results - def encode_candidates( - self, all_query_embs, encode_corpus_func, batched, encode_queries_func=None + def _encode_candidates( + self, encode_corpus_func, batched, all_query_embs=None, encode_queries_func=None ): all_mrr_scores = [] all_ap_scores = [] all_conf_scores = [] logger.info("Encoding candidates...") if batched: - self.encode_candidates_batched( - all_query_embs, - encode_corpus_func, - all_mrr_scores, - all_ap_scores, - all_conf_scores, + self._encode_candidates_batched( + all_query_embs=all_query_embs, + encode_corpus_func=encode_corpus_func, + all_mrr_scores=all_mrr_scores, + all_ap_scores=all_ap_scores, + all_conf_scores=all_conf_scores, ) else: - self.encode_candidates_individual( - encode_queries_func, - encode_corpus_func, - all_mrr_scores, - all_ap_scores, - all_conf_scores, + self._encode_candidates_individual( + encode_queries_func=encode_queries_func, + encode_corpus_func=encode_corpus_func, + all_mrr_scores=all_mrr_scores, + all_ap_scores=all_ap_scores, + all_conf_scores=all_conf_scores, ) mean_ap = np.mean(all_ap_scores) mean_mrr = np.mean(all_mrr_scores) @@ -182,7 +182,7 @@ def encode_candidates( return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} - def encode_candidates_batched( + def _encode_candidates_batched( self, all_query_embs, encode_corpus_func, @@ -218,7 +218,7 @@ def encode_candidates_batched( if num_pos == 0 or num_neg == 0: continue is_relevant = [True] * num_pos + [False] * num_neg - self.apply_sim_scores( + self._apply_sim_scores( query_emb, docs_emb, is_relevant, @@ -227,7 +227,7 @@ def encode_candidates_batched( all_conf_scores, ) - def encode_candidates_individual( + def _encode_candidates_individual( self, encode_queries_func, encode_corpus_func, @@ -253,7 +253,7 @@ def encode_candidates_individual( encode_queries_func(query, batch_size=self.batch_size) ) docs_emb = np.asarray(encode_corpus_func(docs, batch_size=self.batch_size)) - self.apply_sim_scores( + self._apply_sim_scores( query_emb, docs_emb, is_relevant, @@ -262,7 +262,7 @@ def encode_candidates_individual( all_conf_scores, ) - def apply_sim_scores( + def _apply_sim_scores( self, query_emb, docs_emb, @@ 
-279,7 +279,7 @@ def apply_sim_scores( all_ap_scores.append(scores["ap"]) all_conf_scores.append(conf_scores) - def encode_candidates_miracl(self, all_query_embs, encode_corpus_func): + def _encode_candidates_miracl(self, all_query_embs, encode_corpus_func): all_docs = [] for sample in self.samples: all_docs.extend(sample["candidates"]) @@ -311,10 +311,10 @@ def encode_candidates_miracl(self, all_query_embs, encode_corpus_func): str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) } - scores_miracl = self.collect_miracl_results(results, qrels) + scores_miracl = self._collect_miracl_results(results, qrels) return scores_miracl - def encode_candidates_miracl_individual( + def _encode_candidates_miracl_individual( self, encode_queries_func, encode_corpus_func ): results, qrels = {}, {} @@ -338,10 +338,10 @@ def encode_candidates_miracl_individual( str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) } - scores_miracl = self.collect_miracl_results(results, qrels) + scores_miracl = self._collect_miracl_results(results, qrels) return scores_miracl - def collect_miracl_results(self, results, qrels): + def _collect_miracl_results(self, results, qrels): ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( qrels=qrels, results=results, From 21426d3f3e54811c63a53c860177ff490541db64 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Fri, 14 Jun 2024 13:26:37 +0100 Subject: [PATCH 06/14] correct arg bug. --- mteb/evaluation/evaluators/RerankingEvaluator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 16c883c229..5cff1bc8ab 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -118,7 +118,12 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) ) if self.evaluator_type == "standard": - results = self._encode_candidates(all_query_embs, encode_corpus_func, True) + results = self._encode_candidates( + encode_corpus_func=encode_corpus_func, + batched=True, + all_query_embs=all_query_embs, + encode_queries_func=encode_queries_func, + ) elif self.evaluator_type == "miracl": results = self._encode_candidates_miracl(all_query_embs, encode_corpus_func) return results From e9cada5416a2a9042693148ed8bc53feee326d69 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Fri, 14 Jun 2024 14:05:35 +0100 Subject: [PATCH 07/14] remove type annotation --- mteb/evaluation/evaluators/RerankingEvaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 5cff1bc8ab..cef735807f 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -30,7 +30,7 @@ class RerankingEvaluator(Evaluator): def __init__( self, - samples: list[dict], + samples, mrr_at_k: int = 10, name: str = "", similarity_fct=cos_sim, From 155059063896654634342170e2132d3f38847011 Mon Sep 17 00:00:00 2001 From: Jordan Example Date: Sun, 16 Jun 2024 10:38:36 +0000 Subject: [PATCH 08/14] combine unique texts --- .../evaluators/RerankingEvaluator.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index cef735807f..74f515dbe8 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py 
+++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -104,19 +104,14 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) all_query_flattened = [ q for sample in self.samples for q in sample["query"] ] - if self.evaluator_type == "standard": - all_query_embs = self._encode_unique_texts( - all_query_flattened, encode_corpus_func - ) - elif self.evaluator_type == "miracl": - all_query_embs = np.asarray( - encode_queries_func(all_query_flattened, batch_size=self.batch_size) - ) + all_query_embs = self._encode_unique_texts( + all_query_flattened, encode_corpus_func + ) + else: raise ValueError( f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}" ) - if self.evaluator_type == "standard": results = self._encode_candidates( encode_corpus_func=encode_corpus_func, @@ -178,14 +173,8 @@ def _encode_candidates( all_ap_scores=all_ap_scores, all_conf_scores=all_conf_scores, ) - mean_ap = np.mean(all_ap_scores) - mean_mrr = np.mean(all_mrr_scores) - - # Compute nAUCs - naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map") - naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") - - return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} + scores = self._collect_results(all_mrr_scores, all_ap_scores, all_conf_scores) + return scores def _encode_candidates_batched( self, @@ -346,6 +335,16 @@ def _encode_candidates_miracl_individual( scores_miracl = self._collect_miracl_results(results, qrels) return scores_miracl + def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores): + mean_ap = np.mean(all_ap_scores) + mean_mrr = np.mean(all_mrr_scores) + + # Compute nAUCs + naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map") + naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") + + return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} + def _collect_miracl_results(self, results, qrels): ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( qrels=qrels, From f4247f582767a8737f84a55f259ea30b869906c1 Mon Sep 17 00:00:00 2001 From: Jordan Example Date: Mon, 17 Jun 2024 14:00:55 +0000 Subject: [PATCH 09/14] improve readability --- .../evaluators/RerankingEvaluator.py | 83 ++++++++++++------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 74f515dbe8..61a04495d7 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -114,13 +114,18 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) ) if self.evaluator_type == "standard": results = self._encode_candidates( + encode_queries_func=encode_queries_func, encode_corpus_func=encode_corpus_func, batched=True, all_query_embs=all_query_embs, - encode_queries_func=encode_queries_func, ) elif self.evaluator_type == "miracl": - results = self._encode_candidates_miracl(all_query_embs, encode_corpus_func) + results = self._encode_candidates_miracl( + encode_queries_func=encode_queries_func, + encode_corpus_func=encode_corpus_func, + batched=True, + all_query_embs=all_query_embs, + ) return results def compute_metrics_individual(self, model): @@ -139,14 +144,15 @@ def compute_metrics_individual(self, model): ) if self.evaluator_type == "standard": results = self._encode_candidates( - encode_corpus_func=encode_corpus_func, encode_queries_func=encode_queries_func, + 
encode_corpus_func=encode_corpus_func, batched=False, ) elif self.evaluator_type == "miracl": - results = self._encode_candidates_miracl_individual( + results = self._encode_candidates_miracl( encode_queries_func=encode_queries_func, encode_corpus_func=encode_corpus_func, + batched=False, ) return results @@ -159,8 +165,8 @@ def _encode_candidates( logger.info("Encoding candidates...") if batched: self._encode_candidates_batched( - all_query_embs=all_query_embs, encode_corpus_func=encode_corpus_func, + all_query_embs=all_query_embs, all_mrr_scores=all_mrr_scores, all_ap_scores=all_ap_scores, all_conf_scores=all_conf_scores, @@ -256,24 +262,34 @@ def _encode_candidates_individual( all_conf_scores, ) - def _apply_sim_scores( + def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores): + mean_ap = np.mean(all_ap_scores) + mean_mrr = np.mean(all_mrr_scores) + + # Compute nAUCs + naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map") + naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") + + return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} + + def _encode_candidates_miracl( self, - query_emb, - docs_emb, - is_relevant, - all_mrr_scores, - all_ap_scores, - all_conf_scores, + encode_corpus_func, + encode_queries_func, + batched, + all_query_embs=None, ): - sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) - scores = self._compute_metrics_instance(sim_scores, is_relevant) - conf_scores = self.conf_scores(sim_scores.tolist()) - - all_mrr_scores.append(scores["mrr"]) - all_ap_scores.append(scores["ap"]) - all_conf_scores.append(conf_scores) + if batched: + return self._encode_candidates_miracl_batched( + all_query_embs=all_query_embs, encode_corpus_func=encode_corpus_func + ) + else: + return self._encode_candidates_miracl_individual( + encode_queries_func=encode_queries_func, + encode_corpus_func=encode_corpus_func, + ) - def _encode_candidates_miracl(self, all_query_embs, encode_corpus_func): + def _encode_candidates_miracl_batched(self, all_query_embs, encode_corpus_func): all_docs = [] for sample in self.samples: all_docs.extend(sample["candidates"]) @@ -335,16 +351,6 @@ def _encode_candidates_miracl_individual( scores_miracl = self._collect_miracl_results(results, qrels) return scores_miracl - def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores): - mean_ap = np.mean(all_ap_scores) - mean_mrr = np.mean(all_mrr_scores) - - # Compute nAUCs - naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map") - naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr") - - return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} - def _collect_miracl_results(self, results, qrels): ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( qrels=qrels, @@ -383,6 +389,23 @@ def rerank( str(i): score.detach().numpy().item() for i, score in enumerate(pred_scores) } + def _apply_sim_scores( + self, + query_emb, + docs_emb, + is_relevant, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ): + sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) + scores = self._compute_metrics_instance(sim_scores, is_relevant) + conf_scores = self.conf_scores(sim_scores.tolist()) + + all_mrr_scores.append(scores["mrr"]) + all_ap_scores.append(scores["ap"]) + all_conf_scores.append(conf_scores) + def _encode_unique_texts(self, all_texts, encode_queries_func): index_map, all_unique_texts, all_texts_indexes = {}, [], [] for text in all_texts: From 
a0ec658da8e0d47821ef9c3696f1fbdba70d7643 Mon Sep 17 00:00:00 2001 From: Jordan Example Date: Mon, 17 Jun 2024 14:22:10 +0000 Subject: [PATCH 10/14] merge main --- .../evaluators/RerankingEvaluator.py | 158 ++++++++++++- .../Reranking/multilingual/MIRACLReranking.py | 210 +----------------- 2 files changed, 149 insertions(+), 219 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index 96e9f8f448..af0a363029 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -9,6 +9,8 @@ import tqdm from sklearn.metrics import average_precision_score +from mteb.evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator + from ...encoder_interface import Encoder, EncoderWithQueryCorpusEncode from .Evaluator import Evaluator from .model_encode import model_encode @@ -52,6 +54,8 @@ def __init__( self.batch_size = batch_size self.use_batched_encoding = use_batched_encoding self.task_name = task_name + self.k_values = k_values + self.evaluator_type = evaluator_type if isinstance(self.samples, dict): self.samples = list(self.samples.values()) @@ -111,11 +115,11 @@ def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode) task_name=self.task_name, batch_size=self.batch_size, ) - else: raise ValueError( f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}" ) + if self.evaluator_type == "standard": results = self._encode_candidates( encode_queries_func=encode_queries_func, @@ -202,8 +206,6 @@ def _encode_candidates_batched( all_docs_embs = self._encode_unique_texts( all_docs, encode_corpus_func, - task_name=self.task_name, - batch_size=self.batch_size, ) # Compute scores and confidence scores @@ -256,14 +258,16 @@ def _encode_candidates_individual( # .encoding interface requires List[str] as input query = [query] query_emb = np.asarray( - encode_queries_func( - query, task_name=self.task_name, batch_size=self.batch_size - ) + encode_queries_func(query, batch_size=self.batch_size) ) - docs_emb = np.asarray( - encode_corpus_func( - docs, task_name=self.task_name, batch_size=self.batch_size - ) + docs_emb = np.asarray(encode_corpus_func(docs, batch_size=self.batch_size)) + self._apply_sim_scores( + query_emb, + docs_emb, + is_relevant, + all_mrr_scores, + all_ap_scores, + all_conf_scores, ) def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores): @@ -276,6 +280,140 @@ def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores): return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr} + def _encode_candidates_miracl( + self, + encode_corpus_func, + encode_queries_func, + batched, + all_query_embs=None, + ): + if batched: + return self._encode_candidates_miracl_batched( + all_query_embs=all_query_embs, encode_corpus_func=encode_corpus_func + ) + else: + return self._encode_candidates_miracl_individual( + encode_queries_func=encode_queries_func, + encode_corpus_func=encode_corpus_func, + ) + + def _encode_candidates_miracl_batched(self, all_query_embs, encode_corpus_func): + all_docs = [] + for sample in self.samples: + all_docs.extend(sample["candidates"]) + + all_docs_embs = np.asarray( + encode_corpus_func(all_docs, batch_size=self.batch_size) + ) + + # Compute scores + logger.info("Evaluating...") + query_idx, docs_idx = 0, 0 + results, qrels = {}, {} + for instance in self.samples: + num_subqueries = ( + len(instance["query"]) if isinstance(instance["query"], list) 
else 1 + ) + query_emb = all_query_embs[query_idx : query_idx + num_subqueries] + query_idx += num_subqueries + + positive = instance["positive"] + docs = instance["candidates"] + num_doc = len(docs) + docs_emb = all_docs_embs[docs_idx : docs_idx + num_doc] + docs_idx += num_doc + + fake_qid = str(query_idx) + results[fake_qid] = self.rerank(query_emb, docs_emb) + qrels[fake_qid] = { + str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) + } + + scores_miracl = self._collect_miracl_results(results, qrels) + return scores_miracl + + def _encode_candidates_miracl_individual( + self, encode_queries_func, encode_corpus_func + ): + results, qrels = {}, {} + for i, instance in enumerate(tqdm.tqdm(self.samples, desc="Samples")): + query = instance["query"] + positive = set(instance["positive"]) + docs = list(instance["candidates"]) + + if isinstance(query, str): + # .encoding interface requires List[str] as input + query_emb = np.asarray( + encode_queries_func([query], batch_size=self.batch_size) + ) + docs_emb = np.asarray( + encode_corpus_func(docs, batch_size=self.batch_size) + ) + + fake_qid = str(i) + results[fake_qid] = self.rerank(query_emb, docs_emb) + qrels[fake_qid] = { + str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) + } + + scores_miracl = self._collect_miracl_results(results, qrels) + return scores_miracl + + def _collect_miracl_results(self, results, qrels): + ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( + qrels=qrels, + results=results, + k_values=self.k_values, + ignore_identical_ids=False, + ) + scores = {**ndcg, **_map, **recall, **precision, **naucs} + scores_miracl = {f"{k}(MIRACL)": v for k, v in scores.items()} + return scores_miracl + + def rerank( + self, query_emb: torch.Tensor, docs_emb: torch.Tensor + ) -> dict[str, float]: + """Rerank documents (docs_emb) given the query (query_emb) + + Args: + query_emb: Query embedding of shape `(num_queries, hidden_size)`) + if `num_queries` > 0: we take the closest document to any of the queries + docs_emb: Candidates documents embeddings of shape `(num_pos+num_neg, hidden_size)`) + + Returns: + similarity_scores: + """ + if not query_emb.shape[0]: + raise ValueError("Empty query embedding") + + if not docs_emb.shape[0]: + return {"empty-docid": 0} + + pred_scores = self.similarity_fct(query_emb, docs_emb) + if len(pred_scores.shape) > 1: + pred_scores = torch.amax(pred_scores, dim=0) + + return { + str(i): score.detach().numpy().item() for i, score in enumerate(pred_scores) + } + + def _apply_sim_scores( + self, + query_emb, + docs_emb, + is_relevant, + all_mrr_scores, + all_ap_scores, + all_conf_scores, + ): + sim_scores = self._compute_sim_scores_instance(query_emb, docs_emb) + scores = self._compute_metrics_instance(sim_scores, is_relevant) + conf_scores = self.conf_scores(sim_scores.tolist()) + + all_mrr_scores.append(scores["mrr"]) + all_ap_scores.append(scores["ap"]) + all_conf_scores.append(conf_scores) + @staticmethod def _encode_unique_texts( all_texts: list[str], diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index 3849f483d8..f5eb69a032 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -1,7 +1,6 @@ from __future__ import annotations import logging -from functools import partial from typing import Any from datasets import Dataset @@ -9,9 +8,6 @@ from mteb.abstasks.TaskMetadata import TaskMetadata from 
mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode from mteb.evaluation.evaluators import RerankingEvaluator -from mteb.evaluation.evaluators.model_encode import model_encode -from mteb.evaluation.evaluators.RetrievalEvaluator import RetrievalEvaluator -from mteb.evaluation.evaluators.utils import cos_sim from mteb.MTEBResults import ScoresDict from ....abstasks import MultilingualTask @@ -88,212 +84,8 @@ def _evaluate_subset( data_split: Dataset, **kwargs: Any, ) -> ScoresDict: - evaluator = MIRACLRerankingEvaluator( - samples=data_split, task_name=self.metadata.name, **kwargs - ) + evaluator = RerankingEvaluator(data_split, evaluator_type="miracl", **kwargs) scores = evaluator(model) self._add_main_score(scores) return scores - - -class MIRACLRerankingEvaluator(RerankingEvaluator): - """This class evaluates a SentenceTransformer model for the task of re-ranking. - MIRACLRerankingEvaluator differs from RerankingEvaluator in two ways: - 1. it uses the pytrec_eval via RetrievalEvaluator instead of the metrics provided by sklearn; - 2. it reranks the top-k `candidates` from previous-stage retrieval which may not include all ground-truth `positive` documents - """ - - def __init__( - self, - samples: list[dict], - task_name: str, - mrr_at_k: int = 10, - name: str = "", - similarity_fct=cos_sim, - batch_size: int = 512, - use_batched_encoding: bool = True, - limit: int | None = None, - k_values: list[int] = [1, 3, 5, 10, 20, 100, 1000], - **kwargs, - ): - super().__init__( - samples, - task_name=task_name, - mrr_at_k=mrr_at_k, - name=name, - similarity_fct=similarity_fct, - batch_size=batch_size, - use_batched_encoding=use_batched_encoding, - limit=limit, - **kwargs, - ) - self.k_values = k_values - - def rerank( - self, query_emb: torch.Tensor, docs_emb: torch.Tensor - ) -> dict[str, float]: - """Rerank documents (docs_emb) given the query (query_emb) - - Args: - query_emb: Query embedding of shape `(num_queries, hidden_size)`) - if `num_queries` > 0: we take the closest document to any of the queries - docs_emb: Candidates documents embeddings of shape `(num_pos+num_neg, hidden_size)`) - - Returns: - similarity_scores: - """ - if not query_emb.shape[0]: - raise ValueError("Empty query embedding") - - if not docs_emb.shape[0]: - return {"empty-docid": 0} - - pred_scores = self.similarity_fct(query_emb, docs_emb) - if len(pred_scores.shape) > 1: - pred_scores = torch.amax(pred_scores, dim=0) - - return { - str(i): score.detach().numpy().item() for i, score in enumerate(pred_scores) - } - - def compute_metrics_batched(self, model: Encoder | EncoderWithQueryCorpusEncode): - """Computes the metrices in a batched way, by batching all queries and - all documents together - """ - # using encode_queries and encode_corpus functions if they exists, - # which can be defined by users to add different instructions for query and passage conveniently - encode_queries_func = ( - model.encode_queries - if isinstance(model, EncoderWithQueryCorpusEncode) - else partial(model_encode, model=model) - ) - encode_corpus_func = ( - model.encode_corpus - if isinstance(model, EncoderWithQueryCorpusEncode) - else partial(model_encode, model=model) - ) - - logger.info("Encoding queries...") - if isinstance(self.samples[0]["query"], str): - all_query_embs = np.asarray( - encode_queries_func( - [sample["query"] for sample in self.samples], - batch_size=self.batch_size, - task_name=self.task_name, - ) - ) - elif isinstance(self.samples[0]["query"], list): - # In case the query is a list of strings, we get 
the most similar embedding to any of the queries - all_query_flattened = [ - q for sample in self.samples for q in sample["query"] - ] - all_query_embs = np.asarray( - encode_queries_func( - all_query_flattened, - batch_size=self.batch_size, - task_name=self.task_name, - ) - ) - else: - raise ValueError( - f"Query must be a string or a list of strings but is {type(self.samples[0]['query'])}" - ) - - logger.info("Encoding candidates...") - all_docs = [] - for sample in self.samples: - all_docs.extend(sample["candidates"]) - - all_docs_embs = np.asarray( - encode_corpus_func( - all_docs, batch_size=self.batch_size, task_name=self.task_name - ) - ) - - # Compute scores - logger.info("Evaluating...") - query_idx, docs_idx = 0, 0 - results, qrels = {}, {} - for instance in self.samples: - num_subqueries = ( - len(instance["query"]) if isinstance(instance["query"], list) else 1 - ) - query_emb = all_query_embs[query_idx : query_idx + num_subqueries] - query_idx += num_subqueries - - positive = instance["positive"] - docs = instance["candidates"] - num_doc = len(docs) - docs_emb = all_docs_embs[docs_idx : docs_idx + num_doc] - docs_idx += num_doc - - fake_qid = str(query_idx) - results[fake_qid] = self.rerank(query_emb, docs_emb) - qrels[fake_qid] = { - str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) - } - - ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( - qrels=qrels, - results=results, - k_values=self.k_values, - ignore_identical_ids=False, - ) - scores = {**ndcg, **_map, **recall, **precision, **naucs} - scores_miracl = {f"{k}(MIRACL)": v for k, v in scores.items()} - return scores_miracl - - def compute_metrics_individual(self, model): - """Embeds every (query, positive, negative) tuple individually. - Is slower than the batched version, but saves memory as only the - embeddings for one tuple are needed. 
Useful when you have - a really large test set - """ - # using encode_queries and encode_corpus functions if they exists, - # which can be defined by users to add different instructions for query and passage conveniently - encode_queries_func = ( - model.encode_queries - if hasattr(model, "encode_queries") - else partial(model_encode, model=model) - ) - encode_corpus_func = ( - model.encode_corpus - if hasattr(model, "encode_corpus") - else partial(model_encode, model=model) - ) - - results, qrels = {}, {} - for i, instance in enumerate(tqdm.tqdm(self.samples, desc="Samples")): - query = instance["query"] - positive = set(instance["positive"]) - docs = list(instance["candidates"]) - - if isinstance(query, str): - # .encoding interface requires List[str] as input - query_emb = np.asarray( - encode_queries_func( - [query], batch_size=self.batch_size, task_name=self.task_name - ) - ) - docs_emb = np.asarray( - encode_corpus_func( - docs, batch_size=self.batch_size, task_name=self.task_name - ) - ) - - fake_qid = str(i) - results[fake_qid] = self.rerank(query_emb, docs_emb) - qrels[fake_qid] = { - str(i): 1 if doc in positive else 0 for i, doc in enumerate(docs) - } - - ndcg, _map, recall, precision, naucs = RetrievalEvaluator.evaluate( - qrels=qrels, - results=results, - k_values=self.k_values, - ignore_identical_ids=False, - ) - scores = {**ndcg, **_map, **recall, **precision, **naucs} - scores_miracl = {f"{k}(MIRACL)": v for k, v in scores.items()} - return scores_miracl From 22b626ab7a28c4a42f37dc05e7791944d90bc09c Mon Sep 17 00:00:00 2001 From: Jordan Example Date: Mon, 17 Jun 2024 14:25:33 +0000 Subject: [PATCH 11/14] add back main changes --- mteb/evaluation/evaluators/RerankingEvaluator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index af0a363029..ec6bb20e35 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -206,6 +206,8 @@ def _encode_candidates_batched( all_docs_embs = self._encode_unique_texts( all_docs, encode_corpus_func, + task_name=self.task_name, + batch_size=self.batch_size, ) # Compute scores and confidence scores @@ -258,9 +260,14 @@ def _encode_candidates_individual( # .encoding interface requires List[str] as input query = [query] query_emb = np.asarray( - encode_queries_func(query, batch_size=self.batch_size) + encode_queries_func( + query, task_name=self.task_name, batch_size=self.batch_size + )) + docs_emb = np.asarray( + encode_corpus_func( + docs, task_name=self.task_name, batch_size=self.batch_size + ) ) - docs_emb = np.asarray(encode_corpus_func(docs, batch_size=self.batch_size)) self._apply_sim_scores( query_emb, docs_emb, From cc82157c03f7ef797134ba8381850eb7a54b7880 Mon Sep 17 00:00:00 2001 From: Jordan Example Date: Mon, 17 Jun 2024 14:38:25 +0000 Subject: [PATCH 12/14] adjust for task_name changes --- mteb/evaluation/evaluators/RerankingEvaluator.py | 13 +++++-------- .../tasks/Reranking/multilingual/MIRACLReranking.py | 4 +++- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/mteb/evaluation/evaluators/RerankingEvaluator.py b/mteb/evaluation/evaluators/RerankingEvaluator.py index ec6bb20e35..80091235c6 100644 --- a/mteb/evaluation/evaluators/RerankingEvaluator.py +++ b/mteb/evaluation/evaluators/RerankingEvaluator.py @@ -260,14 +260,9 @@ def _encode_candidates_individual( # .encoding interface requires List[str] as input query = [query] 
query_emb = np.asarray( - encode_queries_func( - query, task_name=self.task_name, batch_size=self.batch_size - )) - docs_emb = np.asarray( - encode_corpus_func( - docs, task_name=self.task_name, batch_size=self.batch_size - ) + encode_queries_func(query, batch_size=self.batch_size) ) + docs_emb = np.asarray(encode_corpus_func(docs, batch_size=self.batch_size)) self._apply_sim_scores( query_emb, docs_emb, @@ -310,7 +305,9 @@ def _encode_candidates_miracl_batched(self, all_query_embs, encode_corpus_func): all_docs.extend(sample["candidates"]) all_docs_embs = np.asarray( - encode_corpus_func(all_docs, batch_size=self.batch_size) + encode_corpus_func( + all_docs, task_name=self.task_name, batch_size=self.batch_size + ) ) # Compute scores diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index f5eb69a032..f8c0f282a2 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -84,7 +84,9 @@ def _evaluate_subset( data_split: Dataset, **kwargs: Any, ) -> ScoresDict: - evaluator = RerankingEvaluator(data_split, evaluator_type="miracl", **kwargs) + evaluator = RerankingEvaluator( + data_split, evaluator_type="miracl", task_name=self.metadata.name, **kwargs + ) scores = evaluator(model) self._add_main_score(scores) From 84aa1143478250c246844c8a44e85a5917c4cd84 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Mon, 17 Jun 2024 15:42:08 +0100 Subject: [PATCH 13/14] add back --- mteb/tasks/Reranking/multilingual/MIRACLReranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index f8c0f282a2..bac57a804b 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -85,7 +85,7 @@ def _evaluate_subset( **kwargs: Any, ) -> ScoresDict: evaluator = RerankingEvaluator( - data_split, evaluator_type="miracl", task_name=self.metadata.name, **kwargs + samples=data_split, evaluator_type="miracl", task_name=self.metadata.name, **kwargs ) scores = evaluator(model) From 49d676cd17d6b0416e0d0be0fad38524204eac7c Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Mon, 17 Jun 2024 15:45:08 +0100 Subject: [PATCH 14/14] lint --- mteb/tasks/Reranking/multilingual/MIRACLReranking.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py index bac57a804b..0a5990db0e 100644 --- a/mteb/tasks/Reranking/multilingual/MIRACLReranking.py +++ b/mteb/tasks/Reranking/multilingual/MIRACLReranking.py @@ -85,7 +85,10 @@ def _evaluate_subset( **kwargs: Any, ) -> ScoresDict: evaluator = RerankingEvaluator( - samples=data_split, evaluator_type="miracl", task_name=self.metadata.name, **kwargs + samples=data_split, + evaluator_type="miracl", + task_name=self.metadata.name, + **kwargs, ) scores = evaluator(model)
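
For reference, below is a minimal standalone sketch of the scoring step that the unified evaluator applies in the MIRACL case: every candidate is scored against the query (or, for multiple sub-queries, against the best-matching sub-query via torch.amax), and the scores are collected into results/qrels dictionaries keyed by a fake query id, which _collect_miracl_results then hands to RetrievalEvaluator.evaluate. The sample text, the random embed helper and the 8-dimensional vectors are hypothetical stand-ins for a real model's encode_queries/encode_corpus output; this is an illustration of the technique, not a call into the library.

    from __future__ import annotations

    import numpy as np
    import torch


    def cos_sim(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # Pairwise cosine similarity between rows of `a` and rows of `b`.
        a = torch.nn.functional.normalize(a, dim=-1)
        b = torch.nn.functional.normalize(b, dim=-1)
        return a @ b.T


    def rerank(query_emb: torch.Tensor, docs_emb: torch.Tensor) -> dict[str, float]:
        # Score every candidate against the query; with several sub-queries,
        # keep the best-matching one per document (max over the query axis).
        if not query_emb.shape[0]:
            raise ValueError("Empty query embedding")
        if not docs_emb.shape[0]:
            return {"empty-docid": 0}
        pred_scores = cos_sim(query_emb, docs_emb)
        if len(pred_scores.shape) > 1:
            pred_scores = torch.amax(pred_scores, dim=0)
        return {str(i): score.item() for i, score in enumerate(pred_scores)}


    # Hypothetical MIRACL-style sample: a query, its known positives, and the
    # candidate pool produced by a first-stage retriever.
    sample = {
        "query": "what is the capital of france",
        "positive": ["Paris is the capital of France."],
        "candidates": [
            "Paris is the capital of France.",
            "Lyon is a large city in France.",
            "Berlin is the capital of Germany.",
        ],
    }

    # Random 8-dimensional vectors stand in for encode_queries / encode_corpus output.
    rng = np.random.default_rng(0)


    def embed(texts: list[str]) -> torch.Tensor:
        return torch.tensor(rng.normal(size=(len(texts), 8)), dtype=torch.float32)


    query_emb = embed([sample["query"]])
    docs_emb = embed(sample["candidates"])

    fake_qid = "0"
    results = {fake_qid: rerank(query_emb, docs_emb)}
    qrels = {
        fake_qid: {
            str(i): 1 if doc in sample["positive"] else 0
            for i, doc in enumerate(sample["candidates"])
        }
    }
    print(results)
    print(qrels)

In the library itself the same dictionaries are built inside _encode_candidates_miracl_batched / _encode_candidates_miracl_individual and scored by _collect_miracl_results with the default k_values of [1, 3, 5, 10, 20, 100, 1000], while the standard path (evaluator_type="standard") keeps the sklearn-based MAP/MRR computation.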