Add more annotations (#1833)
* apply additions from #1794

* add annotations for rumodels

* add nomic training data

* fix metadata

* update rest of model meta

* fix bge reranker
Samoed authored Jan 22, 2025
1 parent 4985da9 commit 12ed9c5
Showing 9 changed files with 322 additions and 137 deletions.
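For orientation: the pattern this commit applies throughout is a module-level dict mapping an MTEB task name to the list of splits the model was trained on (an empty list records use without a specific split; None means unknown), wired into each ModelMeta via training_datasets=. Below is a minimal sketch using a reduced stand-in class (the real mteb ModelMeta takes many more fields, e.g. loader, framework, use_instructions); the task names and data URL are taken from the diffs that follow.

from dataclasses import dataclass


@dataclass
class ModelMetaSketch:
    """Reduced stand-in for mteb.model_meta.ModelMeta (illustrative only)."""

    name: str
    training_datasets: dict[str, list[str]] | None = None  # None = unknown
    public_training_data: str | None = None  # URL of released training data


# Task name -> splits the model saw during training.
bge_m3_training_data = {
    "MIRACLRetrieval": ["train"],
    "MSMARCO": ["train"],
    "HotpotQA": ["train"],
}

bge_m3 = ModelMetaSketch(
    name="BAAI/bge-m3",
    training_datasets=bge_m3_training_data,
    public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
)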
115 changes: 26 additions & 89 deletions mteb/models/bge_models.py
@@ -7,8 +7,8 @@
model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"}

-bge_m_training_data = {
-# source: https://arxiv.org/pdf/2402.03216
+bge_m3_training_data = {
+# source: https://arxiv.org/abs/2402.03216
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
@@ -28,6 +28,28 @@
"HotpotQA": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
"T2Retrieval": ["train"],
"DuReader": ["train"],
"MMarcoReranking": ["train"],
"CodeSearchNet": ["train"],
# not in mteb
# "s2orc"
# Wikipedia
# "xP3"
# "mC4"
# "CC-News"
# "MTP"
# "NLLB"
# "CCMatrix"
# TriviaQA
# COL-IEE
# PubMedQA
# SQuAD
# SimCSE
# mMARCO-ZH
# LawGPT
# NLI-zh2, LeCaRDv2,
# NLI, MultiLongDoc (their syntetic)
# + synthetic data
}

@@ -89,38 +111,6 @@
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
}

-bgem3_training_data = {
-# source https://arxiv.org/abs/2402.03216
-"T2Retrieval": ["train"],
-"DuReader": ["train"],
-"MMarcoReranking": ["train"],
-"CMedQAv2-reranking": ["train"],
-"HotpotQA": ["train"],
-"NQ": ["train"],
-"MSMARCO": ["train"],
-"MrTidyRetrieval": ["train"],
-"MIRACLRetrieval": ["train"],
-"CodeSearchNet": ["train"],
-# not in mteb
-# "s2orc"
-# Wikipedia
-# "xP3"
-# "mC4"
-# "CC-News"
-# "MTP"
-# "NLLB"
-# "CCMatrix"
-# TriviaQA
-# COL-IEE
-# PubMedQA
-# SQuAD
-# SimCSE
-# mMARCO-ZH
-# LawGPT
-# NLI-zh2, LeCaRDv2,
-# NLI, MultiLongDoc (their syntetic)
-}

# https://huggingface.co/BAAI/bge-m3/discussions/29
bgem3_languages = [
"afr_Latn", # af
@@ -298,59 +288,6 @@
"zho_Hans", # zh
]

-bge_m_training_data = {
-# source: https://arxiv.org/pdf/2402.03216
-"MIRACLRetrieval": ["train"],
-"MIRACLRetrievalHardNegatives": ["train"],
-"MIRACLReranking": ["train"],
-"LeCaRDv2": ["train"],
-"CMedQAv1-reranking": ["train"],
-"CMedQAv2-reranking": ["train"],
-"MrTidyRetrieval": ["train"],
-"T2Reranking": ["train"],
-"MSMARCO": ["train"],
-"MSMARCOHardNegatives": ["train"],
-"NanoMSMARCORetrieval": ["train"],
-"MSMARCO-PL": ["train"], # translation not trained on
-"NQ": ["train"],
-"NQHardNegatives": ["train"],
-"NanoNQRetrieval": ["train"],
-"NQ-PL": ["train"], # translation not trained on
-"HotpotQA": ["train"],
-"HotpotQA-PL": ["train"], # translation not trained on
-"HotpotQAHardNegatives": ["train"],
-# + synthetic data
-}
-
-bge_training_data = {
-# source: https://data.baai.ac.cn/details/BAAI-MTP
-"NQ": ["test"],
-"NQHardNegatives": ["test"],
-"AmazonReviewsClassification": [
-"validation",
-"test",
-], # assumed from: amazon_reviews_multi
-"MLQARetrieval": [
-"validation",
-"test",
-], # assumed from mlqa (question, context)
-# not in mteb
-# Dataset Pairs
-# wudao (title, passage)
-# cmrc2018 (query, context)
-# dureader (query, context)
-# simclue (sentence_a, sentence_b)
-# csl (title, abstract)
-# amazon_reviews_multi (title, body)
-# wiki_atomic_edits (base_sentence, edited_sentence)
-# mlqa (question, context)
-# xlsum (title, summary) (title, text)
-# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
-# "wikipedia": [], # title + section title, passage
-# "reddit": [], # title, body
-# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
-# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
-}

bge_small_en_v1_5 = ModelMeta(
loader=partial( # type: ignore
@@ -522,8 +459,8 @@
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code=None,
-public_training_data=None,
-training_datasets=bgem3_training_data,
+public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+training_datasets=bge_m3_training_data,
)


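These per-task annotations are what make training/evaluation overlap checks possible. As a sketch of a consumer (a hypothetical helper, not mteb's actual API; it only assumes the task-name to splits convention used in the dicts above):

def is_zero_shot(
    training_datasets: dict[str, list[str]] | None, task_name: str
) -> bool | None:
    """True if task_name has no recorded training overlap.

    Returns None when annotations are missing, mirroring the distinction the
    diffs draw between training_datasets=None (unknown) and a populated dict.
    """
    if training_datasets is None:
        return None
    return task_name not in training_datasets


# e.g. with the bge_m3_training_data sketch above:
# is_zero_shot(bge_m3_training_data, "MSMARCO")  -> False
# is_zero_shot(bge_m3_training_data, "ArguAna")  -> True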
10 changes: 8 additions & 2 deletions mteb/models/colbert_models.py
@@ -165,7 +165,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
use_instructions=False,
adapted_from=None,
superseded_by=None,
-training_datasets=None,
+training_datasets={
+"MSMARCO": ["train"], # dev?
+},
)


@@ -218,5 +220,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
use_instructions=False,
adapted_from=None,
superseded_by=None,
-training_datasets=None,
+training_datasets={
+"MSMARCO": ["train"],
+"DuRetrieval": [],
+"MIRACL": ["train"],
+},
)
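Two conventions are visible in the colbert entries above: an inline comment flags an uncertain split ("MSMARCO": ["train"], # dev?), and an empty list records a dataset used without a matching MTEB split ("DuRetrieval": []). A hypothetical lint pass over such dicts (the KNOWN_SPLITS whitelist is an assumption, not an mteb constant) could catch split-name typos:

KNOWN_SPLITS = {"train", "validation", "dev", "test"}  # assumed whitelist


def unknown_splits(training_datasets: dict[str, list[str]]) -> list[str]:
    """Return 'task: split' strings for splits outside KNOWN_SPLITS."""
    return [
        f"{task}: {split}"
        for task, splits in training_datasets.items()
        for split in splits
        if split not in KNOWN_SPLITS
    ]


print(unknown_splits({"MSMARCO": ["train"], "DuRetrieval": [], "MIRACL": ["trn"]}))
# -> ['MIRACL: trn']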
67 changes: 63 additions & 4 deletions mteb/models/ibm_granite_models.py
@@ -20,6 +20,65 @@
"zho_Hans",
]

+granite_training_data = {
+# Multilingual MC4
+# Multilingual Webhose
+# English Wikipedia
+# Multilingual Wikimedia
+"WikipediaRetrievalMultilingual": [],
+"WikipediaRerankingMultilingual": [],
+# Miracl Corpus (Title-Body)
+# Stack Exchange Duplicate questions (titles)
+# Stack Exchange Duplicate questions (titles)
+# Stack Exchange Duplicate questions (bodies)
+"StackOverflowDupQuestions": [],
+"AskUbuntuDupQuestions": [],
+# Stack Exchange (Title, Answer) pairs
+# Stack Exchange (Title, Body) pairs
+# Stack Exchange (Title, Body) pairs
+# Machine Translations of Stack Exchange Duplicate questions (titles)
+# Machine Translations of Stack Exchange (Title+Body, Answer) pairs
+"StackExchangeClusteringP2P": [],
+"StackExchangeClusteringP2P.v2": [],
+"StackExchangeClustering": [],
+"StackExchangeClustering.v2": [],
+# SearchQA
+# S2ORC (Title, Abstract)
+# WikiAnswers Duplicate question pairs
+# CCNews
+# XSum
+# SimpleWiki
+# Machine Translated Cross Lingual Parallel Corpora
+# SPECTER citation triplets
+# Machine Translations of SPECTER citation triplets
+# Natural Questions (NQ)
+"NQ": ["test"],
+"NQHardNegatives": ["test"],
+# SQuAD2.0
+# HotpotQA
+"HotpotQA": ["test"],
+"HotpotQAHardNegatives": ["test"],
+"HotpotQA-PL": ["test"], # translated from HotpotQA (not trained on)
+# Fever
+"FEVER": ["test"],
+"FEVERHardNegatives": ["test"],
+# PubMed
+# Multilingual Miracl Triples
+"MIRACLRetrieval": ["train"],
+"MIRACLRetrievalHardNegatives": ["train"],
+"MIRACLReranking": ["train"],
+# Multilingual MrTydi Triples
+"MrTidyRetrieval": ["train"],
+# Sadeem Question Answering
+# DBPedia Title-Body Pairs
+"DBPedia": ["train"],
+# Synthetic: English Query-Wikipedia Passage
+# Synthetic: English Fact Verification
+# Synthetic: Multilingual Query-Wikipedia Passage
+# Synthetic: Multilingual News Summaries
+# IBM Internal Triples
+# IBM Internal Title-Body Pairs
+}

granite_107m_multilingual = ModelMeta(
loader=partial( # type: ignore
@@ -44,7 +103,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
-training_datasets=None,
+training_datasets=granite_training_data,
)

granite_278m_multilingual = ModelMeta(
@@ -70,7 +129,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
-training_datasets=None,
+training_datasets=granite_training_data,
)

granite_30m_english = ModelMeta(
@@ -96,7 +155,7 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
-training_datasets=None,
+training_datasets=granite_training_data,
)

granite_125m_english = ModelMeta(
@@ -122,5 +181,5 @@
public_training_code=None,
public_training_data=None,
use_instructions=False,
-training_datasets=None,
+training_datasets=granite_training_data,
)
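All four granite ModelMeta entries above share the single granite_training_data dict, so the annotation is written once and stays in sync; the flip side is that a per-model tweak must copy the dict first. A minimal sketch of that caveat (the extra HotpotQA entry is illustrative only):

granite_training_data: dict[str, list[str]] = {
    "NQ": ["test"],
    "MIRACLRetrieval": ["train"],
    "MrTidyRetrieval": ["train"],
}

# Copy before customising, otherwise every model sharing the dict changes:
granite_30m_english_data = {**granite_training_data, "HotpotQA": ["test"]}

assert "HotpotQA" not in granite_training_data  # shared dict left untouched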
18 changes: 17 additions & 1 deletion mteb/models/jina_models.py
@@ -222,9 +222,25 @@ def encode(
framework=["Sentence Transformers", "PyTorch"],
use_instructions=True,
reference="https://huggingface.co/jinaai/jina-embeddings-v3",
-training_datasets=None,
public_training_code=None,
public_training_data=None,
+training_datasets={
+# CulturaX
+"STS12": [],
+# "SICK": [],
+# "WMT19": [],
+# "MADLAD-3B": [],
+# NLI
+"MSMARCO": ["train"],
+"MSMARCOHardNegatives": ["train"],
+"NanoMSMARCORetrieval": ["train"],
+"NQ": ["train"],
+"NQHardNegatives": ["train"],
+"NanoNQRetrieval": ["train"],
+"NQ-PL": ["train"], # translation not trained on
+# oasst1, oasst2
+},
adapted_from="XLM-RoBERTa",
)


4 changes: 2 additions & 2 deletions mteb/models/misc_models.py
Expand Up @@ -7,7 +7,7 @@
from mteb.model_meta import ModelMeta, sentence_transformers_loader
from mteb.models.e5_models import E5_TRAINING_DATA

-from .bge_models import bge_m_training_data, bge_training_data
+from .bge_models import bge_m3_training_data, bge_training_data
from .sentence_transformers_models import sent_trf_training_dataset

Haon_Chen__speed_embedding_7b_instruct = ModelMeta(
@@ -1445,7 +1445,7 @@
reference="https://huggingface.co/deepvk/USER-bge-m3",
similarity_fn_name="cosine",
use_instructions=None,
-training_datasets=bge_m_training_data, # derived from.
+training_datasets=bge_m3_training_data, # derived from bge-m3
# not in MTEB:
# "deepvk/ru-HNP": ["train"],
# "deepvk/ru-WANLI": ["train"],
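The misc_models.py hunk shows why the rename from bge_m_training_data to bge_m3_training_data has to propagate: other modules import these dicts by name. A quick smoke test of the shared constant (assuming an mteb install that already includes this commit):

# Assumes mteb at or after this commit is installed.
from mteb.models.bge_models import bge_m3_training_data

print(sorted(bge_m3_training_data)[:5])  # a few annotated task names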