fix: Adding missing model meta (#1856)
* Added CDE models

* Added bge-en-icl

* Updated CDE to bge_full_data

* Fixed public_training_data flag type to include boolean, as this is how all models are annotated

* Added public training data link instead of bool to CDE and BGE

* Added GME models

* Changed Torch to PyTorch

* Added metadata on LENS models

* Added ember_v1

* Added metadata for amazon titan

* Removed GME implementation
x-tabdeveloping authored Jan 22, 2025
1 parent fde446d commit 692bd26
Showing 6 changed files with 292 additions and 0 deletions.
82 changes: 82 additions & 0 deletions mteb/models/bge_models.py
@@ -4,6 +4,8 @@

from mteb.model_meta import ModelMeta, sentence_transformers_loader

from .e5_instruct import E5_MISTRAL_TRAINING_DATA

model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"}

@@ -496,3 +498,83 @@
public_training_data=None,
training_datasets=None, # not disclosed
)

# Contents of cfli/bge-full-data
bge_full_data = {
# source: https://arxiv.org/pdf/2409.15700
# Charles Goodhart is turning back and forth
# in his grave as I'm annotating this
# |Retrieval|
# ELI5
# SQuAD
# TriviaQA
# QuoraDuplicateQuestions
"HotpotQA": ["train"],
"FEVER": ["train"],
"MSMARCO": ["train"],
"NQ": ["train"],
"ArguAna": ["train"],
"FiQA2018": ["train"],
# |Reranking|
"SciDocsReranking": ["train"],
"StackOverflowDupQuestions": ["train"],
# |Classification|
"AmazonReviewsClassification": ["train"],
"AmazonCounterfactualClassification": ["train"],
"Banking77Classification": ["train"],
"EmotionClassification": ["train"],
"TweetSentimentExtractionClassification": ["train"],
"MTOPIntentClassification": ["train"],
"ImdbClassification": ["train"],
"ToxicConversationsClassification": ["train"],
# |Clustering|
"ArxivClusteringS2S": ["train"],
"ArxivClusteringP2P": ["train"],
"BiorxivClusteringS2S": ["train"],
"BiorxivClusteringP2P": ["train"],
"MedrxivClusteringS2S": ["train"],
"MedrxivClusteringP2P": ["train"],
"BiorxivClusteringS2S.v2": ["train"],
"BiorxivClusteringP2P.v2": ["train"],
"MedrxivClusteringS2S.v2": ["train"],
"MedrxivClusteringP2P.v2": ["train"],
"RedditClusteringP2P": ["train"],
"RedditClustering": ["train"],
"RedditClustering.v2": ["train"],
"TwentyNewsgroupsClustering": ["train"],
"TwentyNewsgroupsClustering.v2": ["train"],
# |STS|
"STS22": ["train"],
"STS22.v2": ["train"],
"STSBenchmark": ["train"],
}
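
The dict above doubles as a machine-readable leakage annotation, and it is shared with other model metas (cde_models.py below imports it). As a minimal sketch of how such an annotation can be used — not part of this diff — the snippet below intersects it with a benchmark's task list; benchmark_tasks and overlapping_tasks are illustrative names, not mteb API:

# Illustrative sketch, not part of the committed file: flag benchmark
# tasks whose train split is annotated as seen during training.
benchmark_tasks = ["HotpotQA", "SciFact", "STSBenchmark"]

def overlapping_tasks(
    training_datasets: dict[str, list[str]], tasks: list[str]
) -> set[str]:
    return {t for t in tasks if "train" in training_datasets.get(t, [])}

print(overlapping_tasks(bge_full_data, benchmark_tasks))
# {'HotpotQA', 'STSBenchmark'} (set order may vary)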

bge_en_icl = ModelMeta(
loader=partial(
sentence_transformers_loader,
model_name="BAAI/bge-en-icl",
revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5",
),
name="BAAI/bge-en-icl",
languages=[
"eng_Latn",
],
open_weights=True,
revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5",
release_date="2024-07-25", # initial commit of hf model.
n_parameters=int(7.11 * 1e9),
embed_dim=4096,
license="apache-2",
max_tokens=32768,
reference="https://huggingface.co/BAAI/bge-en-icl",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_code="https://github.com/FlagOpen/FlagEmbedding",
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
training_datasets={
**E5_MISTRAL_TRAINING_DATA,
**bge_full_data,
},
adapted_from="intfloat/e5-mistral-7b-instruct",
)
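
Since training_datasets merges E5_MISTRAL_TRAINING_DATA with bge_full_data (later keys win on collisions, standard dict-unpacking semantics), the registered meta carries annotations from both fine-tuning stages. A hedged usage sketch, assuming mteb exposes get_model_meta() as in recent releases:

# Assumes mteb.get_model_meta() resolves registered metas by name.
import mteb

meta = mteb.get_model_meta("BAAI/bge-en-icl")
print(meta.embed_dim)                               # 4096
print("MSMARCO" in (meta.training_datasets or {}))  # True, via bge_full_data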
54 changes: 54 additions & 0 deletions mteb/models/cde_models.py
@@ -0,0 +1,54 @@
from __future__ import annotations

import logging

from mteb.model_meta import ModelMeta

from .bge_models import bge_full_data

logger = logging.getLogger(__name__)


cde_small_v1 = ModelMeta(
loader=None,  # I will leave this at None for now
name="jxm/cde-small-v1",
languages=["eng_Latn"],
open_weights=True,
revision="8d5736163718a8b65cd787b75ed61020d18bad3c",
release_date="2024-09-24",
n_parameters=int(281 * 1e6), # Though the second-stage model is only 140M
max_tokens=512,
embed_dim=768,
license="mit",
similarity_fn_name="cosine",
framework=["Sentence Transformers"],
reference="https://huggingface.co/jxm/cde-small-v1",
use_instructions=True,
adapted_from="nomic-ai/nomic-bert-2048",
superseded_by="jxm/cde-small-v2",
training_datasets=bge_full_data,
public_training_code="https://github.com/jxmorris12/cde",
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
)

cde_small_v2 = ModelMeta(
loader=None,  # I will leave this at None for now
name="jxm/cde-small-v2",
languages=["eng_Latn"],
open_weights=True,
revision="a7e5882ad52c27ea2831fc8258f24379c25cb459",
release_date="2025-01-13",
n_parameters=int(306 * 1e6), # Though the second-stage model is only 140M
max_tokens=512,
embed_dim=768,
license="mit",
similarity_fn_name="cosine",
framework=["Sentence Transformers"],
reference="https://huggingface.co/jxm/cde-small-v1",
use_instructions=True,
adapted_from="answerdotai/ModernBERT-base",
superseded_by="jxm/cde-small-v2",
training_datasets=bge_full_data,
public_training_code="https://github.com/jxmorris12/cde",
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
)
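
The adapted_from/superseded_by fields above form a small lineage graph across metas. An illustrative sketch — not part of this diff — that walks superseded_by to the newest release; it assumes mteb.get_model_meta() as in recent releases and relies on v2 terminating the chain with superseded_by=None:

import mteb

def latest_version(name: str) -> str:
    # Follow superseded_by links until a model has no successor.
    meta = mteb.get_model_meta(name)
    while meta.superseded_by is not None:
        meta = mteb.get_model_meta(meta.superseded_by)
    return meta.name

print(latest_version("jxm/cde-small-v1"))  # "jxm/cde-small-v2"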
63 changes: 63 additions & 0 deletions mteb/models/gme_models.py
@@ -0,0 +1,63 @@
from __future__ import annotations

import logging
from functools import partial

from mteb.model_meta import ModelMeta

logger = logging.getLogger(__name__)


gme_qwen2_vl_2b_instruct = ModelMeta(
loader=None,
name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
languages=["eng_Latn"],
open_weights=True,
revision="cfeb66885b598de483cc04eb08c7d9da534d7afe",
release_date="2024-12-21",
n_parameters=int(2.21 * 1e9),
max_tokens=32768,
embed_dim=1536,
license="mit",
similarity_fn_name="cosine",
framework=["PyTorch"],
reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
use_instructions=True,
adapted_from=None,
superseded_by=None,
training_datasets={
# Only annotating text data for now
# source: https://arxiv.org/pdf/2412.16855
"MSMARCO": ["train"],
"MSMARCO.v2": ["train"],
},
public_training_code=None,
public_training_data=None,
)

gme_qwen2_vl_7b_instruct = ModelMeta(
loader=None,
name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
languages=["eng_Latn"],
open_weights=True,
revision="d42eca5a540526cfa982a349724b24b25c12a95e",
release_date="2024-12-21",
n_parameters=int(8.29 * 1e9),
max_tokens=32768,
embed_dim=3584,
license="mit",
similarity_fn_name="cosine",
framework=["PyTorch"],
reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
use_instructions=True,
adapted_from=None,
superseded_by=None,
training_datasets={
# Only annotating text data for now
# source: https://arxiv.org/pdf/2412.16855
"MSMARCO": ["train"],
"MSMARCO.v2": ["train"],
},
public_training_code=None,
public_training_data=None,
)
49 changes: 49 additions & 0 deletions mteb/models/lens_models.py
@@ -0,0 +1,49 @@
from __future__ import annotations

from functools import partial

import torch

from mteb.encoder_interface import PromptType
from mteb.model_meta import ModelMeta, sentence_transformers_loader
from mteb.models.instruct_wrapper import instruct_wrapper

lens_d4000 = ModelMeta(
loader=None, # TODO: implement this in the future
name="yibinlei/LENS-d4000",
languages=None,
open_weights=True,
revision="e473b33364e6c48a324796fd1411d3b93670c6fe",
release_date="2025-01-17",
n_parameters=int(7.11 * 1e9),
embed_dim=4000,
license="apache-2.0",
reference="https://huggingface.co/yibinlei/LENS-d4000",
similarity_fn_name="cosine",
framework=["PyTorch"],
use_instructions=True,
public_training_code=None,
public_training_data=None,
training_datasets=None,
max_tokens=32768,
)

lens_d8000 = ModelMeta(
loader=None, # TODO: implement this in the future
name="yibinlei/LENS-d8000",
languages=None,
open_weights=True,
revision="a0b87bd91cb27b6f2f0b0fe22c28026da1d464ef",
release_date="2025-01-17",
n_parameters=int(7.11 * 1e9),
embed_dim=8000,
license="apache-2.0",
reference="https://huggingface.co/yibinlei/LENS-d8000",
similarity_fn_name="cosine",
framework=["PyTorch"],
use_instructions=True,
public_training_code=None,
public_training_data=None,
training_datasets=None,
max_tokens=32768,
)
38 changes: 38 additions & 0 deletions mteb/models/misc_models.py
@@ -1737,3 +1737,41 @@
training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage
superseded_by=None,
)
ember_v1 = ModelMeta(
name="llmrails/ember-v1",
revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d",
release_date="2023-10-10",
languages=["eng_Latn"],
n_parameters=int(335 * 1e6),
max_tokens=512,
embed_dim=1024,
license="mit",
open_weights=True,
public_training_code=None,
public_training_data=None,
framework=["PyTorch", "Sentence Transformers"],
reference="https://huggingface.co/llmrails/ember-v1",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None,
superseded_by=None,
)
amazon_titan_text_embeddings_v2 = ModelMeta(
name="amazon/Titan-text-embeddings-v2",
revision="1",
release_date="2024-04-30",
languages=["eng_Latn"],
n_parameters=None,
max_tokens=None,
embed_dim=None,
license="proprietary",
open_weights=False,
public_training_code=None,
public_training_data=None,
framework=[],
reference="https://huggingface.co/amazon/Titan-text-embeddings-v2",
similarity_fn_name="cosine",
use_instructions=False,
training_datasets=None,
superseded_by=None,
)
6 changes: 6 additions & 0 deletions mteb/models/overview.py
@@ -15,17 +15,20 @@
arctic_models,
bge_models,
bm25,
cde_models,
cohere_models,
colbert_models,
e5_instruct,
e5_models,
gme_models,
google_models,
gritlm_models,
gte_models,
ibm_granite_models,
inf_models,
jasper_models,
jina_models,
lens_models,
linq_models,
llm2vec_models,
misc_models,
@@ -56,6 +59,7 @@
arctic_models,
bge_models,
bm25,
cde_models,
cohere_models,
colbert_models,
e5_instruct,
@@ -64,9 +68,11 @@
gme_models,
google_models,
gritlm_models,
gte_models,
ibm_granite_models,
inf_models,
jina_models,
lens_models,
linq_models,
llm2vec_models,
mxbai_models,
