From 8b68dbde4ab85fc6b790165f7dab77b743fe4274 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Sun, 26 Jan 2025 13:36:21 +0000 Subject: [PATCH 1/3] add blip and blip 2 model meta --- mteb/models/blip2_models.py | 40 +++++--- mteb/models/blip_models.py | 188 +++++++++++++++++++++--------------- 2 files changed, 132 insertions(+), 96 deletions(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index af15e205c..a04499117 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -225,18 +225,22 @@ def get_fused_embeddings( revision="51572668da0eb669e01a189dc22abe6088589a24", release_date="2024-03-22", modalities=["image", "text"], - n_parameters=None, + n_parameters=3_740_000_000, max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + embed_dim=768, + license="mit", + open_weights=True, + public_training_code="https://github.com/salesforce/LAVIS/tree/main/projects/blip2", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip2-opt-2.7b", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # COCO + # CC3M+CC12M+SBU + # LAION400M + ], ) blip2_opt_6_7b_coco = ModelMeta( @@ -249,18 +253,22 @@ def get_fused_embeddings( revision="0d580de59320a25a4d2c386387bcef310d5f286e", release_date="2024-03-31", modalities=["image", "text"], - n_parameters=None, + n_parameters=7_750_000_000, max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, + embed_dim=768, + license="mit", + open_weights=True, + public_training_code="https://github.com/salesforce/LAVIS/tree/main/projects/blip2", public_training_data=None, framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip2-opt-2.7b", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # COCO + # CC3M+CC12M+SBU + # LAION400M + ], ) diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py index d1670af4d..9fbd22d99 100644 --- a/mteb/models/blip_models.py +++ b/mteb/models/blip_models.py @@ -164,18 +164,22 @@ def get_fused_embeddings( revision="2227ac38c9f16105cb0412e7cab4759978a8fd90", release_date="2023-12-07", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, - public_training_data=None, + n_parameters=470_000_000, + max_tokens=512, + embed_dim=768, + license="bsd-3-clause", + open_weights=True, + public_training_code="https://github.com/salesforce/BLIP", + public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # COCO + # CC3M+CC12M+SBU + # LAION115M + ], ) blip_image_captioning_base = ModelMeta( @@ -188,18 +192,22 @@ def get_fused_embeddings( revision="89b09ea1789f7addf2f6d6f0dfc4ce10ab58ef84", release_date="2023-08-01", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, - public_training_data=None, + n_parameters=247_000_000, + max_tokens=512, + embed_dim=768, + license="bsd-3-clause", + open_weights=True, + 
public_training_code="https://github.com/salesforce/BLIP", + public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # COCO + # CC3M+CC12M+SBU + # LAION115M + ], ) @@ -213,18 +221,21 @@ def get_fused_embeddings( revision="c7df8e7cd7aa2ee9af18f56e2b29e59a92651b64", release_date="2023-12-07", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, - public_training_data=None, + n_parameters=247_000_000, + max_tokens=512, + embed_dim=768, + license="bsd-3-clause", + open_weights=True, + public_training_code="https://github.com/salesforce/BLIP", + public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # CC3M+CC12M+SBU + # LAION115M + ], ) blip_vqa_capfilt_large = ModelMeta( @@ -237,18 +248,21 @@ def get_fused_embeddings( revision="e53f95265aeab69013fabb5380500ab984adbbb4", release_date="2023-01-22", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, - public_training_data=None, + n_parameters=247_000_000, + max_tokens=512, + embed_dim=768, + license="bsd-3-clause", + open_weights=True, + public_training_code="https://github.com/salesforce/BLIP", + public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # CC3M+CC12M+SBU + # LAION115M + ], ) blip_itm_base_coco = ModelMeta( @@ -261,18 +275,21 @@ def get_fused_embeddings( revision="7eaa90c11850c0b17fc38c6a11e7d88bd6ac231f", release_date="2023-08-01", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, - public_training_data=None, + n_parameters=247_000_000, + max_tokens=512, + embed_dim=768, + license="bsd-3-clause", + open_weights=True, + public_training_code="https://github.com/salesforce/BLIP", + public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # CC3M+CC12M+SBU + # LAION115M + ], ) blip_itm_large_coco = ModelMeta( @@ -285,18 +302,22 @@ def get_fused_embeddings( revision="fef05cafc05298067cbbca00b125749394a77a6f", release_date="2023-08-01", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, - public_training_data=None, + n_parameters=470_000_000, + max_tokens=512, + embed_dim=768, + license="bsd-3-clause", + open_weights=True, + public_training_code="https://github.com/salesforce/BLIP", + public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - 
reference=None, + reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # COCO + # CC3M+CC12M+SBU + # LAION115M + ], ) blip_itm_base_flickr = ModelMeta( @@ -309,18 +330,22 @@ def get_fused_embeddings( revision="1de29e660d91ae1786c1876212ea805a22eab251", release_date="2023-08-01", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, - public_training_data=None, + n_parameters=247_000_000, + max_tokens=512, + embed_dim=768, + license="bsd-3-clause", + open_weights=True, + public_training_code="https://github.com/salesforce/BLIP", + public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # CC3M+CC12M+SBU + # LAION115M + # Flickr30k + ], ) blip_itm_large_flickr = ModelMeta( @@ -333,18 +358,21 @@ def get_fused_embeddings( revision="bda12e6506758f54261b5ab174b2c55a3ba143fb", release_date="2023-08-01", modalities=["image", "text"], - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=None, - public_training_code=None, - public_training_data=None, + n_parameters=470_000_000, + max_tokens=512, + embed_dim=768, + license="bsd-3-clause", + open_weights=True, + public_training_code="https://github.com/salesforce/BLIP", + public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference=None, + reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, - use_instructions=None, - training_datasets=None, + use_instructions=False, + training_datasets=[ + # CC3M+CC12M+SBU + # LAION115M + ], ) From 9b2b1b4b3859394c47143f4030e8c436e82480b8 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Sun, 26 Jan 2025 13:57:24 +0000 Subject: [PATCH 2/3] fix references --- mteb/models/blip2_models.py | 2 +- mteb/models/blip_models.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index a04499117..ea5a7a0ae 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -261,7 +261,7 @@ def get_fused_embeddings( public_training_code="https://github.com/salesforce/LAVIS/tree/main/projects/blip2", public_training_data=None, framework=["PyTorch"], - reference="https://huggingface.co/Salesforce/blip2-opt-2.7b", + reference="https://huggingface.co/Salesforce/blip2-opt-6.7b-coco", similarity_fn_name=None, use_instructions=False, training_datasets=[ diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py index 9fbd22d99..18ace2d0f 100644 --- a/mteb/models/blip_models.py +++ b/mteb/models/blip_models.py @@ -200,7 +200,7 @@ def get_fused_embeddings( public_training_code="https://github.com/salesforce/BLIP", public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference="https://huggingface.co/Salesforce/blip-image-captioning-large", + reference="https://huggingface.co/Salesforce/blip-image-captioning-base", similarity_fn_name=None, use_instructions=False, training_datasets=[ @@ -229,7 +229,7 @@ def get_fused_embeddings( public_training_code="https://github.com/salesforce/BLIP", 
public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference="https://huggingface.co/Salesforce/blip-image-captioning-large", + reference="https://huggingface.co/Salesforce/blip-vqa-base", similarity_fn_name=None, use_instructions=False, training_datasets=[ @@ -256,7 +256,7 @@ def get_fused_embeddings( public_training_code="https://github.com/salesforce/BLIP", public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference="https://huggingface.co/Salesforce/blip-image-captioning-large", + reference="https://huggingface.co/Salesforce/blip-vqa-capfilt-large", similarity_fn_name=None, use_instructions=False, training_datasets=[ @@ -283,7 +283,7 @@ def get_fused_embeddings( public_training_code="https://github.com/salesforce/BLIP", public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference="https://huggingface.co/Salesforce/blip-image-captioning-large", + reference="https://huggingface.co/Salesforce/blip-itm-base-coco", similarity_fn_name=None, use_instructions=False, training_datasets=[ @@ -310,7 +310,7 @@ def get_fused_embeddings( public_training_code="https://github.com/salesforce/BLIP", public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference="https://huggingface.co/Salesforce/blip-image-captioning-large", + reference="https://huggingface.co/Salesforce/blip-itm-large-coco", similarity_fn_name=None, use_instructions=False, training_datasets=[ @@ -338,7 +338,7 @@ def get_fused_embeddings( public_training_code="https://github.com/salesforce/BLIP", public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference="https://huggingface.co/Salesforce/blip-image-captioning-large", + reference="https://huggingface.co/Salesforce/blip-itm-base-flickr", similarity_fn_name=None, use_instructions=False, training_datasets=[ @@ -366,7 +366,7 @@ def get_fused_embeddings( public_training_code="https://github.com/salesforce/BLIP", public_training_data="https://github.com/salesforce/BLIP", framework=["PyTorch"], - reference="https://huggingface.co/Salesforce/blip-image-captioning-large", + reference="https://huggingface.co/Salesforce/blip-itm-large-flickr", similarity_fn_name=None, use_instructions=False, training_datasets=[ From 7a384dd201b70c82d2d1b394193b53e5609b3a04 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Sun, 26 Jan 2025 14:31:38 +0000 Subject: [PATCH 3/3] fix training datasets --- mteb/models/blip2_models.py | 18 ++++++++---------- mteb/models/blip_models.py | 32 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/mteb/models/blip2_models.py b/mteb/models/blip2_models.py index ea5a7a0ae..9cb914b15 100644 --- a/mteb/models/blip2_models.py +++ b/mteb/models/blip2_models.py @@ -215,6 +215,12 @@ def get_fused_embeddings( return BLIP2ModelWrapper(**kwargs) +blip2_training_datasets = { + # COCO + # CC3M+CC12M+SBU + # LAION400M +} + blip2_opt_2_7b = ModelMeta( loader=partial( blip2_loader, @@ -236,11 +242,7 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip2-opt-2.7b", similarity_fn_name=None, use_instructions=False, - training_datasets=[ - # COCO - # CC3M+CC12M+SBU - # LAION400M - ], + training_datasets=blip2_training_datasets, ) blip2_opt_6_7b_coco = ModelMeta( @@ -264,11 +266,7 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip2-opt-6.7b-coco", similarity_fn_name=None, use_instructions=False, - training_datasets=[ - # COCO - # 
CC3M+CC12M+SBU - # LAION400M - ], + training_datasets=blip2_training_datasets, ) diff --git a/mteb/models/blip_models.py b/mteb/models/blip_models.py index 18ace2d0f..da23a2be3 100644 --- a/mteb/models/blip_models.py +++ b/mteb/models/blip_models.py @@ -175,11 +175,11 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip-image-captioning-large", similarity_fn_name=None, use_instructions=False, - training_datasets=[ + training_datasets={ # COCO # CC3M+CC12M+SBU # LAION115M - ], + }, ) blip_image_captioning_base = ModelMeta( @@ -203,11 +203,11 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip-image-captioning-base", similarity_fn_name=None, use_instructions=False, - training_datasets=[ + training_datasets={ # COCO # CC3M+CC12M+SBU # LAION115M - ], + }, ) @@ -232,10 +232,10 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip-vqa-base", similarity_fn_name=None, use_instructions=False, - training_datasets=[ + training_datasets={ # CC3M+CC12M+SBU # LAION115M - ], + }, ) blip_vqa_capfilt_large = ModelMeta( @@ -259,10 +259,10 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip-vqa-capfilt-large", similarity_fn_name=None, use_instructions=False, - training_datasets=[ + training_datasets={ # CC3M+CC12M+SBU # LAION115M - ], + }, ) blip_itm_base_coco = ModelMeta( @@ -286,10 +286,10 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip-itm-base-coco", similarity_fn_name=None, use_instructions=False, - training_datasets=[ + training_datasets={ # CC3M+CC12M+SBU # LAION115M - ], + }, ) blip_itm_large_coco = ModelMeta( @@ -313,11 +313,11 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip-itm-large-coco", similarity_fn_name=None, use_instructions=False, - training_datasets=[ + training_datasets={ # COCO # CC3M+CC12M+SBU # LAION115M - ], + }, ) blip_itm_base_flickr = ModelMeta( @@ -341,11 +341,11 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip-itm-base-flickr", similarity_fn_name=None, use_instructions=False, - training_datasets=[ + training_datasets={ # CC3M+CC12M+SBU # LAION115M # Flickr30k - ], + }, ) blip_itm_large_flickr = ModelMeta( @@ -369,10 +369,10 @@ def get_fused_embeddings( reference="https://huggingface.co/Salesforce/blip-itm-large-flickr", similarity_fn_name=None, use_instructions=False, - training_datasets=[ + training_datasets={ # CC3M+CC12M+SBU # LAION115M - ], + }, )
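
As a quick sanity check on the metadata filled in above, the entries can be read back from the registry once the patch series is applied. The sketch below is not part of the patch; it assumes mteb exposes get_model_meta at the top level and that the entry shown is registered under the Hugging Face repo id "Salesforce/blip-itm-base-coco" (the name= kwargs sit outside the hunks shown here, so the exact identifier is an assumption).

import mteb

# Hypothetical identifier: the name= field is not visible in the hunks above.
meta = mteb.get_model_meta("Salesforce/blip-itm-base-coco")

# These attributes mirror the kwargs set in the patch.
print(meta.n_parameters)       # 247_000_000
print(meta.embed_dim)          # 768
print(meta.license)            # "bsd-3-clause"
print(meta.training_datasets)  # {} – the pretraining corpora are documented as comments,
                               # not as MTEB dataset keys

Because the shared blip2_training_datasets dict and the per-model training_datasets dicts contain only comments, they evaluate to empty dicts rather than None, which distinguishes "annotated, no overlapping MTEB datasets" from "not yet annotated".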