fix: nomic prompts #1685

Merged · 8 commits · Jan 3, 2025 · Changes from all commits
51 changes: 33 additions & 18 deletions mteb/models/nomic_models.py
@@ -7,17 +7,17 @@
import numpy as np
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

import mteb
from mteb.encoder_interface import PromptType
from mteb.model_meta import ModelMeta

from .wrapper import Wrapper
from .sentence_transformer_wrapper import SentenceTransformerWrapper

logger = logging.getLogger(__name__)


class NomicWrapper(Wrapper):
class NomicWrapper(SentenceTransformerWrapper):
"""following the hf model card documentation."""

def __init__(
@@ -28,10 +28,7 @@ def __init__(
**kwargs: Any,
):
self.model_name = model_name
self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
self.model_prompts = (
self.validate_task_to_prompt_name(model_prompts) if model_prompts else None
)
super().__init__(model_name, revision, model_prompts, **kwargs)

def to(self, device: torch.device) -> None:
self.model.to(device)
@@ -45,33 +42,51 @@ def encode( # type: ignore
batch_size: int = 32,
**kwargs: Any,
) -> np.ndarray:
input_type = self.get_prompt_name(self.model_prompts, task_name, prompt_type)

# default to search_document if input_type and prompt_name are not provided
if input_type is None:
input_type = "search_document"

sentences = [f"{input_type}: {sentence}" for sentence in sentences]

emb = self.model.encode(sentences, batch_size=batch_size, **kwargs)
prompt_name = (
self.get_prompt_name(self.model_prompts, task_name, prompt_type)
or PromptType.passage.value
)
task = mteb.get_task(task_name)
# normalization not applied to classification
# https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L172
normalize = task.metadata.type not in (
"Classification",
"MultilabelClassification",
"PairClassification",
"Reranking",
"STS",
"Summarization",
)
emb = self.model.encode(
sentences,
prompt_name=prompt_name,
batch_size=batch_size,
**kwargs,
)
# v1.5 has a non-trainable layer norm to unit normalize the embeddings for binary quantization
# the outputs are similar to if we just normalized but keeping the same for consistency
if self.model_name == "nomic-ai/nomic-embed-text-v1.5":
if not isinstance(emb, torch.Tensor):
emb = torch.tensor(emb)
emb = F.layer_norm(emb, normalized_shape=(emb.shape[1],))
emb = F.normalize(emb, p=2, dim=1)
if normalize:
emb = F.normalize(emb, p=2, dim=1)

if isinstance(emb, torch.Tensor):
emb = emb.cpu().detach().float().numpy()

return emb


# https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L142-L159
model_prompts = {
"Classification": "classification: ",
"MultilabelClassification": "classification: ",
"Clustering": "clustering: ",
"PairClassification": "classification: ",
"Reranking": "classification: ",
"STS": "classification: ",
"Summarization": "classification: ",
PromptType.query.value: "search_query: ",
PromptType.passage.value: "search_document: ",
}
@@ -155,7 +170,7 @@ def encode( # type: ignore
)


nomic_embed_v1_ablated = ModelMeta(
nomic_embed_v1_unsupervised = ModelMeta(
loader=partial( # type: ignore
NomicWrapper,
trust_remote_code=True,
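For context (not part of the diff): a minimal sketch of how the prompt routing changed in this PR could be exercised end to end through the public mteb API. The task names below are arbitrary examples, and the exact get_model / get_tasks / MTEB calls are assumed from the mteb version current around the time of this PR.

import mteb

# Load the registered ModelMeta, which wires NomicWrapper together with the
# model_prompts mapping shown above.
model = mteb.get_model("nomic-ai/nomic-embed-text-v1.5")

# Example tasks (placeholders): a Classification task should resolve to the
# "classification: " prompt and skip L2 normalization, while a Retrieval task
# uses the "search_query: " / "search_document: " prompts and keeps it.
tasks = mteb.get_tasks(tasks=["Banking77Classification", "NFCorpus"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model)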