From d4b3aee18d5f1fbd98ae6dfd7c61fc0bbd7c37f7 Mon Sep 17 00:00:00 2001 From: le zhang Date: Sat, 23 Nov 2024 10:21:36 -0500 Subject: [PATCH] update readme --- evaluation/eval_knn_aimv2.py | 239 +++++++++++++++++++++++++++++++++++ model/sail_model.py | 2 +- model/vision_model.py | 41 +++--- readme.md | 20 +-- scripts/alignment_probing.sh | 4 +- scripts/encode.sh | 6 +- slurm-5671467.out | 106 ++++++++++++++++ 7 files changed, 377 insertions(+), 41 deletions(-) create mode 100644 evaluation/eval_knn_aimv2.py create mode 100644 slurm-5671467.out diff --git a/evaluation/eval_knn_aimv2.py b/evaluation/eval_knn_aimv2.py new file mode 100644 index 0000000..9144416 --- /dev/null +++ b/evaluation/eval_knn_aimv2.py @@ -0,0 +1,239 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +import argparse + +import torch +from torch import nn +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torchvision import datasets +from torchvision import transforms as pth_transforms +from torchvision import models as torchvision_models +from transformers import AutoModel + +import knn_utils as utils + + +def extract_feature_pipeline(args): + # ============ preparing data ... ============ + transform = pth_transforms.Compose([ + pth_transforms.Resize(224, interpolation=3), # Corresponds to "do_resize" and "size.shortest_edge" + pth_transforms.CenterCrop(224), # Corresponds to "do_center_crop" and "crop_size" + pth_transforms.ToTensor(), # Converts to tensor + pth_transforms.Normalize( + mean=(0.48145466, 0.4578275, 0.40821073), # Corresponds to "image_mean" + std=(0.26862954, 0.26130258, 0.27577711) # Corresponds to "image_std" + ), + ]) + dataset_train = ReturnIndexDataset(os.path.join(args.data_path, "train"), transform=transform) + dataset_val = ReturnIndexDataset(os.path.join(args.data_path, "val"), transform=transform) + sampler = torch.utils.data.DistributedSampler(dataset_train, shuffle=False) + data_loader_train = torch.utils.data.DataLoader( + dataset_train, + sampler=sampler, + batch_size=args.batch_size_per_gpu, + num_workers=args.num_workers, + pin_memory=True, + drop_last=False, + ) + data_loader_val = torch.utils.data.DataLoader( + dataset_val, + batch_size=args.batch_size_per_gpu, + num_workers=args.num_workers, + pin_memory=True, + drop_last=False, + ) + print(f"Data loaded with {len(dataset_train)} train and {len(dataset_val)} val imgs.") + + # ============ building network ... ============ + + model = AutoModel.from_pretrained("apple/aimv2-large-patch14-224", trust_remote_code=True) + model.cuda() + model.eval() + + # ============ extract features ... 
============ + print("Extracting features for train set...") + train_features = extract_features(model, data_loader_train, args.use_cuda) + print("Extracting features for val set...") + test_features = extract_features(model, data_loader_val, args.use_cuda) + + if utils.get_rank() == 0: + train_features = nn.functional.normalize(train_features, dim=1, p=2) + test_features = nn.functional.normalize(test_features, dim=1, p=2) + + train_labels = torch.tensor([s[-1] for s in dataset_train.samples]).long() + test_labels = torch.tensor([s[-1] for s in dataset_val.samples]).long() + # save features and labels + if args.dump_features and dist.get_rank() == 0: + torch.save(train_features.cpu(), os.path.join(args.dump_features, "trainfeat.pth")) + torch.save(test_features.cpu(), os.path.join(args.dump_features, "testfeat.pth")) + torch.save(train_labels.cpu(), os.path.join(args.dump_features, "trainlabels.pth")) + torch.save(test_labels.cpu(), os.path.join(args.dump_features, "testlabels.pth")) + return train_features, test_features, train_labels, test_labels + + +@torch.no_grad() +def extract_features(model, data_loader, use_cuda=True, multiscale=False): + metric_logger = utils.MetricLogger(delimiter=" ") + features = None + for samples, index in metric_logger.log_every(data_loader, 10): + samples = samples.cuda(non_blocking=True) + index = index.cuda(non_blocking=True) + if multiscale: + feats = utils.multi_scale(samples, model) + else: + feats = torch.mean(model(samples).last_hidden_state,dim=1).clone() + + # init storage feature matrix + if dist.get_rank() == 0 and features is None: + features = torch.zeros(len(data_loader.dataset), feats.shape[-1]) + if use_cuda: + features = features.cuda(non_blocking=True) + print(f"Storing features into tensor of shape {features.shape}") + + # get indexes from all processes + y_all = torch.empty(dist.get_world_size(), index.size(0), dtype=index.dtype, device=index.device) + y_l = list(y_all.unbind(0)) + y_all_reduce = torch.distributed.all_gather(y_l, index, async_op=True) + y_all_reduce.wait() + index_all = torch.cat(y_l) + + # share features between processes + feats_all = torch.empty( + dist.get_world_size(), + feats.size(0), + feats.size(1), + dtype=feats.dtype, + device=feats.device, + ) + output_l = list(feats_all.unbind(0)) + output_all_reduce = torch.distributed.all_gather(output_l, feats, async_op=True) + output_all_reduce.wait() + + # update storage feature matrix + if dist.get_rank() == 0: + if use_cuda: + print("features shape:", features.shape) + print("concatenated output shape:", torch.cat(output_l).shape) + print("index_all shape:", index_all.shape) + + features.index_copy_(0, index_all, torch.cat(output_l)) + else: + features.index_copy_(0, index_all.cpu(), torch.cat(output_l).cpu()) + return features + + +@torch.no_grad() +def knn_classifier(train_features, train_labels, test_features, test_labels, k, T, num_classes=1000): + top1, top5, total = 0.0, 0.0, 0 + train_features = train_features.t() + num_test_images, num_chunks = test_labels.shape[0], 100 + imgs_per_chunk = num_test_images // num_chunks + retrieval_one_hot = torch.zeros(k, num_classes).to(train_features.device) + for idx in range(0, num_test_images, imgs_per_chunk): + # get the features for test images + features = test_features[ + idx : min((idx + imgs_per_chunk), num_test_images), : + ] + targets = test_labels[idx : min((idx + imgs_per_chunk), num_test_images)] + batch_size = targets.shape[0] + + # calculate the dot product and compute top-k neighbors + similarity = 
torch.mm(features, train_features) + distances, indices = similarity.topk(k, largest=True, sorted=True) + candidates = train_labels.view(1, -1).expand(batch_size, -1) + retrieved_neighbors = torch.gather(candidates, 1, indices) + + retrieval_one_hot.resize_(batch_size * k, num_classes).zero_() + retrieval_one_hot.scatter_(1, retrieved_neighbors.view(-1, 1), 1) + distances_transform = distances.clone().div_(T).exp_() + probs = torch.sum( + torch.mul( + retrieval_one_hot.view(batch_size, -1, num_classes), + distances_transform.view(batch_size, -1, 1), + ), + 1, + ) + _, predictions = probs.sort(1, True) + + # find the predictions that match the target + correct = predictions.eq(targets.data.view(-1, 1)) + top1 = top1 + correct.narrow(1, 0, 1).sum().item() + top5 = top5 + correct.narrow(1, 0, min(5, k)).sum().item() # top5 does not make sense if k < 5 + total += targets.size(0) + top1 = top1 * 100.0 / total + top5 = top5 * 100.0 / total + return top1, top5 + + +class ReturnIndexDataset(datasets.ImageFolder): + def __getitem__(self, idx): + img, lab = super(ReturnIndexDataset, self).__getitem__(idx) + return img, idx + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('Evaluation with weighted k-NN on ImageNet') + parser.add_argument('--batch_size_per_gpu', default=128, type=int, help='Per-GPU batch-size') + parser.add_argument('--nb_knn', default=[10, 20, 100, 200], nargs='+', type=int, + help='Number of NN to use. 20 is usually working the best.') + parser.add_argument('--temperature', default=0.07, type=float, + help='Temperature used in the voting coefficient') + parser.add_argument('--pretrained_weights', default='', type=str, help="Path to pretrained weights to evaluate.") + parser.add_argument('--use_cuda', default=True, type=utils.bool_flag, + help="Should we store the features on GPU? We recommend setting this to False if you encounter OOM") + parser.add_argument('--arch', default='vit_small', type=str, help='Architecture') + parser.add_argument('--patch_size', default=16, type=int, help='Patch resolution of the model.') + parser.add_argument("--checkpoint_key", default="teacher", type=str, + help='Key to use in the checkpoint (example: "teacher")') + parser.add_argument('--dump_features', default=None, + help='Path where to save computed features, empty for no saving') + parser.add_argument('--load_features', default=None, help="""If the features have + already been computed, where to find them.""") + parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.') + parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up + distributed training; see https://pytorch.org/docs/stable/distributed.html""") + parser.add_argument("--local-rank", default=0, type=int, help="Please ignore and do not set this argument.") + parser.add_argument('--data_path', default='/path/to/imagenet/', type=str) + args = parser.parse_args() + + utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + cudnn.benchmark = True + + if args.load_features: + train_features = torch.load(os.path.join(args.load_features, "trainfeat.pth")) + test_features = torch.load(os.path.join(args.load_features, "testfeat.pth")) + train_labels = torch.load(os.path.join(args.load_features, "trainlabels.pth")) + test_labels = torch.load(os.path.join(args.load_features, "testlabels.pth")) + else: + # need to extract features ! 
+ train_features, test_features, train_labels, test_labels = extract_feature_pipeline(args) + + if utils.get_rank() == 0: + if args.use_cuda: + train_features = train_features.cuda() + test_features = test_features.cuda() + train_labels = train_labels.cuda() + test_labels = test_labels.cuda() + + print("Features are ready!\nStart the k-NN classification.") + for k in args.nb_knn: + top1, top5 = knn_classifier(train_features, train_labels, + test_features, test_labels, k, args.temperature) + print(f"{k}-NN classifier result: Top1: {top1}, Top5: {top5}") + dist.barrier() \ No newline at end of file diff --git a/model/sail_model.py b/model/sail_model.py index 80d3095..cb52611 100644 --- a/model/sail_model.py +++ b/model/sail_model.py @@ -227,7 +227,7 @@ def __init__( super(SAILModel, self).__init__() self.text_model = SentenceEmbedding(text_model_name) self.vision_model = ImageEmbedding(vision_model_name, seg=seg, agg_mode=agg_mode) - if any(x in vision_model_name for x in ['mae','ibot','dinov1','aim','ijepa','clip']) or 'patch' in agg_mode or 'cls' in agg_mode: + if any(x in vision_model_name for x in ['mae','ibot','dinov1','ml-aim','ijepa','clip','aimv2']) or 'patch' in agg_mode or 'cls' in agg_mode: if hasattr(self.vision_model.model, 'config'): vision_dimesion = self.vision_model.model.config.hidden_size else: diff --git a/model/vision_model.py b/model/vision_model.py index 5cb56d6..99412ea 100644 --- a/model/vision_model.py +++ b/model/vision_model.py @@ -68,7 +68,7 @@ def __init__(self, model_name="facebook/dinov2-base", device=None, seg: bool = F self.agg_mode = agg_mode self.model_name = model_name - if any(x in model_name for x in ['ibot', 'mae', 'dinov1', 'aim', 'ijepa']): + if any(x in model_name for x in ['ibot', 'mae', 'dinov1', 'ml-aim', 'ijepa']): # load from local if 'ibot' in model_name: self.model = get_ibot_vit(model_name) @@ -84,7 +84,7 @@ def __init__(self, model_name="facebook/dinov2-base", device=None, seg: bool = F elif 'resnet' in model_name: self.model = torch.hub.load('facebookresearch/dino:main', 'dino_resnet50') self.model.embed_dim = 2048 - elif 'aim' in model_name: + elif 'ml-aim' in model_name: if '1B' in model_name: self.model = torch.hub.load("apple/ml-aim", "aim_1B") self.model.embed_dim = 2048 @@ -107,6 +107,9 @@ def __init__(self, model_name="facebook/dinov2-base", device=None, seg: bool = F elif any(x in model_name.lower() for x in ['clip']): self.model = CLIPVisionModel.from_pretrained(model_name, torch_dtype=torch.float16) self.image_processor = AutoImageProcessor.from_pretrained(model_name) + elif any(x in model_name.lower() for x in ['aimv2']): + self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16, trust_remote_code=True) + self.image_processor = AutoImageProcessor.from_pretrained(model_name) else: if seg: modify_vit('seg') @@ -133,13 +136,6 @@ def load_single_image(args): images[idx] = img return images - - # def load_images_from_directory(self, images_path: List[str]) -> List[Image.Image]: - # images = [] - # for image_path in images_path: - # with Image.open(image_path) as img: - # images.append(img.convert("RGB")) - # return images def get_visual_embeddings_from_directory(self, images_path: List[str]): images = self.load_images_from_directory(images_path) @@ -161,7 +157,7 @@ def forward(self, inputs, patch_mode=False, attetion_type='qk', ignore_residual= self.model.encoder.attetion_type = attetion_type self.model.encoder.ignore_residual = ignore_residual - if any(x in self.model_name.lower() for x in ['mae', 
'convnextv2']): + if any(x in self.model_name.lower() for x in ['mae']): if isinstance(inputs, torch.Tensor): outputs = self.model.forward_features(inputs) else: @@ -170,43 +166,46 @@ def forward(self, inputs, patch_mode=False, attetion_type='qk', ignore_residual= if isinstance(inputs, torch.Tensor): outputs = self.model(inputs) elif isinstance(inputs, dict) or isinstance(inputs, BaseBatchFeature): - if any(x in self.model_name.lower() for x in ['ibot', 'dinov1', 'aim', 'ijepa']): + if any(x in self.model_name.lower() for x in ['ibot', 'dinov1', 'ml-aim', 'ijepa']): outputs = self.model(inputs['pixel_values']) else: + # huggingface transformer vision model outputs = self.model(**inputs) else: raise ValueError(f"Unsupported input type: {type(inputs)}") # extract the embeddings if any(x in self.model_name.lower() for x in ['ijepa']): - linear_input = outputs.mean(dim=1) + embedding = outputs.mean(dim=1) elif any(x in self.model_name.lower() for x in ['clip']): - linear_input = outputs.pooler_output + embedding = outputs.pooler_output + elif any(x in self.model_name.lower() for x in ['aimv2']): + embedding = torch.mean(outputs.last_hidden_state, dim=1) else: sequence_output = outputs[0] # batch_size, sequence_length, hidden_size if patch_mode: patch_tokens = sequence_output[:, 1:] cls_token = sequence_output[:, 0].unsqueeze(1).repeat(1, patch_tokens.shape[1], 1) - linear_input = torch.cat([cls_token, patch_tokens], dim=-1) + embedding = torch.cat([cls_token, patch_tokens], dim=-1) else: - if any(x in self.model_name.lower() for x in ['ibot', 'r101', 'r152', 'mae', 'convnextv2', 'dinov1']): - linear_input = outputs + if any(x in self.model_name.lower() for x in ['ibot', 'mae', 'dinov1']): + embedding = outputs elif any(x in self.model_name.lower() for x in ['aim']): - linear_input = outputs[1] + embedding = outputs[1] else: cls_token = sequence_output[:, 0] patch_tokens = sequence_output[:, 1:] if self.agg_mode == 'patch': - linear_input = patch_tokens.mean(dim=1) + embedding = patch_tokens.mean(dim=1) elif self.agg_mode == 'cls': - linear_input = cls_token + embedding = cls_token elif self.agg_mode == 'concat': - linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) + embedding = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) else: raise ValueError(f"Invalid agg_mode: {self.agg_mode}") - return linear_input + return embedding if __name__ == "__main__": diff --git a/readme.md b/readme.md index a7d7f69..66c3341 100644 --- a/readme.md +++ b/readme.md @@ -30,8 +30,11 @@ This codebase enables you to train your own CLIP-like model on a single GPU by a 2. **Download the Alignment Layer Checkpoint** You can download the pretrained alignment layer checkpoints from the links below: - - **SAIL DinoV2 Large with NV-Embed V2**: [sail_dinov2l_nv2.pt](https://huggingface.co/le723z/sail/resolve/main/sail_dinov2l_nv2.pt?download=true) - - **SAIL DinoV2 Large with GTE**: [sail_dinov2l_gte.pt](https://huggingface.co/le723z/sail/resolve/main/sail_dinov2l_gte.pt?download=true) + | **Data** | **Model** | Alignment Layer | IN-1K | **I2T R@1 (MSCOCO)** | **T2I R@1 (MSCOCO)** | **I2T R@1 (Flickr30k)** | **T2I R@1 (Flickr30k)** | **Text (Winoground)** | **Image (Winoground)** | **Group (Winoground)** | **Avg. 
(MMVP)** | + | ------------ | ------------ | ------------------------------------------------------------ | -------- | -------------------- | -------------------- | ----------------------- | ----------------------- | --------------------- | ---------------------- | ---------------------- | --------------- | + | 23M | SAIL-L (GTE) | [download](https://huggingface.co/le723z/sail/resolve/main/sail_dinov2l_gte.pt?download=true) | 65.4 | 54.1 | 42.7 | 80.8 | 68.9 | 34.0 | 13.25 | 8.75 | 22.2 | + | 23M | SAIL-L (NV2) | [download](https://huggingface.co/le723z/sail/resolve/main/sail_dinov2l_nv2.pt?download=true) | **73.4** | **62.4** | **48.6** | **87.6** | **75.7** | **40.25** | **18.75** | **15.0** | **28.9** | + | *LAION-400M* | *CLIP-L* | | 72.7 | *59.7* | *43.0* | *87.6* | *70.2* | *30.5* | *11.5* | *8.75* | *20.0* | 3. **Run the Model** @@ -73,19 +76,6 @@ This codebase enables you to train your own CLIP-like model on a single GPU by a The codebase builds upon [OpenCLIP](https://github.com/mlfoundations/open_clip) (for training SAIL) and [LLaVA](https://github.com/haotian-liu/LLaVA/tree/main) (for testing SAIL's vision encoder in MLLMs). Please ensure the necessary dependency packages for these frameworks are installed. -| **Data** | **Model** | **I2T R@1 (MSCOCO)** | **T2I R@1 (MSCOCO)** | **I2T R@1 (Flickr30k)** | **T2I R@1 (Flickr30k)** | **Text (Winoground)** | **Image (Winoground)** | **Group (Winoground)** | **Avg. (MMVP)** | -| ----------- | ------------ | -------------------- | -------------------- | ----------------------- | ----------------------- | --------------------- | ---------------------- | ---------------------- | --------------- | -| 23M | SAIL-L (GTE) | 54.1 | 42.7 | 80.8 | 68.9 | 34.0 | 13.25 | 8.75 | 22.2 | -| 23M | SAIL-L (NV2) | **62.4** | **48.6** | **87.6** | **75.7** | **40.25** | **18.75** | **15.0** | **28.9** | -| *LAION400M* | *CLIP-L* | *59.7* | *43.0* | *87.6* | *70.2* | *30.5* | *11.5* | *8.75* | *20.0* | - -| Data | Model | Food101 | CIFAR10 | CIFAR100 | SUN397 | Cars | Aircraft | DTD | Pets | Cal101 | Flowers | Avg. | INet | -| ----------- | ------------ | ------- | -------- | -------- | ------ | ------ | -------- | -------- | ------ | -------- | -------- | ------ | -------- | -| 23M | SAIL-L (NV2) | 86.1 | **96.7** | **86.7** | 69.8 | 44.6 | **28.6** | **63.5** | 82.3 | **85.4** | **77.2** | 72.1 | **73.4** | -| *LAION400M* | *CLIP-L* | *90.1* | *94.6* | *77.4* | *72.6* | *89.6* | *25* | *60.4* | *91.7* | *82.1* | *75.5* | *75.9* | *72.7* | - ---- - ### Data Preparation SAIL leverages high-quality, MLLM-enhanced captions for training, using datasets introduced in [DreamLIP](https://github.com/zyf0619sjtu/DreamLIP). To streamline this process, we provide a script for automated dataset preparation. Note that this process is time-intensive, as it involves handling 23M data samples. 
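For orientation, the AIMv2 path added in `model/vision_model.py` and `evaluation/eval_knn_aimv2.py` derives an image embedding by mean-pooling `last_hidden_state`. Below is a minimal standalone sketch of that pooling step, assuming the same Hugging Face checkpoint (`apple/aimv2-large-patch14-224`) used throughout this patch; `example.jpg` is a placeholder path, and the L2-normalization mirrors the k-NN evaluation script.

```python
# Minimal sketch (not part of the patch): reproduce the AIMv2 embedding used above,
# i.e. mean pooling over last_hidden_state of apple/aimv2-large-patch14-224.
# "example.jpg" is a placeholder image path.
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

model_name = "apple/aimv2-large-patch14-224"
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).eval()

image = Image.open("example.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    # Same pooling as ImageEmbedding.forward / extract_features in this patch.
    embedding = outputs.last_hidden_state.mean(dim=1)  # (1, hidden_size=1024)
    embedding = torch.nn.functional.normalize(embedding, dim=-1)

print(embedding.shape)
```

`scripts/encode.sh` below batches this encoding over the dreamclipcc3m images, and `scripts/alignment_probing.sh` then runs alignment probing on the resulting `aimv2-large-patch14-224` embedding tensors.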
diff --git a/scripts/alignment_probing.sh b/scripts/alignment_probing.sh index 86a275d..9ae1638 100644 --- a/scripts/alignment_probing.sh +++ b/scripts/alignment_probing.sh @@ -4,8 +4,8 @@ text_embedding_list="data/tensor_data/text_embedding/gte-large-en-v1.5/dreamclipcc3m_raw" extra_text_embedding_list="data/tensor_data/text_embedding/gte-large-en-v1.5/dreamclipcc3m_longSV" -image_embedding_list="data/tensor_data/image_embedding/dinov2-base/dreamclipcc3m" -output_name="alignment_probing_dinov2b_gte" +image_embedding_list="data/tensor_data/image_embedding/aimv2-large-patch14-224/dreamclipcc3m_concat" +output_name="alignment_probing_aimv2l_gte" epoch_num=100 logit_scale=20 diff --git a/scripts/encode.sh b/scripts/encode.sh index bb45124..463bb61 100644 --- a/scripts/encode.sh +++ b/scripts/encode.sh @@ -3,14 +3,16 @@ #==============================================================================# # VISION MODEL # #==============================================================================# -vision_model="facebook/dinov2-large" +vision_model="apple/aimv2-large-patch14-224" # Available options: # vision_model="ijepa-huge" # IJEPA # vision_model="openai/clip-vit-large-patch14" # OpenAI CLIP # vision_model="mae-base" # MAE # vision_model="dinov1-vitb16" # DINOv1 +# vision_model="facebook/dinov2-large" # DINOv2 # vision_model="aim_1B" # AIM # vision_model="ibot-base" # iBOT +# vision_model="aimv2-large-patch14-224" # AIMv2 #==============================================================================# # TEXT MODEL # @@ -40,7 +42,7 @@ domain="image" # "image" or "text", each time we only encode one modality #==============================================================================# # BATCH SIZE # #==============================================================================# -batch_size=2048 # adjust based on GPU memory +batch_size=1024 # adjust based on GPU memory #==============================================================================# # Additional options # #==============================================================================# diff --git a/slurm-5671467.out b/slurm-5671467.out new file mode 100644 index 0000000..bbda1f8 --- /dev/null +++ b/slurm-5671467.out @@ -0,0 +1,106 @@ +/var/spool/slurmd/job5671467/slurm_script: line 61: [: : integer expression expected +bash output: Running tasks sequentially on a single GPU... +bash output: Using vision model: apple/aimv2-large-patch14-224 +bash output: Using text model: nvidia/NV-Embed-v2 +bash output: Processing dataset: dreamclipcc3m +bash output: Using domain: image +bash output: Using batch size: 1024 +bash output: Using source caption: longSV_captions +2024-11-22,18:30:26 | INFO | Start index: 0, End index: 2238073 +2024-11-22,18:30:26 | INFO | Number of sentences: 0 +2024-11-22,18:30:26 | INFO | Number of image_paths: 2238073 +2024-11-22,18:30:26 | INFO | Encoding image data dreamclipcc3m with model apple/aimv2-large-patch14-224 of batch size 1024... 
+2024-11-22,18:30:26 | INFO | First 5 items of image_paths paths: ['/network/scratch/l/le.zhang/datasets/DownloadCC3M/CC3M/images/0000000/0000000.jpg', '/network/scratch/l/le.zhang/datasets/DownloadCC3M/CC3M/images/0000000/0000002.jpg', '/network/scratch/l/le.zhang/datasets/DownloadCC3M/CC3M/images/0000000/0000004.jpg', '/network/scratch/l/le.zhang/datasets/DownloadCC3M/CC3M/images/0000000/0000005.jpg', '/network/scratch/l/le.zhang/datasets/DownloadCC3M/CC3M/images/0000000/0000006.jpg'] +loading configuration file config.json from cache at /network/scratch/l/le.zhang/hub/hub/models--apple--aimv2-large-patch14-224/snapshots/7f072a185935008b4d46a880cd4d33ff5acc08f3/config.json +loading configuration file config.json from cache at /network/scratch/l/le.zhang/hub/hub/models--apple--aimv2-large-patch14-224/snapshots/7f072a185935008b4d46a880cd4d33ff5acc08f3/config.json +Model config AIMv2Config { + "_name_or_path": "apple/aimv2-large-patch14-224", + "architectures": [ + "AIMv2Model" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "apple/aimv2-large-patch14-224--configuration_aimv2.AIMv2Config", + "AutoModel": "apple/aimv2-large-patch14-224--modeling_aimv2.AIMv2Model", + "FlaxAutoModel": "apple/aimv2-large-patch14-224--modeling_flax_aimv2.FlaxAIMv2Model" + }, + "hidden_size": 1024, + "image_size": 224, + "intermediate_size": 2816, + "model_type": "aimv2", + "num_attention_heads": 8, + "num_channels": 3, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dropout": 0.0, + "qkv_bias": false, + "rms_norm_eps": 1e-05, + "torch_dtype": "float16", + "transformers_version": "4.47.0.dev0", + "use_bias": false +} + +loading weights file model.safetensors from cache at /network/scratch/l/le.zhang/hub/hub/models--apple--aimv2-large-patch14-224/snapshots/7f072a185935008b4d46a880cd4d33ff5acc08f3/model.safetensors +Instantiating AIMv2Model model under default dtype torch.float16. +All model checkpoint weights were used when initializing AIMv2Model. + +All the weights of AIMv2Model were initialized from the model checkpoint at apple/aimv2-large-patch14-224. +If your task is similar to the task the model of the checkpoint was trained on, you can already use AIMv2Model for predictions without further training. 
+loading configuration file preprocessor_config.json from cache at /network/scratch/l/le.zhang/hub/hub/models--apple--aimv2-large-patch14-224/snapshots/7f072a185935008b4d46a880cd4d33ff5acc08f3/preprocessor_config.json +Image processor CLIPImageProcessor { + "crop_size": { + "height": 224, + "width": 224 + }, + "do_center_crop": true, + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "CLIPImageProcessor", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "shortest_edge": 224 + } +} + +Time taken to read 'Image Path' column: 23.73 seconds +Traceback (most recent call last): + File "/network/scratch/l/le.zhang/light_align/encode.py", line 195, in + main() + File "/network/scratch/l/le.zhang/light_align/encode.py", line 192, in main + encode_image(args, image_paths, start_index) + File "/home/mila/l/le.zhang/.conda/envs/openflamingo/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context + return func(*args, **kwargs) + File "/network/scratch/l/le.zhang/light_align/encode.py", line 167, in encode_image + model = model.to('cuda') # Move model to GPU + File "/home/mila/l/le.zhang/.conda/envs/openflamingo/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1160, in to + return self._apply(convert) + File "/home/mila/l/le.zhang/.conda/envs/openflamingo/lib/python3.9/site-packages/torch/nn/modules/module.py", line 810, in _apply + module._apply(fn) + File "/home/mila/l/le.zhang/.conda/envs/openflamingo/lib/python3.9/site-packages/torch/nn/modules/module.py", line 810, in _apply + module._apply(fn) + File "/home/mila/l/le.zhang/.conda/envs/openflamingo/lib/python3.9/site-packages/torch/nn/modules/module.py", line 810, in _apply + module._apply(fn) + [Previous line repeated 1 more time] + File "/home/mila/l/le.zhang/.conda/envs/openflamingo/lib/python3.9/site-packages/torch/nn/modules/module.py", line 833, in _apply + param_applied = fn(param) + File "/home/mila/l/le.zhang/.conda/envs/openflamingo/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1158, in convert + return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking) + File "/home/mila/l/le.zhang/.conda/envs/openflamingo/lib/python3.9/site-packages/torch/cuda/__init__.py", line 298, in _lazy_init + torch._C._cuda_init() +RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx +[2024-11-22 18:30:32,242] torch._dynamo.utils: [INFO] TorchDynamo compilation metrics: +[2024-11-22 18:30:32,242] torch._dynamo.utils: [INFO] Function Runtimes (s) +[2024-11-22 18:30:32,242] torch._dynamo.utils: [INFO] ---------- --------------
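The job log above ends in `RuntimeError: Found no NVIDIA driver` because `encode.py` moves the model to `'cuda'` unconditionally on a node where no GPU is visible. Below is a minimal sketch of a device fallback that avoids this failure mode; it assumes nothing about the rest of `encode.py`, and the `resolve_device` helper and the `nn.Linear` stand-in for the encoder are illustrative only.

```python
# Illustrative only: guard GPU placement so the job degrades to CPU with a clear
# warning instead of crashing when no NVIDIA driver / GPU is present.
import torch
from torch import nn

def resolve_device() -> torch.device:
    """Return cuda only when a driver and at least one device are visible."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    print("Warning: no CUDA device detected; falling back to CPU.")
    return torch.device("cpu")

model = nn.Linear(1024, 1024)        # stand-in for the AIMv2 encoder in encode.py
model = model.to(resolve_device())   # replaces the unconditional model.to('cuda')
```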