# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from unittest import TestCase

import habana_frameworks.torch as ht
import numpy as np
import requests
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

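# Patch transformers so the models used in this file run on Gaudi (HPU) devices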
adapt_transformers_to_gaudi()

# Reference latencies in ms, measured on Gaudi 2; the latency test allows a 5% regression margin
LATENCY_OWLVIT_BF16_GRAPH_BASELINE = 3.7109851837158203
LATENCY_SAM_BF16_GRAPH_BASELINE = 98.92215728759766


class GaudiSAMTester(TestCase):
    """
    Tests for Segment Anything Model - SAM
    """

    def prepare_model_and_processor(self):
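        # Load SAM ViT-Huge, move it to the HPU, and put it in eval mode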
        model = AutoModel.from_pretrained("facebook/sam-vit-huge").to("hpu")
        processor = AutoProcessor.from_pretrained("facebook/sam-vit-huge")
        model = model.eval()
        return model, processor

    def prepare_data(self):
        image = Image.open(
            requests.get(
                "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png", stream=True
            ).raw
        ).convert("RGB")
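        # A single 2D point prompt, (x, y), marking the object to segment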
        input_points = [[[450, 600]]]
        return input_points, image

    def test_inference_default(self):
        model, processor = self.prepare_model_and_processor()
        input_points, image = self.prepare_data()
        inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu")
        outputs = model(**inputs)
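        # iou_scores has shape (batch, point batch, 3): one predicted IoU per candidate mask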
        scores = outputs.iou_scores
        scores = scores[0][0]
        expected_scores = np.array([0.9912, 0.9818, 0.9666])
        self.assertEqual(len(scores), 3)
        self.assertLess(np.abs(scores.cpu().detach().numpy() - expected_scores).max(), 0.02)

    def test_inference_bf16(self):
        model, processor = self.prepare_model_and_processor()
        input_points, image = self.prepare_data()
        inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu")

        with torch.autocast(device_type="hpu", dtype=torch.bfloat16):  # Autocast BF16
            outputs = model(**inputs)
            scores = outputs.iou_scores
            scores = scores[0][0]
            expected_scores = np.array([0.9912, 0.9818, 0.9666])
            self.assertEqual(len(scores), 3)
            self.assertLess(np.abs(scores.to(torch.float32).cpu().detach().numpy() - expected_scores).max(), 0.02)

    def test_inference_hpu_graphs(self):
        model, processor = self.prepare_model_and_processor()
        input_points, image = self.prepare_data()
        inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu")

        model = ht.hpu.wrap_in_hpu_graph(model)  # Capture the forward pass in an HPU graph to reduce host overhead

        outputs = model(**inputs)
        scores = outputs.iou_scores
        scores = scores[0][0]
        expected_scores = np.array([0.9912, 0.9818, 0.9666])
        self.assertEqual(len(scores), 3)
        self.assertLess(np.abs(scores.to(torch.float32).cpu().detach().numpy() - expected_scores).max(), 0.02)

    def test_no_latency_regression_bf16(self):
        warmup = 3
        iterations = 10

        model, processor = self.prepare_model_and_processor()
        input_points, image = self.prepare_data()

        model = ht.hpu.wrap_in_hpu_graph(model)

        with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
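            # Warm-up runs so graph capture and compilation are excluded from the timed loop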
            for _ in range(warmup):
                inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu")
                _ = model(**inputs)
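            # Block until all queued HPU work has finished before starting the timed runs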
            torch.hpu.synchronize()

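            # Timed runs: synchronize after each forward pass so wall-clock time covers device execution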
            total_model_time = 0
            for _ in range(iterations):
                inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu")
                model_start_time = time.time()
                _ = model(**inputs)
                torch.hpu.synchronize()
                model_end_time = time.time()
                total_model_time += model_end_time - model_start_time

        latency = total_model_time * 1000 / iterations  # average latency in ms
        self.assertLessEqual(latency, 1.05 * LATENCY_SAM_BF16_GRAPH_BASELINE)
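

# A minimal sketch for running these tests directly, assuming a Gaudi device with
# habana_frameworks and optimum-habana installed (they are normally run via pytest):
if __name__ == "__main__":
    from unittest import main

    main()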