Add retry for GPT API call and improve air_bench aggregation function (#376)

* add retry for api calls and change air_bench_foundation aggregation function

* make azure default api

* minor changes
pbcong authored Oct 30, 2024
1 parent 58c89e1 commit fdcd2a8
Showing 11 changed files with 147 additions and 91 deletions.
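
The core of the change, repeated in the air_bench, alpaca_audio, and clotho_aqa judge utilities below, is that the single request to the GPT endpoint is now wrapped in a bounded retry loop with a fixed sleep between attempts. A condensed, self-contained sketch of the pattern (simplified from the diffs; the endpoint, key, and model name are placeholders):

import time
import requests

# Placeholders: in the real code these come from API_TYPE / endpoint / key environment
# variables and from the task's YAML metadata, as shown in the diffs below.
API_URL = "https://api.openai.com/v1/chat/completions"
HEADERS = {"Authorization": "Bearer <key>", "Content-Type": "application/json"}
GPT_EVAL_MODEL_NAME = "<judge-model-name>"
RETRIES = 3
NUM_SECONDS_TO_SLEEP = 5

def get_eval(max_tokens: int, content: str, retries: int = RETRIES):
    """Query the judge model, retrying transient failures; return ("", "") if every attempt fails."""
    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": [{"role": "user", "content": content}],
        "max_tokens": max_tokens,
    }
    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
            response.raise_for_status()
            data = response.json()
            text = data["choices"][0]["message"]["content"].strip()
            if text:
                return text, data["model"]
            break  # empty reply: stop retrying
        except Exception as exc:
            if attempt < retries - 1:
                time.sleep(NUM_SECONDS_TO_SLEEP)  # wait before the next attempt
            else:
                print(f"All {retries} attempts failed: {exc}")
    return "", ""
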
11 changes: 7 additions & 4 deletions lmms_eval/tasks/air_bench/air_bench_chat_mixed.yaml
@@ -4,14 +4,17 @@ test_split: mixed
doc_to_target: "answer_gt"
doc_to_visual: !function utils.air_bench_doc_to_audio
doc_to_text: !function utils.air_bench_doc_to_text_chat

generation_kwargs:
max_new_tokens: 512
temperature: 0
do_sample: False
max_new_tokens: 1024
temperature: 0.2
top_p: 1.0
num_beams: 1

lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
post_prompt: "Give a detail answer to the question in English."
metric_list:
- metric: gpt_eval
aggregation: !function utils.air_bench_aggregate_results_chat
11 changes: 7 additions & 4 deletions lmms_eval/tasks/air_bench/air_bench_chat_music.yaml
@@ -4,14 +4,17 @@ test_split: music
doc_to_target: "answer_gt"
doc_to_visual: !function utils.air_bench_doc_to_audio
doc_to_text: !function utils.air_bench_doc_to_text_chat

generation_kwargs:
max_new_tokens: 512
temperature: 0
do_sample: False
max_new_tokens: 1024
temperature: 0.2
top_p: 1.0
num_beams: 1

lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
post_prompt: "Give a detail answer to the question in English."
metric_list:
- metric: gpt_eval
aggregation: !function utils.air_bench_aggregate_results_chat
11 changes: 7 additions & 4 deletions lmms_eval/tasks/air_bench/air_bench_chat_sound.yaml
@@ -4,14 +4,17 @@ test_split: sound
doc_to_target: "answer_gt"
doc_to_visual: !function utils.air_bench_doc_to_audio
doc_to_text: !function utils.air_bench_doc_to_text_chat

generation_kwargs:
max_new_tokens: 512
temperature: 0
do_sample: False
max_new_tokens: 1024
temperature: 0.2
top_p: 1.0
num_beams: 1

lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
post_prompt: "Give a detail answer to the question in English."
metric_list:
- metric: gpt_eval
aggregation: !function utils.air_bench_aggregate_results_chat
11 changes: 7 additions & 4 deletions lmms_eval/tasks/air_bench/air_bench_chat_speech.yaml
@@ -4,14 +4,17 @@ test_split: speech
doc_to_target: "answer_gt"
doc_to_visual: !function utils.air_bench_doc_to_audio
doc_to_text: !function utils.air_bench_doc_to_text_chat

generation_kwargs:
max_new_tokens: 512
temperature: 0
do_sample: False
max_new_tokens: 1024
temperature: 0.2
top_p: 1.0
num_beams: 1

lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
post_prompt: "Give a detail answer to the question in English."
metric_list:
- metric: gpt_eval
aggregation: !function utils.air_bench_aggregate_results_chat
2 changes: 1 addition & 1 deletion lmms_eval/tasks/air_bench/air_bench_foundation_music.yaml
@@ -15,7 +15,7 @@ lmms_eval_specific_kwargs:
post_prompt: "\nAnswer with the option's letter from the given choices directly."
metric_list:
- metric: accuracy
aggregation: mean
aggregation: !function utils.air_bench_aggregate_results_foundation
higher_is_better: true
- metric: submission
aggregation: !function utils.air_bench_aggregate_results_for_submission
2 changes: 1 addition & 1 deletion lmms_eval/tasks/air_bench/air_bench_foundation_sound.yaml
@@ -15,7 +15,7 @@ lmms_eval_specific_kwargs:
post_prompt: "\nAnswer with the option's letter from the given choices directly."
metric_list:
- metric: accuracy
aggregation: mean
aggregation: !function utils.air_bench_aggregate_results_foundation
higher_is_better: true
- metric: submission
aggregation: !function utils.air_bench_aggregate_results_for_submission
2 changes: 1 addition & 1 deletion lmms_eval/tasks/air_bench/air_bench_foundation_speech.yaml
@@ -15,7 +15,7 @@ lmms_eval_specific_kwargs:
post_prompt: "\nAnswer with the option's letter from the given choices directly."
metric_list:
- metric: accuracy
aggregation: mean
aggregation: !function utils.air_bench_aggregate_results_foundation
higher_is_better: true
- metric: submission
aggregation: !function utils.air_bench_aggregate_results_for_submission
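
The three foundation YAMLs above swap the accuracy metric's aggregation from a plain mean to utils.air_bench_aggregate_results_foundation. This pairs with the utils.py change below, where each example's accuracy value becomes a small dict instead of a bare float, roughly of this shape (field values invented for illustration):

# Illustrative per-example result after this commit (values are made up):
per_example = {
    "accuracy": {"score": 1.0, "task": "Audio_Grounding"},  # consumed by the per-task aggregation
    "submission": {"some_uniq_id": "A"},                     # consumed by the submission writer
}
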
60 changes: 41 additions & 19 deletions lmms_eval/tasks/air_bench/utils.py
@@ -48,11 +48,8 @@ def air_bench_doc_to_text_chat(doc, lmms_eval_specific_kwargs):
"Content-Type": "application/json",
}
elif API_TYPE == "azure":
API_KEY = os.getenv("AZURE_API_KEY", None)
deployment_name = os.getenv("AZURE_DEPLOYMENT_NAME", None)
resource_name = os.getenv("AZURE_RESOURCE_NAME", None)
api_version = os.getenv("AZURE_API_VERSION", None)
API_URL = f"https://{resource_name}.openai.azure.com/openai/deployments/{deployment_name}/chat/completions?api-version={api_version}"
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
headers = {
"api-key": API_KEY,
"Content-Type": "application/json",
@@ -72,8 +69,11 @@ def air_bench_doc_to_text_chat(doc, lmms_eval_specific_kwargs):
"The two scores are separated by a space. Please only output the 2 required number and no text."
)

retries = 3
NUM_SECONDS_TO_SLEEP = 5

def get_eval(max_tokens: int, content: str):

def get_eval(max_tokens: int, content: str, retries: int = retries):
global headers

messages = [
@@ -83,21 +83,28 @@ def get_eval(max_tokens: int, content: str):
payload = {
"model": GPT_EVAL_MODEL_NAME,
"messages": messages,
"temperature": 0.2,
"temperature": 0,
"max_tokens": max_tokens,
}

try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
except Exception as e:
eval_logger.info(f"Attempt failed with error: {e}")
return "", ""
for attempt in range(retries):
try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
break # content was empty; stop retrying and fall through to the final return

except Exception as e:
eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt
time.sleep(NUM_SECONDS_TO_SLEEP)
else: # If this was the last attempt, log and return empty
eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
return "", ""
return "", ""


@@ -182,7 +189,7 @@ def air_bench_process_results_foundation(doc, result):
score = 1.0 if pred == gt_ans else 0.0
submission_dict = {}
submission_dict = {doc.get("uniq_id", "unknown"): pred}
return {"accuracy": score, "submission": submission_dict}
return {"accuracy": {"score": score, "task": doc["task_name"]}, "submission": submission_dict}


def air_bench_aggregate_results_for_submission(results, args):
@@ -192,6 +199,21 @@ def air_bench_aggregate_results_for_submission(results, args):
eval_logger.info(f"Results saved to {path}.")


def air_bench_aggregate_results_foundation(results):
score = 0
categorical_correct = {}
categorical_total = {}
for result in results:
score += result["score"]
if result["task"] not in categorical_correct.keys():
categorical_correct[result["task"]] = 0
categorical_total[result["task"]] = 0
categorical_correct[result["task"]] += result["score"]
categorical_total[result["task"]] += 1

return {"overall_accuracy": score / len(results), "categorical_accuracy": {task: categorical_correct[task] / categorical_total[task] for task in categorical_correct.keys()}}


def parse_multi_choice_response(response, all_choices):
"""
Parse the prediction from the generated response.
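
To make the new aggregation concrete, here is a toy invocation of air_bench_aggregate_results_foundation with hand-made inputs (the task names are invented, and the import assumes the lmms_eval package is installed):

from lmms_eval.tasks.air_bench.utils import air_bench_aggregate_results_foundation

results = [
    {"score": 1.0, "task": "Audio_Grounding"},
    {"score": 0.0, "task": "Audio_Grounding"},
    {"score": 1.0, "task": "Music_Instruments"},
]

print(air_bench_aggregate_results_foundation(results))
# {'overall_accuracy': 0.666..., 'categorical_accuracy': {'Audio_Grounding': 0.5, 'Music_Instruments': 1.0}}
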
41 changes: 24 additions & 17 deletions lmms_eval/tasks/alpaca_audio/utils.py
@@ -48,11 +48,8 @@ def doc_to_text(doc, lmms_eval_specific_kwargs):
}

elif API_TYPE == "azure":
API_KEY = os.getenv("AZURE_API_KEY", None)
deployment_name = os.getenv("AZURE_DEPLOYMENT_NAME", None)
resource_name = os.getenv("AZURE_RESOURCE_NAME", None)
api_version = os.getenv("AZURE_API_VERSION", None)
API_URL = f"https://{resource_name}.openai.azure.com/openai/deployments/{deployment_name}/chat/completions?api-version={api_version}"
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
headers = {
"api-key": API_KEY,
"Content-Type": "application/json",
@@ -82,8 +79,11 @@ def doc_to_text(doc, lmms_eval_specific_kwargs):
Explanation: (Provide a concise explanation of your rating, comparing the reference answer with the model's response. "The reference answer is [XXX], while the model's answer is [YYY]. I think ...")
Rating: (int)"""

retries = 3
NUM_SECONDS_TO_SLEEP = 5

def get_eval(max_tokens: int, content: str):

def get_eval(max_tokens: int, content: str, retries: int = retries):
global headers

messages = [
@@ -92,17 +92,24 @@ def get_eval(max_tokens: int, content: str):

payload = {"model": GPT_EVAL_MODEL_NAME, "messages": messages, "temperature": 0.7, "max_tokens": max_tokens, "top_p": 0.95, "frequency_penalty": 0, "presence_penalty": 0, "stop": None}

try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
except Exception as e:
eval_logger.info(f"Attempt failed with error: {e}")
return "", ""
for attempt in range(retries):
try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
break # content was empty; stop retrying and fall through to the final return

except Exception as e:
eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt
time.sleep(NUM_SECONDS_TO_SLEEP)
else: # If this was the last attempt, log and return empty
eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
return "", ""
return "", ""


45 changes: 26 additions & 19 deletions lmms_eval/tasks/clotho_aqa/utils.py
@@ -40,7 +40,7 @@ def clotho_aqa_doc_to_text(doc, lmms_eval_specific_kwargs):

NUM_SECONDS_TO_SLEEP = 2
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")
API_TYPE = os.getenv("API_TYPE", "azure")

if API_TYPE == "openai":
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
@@ -49,6 +49,7 @@ def clotho_aqa_doc_to_text(doc, lmms_eval_specific_kwargs):
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json",
}

elif API_TYPE == "azure":
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
@@ -82,31 +83,37 @@ def clotho_aqa_doc_to_text(doc, lmms_eval_specific_kwargs):
Rating: (int)"""


def get_eval(max_tokens: int, content: str):
retries = 3
NUM_SECONDS_TO_SLEEP = 5


def get_eval(max_tokens: int, content: str, retries: int = retries):
global headers

messages = [
{"role": "user", "content": content},
]

payload = {
"model": GPT_EVAL_MODEL_NAME,
"messages": messages,
"temperature": 0.2,
"max_tokens": max_tokens,
}
payload = {"model": GPT_EVAL_MODEL_NAME, "messages": messages, "temperature": 0.7, "max_tokens": max_tokens, "top_p": 0.95, "frequency_penalty": 0, "presence_penalty": 0, "stop": None}

try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
except Exception as e:
eval_logger.info(f"Attempt failed with error: {e}")
return "", ""
for attempt in range(retries):
try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
break # content was empty; stop retrying and fall through to the final return

except Exception as e:
eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt
time.sleep(NUM_SECONDS_TO_SLEEP)
else: # If this was the last attempt, log and return empty
eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
return "", ""
return "", ""


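
Finally, the clotho_aqa judge now defaults API_TYPE to "azure" (per the commit message, Azure is the intended default), so the matching environment variables need to be in place before an evaluation starts. A minimal sketch with placeholder values:

import os

# Set these in the shell or job config before the evaluation process is launched,
# since the task utilities read them at module import time. Values below are placeholders.
os.environ["API_TYPE"] = "azure"
os.environ["AZURE_ENDPOINT"] = "https://<your-resource>.openai.azure.com/..."
os.environ["AZURE_API_KEY"] = "<your-azure-key>"

# Or keep the public OpenAI endpoint:
# os.environ["API_TYPE"] = "openai"
# os.environ["OPENAI_API_URL"] = "https://api.openai.com/v1/chat/completions"
# os.environ["OPENAI_API_KEY"] = "<your-openai-key>"
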