Add retry for GPT API call and improve air_bench aggregation function (#376)

* add retry for api calls and change air_bench_foundation aggregation function

* make azure default api

* minor changes
pbcong authored Oct 30, 2024
1 parent 58c89e1 commit fdcd2a8
Showing 11 changed files with 147 additions and 91 deletions.
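
The core of the change, repeated in the air_bench, alpaca_audio, and clotho_aqa judge utilities below, is that the single request to the GPT endpoint is now wrapped in a bounded retry loop with a fixed sleep between attempts. A condensed, self-contained sketch of the pattern (simplified from the diffs; the endpoint, key, and model name are placeholders):

import time
import requests

# Placeholders: in the real code these come from API_TYPE / endpoint / key environment
# variables and from the task's YAML metadata, as shown in the diffs below.
API_URL = "https://api.openai.com/v1/chat/completions"
HEADERS = {"Authorization": "Bearer <key>", "Content-Type": "application/json"}
GPT_EVAL_MODEL_NAME = "<judge-model-name>"
RETRIES = 3
NUM_SECONDS_TO_SLEEP = 5

def get_eval(max_tokens: int, content: str, retries: int = RETRIES):
    """Query the judge model, retrying transient failures; return ("", "") if every attempt fails."""
    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": [{"role": "user", "content": content}],
        "max_tokens": max_tokens,
    }
    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
            response.raise_for_status()
            data = response.json()
            text = data["choices"][0]["message"]["content"].strip()
            if text:
                return text, data["model"]
            break  # empty reply: stop retrying
        except Exception as exc:
            if attempt < retries - 1:
                time.sleep(NUM_SECONDS_TO_SLEEP)  # wait before the next attempt
            else:
                print(f"All {retries} attempts failed: {exc}")
    return "", ""
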
11 changes: 7 additions & 4 deletions lmms_eval/tasks/air_bench/air_bench_chat_mixed.yaml
@@ -4,14 +4,17 @@ test_split: mixed
doc_to_target: "answer_gt"
doc_to_visual: !function utils.air_bench_doc_to_audio
doc_to_text: !function utils.air_bench_doc_to_text_chat

generation_kwargs:
max_new_tokens: 512
temperature: 0
do_sample: False
max_new_tokens: 1024
temperature: 0.2
top_p: 1.0
num_beams: 1

lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
post_prompt: "Give a detail answer to the question in English."
metric_list:
- metric: gpt_eval
aggregation: !function utils.air_bench_aggregate_results_chat
11 changes: 7 additions & 4 deletions lmms_eval/tasks/air_bench/air_bench_chat_music.yaml
@@ -4,14 +4,17 @@ test_split: music
doc_to_target: "answer_gt"
doc_to_visual: !function utils.air_bench_doc_to_audio
doc_to_text: !function utils.air_bench_doc_to_text_chat

generation_kwargs:
max_new_tokens: 512
temperature: 0
do_sample: False
max_new_tokens: 1024
temperature: 0.2
top_p: 1.0
num_beams: 1

lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
post_prompt: "Give a detail answer to the question in English."
metric_list:
- metric: gpt_eval
aggregation: !function utils.air_bench_aggregate_results_chat
11 changes: 7 additions & 4 deletions lmms_eval/tasks/air_bench/air_bench_chat_sound.yaml
@@ -4,14 +4,17 @@ test_split: sound
doc_to_target: "answer_gt"
doc_to_visual: !function utils.air_bench_doc_to_audio
doc_to_text: !function utils.air_bench_doc_to_text_chat

generation_kwargs:
max_new_tokens: 512
temperature: 0
do_sample: False
max_new_tokens: 1024
temperature: 0.2
top_p: 1.0
num_beams: 1

lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
post_prompt: "Give a detail answer to the question in English."
metric_list:
- metric: gpt_eval
aggregation: !function utils.air_bench_aggregate_results_chat
11 changes: 7 additions & 4 deletions lmms_eval/tasks/air_bench/air_bench_chat_speech.yaml
@@ -4,14 +4,17 @@ test_split: speech
doc_to_target: "answer_gt"
doc_to_visual: !function utils.air_bench_doc_to_audio
doc_to_text: !function utils.air_bench_doc_to_text_chat

generation_kwargs:
max_new_tokens: 512
temperature: 0
do_sample: False
max_new_tokens: 1024
temperature: 0.2
top_p: 1.0
num_beams: 1

lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
post_prompt: "Give a detail answer to the question in English."
metric_list:
- metric: gpt_eval
aggregation: !function utils.air_bench_aggregate_results_chat
2 changes: 1 addition & 1 deletion lmms_eval/tasks/air_bench/air_bench_foundation_music.yaml
@@ -15,7 +15,7 @@ lmms_eval_specific_kwargs:
post_prompt: "\nAnswer with the option's letter from the given choices directly."
metric_list:
- metric: accuracy
aggregation: mean
aggregation: !function utils.air_bench_aggregate_results_foundation
higher_is_better: true
- metric: submission
aggregation: !function utils.air_bench_aggregate_results_for_submission
2 changes: 1 addition & 1 deletion lmms_eval/tasks/air_bench/air_bench_foundation_sound.yaml
@@ -15,7 +15,7 @@ lmms_eval_specific_kwargs:
post_prompt: "\nAnswer with the option's letter from the given choices directly."
metric_list:
- metric: accuracy
aggregation: mean
aggregation: !function utils.air_bench_aggregate_results_foundation
higher_is_better: true
- metric: submission
aggregation: !function utils.air_bench_aggregate_results_for_submission
2 changes: 1 addition & 1 deletion lmms_eval/tasks/air_bench/air_bench_foundation_speech.yaml
@@ -15,7 +15,7 @@ lmms_eval_specific_kwargs:
post_prompt: "\nAnswer with the option's letter from the given choices directly."
metric_list:
- metric: accuracy
aggregation: mean
aggregation: !function utils.air_bench_aggregate_results_foundation
higher_is_better: true
- metric: submission
aggregation: !function utils.air_bench_aggregate_results_for_submission
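
The three foundation YAMLs above swap the accuracy metric's aggregation from a plain mean to utils.air_bench_aggregate_results_foundation. This pairs with the utils.py change below, where each example's accuracy value becomes a small dict instead of a bare float, roughly of this shape (field values invented for illustration):

# Illustrative per-example result after this commit (values are made up):
per_example = {
    "accuracy": {"score": 1.0, "task": "Audio_Grounding"},  # consumed by the per-task aggregation
    "submission": {"some_uniq_id": "A"},                     # consumed by the submission writer
}
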
60 changes: 41 additions & 19 deletions lmms_eval/tasks/air_bench/utils.py
@@ -48,11 +48,8 @@ def air_bench_doc_to_text_chat(doc, lmms_eval_specific_kwargs):
"Content-Type": "application/json",
}
elif API_TYPE == "azure":
API_KEY = os.getenv("AZURE_API_KEY", None)
deployment_name = os.getenv("AZURE_DEPLOYMENT_NAME", None)
resource_name = os.getenv("AZURE_RESOURCE_NAME", None)
api_version = os.getenv("AZURE_API_VERSION", None)
API_URL = f"https://{resource_name}.openai.azure.com/openai/deployments/{deployment_name}/chat/completions?api-version={api_version}"
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
headers = {
"api-key": API_KEY,
"Content-Type": "application/json",
@@ -72,8 +69,11 @@ def air_bench_doc_to_text_chat(doc, lmms_eval_specific_kwargs):
"The two scores are separated by a space. Please only output the 2 required number and no text."
)

retries = 3
NUM_SECONDS_TO_SLEEP = 5

def get_eval(max_tokens: int, content: str):

def get_eval(max_tokens: int, content: str, retries: int = retries):
global headers

messages = [
@@ -83,21 +83,28 @@ def get_eval(max_tokens: int, content: str):
payload = {
"model": GPT_EVAL_MODEL_NAME,
"messages": messages,
"temperature": 0.2,
"temperature": 0,
"max_tokens": max_tokens,
}

try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
except Exception as e:
eval_logger.info(f"Attempt failed with error: {e}")
return "", ""
for attempt in range(retries):
try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
break # content was empty; stop retrying and fall through to the final return

except Exception as e:
eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt
time.sleep(NUM_SECONDS_TO_SLEEP)
else: # If this was the last attempt, log and return empty
eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
return "", ""
return "", ""


@@ -182,7 +189,7 @@ def air_bench_process_results_foundation(doc, result):
score = 1.0 if pred == gt_ans else 0.0
submission_dict = {}
submission_dict = {doc.get("uniq_id", "unknown"): pred}
return {"accuracy": score, "submission": submission_dict}
return {"accuracy": {"score": score, "task": doc["task_name"]}, "submission": submission_dict}


def air_bench_aggregate_results_for_submission(results, args):
@@ -192,6 +199,21 @@ def air_bench_aggregate_results_for_submission(results, args):
eval_logger.info(f"Results saved to {path}.")


def air_bench_aggregate_results_foundation(results):
score = 0
categorical_correct = {}
categorical_total = {}
for result in results:
score += result["score"]
if result["task"] not in categorical_correct.keys():
categorical_correct[result["task"]] = 0
categorical_total[result["task"]] = 0
categorical_correct[result["task"]] += result["score"]
categorical_total[result["task"]] += 1

return {"overall_accuracy": score / len(results), "categorical_accuracy": {task: categorical_correct[task] / categorical_total[task] for task in categorical_correct.keys()}}


def parse_multi_choice_response(response, all_choices):
"""
Parse the prediction from the generated response.
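
To make the new aggregation concrete, here is a toy invocation of air_bench_aggregate_results_foundation with hand-made inputs (the task names are invented, and the import assumes the lmms_eval package is installed):

from lmms_eval.tasks.air_bench.utils import air_bench_aggregate_results_foundation

results = [
    {"score": 1.0, "task": "Audio_Grounding"},
    {"score": 0.0, "task": "Audio_Grounding"},
    {"score": 1.0, "task": "Music_Instruments"},
]

print(air_bench_aggregate_results_foundation(results))
# {'overall_accuracy': 0.666..., 'categorical_accuracy': {'Audio_Grounding': 0.5, 'Music_Instruments': 1.0}}
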
41 changes: 24 additions & 17 deletions lmms_eval/tasks/alpaca_audio/utils.py
@@ -48,11 +48,8 @@ def doc_to_text(doc, lmms_eval_specific_kwargs):
}

elif API_TYPE == "azure":
API_KEY = os.getenv("AZURE_API_KEY", None)
deployment_name = os.getenv("AZURE_DEPLOYMENT_NAME", None)
resource_name = os.getenv("AZURE_RESOURCE_NAME", None)
api_version = os.getenv("AZURE_API_VERSION", None)
API_URL = f"https://{resource_name}.openai.azure.com/openai/deployments/{deployment_name}/chat/completions?api-version={api_version}"
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
headers = {
"api-key": API_KEY,
"Content-Type": "application/json",
@@ -82,8 +79,11 @@ def doc_to_text(doc, lmms_eval_specific_kwargs):
Explanation: (Provide a concise explanation of your rating, comparing the reference answer with the model's response. "The reference answer is [XXX], while the model's answer is [YYY]. I think ...")
Rating: (int)"""

retries = 3
NUM_SECONDS_TO_SLEEP = 5

def get_eval(max_tokens: int, content: str):

def get_eval(max_tokens: int, content: str, retries: int = retries):
global headers

messages = [
@@ -92,17 +92,24 @@ def get_eval(max_tokens: int, content: str):

payload = {"model": GPT_EVAL_MODEL_NAME, "messages": messages, "temperature": 0.7, "max_tokens": max_tokens, "top_p": 0.95, "frequency_penalty": 0, "presence_penalty": 0, "stop": None}

try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
except Exception as e:
eval_logger.info(f"Attempt failed with error: {e}")
return "", ""
for attempt in range(retries):
try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
break # content was empty; stop retrying and fall through to the final return

except Exception as e:
eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt
time.sleep(NUM_SECONDS_TO_SLEEP)
else: # If this was the last attempt, log and return empty
eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
return "", ""
return "", ""


45 changes: 26 additions & 19 deletions lmms_eval/tasks/clotho_aqa/utils.py
@@ -40,7 +40,7 @@ def clotho_aqa_doc_to_text(doc, lmms_eval_specific_kwargs):

NUM_SECONDS_TO_SLEEP = 2
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")
API_TYPE = os.getenv("API_TYPE", "azure")

if API_TYPE == "openai":
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
@@ -49,6 +49,7 @@ def clotho_aqa_doc_to_text(doc, lmms_eval_specific_kwargs):
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json",
}

elif API_TYPE == "azure":
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
@@ -82,31 +83,37 @@ def clotho_aqa_doc_to_text(doc, lmms_eval_specific_kwargs):
Rating: (int)"""


def get_eval(max_tokens: int, content: str):
retries = 3
NUM_SECONDS_TO_SLEEP = 5


def get_eval(max_tokens: int, content: str, retries: int = retries):
global headers

messages = [
{"role": "user", "content": content},
]

payload = {
"model": GPT_EVAL_MODEL_NAME,
"messages": messages,
"temperature": 0.2,
"max_tokens": max_tokens,
}
payload = {"model": GPT_EVAL_MODEL_NAME, "messages": messages, "temperature": 0.7, "max_tokens": max_tokens, "top_p": 0.95, "frequency_penalty": 0, "presence_penalty": 0, "stop": None}

try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
except Exception as e:
eval_logger.info(f"Attempt failed with error: {e}")
return "", ""
for attempt in range(retries):
try:
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()
response_data = response.json()

content = response_data["choices"][0]["message"]["content"].strip()
if content != "":
return content, response_data["model"]
break # content was empty; stop retrying and fall through to the final return

except Exception as e:
eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt
time.sleep(NUM_SECONDS_TO_SLEEP)
else: # If this was the last attempt, log and return empty
eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
return "", ""
return "", ""


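
Finally, the clotho_aqa judge now defaults API_TYPE to "azure" (per the commit message, Azure is the intended default), so the matching environment variables need to be in place before an evaluation starts. A minimal sketch with placeholder values:

import os

# Set these in the shell or job config before the evaluation process is launched,
# since the task utilities read them at module import time. Values below are placeholders.
os.environ["API_TYPE"] = "azure"
os.environ["AZURE_ENDPOINT"] = "https://<your-resource>.openai.azure.com/..."
os.environ["AZURE_API_KEY"] = "<your-azure-key>"

# Or keep the public OpenAI endpoint:
# os.environ["API_TYPE"] = "openai"
# os.environ["OPENAI_API_URL"] = "https://api.openai.com/v1/chat/completions"
# os.environ["OPENAI_API_KEY"] = "<your-openai-key>"
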