Skip to content

Commit

Permalink
add results for alibaba model
Browse files Browse the repository at this point in the history
  • Loading branch information
semio committed Jan 31, 2024
1 parent b7d5c5e commit 9c24076
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@

raw_results

# double check the numbers
n = raw_results.groupby('question_id')['question'].count()
n.describe() # the count should be the same for all questions
# for the 2024-01 experiment it should be 30: 3 prompts * 5 repetitions * 2 languages


# load AI Eval Spreadsheet
ai_eval_sheet = read_ai_eval_spreadsheet()

Expand All @@ -57,26 +63,48 @@ def suggest_language(q_text):
for _, row in raw_results[['question_id', 'question']].drop_duplicates().iterrows():
q_text = row['question']
q_id = row['question_id']
matched = False
for q, _ in questions:
if q_id == q.question_id:
if q_text.strip() == q.published_version_of_question.strip():
matched = True
q_text_to_q_id_mapping[q_text] = (q.question_id, q.language)
else:
lang = suggest_language(q_text)
if lang == q.language:
matched = True
q_text_to_q_id_mapping[q_text] = (q.question_id, q.language)
print(f"Q{q_id} have different question text.")
print(q_text.strip())
print(q.published_version_of_question.strip())
if matched:
break
else:

if not matched:
lang = suggest_language(q_text)
print(q_id, q_text[:10], '...', 'does not exist, detected lang:', lang)
q_text_to_q_id_mapping[q_text] = (q_id, lang)


# q_text_to_q_id_mapping
# len(q_text_to_q_id_mapping)
len(q_text_to_q_id_mapping)

# double check: compare the numbers of English questions and Chinese questions
en = list(filter(lambda v: v[1] == 'en-US', q_text_to_q_id_mapping.values()))
en_ids = [x[0] for x in en]
cn = list(filter(lambda v: v[1] == 'zh-CN', q_text_to_q_id_mapping.values()))
cn_ids = [x[0] for x in cn]

set(en_ids) - set(cn_ids)
# => {'55'}
# I checked and found the issue: question 55 had been translated, but
# its text in Contentful was somehow still in English.

raw_results[raw_results.question_id == '55']['question']

# I don't think we need to fix that: the language of the question will be en-US,
# and we will have a data point for asking an English question to the Qwen model.


# create a mapping from model_id, parameters -> model_config id
model_configs = get_model_configs(ai_eval_sheet, include_all=False)
Expand Down Expand Up @@ -187,5 +215,3 @@ def get_grade(dictionary):
result_full_df = result_full_df[backup.columns]

ai_eval_sheet.evaluation_results.replace_data(result_full_df)


Binary file not shown.
1 change: 1 addition & 0 deletions automation-api/yival_experiments/output/result_comb.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
model_id,model_params,correctness_with_indecisive,correctness_without_indecisive
gpt-4-0125-preview,{'temperature': 0.01},79.13669064748201,80.0
gpt-4-1106-preview,{'temperature': 0.01},79.49640287769785,79.49640287769785
qwen-max-1201,{'temperature': 0.01},66.90647482014388,72.65625
gemini/gemini-pro,{'temperature': 0.01},54.31654676258992,54.710144927536234
gpt-3.5-turbo-1106,{'temperature': 0.01},38.1294964028777,38.26714801444043
21 changes: 21 additions & 0 deletions automation-api/yival_experiments/output/result_comb_prompt.csv
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ B. {option_b}
C. {option_c}

Answer:",76.61870503597122,76.61870503597122
qwen-max-1201,{'temperature': 0.01},"问题:
{question_text}
是 {option_a},{option_b},还是 {option_c}?
答案:
",64.74820143884892,75.0
gpt-4-0125-preview,{'temperature': 0.01},"Pick the correct answer: A, B or C. If you can't determine the answer please make your best guess.

Question:
Expand All @@ -45,6 +50,22 @@ B. {option_b}
C. {option_c}

Answer:",74.46043165467626,74.46043165467626
qwen-max-1201,{'temperature': 0.01},"请回答以下单选题。如果你不知道哪个答案正确,请回答你认为最接近的答案。
问题:
{question_text}
A. {option_a}
B. {option_b}
C. {option_c}
答案:
",64.38848920863309,72.17741935483872
qwen-max-1201,{'temperature': 0.01},"以下单选题,请你从 A,B,C 中选择一个正确的答案。如果你不知道哪个答案正确,请回答你认为最接近的答案。
问题:
{question_text}
A. {option_a}
B. {option_b}
C. {option_c}
答案:
",62.23021582733813,67.31517509727627
gpt-3.5-turbo-1106,{'temperature': 0.01},"Question:
{question_text}
Is it: {option_a}; {option_b}; or {option_c}?
Expand Down
Binary file modified automation-api/yival_experiments/output/results.xlsx
Binary file not shown.

0 comments on commit 9c24076

Please sign in to comment.