From 1c49fb8713da1f9477a73ff961a054b4aff96700 Mon Sep 17 00:00:00 2001 From: semio Date: Fri, 7 Jun 2024 10:24:19 +0800 Subject: [PATCH] update notebook --- .../notebooks/result_data_analysis.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/automation-api/yival_experiments/notebooks/result_data_analysis.py b/automation-api/yival_experiments/notebooks/result_data_analysis.py index 875cb19..29ba865 100644 --- a/automation-api/yival_experiments/notebooks/result_data_analysis.py +++ b/automation-api/yival_experiments/notebooks/result_data_analysis.py @@ -737,6 +737,53 @@ +# # for double checking the evaluators +# check the top 10, bottom 10 questions per model + +# + magic_args="--save double_check_results" language="sql" +# select +# question_id, +# model_configuration_id, +# (100 - correct_rate) as ai_wrong_percentage, +# human_wrong_percentage, +# ai_wrong_percentage - human_wrong_percentage as diff, +# sdg_topic, +# other_topics +# from model_topic_stat +# -- where diff > 0 +# order by +# "sdg_topic", +# cast(other_topics as varchar), +# "model_configuration_id" + +# + language="sql" +# select * +# from double_check_results +# where model_configuration_id = 'mc026' AND ai_wrong_percentage = 0 +# order by question_id + +# + magic_args="--save double_check_results_1" language="sql" +# select +# model_configuration_id, +# question_id, +# ai_wrong_percentage, +# rank() over (partition by model_configuration_id order by ai_wrong_percentage) as rank +# from double_check_results +# order by model_configuration_id, rank, question_id + +# + magic_args="to_check <<" language="sql" +# +# select * from double_check_results_1 where rank <= 10 OR rank >= 275 +# - + +to_check_df = to_check.DataFrame() + +to_check_df[to_check_df['model_configuration_id'] == 'mc026'] + + + + + # # for climate study questions @@ -814,6 +861,10 @@ +outputs.to_parquet("./data/outputs/latest_results.parquet") + + + # alibaba = %sql select * from outputs where model_id = 'qwen-max-0403' # err = %sql select * from outputs where model_id = 'qwen-max-0403' and raw_output like '%Error%'