Commit
Experiment config and results for gpt-4-0125-preview, gpt-4-1106-preview and gpt-3.5-turbo-1106
Showing 5 changed files with 258 additions and 0 deletions.
186 additions & 0 deletions: ...mation-api/yival_experiments/experiment_configurations/experiment_202401292237_en-US.yaml
@@ -0,0 +1,186 @@
custom_wrappers:
  model_config_wrapper:
    class: ./custom_configuration/model_config_wrapper.ModelConfigWrapper
    config_cls: ./custom_configuration/model_config_wrapper_config.ModelConfigWrapperConfig
custom_variation_generators:
  model_config_generator:
    class: ./custom_configuration/model_config_variation_generator.ModelConfigVariationGenerator
    config_cls: ./custom_configuration/model_config_variation_generator_config.ModelConfigVariationGeneratorConfig
custom_evaluators:
  gpt4_evaluator:
    class: ./custom_configuration/gpt4_evaluator.GPT4Evaluator
    config_cls: ./custom_configuration/gpt4_evaluator_config.GPT4EvaluatorConfig
custom_function: model_compare.model_compare
dataset:
  source_type: dataset
  file_path: data/questions_en-US.csv
  reader: csv_reader
  reader_config:
    expected_result_column: correct_answer
description: Configuration for checking misconception in AI bots.
evaluators:
- evaluator_type: individual
  metric_calculators:
  - method: AVERAGE
  name: gpt4_evaluator
  model_name: gpt-4
  prompt: |-
    You are assessing a submitted answer on a given task based on a criterion. Here is the data:
    - Task: Answer a question.
    - Is the answer correct?
    [Input]: {question_text}
    Possible Options and Correctness of Options:
    A. {option_a} ({option_a_correctness})
    B. {option_b} ({option_b_correctness})
    C. {option_c} ({option_c_correctness})
    [Result]: {raw_output}
    Answer the question by selecting one of the following options:
    A The answer is not relevant to the question at all or indecisive about what the correct answer is.
    B The answer is not correct, and matches/is close to the Very Wrong answer.
    C The answer is not correct, and matches/is close to the Wrong answer.
    D The answer is correct, i.e. matches the Correct answer.
  choices:
  - A
  - B
  - C
  - D
  description: Is the answer correct?
  choice_scores:
    A: 0
    B: 1
    C: 2
    D: 3
  scale_description: 0-3
  display_name: correctness
variations:
- name: model_config
  generator_name: model_config_generator
  generator_config:
    models:
    - vendor: OpenAI
      model_id: gpt-4-1106-preview
      params:
        temperature: 0.01
      round: 1
    - vendor: OpenAI
      model_id: gpt-4-1106-preview
      params:
        temperature: 0.01
      round: 2
    - vendor: OpenAI
      model_id: gpt-4-1106-preview
      params:
        temperature: 0.01
      round: 3
    - vendor: OpenAI
      model_id: gpt-4-1106-preview
      params:
        temperature: 0.01
      round: 4
    - vendor: OpenAI
      model_id: gpt-4-1106-preview
      params:
        temperature: 0.01
      round: 5
    - vendor: OpenAI
      model_id: gpt-3.5-turbo-1106
      params:
        temperature: 0.01
      round: 1
    - vendor: OpenAI
      model_id: gpt-3.5-turbo-1106
      params:
        temperature: 0.01
      round: 2
    - vendor: OpenAI
      model_id: gpt-3.5-turbo-1106
      params:
        temperature: 0.01
      round: 3
    - vendor: OpenAI
      model_id: gpt-3.5-turbo-1106
      params:
        temperature: 0.01
      round: 4
    - vendor: OpenAI
      model_id: gpt-3.5-turbo-1106
      params:
        temperature: 0.01
      round: 5
    - vendor: OpenAI
      model_id: gpt-4-0125-preview
      params:
        temperature: 0.01
      round: 1
    - vendor: OpenAI
      model_id: gpt-4-0125-preview
      params:
        temperature: 0.01
      round: 2
    - vendor: OpenAI
      model_id: gpt-4-0125-preview
      params:
        temperature: 0.01
      round: 3
    - vendor: OpenAI
      model_id: gpt-4-0125-preview
      params:
        temperature: 0.01
      round: 4
    - vendor: OpenAI
      model_id: gpt-4-0125-preview
      params:
        temperature: 0.01
      round: 5
- name: prompt_template
  variations:
  - variation_id: instruct_question_options_1
    value_type: str
    instantiated_value: |-
      Please answer this multiple choices question. If you can't determine the answer please make your best guess:
      Question:
      {question_text}
      A. {option_a}
      B. {option_b}
      C. {option_c}
      Answer:
    value: |-
      Please answer this multiple choices question. If you can't determine the answer please make your best guess:
      Question:
      {question_text}
      A. {option_a}
      B. {option_b}
      C. {option_c}
      Answer:
  - variation_id: instruct_question_options_2
    value_type: str
    instantiated_value: |-
      Pick the correct answer: A, B or C. If you can't determine the answer please make your best guess.
      Question:
      {question_text}
      A. {option_a}
      B. {option_b}
      C. {option_c}
      Answer:
    value: |-
      Pick the correct answer: A, B or C. If you can't determine the answer please make your best guess.
      Question:
      {question_text}
      A. {option_a}
      B. {option_b}
      C. {option_c}
      Answer:
  - variation_id: no_option_letter
    value_type: str
    instantiated_value: "Question:\n{question_text}\nIs it: {option_a}; {option_b};\
      \ or {option_c}? \n\nAnswer:"
    value: "Question:\n{question_text}\nIs it: {option_a}; {option_b}; or {option_c}?\
      \ \n\nAnswer:"
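The custom function `model_compare.model_compare` referenced by this config is not part of the commit, but the config implies its shape: for each row of `data/questions_en-US.csv`, fill one of the `prompt_template` variations with the question fields and query the model described by the `model_config` variation. A minimal sketch of that call, assuming the OpenAI v1 Python client and that the dataset columns match the template placeholders (question_text, option_a, option_b, option_c):

```python
from openai import OpenAI  # assumes the openai>=1.0 client is installed and OPENAI_API_KEY is set

client = OpenAI()

# One of the prompt_template variations from the config above.
PROMPT_TEMPLATE = (
    "Pick the correct answer: A, B or C. If you can't determine the answer "
    "please make your best guess.\nQuestion:\n{question_text}\n"
    "A. {option_a}\nB. {option_b}\nC. {option_c}\nAnswer:"
)

def ask_model(question_row: dict, model_id: str = "gpt-4-0125-preview",
              temperature: float = 0.01) -> str:
    """Fill the template with one dataset row and return the model's raw answer.

    `question_row` is assumed to carry the same field names as the template
    placeholders; the real model_compare implementation may differ.
    """
    prompt = PROMPT_TEMPLATE.format(**question_row)
    response = client.chat.completions.create(
        model=model_id,
        temperature=temperature,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
```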
Binary file added (+3.12 MB): automation-api/yival_experiments/output/experiment_202401292237_en-US_0.pkl
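The .pkl file above is the serialized result object that yival wrote for this run; its internal structure is not visible from the commit. If you want to inspect it, a minimal sketch (unpickling it likely requires the same yival version and the repo's custom classes to be importable):

```python
import pickle

# Assumption: the pickle holds whatever experiment-result object yival saved
# for this run; the exact type depends on the yival version used.
with open(
    "automation-api/yival_experiments/output/experiment_202401292237_en-US_0.pkl",
    "rb",
) as f:
    results = pickle.load(f)

print(type(results))
```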
@@ -1,2 +1,5 @@
model_id,model_params,correctness_with_indecisive,correctness_without_indecisive
gpt-4-0125-preview,{'temperature': 0.01},79.13669064748201,80.0
gpt-4-1106-preview,{'temperature': 0.01},79.49640287769785,79.49640287769785
gemini/gemini-pro,{'temperature': 0.01},54.31654676258992,54.710144927536234
gpt-3.5-turbo-1106,{'temperature': 0.01},38.1294964028777,38.26714801444043
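The aggregation script behind these numbers is not included in this commit, but the evaluator's grading scale above (A = indecisive, B/C = wrong, D = correct) suggests one plausible reading of the two columns: `correctness_with_indecisive` keeps indecisive answers in the denominator, while `correctness_without_indecisive` excludes them. A hypothetical sketch of that calculation, not the repo's actual code:

```python
from collections import Counter

def correctness_rates(grades: list[str]) -> tuple[float, float]:
    """Compute the two correctness percentages from per-answer grades.

    Assumes the evaluator's scale from the config: 'A' = indecisive,
    'B'/'C' = wrong, 'D' = correct.
    """
    counts = Counter(grades)
    correct = counts["D"]
    total = len(grades)
    decisive = total - counts["A"]

    with_indecisive = 100 * correct / total
    without_indecisive = 100 * correct / decisive if decisive else 0.0
    return with_indecisive, without_indecisive

# Example: 10 answers, 7 correct, 1 indecisive, 2 wrong.
print(correctness_rates(["D"] * 7 + ["A"] + ["B", "C"]))  # (70.0, 77.78)
```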