-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathevaluation.py
121 lines (101 loc) · 4.29 KB
/
evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os, json, shutil
from tools import edit_file
import subprocess
from inference import run_ice_prompt, Changes
def criteria_file_exists(changes: dict, test):
    """Score how well the proposed file modes agree with the files on disk.

    Every entry of ``changes["file_to_change"]`` is resolved to
    ``code/<test["source"]>/<file_path>``. An entry is counted as consistent
    when its mode is ``"create"`` and the path is absent, or when its mode is
    anything else and the path is present.

    Returns:
        The fraction of consistent entries rounded to three decimals
        (``0.0`` when there are no entries).
    """
    verdicts = []
    for entry in changes["file_to_change"]:
        relative = entry["file_path"]
        # os.path.join discards earlier components when a later one is
        # absolute, so a single leading "/" is removed first.
        if relative.startswith("/"):
            relative = relative[1:]
        target = os.path.join("code", test["source"], relative)
        must_be_absent = entry["mode"] == "create"
        present = os.path.exists(target)
        verdicts.append(present != must_be_absent)
    # The tiny epsilon keeps the division defined for an empty list.
    return round(sum(verdicts) / (len(verdicts) + 10e-10), 3)
def criteria_change(changes: dict, test):
    """Score whether the proposed edits can be applied to the source tree.

    Every entry of ``changes["file_to_change"]`` is resolved to
    ``code/<test["source"]>/<file_path>`` and checked as follows:

    * ``"change"`` / ``"overwrite"`` on a missing file records one failure.
    * ``"create"`` on an already existing file records one failure.
    * ``"change"`` on an existing file records one result per item of
      ``entry["changes"]``: whether its ``"original_snippet"`` occurs
      verbatim in the file content.

    ``"create"`` and ``"overwrite"`` entries that pass their precondition
    contribute nothing to the score.

    Returns:
        The fraction of successful checks rounded to three decimals
        (``0.0`` when nothing was checked).
    """
    outcomes = []
    for file in changes["file_to_change"]:
        relative = file["file_path"]
        # Strip a leading "/" only when it is there: os.path.join would
        # otherwise discard the "code/<source>" prefix, while slicing
        # unconditionally would cut the first letter off a relative path.
        if relative.startswith("/"):
            relative = relative[1:]
        path = os.path.join("code", test["source"], relative)

        mode = file["mode"]
        present = os.path.exists(path)
        if mode in ("change", "overwrite") and not present:
            outcomes.append(False)
            continue
        if mode == "create" and present:
            outcomes.append(False)
            continue

        if mode == "change":
            with open(path, "r") as fp:
                content = fp.read()
            outcomes.extend(
                change["original_snippet"] in content
                for change in file["changes"]
            )
    # The tiny epsilon keeps the division defined for an empty list.
    return round(sum(outcomes) / (len(outcomes) + 10e-10), 3)
def criteria_execute(changes: dict, test):
    """Apply the changes to a scratch copy and check it with terraform.

    ``code/<test["source"]>`` is copied into a fresh ``temp`` directory
    (any previous ``temp`` is removed first), every entry of
    ``changes["file_to_change"]`` is applied with ``edit_file``, and then
    ``terraform init -backend=false`` followed by ``terraform validate``
    are run inside ``temp``.

    Returns:
        ``1`` when every edit was applied and both terraform commands exit
        successfully, otherwise ``0``.
    """
    workdir = "temp"
    if os.path.exists(workdir):
        shutil.rmtree(workdir)
    shutil.copytree(os.path.join("code", test["source"]), workdir)

    for file in changes["file_to_change"]:
        status_ok = edit_file(workdir, file)
        if not status_ok:
            return 0

    commands = (
        ["terraform", "init", "-backend=false"],
        ["terraform", "validate"],
    )
    try:
        for command in commands:
            # cwd= makes *both* commands run in the scratch copy; a
            # "cd temp" inside one shell string does not carry over to the
            # next subprocess.
            subprocess.run(command, cwd=workdir, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable terraform binary, which
        # a shell would have reported as a non-zero exit status.
        return 0
    return 1
def criteria_ice(changes: dict, test):
    """Combine two ``run_ice_prompt`` ratings of the changes into one score.

    The changes are parsed with ``Changes.parse_obj`` and stringified, then
    rated against ``test["prompt"]`` once for "usefulness" and once for
    "functional correctness".

    Returns:
        The sum of both ratings divided by 8.
    """
    model = "azure/gpt-35-turbo-1106"

    def rate(aspect):
        # The changes are re-parsed for every rating.
        return run_ice_prompt(model, aspect, test["prompt"],
                              str(Changes.parse_obj(changes)))

    usefulness = rate("usefulness")
    correctness = rate("functional correctness")
    # NOTE(review): dividing by 8 presumably normalises two 0-4 ratings into
    # [0, 1] - confirm against run_ice_prompt.
    return (usefulness + correctness) / 8
class Run:
    """Criterion scores of one model output for one test case.

    Attributes set by the constructor:
        ice: result of ``criteria_ice``.
        execute: result of ``criteria_execute``.
        file_exists: result of ``criteria_file_exists``.
        snippet_exists: result of ``criteria_change``.
    """

    def __init__(self, result, test):
        # Insertion order is the evaluation order: ice, execute,
        # file_exists, snippet_exists.
        scorers = {
            "ice": criteria_ice,
            "execute": criteria_execute,
            "file_exists": criteria_file_exists,
            "snippet_exists": criteria_change,
        }
        for attribute, scorer in scorers.items():
            setattr(self, attribute, scorer(result, test))
class Score:
    """Collects the runs of one test case and averages their criteria."""

    def __init__(self, test):
        self.test = test
        # One Run per evaluated model output.
        self.runs = []

    def add_run(self, result, test):
        """Score ``result`` against ``test`` and store the resulting Run."""
        self.runs.append(Run(result, test))

    def _mean(self, attribute):
        """Return the mean of ``attribute`` over all runs.

        Returns ``0.0`` when no run has been added yet, instead of dividing
        by zero.
        """
        if not self.runs:
            return 0.0
        return sum(getattr(run, attribute) for run in self.runs) / len(self.runs)

    def as_dict(self):
        """Return the per-criterion means keyed by their report names."""
        return {"files_exist": self._mean("file_exists"),
                "edit_exists": self._mean("snippet_exists"),
                "execution": self._mean("execute"),
                "ice": self._mean("ice")}

    def __str__(self):
        """Return a multi-line, human-readable report of the means."""
        return (f"---\nfile_exists: {self._mean('file_exists')}\n"
                f"snippet exists: {self._mean('snippet_exists')}\n"
                f"can be executed: {self._mean('execute')}\n"
                f"ICE score: {self._mean('ice')}\n")
def summarize_score(scores: list[Score]):
    """Print the mean of every criterion across the given scores.

    Falsy entries (e.g. ``None``) are skipped. The ``as_dict()`` values of
    the remaining entries are grouped by key, and one
    ``"<criterion>: <mean>"`` line is printed per key, with the mean rounded
    to three decimals. Nothing is returned.
    """
    collected = {}
    for entry in scores:
        if not entry:
            continue
        for name, value in entry.as_dict().items():
            collected.setdefault(name, []).append(value)

    for name, values in collected.items():
        mean = sum(values) / len(values)
        print(f"{name}: {round(mean, 3)}")
def evaluate(test: dict, results: list):
    """Score the outputs recorded for ``test`` in ``results``.

    ``results`` is scanned for the first entry whose ``"task_id"`` equals
    ``test["task_id"]``. Every item of that entry's ``"outputs"`` is added
    to a new ``Score`` as one run, ``score.test`` is printed, and the score
    is returned.

    Returns:
        The populated ``Score``, or ``None`` when no entry of ``results``
        matches the test's task id.
    """
    for r in results:
        # Keep scanning on a mismatch: the matching entry is not
        # necessarily the first one.
        if r["task_id"] != test["task_id"]:
            continue
        score = Score(test)
        for output in r["outputs"]:
            score.add_run(output, test)
        print(score.test)
        return score
    return None