Skip to content

Commit

Permalink
Merge pull request #12 from ShiboSoftwareDev/main
Browse files (browse the repository at this point in the history)
Refactored evalite benchmark
  • Loading branch information
ShiboSoftwareDev authored Dec 19, 2024
2 parents 2561dc5 + c2673fb commit 4f4014d
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 48 deletions.
107 changes: 60 additions & 47 deletions benchmarks-evalite/benchmark.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ import path from "node:path"
import toml from "toml"
import { anthropic } from "../lib/code-runner/anthropic"
import { safeEvaluateCode } from "../lib/code-runner/safe-evaluate-code"
import { askAboutOutput } from "../tests/fixtures/ask-about-output"
import { createPrompt } from "./prompt"
import { evalite } from "evalite"
import { ExactMatch } from "autoevals"
import { CircuitScorer } from "./scorers/circuit-scorer"
import { askAboutOutput } from "tests/fixtures/ask-about-output"

interface Problem {
prompt: string
Expand Down Expand Up @@ -45,48 +45,61 @@ const runAI = async (prompt: string): Promise<string> => {
return (completion as any).content[0]?.text || ""
}

// Register one evalite suite per problem found in problems.toml. Each suite
// asks the model for a circuit once, then turns every question attached to the
// problem into its own datum that is graded with ExactMatch.
const problems = loadProblems(path.join(__dirname, "./problems.toml"))
let problemNumber = 0
problems.forEach((problem) => {
  problemNumber += 1

  evalite(problem.title, {
    // Generate and evaluate the circuit up front so every question of this
    // problem is asked about the same generated code.
    data: async () => {
      const reply = await runAI(problem.prompt)
      const fenced = /```tsx\s*([\s\S]*?)\s*```/.exec(reply)
      const code = fenced ? fenced[1].trim() : ""
      const evaluation = safeEvaluateCode(code, {
        outputType: "board",
        preSuppliedImports: {},
      })
      // A null code marks a circuit that failed evaluation; the task below
      // answers "" for it without querying askAboutOutput.
      const usableCode = evaluation.success ? code : null

      return problem.questions.map(({ text, answer }) => ({
        input: { code: usableCode, question: text },
        expected: answer.toString(),
      }))
    },

    // Ask a single question about the generated circuit and stringify the
    // answer so it can be compared with `expected`.
    task: async (input) => {
      if (!input.code) return ""
      const reply = await askAboutOutput(input.code, input.question)
      return reply.toString()
    },

    // Columns shown in the evalite report for each datum.
    experimental_customColumns: async (result) => [
      { label: "Question", value: result.input.question },
      { label: "Output", value: result.output },
      { label: "Expected", value: result.expected },
    ],

    scorers: [ExactMatch],
  })
})
// Single evalite suite covering every problem in problems.toml. Each problem
// is one datum: the task generates a circuit from the prompt, evaluates it, and
// (only when evaluation succeeds) asks every question about the result.
evalite("Electronics Engineer", {
  // One datum per problem, carrying the prompt and its questions.
  data: () =>
    loadProblems(path.join(__dirname, "./problems.toml")).map(
      ({ prompt, questions }) => ({
        input: { prompt, questions },
      }),
    ),

  task: async (input) => {
    const reply = await runAI(input.prompt)
    // Pull the first ```tsx fenced block out of the model's reply.
    const fenced = /```tsx\s*([\s\S]*?)\s*```/.exec(reply)
    const code = fenced ? fenced[1].trim() : ""
    const evaluation = safeEvaluateCode(code, {
      outputType: "board",
      preSuppliedImports: {},
    })

    const results: { result: boolean; expected: boolean }[] = []

    if (evaluation.success) {
      // Questions are asked one after another, in the order they are listed.
      for (const { text, answer } of input.questions) {
        const result = await askAboutOutput(code, text)
        results.push({ result, expected: answer })
      }
    }

    // A failed evaluation is reported as no results and an empty code string.
    return { results, code: evaluation.success ? code : "" }
  },

  // Columns shown in the evalite report for each datum.
  experimental_customColumns: async (result) => {
    const hasResults = result.output.results.length > 0
    const verdict = hasResults ? "Circuit passed" : "Circuit failed"

    return [
      { label: "Prompt", value: result.input.prompt },
      { label: "Code", value: result.output.code },
      { label: "Result", value: verdict },
    ]
  },

  scorers: [CircuitScorer],
})
42 changes: 42 additions & 0 deletions benchmarks-evalite/scorers/circuit-scorer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import { createScorer } from "evalite"

/**
 * Scores a generated circuit by the fraction of questions whose answer matches
 * the expected answer.
 *
 * Input: the problem prompt and its list of yes/no questions.
 * Output: the per-question results (empty when the circuit could not be
 * evaluated) and the generated code.
 *
 * The score is `matching answers / graded answers`, so it stays within 0..1 for
 * any number of questions. For a four-question problem this equals the
 * previous fixed weighting of 0.25 per correct answer.
 *
 * Metadata is a per-question breakdown, or `{ result: "Circuit failed" }` when
 * there are no results. A missing output scores 0 with no metadata.
 */
export const CircuitScorer = createScorer<
  {
    prompt: string
    questions: {
      text: string
      answer: boolean
    }[]
  },
  {
    results: {
      result: boolean
      expected: boolean
    }[]
    code: string
  }
>({
  name: "circuit_scorer",
  description: "Evaluates circuit code for presence of key components",
  scorer: ({ input, output }) => {
    if (!output) {
      return { score: 0 }
    }

    const graded = output.results.length
    if (graded === 0) {
      // No results means the circuit never evaluated successfully.
      return { score: 0, metadata: { result: "Circuit failed" } }
    }

    const correct = output.results.filter(
      ({ result, expected }) => result === expected,
    ).length

    return {
      // Normalise by the number of graded answers instead of assuming four.
      score: correct / graded,
      metadata: input.questions.map((question, index) => ({
        question: question.text,
        expected: question.answer,
        // Optional chaining guards against results being shorter than the
        // question list.
        result: output.results[index]?.result,
      })),
    }
  },
})
Binary file modified bun.lockb
Binary file not shown.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
"@typescript/ata": "^0.9.7",
"@typescript/vfs": "^1.6.0",
"autoevals": "^0.0.108",
"evalite": "^0.7.0",
"react": "^18.3.1",
"tsup": "^8.3.0",
"vite": "^6.0.3",
Expand All @@ -46,6 +45,7 @@
"debug": "^4.3.7",
"dotenv": "^16.4.7",
"extract-codefence": "^0.0.4",
"evalite": "^0.7.3",
"toml": "^3.0.0"
}
}

0 comments on commit 4f4014d

Please sign in to comment.