diff --git a/benchmarks-evalite/benchmark.eval.ts b/benchmarks-evalite/benchmark.eval.ts
index 1173c6e..3c0245f 100644
--- a/benchmarks-evalite/benchmark.eval.ts
+++ b/benchmarks-evalite/benchmark.eval.ts
@@ -3,10 +3,10 @@ import path from "node:path"
 import toml from "toml"
 import { anthropic } from "../lib/code-runner/anthropic"
 import { safeEvaluateCode } from "../lib/code-runner/safe-evaluate-code"
-import { askAboutOutput } from "../tests/fixtures/ask-about-output"
 import { createPrompt } from "./prompt"
 import { evalite } from "evalite"
-import { ExactMatch } from "autoevals"
+import { CircuitScorer } from "./scorers/circuit-scorer"
+import { askAboutOutput } from "tests/fixtures/ask-about-output"
 
 interface Problem {
   prompt: string
@@ -45,48 +45,61 @@ const runAI = async (prompt: string): Promise => {
   return (completion as any).content[0]?.text || ""
 }
 
-const problems = loadProblems(path.join(__dirname, "./problems.toml"))
-let problemNumber = 0
-for (const problem of problems) {
-  problemNumber++
-  evalite(problem.title, {
-    data: async () => {
-      const aiResponse = await runAI(problem.prompt)
-      const codeMatch = aiResponse.match(/```tsx\s*([\s\S]*?)\s*```/)
-      const code = codeMatch ? codeMatch[1].trim() : ""
-      const evaluation = safeEvaluateCode(code, {
-        outputType: "board",
-        preSuppliedImports: {},
-      })
-      return problem.questions.map((question) => ({
-        input: {
-          code: evaluation.success ? code : null,
-          question: question.text,
-        },
-        expected: question.answer.toString(),
-      }))
-    },
-    task: async (input) => {
-      if (!input.code) return ""
-      const answer = await askAboutOutput(input.code, input.question)
-      return answer.toString()
-    },
-    experimental_customColumns: async (result) => {
-      return [
-        {
-          label: "Question",
-          value: result.input.question,
-        },
-        {
-          label: "Output",
-          value: result.output,
-        },
-        {
-          label: "Expected",
-          value: result.expected,
-        },
-      ]
-    },
-    scorers: [ExactMatch],
-  })
-}
+evalite("Electronics Engineer", {
+  data: () => {
+    const problems = loadProblems(path.join(__dirname, "./problems.toml"))
+
+    return problems.map((problem) => ({
+      input: {
+        prompt: problem.prompt,
+        questions: problem.questions,
+      },
+    }))
+  },
+  task: async (input) => {
+    const aiResponse = await runAI(input.prompt)
+    const codeMatch = aiResponse.match(/```tsx\s*([\s\S]*?)\s*```/)
+    const code = codeMatch ? codeMatch[1].trim() : ""
+    const evaluation = safeEvaluateCode(code, {
+      outputType: "board",
+      preSuppliedImports: {},
+    })
+
+    const output: {
+      results: { result: boolean; expected: boolean }[]
+      code: string
+    } = { results: [], code: "" }
+
+    if (evaluation.success) {
+      output.code = code
+      for (const question of input.questions) {
+        output.results.push({
+          result: await askAboutOutput(code, question.text),
+          expected: question.answer,
+        })
+      }
+    }
+
+    return output
+  },
+  experimental_customColumns: async (result) => {
+    return [
+      {
+        label: "Prompt",
+        value: result.input.prompt,
+      },
+      {
+        label: "Code",
+        value: result.output.code,
+      },
+      {
+        label: "Result",
+        value:
+          result.output.results.length > 0
+            ? "Circuit passed"
+            : "Circuit failed",
+      },
+    ]
+  },
+  scorers: [CircuitScorer],
+})
diff --git a/benchmarks-evalite/scorers/circuit-scorer.ts b/benchmarks-evalite/scorers/circuit-scorer.ts
new file mode 100644
index 0000000..cc62e54
--- /dev/null
+++ b/benchmarks-evalite/scorers/circuit-scorer.ts
@@ -0,0 +1,54 @@
+import { createScorer } from "evalite"
+
+/**
+ * Scores one benchmark problem as the fraction of its questions whose
+ * observed answer matches the expected answer.
+ *
+ * An empty `output.results` means the generated code did not evaluate
+ * successfully; that case scores 0 and is labelled "Circuit failed".
+ */
+export const CircuitScorer = createScorer<
+  {
+    prompt: string
+    questions: {
+      text: string
+      answer: boolean
+    }[]
+  },
+  {
+    results: {
+      result: boolean
+      expected: boolean
+    }[]
+    code: string
+  }
+>({
+  name: "circuit_scorer",
+  description: "Evaluates circuit code for presence of key components",
+  scorer: ({ input, output }) => {
+    if (!output) {
+      return { score: 0 }
+    }
+
+    const matched = output.results.filter(
+      ({ result, expected }) => result === expected,
+    ).length
+    // Normalise by the question count rather than a fixed per-question
+    // weight, so a fully correct problem scores 1 whatever its size.
+    const total = input.questions.length
+    const score = total > 0 ? matched / total : 0
+
+    return {
+      score,
+      metadata:
+        output.results.length > 0
+          ? input.questions.map((question, index) => ({
+              question: question.text,
+              expected: question.answer,
+              // Tolerate a results array shorter than the question list.
+              result: output.results[index]?.result,
+            }))
+          : { result: "Circuit failed" },
+    }
+  },
+})
diff --git a/bun.lockb b/bun.lockb
index 513592a..5fa4c1d 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/package.json b/package.json
index 8de41ef..744852f 100644
--- a/package.json
+++ b/package.json
@@ -31,7 +31,6 @@
     "@typescript/ata": "^0.9.7",
     "@typescript/vfs": "^1.6.0",
     "autoevals": "^0.0.108",
-    "evalite": "^0.7.0",
     "react": "^18.3.1",
     "tsup": "^8.3.0",
     "vite": "^6.0.3",
@@ -46,6 +45,7 @@
     "debug": "^4.3.7",
     "dotenv": "^16.4.7",
     "extract-codefence": "^0.0.4",
+    "evalite": "^0.7.3",
     "toml": "^3.0.0"
   }
 }