From ed22fa8b7c92f588905aa7272f6b46109f313683 Mon Sep 17 00:00:00 2001
From: Shibo
Date: Mon, 3 Feb 2025 18:25:38 +0200
Subject: [PATCH] random circuit generation

---
 ...benchmark-error-correction.eval.paused.ts} |   0
 ...xt => prompt-2025-02-03T16-06-49-506Z.txt} |   0
 ...xt => prompt-2025-02-03T16-08-04-961Z.txt} |   0
 ...xt => prompt-2025-02-03T16-15-17-255Z.txt} |   0
 ...xt => prompt-2025-02-03T16-17-53-563Z.txt} |   0
 ...xt => prompt-2025-02-03T16-18-53-697Z.txt} |   0
 ...xt => prompt-2025-02-03T16-19-44-193Z.txt} |   0
 ...xt => prompt-2025-02-03T16-22-07-143Z.txt} |   0
 .../random-circuit-generation.eval.ts         | 293 ++++++++++++++++++
 .../scorers/ai-circuit-scorer.ts              |  73 +++++
 10 files changed, 366 insertions(+)
 rename benchmarks-evalite/{benchmark-error-correction.eval.ts => benchmark-error-correction.eval.paused.ts} (100%)
 rename benchmarks-evalite/prompts/{prompt-2025-01-31T13-56-51-229Z.txt => prompt-2025-02-03T16-06-49-506Z.txt} (100%)
 rename benchmarks-evalite/prompts/{prompt-2025-01-31T13-58-31-642Z.txt => prompt-2025-02-03T16-08-04-961Z.txt} (100%)
 rename benchmarks-evalite/prompts/{prompt-2025-01-31T14-00-00-488Z.txt => prompt-2025-02-03T16-15-17-255Z.txt} (100%)
 rename benchmarks-evalite/prompts/{prompt-2025-01-31T14-00-33-482Z.txt => prompt-2025-02-03T16-17-53-563Z.txt} (100%)
 rename benchmarks-evalite/prompts/{prompt-2025-01-31T14-01-40-998Z.txt => prompt-2025-02-03T16-18-53-697Z.txt} (100%)
 rename benchmarks-evalite/prompts/{prompt-2025-01-31T14-10-12-840Z.txt => prompt-2025-02-03T16-19-44-193Z.txt} (100%)
 rename benchmarks-evalite/prompts/{prompt-2025-01-31T14-15-20-608Z.txt => prompt-2025-02-03T16-22-07-143Z.txt} (100%)
 create mode 100644 benchmarks-evalite/random-circuit-generation.eval.ts
 create mode 100644 benchmarks-evalite/scorers/ai-circuit-scorer.ts

diff --git a/benchmarks-evalite/benchmark-error-correction.eval.ts b/benchmarks-evalite/benchmark-error-correction.eval.paused.ts
similarity index 100%
rename from benchmarks-evalite/benchmark-error-correction.eval.ts
rename to benchmarks-evalite/benchmark-error-correction.eval.paused.ts
diff --git a/benchmarks-evalite/prompts/prompt-2025-01-31T13-56-51-229Z.txt b/benchmarks-evalite/prompts/prompt-2025-02-03T16-06-49-506Z.txt
similarity index 100%
rename from benchmarks-evalite/prompts/prompt-2025-01-31T13-56-51-229Z.txt
rename to benchmarks-evalite/prompts/prompt-2025-02-03T16-06-49-506Z.txt
diff --git a/benchmarks-evalite/prompts/prompt-2025-01-31T13-58-31-642Z.txt b/benchmarks-evalite/prompts/prompt-2025-02-03T16-08-04-961Z.txt
similarity index 100%
rename from benchmarks-evalite/prompts/prompt-2025-01-31T13-58-31-642Z.txt
rename to benchmarks-evalite/prompts/prompt-2025-02-03T16-08-04-961Z.txt
diff --git a/benchmarks-evalite/prompts/prompt-2025-01-31T14-00-00-488Z.txt b/benchmarks-evalite/prompts/prompt-2025-02-03T16-15-17-255Z.txt
similarity index 100%
rename from benchmarks-evalite/prompts/prompt-2025-01-31T14-00-00-488Z.txt
rename to benchmarks-evalite/prompts/prompt-2025-02-03T16-15-17-255Z.txt
diff --git a/benchmarks-evalite/prompts/prompt-2025-01-31T14-00-33-482Z.txt b/benchmarks-evalite/prompts/prompt-2025-02-03T16-17-53-563Z.txt
similarity index 100%
rename from benchmarks-evalite/prompts/prompt-2025-01-31T14-00-33-482Z.txt
rename to benchmarks-evalite/prompts/prompt-2025-02-03T16-17-53-563Z.txt
diff --git a/benchmarks-evalite/prompts/prompt-2025-01-31T14-01-40-998Z.txt b/benchmarks-evalite/prompts/prompt-2025-02-03T16-18-53-697Z.txt
similarity index 100%
rename from benchmarks-evalite/prompts/prompt-2025-01-31T14-01-40-998Z.txt
rename to benchmarks-evalite/prompts/prompt-2025-02-03T16-18-53-697Z.txt
diff --git a/benchmarks-evalite/prompts/prompt-2025-01-31T14-10-12-840Z.txt b/benchmarks-evalite/prompts/prompt-2025-02-03T16-19-44-193Z.txt
similarity index 100%
rename from benchmarks-evalite/prompts/prompt-2025-01-31T14-10-12-840Z.txt
rename to benchmarks-evalite/prompts/prompt-2025-02-03T16-19-44-193Z.txt
diff --git a/benchmarks-evalite/prompts/prompt-2025-01-31T14-15-20-608Z.txt b/benchmarks-evalite/prompts/prompt-2025-02-03T16-22-07-143Z.txt
similarity index 100%
rename from benchmarks-evalite/prompts/prompt-2025-01-31T14-15-20-608Z.txt
rename to benchmarks-evalite/prompts/prompt-2025-02-03T16-22-07-143Z.txt
diff --git a/benchmarks-evalite/random-circuit-generation.eval.ts b/benchmarks-evalite/random-circuit-generation.eval.ts
new file mode 100644
index 0000000..2ec4672
--- /dev/null
+++ b/benchmarks-evalite/random-circuit-generation.eval.ts
@@ -0,0 +1,293 @@
+import fs, { readdirSync, rmSync } from "node:fs"
+import path from "node:path"
+import toml from "toml"
+import { anthropic } from "../lib/code-runner/anthropic"
+import { safeEvaluateCode } from "../lib/code-runner/safe-evaluate-code"
+import { createPrompt } from "./prompt"
+import { evalite } from "evalite"
+import { CircuitScorer } from "./scorers/circuit-scorer"
+import { askAboutOutput } from "tests/fixtures/ask-about-output"
+import { AiCircuitScorer } from "./scorers/ai-circuit-scorer"
+
+const cleanupLogDirectory = () => {
+  const logsDir = path.join(__dirname, "./attempt-logs")
+  if (fs.existsSync(logsDir)) {
+    rmSync(logsDir, { recursive: true, force: true })
+  }
+  fs.mkdirSync(logsDir, { recursive: true })
+}
+
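+// Ask the model for ten circuit-design prompts and parse them out of the
+// numbered list it returns; lines that do not start with "1.", "2.", ...
+// are discarded.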
+const generatePrompts = async (): Promise<string[]> => {
+  const completion = await anthropic.messages.create({
+    model: "claude-3-5-haiku-20241022",
+    max_tokens: 2048,
+    messages: [
+      {
+        role: "user",
+        content:
+          "Generate 10 different prompts for creating electronic circuits. Each prompt should describe a unique circuit with specific requirements and constraints. Return the prompts as a numbered list.",
+      },
+    ],
+  })
+
+  const response = (completion as any).content[0]?.text || ""
+  return response
+    .split("\n")
+    .filter((line: string) => /^\d+\./.test(line))
+    .map((line: string) => line.replace(/^\d+\.\s*/, "").trim())
+    .slice(0, 10)
+}
+
+const saveAttemptLog = (
+  fileName: string,
+  prompt: string,
+  code: string,
+  error: string,
+) => {
+  const logsDir = path.join(__dirname, "./attempt-logs")
+  if (!fs.existsSync(logsDir)) {
+    fs.mkdirSync(logsDir, { recursive: true })
+  }
+
+  const content = `# Attempt Log
+
+## Prompt
+${prompt}
+
+## Error
+\`\`\`
+${error}
+\`\`\`
+
+## Code
+\`\`\`tsx
+${code}
+\`\`\`
+`
+
+  fs.writeFileSync(path.join(logsDir, fileName), content)
+}
+
+const savePrompt = (prompt: string, fileName: string) => {
+  const promptsDir = path.join(__dirname, "./prompts")
+
+  if (!fs.existsSync(promptsDir)) {
+    fs.mkdirSync(promptsDir, { recursive: true })
+  }
+
+  const files = readdirSync(promptsDir)
+    .filter((f) => f.startsWith("prompt-"))
+    .sort()
+
+  if (files.length >= 10) {
+    fs.unlinkSync(path.join(promptsDir, files[0]))
+  }
+
+  fs.writeFileSync(path.join(promptsDir, fileName), prompt)
+}
+
+interface Problem {
+  prompt: string
+  title: string
+  questions: { text: string; answer: boolean }[]
+}
+
+let systemPrompt = ""
+let promptNumber = 0
+
+const loadProblems = (filePath: string): Problem[] => {
+  const tomlContent = fs.readFileSync(filePath, "utf-8")
+  const parsedToml = toml.parse(tomlContent)
+
+  return parsedToml.problems.map((problem: any) => ({
+    prompt: problem.prompt,
+    title: problem.title,
+    questions: problem.questions.map((q: any) => ({
+      text: q.text,
+      answer: q.answer,
+    })),
+  }))
+}
+
+interface AttemptHistory {
+  code: string
+  error: string
+}
+
+const runAI = async ({
+  prompt,
+  previousAttempts,
+}: {
+  prompt: string
+  previousAttempts?: AttemptHistory[]
+}): Promise<string> => {
+  const messages: { role: "assistant" | "user"; content: string }[] = [
+    { role: "user", content: prompt },
+  ]
+
+  if (previousAttempts?.length) {
+    messages.push({
+      role: "user",
+      content: "Previous attempts failed. Here are the details:",
+    })
+
+    previousAttempts.forEach((attempt, index) => {
+      messages.push(
+        { role: "assistant", content: attempt.code },
+        {
+          role: "user",
+          content: `Attempt ${index + 1} error: ${attempt.error}`,
+        },
+      )
+    })
+
+    messages.push({
+      role: "user",
+      content:
+        "Please provide a new solution that addresses these errors. Avoid approaches that led to previous failures.",
+    })
+  }
+
+  let result = ""
+
+  try {
+    const completion = await anthropic.messages.create({
+      model: "claude-3-5-haiku-20241022",
+      max_tokens: 2048,
+      system: systemPrompt,
+      messages: messages,
+    })
+    result = (completion as any).content[0]?.text
+  } catch {
+    result = "Error in AI API request"
+  }
+
+  return result
+}
+
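+// Generate-evaluate-retry loop: extract the tsx code block from the model's
+// reply, evaluate it as a board, and on failure feed the error back to the
+// model for another try (at most four attempts per prompt).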
+const errorCorrection = async ({
+  attempts = 0,
+  prompt,
+  promptNumber,
+  previousAttempts = [],
+}: {
+  attempts?: number
+  prompt: string
+  promptNumber: number
+  previousAttempts?: AttemptHistory[]
+}): Promise<{
+  code: string
+  codeBlock: string
+  error: string
+}> => {
+  const aiResponse = await runAI({ prompt, previousAttempts })
+  const codeMatch = aiResponse.match(/```tsx\s*([\s\S]*?)\s*```/)
+  const code = codeMatch ? codeMatch[1].trim() : ""
+  const codeBlockMatch = aiResponse.match(/```tsx[\s\S]*?```/)
+  const codeBlock = codeBlockMatch ? codeBlockMatch[0] : ""
+  const evaluation = safeEvaluateCode(code, {
+    outputType: "board",
+    preSuppliedImports: {},
+  })
+
+  if (evaluation.success) {
+    return { code, codeBlock, error: "" }
+  }
+
+  const error = evaluation.error || ""
+  attempts++
+  previousAttempts.push({ code, error })
+  saveAttemptLog(
+    `prompt-${promptNumber}-attempt-${attempts}.md`,
+    prompt,
+    code,
+    error,
+  )
+
+  if (attempts > 3) {
+    return {
+      code,
+      codeBlock,
+      error: previousAttempts[previousAttempts.length - 1].error || "",
+    }
+  }
+  return await errorCorrection({
+    attempts,
+    prompt,
+    promptNumber,
+    previousAttempts,
+  })
+}
+
+evalite("Electronics Engineer Making Random Circuits", {
+  data: async () => {
+    cleanupLogDirectory()
+    const problems = await generatePrompts()
+    systemPrompt = await createPrompt()
+
+    const timestamp = new Date().toISOString().replace(/[:.]/g, "-")
+    const promptFileName = `prompt-${timestamp}.txt`
+    savePrompt(systemPrompt, promptFileName)
+
+    return problems.map((problem) => ({
+      input: {
+        prompt: problem,
+        promptFileName,
+      },
+    }))
+  },
+  task: async (input) => {
+    const { code, codeBlock, error } = await errorCorrection({
+      prompt: input.prompt,
+      promptNumber: ++promptNumber,
+    })
+
+    const output: {
+      code: string
+      codeBlock: string
+    } = { code: "", codeBlock: "" }
+
+    if (!error) {
+      output.code = code
+      output.codeBlock = codeBlock
+      return output
+    }
+
+    return `${error}. Code:\n${codeBlock}`
+  },
+  experimental_customColumns: async (result) => {
+    if (typeof result.output === "string")
+      return [
+        {
+          label: "Prompt",
+          value: result.input.prompt,
+        },
+        {
+          label: "Code",
+          value: result.output,
+        },
+        {
+          label: "Result",
+          value: "Circuit failed",
+        },
+      ]
+    return [
+      {
+        label: "Prompt",
+        value: result.input.prompt,
+      },
+      {
+        label: "Code",
+        value: result.output.codeBlock,
+      },
+      {
+        label: "Result",
+        value:
+          !result.output || typeof result.output === "string"
+            ? "Circuit failed"
+            : "Circuit passed",
+      },
+    ]
+  },
+  scorers: [AiCircuitScorer],
+})
diff --git a/benchmarks-evalite/scorers/ai-circuit-scorer.ts b/benchmarks-evalite/scorers/ai-circuit-scorer.ts
new file mode 100644
index 0000000..a7db63d
--- /dev/null
+++ b/benchmarks-evalite/scorers/ai-circuit-scorer.ts
@@ -0,0 +1,73 @@
+import { createScorer } from "evalite"
+import { anthropic } from "lib/code-runner/anthropic"
+
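+// Ask the model to grade the generated circuit; the reply is parsed as a
+// number and clamped to [0, 1], and any API failure scores 0.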
+const getAiScore = async (prompt: string, code: string): Promise<number> => {
+  const scoringPrompt = `You are an electronics expert. Please evaluate this circuit code and give it a score from 0 to 1 based on:
+  - Correctness of implementation
+  - Proper use of components
+  - Circuit complexity
+  - Code quality
+  Return only a number between 0 and 1,
+  where 0 means the circuit is very bad and 1 means it is perfect.
+
+  Original prompt: ${prompt}
+
+  Circuit code:
+  ${code}`
+
+  try {
+    const completion = await anthropic.messages.create({
+      model: "claude-3-5-haiku-20241022",
+      max_tokens: 1024,
+      messages: [
+        {
+          role: "user",
+          content: scoringPrompt,
+        },
+      ],
+    })
+
+    const scoreText = (completion as any).content[0]?.text || "0"
+    const result = Math.min(1, Math.max(0, parseFloat(scoreText) || 0))
+
+    return result
+  } catch (error) {
+    return 0
+  }
+}
+
+export const AiCircuitScorer = createScorer<
+  {
+    prompt: string
+    promptFileName: string
+  },
+  | {
+      code: string
+      codeBlock: string
+    }
+  | string
+>({
+  name: "ai_circuit_scorer",
+  description: "Uses an AI model to score the generated circuit code",
+  scorer: async ({ input, output }) => {
+    if (!output) {
+      return { score: 0, metadata: { promptFileName: input.promptFileName } }
+    }
+    if (typeof output === "string")
+      return { score: 0, metadata: { promptFileName: input.promptFileName } }
+
+    let score = 0
+    try {
+      score = await getAiScore(input.prompt, output.code)
+    } catch (error) {
+      console.error(error)
+    }
+
+    return {
+      score,
+      metadata: {
+        promptFileName: input.promptFileName,
+      },
+    }
+  },
+})