From 6afad20bc58a99e9f3fe0a76ff6b7642471d63a7 Mon Sep 17 00:00:00 2001 From: William Guss Date: Fri, 4 Oct 2024 16:57:33 -0700 Subject: [PATCH] eval update --- examples/eval.py | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index 8d5d4df4b..f2be16e81 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -164,43 +164,3 @@ def summarizer(text: str): print("Mean length of completions:", np.mean(result.scores[:, 1])) - - -""" -UX/IMPL TODOs -- [ ] Database Schemas based on the evalsandmetrics.md -- [ ] View an eval -- [ ] View different runs of an eval -- [ ] Somehow show the source for various different evaluations and have the ability to grab evals by name -- [ ] Clarify whether or not we should show the evals in the computation graph on ell studio -- [ ] Show the actual scores for a given input on ell studio as opposed to just the mean -- [ ] Easy comparison across many models -- [ ] Easy to change parameters of individual models in a chain -- [ ] UX for showing the model is different -- [ ] UX for api params -- [ ] Working verbose mode for @function -- [ ] Fix ell.function in general -- [ ] Support failure modes in metric computation -- [ ] Implement parsers/structured outputs to make this cleaner -- [ ] Group runs more cleanly so that they are a part of an eval in the invocation view -- [ ] Full UX for comparing different evals across any arbitrary axis -- [ ] Arbitrary support for failure mode in lmp invocations -- [ ] Clarity into why a currently running invocation is working or not - -Next Step TODOS -- [ ] Implement a bunch of standard criteria -- [ ] Dataset construction needs to be easy and there should be libraries around this, also matching parity with OpenAI evals - -""" - - - - - -""" -There are two components of eval creaiton: -1. Does the eval align with human intuition about what hte score should be? (Prompt engineering the criterion) -2. Prompt enigneering the result. - -We need a clean way of grouping runs in ell studio so it's clear that they are a part of an eval in the invocation view -""" \ No newline at end of file