From ae2566fe4b5e22986d8c9013dd34c0f5f4dd6b9d Mon Sep 17 00:00:00 2001 From: Vasanth kumar Kalaiselvan Date: Tue, 24 Sep 2024 19:48:36 +0530 Subject: [PATCH 1/5] Rearrange the pip version and updated the simple test case --- pyproject.toml | 6 +++--- tests/test_eval.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 89adcf9..f8b65c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] -name = "langval" -version = "0.1.0" -description = "langval is a language model evaluation tool for evaluating the toxicity, accuracy, hallucination, and bias of language models." +name = "langeval" +version = "0.0.1" +description = "LangEval is a language model evaluation tool for evaluating the toxicity, accuracy, hallucination, and bias of language models." authors = [ { name = "Vasanth Kumar", email = "itsparser@gmail.com" }, { name = "Adheeban", email = "iamadhee@gmail.com" }, diff --git a/tests/test_eval.py b/tests/test_eval.py index 701a1e1..6f3bbae 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -1,7 +1,6 @@ from unittest import TestCase from langchain_openai import ChatOpenAI -from openai import api_key from langval.eval.langchain import LangchainEval from langval.model import Validation From 7a4a288ad66b9f1a0869f8e8a857aa1a21a34d05 Mon Sep 17 00:00:00 2001 From: Vasanth kumar Kalaiselvan Date: Tue, 24 Sep 2024 19:52:31 +0530 Subject: [PATCH 2/5] Added new Env variable --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b623c61..bdb8d11 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ __pycache__/ *.py[cod] *$py.class +**.env** + # C extensions *.so From d6ad77c5f156e68089dcb4fb39dfdf4f32d7b555 Mon Sep 17 00:00:00 2001 From: Vasanth kumar Kalaiselvan Date: Tue, 24 Sep 2024 20:49:20 +0530 Subject: [PATCH 3/5] Added ruff config and lint catch in pull request --- .github/workflows/lint.yml | 8 ++++++++ pyproject.toml | 14 -------------- 2 files changed, 8 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..c47f994 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,8 @@ +name: lint +on: [ push, pull_request ] +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: chartboost/ruff-action@v1 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index f8b65c0..ec33ff2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,17 +31,3 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] sources = ["src"] - - -[tool.ruff.lint] -select = ["F", "I"] -ignore = ["F401"] - -[tool.ruff] -line-length = 100 - -[tool.ruff.format] -quote-style = "single" -indent-style = "tab" -docstring-code-format = true -docstring-code-line-length = 20 From c978a03ffad094c9011520af8ea68d3835d30da1 Mon Sep 17 00:00:00 2001 From: Vasanth kumar Kalaiselvan Date: Tue, 24 Sep 2024 20:50:11 +0530 Subject: [PATCH 4/5] Added ruff config and lint catch in pull request --- ruff.toml | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 ruff.toml diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..fd1cf83 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,78 @@ +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + +# Same as Black. +line-length = 88 +indent-width = 4 + +# Assume Python 3.8 +target-version = "py38" + +[lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F"] +ignore = [] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = false + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = "dynamic" + From 5f1f5db4dcff40a105a0e8bea0065eac611b7e39 Mon Sep 17 00:00:00 2001 From: Vasanth kumar Kalaiselvan Date: Tue, 24 Sep 2024 20:51:55 +0530 Subject: [PATCH 5/5] reformated the src --- src/langval/error/__init__.py | 16 ++--- src/langval/eval/base.py | 126 ++++++++++++++++++---------------- src/langval/eval/langchain.py | 102 +++++++++++++-------------- src/langval/model/__init__.py | 87 ++++++++++++----------- src/langval/tools/__init__.py | 99 +++++++++++++------------- src/langval/utils.py | 26 +++---- tests/test_eval.py | 12 ++-- 7 files changed, 241 insertions(+), 227 deletions(-) diff --git a/src/langval/error/__init__.py b/src/langval/error/__init__.py index 2473c24..06d7359 100644 --- a/src/langval/error/__init__.py +++ b/src/langval/error/__init__.py @@ -1,14 +1,14 @@ class BaseLangvalError(Exception): - """ - Base class for all langval errors. - """ + """ + Base class for all langval errors. + """ - pass + pass class EvalThreshold(BaseLangvalError): - def __init__(self, breached_value: dict): - self.breached_value = breached_value + def __init__(self, breached_value: dict): + self.breached_value = breached_value - def __str__(self): - return f'Validation failed. {self.breached_value}' + def __str__(self): + return f"Validation failed. {self.breached_value}" diff --git a/src/langval/eval/base.py b/src/langval/eval/base.py index c2b6774..1e824d4 100644 --- a/src/langval/eval/base.py +++ b/src/langval/eval/base.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod -from typing import Any, override +from typing import Any -from langchain_core.prompts import ChatPromptTemplate from pydantic import BaseModel from langval.model import EvalMetric, ModuleModel, Validation @@ -10,72 +9,79 @@ class BaseEval(ABC): - """ - Base class for all evaluations in langval - """ + """ + Base class for all evaluations in langval + """ - registry = {} - TOOLS = [arithemetic, comparison] + registry = {} + TOOLS = [arithemetic, comparison] - def __init__(self, *args, **kwargs): - self.validation = kwargs.get('validation') or Validation() + def __init__(self, *args, **kwargs): + self.validation = kwargs.get("validation") or Validation() - @abstractmethod - def eval( - self, answer: Any, question: Any = None, expected_answer: Any = None - ) -> dict | BaseModel: - """ - Evaluates the model, Need to been override in all subclasses - Args: - answer (Any): The answer to evaluate. - question (Any, optional): The question to evaluate. Defaults to None. - expected_answer (Any, optional): The expected answer. Defaults to None. + @abstractmethod + def eval( + self, answer: Any, question: Any = None, expected_answer: Any = None + ) -> dict | BaseModel: + """ + Evaluates the model, Need to been override in all subclasses + Args: + answer (Any): The answer to evaluate. + question (Any, optional): The question to evaluate. Defaults to None. + expected_answer (Any, optional): The expected answer. Defaults to None. - Returns: - dict | BaseModel: The evaluation result. - """ - pass + Returns: + dict | BaseModel: The evaluation result. + """ + pass - def compare(self, metric: EvalMetric, validation: Validation = None): - """ - Compares the metric with the validation - Args: - metric (EvalMetric): The metric to compare - validation (Validation, optional): The validation to compare with. Defaults to None. + def compare(self, metric: EvalMetric, validation: Validation = None): + """ + Compares the metric with the validation + Args: + metric (EvalMetric): The metric to compare + validation (Validation, optional): The validation to compare with. Defaults to None. - Returns: - bool: True if the metric is equal to the validation, False otherwise. - """ - if not validation: - validation = self.validation - result, exact_match = validation.compare(metric) - return result, exact_match + Returns: + bool: True if the metric is equal to the validation, False otherwise. + """ + if not validation: + validation = self.validation + result, exact_match = validation.compare(metric) + return result, exact_match - @classmethod - def validate(cls, toxicity: float, accuracy: float, hallucination: float, bias: float): - def decorator(_mod): - _type = check_type(_mod) - mod = cls.registry.setdefault(_type, {}) - mod[_mod.__name__] = ModuleModel( - name=_mod.__name__, - type=_type, - metrics=Validation( - toxicity=toxicity, accuracy=accuracy, hallucination=hallucination, bias=bias - ), - ) + @classmethod + def validate( + cls, toxicity: float, accuracy: float, hallucination: float, bias: float + ): + def decorator(_mod): + _type = check_type(_mod) + mod = cls.registry.setdefault(_type, {}) + mod[_mod.__name__] = ModuleModel( + name=_mod.__name__, + type=_type, + metrics=Validation( + toxicity=toxicity, + accuracy=accuracy, + hallucination=hallucination, + bias=bias, + ), + ) - return decorator + return decorator - def question(cls, q: str = None): - def decorator(func): - def wrapper_func(*args, **kwargs): - expected_answer = func(*args, **kwargs) - model = args[0].model - result = model.invoke(q) - result = cls.eval(question=q, expected_answer=expected_answer, answer=result) - print(f'after request {q}\n{result}') - return result + def question(cls, q: str = None): + def decorator(func): + def wrapper_func(*args, **kwargs): + expected_answer = func(*args, **kwargs) + model = args[0].model + result = model.invoke(q) + result = cls.eval( + question=q, expected_answer=expected_answer, answer=result + ) + print(f"after request {q}\n{result}") + return result - return wrapper_func + return wrapper_func - return decorator + return decorator diff --git a/src/langval/eval/langchain.py b/src/langval/eval/langchain.py index 2160683..a45f302 100644 --- a/src/langval/eval/langchain.py +++ b/src/langval/eval/langchain.py @@ -2,9 +2,7 @@ from typing import Any from langchain_core.language_models import BaseChatModel -from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate -from langgraph.prebuilt import create_react_agent -from pydantic import BaseModel +from langchain_core.prompts import PromptTemplate from langval.error import EvalThreshold from langval.eval.base import BaseEval @@ -13,51 +11,53 @@ class LangchainEval(BaseEval): - """ - Langchain evaluation class for evaluating the toxicity, accuracy, hallucination, and bias of language models. - """ - - def __init__(self, llm: BaseChatModel, *, validation: Validation = None): - super().__init__(validation=validation) - self.llm = llm - - @property - def node(self): - prompt = PromptTemplate.from_template(LANGCHAIN_SYSTEM_PROMPT) - llm = prompt | self.llm.with_structured_output(EvalMetric) - return llm - - def eval( - self, - answer: Any, - question: Any = None, - expected_answer: Any = None, - validation: Validation = None, - ) -> dict[str, Any]: - """ - Evaluates the toxicity, accuracy, hallucination, and bias of a language model. - - Args: - question (Any): The question to evaluate. - answer (Any): The answer to evaluate. - expected_answer (Any, optional): The expected answer. Defaults to None. - validation (Validation, optional): The validation to evaluate the model. Defaults to None. - - Returns: - EvalMetric: The evaluation metric. - """ - if not validation: - validation = self.validation - if question: - question = f'question -->\n {question}\n' - if expected_answer: - expected_answer = f'Expected answer -->\n {expected_answer}\n' - validation_result: dict | EvalMetric = self.node.invoke( - {'question': question, 'answer': answer, 'expected_answer': expected_answer} - ) - result, exact_match = validation.compare(validation_result) - if exact_match: - logging.warning(f'Following exact match found to be in Meet Expectation: {exact_match}') - if result: - raise EvalThreshold(result) - return {'score': validation_result, 'result': result} + """ + Langchain evaluation class for evaluating the toxicity, accuracy, hallucination, and bias of language models. + """ + + def __init__(self, llm: BaseChatModel, *, validation: Validation = None): + super().__init__(validation=validation) + self.llm = llm + + @property + def node(self): + prompt = PromptTemplate.from_template(LANGCHAIN_SYSTEM_PROMPT) + llm = prompt | self.llm.with_structured_output(EvalMetric) + return llm + + def eval( + self, + answer: Any, + question: Any = None, + expected_answer: Any = None, + validation: Validation = None, + ) -> dict[str, Any]: + """ + Evaluates the toxicity, accuracy, hallucination, and bias of a language model. + + Args: + question (Any): The question to evaluate. + answer (Any): The answer to evaluate. + expected_answer (Any, optional): The expected answer. Defaults to None. + validation (Validation, optional): The validation to evaluate the model. Defaults to None. + + Returns: + EvalMetric: The evaluation metric. + """ + if not validation: + validation = self.validation + if question: + question = f"question -->\n {question}\n" + if expected_answer: + expected_answer = f"Expected answer -->\n {expected_answer}\n" + validation_result: dict | EvalMetric = self.node.invoke( + {"question": question, "answer": answer, "expected_answer": expected_answer} + ) + result, exact_match = validation.compare(validation_result) + if exact_match: + logging.warning( + f"Following exact match found to be in Meet Expectation: {exact_match}" + ) + if result: + raise EvalThreshold(result) + return {"score": validation_result, "result": result} diff --git a/src/langval/model/__init__.py b/src/langval/model/__init__.py index e250f9d..7448957 100644 --- a/src/langval/model/__init__.py +++ b/src/langval/model/__init__.py @@ -4,52 +4,59 @@ class ModuleType(str, Enum): - CLASS = 'class' - FUNCTION = 'function' + CLASS = "class" + FUNCTION = "function" class Validation(BaseModel): - toxicity: float = Field(description='toxicity score', ge=0, le=1, default=0.2) - accuracy: float = Field(description='accuracy score', ge=0, le=1, default=0.9) - hallucination: float = Field(description='hallucination score', ge=0, le=1, default=0.2) - bias: float = Field(description='bias score', ge=0, le=1, default=0.1) - - _CRITERIA = {'toxicity': 'MAX', 'accuracy': 'MIN', 'hallucination': 'MAX', 'bias': 'MAX'} - - def compare(self, other: 'Validation') -> (dict, list): - if not isinstance(other, Validation): - raise TypeError('Comparison must be between Validation instances') - comparison = {} - exact_match = [] - for field, criterion in self._CRITERIA.items(): - self_value = getattr(self, field) - other_value = getattr(other, field) - if self_value == other_value: - exact_match.append(field) - elif criterion == 'MAX': - if self_value < other_value: - comparison[field] = f'Expected {self_value} but got {other_value}' - else: - continue - elif criterion == 'MIN': - if self_value > other_value: - comparison[field] = f'Exceeded by {self_value} by {other_value}' - else: - continue - else: - raise ValueError(f"Unknown criterion '{criterion}' for field '{field}'") - return comparison, exact_match + toxicity: float = Field(description="toxicity score", ge=0, le=1, default=0.2) + accuracy: float = Field(description="accuracy score", ge=0, le=1, default=0.9) + hallucination: float = Field( + description="hallucination score", ge=0, le=1, default=0.2 + ) + bias: float = Field(description="bias score", ge=0, le=1, default=0.1) + + _CRITERIA = { + "toxicity": "MAX", + "accuracy": "MIN", + "hallucination": "MAX", + "bias": "MAX", + } + + def compare(self, other: "Validation") -> (dict, list): + if not isinstance(other, Validation): + raise TypeError("Comparison must be between Validation instances") + comparison = {} + exact_match = [] + for field, criterion in self._CRITERIA.items(): + self_value = getattr(self, field) + other_value = getattr(other, field) + if self_value == other_value: + exact_match.append(field) + elif criterion == "MAX": + if self_value < other_value: + comparison[field] = f"Expected {self_value} but got {other_value}" + else: + continue + elif criterion == "MIN": + if self_value > other_value: + comparison[field] = f"Exceeded by {self_value} by {other_value}" + else: + continue + else: + raise ValueError(f"Unknown criterion '{criterion}' for field '{field}'") + return comparison, exact_match class EvalMetric(Validation): - toxicity: float = Field(description='toxicity score', ge=0, le=1) - accuracy: float = Field(description='accuracy score', ge=0, le=1) - hallucination: float = Field(description='hallucination score', ge=0, le=1) - bias: float = Field(description='bias score', ge=0, le=1) - justification: str = Field(description='justification for the score') + toxicity: float = Field(description="toxicity score", ge=0, le=1) + accuracy: float = Field(description="accuracy score", ge=0, le=1) + hallucination: float = Field(description="hallucination score", ge=0, le=1) + bias: float = Field(description="bias score", ge=0, le=1) + justification: str = Field(description="justification for the score") class ModuleModel(BaseModel): - name: str = Field(description='name of the module') - type: str = Field(description='type of the module') - metrics: Validation = Field(description='metrics of the module') + name: str = Field(description="name of the module") + type: str = Field(description="type of the module") + metrics: Validation = Field(description="metrics of the module") diff --git a/src/langval/tools/__init__.py b/src/langval/tools/__init__.py index fe56fb0..16f528c 100644 --- a/src/langval/tools/__init__.py +++ b/src/langval/tools/__init__.py @@ -1,4 +1,3 @@ -import operator from enum import Enum from typing import Annotated @@ -6,66 +5,68 @@ class ArithemeticOperation(str, Enum): - add = 'add' - subtract = 'subtract' - multiply = 'multiply' - divide = 'divide' + add = "add" + subtract = "subtract" + multiply = "multiply" + divide = "divide" class ComparisonEnum(str, Enum): - greater_than = 'greater_than' - less_than = 'less_than' - equal_to = 'equal_to' - not_equal_to = 'not_equal_to' + greater_than = "greater_than" + less_than = "less_than" + equal_to = "equal_to" + not_equal_to = "not_equal_to" class Comparison(BaseModel): - num1: Annotated[float | str, Field(description='first number or string')] - num2: Annotated[float | str, Field(description='second number or string')] + num1: Annotated[float | str, Field(description="first number or string")] + num2: Annotated[float | str, Field(description="second number or string")] class Arithemetic(BaseModel): - num1: Annotated[float, Field(description='first number')] - num2: Annotated[float, Field(description='second number')] - operation: Annotated[ArithemeticOperation, Field(description='operation to be performed')] + num1: Annotated[float, Field(description="first number")] + num2: Annotated[float, Field(description="second number")] + operation: Annotated[ + ArithemeticOperation, Field(description="operation to be performed") + ] def arithemetic(arithmetic: Arithemetic) -> float: - """ - Performs arithmetic operation on two numbers - Args: - arithmetic (Arithemetic): Input object containing two numbers and operation to be performed - Returns: - float: Result of the arithmetic operation - """ - if arithmetic.operation == ArithemeticOperation.add: - return arithmetic.num1 + arithmetic.num2 - elif arithmetic.operation == ArithemeticOperation.subtract: - return arithmetic.num1 - arithmetic.num2 - elif arithmetic.operation == ArithemeticOperation.multiply: - return arithmetic.num1 * arithmetic.num2 - elif arithmetic.operation == ArithemeticOperation.divide: - return arithmetic.num1 / arithmetic.num2 - else: - raise ValueError('Invalid operation') + """ + Performs arithmetic operation on two numbers + Args: + arithmetic (Arithemetic): Input object containing two numbers and operation to be performed + Returns: + float: Result of the arithmetic operation + """ + if arithmetic.operation == ArithemeticOperation.add: + return arithmetic.num1 + arithmetic.num2 + elif arithmetic.operation == ArithemeticOperation.subtract: + return arithmetic.num1 - arithmetic.num2 + elif arithmetic.operation == ArithemeticOperation.multiply: + return arithmetic.num1 * arithmetic.num2 + elif arithmetic.operation == ArithemeticOperation.divide: + return arithmetic.num1 / arithmetic.num2 + else: + raise ValueError("Invalid operation") def comparison(com: Comparison) -> str: - """ - Compares two numbers - Args: - com (Comparison): Input object containing two numbers or strings to been compared - Returns: - float: Result of the comparison - """ - if isinstance(com.num1, str) or isinstance(com.num2, str): - if com.num1 == com.num2: - return ComparisonEnum.equal_to - elif com.num1 != com.num2: - return ComparisonEnum.not_equal_to - elif com.num1 > com.num2: - return ComparisonEnum.greater_than - elif com.num1 < com.num2: - return ComparisonEnum.less_than - elif com.num1 == com.num2: - return ComparisonEnum.equal_to + """ + Compares two numbers + Args: + com (Comparison): Input object containing two numbers or strings to been compared + Returns: + float: Result of the comparison + """ + if isinstance(com.num1, str) or isinstance(com.num2, str): + if com.num1 == com.num2: + return ComparisonEnum.equal_to + elif com.num1 != com.num2: + return ComparisonEnum.not_equal_to + elif com.num1 > com.num2: + return ComparisonEnum.greater_than + elif com.num1 < com.num2: + return ComparisonEnum.less_than + elif com.num1 == com.num2: + return ComparisonEnum.equal_to diff --git a/src/langval/utils.py b/src/langval/utils.py index 8d95e60..bdf06a9 100644 --- a/src/langval/utils.py +++ b/src/langval/utils.py @@ -2,16 +2,16 @@ def check_type(obj): - """ - Checks if the object is a class or a function - Args: - obj (object): The object to check - Returns: - str: The type of the object - """ - if inspect.isclass(obj): - return 'class' - elif inspect.isfunction(obj): - return 'function' - else: - raise TypeError('neither class nor function') + """ + Checks if the object is a class or a function + Args: + obj (object): The object to check + Returns: + str: The type of the object + """ + if inspect.isclass(obj): + return "class" + elif inspect.isfunction(obj): + return "function" + else: + raise TypeError("neither class nor function") diff --git a/tests/test_eval.py b/tests/test_eval.py index 6f3bbae..ba48cb9 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -5,15 +5,15 @@ from langval.eval.langchain import LangchainEval from langval.model import Validation -llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.3) +llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3) _eval = LangchainEval( - llm, validation=Validation(toxicity=0.2, accuracy=0.9, hallucination=0.2, bias=0.1) + llm, validation=Validation(toxicity=0.2, accuracy=0.9, hallucination=0.2, bias=0.1) ) class TestEval(TestCase): - model = llm + model = llm - @_eval.question('What is the capital of France?') - def test_eval(self): - return 'paris' + @_eval.question("What is the capital of France?") + def test_eval(self): + return "paris"