-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #9 from invariantlabs-ai/workspace
[Feat]: Functionality for sensitivity of files in the workspace
- Loading branch information
Showing
12 changed files
with
326 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from invariant.runtime.utils.base import BaseDetector, DetectorResult | ||
from invariant.runtime.utils.copyright.software_licenses import * | ||
|
||
# TODO: Maybe want to use more sophisticated approach like https://github.com/licensee/licensee at some point | ||
|
||
# Registry of known licenses: maps the name reported for a match to the
# reference header text that the input is compared against.
SOFTWARE_LICENSES = {
    "GNU_AGPL_V3": GNU_AGPL_V3,
    "GNU_GPL_V2": GNU_GPL_V2,
    "GNU_LGPL_V3": GNU_LGPL_V3,
    "MOZILLA_PUBLIC_LICENSE_2.0": MOZILLA_PUBLIC_LICENSE_2_0,
    "APACHE_LICENSE_2.0": APACHE_LICENSE_2_0,
    "MIT_LICENSE": MIT_LICENSE,
    "BOOST_SOFTWARE_LICENSE": BOOST_SOFTWARE_LICENSE,
}

# Literal substrings that mark a copyright notice. They are looked up with a
# plain, case-sensitive `str.find` in `detect_copyright_patterns`.
COPYRIGHT_PATTERNS = [
    "Copyright (C)",
    "Copyright ©",
]
|
||
class CopyrightAnalyzer(BaseDetector):
    """Detector for software-license headers and copyright notices in text."""

    def detect_software_licenses(self, text: str, threshold: float = 0.5) -> list[DetectorResult]:
        """Return the known licenses that `text` appears to contain.

        Two strategies are tried in order:

        1. Exact prefix: if the whitespace-stripped text starts with a
           registered license header, only that license is returned.
        2. Token overlap: otherwise, every license for which at least
           `threshold` of its header tokens occur in the text is returned.

        Args:
            text: The text to inspect.
            threshold: Minimum fraction (0.0 to 1.0) of a license header's
                tokens that must occur in `text` for the heuristic to match.

        Returns:
            One `DetectorResult` per detected license; empty if none match.
        """
        stripped = text.strip()
        # Offset of the first non-whitespace character, so the reported span
        # covers the header itself rather than the leading whitespace.
        start = len(text) - len(text.lstrip())

        # First check if the text starts with the license string.
        for license_name, license_text in SOFTWARE_LICENSES.items():
            header = license_text.strip()
            # An empty header would prefix-match every input; skip it.
            if header and stripped.startswith(header):
                return [DetectorResult(license_name, start, start + len(header))]

        # Next, use a heuristic that checks how many tokens of the license
        # text occur in the given text. Splitting on any whitespace (not just
        # a single space) keeps words separated by newlines or indentation
        # from being fused into one token that can never match.
        res = []
        text_tokens = set(text.split())
        for license_name, license_text in SOFTWARE_LICENSES.items():
            tokens = license_text.split()
            if not tokens:
                # Nothing to compare against; also avoids dividing by zero.
                continue
            hits = sum(token in text_tokens for token in tokens)
            if hits / len(tokens) >= threshold:
                # The heuristic does not locate the match inside the text, so
                # the span is nominal: 0 up to the reference header's length.
                res.append(DetectorResult(license_name, 0, len(license_text)))
        return res

    def detect_copyright_patterns(self, text: str, threshold: float = 0.5) -> list[DetectorResult]:
        """Return a "COPYRIGHT" result for each known pattern found in `text`.

        Only the first occurrence of each pattern is reported.

        Args:
            text: The text to inspect.
            threshold: Unused here; kept so all detectors share a signature.

        Returns:
            One `DetectorResult` per matching pattern, spanning the matched
            substring.
        """
        res = []
        for pattern in COPYRIGHT_PATTERNS:
            pos = text.find(pattern)
            if pos != -1:
                res.append(DetectorResult("COPYRIGHT", pos, pos + len(pattern)))
        return res

    def detect_all(self, text: str, threshold: float = 0.5) -> list[DetectorResult]:
        """Run every detector on `text` and concatenate their results.

        License results come first, followed by copyright-pattern results.
        """
        res = []
        res.extend(self.detect_software_licenses(text, threshold))
        res.extend(self.detect_copyright_patterns(text, threshold))
        return res
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Reference header snippets for well-known software licenses. Each constant
# holds the opening line(s) of the corresponding license text.

# GNU Affero General Public License, version 3.
GNU_AGPL_V3 = """
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
"""

# NOTE(review): the name says V2, but the text below is the GPL version 3
# header ("Version 3, 29 June 2007"); the GPLv2 header reads
# "Version 2, June 1991". Confirm whether the name or the text is intended.
GNU_GPL_V2 = """
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
"""

# GNU Lesser General Public License, version 3.
GNU_LGPL_V3 = """
GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
"""

# Mozilla Public License, version 2.0.
MOZILLA_PUBLIC_LICENSE_2_0 = """
Mozilla Public License Version 2.0
"""

# Apache License, version 2.0.
APACHE_LICENSE_2_0 = """
Apache License
Version 2.0, January 2004
"""

# MIT License.
MIT_LICENSE = """
MIT License
"""

# Boost Software License, version 1.0.
BOOST_SOFTWARE_LICENSE = """
Boost Software License - Version 1.0 - August 17th, 2003
"""

# Business Source License notice.
# NOTE(review): this constant is not referenced by the SOFTWARE_LICENSES
# registry in the detector module, so it is currently never matched; confirm
# whether it should be registered.
BSL_LICENSE = """
License text copyright (c) 2020 MariaDB Corporation Ab, All Rights Reserved.
“Business Source License” is a trademark of MariaDB Corporation Ab.
"""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from invariant.runtime.functions import cache | ||
|
||
COPYRIGHT_ANALYZER = None | ||
|
||
@cache
def copyright(data: str | list, **config) -> list[str]:
    """Predicate which detects software licenses and copyright notices in the given data.

    Returns the list of entities detected in the data, as produced by
    `CopyrightAnalyzer.get_entities`.

    Supported data types:
        - str: A single piece of text
        - a single message object exposing `.content`
        - list: A list of such message objects (messages whose content is
          None are skipped)

    Args:
        data: The text or message(s) to inspect.
        **config: Accepted for interface compatibility; not read in this
            function body.
    """
    global COPYRIGHT_ANALYZER
    if COPYRIGHT_ANALYZER is None:
        # The analyzer is imported and built on first use, then reused.
        from invariant.runtime.utils.copyright.copyright import CopyrightAnalyzer
        COPYRIGHT_ANALYZER = CopyrightAnalyzer()

    if isinstance(data, str):
        return COPYRIGHT_ANALYZER.get_entities(COPYRIGHT_ANALYZER.detect_all(data))
    # isinstance (rather than an exact type check) keeps list subclasses from
    # being wrapped in another list and then treated as a single message.
    if not isinstance(data, list):
        data = [data]

    all_copyright = []
    for message in data:
        if message.content is None:
            continue
        res = COPYRIGHT_ANALYZER.detect_all(message.content)
        all_copyright.extend(COPYRIGHT_ANALYZER.get_entities(res))
    return all_copyright
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import re | ||
from invariant.stdlib.invariant.errors import PolicyViolation | ||
from pathlib import Path | ||
from pydantic.dataclasses import dataclass | ||
from typing import Optional, Callable | ||
|
||
@dataclass
class File:
    """A file from the agent workspace together with its text content."""

    # Path of the file, stored as a string.
    path: str
    # Full text read from the file.
    content: str
|
||
|
||
def filter_path(path: Path, pattern: Optional[str]) -> bool:
    """Return True when `path` passes the optional pattern filter.

    Args:
        path: The path to test.
        pattern: A glob-style pattern understood by `Path.match` (not a
            regular expression), or None to accept every path.

    Returns:
        True if `pattern` is None or `path` matches it, otherwise False.
    """
    if pattern is None:
        return True
    return path.match(pattern)
|
||
|
||
def join_paths(workspace_path: str, path: str) -> Path:
    """Join `path` onto the workspace and verify that it stays inside it.

    Both the workspace root and the joined path are resolved before the
    containment check. A purely lexical check would accept inputs such as
    "../secret.txt", because "<workspace>/../secret.txt" still starts with
    the workspace prefix.

    Args:
        workspace_path: Root directory of the agent workspace.
        path: Path to join; absolute paths are accepted only if they point
            inside the workspace.

    Returns:
        The resolved (absolute, normalized) path inside the workspace.

    Raises:
        FileNotFoundError: If the path does not exist or resolves to a
            location outside the workspace.
    """
    workspace_root = Path(workspace_path).resolve()
    joined_path = (workspace_root / path).resolve()
    if not joined_path.is_relative_to(workspace_root) or not joined_path.exists():
        raise FileNotFoundError("Path does not exist or is not inside the workspace.")
    return joined_path
|
||
|
||
def get_files(workspace_path: str, path: str = ".", pattern: Optional[str] = None) -> list[Path]:
    """Returns the list of files directly inside a directory of the agent workspace.

    Args:
        workspace_path: Root directory of the agent workspace.
        path: Directory to list, relative to the workspace.
        pattern: Optional filter handed to `filter_path`; None keeps all files.

    Returns:
        The `Path` of every regular file in the directory (sub-directories
        are not descended into) that passes the filter.
    """
    directory = join_paths(workspace_path, path)
    return [entry for entry in directory.iterdir() if entry.is_file() and filter_path(entry, pattern)]
|
||
|
||
def get_tree_files(workspace_path: str, path: str = ".", pattern: Optional[str] = None) -> list[Path]:
    """Returns the list of files in the whole directory tree of the agent workspace.

    Args:
        workspace_path: Root directory of the agent workspace.
        path: Directory to start from, relative to the workspace.
        pattern: Optional filter handed to `filter_path`; None keeps all files.

    Returns:
        The `Path` of every regular file found under the directory, at any
        depth, that passes the filter.
    """
    directory = join_paths(workspace_path, path)
    return [entry for entry in directory.glob("**/*") if entry.is_file() and filter_path(entry, pattern)]
|
||
|
||
def get_file_content(workspace_path: str, file_path: str) -> File:
    """Read one workspace file and wrap it in a `File`.

    Args:
        workspace_path: Root directory of the agent workspace.
        file_path: Location of the file, validated through `join_paths`.

    Returns:
        A `File` holding the joined path (as a string) and the file's text.
    """
    target = join_paths(workspace_path, file_path)
    with open(target, "r") as handle:
        text = handle.read()
    return File(str(target), text)
|
||
|
||
def get_file_contents(workspace_path: str, path: str = ".", pattern: Optional[str] = None, tree: bool = True) -> list[File]:
    """Returns the content of all files in the given path in the agent workspace.

    Args:
        workspace_path: The path to the agent workspace.
        path: The path to the directory to search for files.
        pattern: A glob-style pattern (matched with `Path.match`) to filter
            the files; None keeps every file.
        tree: If True, search the whole directory tree of the workspace;
            otherwise only the files directly inside `path`.

    Returns:
        One `File` per matching file.
    """
    list_files = get_tree_files if tree else get_files
    # Forward `pattern`: without it the filter argument is silently ignored
    # and every file is returned.
    files = list_files(workspace_path, path, pattern)
    return [get_file_content(workspace_path, file) for file in files]
|
||
|
||
def is_sensitive(file: File, func: Callable[[str], bool | list]) -> bool:
    """Returns True if the file content is sensitive according to the given function.

    Args:
        file: The file to check for content sensitivity.
        func: The function that determines sensitivity. It receives the file
            content and should return either a bool or a list of sensitive
            results (a non-empty list means sensitive).

    Raises:
        ValueError: If `func` returns anything other than a bool or a list.
    """
    res = func(file.content)
    if isinstance(res, bool):
        return res
    # isinstance also accepts list subclasses returned by detectors.
    if isinstance(res, list):
        return len(res) > 0
    raise ValueError("The sensitivity filter function must return bool or list, found: " + str(type(res)))
|
||
|
||
def is_sensitive_dir(workspace_path: str,
                     funcs: list[Callable[[str], bool | list]],
                     path: str = ".",
                     pattern: Optional[str] = None,
                     tree: bool = True) -> bool:
    """Report whether any file under a workspace directory is flagged by any sensitivity function.

    Args:
        workspace_path: The path to the agent workspace.
        funcs: Sensitivity functions; each is applied to a file through
            `is_sensitive` and should return a bool or a list of results.
        path: Directory inside the workspace whose files are checked.
        pattern: File filter forwarded to `get_file_contents`.
        tree: Forwarded to `get_file_contents`; if True, the whole directory
            tree is searched.

    Returns:
        True as soon as one (file, function) pair is sensitive, else False.
    """
    candidates = get_file_contents(workspace_path, path, pattern, tree)
    # Files form the outer loop and functions the inner one; `any` stops at
    # the first sensitive pair.
    return any(
        is_sensitive(candidate, check)
        for candidate in candidates
        for check in funcs
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.