Validate exclusion regexes on backend (#2316)
tw4l authored Jan 23, 2025
1 parent 763c654 commit 9363095
Showing 5 changed files with 80 additions and 3 deletions.
18 changes: 15 additions & 3 deletions backend/btrixcloud/crawlconfigs.py
@@ -43,7 +43,7 @@
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -189,7 +189,7 @@ async def get_profile_filename(
 
         return profile_filename
 
-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, too-many-branches
     async def add_crawl_config(
         self,
         config_in: CrawlConfigIn,
@@ -215,6 +215,12 @@ async def add_crawl_config(
             if not self.can_org_use_proxy(org, config_in.proxyId):
                 raise HTTPException(status_code=404, detail="proxy_not_found")
 
+        if config_in.config.exclude:
+            exclude = config_in.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -317,11 +323,17 @@ def check_attr_changed(
     async def update_crawl_config(
         self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
     ) -> dict[str, bool | str]:
-        # pylint: disable=too-many-locals
+        # pylint: disable=too-many-locals, too-many-branches, too-many-statements
         """Update name, scale, schedule, and/or tags for an existing crawl config"""
 
         orig_crawl_config = await self.get_crawl_config(cid, org.id)
 
+        if update.config and update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
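
Both hunks above follow the same pattern: config.exclude may arrive as a single string or as a list of strings, so it is normalized to a list before being checked. A minimal standalone sketch of that shared pattern (the helper name is hypothetical and not part of this diff; it assumes the package is importable as btrixcloud):

from typing import List, Union

from btrixcloud.utils import validate_regexes  # helper added in this commit


def validate_exclude_field(exclude: Union[str, List[str]]) -> None:
    """Hypothetical helper: wrap a single pattern in a list, then validate."""
    if isinstance(exclude, str):
        exclude = [exclude]
    validate_regexes(exclude)
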
4 changes: 4 additions & 0 deletions backend/btrixcloud/crawls.py
@@ -24,6 +24,7 @@
     date_to_str,
     parse_jsonl_error_messages,
     stream_dict_list_as_csv,
+    validate_regexes,
 )
 from .basecrawls import BaseCrawlOps
 from .crawlmanager import CrawlManager
@@ -517,6 +518,9 @@ async def add_or_remove_exclusion(
         """add new exclusion to config or remove exclusion from config
         for given crawl_id, update config on crawl"""
 
+        if add:
+            validate_regexes([regex])
+
         crawl = await self.get_crawl(crawl_id, org)
 
         if crawl.state not in RUNNING_AND_WAITING_STATES:
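
Note that the guard above only runs on the add path: removing an exclusion is not validated, so a previously stored pattern can still be deleted even if it would no longer compile. A simplified sketch of that control flow (signature abbreviated, not the full method from the diff):

from btrixcloud.utils import validate_regexes  # helper added in this commit


async def add_or_remove_exclusion(regex: str, add: bool) -> None:
    """Simplified sketch: only newly added exclusions are validated."""
    if add:
        # raises HTTPException(400, "invalid_regex") on a bad pattern
        validate_regexes([regex])
    # ... update the stored config and the running crawl as before ...
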
10 changes: 10 additions & 0 deletions backend/btrixcloud/utils.py
@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
         return default_origin
 
     return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail="invalid_regex")
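
As a quick illustration (not part of the diff), the new helper compiles each pattern with re.compile and surfaces failures as a FastAPI HTTPException, so callers never handle re.error directly. A minimal usage sketch, assuming the package is importable as btrixcloud:

from fastapi import HTTPException

from btrixcloud.utils import validate_regexes

validate_regexes([r"https?://example\.com/.*"])  # valid patterns return silently

try:
    validate_regexes([r"["])  # "[" does not compile
except HTTPException as exc:
    print(exc.status_code, exc.detail)  # 400 invalid_regex
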
42 changes: 42 additions & 0 deletions backend/test/test_crawlconfigs.py
@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
     assert r.status_code == 422
 
 
+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -487,3 +507,25 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
     for crawler_channel in crawler_channels:
         assert crawler_channel["id"]
         assert crawler_channel["image"]
+
+
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["exclude"] = "["
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
9 changes: 9 additions & 0 deletions backend/test/test_run_crawl.py
@@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id):
     assert r.json()["success"] == True
 
 
+def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_remove_exclusion(admin_auth_headers, default_org_id):
     r = requests.delete(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",
