Skip to content

Commit

Permalink
fix: report known malware even when not labeled (#956)
Browse files Browse the repository at this point in the history
Signed-off-by: behnazh-w <behnaz.hassanshahi@oracle.com>
  • Loading branch information
behnazh-w authored Jan 8, 2025
1 parent 4ed5561 commit 7313899
Show file tree
Hide file tree
Showing 15 changed files with 326 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ macaron.slsa\_analyzer.package\_registry package
Submodules
----------

macaron.slsa\_analyzer.package\_registry.deps\_dev module
---------------------------------------------------------

.. automodule:: macaron.slsa_analyzer.package_registry.deps_dev
:members:
:undoc-members:
:show-inheritance:

macaron.slsa\_analyzer.package\_registry.jfrog\_maven\_registry module
----------------------------------------------------------------------

Expand Down
5 changes: 5 additions & 0 deletions src/macaron/config/defaults.ini
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,11 @@ fileserver_url_scheme = https
inspector_url_netloc = inspector.pypi.io
inspector_url_scheme = https

[deps_dev]
url_netloc = api.deps.dev
url_scheme = https
purl_endpoint = v3alpha/purl

# Configuration options for selecting the checks to run.
# Both the exclude and include are defined as list of strings:
# - The exclude list is used to specify the checks that will not run.
Expand Down
13 changes: 12 additions & 1 deletion src/macaron/errors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This module contains error classes for Macaron."""
Expand Down Expand Up @@ -56,6 +56,17 @@ class InvalidHTTPResponseError(MacaronError):
"""Happens when the HTTP response is invalid or unexpected."""


class APIAccessError(MacaronError):
    """Happens when a service API cannot be accessed.

    Reasons can include:

    * misconfiguration issues
    * invalid API request
    * network errors
    * unexpected response returned by the API
    """


class CheckRegistryError(MacaronError):
"""The Check Registry Error class."""

Expand Down
60 changes: 35 additions & 25 deletions src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from macaron.slsa_analyzer.build_tool.poetry import Poetry
from macaron.slsa_analyzer.checks.base_check import BaseCheck
from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType
from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry
from macaron.slsa_analyzer.registry import registry
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
Expand Down Expand Up @@ -182,7 +183,7 @@ def __init__(self) -> None:
"""Initialize a check instance."""
check_id = "mcn_detect_malicious_metadata_1"
description = """This check analyzes the metadata of a package based on reports malicious behavior.
Supported ecosystem: PyPI.
Supported ecosystem for unknown malware: PyPI.
"""
super().__init__(check_id=check_id, description=description, eval_reqs=[])

Expand Down Expand Up @@ -288,37 +289,46 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
The result of the check.
"""
result_tables: list[CheckFacts] = []
# First check if this package is a known malware
package_registry_info_entries = ctx.dynamic_data["package_registries"]

# First check if this package is a known malware
data = {"package": {"purl": ctx.component.purl}}
response = send_post_http_raw(self.osv_query_url, json_data=data, headers=None)
res_obj = None
if response:
try:
res_obj = response.json()
except requests.exceptions.JSONDecodeError as error:
logger.debug("Unable to get a valid response from %s: %s", self.osv_query_url, error)
if res_obj:
for vuln in res_obj.get("vulns", {}):
v_id = json_extract(vuln, ["id"], str)
if v_id and v_id.startswith("MAL-"):
result_tables.append(
MaliciousMetadataFacts(
known_malware=f"https://osv.dev/vulnerability/{v_id}",
result={},
detail_information=vuln,
confidence=Confidence.HIGH,

try:
package_exists = bool(DepsDevService.get_package_info(ctx.component.purl))
except APIAccessError as error:
logger.debug(error)

# Known malicious packages must have been removed.
if not package_exists:
response = send_post_http_raw(self.osv_query_url, json_data=data, headers=None)
res_obj = None
if response:
try:
res_obj = response.json()
except requests.exceptions.JSONDecodeError as error:
logger.debug("Unable to get a valid response from %s: %s", self.osv_query_url, error)
if res_obj:
for vuln in res_obj.get("vulns", {}):
if v_id := json_extract(vuln, ["id"], str):
result_tables.append(
MaliciousMetadataFacts(
known_malware=f"https://osv.dev/vulnerability/{v_id}",
result={},
detail_information=vuln,
confidence=Confidence.HIGH,
)
)
if result_tables:
return CheckResultData(
result_tables=result_tables,
result_type=CheckResultType.FAILED,
)
if result_tables:
return CheckResultData(
result_tables=result_tables,
result_type=CheckResultType.FAILED,
)

package_registry_info_entries = ctx.dynamic_data["package_registries"]
# If the package is not a known malware, run malware analysis heuristics.
for package_registry_info_entry in package_registry_info_entries:
match package_registry_info_entry:
# Currently, only PyPI packages are supported.
case PackageRegistryInfo(
build_tool=Pip() | Poetry(),
package_registry=PyPIRegistry() as pypi_registry,
Expand Down
83 changes: 83 additions & 0 deletions src/macaron/slsa_analyzer/package_registry/deps_dev.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This module contains implementation of deps.dev service."""

import json
import logging
import urllib.parse
from json.decoder import JSONDecodeError
from urllib.parse import quote as encode

from macaron.config.defaults import defaults
from macaron.errors import APIAccessError
from macaron.util import send_get_http_raw

logger: logging.Logger = logging.getLogger(__name__)


class DepsDevService:
    """Client for looking up package metadata through the deps.dev API."""

    @staticmethod
    def get_package_info(purl: str) -> dict | None:
        """Fetch the deps.dev metadata for the package identified by a PackageURL (PURL).

        The request URL is assembled from the ``[deps_dev]`` configuration section
        (``url_scheme``, ``url_netloc`` and ``purl_endpoint``), with the percent-encoded
        PURL appended as the final path segment.

        Parameters
        ----------
        purl: str
            The PackageURL (PURL) of the package to look up.

        Returns
        -------
        dict | None
            The decoded JSON metadata, or None when the ``[deps_dev]`` section is absent
            from the configuration or the request produces no usable response text.

        Raises
        ------
        APIAccessError
            If ``url_netloc`` or ``purl_endpoint`` is missing from the configuration,
            the API URL cannot be constructed, the response text is not valid JSON,
            or the decoded JSON is empty.
        """
        section_name = "deps_dev"
        if not defaults.has_section(section_name):
            # Without a [deps_dev] section there is nothing to query.
            return None
        config = defaults[section_name]

        netloc = config.get("url_netloc")
        if not netloc:
            raise APIAccessError(
                f'The "url_netloc" key is missing in section [{section_name}] of the .ini configuration file.'
            )
        scheme = config.get("url_scheme", "https")
        endpoint = config.get("purl_endpoint")
        if not endpoint:
            raise APIAccessError(
                f'The "purl_endpoint" key is missing in section [{section_name}] of the .ini configuration file.'
            )

        # The whole PURL travels as a single path segment, so no character is left unescaped.
        request_path = f"{endpoint}/{encode(purl, safe='')}"
        try:
            url_parts = urllib.parse.SplitResult(
                scheme=scheme,
                netloc=netloc,
                path=request_path,
                query="",
                fragment="",
            )
            url = urllib.parse.urlunsplit(url_parts)
        except ValueError as error:
            raise APIAccessError("Failed to construct the API URL.") from error

        response = send_get_http_raw(url)
        if not (response and response.text):
            return None

        try:
            package_info: dict = json.loads(response.text)
        except JSONDecodeError as error:
            raise APIAccessError(f"Failed to process response from deps.dev for {url}.") from error
        if not package_info:
            raise APIAccessError(f"Empty response returned by {url} .")
        return package_info
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Assets on a package registry."""
Expand Down Expand Up @@ -816,7 +816,7 @@ def download_asset(self, url: str, dest: str) -> bool:

return True

def find_publish_timestamp(self, purl: str, registry_url: str | None = None) -> datetime:
def find_publish_timestamp(self, purl: str) -> datetime:
"""Make a search request to Maven Central to find the publishing timestamp of an artifact.
The reason for directly fetching timestamps from Maven Central is that deps.dev occasionally
Expand All @@ -829,8 +829,6 @@ def find_publish_timestamp(self, purl: str, registry_url: str | None = None) ->
purl: str
The Package URL (purl) of the package whose publication timestamp is to be retrieved.
This should conform to the PURL specification.
registry_url: str | None
The registry URL that can be set for testing.
Returns
-------
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""The module provides abstractions for the Maven Central package registry."""
Expand Down Expand Up @@ -182,7 +182,7 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool:
compatible_build_tool_classes = [Maven, Gradle]
return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes)

def find_publish_timestamp(self, purl: str, registry_url: str | None = None) -> datetime:
def find_publish_timestamp(self, purl: str) -> datetime:
"""Make a search request to Maven Central to find the publishing timestamp of an artifact.
The reason for directly fetching timestamps from Maven Central is that deps.dev occasionally
Expand All @@ -195,8 +195,6 @@ def find_publish_timestamp(self, purl: str, registry_url: str | None = None) ->
purl: str
The Package URL (purl) of the package whose publication timestamp is to be retrieved.
This should conform to the PURL specification.
registry_url: str | None
The registry URL that can be set for testing.
Returns
-------
Expand Down
47 changes: 10 additions & 37 deletions src/macaron/slsa_analyzer/package_registry/package_registry.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This module defines package registries."""

import json
import logging
import urllib.parse
from abc import ABC, abstractmethod
from datetime import datetime
from urllib.parse import quote as encode

import requests

from macaron.errors import InvalidHTTPResponseError
from macaron.json_tools import json_extract
from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
from macaron.util import send_get_http_raw
from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService

logger: logging.Logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,7 +45,7 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool:
based on the given build tool.
"""

def find_publish_timestamp(self, purl: str, registry_url: str | None = None) -> datetime:
def find_publish_timestamp(self, purl: str) -> datetime:
"""Retrieve the publication timestamp for a package specified by its purl from the deps.dev repository by default.
This method constructs a request URL based on the provided purl, sends an HTTP GET
Expand All @@ -65,8 +60,6 @@ def find_publish_timestamp(self, purl: str, registry_url: str | None = None) ->
purl: str
The Package URL (purl) of the package whose publication timestamp is to be retrieved.
This should conform to the PURL specification.
registry_url: str | None
The registry URL that can be set for testing.
Returns
-------
Expand All @@ -86,40 +79,20 @@ def find_publish_timestamp(self, purl: str, registry_url: str | None = None) ->
# in the AnalyzeContext object retrieved by the Repo Finder. This step should be
# implemented at the beginning of the analyze command to ensure that the data
# is available for subsequent processing.

base_url_parsed = urllib.parse.urlparse(registry_url or "https://api.deps.dev")
path_params = "/".join(["v3alpha", "purl", encode(purl, safe="")])
try:
url = urllib.parse.urlunsplit(
urllib.parse.SplitResult(
scheme=base_url_parsed.scheme,
netloc=base_url_parsed.netloc,
path=path_params,
query="",
fragment="",
)
)
except ValueError as error:
raise InvalidHTTPResponseError("Failed to construct the API URL.") from error

response = send_get_http_raw(url)
if response and response.text:
try:
metadata: dict = json.loads(response.text)
except requests.exceptions.JSONDecodeError as error:
raise InvalidHTTPResponseError(f"Failed to process response from deps.dev for {url}.") from error
if not metadata:
raise InvalidHTTPResponseError(f"Empty response returned by {url} .")

metadata = DepsDevService.get_package_info(purl)
except APIAccessError as error:
raise InvalidHTTPResponseError(f"Invalid response from deps.dev for {purl}.") from error
if metadata:
timestamp = json_extract(metadata, ["version", "publishedAt"], str)
if not timestamp:
raise InvalidHTTPResponseError(f"The timestamp is missing in the response returned by {url}.")
raise InvalidHTTPResponseError(f"The timestamp is missing in the response returned for {purl}.")

logger.debug("Found timestamp: %s.", timestamp)

try:
return datetime.fromisoformat(timestamp)
except ValueError as error:
raise InvalidHTTPResponseError(f"The timestamp returned by {url} is invalid") from error
raise InvalidHTTPResponseError(f"The timestamp returned for {purl} is invalid") from error

raise InvalidHTTPResponseError(f"Invalid response from deps.dev for {url}.")
raise InvalidHTTPResponseError(f"Invalid response from deps.dev for {purl}.")
10 changes: 10 additions & 0 deletions tests/integration/cases/ultralytics/policy.dl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */
/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */

#include "prelude.dl"

/* The "check-malicious-package" policy holds for a component when the
   mcn_detect_malicious_metadata_1 check passed for that component. */
Policy("check-malicious-package", component_id, "Check the malicious package.") :-
check_passed(component_id, "mcn_detect_malicious_metadata_1").

/* Apply the policy to the component(s) that is_component relates to the
   ultralytics PyPI PURL. */
apply_policy_to("check-malicious-package", component_id) :-
is_component(component_id, "pkg:pypi/ultralytics").
21 changes: 21 additions & 0 deletions tests/integration/cases/ultralytics/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

description: |
Analyzing a popular package for which some versions are compromised.
tags:
- macaron-python-package
- macaron-docker-image

steps:
- name: Run macaron analyze
kind: analyze
options:
command_args:
- -purl
- pkg:pypi/ultralytics
- name: Run macaron verify-policy to verify that the malicious metadata check passes.
kind: verify
options:
policy: policy.dl
Loading

0 comments on commit 7313899

Please sign in to comment.