DEFAULT_REGION = 'us-east-1'


def _is_aws_region_label(label):
    """
    Tell whether a single DNS label is shaped like an AWS region name.

    A label qualifies when it consists of at least two alphabetic words followed by a number,
    all joined with dashes (for example ``eu-central-1`` or ``us-gov-west-1``).

    :param label: One dot-separated component of a host name.
    :return: True if the label is shaped like a region name, False otherwise.
    """
    *words, number = label.split('-')
    return len(words) >= 2 and number.isdigit() and all(word.isalpha() for word in words)


def _extract_region_from_endpoint(endpoint):
    """
    Extract the region name from an STS endpoint URL.
    for example: https://sts.eu-central-1.amazonaws.com/ -> eu-central-1
    and for example: https://sts.amazonaws.com/ -> DEFAULT_REGION

    Only the host name is inspected, so dots in the path or suffixes such as ``.com.cn``
    do not shift the result (https://sts.cn-north-1.amazonaws.com.cn -> cn-north-1).
    The first host label that is shaped like a region name wins; when no label matches,
    DEFAULT_REGION is returned.

    :param endpoint: The endpoint URL of the STS client.
    :return: The region name extracted from the endpoint URL.
    """

    if not endpoint:
        return DEFAULT_REGION

    hostname = urlparse(endpoint).hostname
    if hostname is None:
        # No scheme was given, so urlparse treated the value as a path; re-parse it as a network location.
        hostname = urlparse(f"//{endpoint}").hostname or ''

    for label in hostname.split('.'):
        if _is_aws_region_label(label):
            return label
    return DEFAULT_REGION
def _get_identity_token(
        session: 'boto3.Session',
        lakefs_host: str,
        additional_headers: Optional[dict[str, str]],
        presign_expiry: int
) -> str:
    """
    Generate the identity token required for lakeFS authentication from an AWS session.

    This function presigns an STS `GetCallerIdentity` request with the session credentials,
    extracts the required values from the presigned URL,
    and creates a base64-encoded JSON object with these values.

    :param session: A boto3 session object used to create the STS client and to supply the credentials.
    :param lakefs_host: The lakeFS host name; sent as the `X-LakeFS-Server-ID` header
        when `additional_headers` is None.
    :param additional_headers: Headers to include in the presigned request. When None,
        a single `X-LakeFS-Server-ID` header holding `lakefs_host` is used instead.
    :param presign_expiry: Value passed as `expires_in` when presigning the URL.
    :return: A base64-encoded JSON string containing the required authentication information.
    """

    # this method should only be called when installing the aws-iam additional requirement
    from botocore.client import Config  # pylint: disable=import-outside-toplevel, import-error
    from botocore.signers import RequestSigner  # pylint: disable=import-outside-toplevel, import-error

    sts_client = session.client('sts', config=Config(signature_version='v4'))
    endpoint = sts_client.meta.endpoint_url
    service_id = sts_client.meta.service_model.service_id
    region = _extract_region_from_endpoint(endpoint)
    # signer is used because the presigned URL generated by the STS does not support additional headers
    signer = RequestSigner(
        service_id,
        region,
        'sts',
        'v4',
        session.get_credentials(),
        session.events
    )
    # Strip a trailing slash first so an endpoint such as "https://host/" does not become "https://host//?..."
    endpoint_with_params = f"{endpoint.rstrip('/')}/?Action=GetCallerIdentity&Version=2011-06-15"
    if additional_headers is None:
        additional_headers = {
            'X-LakeFS-Server-ID': lakefs_host,
        }
    params = {
        'method': 'POST',
        'url': endpoint_with_params,
        'body': {},
        'headers': additional_headers,
        'context': {}
    }

    presigned_url = signer.generate_presigned_url(
        params,
        region_name=region,
        expires_in=presign_expiry,
        operation_name=''
    )
    parsed_url = urlparse(presigned_url)
    query_params = parse_qs(parsed_url.query)

    # Extract values from query parameters
    json_object = {
        "method": "POST",
        "host": parsed_url.hostname,
        "region": region,
        "action": query_params['Action'][0],
        "date": query_params['X-Amz-Date'][0],
        "expiration_duration": query_params['X-Amz-Expires'][0],
        # the credential scope is "<access key id>/<date>/<region>/..."; only the key id is needed
        "access_key_id": query_params['X-Amz-Credential'][0].split('/')[0],
        "signature": query_params['X-Amz-Signature'][0],
        "signed_headers": query_params.get('X-Amz-SignedHeaders', [''])[0].split(';'),
        "version": query_params['Version'][0],
        "algorithm": query_params['X-Amz-Algorithm'][0],
        # absent from the query string in some cases; serialized as JSON null then
        "security_token": query_params.get('X-Amz-Security-Token', [None])[0]
    }

    json_string = json.dumps(json_object)
    return base64.b64encode(json_string.encode('utf-8')).decode('utf-8')
"security_token": query_params.get('X-Amz-Security-Token', [None])[0] + } + + json_string = json.dumps(json_object) + return base64.b64encode(json_string.encode('utf-8')).decode('utf-8') + + +def from_aws_role( + session: 'boto3.Session', + ttl_seconds: int = 3600, + presigned_ttl: int = 60, + additional_headers: dict[str, str] = None, + **kwargs) -> Client: + """ + Create a lakeFS client from an AWS role. + :param session: : The boto3 session. + :param ttl_seconds: The time-to-live for the generated lakeFS token in seconds. The default value is 3600 seconds. + :param presigned_ttl: The time-to-live for the presigned URL in seconds. The default value is 60 seconds. + :param additional_headers: Additional headers to include in the presigned URL. + :param kwargs: The arguments to pass to the client. + :return: A lakeFS client. + """ + + client = Client(**kwargs) + lakefs_host = urlparse(client.config.host).hostname + identity_token = _get_identity_token(session, lakefs_host, presign_expiry=presigned_ttl, + additional_headers=additional_headers) + external_login_information = ExternalLoginInformation(token_expiration_duration=ttl_seconds, identity_request={ + "identity_token": identity_token + }) + + with api_exception_handler(): + auth_token = client.sdk_client.auth_api.external_principal_login(external_login_information) + + client.config.access_token = auth_token.token + return client + + def from_web_identity(code: str, state: str, redirect_uri: str, ttl_seconds: int = 3600, **kwargs) -> Client: """ Authenticate against lakeFS using a code received from an identity provider diff --git a/clients/python-wrapper/setup.py b/clients/python-wrapper/setup.py index 580e48126d2..a60f31a48ca 100644 --- a/clients/python-wrapper/setup.py +++ b/clients/python-wrapper/setup.py @@ -40,5 +40,8 @@ include_package_data=True, license="Apache 2.0", long_description=long_description, - long_description_content_type='text/markdown' + long_description_content_type='text/markdown', + 
extras_require={ + 'aws-iam': ["boto3 >= 1.26.0"], + }, ) diff --git a/docs/integrations/python.md b/docs/integrations/python.md index 19d200d566f..30f34b93567 100644 --- a/docs/integrations/python.md +++ b/docs/integrations/python.md @@ -48,6 +48,9 @@ In case no authentication parameters exist, it is also possible to explicitly cr Here's how to instantiate a client: +{: .note } +See [here](../reference/security/external-principals-aws.md#login-with-python) for instructions on how to log in with Python using your AWS role. This is applicable for enterprise users. + ```python from lakefs.client import Client diff --git a/docs/reference/security/external-principals-aws.md b/docs/reference/security/external-principals-aws.md index 5ed1461d5bf..49f57e85a07 100644 --- a/docs/reference/security/external-principals-aws.md +++ b/docs/reference/security/external-principals-aws.md @@ -120,11 +120,41 @@ for p in resp.results: # do something ``` + ## Get lakeFS API Token The login to lakeFS is done by calling the [login API][login-api] with the `GetCallerIdentity` request signed by the client. -Currently, the login operation is supported out of the box in [lakeFS Hadoop FileSystem][lakefs-hadoopfs] version 0.2.4, see [Spark usage][lakefs-spark]. -Other clients (i.e HTTP, Python etc) can use the login endpoint to authenticate to lakeFS but, you will have to build the request input. +Currently, the login operation is supported out of the box in: +- [lakeFS Hadoop FileSystem][lakefs-hadoopfs] version 0.2.4, see [Spark usage][lakefs-spark] +- [python](#login-with-python) + +For other use cases authenticate to lakeFS via login endpoint, this will require building the request input. + +## Login with python + +### prerequisites + +lakeFS requires additional python packages to be installed in order to generate a lakeFS client with the assumed role. 
+To install the required packages, run the following command: + +```sh + pip install lakefs[aws-iam] +``` + +To generate a lakeFS client with the assumed role, initiate a boto3 session with the desired role and pass it to `lakefs.client.from_aws_role`: + + +```python +import lakefs +import boto3 +session = boto3.Session() +myclient = lakefs.client.from_aws_role(session=session, ttl_seconds = 7200, host="") + +# list repositories +repos = lakefs.repositories(client=myclient) +for r in repos: + print(r) +``` [external-principal-admin]: {% link reference/cli.md %}#external