Support Presigning for use with custom domain (#2249)
If access_endpoint_url is provided:
- Use virtual-host addressing style, so presigned URLs are of the form
`https://bucket.s3-host.example.com/path/` instead of
`https://s3-host.example.com/bucket/path/`
- Allow replacing `https://bucket.s3-host.example.com/path/` ->
`https://my-custom-domain.example.com/path/`, where
`https://my-custom-domain.example.com/path/` is the access_endpoint_url
- Remove the old `use_access_for_presign` option, which is no longer used
- Fixes #2248
- docs: update the deployment docs storage section to cover custom storages and access_endpoint_url

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
ikreymer and tw4l authored Dec 20, 2024
1 parent 8e37533 commit 2060ee7
Showing 4 changed files with 89 additions and 39 deletions.
backend/btrixcloud/models.py (1 change: 0 additions & 1 deletion)

```diff
@@ -1180,7 +1180,6 @@ class S3Storage(BaseModel):
     secret_key: str
     access_endpoint_url: str
     region: str = ""
-    use_access_for_presign: bool = True
 
 
 # ============================================================================
```
backend/btrixcloud/storages.py (55 changes: 25 additions & 30 deletions)

```diff
@@ -29,6 +29,7 @@
 from fastapi import Depends, HTTPException
 from stream_zip import stream_zip, NO_COMPRESSION_64, Method
 from remotezip import RemoteZip
+from aiobotocore.config import AioConfig
 
 import aiobotocore.session
 import requests
@@ -50,7 +51,7 @@
     AddedResponseName,
 )
 
-from .utils import is_bool, slug_from_name
+from .utils import slug_from_name
 from .version import __version__
 
 
@@ -77,15 +78,12 @@ class StorageOps:
     org_ops: OrgOps
     crawl_manager: CrawlManager
 
-    is_local_minio: bool
     frontend_origin: str
 
     def __init__(self, org_ops, crawl_manager) -> None:
         self.org_ops = org_ops
         self.crawl_manager = crawl_manager
 
-        self.is_local_minio = is_bool(os.environ.get("IS_LOCAL_MINIO"))
-
         frontend_origin = os.environ.get(
             "FRONTEND_ORIGIN", "http://browsertrix-cloud-frontend"
         )
@@ -138,12 +136,7 @@ def _create_s3_storage(self, storage: dict[str, str]) -> S3Storage:
         if bucket_name:
             endpoint_url += bucket_name + "/"
 
-        if self.is_local_minio:
-            access_endpoint_url = "/data/"
-            use_access_for_presign = False
-        else:
-            access_endpoint_url = storage.get("access_endpoint_url") or endpoint_url
-            use_access_for_presign = is_bool(storage.get("use_access_for_presign"))
+        access_endpoint_url = storage.get("access_endpoint_url") or endpoint_url
 
         return S3Storage(
             access_key=storage["access_key"],
@@ -152,7 +145,6 @@ def _create_s3_storage(self, storage: dict[str, str]) -> S3Storage:
             endpoint_url=endpoint_url,
             endpoint_no_bucket_url=endpoint_no_bucket_url,
             access_endpoint_url=access_endpoint_url,
-            use_access_for_presign=use_access_for_presign,
         )
 
     async def add_custom_storage(
@@ -177,7 +169,6 @@ async def add_custom_storage(
             endpoint_url=endpoint_url,
             endpoint_no_bucket_url=endpoint_no_bucket_url,
             access_endpoint_url=storagein.access_endpoint_url or storagein.endpoint_url,
-            use_access_for_presign=True,
         )
 
         try:
@@ -264,12 +255,12 @@ def get_available_storages(self, org: Organization) -> List[StorageRef]:
 
     @asynccontextmanager
     async def get_s3_client(
-        self, storage: S3Storage, use_access=False
+        self, storage: S3Storage, for_presign=False
     ) -> AsyncIterator[tuple[AIOS3Client, str, str]]:
         """context manager for s3 client"""
-        endpoint_url = (
-            storage.endpoint_url if not use_access else storage.access_endpoint_url
-        )
+        # parse bucket and key from standard endpoint_url
+        endpoint_url = storage.endpoint_url
 
         if not endpoint_url.endswith("/"):
             endpoint_url += "/"
@@ -280,12 +271,17 @@ async def get_s3_client(
 
         session = aiobotocore.session.get_session()
 
+        config = None
+        if for_presign and storage.access_endpoint_url != storage.endpoint_url:
+            config = AioConfig(s3={"addressing_style": "virtual"})
+
         async with session.create_client(
             "s3",
-            region_name=storage.region,
+            region_name=storage.region or "us-east-1",
             endpoint_url=endpoint_url,
             aws_access_key_id=storage.access_key,
             aws_secret_access_key=storage.secret_key,
+            config=config,
         ) as client:
             yield client, bucket, key
 
@@ -454,24 +450,27 @@ async def get_presigned_url(
 
         s3storage = self.get_org_storage_by_ref(org, crawlfile.storage)
 
-        async with self.get_s3_client(s3storage, s3storage.use_access_for_presign) as (
-            client,
-            bucket,
-            key,
-        ):
+        async with self.get_s3_client(
+            s3storage,
+            for_presign=True,
+        ) as (client, bucket, key):
+            orig_key = key
             key += crawlfile.filename
 
             presigned_url = await client.generate_presigned_url(
                 "get_object", Params={"Bucket": bucket, "Key": key}, ExpiresIn=duration
             )
 
            if (
-                not s3storage.use_access_for_presign
-                and s3storage.access_endpoint_url
+                s3storage.access_endpoint_url
                 and s3storage.access_endpoint_url != s3storage.endpoint_url
             ):
+                parts = urlsplit(s3storage.endpoint_url)
+                host_endpoint_url = (
+                    f"{parts.scheme}://{bucket}.{parts.netloc}/{orig_key}"
+                )
                 presigned_url = presigned_url.replace(
-                    s3storage.endpoint_url, s3storage.access_endpoint_url
+                    host_endpoint_url, s3storage.access_endpoint_url
                 )
 
         return presigned_url
@@ -490,11 +489,7 @@ async def _delete_file(
 
         s3storage = self.get_org_storage_by_ref(org, storage)
 
-        async with self.get_s3_client(s3storage) as (
-            client,
-            bucket,
-            key,
-        ):
+        async with self.get_s3_client(s3storage) as (client, bucket, key):
             key += filename
             response = await client.delete_object(Bucket=bucket, Key=key)
             status_code = response["ResponseMetadata"]["HTTPStatusCode"]
```
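To see the new presign flow end to end, here is a standalone sketch. It swaps in the synchronous boto3 client for the aiobotocore client the app actually uses, and the endpoint, bucket, key, and credential values are all hypothetical:

```python
# Standalone sketch of the presign flow introduced above, using synchronous
# boto3 for brevity (Browsertrix itself uses aiobotocore). Values are made up.
from urllib.parse import urlsplit

import boto3
from botocore.config import Config

endpoint_url = "https://s3provider.example.com/bucket/path/"

# Parse the bucket and key prefix out of the path-prefix style endpoint_url,
# as get_s3_client does.
parts = urlsplit(endpoint_url)
bucket, key_prefix = parts.path[1:].split("/", 1)

client = boto3.client(
    "s3",
    region_name="us-east-1",
    endpoint_url=f"{parts.scheme}://{parts.netloc}",
    aws_access_key_id="accesskey",
    aws_secret_access_key="secret",
    # Virtual-host addressing puts the bucket in the hostname, so the
    # presigned URL starts with https://bucket.s3provider.example.com/
    config=Config(s3={"addressing_style": "virtual"}),
)

presigned_url = client.generate_presigned_url(
    "get_object",
    Params={"Bucket": bucket, "Key": key_prefix + "to/files/crawl.wacz"},
    ExpiresIn=3600,
)
print(presigned_url)
# e.g. https://bucket.s3provider.example.com/path/to/files/crawl.wacz?X-Amz-...
```

Note that the rewritten URL only resolves if the custom domain actually fronts the bucket host (e.g. a CDN or reverse proxy that forwards the request, query string intact, to `bucket.s3provider.example.com`), since the signature was computed for the virtual-host request.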
chart/values.yaml (1 change: 1 addition & 0 deletions)

```diff
@@ -405,6 +405,7 @@ storages:
     bucket_name: *local_bucket_name
 
     endpoint_url: "http://local-minio.default:9000/"
+    access_endpoint_url: "/data/"
 
 
 # optional: duration in minutes for WACZ download links to be valid
```
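This values.yaml addition gives the default local Minio storage a relative `access_endpoint_url`, so the replacement in `get_presigned_url` produces same-origin paths. A rough illustration, assuming the default `btrix-data` bucket and a fake signature:

```python
# Illustration only: with access_endpoint_url = "/data/", the virtual-host
# prefix is replaced by a relative path served by the frontend.
presigned_url = (
    "http://btrix-data.local-minio.default:9000/crawl.wacz?X-Amz-Signature=abc"
)
host_prefix = "http://btrix-data.local-minio.default:9000/"
print(presigned_url.replace(host_prefix, "/data/"))
# -> /data/crawl.wacz?X-Amz-Signature=abc
```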
frontend/docs/docs/deploy/customization.md (71 changes: 63 additions & 8 deletions)

````diff
@@ -32,33 +32,87 @@ crawler_channels:
 
 ## Storage
 
-The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, as can be seen in the default configuration:
+The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes `is_default_primary: true`.
+
+### Using Local Minio Storage
+
+Browsertrix includes a built-in Minio storage service, which is enabled by default (`minio_local: true` is set).
+
+The configuration for this is as follows:
+
 
 ```yaml
 storages:
   - name: "default"
     type: "s3"
     access_key: "ADMIN"
     secret_key: "PASSW0RD"
-    bucket_name: *local_bucket_name
+    bucket_name: btrix-data
 
     endpoint_url: "http://local-minio.default:9000/"
+    access_endpoint_url: /data/
 ```
 
-It is possible to add one or more replica storage locations. If replica locations are enabled, all stored content in the application will be automatically replicated to each configured replica storage location in background jobs after being stored in the default primary storage. If replica locations are enabled, at least one must be set as the default replica location for primary backups. This is indicated with `is_default_replica: True`. If more than one storage location is configured, the primary storage must also be indicated with `is_default_primary: True`.
+The `access_key` and `secret_key` should be changed; otherwise no additional changes are needed, and all local data will be stored in this Minio instance by default.
+
+The S3 bucket is accessible via the `/data/` path on the same host Browsertrix is running on, e.g. `http://localhost:30870/data/`.
+
+
+### Using External S3 Storage Providers
+
+Browsertrix can also be used with external S3 storage providers, which can be configured as follows:
+
+```yaml
+storages:
+  - name: default
+    type: "s3"
+    access_key: "accesskey"
+    secret_key: "secret"
+    endpoint_url: "https://s3provider.example.com/bucket/path/"
+    access_endpoint_url: "https://my-custom-domain.example.com/path/" # optional
+    is_default_primary: true
+```
+
+When using an external S3 provider, a custom `access_endpoint_url` can be provided, and the `bucket_name` need not be specified separately.
+This URL is used for direct access to WACZ files, and can be used to specify a custom domain for accessing the bucket.
+
+The `endpoint_url` should be provided in 'path prefix' form (with the bucket at the start of the path), e.g.:
+`https://s3provider.example.com/bucket/path/`.
+
+Browsertrix will handle presigning S3 URLs so that WACZ files (and other data) can be accessed directly, using URLs of the form: `https://s3provider.example.com/bucket/path/to/files/crawl.wacz?signature...`
+
+Since the local Minio service is not used, `minio_local: false` can be set to save resources by not deploying Minio.
+
+
+### Custom Access Endpoint URL
+
+It may be useful to provide a custom access endpoint for accessing WACZ files and other data. If the `access_endpoint_url` is provided,
+it should be in 'virtual host' form (the bucket is not added to the path, but is assumed to be part of the host).
+
+The host portion of the URL is then replaced with the `access_endpoint_url`. For example, given `endpoint_url: https://s3provider.example.com/bucket/path/` and `access_endpoint_url: https://my-custom-domain.example.com/path/`, a URL to a WACZ file in 'virtual host' form may be `https://bucket.s3provider.example.com/path/to/files/crawl.wacz?signature...`.
+
+The prefix `https://bucket.s3provider.example.com/path/` is then replaced with `https://my-custom-domain.example.com/path/`, and the final URL becomes `https://my-custom-domain.example.com/path/to/files/crawl.wacz?signature...`.
+
+
+### Storage Replicas
+
+It is possible to add one or more replica storage locations. If replica locations are enabled, all stored content in the application will be automatically replicated to each configured replica storage location in background jobs after being stored in the default primary storage. If replica locations are enabled, at least one must be set as the default replica location for primary backups. This is indicated with `is_default_replica: true`. If more than one storage location is configured, the primary storage must also be indicated with `is_default_primary: true`.
 
-For example, here is what a storage configuration with two replica locations, one in another bucket on the same Minio S3 service as primary storage as well as another in an external S3 provider:
+For example, here is a storage configuration with two replica locations: one in another bucket on the same local Minio S3 service as the primary storage, and another in an external S3 provider:
 
 ```yaml
 storages:
   - name: "default"
     type: "s3"
     access_key: "ADMIN"
     secret_key: "PASSW0RD"
-    bucket_name: *local_bucket_name
+    bucket_name: btrix-data
+    access_endpoint_url: /data/
 
     endpoint_url: "http://local-minio.default:9000/"
-    is_default_primary: True
+    is_default_primary: true
 
   - name: "replica-0"
     type: "s3"
@@ -67,15 +121,16 @@ storages:
     access_key: "accesskey"
     secret_key: "secret"
     bucket_name: "replica-0"
     endpoint_url: "http://local-minio.default:9000/"
-    is_default_replica: True
+    is_default_replica: true
 
   - name: "replica-1"
     type: "s3"
     access_key: "accesskey"
     secret_key: "secret"
     bucket_name: "replica-1"
-    endpoint_url: "http://s3provider.example.com"
+    endpoint_url: "https://s3provider.example.com/bucket/path/"
+    access_endpoint_url: "https://my-custom-domain.example.com/path/"
 ```
 
 ## Horizontal Autoscaling
````
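The replacement described under 'Custom Access Endpoint URL' boils down to a small pure function. A minimal sketch mirroring the worked example from the docs above (the function name and all URL values are illustrative):

```python
from urllib.parse import urlsplit


def rewrite_to_access_endpoint(
    presigned_url: str, endpoint_url: str, access_endpoint_url: str
) -> str:
    """Swap the virtual-host bucket prefix for the custom access endpoint."""
    # endpoint_url is in path-prefix form: https://host/bucket/key-prefix/
    parts = urlsplit(endpoint_url)
    bucket, key_prefix = parts.path[1:].split("/", 1)
    host_prefix = f"{parts.scheme}://{bucket}.{parts.netloc}/{key_prefix}"
    return presigned_url.replace(host_prefix, access_endpoint_url)


print(
    rewrite_to_access_endpoint(
        "https://bucket.s3provider.example.com/path/to/files/crawl.wacz?signature...",
        "https://s3provider.example.com/bucket/path/",
        "https://my-custom-domain.example.com/path/",
    )
)
# -> https://my-custom-domain.example.com/path/to/files/crawl.wacz?signature...
```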
