Skip to content

Commit

Permalink
feat: push mobile dataset to AWS S3
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Nov 8, 2024
1 parent b8096c0 commit ebef351
Show file tree
Hide file tree
Showing 10 changed files with 203 additions and 8 deletions.
9 changes: 8 additions & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,11 @@ NUM_RQ_WORKERS=4
ENVIRONMENT=dev

# Enable or disable the push of data to HF
ENABLE_HF_PUSH=0
ENABLE_HF_PUSH=0

# Enable or disable the push of data to S3
ENABLE_S3_PUSH=0

# AWS access key and secret key for pushing data to AWS S3
AWS_ACCESS_KEY=
AWS_SECRET_KEY=
5 changes: 5 additions & 0 deletions .github/workflows/container-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ jobs:
# configurations
echo "ENVIRONMENT=preprod" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=0" >> $GITHUB_ENV
echo "ENABLE_S3_PUSH=0" >> $GITHUB_ENV
# deploy target
echo "SSH_PROXY_HOST=ovh1.openfoodfacts.org" >> $GITHUB_ENV
echo "SSH_USERNAME=off" >> $GITHUB_ENV
Expand All @@ -36,6 +37,7 @@ jobs:
# configurations
echo "ENVIRONMENT=prod" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=0" >> $GITHUB_ENV
echo "ENABLE_S3_PUSH=0" >> $GITHUB_ENV
# deploy target
echo "SSH_PROXY_HOST=45.147.209.254" >> $GITHUB_ENV
echo "SSH_USERNAME=off" >> $GITHUB_ENV
Expand Down Expand Up @@ -119,10 +121,13 @@ jobs:
echo "REDIS_HOST=redis" >> .env
echo "NUM_RQ_WORKERS=4" >> .env
echo "ENABLE_HF_PUSH=${{ env.ENABLE_HF_PUSH }}" >> .env
echo "ENABLE_S3_PUSH=${{ env.ENABLE_S3_PUSH }}" >> .env
# Secrets
echo "SENTRY_DSN=${{ secrets.SENTRY_DSN }}" >> .env
echo "HF_TOKEN=${{ secrets.HF_TOKEN }}" >> .env
# AWS credentials must be written under their own names: docker-compose
# forwards AWS_ACCESS_KEY / AWS_SECRET_KEY to the services, and reusing the
# HF_TOKEN key here would overwrite the Hugging Face token instead.
echo "AWS_ACCESS_KEY=${{ secrets.AWS_ACCESS_KEY }}" >> .env
echo "AWS_SECRET_KEY=${{ secrets.AWS_SECRET_KEY }}" >> .env
- name: Create Docker volumes
uses: appleboy/ssh-action@master
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,7 @@ wheels/
.venv

# VS Code
*.code-workspace
*.code-workspace

# local environment variables
.envrc
3 changes: 3 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ x-service-base-env:
ENVIRONMENT:
REDIS_HOST:
ENABLE_HF_PUSH:
ENABLE_S3_PUSH:
HF_TOKEN: # Hugging Face token to push to the dataset hub
AWS_ACCESS_KEY:
AWS_SECRET_KEY:


services:
Expand Down
23 changes: 23 additions & 0 deletions openfoodfacts_exports/exports/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import duckdb

from openfoodfacts_exports import settings
from openfoodfacts_exports.utils import get_minio_client

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -46,3 +47,25 @@ def generate_mobile_app_dump(parquet_path: Path, output_path: Path) -> None:
shutil.move(tmp_file_path, output_path)

logger.info("Mobile app dump generation done")


def generate_push_mobile_app_dump(parquet_path: Path) -> None:
    """Build the mobile app dump from a Parquet dump, then upload it to AWS S3.

    The dump is always written to ``MOBILE_APP_DUMP_DATASET_PATH``; the upload
    step only runs when ``settings.ENABLE_S3_PUSH`` is truthy.

    Args:
        parquet_path (Path): Location of the Parquet file used as input for
            the mobile app dump.
    """
    generate_mobile_app_dump(parquet_path, MOBILE_APP_DUMP_DATASET_PATH)

    # Guard clause: nothing more to do when the S3 push is turned off.
    if not settings.ENABLE_S3_PUSH:
        logger.info("S3 push is disabled, skipping upload of mobile app dump")
        return

    logger.info("Uploading mobile app dump to S3")
    s3_client = get_minio_client()
    local_dump_path = str(MOBILE_APP_DUMP_DATASET_PATH)
    s3_client.fput_object(
        settings.AWS_S3_DATASET_BUCKET,
        "openfoodfacts-mobile-dump-products.tsv.gz",
        file_path=local_dump_path,
    )
    logger.info("Mobile app dump uploaded to S3")
4 changes: 4 additions & 0 deletions openfoodfacts_exports/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@
# Name of the running environment; falls back to "dev" when ENVIRONMENT is unset.
ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")

# Integer flag (default 0, i.e. disabled) controlling the push of data to Hugging Face.
ENABLE_HF_PUSH = int(os.getenv("ENABLE_HF_PUSH", "0"))

# Integer flag (default 0, i.e. disabled) controlling the push of data to AWS S3.
ENABLE_S3_PUSH = int(os.getenv("ENABLE_S3_PUSH", "0"))

# Name of the S3 bucket that receives the exported datasets.
AWS_S3_DATASET_BUCKET = os.getenv("AWS_S3_DATASET_BUCKET", "openfoodfacts-ds")
13 changes: 7 additions & 6 deletions openfoodfacts_exports/tasks.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
import logging

from openfoodfacts import get_dataset
from openfoodfacts.types import DatasetType, Flavor

from openfoodfacts_exports.exports.csv import (
MOBILE_APP_DUMP_DATASET_PATH,
generate_mobile_app_dump,
)
from openfoodfacts_exports.exports.csv import generate_push_mobile_app_dump
from openfoodfacts_exports.exports.parquet import PARQUET_DATASET_PATH, export_parquet
from openfoodfacts_exports.workers.queues import high_queue

logger = logging.getLogger(__name__)


def export_job(flavor: Flavor) -> None:
"""Download the JSONL dataset and launch exports through new rq jobs."""
logger.info("Start export job for flavor %s", flavor)
dataset_path = get_dataset(
flavor=flavor, dataset_type=DatasetType.jsonl, download_newer=True
)
Expand All @@ -23,9 +25,8 @@ def export_job(flavor: Flavor) -> None:
job_timeout="1h",
)
high_queue.enqueue(
generate_mobile_app_dump,
generate_push_mobile_app_dump,
PARQUET_DATASET_PATH,
MOBILE_APP_DUMP_DATASET_PATH,
depends_on=export_parquet_job,
job_timeout="1h",
)
7 changes: 7 additions & 0 deletions openfoodfacts_exports/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import sentry_sdk
import toml
from minio import Minio
from minio.credentials import EnvAWSProvider
from sentry_sdk.integrations import Integration
from sentry_sdk.integrations.logging import LoggingIntegration

Expand Down Expand Up @@ -32,3 +34,8 @@ def get_package_version() -> str:
return toml.load(str(settings.PROJECT_DIR / "pyproject.toml"))["tool"]["poetry"][
"version"
]


def get_minio_client() -> Minio:
    """Build a Minio client pointed at the AWS S3 endpoint.

    Credentials are not passed explicitly: they are resolved by
    ``EnvAWSProvider``, i.e. taken from the process environment.
    """
    credentials_provider = EnvAWSProvider()
    return Minio("s3.amazonaws.com", credentials=credentials_provider)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies = [
"apscheduler>=3.10.4",
"duckdb>=1.1.3",
"huggingface-hub>=0.26.2",
"minio>=7.2.10",
"openfoodfacts>=1.1.5",
"pytz>=2024.2",
"requests>=2.32.3",
Expand Down
Loading

0 comments on commit ebef351

Please sign in to comment.