## This setup file parses the original dataset into separate days and stores each slice in the chosen S3 bucket, so we can simulate batch work.
import os

import boto3
import pandas as pd
import snowflake.connector
from botocore.exceptions import ClientError
from dotenv import load_dotenv

from utils.io import save_file
# Load credentials and configuration from a local .env file.
load_dotenv()

AWS_REGION = os.environ.get('AWS_REGION')
BUCKET_NAME = os.environ.get('BUCKET_NAME')
snowflake_user = os.environ.get('snowflake_user')
snowflake_password = os.environ.get('snowflake_password')
snowflake_account = os.environ.get('snowflake_account')
snowflake_database = os.environ.get('snowflake_database')
snowflake_schema = os.environ.get('snowflake_schema')
snowflake_table = os.environ.get('snowflake_table')
# Column definitions ('name TYPE') for the target Snowflake table.
columns = [
    'Order_total FLOAT',
    'Amount_of_discount FLOAT',
    'Refunded_amount FLOAT',
    'Day INT',
    'Total Minutes FLOAT',
    'Is_New_False BOOLEAN',
    'Is_New_True BOOLEAN',
    'Delivery_Region_Mountain View BOOLEAN',
    'Delivery_Region_None BOOLEAN',
    'Delivery_Region_Palo Alto BOOLEAN',
    'Delivery_Region_San Jose BOOLEAN',
    'Is_ASAP_False BOOLEAN',
    'Is_ASAP_True BOOLEAN',
    'Tip BOOLEAN',
]
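
# The `columns` list above is not referenced anywhere else in this script. A minimal
# sketch (an assumption, not part of the original pipeline) of how it could drive
# creation of the target table once the database and schema exist:
def create_table_sql(table_name, column_defs):
    """Build a CREATE TABLE statement from 'name TYPE' column definitions."""
    # Quote each column name so names containing spaces (e.g. 'Total Minutes') work.
    cols = ', '.join(
        f'"{name}" {ctype}'
        for name, ctype in (c.rsplit(' ', 1) for c in column_defs)
    )
    return f'CREATE TABLE IF NOT EXISTS {table_name} ({cols})'
# Usage would be e.g.: snowflake_cursor.execute(create_table_sql(snowflake_table, columns))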
def bucketexists(client):
    """If the target bucket already exists, empty it and delete it."""
    s3 = client
    # Check whether the bucket exists (head_bucket raises ClientError if it doesn't).
    try:
        s3.head_bucket(Bucket=BUCKET_NAME)
    except ClientError:
        print(f"{BUCKET_NAME} does not exist.")
        return
    print(f"{BUCKET_NAME} already exists")
    # List the objects in the bucket; 'Contents' is absent when the bucket is empty.
    objects = s3.list_objects_v2(Bucket=BUCKET_NAME).get('Contents', [])
    if objects:
        # Delete all objects in the bucket.
        keys = [{'Key': obj['Key']} for obj in objects]
        s3.delete_objects(Bucket=BUCKET_NAME, Delete={'Objects': keys})
    # Delete the now-empty bucket.
    s3.delete_bucket(Bucket=BUCKET_NAME)
    print(f"{BUCKET_NAME} deleted successfully!")
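
# Note: list_objects_v2 returns at most 1,000 keys per call, so the cleanup above
# only works for small buckets. A paginator-based variant (a sketch, not used by
# this script) that handles buckets of any size:
def empty_bucket(client):
    """Delete every object in BUCKET_NAME, one page of keys at a time."""
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        contents = page.get('Contents', [])
        if contents:
            client.delete_objects(
                Bucket=BUCKET_NAME,
                Delete={'Objects': [{'Key': obj['Key']} for obj in contents]},
            )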
def main():
    """
    Creates and populates the raw S3 bucket. Each row is assigned a day from 1-31
    (the day component of its order timestamp), representing a day in the month.
    """
    # S3 setup.
    client = boto3.client("s3", region_name=AWS_REGION)
    ## Checks if the bucket already exists. If it does, delete it. Then create a new bucket.
    bucketexists(client=client)
    # Note: for us-east-1, create_bucket must be called without CreateBucketConfiguration.
    location = {'LocationConstraint': AWS_REGION}
    client.create_bucket(Bucket=BUCKET_NAME, CreateBucketConfiguration=location)
    ## Populates the newly created S3 bucket, one file per day.
    url = 'https://raw.githubusercontent.com/ralfsantacruz/Doordash-Analytics/master/resources/analytics.csv'
    df = pd.read_csv(url)
    # The first two characters of the order timestamp hold the day of the month.
    df["Day"] = df['Customer_placed_order_datetime'].str[:2].astype(int)
    for i in range(1, 32):
        holder = df[df['Day'] == i]
        save_file('raw', str(i), holder)
    print('Your bucket ' + BUCKET_NAME + ' was created with a raw directory containing one file per day')
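    # save_file comes from this repo's utils.io. A minimal sketch of what it
    # presumably does (an assumption based on the call pattern above; the actual
    # helper may differ):
    #
    #   def save_file(prefix, name, frame):
    #       boto3.client("s3", region_name=AWS_REGION).put_object(
    #           Bucket=BUCKET_NAME,
    #           Key=f"{prefix}/{name}.csv",
    #           Body=frame.to_csv(index=False),
    #       )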
    ### Snowflake setup
    snowflake_conn = snowflake.connector.connect(
        user=snowflake_user,
        password=snowflake_password,
        account=snowflake_account,
        database=snowflake_database,
        schema=snowflake_schema,
    )
    snowflake_cursor = snowflake_conn.cursor()
    # Deletes the database if it exists.
    snowflake_cursor.execute(f"DROP DATABASE IF EXISTS {snowflake_database}")
    # Creates the database.
    snowflake_cursor.execute(f"CREATE DATABASE {snowflake_database}")
    print('Your database ' + snowflake_database + ' has been created')
    # Creates the schema.
    snowflake_cursor.execute("CREATE SCHEMA " + snowflake_schema)
    print('Your schema ' + snowflake_schema + ' has been created')
    snowflake_conn.close()
    print('Your setup is complete.')
if __name__ == "__main__":
    main()
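
# A possible next step (a sketch, not performed by the original script): create the
# target table inside the new schema using the column definitions above, e.g.:
#
#   conn = snowflake.connector.connect(
#       user=snowflake_user, password=snowflake_password, account=snowflake_account,
#       database=snowflake_database, schema=snowflake_schema,
#   )
#   conn.cursor().execute(create_table_sql(snowflake_table, columns))
#   conn.close()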