-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpandasDataFrames.py
112 lines (102 loc) · 4.8 KB
/
pandasDataFrames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import json
import os

import pandas as pd
import requests
import xmltodict
def refresh_XmlToPkl_Channel(FreeFeedURL, timeout=30):
    """Download an RSS feed and pickle its channel-level metadata.

    The feed XML is parsed with ``xmltodict``; everything under
    ``rss -> channel`` except the per-episode ``item`` entries is flattened
    into a one-row DataFrame.

    Args:
        FreeFeedURL: URL of the RSS feed to fetch.
        timeout: Seconds to wait for the server. ``requests`` waits
            indefinitely when no timeout is passed, so a default is supplied.

    Returns:
        The channel DataFrame, which is also written to
        ``pkl/xml_channel.pkl``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the document has no ``rss``/``channel`` element.
    """
    response = requests.get(FreeFeedURL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a feed.
    response.raise_for_status()

    # Round-trip through JSON so the parsed tree is made of plain dicts/lists.
    json_data = json.loads(json.dumps(xmltodict.parse(response.content)))

    # Remove the episodes before flattening rather than dropping an 'item'
    # column afterwards: the column does not exist when the feed has no
    # episodes, or when a single episode was flattened into 'item.*' columns.
    channel = json_data["rss"]["channel"]
    channel_meta = {key: value for key, value in channel.items() if key != "item"}
    df_channel = pd.json_normalize(channel_meta)

    # to_pickle does not create missing directories.
    os.makedirs("pkl", exist_ok=True)
    df_channel.to_pickle("pkl/xml_channel.pkl")
    return df_channel
def refresh_XmlToPkl_Podcast(FreeFeedURL, timeout=30):
    """Download an RSS feed and pickle its episode (``item``) list.

    Args:
        FreeFeedURL: URL of the RSS feed to fetch.
        timeout: Seconds to wait for the server. ``requests`` waits
            indefinitely when no timeout is passed, so a default is supplied.

    Returns:
        DataFrame with one row per ``item`` element under ``rss -> channel``
        (empty when the feed has no items); also written to
        ``pkl/xml_podcast.pkl``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the document has no ``rss``/``channel`` element.
    """
    response = requests.get(FreeFeedURL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a feed.
    response.raise_for_status()

    # Round-trip through JSON so the parsed tree is made of plain dicts/lists.
    json_data = json.loads(json.dumps(xmltodict.parse(response.content)))

    items = json_data["rss"]["channel"].get("item") or []
    if isinstance(items, dict):
        # xmltodict yields a bare dict, not a list, when there is exactly one
        # <item>; json_normalize's record_path requires a list.
        items = [items]

    df_item = pd.json_normalize({"item": items}, record_path="item")

    # to_pickle does not create missing directories.
    os.makedirs("pkl", exist_ok=True)
    df_item.to_pickle("pkl/xml_podcast.pkl")
    return df_item
def refresh_XmlToPkl_All(FreeFeedURL):
    """Refresh both XML-derived pickles for the given feed URL.

    The episode pickle is refreshed first, then the channel pickle; each
    helper is handed the same URL.

    Returns:
        A ``(channel_frame, episode_frame)`` tuple.
    """
    episode_frame = refresh_XmlToPkl_Podcast(FreeFeedURL)
    channel_frame = refresh_XmlToPkl_Channel(FreeFeedURL)
    return channel_frame, episode_frame
def refresh_FirebaseToPkl_Channel(store):
    """Snapshot the ``channel`` collection into a pickled DataFrame.

    Args:
        store: Client object exposing ``collection(name).stream()`` whose
            documents provide ``to_dict()`` (presumably a Firestore client;
            confirm against the caller).

    Returns:
        DataFrame with one row per channel document; also written to
        ``pkl/firebase_channel.pkl``.
    """
    documents = store.collection('channel').stream()
    cdf = pd.DataFrame([document.to_dict() for document in documents])

    # to_pickle does not create missing directories.
    os.makedirs("pkl", exist_ok=True)
    cdf.to_pickle("pkl/firebase_channel.pkl")
    return cdf
def refresh_FirebaseToPkl_Podcast(store):
    """Snapshot the ``item`` (episode) collection into a pickled DataFrame.

    Args:
        store: Client object exposing ``collection(name).stream()`` whose
            documents provide ``id`` and ``to_dict()`` (presumably a
            Firestore client; confirm against the caller).

    Returns:
        DataFrame with one row per episode document. The document id is the
        first column, ``podcastEpisodeId``, followed by the document fields.
        An empty collection yields an empty frame that still has the
        ``podcastEpisodeId`` column. Also written to
        ``pkl/firebase_podcast.pkl``.

    Raises:
        ValueError: If a document already has a ``podcastEpisodeId`` field
            (the id column cannot be inserted twice).
    """
    # Collect all rows first and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row copies it each
    # time. This also gives a well-defined result for an empty stream.
    episode_ids = []
    rows = []
    for podcast in store.collection('item').stream():
        episode_ids.append(podcast.id)
        rows.append(podcast.to_dict())

    pdf = pd.DataFrame(rows)
    pdf.insert(0, 'podcastEpisodeId', episode_ids)

    # to_pickle does not create missing directories.
    os.makedirs("pkl", exist_ok=True)
    pdf.to_pickle("pkl/firebase_podcast.pkl")
    return pdf
def refresh_FirebaseToPkl_Transcript(store):
    """Collect every episode's ``transcript`` sub-collection into one DataFrame.

    For each document in the ``item`` collection, the documents of its
    ``transcript`` sub-collection are read and tagged with the episode id.

    Args:
        store: Client object exposing ``collection(name).stream()`` and
            ``collection(name).document(id).collection(name).stream()``
            (presumably a Firestore client; confirm against the caller).

    Returns:
        DataFrame with one row per transcript document. The owning episode id
        is the first column, ``podcastEpisodeId``. Episodes without transcript
        documents contribute no rows. With no transcript documents at all the
        frame is empty but still has the ``podcastEpisodeId`` column. Also
        written to ``pkl/firebase_transcript.pkl``.

    Raises:
        ValueError: If a transcript document already has a
            ``podcastEpisodeId`` field (the id column cannot be inserted twice).
    """
    # Collect all rows first and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and growing a frame per episode copies it each
    # time. This also gives a well-defined result for an empty stream.
    episode_ids = []
    rows = []
    for podcast in store.collection('item').stream():
        curEpId = podcast.id
        transcript_stream = (
            store.collection('item')
            .document(curEpId)
            .collection('transcript')
            .stream()
        )
        for transcript_doc in transcript_stream:
            episode_ids.append(curEpId)
            rows.append(transcript_doc.to_dict())

    tdf = pd.DataFrame(rows)
    tdf.insert(0, 'podcastEpisodeId', episode_ids)

    # to_pickle does not create missing directories.
    os.makedirs("pkl", exist_ok=True)
    tdf.to_pickle("pkl/firebase_transcript.pkl")
    return tdf
def refresh_FirebaseToPkl_All(store):
    """Refresh the channel, episode and transcript pickles from ``store``.

    The three refresh helpers run in that order, each with the same client.

    Returns:
        A ``(channel_frame, episode_frame, transcript_frame)`` tuple.
    """
    channel_frame = refresh_FirebaseToPkl_Channel(store)
    episode_frame = refresh_FirebaseToPkl_Podcast(store)
    transcript_frame = refresh_FirebaseToPkl_Transcript(store)
    return channel_frame, episode_frame, transcript_frame
def pklToDataFrame():
    """Load the five cached DataFrames from the ``pkl/`` directory.

    Returns:
        Tuple of DataFrames read, in order, from the XML channel, XML podcast,
        Firebase channel, Firebase podcast and Firebase transcript pickles.
    """
    pickle_paths = (
        'pkl/xml_channel.pkl',
        'pkl/xml_podcast.pkl',
        'pkl/firebase_channel.pkl',
        'pkl/firebase_podcast.pkl',
        'pkl/firebase_transcript.pkl',
    )
    return tuple(pd.read_pickle(path) for path in pickle_paths)
def podcasts_XmlToFireBase_WhereMissing(xmldf, pdf, FreeFeedURL = None, store = None):
    """Return the feed episodes whose title is not present in the Firebase data.

    Args:
        xmldf: Episode DataFrame built from the RSS feed, or ``None`` to load
            ``pkl/xml_podcast.pkl``. Must have a ``title`` column.
        pdf: Episode DataFrame built from Firebase, or ``None`` to load
            ``pkl/firebase_podcast.pkl``.
        FreeFeedURL: Feed URL. When given together with ``store``, both
            frames are refreshed from their sources and the ``xmldf``/``pdf``
            arguments are ignored.
        store: Firebase client; see ``FreeFeedURL``.

    Returns:
        The rows of ``xmldf`` whose ``title`` has no match in ``pdf``.
    """
    if FreeFeedURL is not None and store is not None:
        xmldf = refresh_XmlToPkl_Podcast(FreeFeedURL)
        pdf = refresh_FirebaseToPkl_Podcast(store)
    else:
        # Use the frames the caller handed in. They used to be overwritten
        # unconditionally by the pickles, which made both parameters dead.
        if xmldf is None:
            xmldf = pd.read_pickle('pkl/xml_podcast.pkl')
        if pdf is None:
            pdf = pd.read_pickle('pkl/firebase_podcast.pkl')

    if 'title' in pdf.columns:
        known_titles = pdf['title'].drop_duplicates()
    else:
        # A Firebase snapshot with no 'title' column knows no episodes.
        # An empty slice of the left-hand key keeps the merge dtypes aligned.
        known_titles = xmldf['title'].iloc[:0]

    # Anti-join: keep only rows that exist on the XML side.
    merged = pd.merge(xmldf, known_titles, on='title', how="outer", indicator=True)
    cdf = merged.query('_merge=="left_only"').drop(columns=['_merge'])
    return cdf
def podcasts_WhereTranscriptMissing(store, refresh = 1):
    """Return the Firebase episodes that have no transcript rows.

    Args:
        store: Firebase client, used only when ``refresh`` equals 1.
        refresh: When equal to 1 (the default; ``True`` also compares equal),
            the episode and transcript frames are re-read through the refresh
            helpers. Any other value loads ``pkl/firebase_podcast.pkl`` and
            ``pkl/firebase_transcript.pkl`` instead.

    Returns:
        The rows of the episode frame whose ``podcastEpisodeId`` does not
        appear in the transcript frame.
    """
    if refresh == 1:
        pdf = refresh_FirebaseToPkl_Podcast(store)
        tdf = refresh_FirebaseToPkl_Transcript(store)
    else:
        pdf = pd.read_pickle('pkl/firebase_podcast.pkl')
        tdf = pd.read_pickle('pkl/firebase_transcript.pkl')

    # One id per transcribed episode is enough for the anti-join. Merging
    # against every transcript row would build one intermediate row per
    # transcript document only to discard them again.
    transcribed_ids = tdf['podcastEpisodeId'].drop_duplicates()

    merged = pd.merge(pdf, transcribed_ids, on='podcastEpisodeId', how="outer", indicator=True)
    cdf = merged.query('_merge=="left_only"').drop(columns=['_merge'])
    return cdf