-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path to_google_sheets.py
253 lines (225 loc) · 10.4 KB
/
to_google_sheets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# Author: Filippo Pisello
from typing import List, Dict, Union
import gspread
from spreadsheet import Spreadsheet
class GoogleSheet(Spreadsheet):
    """
    Class finalized to export a pandas data frame to a Google Sheet workbook.
    ---------------
    The class intakes as main argument a pandas dataframe. Using the API keys
    from Google Sheet, it connects with a Google Sheet workbook and uploads to
    it the data frame content.
    It can be decided where to place the content within the target sheet through
    the starting_cell argument.
    Arguments
    ----------------
    dataframe: pandas dataframe object (mandatory)
        Dataframe to be considered.
    google_workbook_id: str (mandatory)
        Title or key of the Google Sheet workbook which should receive the data.
    sheet_id: str or int, default=0
        Argument to identify the target sheet within the workbook. If int, it is
        interpreted as sheet index, if str as sheet name. If str and no sheet
        is found, a sheet with the provided label is created.
    index: Bool, default=False
        If True, the index is exported together with the header and body.
    starting_cell: str, default="A1"
        The cell where it will be placed the top left corner of the dataframe.
    correct_lists: Bool, default=True
        If True, the lists stored as the dataframe entries are modified to be more
        readable in the traditional spreadsheet softwares. It helps with the
        Google Sheet compatibility. Type help(GoogleSheet.correct_lists_for_export)
        for further details.
    auth_keys: None or str or dict, default=None
        If None, it is assumed that the json file for the authentication is in
        the default folder "~/.config/gspread/your_file.json".
        If str, it is the custom path of the json authentication file.
        If dict, it contains the parsed content of the authentication file.
    """

    # Pandas dtype names that Google Sheet cannot serialize directly: their
    # columns get converted to str before upload. Defined once at class level
    # instead of being rebuilt on every loop iteration in _prepare_table.
    _DATE_TYPES = (
        "datetime64[ns]",
        "datetime64",
        "timedelta64[ns]",
        "datetime64[ns, UTC]",
    )

    def __init__(
        self,
        dataframe,
        google_workbook_id: str,
        sheet_id: Union[str, int] = 0,
        index: bool = False,
        starting_cell: str = "A1",
        correct_lists: bool = True,
        auth_keys: Union[None, str, dict] = None,
    ):
        super().__init__(dataframe, index, starting_cell, correct_lists)
        self.workbook = self.get_workbook(google_workbook_id, auth_keys)
        self.sheet = self.get_sheet(sheet_id)

    # -------------------------------------------------------------------------
    # 1 - Main Elements
    # -------------------------------------------------------------------------
    # 1.1 - Functions to define the class attributes
    # --------------------------------
    @staticmethod
    def get_workbook(
        workbook_id: str, auth_keys: Union[None, str, dict]
    ) -> gspread.Spreadsheet:
        """
        Gathers the authorization and returns a workbook object.
        ---------------
        The workbook_id is tried first as a workbook key and then as a title:
        gspread keys are 44 characters long, so the key lookup is attempted
        only when the id matches that length.
        """
        # Gather authentication
        g_client = GoogleSheet.authentication(auth_keys)
        if len(workbook_id) == 44:
            try:
                return g_client.open_by_key(workbook_id)
            except gspread.SpreadsheetNotFound:
                # A 44-char title is still possible: fall through to open().
                pass
        return g_client.open(workbook_id)

    @staticmethod
    def authentication(auth_keys: Union[None, str, dict]) -> gspread.client.Client:
        """
        Gathers the authentication either by using the API json file or its info
        parsed in a dictionary.
        Arguments
        ----------------
        auth_keys: None or str or dict, default=None
            If None, the script assumes that the json file generated by the
            Google Sheet API for the authentication is in the default folder
            "~/.config/gspread/your_file.json".
            If str, it is the custom path of the json authentication file.
            If dict, it contains the parsed authentication parameters.
            More info here https://gspread.readthedocs.io/en/latest/oauth2.html
        Raises
        ----------------
        TypeError
            If auth_keys is not None, str or dict.
        """
        if auth_keys is None:
            # Cannot assign a default value to argument filename as it takes
            # different value depending on the user machine
            return gspread.service_account()
        if isinstance(auth_keys, str):
            return gspread.service_account(filename=auth_keys)
        if isinstance(auth_keys, dict):
            return gspread.service_account_from_dict(auth_keys)
        raise TypeError(
            "Authentication keys can be either None, str or dict. "
            f"{type(auth_keys)} was passed"
        )

    def get_sheet(self, sheet_id: Union[str, int]) -> gspread.Worksheet:
        """
        Returns the worksheet object for the requested sheet.
        ---------------
        If sheet_id is str, it is interpreted as the sheet name. If no sheet
        is found for the given name, a new sheet is created. If sheet_id is int,
        it is interpreted as the sheet index.
        """
        if isinstance(sheet_id, str):
            try:
                return self.workbook.worksheet(sheet_id)
            except gspread.WorksheetNotFound:
                # add_worksheet already returns the newly created Worksheet:
                # reuse it instead of firing a second API request to re-fetch
                # it by name.
                return self.workbook.add_worksheet(
                    title=sheet_id, rows="100", cols="20"
                )
        return self.workbook.get_worksheet(sheet_id)

    # --------------------------------
    # 1.2 - Main Methods
    # --------------------------------
    def to_google_sheet(
        self, fill_na_with: str = " ", clear_sheet: bool = False, header: bool = True
    ):
        """
        Exports data frame to target sheet within a Google Sheet workbook.
        ---------------
        Before the upload, the function slightly adapts the content of the data
        frame to ensure compatibility with Google Sheet. Dates are always
        turned to str and missing values are filled with str. By default, it is
        also applied a correction to lists.
        For the upload, the batch update method is used to ensure the maximum
        efficiency possible. It also limits the number of request fired to the
        Google Sheet API.
        Arguments
        ----------------
        fill_na_with: str, default=" "
            The str missing values should be replaced by. This is necessary to
            avoid errors as Google Sheet does not accept missing values.
        clear_sheet: Bool, default=False
            If True, the whole sheet gets erased before the new data is uploaded
            to it. Note that the value of the destination cells is updated
            anyway.
        header: Bool, default=True
            If False, the header is not exported to the Google Sheet. The other
            table parts will still be placed in the cells as if the header was
            in place. This allows to manually edit the header after an export
            and preserved it after later refreshes.
        """
        self._prepare_table(fill_na_with)
        if clear_sheet:
            self.sheet.clear()
        self.sheet.batch_update(self._batch_list(header))

    # -------------------------------------------------------------------------
    # 2 - Worker Methods
    # -------------------------------------------------------------------------
    def _prepare_table(self, fill_na_with: str) -> None:
        """
        Converts datetime and category columns into str and fills missing values.
        """
        for column in self.df.columns:
            # Look the dtype name up once per column instead of once per branch.
            dtype_name = self.df[column].dtype.name
            if dtype_name in self._DATE_TYPES:
                # astype(str) renders missing timestamps as "NaT": strip them
                # so they can be filled like any other missing value below.
                self.df[column] = self.df[column].astype(str).str.replace("NaT", "")
            elif dtype_name == "category":
                self.df[column] = self.df[column].astype(str)
        # Replace missing values with something else
        self.df.fillna(fill_na_with, inplace=True)

    def _batch_list(self, keep_header: bool) -> List[Dict]:
        """
        Puts together the list of dict to be passed as argument of the
        batch_update method.
        ---------------
        The list is in the following form:
        [{"range":["A1", "A2", ...], "values":[[values row1], [values row2]]}]
        It can contain up to three dictionaries if header index and body are kept.
        """
        # Body is always exported
        output = [{"range": self.body.cells_range, "values": self.df.values.tolist()}]
        # If header is kept, add to batch list
        if keep_header:
            output.append(
                {"range": self.header.cells_range, "values": self._columns_for_batch()}
            )
        # If index is kept, add to batch list
        if self.keep_index:
            output.append(
                {"range": self.index.cells_range, "values": self._index_for_batch()}
            )
        return output

    def _columns_for_batch(self) -> Union[List[List], List[str]]:
        """
        It reshapes the output of dataframe.columns.values.tolist() making it
        adapt for the batch_update() method.
        ---------------
        The key here is that each row of values should be on a separated list,
        otherwise the batch_update() method will throw an error. Thus, in case
        of multicolumns, the content should be reshaped in n list, where n is
        the multicolumns's depth.
        """
        columns = self.df.columns.values.tolist()
        # Handling case with multicolumns: one list per header level
        if self.indexes_depth[1] > 1:
            return [
                [column[level] for column in columns]
                for level in range(self.indexes_depth[1])
            ]
        # Handling case with simple columns
        return [columns]

    def _index_for_batch(self) -> Union[List[List], List[str]]:
        """
        It reshapes the output of dataframe.index.values.tolist() making it
        adapt for the batch_update() method.
        ---------------
        The key here is that each row of values should be on a separated list,
        otherwise the batch_update() method will throw an error. Thus, if there
        is no multiindex, each index value should be placed in its own list.
        """
        # Handling case with multiindex: rows are already per-level tuples
        if self.indexes_depth[0] > 1:
            return self.df.index.values.tolist()
        # Handling case with simple index: wrap each value in its own row
        return [[x] for x in self.df.index.values.tolist()]