-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path to_google_sheets.py
253 lines (225 loc) · 10.4 KB
/
to_google_sheets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# Author: Filippo Pisello
from typing import List, Dict, Union
import gspread
from spreadsheet import Spreadsheet
class GoogleSheet(Spreadsheet):
    """
    Class finalized to export a pandas data frame to a Google Sheet workbook.
    ---------------
    The class intakes as main argument a pandas dataframe. Using the API keys
    from Google Sheet, it connects with a Google Sheet workbook and uploads to
    it the data frame content.
    It can be decided where to place the content within the target sheet through
    the starting_cell argument.
    Arguments
    ----------------
    dataframe: pandas dataframe object (mandatory)
        Dataframe to be considered.
    google_workbook_id: str (mandatory)
        Title or key of the Google Sheet workbook which should receive the data.
    sheet_id: str or int, default=0
        Argument to identify the target sheet within the workbook. If int, it is
        interpreted as sheet index, if str as sheet name. If str and no sheet
        is found, a sheet with the provided label is created.
    index: Bool, default=False
        If True, the index is exported together with the header and body.
    starting_cell: str, default="A1"
        The cell where it will be placed the top left corner of the dataframe.
    correct_lists: Bool, default=True
        If True, the lists stored as the dataframe entries are modified to be more
        readable in the traditional spreadsheet softwares. It helps with the
        Google Sheet compatibility. Type help(GoogleSheet.correct_lists_for_export)
        for further details.
    auth_keys: None or str or dict, default=None
        If None, it is assumed that the json file for the authentication is in
        the default folder "~/.config/gspread/your_file.json".
        If str, it is the custom path of the json authentication file.
        If dict, it contains the parsed content of the authentication file.
    """

    # Pandas dtype names that Google Sheet cannot serialize directly: their
    # columns get converted to str before upload. Defined once at class level
    # instead of being rebuilt on every loop iteration in _prepare_table.
    _DATE_TYPES = (
        "datetime64[ns]",
        "datetime64",
        "timedelta64[ns]",
        "datetime64[ns, UTC]",
    )

    def __init__(
        self,
        dataframe,
        google_workbook_id: str,
        sheet_id: Union[str, int] = 0,
        index: bool = False,
        starting_cell: str = "A1",
        correct_lists: bool = True,
        auth_keys: Union[None, str, dict] = None,
    ):
        super().__init__(dataframe, index, starting_cell, correct_lists)
        self.workbook = self.get_workbook(google_workbook_id, auth_keys)
        self.sheet = self.get_sheet(sheet_id)

    # -------------------------------------------------------------------------
    # 1 - Main Elements
    # -------------------------------------------------------------------------
    # 1.1 - Functions to define the class attributes
    # --------------------------------
    @staticmethod
    def get_workbook(
        workbook_id: str, auth_keys: Union[None, str, dict]
    ) -> gspread.Spreadsheet:
        """
        Gathers the authorization and returns a workbook object.
        ---------------
        The workbook_id is tried first as a workbook key and then as a title:
        gspread keys are 44 characters long, so the key lookup is attempted
        only when the id matches that length.
        """
        # Gather authentication
        g_client = GoogleSheet.authentication(auth_keys)
        if len(workbook_id) == 44:
            try:
                return g_client.open_by_key(workbook_id)
            except gspread.SpreadsheetNotFound:
                # A 44-char title is still possible: fall through to open().
                pass
        return g_client.open(workbook_id)

    @staticmethod
    def authentication(auth_keys: Union[None, str, dict]) -> gspread.client.Client:
        """
        Gathers the authentication either by using the API json file or its info
        parsed in a dictionary.
        Arguments
        ----------------
        auth_keys: None or str or dict, default=None
            If None, the script assumes that the json file generated by the
            Google Sheet API for the authentication is in the default folder
            "~/.config/gspread/your_file.json".
            If str, it is the custom path of the json authentication file.
            If dict, it contains the parsed authentication parameters.
            More info here https://gspread.readthedocs.io/en/latest/oauth2.html
        Raises
        ----------------
        TypeError
            If auth_keys is not None, str or dict.
        """
        if auth_keys is None:
            # Cannot assign a default value to argument filename as it takes
            # different value depending on the user machine
            return gspread.service_account()
        if isinstance(auth_keys, str):
            return gspread.service_account(filename=auth_keys)
        if isinstance(auth_keys, dict):
            return gspread.service_account_from_dict(auth_keys)
        raise TypeError(
            "Authentication keys can be either None, str or dict. "
            f"{type(auth_keys)} was passed"
        )

    def get_sheet(self, sheet_id: Union[str, int]) -> gspread.Worksheet:
        """
        Returns the worksheet object for the requested sheet.
        ---------------
        If sheet_id is str, it is interpreted as the sheet name. If no sheet
        is found for the given name, a new sheet is created. If sheet_id is int,
        it is interpreted as the sheet index.
        """
        if isinstance(sheet_id, str):
            try:
                return self.workbook.worksheet(sheet_id)
            except gspread.WorksheetNotFound:
                # add_worksheet already returns the newly created Worksheet:
                # reuse it instead of firing a second API request to re-fetch
                # it by name.
                return self.workbook.add_worksheet(
                    title=sheet_id, rows="100", cols="20"
                )
        return self.workbook.get_worksheet(sheet_id)

    # --------------------------------
    # 1.2 - Main Methods
    # --------------------------------
    def to_google_sheet(
        self, fill_na_with: str = " ", clear_sheet: bool = False, header: bool = True
    ):
        """
        Exports data frame to target sheet within a Google Sheet workbook.
        ---------------
        Before the upload, the function slightly adapts the content of the data
        frame to ensure compatibility with Google Sheet. Dates are always
        turned to str and missing values are filled with str. By default, it is
        also applied a correction to lists.
        For the upload, the batch update method is used to ensure the maximum
        efficiency possible. It also limits the number of request fired to the
        Google Sheet API.
        Arguments
        ----------------
        fill_na_with: str, default=" "
            The str missing values should be replaced by. This is necessary to
            avoid errors as Google Sheet does not accept missing values.
        clear_sheet: Bool, default=False
            If True, the whole sheet gets erased before the new data is uploaded
            to it. Note that the value of the destination cells is updated
            anyway.
        header: Bool, default=True
            If False, the header is not exported to the Google Sheet. The other
            table parts will still be placed in the cells as if the header was
            in place. This allows to manually edit the header after an export
            and preserved it after later refreshes.
        """
        self._prepare_table(fill_na_with)
        if clear_sheet:
            self.sheet.clear()
        self.sheet.batch_update(self._batch_list(header))

    # -------------------------------------------------------------------------
    # 2 - Worker Methods
    # -------------------------------------------------------------------------
    def _prepare_table(self, fill_na_with: str) -> None:
        """
        Converts datetime and category columns into str and fills missing values.
        """
        for column in self.df.columns:
            # Look the dtype name up once per column instead of once per branch.
            dtype_name = self.df[column].dtype.name
            if dtype_name in self._DATE_TYPES:
                # astype(str) renders missing timestamps as "NaT": strip them
                # so they can be filled like any other missing value below.
                self.df[column] = self.df[column].astype(str).str.replace("NaT", "")
            elif dtype_name == "category":
                self.df[column] = self.df[column].astype(str)
        # Replace missing values with something else
        self.df.fillna(fill_na_with, inplace=True)

    def _batch_list(self, keep_header: bool) -> List[Dict]:
        """
        Puts together the list of dict to be passed as argument of the
        batch_update method.
        ---------------
        The list is in the following form:
        [{"range":["A1", "A2", ...], "values":[[values row1], [values row2]]}]
        It can contain up to three dictionaries if header index and body are kept.
        """
        # Body is always exported
        output = [{"range": self.body.cells_range, "values": self.df.values.tolist()}]
        # If header is kept, add to batch list
        if keep_header:
            output.append(
                {"range": self.header.cells_range, "values": self._columns_for_batch()}
            )
        # If index is kept, add to batch list
        if self.keep_index:
            output.append(
                {"range": self.index.cells_range, "values": self._index_for_batch()}
            )
        return output

    def _columns_for_batch(self) -> Union[List[List], List[str]]:
        """
        It reshapes the output of dataframe.columns.values.tolist() making it
        adapt for the batch_update() method.
        ---------------
        The key here is that each row of values should be on a separated list,
        otherwise the batch_update() method will throw an error. Thus, in case
        of multicolumns, the content should be reshaped in n list, where n is
        the multicolumns's depth.
        """
        columns = self.df.columns.values.tolist()
        # Handling case with multicolumns: one list per header level
        if self.indexes_depth[1] > 1:
            return [
                [column[level] for column in columns]
                for level in range(self.indexes_depth[1])
            ]
        # Handling case with simple columns
        return [columns]

    def _index_for_batch(self) -> Union[List[List], List[str]]:
        """
        It reshapes the output of dataframe.index.values.tolist() making it
        adapt for the batch_update() method.
        ---------------
        The key here is that each row of values should be on a separated list,
        otherwise the batch_update() method will throw an error. Thus, if there
        is no multiindex, each index value should be placed in its own list.
        """
        # Handling case with multiindex: rows are already per-level tuples
        if self.indexes_depth[0] > 1:
            return self.df.index.values.tolist()
        # Handling case with simple index: wrap each value in its own row
        return [[x] for x in self.df.index.values.tolist()]