From e8e57cbec1b66ea692b623f0309d18b738a2fda4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 29 Mar 2021 15:41:41 -0500 Subject: [PATCH 1/3] fix: avoid 403 from to_gbq when table has policyTags --- docs/source/changelog.rst | 5 + pandas_gbq/features.py | 95 +++++++++++++++ pandas_gbq/gbq.py | 100 +++------------ pandas_gbq/load.py | 48 +++++--- pandas_gbq/schema.py | 38 +++++- tests/unit/test_features.py | 28 +++++ tests/unit/test_gbq.py | 235 ++++++++++++++---------------------- tests/unit/test_load.py | 63 ++++++++-- 8 files changed, 357 insertions(+), 255 deletions(-) create mode 100644 pandas_gbq/features.py create mode 100644 tests/unit/test_features.py diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 2ca3650e..e9553f88 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -13,6 +13,11 @@ Features client project. Specify the target table ID as ``project.dataset.table`` to use this feature. (:issue:`321`, :issue:`347`) +Bug fixes +~~~~~~~~~ + +- Avoid 403 error from ``to_gbq`` when table has ``policyTags``. (:issue:`354`) + Dependencies ~~~~~~~~~~~~ diff --git a/pandas_gbq/features.py b/pandas_gbq/features.py new file mode 100644 index 00000000..3a63189b --- /dev/null +++ b/pandas_gbq/features.py @@ -0,0 +1,95 @@ +"""Module for checking dependency versions and supported features.""" + +# https://github.com/googleapis/python-bigquery/blob/master/CHANGELOG.md +BIGQUERY_MINIMUM_VERSION = "1.11.1" +BIGQUERY_CLIENT_INFO_VERSION = "1.12.0" +BIGQUERY_BQSTORAGE_VERSION = "1.24.0" +BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0" +PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0" + + +class Features: + def __init__(self): + self._bigquery_installed_version = None + self._pandas_installed_version = None + + @property + def bigquery_installed_version(self): + import google.cloud.bigquery + import pkg_resources + + if self._bigquery_installed_version is not None: + return self._bigquery_installed_version + + self._bigquery_installed_version = pkg_resources.parse_version( + google.cloud.bigquery.__version__ + ) + bigquery_minimum_version = pkg_resources.parse_version( + BIGQUERY_MINIMUM_VERSION + ) + + if self._bigquery_installed_version < bigquery_minimum_version: + raise ImportError( + "pandas-gbq requires google-cloud-bigquery >= {0}, " + "current version {1}".format( + bigquery_minimum_version, self._bigquery_installed_version + ) + ) + + return self._bigquery_installed_version + + @property + def bigquery_has_client_info(self): + import pkg_resources + + bigquery_client_info_version = pkg_resources.parse_version( + BIGQUERY_CLIENT_INFO_VERSION + ) + return self.bigquery_installed_version >= bigquery_client_info_version + + @property + def bigquery_has_bqstorage(self): + import pkg_resources + + bigquery_bqstorage_version = pkg_resources.parse_version( + BIGQUERY_BQSTORAGE_VERSION + ) + return self.bigquery_installed_version >= bigquery_bqstorage_version + + @property + def bigquery_has_from_dataframe_with_csv(self): + import pkg_resources + + bigquery_from_dataframe_version = pkg_resources.parse_version( + BIGQUERY_FROM_DATAFRAME_CSV_VERSION + ) + return ( + self.bigquery_installed_version >= bigquery_from_dataframe_version + ) + + @property + def pandas_installed_version(self): + import pandas + import pkg_resources + + if self._pandas_installed_version is not None: + return self._pandas_installed_version + + self._pandas_installed_version = pkg_resources.parse_version( + pandas.__version__ + ) + return self._pandas_installed_version + + @property + 
def pandas_has_deprecated_verbose(self): + import pkg_resources + + # Add check for Pandas version before showing deprecation warning. + # https://github.com/pydata/pandas-gbq/issues/157 + pandas_verbosity_deprecation = pkg_resources.parse_version( + PANDAS_VERBOSITY_DEPRECATION_VERSION + ) + return self.pandas_installed_version >= pandas_verbosity_deprecation + + +FEATURES = Features() diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 8d7c15c8..884d5470 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -16,100 +16,45 @@ from pandas_gbq.exceptions import AccessDenied from pandas_gbq.exceptions import PerformanceWarning +from pandas_gbq import features +from pandas_gbq.features import FEATURES import pandas_gbq.schema import pandas_gbq.timestamp logger = logging.getLogger(__name__) -BIGQUERY_INSTALLED_VERSION = None -BIGQUERY_CLIENT_INFO_VERSION = "1.12.0" -BIGQUERY_BQSTORAGE_VERSION = "1.24.0" -HAS_CLIENT_INFO = False -HAS_BQSTORAGE_SUPPORT = False - try: import tqdm # noqa except ImportError: tqdm = None -def _check_google_client_version(): - global BIGQUERY_INSTALLED_VERSION, HAS_CLIENT_INFO, HAS_BQSTORAGE_SUPPORT, SHOW_VERBOSE_DEPRECATION - - try: - import pkg_resources - - except ImportError: - raise ImportError("Could not import pkg_resources (setuptools).") - - # https://github.com/googleapis/python-bigquery/blob/master/CHANGELOG.md - bigquery_minimum_version = pkg_resources.parse_version("1.11.0") - bigquery_client_info_version = pkg_resources.parse_version( - BIGQUERY_CLIENT_INFO_VERSION - ) - bigquery_bqstorage_version = pkg_resources.parse_version( - BIGQUERY_BQSTORAGE_VERSION - ) - BIGQUERY_INSTALLED_VERSION = pkg_resources.get_distribution( - "google-cloud-bigquery" - ).parsed_version - - HAS_CLIENT_INFO = ( - BIGQUERY_INSTALLED_VERSION >= bigquery_client_info_version - ) - HAS_BQSTORAGE_SUPPORT = ( - BIGQUERY_INSTALLED_VERSION >= bigquery_bqstorage_version - ) - - if BIGQUERY_INSTALLED_VERSION < bigquery_minimum_version: - raise ImportError( - "pandas-gbq requires google-cloud-bigquery >= {0}, " - "current version {1}".format( - bigquery_minimum_version, BIGQUERY_INSTALLED_VERSION - ) - ) - - # Add check for Pandas version before showing deprecation warning. 
- # https://github.com/pydata/pandas-gbq/issues/157 - pandas_installed_version = pkg_resources.get_distribution( - "pandas" - ).parsed_version - pandas_version_wo_verbosity = pkg_resources.parse_version("0.23.0") - SHOW_VERBOSE_DEPRECATION = ( - pandas_installed_version >= pandas_version_wo_verbosity - ) - - def _test_google_api_imports(): + try: + import pkg_resources # noqa + except ImportError as ex: + raise ImportError("pandas-gbq requires setuptools") from ex try: import pydata_google_auth # noqa except ImportError as ex: - raise ImportError( - "pandas-gbq requires pydata-google-auth: {0}".format(ex) - ) + raise ImportError("pandas-gbq requires pydata-google-auth") from ex try: from google_auth_oauthlib.flow import InstalledAppFlow # noqa except ImportError as ex: - raise ImportError( - "pandas-gbq requires google-auth-oauthlib: {0}".format(ex) - ) + raise ImportError("pandas-gbq requires google-auth-oauthlib") from ex try: import google.auth # noqa except ImportError as ex: - raise ImportError("pandas-gbq requires google-auth: {0}".format(ex)) + raise ImportError("pandas-gbq requires google-auth") from ex try: from google.cloud import bigquery # noqa except ImportError as ex: - raise ImportError( - "pandas-gbq requires google-cloud-bigquery: {0}".format(ex) - ) - - _check_google_client_version() + raise ImportError("pandas-gbq requires google-cloud-bigquery") from ex class DatasetCreationError(ValueError): @@ -416,7 +361,7 @@ def get_client(self): # In addition to new enough version of google-api-core, a new enough # version of google-cloud-bigquery is required to populate the # client_info. - if HAS_CLIENT_INFO: + if FEATURES.bigquery_has_client_info: return bigquery.Client( project=self.project_id, credentials=self.credentials, @@ -550,14 +495,15 @@ def _download_results( if user_dtypes is None: user_dtypes = {} - if self.use_bqstorage_api and not HAS_BQSTORAGE_SUPPORT: + if self.use_bqstorage_api and not FEATURES.bigquery_has_bqstorage: warnings.warn( ( "use_bqstorage_api was set, but have google-cloud-bigquery " "version {}. Requires google-cloud-bigquery version " "{} or later." ).format( - BIGQUERY_INSTALLED_VERSION, BIGQUERY_BQSTORAGE_VERSION + FEATURES.bigquery_installed_version, + features.BIGQUERY_BQSTORAGE_VERSION, ), PerformanceWarning, stacklevel=4, @@ -568,7 +514,7 @@ def _download_results( create_bqstorage_client = False to_dataframe_kwargs = {} - if HAS_BQSTORAGE_SUPPORT: + if FEATURES.bigquery_has_bqstorage: to_dataframe_kwargs[ "create_bqstorage_client" ] = create_bqstorage_client @@ -880,7 +826,7 @@ def read_gbq( _test_google_api_imports() - if verbose is not None and SHOW_VERBOSE_DEPRECATION: + if verbose is not None and FEATURES.pandas_has_deprecated_verbose: warnings.warn( "verbose is deprecated and will be removed in " "a future version. Set logging level in order to vary " @@ -1054,7 +1000,7 @@ def to_gbq( _test_google_api_imports() - if verbose is not None and SHOW_VERBOSE_DEPRECATION: + if verbose is not None and FEATURES.pandas_has_deprecated_verbose: warnings.warn( "verbose is deprecated and will be removed in " "a future version. Set logging level in order to vary " @@ -1133,8 +1079,8 @@ def to_gbq( "schema of the destination table." ) - # Update the local `table_schema` so mode matches. - # See: https://github.com/pydata/pandas-gbq/issues/315 + # Update the local `table_schema` so mode (NULLABLE/REQUIRED) + # matches. 
See: https://github.com/pydata/pandas-gbq/issues/315 table_schema = pandas_gbq.schema.update_schema( table_schema, original_schema ) @@ -1252,7 +1198,6 @@ def create(self, table_id, schema): dataframe. """ from google.cloud.bigquery import DatasetReference - from google.cloud.bigquery import SchemaField from google.cloud.bigquery import Table from google.cloud.bigquery import TableReference @@ -1274,12 +1219,7 @@ def create(self, table_id, schema): DatasetReference(self.project_id, self.dataset_id), table_id ) table = Table(table_ref) - - schema = pandas_gbq.schema.add_default_nullable_mode(schema) - - table.schema = [ - SchemaField.from_api_repr(field) for field in schema["fields"] - ] + table.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) try: self.client.create_table(table) diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py index d9e59c1d..98211482 100644 --- a/pandas_gbq/load.py +++ b/pandas_gbq/load.py @@ -4,6 +4,7 @@ from google.cloud import bigquery +from pandas_gbq.features import FEATURES import pandas_gbq.schema @@ -30,10 +31,10 @@ def encode_chunk(dataframe): return io.BytesIO(body) -def encode_chunks(dataframe, chunksize=None): +def split_dataframe(dataframe, chunksize=None): dataframe = dataframe.reset_index(drop=True) if chunksize is None: - yield 0, encode_chunk(dataframe) + yield 0, dataframe return remaining_rows = len(dataframe) @@ -41,10 +42,10 @@ def encode_chunks(dataframe, chunksize=None): start_index = 0 while start_index < total_rows: end_index = start_index + chunksize - chunk_buffer = encode_chunk(dataframe[start_index:end_index]) + chunk = dataframe[start_index:end_index] start_index += chunksize remaining_rows = max(0, remaining_rows - chunksize) - yield remaining_rows, chunk_buffer + yield remaining_rows, chunk def load_chunks( @@ -60,24 +61,35 @@ def load_chunks( job_config.source_format = "CSV" job_config.allow_quoted_newlines = True - if schema is None: + # Explicit schema? Use that! + if schema is not None: + schema = pandas_gbq.schema.remove_policy_tags(schema) + job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) + # If not, let BigQuery determine schema unless we are encoding the CSV files ourselves. 
+ elif not FEATURES.bigquery_has_from_dataframe_with_csv: schema = pandas_gbq.schema.generate_bq_schema(dataframe) + schema = pandas_gbq.schema.remove_policy_tags(schema) + job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) - schema = pandas_gbq.schema.add_default_nullable_mode(schema) + chunks = split_dataframe(dataframe, chunksize=chunksize) + for remaining_rows, chunk in chunks: + yield remaining_rows - job_config.schema = [ - bigquery.SchemaField.from_api_repr(field) for field in schema["fields"] - ] - - chunks = encode_chunks(dataframe, chunksize=chunksize) - for remaining_rows, chunk_buffer in chunks: - try: - yield remaining_rows - client.load_table_from_file( - chunk_buffer, + if FEATURES.bigquery_has_from_dataframe_with_csv: + client.load_table_from_dataframe( + chunk, destination_table_ref, job_config=job_config, location=location, ).result() - finally: - chunk_buffer.close() + else: + try: + chunk_buffer = encode_chunk(chunk) + client.load_table_from_file( + chunk_buffer, + destination_table_ref, + job_config=job_config, + location=location, + ).result() + finally: + chunk_buffer.close() diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema.py index ffc1c362..9deaeb7c 100644 --- a/pandas_gbq/schema.py +++ b/pandas_gbq/schema.py @@ -27,6 +27,19 @@ def to_pandas_gbq(client_schema): return {"fields": remote_fields} +def to_google_cloud_bigquery(pandas_gbq_schema): + """Given a schema in pandas-gbq API format, + return a sequence of :class:`google.cloud.bigquery.schema.SchemaField`. + """ + from google.cloud import bigquery + + # Need to convert from JSON representation to format used by client library. + schema = add_default_nullable_mode(pandas_gbq_schema) + return [ + bigquery.SchemaField.from_api_repr(field) for field in schema["fields"] + ] + + def _clean_schema_fields(fields): """Return a sanitized version of the schema for comparisons. @@ -129,13 +142,30 @@ def update_schema(schema_old, schema_new): def add_default_nullable_mode(schema): - """Manually create the schema objects, adding NULLABLE mode.""" - # Workaround for: - # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456 - # + """Manually create the schema objects, adding NULLABLE mode. + + Workaround for error in SchemaField.from_api_repr, which required + "mode" to be set: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456 + """ # Returns a copy rather than modifying the mutable arg, # per Issue #277 result = copy.deepcopy(schema) for field in result["fields"]: field.setdefault("mode", "NULLABLE") return result + + +def remove_policy_tags(schema): + """Manually create the schema objects, removing policyTags. 
+ + Workaround for 403 error with policy tags, which are not required in a load + job: https://github.com/googleapis/python-bigquery/pull/557 + """ + # Returns a copy rather than modifying the mutable arg, + # per Issue #277 + result = copy.deepcopy(schema) + for field in result["fields"]: + if "policyTags" in field: + del field["policyTags"] + return result diff --git a/tests/unit/test_features.py b/tests/unit/test_features.py new file mode 100644 index 00000000..65cefb1c --- /dev/null +++ b/tests/unit/test_features.py @@ -0,0 +1,28 @@ +import pytest + +from pandas_gbq.features import FEATURES + + +@pytest.fixture(autouse=True) +def fresh_bigquery_version(monkeypatch): + monkeypatch.setattr(FEATURES, "_bigquery_installed_version", None) + + +@pytest.mark.parametrize( + ["bigquery_version", "expected"], + [ + ("1.11.1", False), + ("1.26.0", False), + ("2.5.4", False), + ("2.6.0", True), + ("2.6.1", True), + ("2.12.0", True), + ], +) +def test_bigquery_has_from_dataframe_with_csv( + monkeypatch, bigquery_version, expected +): + import google.cloud.bigquery + + monkeypatch.setattr(google.cloud.bigquery, "__version__", bigquery_version) + assert FEATURES.bigquery_has_from_dataframe_with_csv == expected diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 4426e8dc..fbf21500 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -7,31 +7,21 @@ import numpy import pandas from pandas import DataFrame -import pkg_resources import pytest from pandas_gbq import gbq +from pandas_gbq.features import FEATURES pytestmark = pytest.mark.filterwarnings( "ignore:credentials from Google Cloud SDK" ) -pandas_installed_version = pkg_resources.get_distribution( - "pandas" -).parsed_version def _make_connector(project_id="some-project", **kwargs): return gbq.GbqConnector(project_id, **kwargs) -@pytest.fixture -def min_bq_version(): - import pkg_resources - - return pkg_resources.parse_version("1.11.0") - - def mock_get_credentials_no_project(*args, **kwargs): import google.auth.credentials @@ -101,7 +91,11 @@ def test__bqschema_to_nullsafe_dtypes(type_, expected): def test_GbqConnector_get_client_w_old_bq(monkeypatch, mock_bigquery_client): gbq._test_google_api_imports() connector = _make_connector() - monkeypatch.setattr(gbq, "HAS_CLIENT_INFO", False) + monkeypatch.setattr( + type(FEATURES), + "bigquery_has_client_info", + mock.PropertyMock(return_value=False), + ) connector.get_client() @@ -113,9 +107,8 @@ def test_GbqConnector_get_client_w_old_bq(monkeypatch, mock_bigquery_client): def test_GbqConnector_get_client_w_new_bq(mock_bigquery_client): gbq._test_google_api_imports() - pytest.importorskip( - "google.cloud.bigquery", minversion=gbq.BIGQUERY_CLIENT_INFO_VERSION - ) + if not FEATURES.bigquery_has_client_info: + pytest.skip("google-cloud-bigquery missing client_info feature") pytest.importorskip("google.api_core.client_info") connector = _make_connector() @@ -143,83 +136,58 @@ def test_to_gbq_with_no_project_id_given_should_fail(monkeypatch): gbq.to_gbq(DataFrame([[1]]), "dataset.tablename") -def test_to_gbq_with_verbose_new_pandas_warns_deprecation(min_bq_version): - import pkg_resources - - pandas_version = pkg_resources.parse_version("0.23.0") - with pytest.warns(FutureWarning), mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [min_bq_version, pandas_version] - try: - gbq.to_gbq( - DataFrame([[1]]), - "dataset.tablename", - project_id="my-project", - verbose=True, - ) - except 
gbq.TableCreationError: - pass - - -def test_to_gbq_with_not_verbose_new_pandas_warns_deprecation(min_bq_version): - import pkg_resources - - pandas_version = pkg_resources.parse_version("0.23.0") - with pytest.warns(FutureWarning), mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [min_bq_version, pandas_version] +@pytest.mark.parametrize(["verbose"], [(True,), (False,)]) +def test_to_gbq_with_verbose_new_pandas_warns_deprecation( + monkeypatch, verbose +): + monkeypatch.setattr( + type(FEATURES), + "pandas_has_deprecated_verbose", + mock.PropertyMock(return_value=True), + ) + with pytest.warns(FutureWarning, match="verbose is deprecated"): try: gbq.to_gbq( DataFrame([[1]]), "dataset.tablename", project_id="my-project", - verbose=False, + verbose=verbose, ) except gbq.TableCreationError: pass -def test_to_gbq_wo_verbose_w_new_pandas_no_warnings(recwarn, min_bq_version): - import pkg_resources - - pandas_version = pkg_resources.parse_version("0.23.0") - with mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [min_bq_version, pandas_version] - try: - gbq.to_gbq( - DataFrame([[1]]), "dataset.tablename", project_id="my-project" - ) - except gbq.TableCreationError: - pass - assert len(recwarn) == 0 - +def test_to_gbq_wo_verbose_w_new_pandas_no_warnings(monkeypatch, recwarn): + monkeypatch.setattr( + type(FEATURES), + "pandas_has_deprecated_verbose", + mock.PropertyMock(return_value=True), + ) + try: + gbq.to_gbq( + DataFrame([[1]]), "dataset.tablename", project_id="my-project" + ) + except gbq.TableCreationError: + pass + assert len(recwarn) == 0 -def test_to_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version): - import pkg_resources - pandas_version = pkg_resources.parse_version("0.22.0") - with mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [min_bq_version, pandas_version] - try: - gbq.to_gbq( - DataFrame([[1]]), - "dataset.tablename", - project_id="my-project", - verbose=True, - ) - except gbq.TableCreationError: - pass - assert len(recwarn) == 0 +def test_to_gbq_with_verbose_old_pandas_no_warnings(monkeypatch, recwarn): + monkeypatch.setattr( + type(FEATURES), + "pandas_has_deprecated_verbose", + mock.PropertyMock(return_value=False), + ) + try: + gbq.to_gbq( + DataFrame([[1]]), + "dataset.tablename", + project_id="my-project", + verbose=True, + ) + except gbq.TableCreationError: + pass + assert len(recwarn) == 0 def test_to_gbq_with_private_key_raises_notimplementederror(): @@ -232,9 +200,7 @@ def test_to_gbq_with_private_key_raises_notimplementederror(): ) -def test_to_gbq_doesnt_run_query( - recwarn, mock_bigquery_client, min_bq_version -): +def test_to_gbq_doesnt_run_query(mock_bigquery_client): try: gbq.to_gbq( DataFrame([[1]]), "dataset.tablename", project_id="my-project" @@ -370,76 +336,54 @@ def test_read_gbq_with_max_results_ten(monkeypatch, mock_bigquery_client): mock_bigquery_client.list_rows.assert_called_with(mock.ANY, max_results=10) -def test_read_gbq_with_verbose_new_pandas_warns_deprecation(min_bq_version): - import pkg_resources - - pandas_version = pkg_resources.parse_version("0.23.0") - with pytest.warns(FutureWarning), mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [min_bq_version, 
pandas_version] - gbq.read_gbq("SELECT 1", project_id="my-project", verbose=True) +@pytest.mark.parametrize(["verbose"], [(True,), (False,)]) +def test_read_gbq_with_verbose_new_pandas_warns_deprecation( + monkeypatch, verbose +): + monkeypatch.setattr( + type(FEATURES), + "pandas_has_deprecated_verbose", + mock.PropertyMock(return_value=True), + ) + with pytest.warns(FutureWarning, match="verbose is deprecated"): + gbq.read_gbq("SELECT 1", project_id="my-project", verbose=verbose) -def test_read_gbq_with_not_verbose_new_pandas_warns_deprecation( - min_bq_version, -): - import pkg_resources - - pandas_version = pkg_resources.parse_version("0.23.0") - with pytest.warns(FutureWarning), mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [min_bq_version, pandas_version] - gbq.read_gbq("SELECT 1", project_id="my-project", verbose=False) - - -def test_read_gbq_wo_verbose_w_new_pandas_no_warnings(recwarn, min_bq_version): - import pkg_resources - - pandas_version = pkg_resources.parse_version("0.23.0") - with mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [min_bq_version, pandas_version] - gbq.read_gbq("SELECT 1", project_id="my-project", dialect="standard") - assert len(recwarn) == 0 - - -def test_read_gbq_with_old_bq_raises_importerror(): - import pkg_resources - - bigquery_version = pkg_resources.parse_version("0.27.0") - with pytest.raises(ImportError, match="google-cloud-bigquery"), mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [bigquery_version] - gbq.read_gbq( - "SELECT 1", - project_id="my-project", - ) +def test_read_gbq_wo_verbose_w_new_pandas_no_warnings(monkeypatch, recwarn): + monkeypatch.setattr( + type(FEATURES), + "pandas_has_deprecated_verbose", + mock.PropertyMock(return_value=False), + ) + gbq.read_gbq("SELECT 1", project_id="my-project", dialect="standard") + assert len(recwarn) == 0 -def test_read_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version): - import pkg_resources +def test_read_gbq_with_old_bq_raises_importerror(monkeypatch): + import google.cloud.bigquery - pandas_version = pkg_resources.parse_version("0.22.0") - with mock.patch( - "pkg_resources.Distribution.parsed_version", - new_callable=mock.PropertyMock, - ) as mock_version: - mock_version.side_effect = [min_bq_version, pandas_version] + monkeypatch.setattr(google.cloud.bigquery, "__version__", "0.27.0") + monkeypatch.setattr(FEATURES, "_bigquery_installed_version", None) + with pytest.raises(ImportError, match="google-cloud-bigquery"): gbq.read_gbq( "SELECT 1", project_id="my-project", - dialect="standard", - verbose=True, ) - assert len(recwarn) == 0 + + +def test_read_gbq_with_verbose_old_pandas_no_warnings(monkeypatch, recwarn): + monkeypatch.setattr( + type(FEATURES), + "pandas_has_deprecated_verbose", + mock.PropertyMock(return_value=False), + ) + gbq.read_gbq( + "SELECT 1", + project_id="my-project", + dialect="standard", + verbose=True, + ) + assert len(recwarn) == 0 def test_read_gbq_with_private_raises_notimplmentederror(): @@ -542,8 +486,7 @@ def test_read_gbq_passes_dtypes( def test_read_gbq_use_bqstorage_api( mock_bigquery_client, mock_service_account_credentials ): - gbq._check_google_client_version() - if not gbq.HAS_BQSTORAGE_SUPPORT: + if not FEATURES.bigquery_has_bqstorage: pytest.skip("requires 
BigQuery Storage API") mock_service_account_credentials.project_id = "service_account_project_id" diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index 7ed463c1..a864d972 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -2,13 +2,22 @@ import textwrap from io import StringIO +from unittest import mock import numpy import pandas +import pytest +from pandas_gbq.features import FEATURES from pandas_gbq import load +def load_method(bqclient): + if FEATURES.bigquery_has_from_dataframe_with_csv: + return bqclient.load_table_from_dataframe + return bqclient.load_table_from_file + + def test_encode_chunk_with_unicode(): """Test that a dataframe containing unicode can be encoded as a file. @@ -64,19 +73,59 @@ def test_encode_chunk_with_newlines(): assert '"ij\r\nkl"' in csv_string -def test_encode_chunks_splits_dataframe(): +def test_split_dataframe(): df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6)) - chunks = list(load.encode_chunks(df, chunksize=2)) + chunks = list(load.split_dataframe(df, chunksize=2)) assert len(chunks) == 3 - remaining, buffer = chunks[0] + remaining, chunk = chunks[0] assert remaining == 4 - assert len(buffer.readlines()) == 2 + assert len(chunk.index) == 2 def test_encode_chunks_with_chunksize_none(): df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6)) - chunks = list(load.encode_chunks(df)) + chunks = list(load.split_dataframe(df)) assert len(chunks) == 1 - remaining, buffer = chunks[0] + remaining, chunk = chunks[0] assert remaining == 0 - assert len(buffer.readlines()) == 6 + assert len(chunk.index) == 6 + + +@pytest.mark.parametrize( + ["bigquery_has_from_dataframe_with_csv"], [(True,), (False,)] +) +def test_load_chunks_omits_policy_tags( + monkeypatch, mock_bigquery_client, bigquery_has_from_dataframe_with_csv +): + """Ensure that policyTags are omitted. + + We don't want to change the policyTags via a load job, as this can cause + 403 error. 
See: https://github.com/googleapis/python-bigquery/pull/557 + """ + import google.cloud.bigquery + + monkeypatch.setattr( + type(FEATURES), + "bigquery_has_from_dataframe_with_csv", + mock.PropertyMock(return_value=bigquery_has_from_dataframe_with_csv), + ) + df = pandas.DataFrame({"col1": [1, 2, 3]}) + destination = google.cloud.bigquery.TableReference.from_string( + "my-project.my_dataset.my_table" + ) + schema = { + "fields": [ + {"name": "col1", "type": "INT64", "policyTags": ["tag1", "tag2"]} + ] + } + + _ = list( + load.load_chunks(mock_bigquery_client, df, destination, schema=schema) + ) + + mock_load = load_method(mock_bigquery_client) + assert mock_load.called + _, kwargs = mock_load.call_args + assert "job_config" in kwargs + sent_field = kwargs["job_config"].schema[0].to_api_repr() + assert "policyTags" not in sent_field From f38cb93b1b55d48712e1f57e725a0f7782a7663b Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 30 Mar 2021 10:02:06 -0500 Subject: [PATCH 2/3] pin dependency versions in conda test session --- ci/requirements-3.7-0.23.2.conda | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/requirements-3.7-0.23.2.conda b/ci/requirements-3.7-0.23.2.conda index f36e096d..afe14499 100644 --- a/ci/requirements-3.7-0.23.2.conda +++ b/ci/requirements-3.7-0.23.2.conda @@ -2,8 +2,11 @@ codecov coverage fastavro flake8 +numpy==1.14.5 google-cloud-bigquery==1.11.1 -google-cloud-bigquery-storage +google-cloud-bigquery-storage==1.1.0 +pyarrow==1.0.0 pydata-google-auth pytest pytest-cov +tqdm==4.23.0 From 34f5104e0b6a8984b910fa08fd88eb4ac7a47dd3 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 30 Mar 2021 10:09:33 -0500 Subject: [PATCH 3/3] remove pyarrow and bqstorage API from conda session with min pandas --- ci/requirements-3.7-0.23.2.conda | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/requirements-3.7-0.23.2.conda b/ci/requirements-3.7-0.23.2.conda index afe14499..af4768ab 100644 --- a/ci/requirements-3.7-0.23.2.conda +++ b/ci/requirements-3.7-0.23.2.conda @@ -4,8 +4,6 @@ fastavro flake8 numpy==1.14.5 google-cloud-bigquery==1.11.1 -google-cloud-bigquery-storage==1.1.0 -pyarrow==1.0.0 pydata-google-auth pytest pytest-cov
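
Usage sketch for the new schema helpers: the snippet below walks a hypothetical pandas-gbq schema dict through remove_policy_tags and to_google_cloud_bigquery, the two helpers added in pandas_gbq/schema.py above. The column names and the policy tag resource name are invented for illustration, and both helpers are internal to pandas-gbq rather than public API.

# A minimal sketch of the policyTags workaround, using the helpers added in
# pandas_gbq/schema.py. The table schema below is hypothetical; real policyTags
# values are full policy tag resource names.
import pandas_gbq.schema

table_schema = {
    "fields": [
        {
            "name": "ssn",
            "type": "STRING",
            "policyTags": {
                "names": [
                    "projects/my-project/locations/us/taxonomies/1/policyTags/2"
                ]
            },
        },
        {"name": "age", "type": "INT64"},
    ]
}

# Load jobs do not need policyTags, and including them can cause a 403 error,
# so the loader strips them before building the job configuration.
cleaned = pandas_gbq.schema.remove_policy_tags(table_schema)
assert all("policyTags" not in field for field in cleaned["fields"])

# The cleaned pandas-gbq schema is then converted to a list of
# google.cloud.bigquery.SchemaField objects for the load job.
bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(cleaned)

load_chunks applies the same cleanup to both a user-supplied schema and a generated one, so appending to a table that carries policyTags no longer tries to (re)set those tags through the load job.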
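
Usage sketch for the feature gate: the FEATURES singleton added in pandas_gbq/features.py caches the installed google-cloud-bigquery version and exposes capability checks as properties. The helper below mirrors the load_method() helper in tests/unit/test_load.py; the name pick_load_method is hypothetical and not part of the patch.

# A minimal sketch of the version gate used by pandas_gbq/load.py above,
# modeled on the load_method() helper in tests/unit/test_load.py.
from pandas_gbq.features import FEATURES

def pick_load_method(client):
    # google-cloud-bigquery 2.6.0+ can serialize a DataFrame to CSV itself,
    # so the loader can pass the DataFrame straight through.
    if FEATURES.bigquery_has_from_dataframe_with_csv:
        return client.load_table_from_dataframe
    # Older clients: pandas-gbq encodes each chunk to CSV and uploads the file.
    return client.load_table_from_file

Because the version checks live behind lazily evaluated properties, importing pandas_gbq.features never fails on its own; the minimum-version ImportError is only raised once a call actually inspects the installed google-cloud-bigquery version.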