diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 139ea9d220453..0000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,155 +0,0 @@ -version: 2.1 - -jobs: - test-linux-arm: - machine: - image: default - resource_class: arm.large - environment: - ENV_FILE: ci/deps/circle-311-arm64.yaml - PYTEST_WORKERS: auto - PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" - PYTEST_TARGET: "pandas" - PANDAS_CI: "1" - steps: - - checkout - - run: - name: Install Environment and Run Tests - shell: /bin/bash -exo pipefail - # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd - command: | - MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" - wget -q $MINI_URL -O Miniforge3.sh - chmod +x Miniforge3.sh - MINI_DIR="$HOME/miniconda3" - rm -rf $MINI_DIR - ./Miniforge3.sh -b -p $MINI_DIR - export PATH=$MINI_DIR/bin:$PATH - conda info -a - conda env create -q -n pandas-dev -f $ENV_FILE - conda list -n pandas-dev - source activate pandas-dev - if pip show pandas 1>/dev/null; then - pip uninstall -y pandas - fi - python -m pip install --no-build-isolation -ve . -Csetup-args="--werror" - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - ci/run_tests.sh - test-linux-musl: - docker: - - image: quay.io/pypa/musllinux_1_1_aarch64 - resource_class: arm.large - steps: - # Install pkgs first to have git in the image - # (needed for checkout) - - run: - name: Install System Packages - command: | - apk update - apk add git - apk add musl-locales - - checkout - - run: - name: Install Environment and Run Tests - command: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev - . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 - python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" - python -m pip list --no-cache-dir - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml - build-aarch64: - parameters: - cibw-build: - type: string - machine: - image: default - resource_class: arm.large - environment: - TRIGGER_SOURCE: << pipeline.trigger_source >> - steps: - - checkout - - run: - name: Check if build is necessary - command: | - # Check if tag is defined or TRIGGER_SOURCE is scheduled - if [[ -n "$CIRCLE_TAG" ]]; then - echo 'export IS_PUSH="true"' >> "$BASH_ENV" - elif [[ $TRIGGER_SOURCE == "scheduled_pipeline" ]]; then - echo 'export IS_SCHEDULE_DISPATCH="true"' >> "$BASH_ENV" - # Look for the build label/[wheel build] in commit - # grep takes a regex, so need to escape brackets - elif (git log --format=oneline -n 1 $CIRCLE_SHA1) | grep -q '\[wheel build\]'; then - : # Do nothing - elif ! 
(curl https://api.github.com/repos/pandas-dev/pandas/issues/$CIRCLE_PR_NUMBER | jq '.labels' | grep -q 'Build'); then - circleci-agent step halt - fi - - run: - name: Build aarch64 wheels - no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that - command: | - pip3 install cibuildwheel==2.20.0 - if [[ $CIBW_BUILD == cp313t* ]]; then - # TODO: temporarily run 3.13 free threaded builds without build isolation - # since we need pre-release cython - CIBW_BUILD_FRONTEND="pip; args: --no-build-isolation" cibuildwheel --output-dir wheelhouse - else - cibuildwheel --output-dir wheelhouse - fi - - environment: - CIBW_BUILD: << parameters.cibw-build >> - - - run: - name: Install Anaconda Client & Upload Wheels - shell: /bin/bash -exo pipefail - command: | - MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" - wget -q $MINI_URL -O Miniforge3.sh - chmod +x Miniforge3.sh - MINI_DIR="$HOME/miniconda3" - rm -rf $MINI_DIR - ./Miniforge3.sh -b -p $MINI_DIR - export PATH=$MINI_DIR/bin:$PATH - conda install -y -c conda-forge anaconda-client - source ci/upload_wheels.sh - set_upload_vars - upload_wheels - - store_artifacts: - path: wheelhouse/ - -workflows: - test: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - test-linux-arm - test-musl: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - test-linux-musl - build-wheels: - jobs: - - build-aarch64: - filters: - tags: - only: /^v.*/ - matrix: - parameters: - cibw-build: ["cp310-manylinux_aarch64", - "cp311-manylinux_aarch64", - "cp312-manylinux_aarch64", - "cp313-manylinux_aarch64", - "cp313t-manylinux_aarch64", - "cp310-musllinux_aarch64", - "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64", - "cp313-musllinux_aarch64", - "cp313t-musllinux_aarch64"] diff --git a/.gitattributes b/.gitattributes index f77da2339b20f..d94c19e7edb1f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -61,7 +61,6 @@ pandas/_version.py export-subst *.pxi export-ignore # Ignoring stuff from the top level -.circleci export-ignore .github export-ignore asv_bench export-ignore ci export-ignore diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index bacca20780191..39f52bb3edd8e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -4,9 +4,6 @@ # ci ci/ @mroeschke -# web -web/ @datapythonista - # docs doc/cheatsheet @Dr-Irv doc/source/development @noatamir diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 899b49cc4eff5..08c41a1eeb21f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -22,10 +22,11 @@ defaults: jobs: ubuntu: - runs-on: ubuntu-22.04 + runs-on: ${{ matrix.platform }} timeout-minutes: 90 strategy: matrix: + platform: [ubuntu-22.04, ubuntu-24.04-arm] env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] @@ -35,9 +36,11 @@ jobs: env_file: actions-311-downstream_compat.yaml pattern: "not slow and not network and not single_cpu" pytest_target: "pandas/tests/test_downstream.py" + platform: ubuntu-22.04 - name: "Minimum Versions" env_file: actions-310-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" + platform: ubuntu-22.04 - name: "Locale: it_IT" env_file: 
actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -48,6 +51,7 @@ jobs: # Also install it_IT (its encoding is ISO8859-1) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "it_IT" + platform: ubuntu-22.04 - name: "Locale: zh_CN" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -58,25 +62,32 @@ jobs: # Also install zh_CN (its encoding is gb2312) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" + platform: ubuntu-22.04 - name: "Future infer strings" env_file: actions-312.yaml pandas_future_infer_string: "1" + platform: ubuntu-22.04 - name: "Future infer strings (without pyarrow)" env_file: actions-311.yaml pandas_future_infer_string: "1" + platform: ubuntu-22.04 - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" test_args: "--max-worker-restart 0" + platform: ubuntu-22.04 - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" test_args: "-W error::DeprecationWarning -W error::FutureWarning" + platform: ubuntu-22.04 - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" + pandas_future_infer_string: "1" + platform: ubuntu-22.04 fail-fast: false - name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }} env: PATTERN: ${{ matrix.pattern }} LANG: ${{ matrix.lang || 'C.UTF-8' }} @@ -91,12 +102,12 @@ jobs: REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }} cancel-in-progress: true services: mysql: - image: mysql:8 + image: mysql:9 env: MYSQL_ALLOW_EMPTY_PASSWORD: yes MYSQL_DATABASE: pandas @@ -109,7 +120,7 @@ jobs: - 3306:3306 postgres: - image: postgres:16 + image: postgres:17 env: PGUSER: postgres POSTGRES_USER: postgres @@ -124,7 +135,7 @@ jobs: - 5432:5432 moto: - image: motoserver/moto:5.0.0 + image: motoserver/moto:5.0.27 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret @@ -231,15 +242,14 @@ jobs: - name: Build environment and Run Tests # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . 
-Csetup-args="--werror" python -m pip list --no-cache-dir - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit @@ -248,7 +258,7 @@ jobs: Linux-Musl: runs-on: ubuntu-22.04 container: - image: quay.io/pypa/musllinux_1_1_x86_64 + image: quay.io/pypa/musllinux_1_2_x86_64 steps: - name: Checkout pandas Repo # actions/checkout does not work since it requires node @@ -270,7 +280,7 @@ jobs: apk add musl-locales - name: Build environment run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 @@ -280,8 +290,7 @@ jobs: - name: Run Tests run: | . ~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-musl @@ -346,8 +355,7 @@ jobs: python --version python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy - python -m pip install versioneer[toml] - python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install versioneer[toml] python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . 
--no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list @@ -364,7 +372,7 @@ jobs: concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-freethreading-dev + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-python-freethreading-dev cancel-in-progress: true env: @@ -385,14 +393,11 @@ jobs: nogil: true - name: Build Environment - # TODO: Once numpy 2.2.1 is out, don't install nightly version - # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 - python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy - python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + python -m pip install versioneer[toml] python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list @@ -419,20 +424,20 @@ jobs: with: fetch-depth: 0 - - name: Set up Python for Pyodide + - name: Set up Python for pyodide-build id: setup-python uses: actions/setup-python@v5 with: - python-version: '3.11.3' + python-version: '3.12' - name: Set up Emscripten toolchain uses: mymindstorm/setup-emsdk@v14 with: - version: '3.1.46' + version: '3.1.58' actions-cache-folder: emsdk-cache - name: Install pyodide-build - run: pip install "pyodide-build==0.25.1" + run: pip install "pyodide-build>=0.29.2" - name: Build pandas for Pyodide run: | @@ -441,10 +446,13 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '18' + node-version: '20' - name: Set up Pyodide virtual environment + env: + pyodide-version: '0.27.1' run: | + pyodide xbuildenv install ${{ env.pyodide-version }} pyodide venv .venv-pyodide source .venv-pyodide/bin/activate pip install dist/*.whl diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 32ca5573ac08a..a4c2a732f9fc8 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,7 +94,8 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_x86_64] + - [ubuntu-24.04-arm, manylinux_aarch64] + - [macos-13, macosx_x86_64] # Note: M1 images on Github Actions start from macOS 14 - [macos-14, macosx_arm64] - [windows-2022, win_amd64] diff --git a/.gitignore b/.gitignore index a188e216d9f70..d951f3fb9cbad 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,7 @@ doc/source/savefig/ # Interactive terminal generated files # ######################################## .jupyterlite.doit.db + +# Pyodide/WASM related files # +############################## +/.pyodide-xbuildenv-* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7b9b1818c122..c13c38b20a7f8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.1 + 
rev: v0.9.4 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,14 +34,14 @@ repos: - id: ruff-format exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.13' + rev: 'v2.14' hooks: - id: vulture entry: python scripts/run_vulture.py pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.1 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] @@ -70,11 +70,11 @@ repos: - id: trailing-whitespace args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort - rev: 5.13.2 + rev: 6.0.0 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.19.0 + rev: v3.19.1 hooks: - id: pyupgrade args: [--py310-plus] @@ -95,12 +95,22 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.4 + rev: v19.1.7 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] +- repo: https://github.com/trim21/pre-commit-mirror-meson + rev: v1.7.0 + hooks: + - id: meson-fmt + args: ['--inplace'] +- repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck + args: ["--severity=warning"] - repo: local hooks: - id: pyright diff --git a/LICENSE b/LICENSE index 2d1c34fd9d7fc..c343da2ebe870 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ BSD 3-Clause License Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. -Copyright (c) 2011-2024, Open source contributors. +Copyright (c) 2011-2025, Open source contributors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index ff0ccffced0f3..3a15f754ae523 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -594,7 +594,7 @@ def setup(self): self.StringIO_input = StringIO(data) def time_read_csv_index_col(self): - read_csv(self.StringIO_input, index_col="a") + read_csv(self.data(self.StringIO_input), index_col="a") class ReadCSVDatePyarrowEngine(StringIORewind): @@ -605,7 +605,7 @@ def setup(self): def time_read_csv_index_col(self): read_csv( - self.StringIO_input, + self.data(self.StringIO_input), parse_dates=["a"], engine="pyarrow", dtype_backend="pyarrow", diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index 24fd8a0d20aba..0486cabb29845 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -13,8 +13,8 @@ class Render: def setup(self, cols, rows): self.df = DataFrame( np.random.randn(rows, cols), - columns=[f"float_{i+1}" for i in range(cols)], - index=[f"row_{i+1}" for i in range(rows)], + columns=[f"float_{i + 1}" for i in range(cols)], + index=[f"row_{i + 1}" for i in range(rows)], ) def time_apply_render(self, cols, rows): diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 39cea0c361a72..5782b2b171e07 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -24,15 +24,15 @@ else fi [[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \ - { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } + { echo "Unknown command $1. 
Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; } -BASE_DIR="$(dirname $0)/.." +BASE_DIR="$(dirname "$0")/.." RET=0 ### CODE ### if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then - MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo $MSG + MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo "$MSG" python -W error -c " import sys import pandas @@ -49,30 +49,29 @@ if mods: sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) " - RET=$(($RET + $?)) ; echo $MSG "DONE" + RET=$(($RET + $?)) ; echo "$MSG" "DONE" fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - MSG='Python and Cython Doctests' ; echo $MSG + MSG='Python and Cython Doctests' ; echo "$MSG" python -c 'import pandas as pd; pd.test(run_doctests=True)' - RET=$(($RET + $?)) ; echo $MSG "DONE" + RET=$(($RET + $?)) ; echo "$MSG" "DONE" fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate Docstrings' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py \ + MSG='Validate Docstrings' ; echo "$MSG" + "$BASE_DIR"/scripts/validate_docstrings.py \ --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.RangeIndex.from_range PR01,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ @@ -80,21 +79,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.min PR02" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.NumpyExtensionArray SA01" \ - -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ - -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ - -i "pandas.core.resample.Resampler.mean SA01" \ - -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \ - -i "pandas.core.resample.Resampler.prod SA01" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ - -i "pandas.core.resample.Resampler.std SA01" \ - -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ - -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.errors.ValueLabelTypeMismatch SA01" \ - -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ @@ -158,7 +145,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \ @@ -166,7 +152,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \ -i 
"pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \ @@ -203,7 +188,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \ -i "pandas.tseries.offsets.Hour.n GL08" \ -i "pandas.tseries.offsets.Hour.normalize GL08" \ - -i "pandas.tseries.offsets.LastWeekOfMonth SA01" \ -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \ @@ -281,7 +265,7 @@ fi if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then MSG='Notebooks' ; echo $MSG - jupyter nbconvert --execute $(find doc/source -name '*.ipynb') --to notebook + jupyter nbconvert --execute "$(find doc/source -name '*.ipynb')" --to notebook RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 22e4907e5a6e5..2d3d11c294e12 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -23,7 +23,7 @@ dependencies: - pip: - "tzdata>=2022.7" - - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--prefer-binary" - "--pre" - "pyarrow" diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml deleted file mode 100644 index 3f09e27d0fe4b..0000000000000 --- a/ci/deps/circle-311-arm64.yaml +++ /dev/null @@ -1,61 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.11 - - # build dependencies - - versioneer - - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=3.4.0 - - pytest-localserver>=0.8.1 - - pytest-qt>=4.4.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2023.10.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.84.0 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pytz>=2023.4 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 - - xarray>=2022.12.0, <2024.10.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - pip: - - adbc-driver-postgresql>=0.8.0 - - adbc-driver-sqlite>=0.8.0 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index d2c2f58427a23..16292beec612b 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -3,10 +3,8 @@ # Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set) # https://github.com/pytest-dev/pytest/issues/920 # https://github.com/pytest-dev/pytest/issues/1075 -export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') - -# May help reproduce flaky CI builds if set in subsequent runs -echo PYTHONHASHSEED=$PYTHONHASHSEED +PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 
4294967295))') +export PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" @@ -16,5 +14,5 @@ if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi -echo $PYTEST_CMD +echo "$PYTEST_CMD" sh -c "$PYTEST_CMD" diff --git a/ci/upload_wheels.sh b/ci/upload_wheels.sh index 3c4aa76c02003..c7c7ca00ee466 100644 --- a/ci/upload_wheels.sh +++ b/ci/upload_wheels.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh set_upload_vars() { @@ -19,20 +20,20 @@ set_upload_vars() { fi } upload_wheels() { - echo ${PWD} + echo "${PWD}" if [[ ${ANACONDA_UPLOAD} == true ]]; then - if [ -z ${TOKEN} ]; then + if [ -z "${TOKEN}" ]; then echo no token set, not uploading else # sdists are located under dist folder when built through setup.py if compgen -G "./dist/*.gz"; then echo "Found sdist" - anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz echo "Uploaded sdist" fi if compgen -G "./wheelhouse/*.whl"; then echo "Found wheel" - anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl echo "Uploaded wheel" fi echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf new file mode 100644 index 0000000000000..ea356385e9fb1 Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx new file mode 100644 index 0000000000000..1b812c1a2595a Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx differ diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md index 6c33de104ed90..b8599acff2f6e 100644 --- a/doc/cheatsheet/README.md +++ b/doc/cheatsheet/README.md @@ -6,10 +6,12 @@ and pick "PDF" as the format. This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf). 
-| Topic | PDF | PPT | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Pandas_Cheat_Sheet | | | -| Pandas_Cheat_Sheet_JA | | | +| Topic | Language | PDF | PPT | +|------------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas_Cheat_Sheet | English | | | +| Pandas_Cheat_Sheet_JA | Japanese | | | +| Pandas_Cheat_Sheet_FA | Persian | | | + **Alternative** diff --git a/doc/make.py b/doc/make.py index 02deb5002fea1..9542563dc037b 100755 --- a/doc/make.py +++ b/doc/make.py @@ -260,8 +260,7 @@ def latex(self, force=False): for i in range(3): self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex") raise SystemExit( - "You should check the file " - '"build/latex/pandas.pdf" for problems.' + 'You should check the file "build/latex/pandas.pdf" for problems.' ) self._run_os("make") return ret_code @@ -343,8 +342,7 @@ def main(): dest="verbosity", default=0, help=( - "increase verbosity (can be repeated), " - "passed to the sphinx build command" + "increase verbosity (can be repeated), passed to the sphinx build command" ), ) argparser.add_argument( diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index c1cfb0d7a623b..143aebd8f236a 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -344,7 +344,7 @@ be located. - tests.scalar - tests.tseries.offsets -2. Does your test depend only on code in pd._libs? +2. Does your test depend only on code in ``pd._libs``? 
This test likely belongs in one of: - tests.libs diff --git a/doc/source/development/contributing_gitpod.rst b/doc/source/development/contributing_gitpod.rst index 2ba43a44b87d3..b70981b4d307d 100644 --- a/doc/source/development/contributing_gitpod.rst +++ b/doc/source/development/contributing_gitpod.rst @@ -109,7 +109,7 @@ development experience: * `VSCode rst extension `_ * `Markdown All in One `_ -* `VSCode Gitlens extension `_ +* `VSCode GitLens extension `_ * `VSCode Git Graph extension `_ Development workflow with Gitpod diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index 6de237b70f08d..c5c4b7c449ce7 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -99,7 +99,7 @@ Column metadata * Boolean: ``'bool'`` * Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'`` * Floats: ``'float16', 'float32', 'float64'`` -* Date and Time Types: ``'datetime', 'datetimetz'``, ``'timedelta'`` +* Date and Time Types: ``'datetime', 'datetimetz', 'timedelta'`` * String: ``'unicode', 'bytes'`` * Categorical: ``'categorical'`` * Other Python objects: ``'object'`` diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 1e4a851d0e72d..c572559dcc3e0 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -488,7 +488,7 @@ Post-Release for reference): - The pandas-dev and pydata mailing lists - - Twitter, Mastodon, Telegram and LinkedIn + - X, Mastodon, Telegram and LinkedIn 7. Update this release instructions to fix anything incorrect and to update about any change since the last release. diff --git a/doc/source/getting_started/comparison/comparison_with_spss.rst b/doc/source/getting_started/comparison/comparison_with_spss.rst new file mode 100644 index 0000000000000..12c64bfd180a3 --- /dev/null +++ b/doc/source/getting_started/comparison/comparison_with_spss.rst @@ -0,0 +1,229 @@ +.. _compare_with_spss: + +{{ header }} + +Comparison with SPSS +******************** +For potential users coming from `SPSS `__, this page is meant to demonstrate +how various SPSS operations would be performed using pandas. + +.. include:: includes/introduction.rst + +Data structures +--------------- + +General terminology translation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. csv-table:: + :header: "pandas", "SPSS" + :widths: 20, 20 + + :class:`DataFrame`, data file + column, variable + row, case + groupby, split file + :class:`NaN`, system-missing + +:class:`DataFrame` +~~~~~~~~~~~~~~~~~~ + +A :class:`DataFrame` in pandas is analogous to an SPSS data file - a two-dimensional +data source with labeled columns that can be of different types. As will be shown in this +document, almost any operation that can be performed in SPSS can also be accomplished in pandas. + +:class:`Series` +~~~~~~~~~~~~~~~ + +A :class:`Series` is the data structure that represents one column of a :class:`DataFrame`. SPSS doesn't have a +separate data structure for a single variable, but in general, working with a :class:`Series` is analogous +to working with a variable in SPSS. + +:class:`Index` +~~~~~~~~~~~~~~ + +Every :class:`DataFrame` and :class:`Series` has an :class:`Index` -- labels on the *rows* of the data. SPSS does not +have an exact analogue, as cases are simply numbered sequentially from 1. In pandas, if no index is +specified, a :class:`RangeIndex` is used by default (first row = 0, second row = 1, and so on). 
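
A minimal sketch of this default numbering, using a small hypothetical frame (the ``tips`` dataset introduced below behaves the same way):

.. code-block:: python

    import pandas as pd

    # No index is specified, so pandas assigns a RangeIndex:
    # rows are labeled 0, 1, 2, ... (SPSS would number these cases 1, 2, 3).
    df = pd.DataFrame({"age": [21, 35, 58]})
    df.index  # RangeIndex(start=0, stop=3, step=1)
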
+ +While using a labeled :class:`Index` or :class:`MultiIndex` can enable sophisticated analyses and is ultimately an +important part of pandas to understand, for this comparison we will essentially ignore the :class:`Index` and +just treat the :class:`DataFrame` as a collection of columns. Please see the :ref:`indexing documentation` +for much more on how to use an :class:`Index` effectively. + + +Copies vs. in place operations +------------------------------ + +.. include:: includes/copies.rst + + +Data input / output +------------------- + +Reading external data +~~~~~~~~~~~~~~~~~~~~~ + +Like SPSS, pandas provides utilities for reading in data from many formats. The ``tips`` dataset, found within +the pandas tests (`csv `_) +will be used in many of the following examples. + +In SPSS, you would use File > Open > Data to import a CSV file: + +.. code-block:: text + + FILE > OPEN > DATA + /TYPE=CSV + /FILE='tips.csv' + /DELIMITERS="," + /FIRSTCASE=2 + /VARIABLES=col1 col2 col3. + +The pandas equivalent would use :func:`read_csv`: + +.. code-block:: python + + url = ( + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" + ) + tips = pd.read_csv(url) + tips + +Like SPSS's data import wizard, ``read_csv`` can take a number of parameters to specify how the data should be parsed. +For example, if the data was instead tab delimited, and did not have column names, the pandas command would be: + +.. code-block:: python + + tips = pd.read_csv("tips.csv", sep="\t", header=None) + + # alternatively, read_table is an alias to read_csv with tab delimiter + tips = pd.read_table("tips.csv", header=None) + + +Data operations +--------------- + +Filtering +~~~~~~~~~ + +In SPSS, filtering is done through Data > Select Cases: + +.. code-block:: text + + SELECT IF (total_bill > 10). + EXECUTE. + +In pandas, boolean indexing can be used: + +.. code-block:: python + + tips[tips["total_bill"] > 10] + + +Sorting +~~~~~~~ + +In SPSS, sorting is done through Data > Sort Cases: + +.. code-block:: text + + SORT CASES BY sex total_bill. + EXECUTE. + +In pandas, this would be written as: + +.. code-block:: python + + tips.sort_values(["sex", "total_bill"]) + + +String processing +----------------- + +Finding length of string +~~~~~~~~~~~~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE length = LENGTH(time). + EXECUTE. + +.. include:: includes/length.rst + + +Changing case +~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE upper = UPCASE(time). + COMPUTE lower = LOWER(time). + EXECUTE. + +.. include:: includes/case.rst + + +Merging +------- + +In SPSS, merging data files is done through Data > Merge Files. + +.. include:: includes/merge_setup.rst +.. include:: includes/merge.rst + + +GroupBy operations +------------------ + +Split-file processing +~~~~~~~~~~~~~~~~~~~~~ + +In SPSS, split-file analysis is done through Data > Split File: + +.. code-block:: text + + SORT CASES BY sex. + SPLIT FILE BY sex. + DESCRIPTIVES VARIABLES=total_bill tip + /STATISTICS=MEAN STDDEV MIN MAX. + +The pandas equivalent would be: + +.. code-block:: python + + tips.groupby("sex")[["total_bill", "tip"]].agg(["mean", "std", "min", "max"]) + + +Missing data +------------ + +SPSS uses the period (``.``) for numeric missing values and blank spaces for string missing values. +pandas uses ``NaN`` (Not a Number) for numeric missing values and ``None`` or ``NaN`` for string +missing values. + +.. 
include:: includes/missing.rst + + +Other considerations +-------------------- + +Output management +~~~~~~~~~~~~~~~~~ + +While pandas does not have a direct equivalent to SPSS's Output Management System (OMS), you can +capture and export results in various ways: + +.. code-block:: python + + # Save summary statistics to CSV + tips.groupby("sex")[["total_bill", "tip"]].mean().to_csv("summary.csv") + + # Save multiple results to Excel sheets + with pd.ExcelWriter("results.xlsx") as writer: + tips.describe().to_excel(writer, sheet_name="Descriptives") + # Select the numeric columns first; calling .mean() on the whole + # frame would fail on the string columns (smoker, day, time) + tips.groupby("sex")[["total_bill", "tip"]].mean().to_excel(writer, sheet_name="Means by Gender") diff --git a/doc/source/getting_started/comparison/index.rst b/doc/source/getting_started/comparison/index.rst index c3f58ce1f3d6d..3133d74afa3db 100644 --- a/doc/source/getting_started/comparison/index.rst +++ b/doc/source/getting_started/comparison/index.rst @@ -14,3 +14,4 @@ Comparison with other tools comparison_with_spreadsheets comparison_with_sas comparison_with_stata + comparison_with_spss diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 4393c3716bdad..eae7771418485 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -112,7 +112,7 @@ Various tutorials * `Wes McKinney's (pandas BDFL) blog `_ * `Statistical analysis made easy in Python with SciPy and pandas DataFrames, by Randal Olson `_ -* `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_ +* `Statistical Data Analysis in Python, tutorial by Christopher Fonnesbeck from SciPy 2013 `_ * `Financial analysis in Python, by Thomas Wiecki `_ * `Intro to pandas data structures, by Greg Reda `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 3b02ffe20c10e..fc180c8161a7e 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -104,6 +104,7 @@ Function application DataFrameGroupBy.shift DataFrameGroupBy.size DataFrameGroupBy.skew + DataFrameGroupBy.kurt DataFrameGroupBy.std DataFrameGroupBy.sum DataFrameGroupBy.var @@ -159,6 +160,7 @@ Function application SeriesGroupBy.shift SeriesGroupBy.size SeriesGroupBy.skew + SeriesGroupBy.kurt SeriesGroupBy.std SeriesGroupBy.sum SeriesGroupBy.var diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 43d7480899dc4..6006acc8f5e16 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -25,6 +25,7 @@ Attributes Series.array Series.values Series.dtype + Series.info Series.shape Series.nbytes Series.ndim diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 0e1d93841d52f..742263c788c2f 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -27,6 +27,7 @@ Styler properties Styler.template_html_style Styler.template_html_table Styler.template_latex + Styler.template_typst Styler.template_string Styler.loader @@ -77,6 +78,7 @@ Style export and import Styler.to_html Styler.to_latex + Styler.to_typst Styler.to_excel Styler.to_string Styler.export diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 14af2b8a120e0..2aeb57faac112 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -30,11 +30,14 @@ Rolling window functions Rolling.std Rolling.min Rolling.max + Rolling.first + Rolling.last Rolling.corr Rolling.cov Rolling.skew Rolling.kurt
Rolling.apply + Rolling.pipe Rolling.aggregate Rolling.quantile Rolling.sem @@ -71,11 +74,14 @@ Expanding window functions Expanding.std Expanding.min Expanding.max + Expanding.first + Expanding.last Expanding.corr Expanding.cov Expanding.skew Expanding.kurt Expanding.apply + Expanding.pipe Expanding.aggregate Expanding.quantile Expanding.sem diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 1525afcac87f7..91a0b4a4fe967 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df # List the size of the animals with the highest weight. - df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) `Using get_group `__ @@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) - expected_df = gb.apply(GrowUp, include_groups=False) + expected_df = gb.apply(GrowUp) expected_df `Expanding apply @@ -874,7 +874,7 @@ Timeseries `__ `Aggregation and plotting time series -`__ +`__ Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. `How to rearrange a Python pandas DataFrame? @@ -1043,7 +1043,7 @@ CSV The :ref:`CSV ` docs -`read_csv in action `__ +`read_csv in action `__ `appending to a csv `__ diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index c4721f3a6b09c..e55a6cda47ac2 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -427,7 +427,7 @@ prefer that Numba throw an error if it cannot compile a function in a way that speeds up your code, pass Numba the argument ``nopython=True`` (e.g. ``@jit(nopython=True)``). For more on troubleshooting Numba modes, see the `Numba troubleshooting page -`__. +`__. Using ``parallel=True`` (e.g. ``@jit(parallel=True)``) may result in a ``SIGABRT`` if the threading layer leads to unsafe behavior. You can first `specify a safe threading layer `__ diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 842f30f06676e..e85eead4e0f09 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -372,5 +372,5 @@ constructors using something similar to the following: s = pd.Series(newx) See `the NumPy documentation on byte order -`__ for more +`__ for more details. diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index acb5a2b7919ac..4ec34db6ed959 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -418,7 +418,7 @@ You can also include the grouping columns if you want to operate on them. .. note:: - The ``groupby`` operation in Pandas drops the ``name`` field of the columns Index object + The ``groupby`` operation in pandas drops the ``name`` field of the columns Index object after the operation. This change ensures consistency in syntax between different column selection methods within groupby operations. @@ -1074,7 +1074,7 @@ missing values with the ``ffill()`` method. ).set_index("date") df_re - df_re.groupby("group").resample("1D", include_groups=False).ffill() + df_re.groupby("group").resample("1D").ffill() .. 
_groupby.filter: @@ -1252,13 +1252,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False) + df.groupby("A", group_keys=True).apply(lambda x: x) with .. ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) + df.groupby("A", group_keys=False).apply(lambda x: x) Numba accelerated routines @@ -1742,7 +1742,7 @@ column index name will be used as the name of the inserted column: result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics, include_groups=False) + result = df.groupby("a").apply(compute_metrics) result diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 503f7cc7cbe73..ed5c7806b2e23 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -858,9 +858,10 @@ and :ref:`Advanced Indexing ` you may select along more than one axis .. warning:: - ``iloc`` supports two kinds of boolean indexing. If the indexer is a boolean ``Series``, - an error will be raised. For instance, in the following example, ``df.iloc[s.values, 1]`` is ok. - The boolean indexer is an array. But ``df.iloc[s, 1]`` would raise ``ValueError``. + While ``loc`` supports two kinds of boolean indexing, ``iloc`` only supports indexing with a + boolean array. If the indexer is a boolean ``Series``, an error will be raised. For instance, + in the following example, ``df.iloc[s.values, 1]`` is ok. The boolean indexer is an array. + But ``df.iloc[s, 1]`` would raise ``ValueError``. .. ipython:: python diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7c165c87adb46..07d06f61b3fd6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -990,7 +990,7 @@ Thousand separators For large numbers that have been written with a thousands separator, you can set the ``thousands`` keyword to a string of length 1 so that integers will be parsed -correctly: +correctly. By default, numbers with a thousands separator will be parsed as strings: @@ -2340,6 +2340,7 @@ Read a URL with no options: .. code-block:: ipython In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" + In [321]: pd.read_html(url) Out[321]: [ Bank NameBank CityCity StateSt ... Acquiring InstitutionAI Closing DateClosing FundFund @@ -2366,6 +2367,7 @@ Read a URL while passing headers alongside the HTTP request: .. 
code-block:: ipython In [322]: url = 'https://www.sump.org/notes/request/' # HTTP request reflector + In [323]: pd.read_html(url) Out[323]: [ 0 1 @@ -2378,14 +2380,16 @@ Read a URL while passing headers alongside the HTTP request: 1 Host: www.sump.org 2 User-Agent: Python-urllib/3.8 3 Connection: close] + In [324]: headers = { - In [325]: 'User-Agent':'Mozilla Firefox v14.0', - In [326]: 'Accept':'application/json', - In [327]: 'Connection':'keep-alive', - In [328]: 'Auth':'Bearer 2*/f3+fe68df*4' - In [329]: } - In [340]: pd.read_html(url, storage_options=headers) - Out[340]: + .....: 'User-Agent':'Mozilla Firefox v14.0', + .....: 'Accept':'application/json', + .....: 'Connection':'keep-alive', + .....: 'Auth':'Bearer 2*/f3+fe68df*4' + .....: } + + In [325]: pd.read_html(url, storage_options=headers) + Out[325]: [ 0 1 0 Remote Socket: 51.15.105.256:51760 1 Protocol Version: HTTP/1.1 diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index cfd2f40aa93a3..60a66f5e6f2a8 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -586,7 +586,7 @@ A string argument to ``indicator`` will use the value as the name for the indica Overlapping value columns ~~~~~~~~~~~~~~~~~~~~~~~~~ -The merge ``suffixes`` argument takes a tuple of list of strings to append to +The merge ``suffixes`` argument takes a tuple or list of strings to append to overlapping column names in the input :class:`DataFrame` to disambiguate the result columns: @@ -906,7 +906,7 @@ resetting indexes. Joining multiple :class:`DataFrame` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A list or tuple of ``:class:`DataFrame``` can also be passed to :meth:`~DataFrame.join` +A list or tuple of :class:`DataFrame` can also be passed to :meth:`~DataFrame.join` to join them together on their indexes. .. ipython:: python @@ -979,7 +979,7 @@ nearest key rather than equal keys. For each row in the ``left`` :class:`DataFra the last row in the ``right`` :class:`DataFrame` are selected where the ``on`` key is less than the left's key. Both :class:`DataFrame` must be sorted by the key. -Optionally an :func:`merge_asof` can perform a group-wise merge by matching the +Optionally :func:`merge_asof` can perform a group-wise merge by matching the ``by`` key in addition to the nearest match on the ``on`` key. .. ipython:: python diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst index aecbce0441b53..1807341530e69 100644 --- a/doc/source/user_guide/pyarrow.rst +++ b/doc/source/user_guide/pyarrow.rst @@ -22,7 +22,7 @@ Data Structure Integration A :class:`Series`, :class:`Index`, or the columns of a :class:`DataFrame` can be directly backed by a :external+pyarrow:py:class:`pyarrow.ChunkedArray` which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by -``[pyarrow]``, e.g. ``"int64[pyarrow]""`` into the ``dtype`` parameter +``[pyarrow]``, e.g. ``"int64[pyarrow]"`` into the ``dtype`` parameter .. ipython:: python diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 29df2994fbc35..d12993f7ead4b 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -5,7 +5,7 @@ Scaling to large datasets ************************* pandas provides data structures for in-memory analytics, which makes using pandas -to analyze datasets that are larger than memory datasets somewhat tricky. 
Even datasets +to analyze datasets that are larger than memory somewhat tricky. Even datasets that are a sizable fraction of memory become unwieldy, as some pandas operations need to make intermediate copies. diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index abb7181fc8d72..9cda1486eb48b 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1288,7 +1288,7 @@ "outputs": [], "source": [ "df2.loc[:4].style.highlight_max(\n", - " axis=1, props=(\"color:white; \" \"font-weight:bold; \" \"background-color:darkblue;\")\n", + " axis=1, props=(\"color:white; font-weight:bold; background-color:darkblue;\")\n", ")" ] }, diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 4299dca4774b9..d046d13f71daf 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1580,7 +1580,7 @@ the pandas objects. ts = ts[:5] ts.shift(1) -The ``shift`` method accepts an ``freq`` argument which can accept a +The ``shift`` method accepts a ``freq`` argument which can accept a ``DateOffset`` class or other ``timedelta``-like object or also an :ref:`offset alias `. @@ -2570,7 +2570,7 @@ because daylight savings time (DST) in a local time zone causes some times to oc twice within one day ("clocks fall back"). The following options are available: * ``'raise'``: Raises a ``ValueError`` (the default behavior) -* ``'infer'``: Attempt to determine the correct offset base on the monotonicity of the timestamps +* ``'infer'``: Attempt to determine the correct offset based on the monotonicity of the timestamps * ``'NaT'``: Replaces ambiguous times with ``NaT`` * ``bool``: ``True`` represents a DST time, ``False`` represents non-DST time. An array-like of ``bool`` values is supported for a sequence of times. diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 66eeb74b363a3..4b5cdca23103c 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1210,11 +1210,6 @@ You may set the ``xlabel`` and ``ylabel`` arguments to give the plot custom labe for x and y axis. By default, pandas will pick up index name as xlabel, while leaving it empty for ylabel. -.. ipython:: python - :suppress: - - plt.figure(); - .. ipython:: python df.plot(); diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index 0581951d5bfad..5b27442c80bb8 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -70,7 +70,8 @@ which will first group the data by the specified keys and then perform a windowi Some windowing aggregation, ``mean``, ``sum``, ``var`` and ``std`` methods may suffer from numerical imprecision due to the underlying windowing algorithms accumulating sums. When values differ - with magnitude :math:`1/np.finfo(np.double).eps` this results in truncation. It must be + with magnitude ``1/np.finfo(np.double).eps`` (approximately :math:`4.5 \times 10^{15}`), + this results in truncation. It must be noted, that large values may have an impact on windows, which do not include these values. `Kahan summation `__ is used to compute the rolling sums to preserve accuracy as much as possible. @@ -356,11 +357,11 @@ See :ref:`enhancing performance with Numba ` for general us Numba will be applied in potentially two routines: -#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. 
``func`` can also be a JITed function in which case the engine will not JIT the function again. +#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. #. The engine will JIT the for loop where the apply function is applied to each window. The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the -`numba.jit decorator `__. +`numba.jit decorator `__. These keyword arguments will be applied to *both* the passed function (if a standard Python function) and the apply for loop over each window. diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index b107a5d3ba100..09134763977c3 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,8 +35,11 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) +- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) +- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) +- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- .. --------------------------------------------------------------------------- .. _whatsnew_230.notable_bug_fixes: @@ -51,6 +54,16 @@ These are bug fixes that might have notable behavior changes. notable_bug_fix1 ^^^^^^^^^^^^^^^^ +.. _whatsnew_230.api_changes: + +API changes +~~~~~~~~~~~ + +- When enabling the ``future.infer_string`` option: Index set operations (like + union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or + empty ``Index`` with object dtype when determining the dtype of the resulting + Index (:issue:`60797`) + .. --------------------------------------------------------------------------- .. _whatsnew_230.deprecations: @@ -95,7 +108,7 @@ Timezones Numeric ^^^^^^^ -- +- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`) - Conversion @@ -105,6 +118,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) @@ -175,7 +189,6 @@ Other ^^^^^ - Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2`` are not installed (:issue:`60196`) -- .. 
--------------------------------------------------------------------------- .. _whatsnew_230.contributors: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f33d56bbed6d6..edd205860b4e4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,8 +30,13 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) +- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the DataFrame Interchange Protocol if that fails (:issue:`60739`) +- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) +- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) +- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) +- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`) - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
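A minimal sketch of the anti-join enhancement listed above, assuming the ``how="left_anti"`` spelling from the changelog entry; the frames and column names are illustrative::

    import pandas as pd

    left = pd.DataFrame({"key": [1, 2, 3], "a": ["x", "y", "z"]})
    right = pd.DataFrame({"key": [2, 3, 4], "b": ["p", "q", "r"]})

    # A left anti join keeps only the rows of `left` whose key has no
    # match in `right`; here only the row with key == 1 survives.
    pd.merge(left, right, on="key", how="left_anti")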
@@ -44,6 +49,7 @@ Other enhancements - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) +- :class:`Rolling` and :class:`Expanding` now support the ``pipe`` method (:issue:`57076`) - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) @@ -51,15 +57,23 @@ Other enhancements - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) +- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`) - :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept a ``skipna`` parameter (:issue:`15675`) +- :class:`Rolling` and :class:`Expanding` now support the aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas`, which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) +- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) +- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not
``None`` (:issue:`59274`) -- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) +- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) +- Add ``"delete_rows"`` option to the ``if_exists`` argument in :meth:`DataFrame.to_sql`, which deletes all records of the table before inserting data (:issue:`37210`). +- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) +- Implemented :meth:`Series.str.isascii` (:issue:`59091`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) @@ -344,10 +358,14 @@ Other API changes - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) - Removed :meth:`Index.sort` which always raised a ``TypeError``. This attribute is not defined and will raise an ``AttributeError`` (:issue:`59283`) +- The unused ``dtype`` argument has been removed from the :class:`MultiIndex` constructor (:issue:`60962`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`) - when comparing the indexes in :func:`testing.assert_series_equal`, check_exact defaults to True if an :class:`Index` is of integer dtype. (:issue:`57386`) +- Index set operations (like union or intersection) will now ignore the dtype of + an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining + the dtype of the resulting Index (:issue:`60797`) .. --------------------------------------------------------------------------- .. _whatsnew_300.deprecations: @@ -554,6 +572,7 @@ Other Removals - Removed the ``method`` keyword in ``ExtensionArray.fillna``, implement ``ExtensionArray._pad_or_backfill`` instead (:issue:`53621`) - Removed the attribute ``dtypes`` from :class:`.DataFrameGroupBy` (:issue:`51997`) - Enforced deprecation of ``argmin``, ``argmax``, ``idxmin``, and ``idxmax`` returning a result when ``skipna=False`` and an NA value is encountered or all values are NA values; these operations will now raise in such cases (:issue:`33941`, :issue:`51276`) +- Removed specifying ``include_groups=True`` in :class:`.DataFrameGroupBy.apply` and :class:`.Resampler.apply` (:issue:`7155`) .. --------------------------------------------------------------------------- ..
_whatsnew_300.performance: @@ -621,6 +640,7 @@ Datetimelike - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) +- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`) - Bug in :meth:`DataFrame.agg` with a df with missing values resulting in an ``IndexError`` (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business day frequencies bigger than "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) @@ -652,6 +672,7 @@ Conversion - Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`) - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) +- Bug in :meth:`Series.convert_dtypes` and :meth:`DataFrame.convert_dtypes` removing timezone information for objects with :class:`ArrowDtype` (:issue:`60237`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) Strings @@ -683,6 +704,7 @@ MultiIndex - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`) - :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`) - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`) +- Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`) - I/O @@ -748,12 +770,14 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) +- Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``.
(:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) +- Bug in :meth:`DataFrame.stack` with the new implementation where a ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`) Sparse @@ -764,6 +788,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ +- Bug in :class:`Categorical` when constructing with an :class:`Index` with :class:`ArrowDtype` (:issue:`60563`) - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) @@ -777,6 +802,7 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) +- Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`) - Bug in :func:`eval` on :class:`ExtensionArray` where including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) @@ -792,10 +818,13 @@ Other - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) +- Bug in :meth:`MultiIndex.fillna` where the error message referred to ``isna`` instead of ``fillna`` (:issue:`60974`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) +- Bug in :meth:`Series.isin` raising ``TypeError`` when the series is large (>10**6) and ``values`` contains NA (:issue:`60678`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present.
(:issue:`56599`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing a ``ValueError`` when ``regex=True`` and the values are all NA. (:issue:`60688`) - Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`) - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in the DataFrame Interchange Protocol implementation returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) @@ -803,6 +832,7 @@ Other - Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`) - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) +- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) .. ***DO NOT USE THIS SECTION*** diff --git a/meson.build b/meson.build index efe543b7a267c..66583095a6e77 100644 --- a/meson.build +++ b/meson.build @@ -1,15 +1,13 @@ # This file is adapted from https://github.com/scipy/scipy/blob/main/meson.build project( 'pandas', - 'c', 'cpp', 'cython', + 'c', + 'cpp', + 'cython', version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', meson_version: '>=1.2.1', - default_options: [ - 'buildtype=release', - 'c_std=c11', - 'warning_level=2', - ] + default_options: ['buildtype=release', 'c_std=c11', 'warning_level=2'], ) fs = import('fs') @@ -18,41 +16,40 @@ tempita = files('generate_pxi.py') versioneer = files('generate_version.py') -add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'c') -add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'cpp') +add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'c') +add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'cpp') # Allow supporting older numpys than the version compiled against # Set the define to the min supported version of numpy for pandas # e.g.
right now this is targeting numpy 1.21+ -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c') -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'cpp') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language: 'c') +add_project_arguments( + '-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', + language: 'cpp', +) if fs.exists('_version_meson.py') py.install_sources('_version_meson.py', subdir: 'pandas') else - custom_target('write_version_file', + custom_target( + 'write_version_file', output: '_version_meson.py', - command: [ - py, versioneer, '-o', '@OUTPUT@' - ], + command: [py, versioneer, '-o', '@OUTPUT@'], build_by_default: true, build_always_stale: true, install: true, - install_dir: py.get_install_dir() / 'pandas' + install_dir: py.get_install_dir() / 'pandas', ) meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif cy = meson.get_compiler('cython') if cy.version().version_compare('>=3.1.0') - add_project_arguments('-Xfreethreading_compatible=true', language : 'cython') + add_project_arguments('-Xfreethreading_compatible=true', language: 'cython') endif # Needed by pandas.test() when it looks for the pytest ini options -py.install_sources( - 'pyproject.toml', - subdir: 'pandas' -) +py.install_sources('pyproject.toml', subdir: 'pandas') subdir('pandas') diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 35139979f92fe..ce53e05608ba7 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -141,6 +141,10 @@ def get_option(pat: str) -> Any: """ Retrieve the value of the specified option. + This method allows users to query the current value of a given option + in the pandas configuration system. Options control various display, + performance, and behavior-related settings within pandas. + Parameters ---------- pat : str @@ -321,6 +325,11 @@ def reset_option(pat: str) -> None: """ Reset one or more options to their default value. + This method resets the specified pandas option(s) back to their default + values. It allows partial string matching for convenience, but users should + exercise caution to avoid unintended resets due to changes in option names + in future versions. + Parameters ---------- pat : str/regex @@ -424,6 +433,11 @@ def option_context(*args) -> Generator[None]: """ Context manager to temporarily set options in a ``with`` statement. + This method allows users to set one or more pandas options temporarily + within a controlled block. The previous options' values are restored + once the block is exited. This is useful when making temporary adjustments + to pandas' behavior without affecting the global state. 
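As a usage sketch of the behavior this docstring describes (the option names are standard pandas display options; comments note the expected values)::

    import pandas as pd

    # Options set inside the block are restored to their previous
    # values as soon as the block exits.
    with pd.option_context("display.max_rows", 10, "display.max_columns", 5):
        print(pd.get_option("display.max_rows"))  # 10 inside the block
    print(pd.get_option("display.max_rows"))      # prior value again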
+ Parameters ---------- *args : str | object diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index f2b4baf508986..60ee73ef6b43f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -818,33 +818,7 @@ def is_monotonic(const numeric_object_t[:] arr, bint timelike): if timelike and arr[0] == NPY_NAT: return False, False, False - if numeric_object_t is not object: - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == NPY_NAT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - else: - # object-dtype, identical to above except we cannot use `with nogil` + with nogil(numeric_object_t is not object): prev = arr[0] for i in range(1, n): cur = arr[i] diff --git a/pandas/_libs/byteswap.pyx b/pandas/_libs/byteswap.pyx index 67cd7ad58d229..7a8a9fc5a9139 100644 --- a/pandas/_libs/byteswap.pyx +++ b/pandas/_libs/byteswap.pyx @@ -15,7 +15,7 @@ from libc.string cimport memcpy def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint32_t value - assert offset + sizeof(value) < len(data) + assert offset + sizeof(value) < len(data) cdef const void *ptr = (data) + offset memcpy(&value, ptr, sizeof(value)) if byteswap: @@ -28,7 +28,7 @@ def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint64_t value - assert offset + sizeof(value) < len(data) + assert offset + sizeof(value) < len(data) cdef const void *ptr = (data) + offset memcpy(&value, ptr, sizeof(value)) if byteswap: @@ -41,7 +41,7 @@ def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint16_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap2(res) @@ -50,7 +50,7 @@ def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint32_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap4(res) @@ -59,7 +59,7 @@ def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap): cdef uint64_t res - assert offset + sizeof(res) < len(data) + assert offset + sizeof(res) < len(data) memcpy(&res, (data) + offset, sizeof(res)) if byteswap: res = _byteswap8(res) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 53f5f73624232..163fc23535022 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -13,6 +13,7 @@ def group_median_float64( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., # bint + skipna: bool = ..., ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] @@ -66,6 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... 
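The ``skipna`` flag threaded through these stubs backs the user-facing groupby reductions noted earlier in the v3.0.0 whatsnew; a minimal sketch of the intended behavior, assuming the new keyword (data is illustrative)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1.0, np.nan, 3.0]})

    df.groupby("g")["x"].sum()              # default skipna=True: group "a" -> 1.0
    df.groupby("g")["x"].sum(skipna=False)  # NA propagates: group "a" -> NaN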
def group_prod( out: np.ndarray, # int64float_t[:, ::1] @@ -75,6 +77,7 @@ def group_prod( mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., + skipna: bool = ..., ) -> None: ... def group_var( out: np.ndarray, # floating[:, ::1] @@ -87,6 +90,7 @@ def group_var( result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., name: str = ..., + skipna: bool = ..., ) -> None: ... def group_skew( out: np.ndarray, # float64_t[:, ::1] @@ -97,6 +101,15 @@ def group_skew( result_mask: np.ndarray | None = ..., skipna: bool = ..., ) -> None: ... +def group_kurt( + out: np.ndarray, # float64_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[float64_T, ndim=2] + labels: np.ndarray, # const intp_t[::1] + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + skipna: bool = ..., +) -> None: ... def group_mean( out: np.ndarray, # floating[:, ::1] counts: np.ndarray, # int64_t[::1] @@ -106,6 +119,7 @@ def group_mean( is_datetimelike: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_ohlc( out: np.ndarray, # floatingintuint_t[:, ::1] @@ -172,6 +186,7 @@ def group_max( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_min( out: np.ndarray, # groupby_t[:, ::1] @@ -182,6 +197,7 @@ def group_min( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_idxmin_idxmax( out: npt.NDArray[np.intp], diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d7e485f74e58b..f65fa2368967a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -62,7 +62,12 @@ cdef enum InterpolationEnumType: INTERPOLATION_MIDPOINT -cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil: +cdef float64_t median_linear_mask( + float64_t* a, + int n, + uint8_t* mask, + bint skipna=True +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) @@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n cdef float64_t median_linear( float64_t* a, int n, - bint is_datetimelike=False + bint is_datetimelike=False, + bint skipna=True, ) noexcept nogil: cdef: int i, j, na_count = 0 @@ -125,7 +131,7 @@ cdef float64_t median_linear( na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) @@ -186,6 +192,7 @@ def group_median_float64( const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -229,7 +236,7 @@ def group_median_float64( for j in range(ngroups): size = _counts[j + 1] - result = median_linear_mask(ptr, size, ptr_mask) + result = median_linear_mask(ptr, size, ptr_mask, skipna) out[j, i] = result if result != result: @@ -244,7 +251,7 @@ def group_median_float64( ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = median_linear(ptr, size, is_datetimelike) + out[j, i] = median_linear(ptr, size, is_datetimelike, skipna) ptr += size @@ -700,18 +707,19 @@ 
def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 using Kahan summation """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - sum_t val, t, y + sum_t val, t, y, nan_val sum_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -722,6 +730,15 @@ def group_sum( compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape + if uses_mask: + nan_val = 0 + elif is_datetimelike: + nan_val = NPY_NAT + elif sum_t is int64_t or sum_t is uint64_t: + # This has no effect as int64 can't be nan. Setting to 0 to avoid type error + nan_val = 0 + else: + nan_val = NAN with nogil(sum_t is not object): for i in range(N): @@ -739,6 +756,18 @@ def group_sum( else: isna_entry = _treat_as_na(val, is_datetimelike) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in a NPY_NAT + continue + if not isna_entry: nobs[lab, j] += 1 @@ -765,6 +794,11 @@ def group_sum( # because of no gil compensation[lab, j] = 0 sumx[lab, j] = t + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx ) @@ -781,17 +815,18 @@ def group_prod( const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64float_t val + int64float_t val, nan_val int64float_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -800,6 +835,7 @@ def group_prod( prodx = np.ones((out).shape, dtype=(out).base.dtype) N, K = (values).shape + nan_val = _get_na_val(0, False) with nogil: for i in range(N): @@ -816,9 +852,24 @@ def group_prod( else: isna_entry = _treat_as_na(val, False) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na(prodx[lab, j], False) + + if isna_result: + # If prod is already NA, no need to update it + continue + if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + prodx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx @@ -839,6 +890,7 @@ def group_var( uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, str name="var", + bint skipna=True, ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) @@ -846,7 +898,7 @@ def group_var( floating[:, ::1] mean int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None bint is_std = name == "std" bint is_sem = name == "sem" @@ -883,11 +935,34 @@ def group_var( else: isna_entry =
_treat_as_na(val, is_datetimelike) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + elif is_datetimelike: + # With group_var, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_result = out[lab, j] == NPY_NAT + else: + isna_result = _treat_as_na(out[lab, j], is_datetimelike) + + if isna_result: + # If aggregate is already NA, don't add to it. This is + # important for datetimelike because adding a value to NPY_NAT + # may not result in a NPY_NAT + continue + if not isna_entry: nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + elif not skipna: + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + out[lab, j] = NAN for i in range(ncounts): for j in range(K): @@ -910,7 +985,7 @@ def group_var( @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -@cython.cpow +@cython.cpow(True) def group_skew( float64_t[:, ::1] out, int64_t[::1] counts, @@ -961,7 +1036,7 @@ def group_skew( isna_entry = _treat_as_na(val, False) if not isna_entry: - # Based on RunningStats::Push from + # Running stats update based on RunningStats::Push from # https://www.johndcook.com/blog/skewness_kurtosis/ n1 = nobs[lab, j] n = n1 + 1 @@ -995,6 +1070,100 @@ def group_skew( ) +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +@cython.cpow(True) +def group_kurt( + float64_t[:, ::1] out, + int64_t[::1] counts, + ndarray[float64_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, + bint skipna=True, +) -> None: + cdef: + Py_ssize_t i, j, N, K, lab, ngroups = len(counts) + int64_t[:, ::1] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = mask is not None + float64_t[:, ::1] M1, M2, M3, M4 + float64_t delta, delta_n, delta_n2, term1, val + int64_t n1, n + float64_t ct, num, den, adj + + if len_values != len_labels: + raise ValueError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + + # M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments + M1 = np.zeros((out).shape, dtype=np.float64) + M2 = np.zeros((out).shape, dtype=np.float64) + M3 = np.zeros((out).shape, dtype=np.float64) + M4 = np.zeros((out).shape, dtype=np.float64) + + N, K = (values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, False) + + if not isna_entry: + # Running stats update based on RunningStats::Push from + # https://www.johndcook.com/blog/skewness_kurtosis/ + n1 = nobs[lab, j] + n = n1 + 1 + + nobs[lab, j] = n + delta = val - M1[lab, j] + delta_n = delta / n + delta_n2 = delta_n * delta_n + term1 = delta * delta_n * n1 + + M1[lab, j] += delta_n + M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3) + + 6 * delta_n2 * M2[lab, j] + - 4 * delta_n * M3[lab, j]) + M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j] + M2[lab, j] += term1 + elif not skipna: + M1[lab, j] = NaN + M2[lab, j] = NaN + M3[lab, j] = NaN + M4[lab, j] = NaN + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 4: + if result_mask is not None: + result_mask[i, j] = 1 + out[i, j] = NaN + elif M2[i, j] == 0: + out[i, j] = 0 + else: + num = ct * (ct + 1) * 
(ct - 1) * M4[i, j] + den = (ct - 2) * (ct - 3) * M2[i, j] ** 2 + adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3)) + out[i, j] = num / den - adj + + @cython.wraparound(False) @cython.boundscheck(False) def group_mean( @@ -1006,6 +1175,7 @@ def group_mean( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """ Compute the mean per label given a label assignment for each value. @@ -1031,6 +1201,8 @@ def group_mean( Mask of the input values. result_mask : ndarray[bool, ndim=2], optional Mask of the out array + skipna : bool, optional + If True, ignore nans in `values`. Notes ----- @@ -1044,7 +1216,7 @@ def group_mean( mean_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None assert min_count == -1, "'min_count' only used in sum and prod" @@ -1084,6 +1256,23 @@ def group_mean( else: isna_entry = _treat_as_na(val, is_datetimelike) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + elif is_datetimelike: + # With group_mean, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_result = sumx[lab, j] == NPY_NAT + else: + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in NPY_NAT + continue + if not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] @@ -1097,6 +1286,14 @@ def group_mean( # because of no gil compensation[lab, j] = 0. sumx[lab, j] = t + elif not skipna: + # Set the nobs to 0 so that in case of datetimelike, + # dividing NPY_NAT by nobs may not result in a NPY_NAT + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = nan_val for i in range(ncounts): for j in range(K): @@ -1668,6 +1865,7 @@ cdef group_min_max( bint compute_max=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1695,6 +1893,8 @@ cdef group_min_max( result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. + skipna : bool, default True + If True, ignore nans in `values`. 
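For reference, the moment accumulation in ``group_kurt`` above computes the usual bias-adjusted sample excess kurtosis; a non-incremental NumPy sketch of the same estimator (the helper name is hypothetical)::

    import numpy as np

    def excess_kurtosis(x: np.ndarray) -> float:
        # Mirrors the final pass of group_kurt: fewer than 4 observations
        # yield NaN, and a zero second moment yields 0.
        n = x.size
        if n < 4:
            return np.nan
        d = x - x.mean()
        m2 = np.sum(d ** 2)
        m4 = np.sum(d ** 4)
        if m2 == 0:
            return 0.0
        num = n * (n + 1) * (n - 1) * m4
        den = (n - 2) * (n - 3) * m2 ** 2
        adj = 3.0 * (n - 1) ** 2 / ((n - 2) * (n - 3))
        return num / den - adj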
Notes ----- @@ -1703,17 +1903,18 @@ cdef group_min_max( """ cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - numeric_t val + numeric_t val, nan_val numeric_t[:, ::1] group_min_or_max int64_t[:, ::1] nobs bint uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) + nan_val = _get_na_val(0, is_datetimelike) group_min_or_max = np.empty_like(out) group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) @@ -1735,6 +1936,18 @@ cdef group_min_max( else: isna_entry = _treat_as_na(val, is_datetimelike) + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na( + group_min_or_max[lab, j], is_datetimelike + ) + + if isna_result: + # If current min/max is already NA, it will always be NA + continue + if not isna_entry: nobs[lab, j] += 1 if compute_max: @@ -1743,6 +1956,11 @@ cdef group_min_max( else: if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + group_min_or_max[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max @@ -1874,6 +2092,7 @@ def group_max( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -1886,6 +2105,7 @@ def group_max( compute_max=True, mask=mask, result_mask=result_mask, + skipna=skipna, ) @@ -1900,6 +2120,7 @@ def group_min( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -1912,6 +2133,7 @@ def group_min( compute_max=False, mask=mask, result_mask=result_mask, + skipna=skipna, ) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 5500fadb73b6d..f957ebdeaf67a 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -415,20 +415,7 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): modes = np.empty(nkeys, dtype=values.dtype) - if htfunc_t is not object: - with nogil: - for k in range(nkeys): - count = counts[k] - if count == max_count: - j += 1 - elif count > max_count: - max_count = count - j = 0 - else: - continue - - modes[j] = keys[k] - else: + with nogil(htfunc_t is not object): for k in range(nkeys): count = counts[k] if count == max_count: diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 9706a8211b61f..04e2d1e39ce2a 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -34,11 +34,9 @@ static void *traced_calloc(size_t num, size_t size) { } static void *traced_realloc(void *old_ptr, size_t size) { + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); void *ptr = realloc(old_ptr, size); if (ptr != NULL) { - if (old_ptr != ptr) { - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); - } PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); } return ptr; diff --git a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h index 
0d62bb0ba915c..51fdbc50bba57 100644 --- a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h +++ b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h @@ -170,8 +170,8 @@ typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc); typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); -typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen); +typedef const char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen); typedef void *(*JSPFN_MALLOC)(size_t size); typedef void (*JSPFN_FREE)(void *pptr); typedef void *(*JSPFN_REALLOC)(void *base, size_t size); diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 688f943760d1f..c219d0b63870f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -561,23 +561,15 @@ cdef class StringObjectEngine(ObjectEngine): cdef: object na_value - bint uses_na def __init__(self, ndarray values, na_value): super().__init__(values) self.na_value = na_value - self.uses_na = na_value is C_NA - - cdef bint _checknull(self, object val): - if self.uses_na: - return val is C_NA - else: - return util.is_nan(val) cdef _check_type(self, object val): if isinstance(val, str): return val - elif self._checknull(val): + elif checknull(val): return self.na_value else: raise KeyError(val) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 99737776ff59f..4f0c2892f5a58 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -502,7 +502,7 @@ def get_concat_blkno_indexers(list blknos_list not None): @cython.boundscheck(False) @cython.wraparound(False) def get_blkno_indexers( - int64_t[:] blknos, bint group=True + const int64_t[:] blknos, bint group=True ) -> list[tuple[int, slice | np.ndarray]]: """ Enumerate contiguous runs of integers in ndarray. @@ -596,8 +596,8 @@ def get_blkno_placements(blknos, group: bool = True): @cython.boundscheck(False) @cython.wraparound(False) cpdef update_blklocs_and_blknos( - ndarray[intp_t, ndim=1] blklocs, - ndarray[intp_t, ndim=1] blknos, + const intp_t[:] blklocs, + const intp_t[:] blknos, Py_ssize_t loc, intp_t nblocks, ): diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 564019d7c0d8c..5d0876591a151 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -209,6 +209,12 @@ cdef class IntervalMixin: """ Indicates if an interval is empty, meaning it contains no points. + An interval is considered empty if its `left` and `right` endpoints + are equal, and it is not closed on both sides. This means that the + interval does not include any real points. In the case of an + :class:`pandas.arrays.IntervalArray` or :class:`IntervalIndex`, the + property returns a boolean array indicating the emptiness of each interval. 
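A short sketch of the behavior described here (values are illustrative)::

    import pandas as pd

    pd.Interval(0, 0, closed="neither").is_empty  # True: contains no points
    pd.Interval(0, 0, closed="both").is_empty     # False: contains the point 0

    # On an IntervalArray the property is evaluated elementwise.
    pd.arrays.IntervalArray.from_breaks([0, 0, 1], closed="left").is_empty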
+ Returns ------- bool or ndarray diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 368abe60d7237..e6bd5a8de39dc 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -225,7 +225,10 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, @cython.wraparound(False) @cython.boundscheck(False) -cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) noexcept nogil: +cdef void _get_result_indexer( + const intp_t[::1] sorter, + intp_t[::1] indexer, +) noexcept nogil: """NOTE: overwrites indexer with the result to avoid allocating another array""" cdef: Py_ssize_t i, n, idx @@ -681,8 +684,8 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] from pandas._libs.hashtable cimport Int64HashTable -def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, - ndarray[numeric_t] right_values, +def asof_join_backward_on_X_by_Y(const numeric_t[:] left_values, + const numeric_t[:] right_values, const int64_t[:] left_by_values, const int64_t[:] right_by_values, bint allow_exact_matches=True, @@ -752,8 +755,8 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, return left_indexer, right_indexer -def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, - ndarray[numeric_t] right_values, +def asof_join_forward_on_X_by_Y(const numeric_t[:] left_values, + const numeric_t[:] right_values, const int64_t[:] left_by_values, const int64_t[:] right_by_values, bint allow_exact_matches=1, @@ -824,8 +827,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, return left_indexer, right_indexer -def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values, - ndarray[numeric_t] right_values, +def asof_join_nearest_on_X_by_Y(const numeric_t[:] left_values, + const numeric_t[:] right_values, const int64_t[:] left_by_values, const int64_t[:] right_by_values, bint allow_exact_matches=True, diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index de603beff7836..3c509a3eae11a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -981,16 +981,14 @@ def get_level_sorter( @cython.boundscheck(False) @cython.wraparound(False) -def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, +def count_level_2d(const uint8_t[:, :] mask, const intp_t[:] labels, Py_ssize_t max_bin, ): cdef: - Py_ssize_t i, j, k, n + Py_ssize_t i, j, k = mask.shape[1], n = mask.shape[0] ndarray[int64_t, ndim=2] counts - n, k = (mask).shape - counts = np.zeros((n, max_bin), dtype="i8") with nogil: for i in range(n): @@ -1522,6 +1520,11 @@ def infer_dtype(value: object, skipna: bool = True) -> str: """ Return a string label of the type of a scalar or list-like of values. + This method inspects the elements of the provided input and determines + classification of its data type. It is particularly useful for + handling heterogeneous data inputs where explicit dtype conversion may not + be possible or necessary. 
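A few representative calls sketching the labels this function returns (outputs in comments reflect standard pandas behavior)::

    from pandas.api.types import infer_dtype

    infer_dtype(["a", "b"])  # 'string'
    infer_dtype([1, 2, 3])   # 'integer'
    infer_dtype([1, 2.5])    # 'mixed-integer-float'
    infer_dtype(["a", 1])    # 'mixed-integer'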
+ Parameters ---------- value : scalar, list, ndarray, or pandas type @@ -1882,7 +1885,7 @@ cdef class BoolValidator(Validator): cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef: - BoolValidator validator = BoolValidator(len(values), + BoolValidator validator = BoolValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1900,7 +1903,7 @@ cdef class IntegerValidator(Validator): # Note: only python-exposed for tests cpdef bint is_integer_array(ndarray values, bint skipna=True): cdef: - IntegerValidator validator = IntegerValidator(len(values), + IntegerValidator validator = IntegerValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1915,7 +1918,7 @@ cdef class IntegerNaValidator(Validator): cdef bint is_integer_na_array(ndarray values, bint skipna=True): cdef: - IntegerNaValidator validator = IntegerNaValidator(len(values), + IntegerNaValidator validator = IntegerNaValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1931,7 +1934,7 @@ cdef class IntegerFloatValidator(Validator): cdef bint is_integer_float_array(ndarray values, bint skipna=True): cdef: - IntegerFloatValidator validator = IntegerFloatValidator(len(values), + IntegerFloatValidator validator = IntegerFloatValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1949,7 +1952,7 @@ cdef class FloatValidator(Validator): # Note: only python-exposed for tests cpdef bint is_float_array(ndarray values): cdef: - FloatValidator validator = FloatValidator(len(values), values.dtype) + FloatValidator validator = FloatValidator(values.size, values.dtype) return validator.validate(values) @@ -1967,7 +1970,7 @@ cdef class ComplexValidator(Validator): cdef bint is_complex_array(ndarray values): cdef: - ComplexValidator validator = ComplexValidator(len(values), values.dtype) + ComplexValidator validator = ComplexValidator(values.size, values.dtype) return validator.validate(values) @@ -1980,7 +1983,7 @@ cdef class DecimalValidator(Validator): cdef bint is_decimal_array(ndarray values, bint skipna=False): cdef: DecimalValidator validator = DecimalValidator( - len(values), values.dtype, skipna=skipna + values.size, values.dtype, skipna=skipna ) return validator.validate(values) @@ -1996,7 +1999,7 @@ cdef class StringValidator(Validator): cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: - StringValidator validator = StringValidator(len(values), + StringValidator validator = StringValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2013,7 +2016,7 @@ cdef class BytesValidator(Validator): cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - BytesValidator validator = BytesValidator(len(values), values.dtype, + BytesValidator validator = BytesValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2064,7 +2067,7 @@ cdef class DatetimeValidator(TemporalValidator): cpdef bint is_datetime_array(ndarray values, bint skipna=True): cdef: - DatetimeValidator validator = DatetimeValidator(len(values), + DatetimeValidator validator = DatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2078,7 +2081,7 @@ cdef class Datetime64Validator(DatetimeValidator): # Note: only python-exposed for tests cpdef bint is_datetime64_array(ndarray values, bint skipna=True): cdef: - Datetime64Validator validator = Datetime64Validator(len(values), + Datetime64Validator validator = 
Datetime64Validator(values.size, skipna=skipna) return validator.validate(values) @@ -2093,7 +2096,7 @@ cdef class AnyDatetimeValidator(DatetimeValidator): cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True): cdef: - AnyDatetimeValidator validator = AnyDatetimeValidator(len(values), + AnyDatetimeValidator validator = AnyDatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2105,7 +2108,7 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: Doesn't check values are datetime-like types. """ cdef: - Py_ssize_t i = 0, j, n = len(values) + Py_ssize_t i = 0, j, n = values.size object base_val, base_tz, val, tz if n == 0: @@ -2153,7 +2156,7 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True): Infer with timedeltas and/or nat/none. """ cdef: - AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), + AnyTimedeltaValidator validator = AnyTimedeltaValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2167,7 +2170,7 @@ cdef class DateValidator(Validator): # Note: only python-exposed for tests cpdef bint is_date_array(ndarray values, bint skipna=False): cdef: - DateValidator validator = DateValidator(len(values), skipna=skipna) + DateValidator validator = DateValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2180,7 +2183,7 @@ cdef class TimeValidator(Validator): # Note: only python-exposed for tests cpdef bint is_time_array(ndarray values, bint skipna=False): cdef: - TimeValidator validator = TimeValidator(len(values), skipna=skipna) + TimeValidator validator = TimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2231,14 +2234,14 @@ cpdef bint is_interval_array(ndarray values): Is this an ndarray of Interval (or np.nan) with a single dtype? 
""" cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, n = values.size str closed = None bint numeric = False bint dt64 = False bint td64 = False object val - if len(values) == 0: + if n == 0: return False for i in range(n): diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index c27386743c6e9..5fb6f1118d648 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -1,94 +1,119 @@ -_algos_take_helper = custom_target('algos_take_helper_pxi', +_algos_take_helper = custom_target( + 'algos_take_helper_pxi', output: 'algos_take_helper.pxi', input: 'algos_take_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_algos_common_helper = custom_target('algos_common_helper_pxi', +_algos_common_helper = custom_target( + 'algos_common_helper_pxi', output: 'algos_common_helper.pxi', input: 'algos_common_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_khash_primitive_helper = custom_target('khash_primitive_helper_pxi', +_khash_primitive_helper = custom_target( + 'khash_primitive_helper_pxi', output: 'khash_for_primitive_helper.pxi', input: 'khash_for_primitive_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_hashtable_class_helper = custom_target('hashtable_class_helper_pxi', +_hashtable_class_helper = custom_target( + 'hashtable_class_helper_pxi', output: 'hashtable_class_helper.pxi', input: 'hashtable_class_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_hashtable_func_helper = custom_target('hashtable_func_helper_pxi', +_hashtable_func_helper = custom_target( + 'hashtable_func_helper_pxi', output: 'hashtable_func_helper.pxi', input: 'hashtable_func_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_index_class_helper = custom_target('index_class_helper_pxi', +_index_class_helper = custom_target( + 'index_class_helper_pxi', output: 'index_class_helper.pxi', input: 'index_class_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_sparse_op_helper = custom_target('sparse_op_helper_pxi', +_sparse_op_helper = custom_target( + 'sparse_op_helper_pxi', output: 'sparse_op_helper.pxi', input: 'sparse_op_helper.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], ) -_intervaltree_helper = custom_target('intervaltree_helper_pxi', +_intervaltree_helper = custom_target( + 'intervaltree_helper_pxi', output: 'intervaltree.pxi', input: 'intervaltree.pxi.in', - command: [ - py, tempita, '@INPUT@', '-o', '@OUTDIR@' - ] + command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'], +) +_khash_primitive_helper_dep = declare_dependency( + sources: _khash_primitive_helper, ) -_khash_primitive_helper_dep = declare_dependency(sources: _khash_primitive_helper) subdir('tslibs') libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included - 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep}, + 'algos': { + 'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], + 'deps': 
_khash_primitive_helper_dep, + }, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, - 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep}, - 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep}, + 'hashtable': { + 'sources': [ + 'hashtable.pyx', + _hashtable_class_helper, + _hashtable_func_helper, + ], + 'deps': _khash_primitive_helper_dep, + }, + 'index': { + 'sources': ['index.pyx', _index_class_helper], + 'deps': _khash_primitive_helper_dep, + }, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper], - 'deps': _khash_primitive_helper_dep}, - 'join': {'sources': ['join.pyx', _khash_primitive_helper], - 'deps': _khash_primitive_helper_dep}, + 'interval': { + 'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep, + }, + 'join': { + 'sources': ['join.pyx', _khash_primitive_helper], + 'deps': _khash_primitive_helper_dep, + }, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, 'missing': {'sources': ['missing.pyx']}, - 'pandas_datetime': {'sources': ['src/vendored/numpy/datetime/np_datetime.c', - 'src/vendored/numpy/datetime/np_datetime_strings.c', - 'src/datetime/date_conversions.c', - 'src/datetime/pd_datetime.c']}, - 'pandas_parser': {'sources': ['src/parser/tokenizer.c', - 'src/parser/io.c', - 'src/parser/pd_parser.c']}, - 'parsers': {'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], - 'deps': _khash_primitive_helper_dep}, - 'json': {'sources': ['src/vendored/ujson/python/ujson.c', - 'src/vendored/ujson/python/objToJSON.c', - 'src/vendored/ujson/python/JSONtoObj.c', - 'src/vendored/ujson/lib/ultrajsonenc.c', - 'src/vendored/ujson/lib/ultrajsondec.c']}, + 'pandas_datetime': { + 'sources': [ + 'src/vendored/numpy/datetime/np_datetime.c', + 'src/vendored/numpy/datetime/np_datetime_strings.c', + 'src/datetime/date_conversions.c', + 'src/datetime/pd_datetime.c', + ], + }, + 'pandas_parser': { + 'sources': [ + 'src/parser/tokenizer.c', + 'src/parser/io.c', + 'src/parser/pd_parser.c', + ], + }, + 'parsers': { + 'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], + 'deps': _khash_primitive_helper_dep, + }, + 'json': { + 'sources': [ + 'src/vendored/ujson/python/ujson.c', + 'src/vendored/ujson/python/objToJSON.c', + 'src/vendored/ujson/python/JSONtoObj.c', + 'src/vendored/ujson/lib/ultrajsonenc.c', + 'src/vendored/ujson/lib/ultrajsondec.c', + ], + }, 'ops': {'sources': ['ops.pyx']}, 'ops_dispatch': {'sources': ['ops_dispatch.pyx']}, 'properties': {'sources': ['properties.pyx']}, @@ -98,13 +123,13 @@ libs_sources = { 'sparse': {'sources': ['sparse.pyx', _sparse_op_helper]}, 'tslib': {'sources': ['tslib.pyx']}, 'testing': {'sources': ['testing.pyx']}, - 'writers': {'sources': ['writers.pyx']} + 'writers': {'sources': ['writers.pyx']}, } cython_args = [ '--include-dir', meson.current_build_dir(), - '-X always_allow_keywords=true' + '-X always_allow_keywords=true', ] if get_option('buildtype') == 'debug' cython_args += ['--gdb'] @@ -118,7 +143,7 @@ foreach ext_name, ext_dict : libs_sources include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs', - install: true + install: true, ) endforeach @@ -148,14 +173,11 @@ sources_to_install = [ 'sparse.pyi', 'testing.pyi', 'tslib.pyi', - 
'writers.pyi' + 'writers.pyi', ] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs') endforeach subdir('window') diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 28ea06739e0c8..51c1c75ba631b 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -40,27 +40,7 @@ def unstack(const numeric_object_t[:, :] values, const uint8_t[:] mask, cdef: Py_ssize_t i, j, w, nulls, s, offset - if numeric_object_t is not object: - # evaluated at compile-time - with nogil: - for i in range(stride): - - nulls = 0 - for j in range(length): - - for w in range(width): - - offset = j * width + w - - if mask[offset]: - s = i * width + w - new_values[j, s] = values[offset - nulls, i] - new_mask[j, s] = 1 - else: - nulls += 1 - - else: - # object-dtype, identical to above but we cannot use nogil + with nogil(numeric_object_t is not object): for i in range(stride): nulls = 0 diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c9f7a796a9b1c..61e96fc835e4d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -148,7 +148,7 @@ int parser_init(parser_t *self) { self->warn_msg = NULL; // token stream - self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = malloc(STREAM_INIT_SIZE); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -221,9 +221,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { char *orig_ptr = (void *)self->stream; TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) - self->stream = - (char *)grow_buffer((void *)self->stream, self->stream_len, - &self->stream_cap, nbytes * 2, sizeof(char), &status); + self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, 1, &status); TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, " "self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index c8d8b5ab6bd6e..1564ecb64b01d 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -920,7 +920,7 @@ Perhaps implement recursion detection */ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) { const char *value; - char *objName; + const char *objName; int count; JSOBJ iterObj; size_t szlen; diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 5f35860c59cb7..8342dbcd1763d 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -53,8 +53,8 @@ Numeric decoder derived from TCL library npy_int64 get_nat(void) { return NPY_MIN_INT64; } -typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, - size_t *_outLen); +typedef const char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, + size_t *_outLen); int object_is_decimal_type(PyObject *obj); int object_is_dataframe_type(PyObject *obj); @@ -106,7 +106,7 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; - char *cStr; + const char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; int transpose; @@ -301,14 +301,15 @@ static npy_float64 
total_seconds(PyObject *td) { return double_val; } -static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), - size_t *_outLen) { +static const char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), + size_t *_outLen) { PyObject *obj = (PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); return PyBytes_AS_STRING(obj); } -static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { +static const char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, + size_t *_outLen) { char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); if (encoded == NULL) { /* Something went wrong. @@ -321,8 +322,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); @@ -330,15 +331,15 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); return GET_TC(tc)->cStr; } /* JSON callback */ -static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, - size_t *len) { +static const char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, + size_t *len) { if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; @@ -349,7 +350,8 @@ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, return PyDateTimeToIso(obj, base, len); } -static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { +static const char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, + size_t *outLen) { PyObject *obj = (PyObject *)_obj; PyObject *str = PyObject_CallMethod(obj, "isoformat", NULL); if (str == NULL) { @@ -373,6 +375,27 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { return outValue; } +static const char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc, + size_t *len) { + PyObject *obj = (PyObject *)_obj; + PyObject *format_spec = PyUnicode_FromStringAndSize("f", 1); + PyObject *str = PyObject_Format(obj, format_spec); + Py_DECREF(format_spec); + + if (str == NULL) { + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + + GET_TC(tc)->newObj = str; + + Py_ssize_t s_len; + char *outValue = (char *)PyUnicode_AsUTF8AndSize(str, &s_len); + *len = s_len; + + return outValue; +} + //============================================================================= // Numpy array iteration functions //============================================================================= @@ -537,10 +560,10 @@ static JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) 
{ +static const char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -588,11 +611,11 @@ static int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { return NpyArr_iterNextItem(obj, tc); } -static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { const npy_intp idx = blkCtxt->colIdx - 1; @@ -610,12 +633,12 @@ static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, return cStr; } -static char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, + size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -796,9 +819,9 @@ static JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -843,9 +866,9 @@ static JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Set_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Set_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -941,8 +964,8 @@ static JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -973,9 +996,9 @@ static JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *List_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *List_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -984,24 +1007,16 @@ static char *List_iterGetName(JSOBJ Py_UNUSED(obj), //============================================================================= static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if 
(index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1021,8 +1036,8 @@ static JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1033,28 +1048,20 @@ static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1076,8 +1083,8 @@ static JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1088,28 +1095,20 @@ static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + GET_TC(tc)->cStr = "columns"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; Py_INCREF(obj); GET_TC(tc)->itemValue = obj; } else { @@ -1129,8 +1128,8 @@ static JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *DataFrame_iterGetName(JSOBJ 
Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1180,8 +1179,8 @@ static JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -1467,8 +1466,18 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; return; } else if (object_is_decimal_type(obj)) { - pc->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; + PyObject *is_nan_py = PyObject_RichCompare(obj, obj, Py_NE); + if (is_nan_py == NULL) { + goto INVALID; + } + int is_nan = (is_nan_py == Py_True); + Py_DECREF(is_nan_py); + if (is_nan) { + tc->type = JT_NULL; + return; + } + pc->PyTypeToUTF8 = PyDecimalToUTF8Callback; + tc->type = JT_UTF8; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { @@ -1871,7 +1880,6 @@ static void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->rowLabels = NULL; NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); GET_TC(tc)->columnLabels = NULL; - PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); tc->prv = NULL; @@ -1922,8 +1930,8 @@ static JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->iterGetValue(obj, tc); } -static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen) { +static const char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen) { return GET_TC(tc)->iterGetName(obj, tc, outLen); } diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a635dd33f8420..7a8b4df447aee 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -149,18 +149,18 @@ def cast_from_unit_vectorized( if p: frac = np.round(frac, p) - try: - for i in range(len(values)): + for i in range(len(values)): + try: if base[i] == NPY_NAT: out[i] = NPY_NAT else: out[i] = (base[i] * m) + (frac[i] * m) - except (OverflowError, FloatingPointError) as err: - # FloatingPointError can be issued if we have float dtype and have - # set np.errstate(over="raise") - raise OutOfBoundsDatetime( - f"cannot convert input {values[i]} with the unit '{unit}'" - ) from err + except (OverflowError, FloatingPointError) as err: + # FloatingPointError can be issued if we have float dtype and have + # set np.errstate(over="raise") + raise OutOfBoundsDatetime( + f"cannot convert input {values[i]} with the unit '{unit}'" + ) from err return out diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 85410f771233f..052a8568b76af 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -22,7 +22,7 @@ tslibs_sources = { cython_args = [ '--include-dir', meson.current_build_dir(), - '-X always_allow_keywords=true' + '-X always_allow_keywords=true', ] if get_option('buildtype') == 'debug' cython_args += ['--gdb'] @@ -36,7 +36,7 @@ foreach ext_name, ext_dict : tslibs_sources include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs/tslibs', - 
install: true + install: true, ) endforeach @@ -56,12 +56,9 @@ sources_to_install = [ 'timestamps.pyi', 'timezones.pyi', 'tzconversion.pyi', - 'vectorized.pyi' + 'vectorized.pyi', ] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs/tslibs' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs/tslibs') endforeach diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 1c0a99eb1ea25..2657b1b9d197b 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -704,7 +704,7 @@ class NaTType(_NaT): difference between the current timezone and UTC. Returns - -------- + ------- timedelta The difference between UTC and the local time as a `timedelta` object. diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 7569f8e8864a0..36b431974c121 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -720,11 +720,24 @@ cdef class BaseOffset: """ Return boolean whether a timestamp intersects with this frequency. + This method determines if a given timestamp aligns with the start + of a custom business month, as defined by this offset. It accounts + for custom rules, such as skipping weekends or other non-business days, + and checks whether the provided datetime falls on a valid business day + that marks the beginning of the custom business month. + Parameters ---------- dt : datetime.datetime Timestamp to check intersections with frequency. + See Also + -------- + tseries.offsets.CustomBusinessMonthBegin : Represents the start of a custom + business month. + tseries.offsets.CustomBusinessMonthEnd : Represents the end of a custom + business month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) @@ -3710,6 +3723,15 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin): - 5 is Saturday - 6 is Sunday. + See Also + -------- + tseries.offsets.WeekOfMonth : + Date offset for a specific weekday in a month. + tseries.offsets.MonthEnd : + Date offset for the end of the month. + tseries.offsets.BMonthEnd : + Date offset for the last business day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d6d69a49c9539..bef1956996b4f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -679,7 +679,7 @@ cdef char* c_strftime(npy_datetimestruct *dts, char *fmt): c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1 c_date.tm_isdst = -1 - result = malloc(result_len * sizeof(char)) + result = malloc(result_len) if result is NULL: raise MemoryError() @@ -2140,6 +2140,12 @@ cdef class _Period(PeriodMixin): """ Get day of the month that a Period falls on. + The `day` property provides a simple way to access the day component + of a `Period` object, which represents time spans in various frequencies + (e.g., daily, hourly, monthly). If the period's frequency does not include + a day component (e.g., yearly or quarterly periods), the returned day + corresponds to the first day of that period. + Returns ------- int @@ -2836,6 +2842,11 @@ class Period(_Period): """ Represents a period of time. + A `Period` represents a specific time span rather than a point in time. + Unlike `Timestamp`, which represents a single instant, a `Period` defines a + duration, such as a month, quarter, or year. The exact representation is + determined by the `freq` parameter. 
+ Parameters ---------- value : Period, str, datetime, date or pandas.Timestamp, default None diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index ed784b6f5ab22..fb89f1328529d 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -924,6 +924,13 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday) correction = date(iso_year, 1, 4).isoweekday() + 3 ordinal = (iso_week * 7) + iso_weekday - correction + + if iso_week == 53: + now = date.fromordinal(date(iso_year, 1, 1).toordinal() + ordinal - iso_weekday) + jan_4th = date(iso_year+1, 1, 4) + if (jan_4th - now).days < 7: + raise ValueError(f"Week 53 does not exist in ISO year {iso_year}.") + # ordinal may be negative or 0 now, which means the date is in the previous # calendar year if ordinal < 1: diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index a3429fc840347..6b4b90167e625 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -2217,7 +2217,7 @@ class Timestamp(_Timestamp): difference between the current timezone and UTC. Returns - -------- + ------- timedelta The difference between UTC and the local time as a `timedelta` object. diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index a6cfbec9b15b9..ee735761e3dc6 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -60,6 +60,18 @@ def roll_min( end: np.ndarray, # np.ndarray[np.int64] minp: int, # int64_t ) -> np.ndarray: ... # np.ndarray[float] +def roll_first( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_last( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... 
# np.ndarray[float] def roll_quantile( values: np.ndarray, # const float64_t[:] start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 5b9ee095d4643..d33c840371d2a 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1133,6 +1133,89 @@ cdef _roll_min_max(ndarray[float64_t] values, return output +# ---------------------------------------------------------------------- +# Rolling first, last + + +def roll_first(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + return _roll_first_last(values, start, end, minp, is_first=1) + + +def roll_last(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + return _roll_first_last(values, start, end, minp, is_first=0) + + +cdef _roll_first_last(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, bint is_first): + cdef: + Py_ssize_t i, j, fl_idx + bint is_monotonic_increasing_bounds + int64_t nobs = 0, N = len(start), s, e + float64_t val, res + ndarray[float64_t] output + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + + output = np.empty(N, dtype=np.float64) + + if (end - start).max() == 0: + output[:] = NaN + return output + + with nogil: + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + fl_idx = -1 + nobs = 0 + for j in range(s, e): + val = values[j] + if val == val: + if not is_first or fl_idx < s: + fl_idx = j + nobs += 1 + else: + # handle deletes + for j in range(start[i - 1], s): + val = values[j] + if val == val: + nobs -= 1 + + # update fl_idx if out of range, if first + if is_first and fl_idx < s: + fl_idx = -1 + for j in range(s, end[i - 1]): + val = values[j] + if val == val: + fl_idx = j + break + + # handle adds + for j in range(end[i - 1], e): + val = values[j] + if val == val: + if not is_first or fl_idx < s: + fl_idx = j + nobs += 1 + + if nobs >= minp and fl_idx >= s: + res = values[fl_idx] + else: + res = NaN + + output[i] = res + + if not is_monotonic_increasing_bounds: + nobs = 0 + + return output + cdef enum InterpolationType: LINEAR, diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index ad15644f73a0c..1d49bba47e139 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -4,8 +4,8 @@ py.extension_module( cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], subdir: 'pandas/_libs/window', - override_options : ['cython_language=cpp'], - install: true + override_options: ['cython_language=cpp'], + install: true, ) py.extension_module( @@ -14,18 +14,11 @@ py.extension_module( cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], subdir: 'pandas/_libs/window', - install: true + install: true, ) -sources_to_install = [ - '__init__.py', - 'aggregations.pyi', - 'indexers.pyi' -] +sources_to_install = ['__init__.py', 'aggregations.pyi', 'indexers.pyi'] -foreach source: sources_to_install - py.install_sources( - source, - subdir: 'pandas/_libs/window' - ) +foreach source : sources_to_install + py.install_sources(source, subdir: 'pandas/_libs/window') endforeach diff --git a/pandas/_typing.py b/pandas/_typing.py index b515305fb6903..4365ee85f72e3 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -442,7 +442,9 @@ def 
closed(self) -> bool: AnyAll = Literal["any", "all"] # merge -MergeHow = Literal["left", "right", "inner", "outer", "cross"] +MergeHow = Literal[ + "left", "right", "inner", "outer", "cross", "left_anti", "right_anti" +] MergeValidate = Literal[ "one_to_one", "1:1", diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index a18a1e9d5cbb7..c1178c72f3edc 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -3,6 +3,7 @@ """ from pandas._libs import NaTType +from pandas._libs.lib import NoDefault from pandas._libs.missing import NAType from pandas.core.groupby import ( @@ -44,6 +45,7 @@ "JsonReader", "NAType", "NaTType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index e7674386408f7..138456f877c5f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -34,6 +34,7 @@ pa_version_under16p0, pa_version_under17p0, pa_version_under18p0, + pa_version_under19p0, ) if TYPE_CHECKING: @@ -166,4 +167,5 @@ def is_ci_environment() -> bool: "pa_version_under16p0", "pa_version_under17p0", "pa_version_under18p0", + "pa_version_under19p0", ] diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index bd009b544f31e..c501c06b93813 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -18,6 +18,7 @@ pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") pa_version_under18p0 = _palv < Version("18.0.0") + pa_version_under19p0 = _palv < Version("19.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -30,4 +31,5 @@ pa_version_under16p0 = True pa_version_under17p0 = True pa_version_under18p0 = True + pa_version_under19p0 = True HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 106518678df6a..f9c10a7758bd2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1317,6 +1317,22 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. 
+ + * 'str[pyarrow]' + * 'string[pyarrow]' + """ + return pd.StringDtype(*request.param) + + @pytest.fixture( params=[ "python", diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index cc10bd003af7e..2b59ea2fe12a5 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -169,9 +169,10 @@ def grouped_mean( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 59d36732ebae6..68aa1446bbe3c 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -88,6 +88,7 @@ def grouped_min_max( ngroups: int, min_periods: int, is_max: bool, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) nobs = np.zeros(ngroups, dtype=np.int64) @@ -97,13 +98,16 @@ def grouped_min_max( for i in range(N): lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or (not skipna and nobs[lab] >= 1 and np.isnan(output[lab])): continue if values.dtype.kind == "i" or not np.isnan(val): nobs[lab] += 1 else: - # NaN value cannot be a min/max value + if not skipna: + # If skipna is False and we encounter a NaN, + # both min and max of the group will be NaN + output[lab] = np.nan continue if nobs[lab] == 1: diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 76f4e22b43c4b..9f2e9541b31d0 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -165,6 +165,7 @@ def grouped_kahan_sum( result_dtype: np.dtype, labels: npt.NDArray[np.intp], ngroups: int, + skipna: bool, ) -> tuple[ np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray ]: @@ -180,7 +181,15 @@ def grouped_kahan_sum( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan + nobs_arr[lab] += 1 + comp_arr[lab] = np.nan + consecutive_counts[lab] = 1 + prev_vals[lab] = np.nan continue sum_x = output[lab] @@ -219,11 +228,12 @@ def grouped_sum( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, ) -> tuple[np.ndarray, list[int]]: na_pos = [] output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 69aec4d6522c4..5d720c877815d 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -176,6 +176,7 @@ def grouped_var( ngroups: int, min_periods: int, ddof: int = 1, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) @@ -190,7 +191,11 @@ def grouped_var( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan continue mean_x = means[lab] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 56f8adda93251..aafd802b827a5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -23,6 +23,7 @@ 
iNaT, lib, ) +from pandas._libs.missing import NA from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -544,10 +545,15 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: # Ensure np.isin doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), # isin is faster for small sizes + + # GH60678 + # Ensure values don't contain <NA>, otherwise it throws exception with np.in1d + if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 and comps_array.dtype != object + and not any(v is NA for v in values) ): # If the values include nan we need to check for nan explicitly # since np.nan is not equal to np.nan @@ -1012,7 +1018,7 @@ def mode( return npresult, res_mask # type: ignore[return-value] try: - npresult = np.sort(npresult) + npresult = safe_sort(npresult) except TypeError as err: warnings.warn( f"Unable to sort modes: {err}", diff --git a/pandas/core/apply.py b/pandas/core/apply.py index af513d49bcfe0..f36fc82fb1a11 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1645,8 +1645,7 @@ def reconstruct_func( # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name raise SpecificationError( - "Function names must be unique if there is no new column names " - "assigned" + "Function names must be unique if there is no new column names assigned" ) if func is None: # nicer error message diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index a9ad66b7cb2e5..debd6368e98a4 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -89,7 +89,8 @@ def _check_comparison_types( op = np.vectorize( lambda x: bool(re.search(b, x)) if isinstance(x, str) and isinstance(b, (str, Pattern)) - else False + else False, + otypes=[bool], ) # GH#32621 use mask to avoid comparing to NAs diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 43ac69508d1a4..51ddd9e91b227 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -329,7 +329,7 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) if self.ndim == 1: - names = {getattr(x, "name") for x in inputs if hasattr(x, "name")} + names = {x.name for x in inputs if hasattr(x, "name")} name = names.pop() if len(names) == 1 else None reconstruct_kwargs = {"name": name} else: diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 2d1b1eca55e98..1ca52ce64bd77 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -253,6 +253,10 @@ def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) return self._convert_bool_result(result) + def _str_isascii(self): + result = pc.string_is_ascii(self._pa_array) + return self._convert_bool_result(result) + def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index afa219f611992..0b546bed1c2b7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -257,6 +258,7 @@ class ArrowExtensionArray( Parameters ---------- values : pyarrow.Array or pyarrow.ChunkedArray + The input data
to initialize the ArrowExtensionArray. Attributes ---------- @@ -270,6 +272,12 @@ class ArrowExtensionArray( ------- ArrowExtensionArray + See Also + -------- + array : Create a Pandas array with a specified dtype. + DataFrame.to_feather : Write a DataFrame to the binary Feather format. + read_feather : Load a feather-format object from the file path. + Notes ----- Most methods are implemented using `pyarrow compute functions <https://arrow.apache.org/docs/python/api/compute.html>`__. @@ -1390,7 +1398,7 @@ def _to_datetimearray(self) -> DatetimeArray: np_dtype = np.dtype(f"M8[{pa_type.unit}]") dtype = tz_to_dtype(pa_type.tz, pa_type.unit) np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype) + np_array = np_array.astype(np_dtype, copy=False) return DatetimeArray._simple_new(np_array, dtype=dtype) def _to_timedeltaarray(self) -> TimedeltaArray: @@ -1401,7 +1409,7 @@ def _to_timedeltaarray(self) -> TimedeltaArray: assert pa.types.is_duration(pa_type) np_dtype = np.dtype(f"m8[{pa_type.unit}]") np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype) + np_array = np_array.astype(np_dtype, copy=False) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) def _values_for_json(self) -> np.ndarray: @@ -1619,6 +1627,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1654,6 +1665,57 @@ def _accumulate( return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: pa.array | None = None + na_mask: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = pc.is_null(pa_array) + if pc.all(na_mask) == pa.scalar(True): + return type(self)(pa_array) + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + # We can retain the running min/max by forward/backward filling. + pa_array = pc.fill_null_forward(pa_array) + pa_array = pc.fill_null_backward(pa_array) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = pc.index(na_mask, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx] + + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] + + if tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) + elif na_mask is not None: + pa_result = pc.if_else(na_mask, None, pa_result) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation.
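The `_str_accumulate` addition above leans on one observation: with skipna=True, forward- then backward-filling nulls cannot disturb a running minimum or maximum, because every filled slot merely repeats a neighbouring observed value. A minimal sketch of that trick, using plain numpy object arrays rather than pyarrow (illustrative only, not pandas code; the function name and the None-as-missing convention are assumptions made for this sketch):

import numpy as np

def cummin_strings_skipna(values):
    # None marks a missing value in this sketch.
    arr = np.array(values, dtype=object)
    mask = np.array([v is None for v in values], dtype=bool)
    if mask.all():
        return arr
    # Forward fill: carry the index of the last observed value over each gap.
    idx = np.where(~mask, np.arange(len(arr)), -1)
    np.maximum.accumulate(idx, out=idx)
    filled = arr[np.maximum(idx, 0)]
    # Backward fill any leading gap so the accumulation has a seed value.
    first = int(np.argmax(idx >= 0))
    filled[:first] = filled[first]
    # Filled slots repeat a neighbour, so the running minimum is unchanged.
    out = np.minimum.accumulate(filled)
    # Restore nulls at the originally missing positions, as pc.if_else does above.
    out[mask] = None
    return out

cummin_strings_skipna(["b", None, "a", None, "c"])
# -> array(['b', None, 'a', None, 'a'], dtype=object)

The cumsum path in the patch instead fills nulls with the empty string, which is the identity for string concatenation, so the same reasoning applies.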
@@ -2160,7 +2222,7 @@ def interpolate( """ # NB: we return type(self) even if copy=False if not self.dtype._is_numeric: - raise ValueError("Values must be numeric.") + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") if ( not pa_version_under13p0 @@ -2469,8 +2531,6 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): else: dummies_dtype = np.bool_ dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) - if dtype == str: - dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 4835d808f2433..33745438e2aea 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1791,9 +1791,11 @@ def take(self, indices, allow_fill=False, fill_value=None): # type for the array, to the physical storage type for # the data, before passing to take. - result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) + result = take( + data, indices, fill_value=fill_value, allow_fill=allow_fill + ) return self._from_sequence(result, dtype=self.dtype) - """ # noqa: E501 + """ # Implementer note: The `fill_value` parameter should be a user-facing # value, an instance of self.dtype.type. When passed `fill_value=None`, # the default of `self.dtype.na_value` should be used. @@ -2618,6 +2620,7 @@ def _groupby_op( "sem", "var", "skew", + "kurt", ]: raise TypeError( f"dtype '{self.dtype}' does not support operation '{how}'" diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 99e4cb0545e2d..0ce700772fdcc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -447,7 +447,12 @@ def __init__( if isinstance(values.dtype, ArrowDtype) and issubclass( values.dtype.type, CategoricalDtypeType ): - arr = values._pa_array.combine_chunks() + from pandas import Index + + if isinstance(values, Index): + arr = values._data._pa_array.combine_chunks() + else: + arr = values._pa_array.combine_chunks() categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) codes = arr.indices.to_numpy() dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) @@ -2736,7 +2741,7 @@ def _groupby_op( op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) dtype = self.dtype - if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]: raise TypeError(f"{dtype} type does not support {how} operations") if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: # raise TypeError instead of NotImplementedError to ensure we diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c6b6367e347ba..8a79ab53442c3 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1656,7 +1656,7 @@ def _groupby_op( dtype = self.dtype if dtype.kind == "M": # Adding/multiplying datetimes is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise TypeError(f"datetime64 type does not support operation '{how}'") if how in ["any", "all"]: # GH#34479 @@ -1667,7 +1667,7 @@ def _groupby_op( elif isinstance(dtype, PeriodDtype): # Adding/multiplying Periods is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise 
TypeError(f"Period type does not support {how} operations") if how in ["any", "all"]: # GH#34479 @@ -1677,7 +1677,7 @@ def _groupby_op( ) else: # timedeltas we can add but not multiply - if how in ["prod", "cumprod", "skew", "var"]: + if how in ["prod", "cumprod", "skew", "kurt", "var"]: raise TypeError(f"timedelta64 type does not support {how} operations") # All of the functions implemented here are ordinal, so we can diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 43cc492f82885..df40c9c11b117 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2707,8 +2707,7 @@ def _maybe_infer_tz(tz: tzinfo | None, inferred_tz: tzinfo | None) -> tzinfo | N pass elif not timezones.tz_compare(tz, inferred_tz): raise TypeError( - f"data is already tz-aware {inferred_tz}, unable to " - f"set specified tz: {tz}" + f"data is already tz-aware {inferred_tz}, unable to set specified tz: {tz}" ) return tz diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 9f7238a97d808..ac0823ed903b3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -71,6 +71,11 @@ class NumpyExtensionArray( # type: ignore[misc] ------- None + See Also + -------- + array : Create an array. + Series.to_numpy : Convert a Series to a NumPy array. + Examples -------- >>> pd.arrays.NumpyExtensionArray(np.array([0, 1, 2, 3])) @@ -287,6 +292,9 @@ def interpolate( See NDFrame.interpolate.__doc__. """ # NB: we return type(self) even if copy=False + if not self.dtype._is_numeric: + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") + if not copy: out_data = self._ndarray else: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 0ed5f69fe4703..eab8527eef526 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -279,6 +279,11 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate): """ DataFrame accessor for sparse data. + It allows users to interact with a `DataFrame` that contains sparse data types + (`SparseDtype`). It provides methods and attributes to efficiently work with sparse + storage, reducing memory usage while maintaining compatibility with standard pandas + operations. + Parameters ---------- data : scipy.sparse.spmatrix diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3b881cfd2df2f..7227ea77ca433 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -49,6 +49,7 @@ ) from pandas.core import ( + missing, nanops, ops, ) @@ -533,6 +534,11 @@ def _str_map_nan_semantics( else: return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + def view(self, dtype: Dtype | None = None) -> ArrayLike: + if dtype is not None: + raise TypeError("Cannot change data-type for string array.") + return super().view(dtype=dtype) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -865,6 +871,88 @@ def _reduce( raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. 
+ **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. + + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: np.ndarray | None = None + na_mask: np.ndarray | None = None + ndarray = self._ndarray + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray)) + if np.all(na_mask): + return type(self)(ndarray) + if skipna: + if name == "cumsum": + ndarray = np.where(na_mask, "", ndarray) + else: + # We can retain the running min/max by forward/backward filling. + ndarray = ndarray.copy() + missing.pad_or_backfill_inplace( + ndarray, + method="pad", + axis=0, + ) + missing.pad_or_backfill_inplace( + ndarray, + method="backfill", + axis=0, + ) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = np.argmax(na_mask) + tail = np.empty(len(ndarray) - idx, dtype="object") + tail[:] = self.dtype.na_value + ndarray = ndarray[:idx] + + # mypy: Cannot call function of unknown type + np_result = np_func(ndarray) # type: ignore[operator] + + if tail is not None: + np_result = np.hstack((np_result, tail)) + elif na_mask is not None: + # Argument 2 to "where" has incompatible type "NAType | float" + np_result = np.where(na_mask, self.dtype.na_value, np_result) # type: ignore[arg-type] + + result = type(self)(np_result) + return result + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: if self.dtype.na_value is np.nan and result is libmissing.NA: # the masked_reductions use pd.NA -> convert to np.nan diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 27c1425d11ac6..d35083fd892a8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -481,6 +481,9 @@ def _cmp_method(self, other, op): return result.to_numpy(np.bool_, na_value=False) return result + def __pos__(self) -> Self: + raise TypeError(f"bad operand type for unary +: '{self.dtype}'") + class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a8a0037d0bbb9..c5b3129c506c8 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -115,10 +115,10 @@ class TimedeltaArray(dtl.TimelikeOps): ---------- data : array-like The timedelta data. - dtype : numpy.dtype Currently, only ``numpy.dtype("timedelta64[ns]")`` is accepted. freq : Offset, optional + Frequency of the data. copy : bool, default False Whether to copy the underlying array of data. @@ -130,6 +130,12 @@ class TimedeltaArray(dtl.TimelikeOps): ------- None + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + TimedeltaIndex : Immutable Index of timedelta64 data. + to_timedelta : Convert argument to timedelta. 
+ Examples -------- >>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(["1h", "2h"])) diff --git a/pandas/core/base.py b/pandas/core/base.py index 61a7c079d87f8..a64cd8633c1db 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -506,6 +506,11 @@ def array(self) -> ExtensionArray: """ The ExtensionArray of the data backing this Series or Index. + This property provides direct access to the underlying array data of a + Series or Index without requiring conversion to a NumPy array. It + returns an ExtensionArray, which is the native storage format for + pandas extension dtypes. + Returns ------- ExtensionArray diff --git a/pandas/core/common.py b/pandas/core/common.py index 9788ec972ba1b..100ad312bd839 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -359,7 +359,7 @@ def is_full_slice(obj, line: int) -> bool: def get_callable_name(obj): # typical case has name if hasattr(obj, "__name__"): - return getattr(obj, "__name__") + return obj.__name__ # some objects don't; could recurse if isinstance(obj, partial): return get_callable_name(obj.func) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 86f83489e71ae..f8e3200ef2ba0 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -190,8 +190,8 @@ def eval( .. warning:: - ``eval`` can run arbitrary code which can make you vulnerable to code - injection and untrusted data. + This function can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. Parameters ---------- @@ -204,7 +204,7 @@ def eval( By default, with the numexpr engine, the following operations are supported: - - Arthimetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%`` + - Arithmetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%`` - Boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not) - Comparison operators: ``<``, ``<=``, ``==``, ``!=``, ``>=``, ``>`` diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 010fad1bbf0b6..14a393b02409c 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -698,7 +698,7 @@ def visit_Call(self, node, side=None, **kwargs): if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " f"'{node.func.id}'" # type: ignore[attr-defined] + f"keyword error in function call '{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 9b26de42e119b..f06ded6d9f98e 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -512,8 +512,7 @@ def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None: self.func = _unary_ops_dict[op] except KeyError as err: raise ValueError( - f"Invalid unary operator {op!r}, " - f"valid operators are {UNARY_OPS_SYMS}" + f"Invalid unary operator {op!r}, valid operators are {UNARY_OPS_SYMS}" ) from err def __call__(self, env) -> MathCall: diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 35a6d1c6ad269..8441941797a6e 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -123,16 +123,6 @@ def clean_column_name(name: Hashable) -> Hashable: ------- name : hashable Returns the name after tokenizing and cleaning. - - Notes - ----- - For some cases, a name cannot be converted to a valid Python identifier. 
- In that case :func:`tokenize_string` raises a SyntaxError. - In that case, we just return the name unmodified. - - If this name was used in the query string (this makes the query call impossible) - an error will be raised by :func:`tokenize_backtick_quoted_string` instead, - which is not caught and propagates to the user level. """ try: # Escape backticks @@ -145,40 +135,6 @@ def clean_column_name(name: Hashable) -> Hashable: return name -def tokenize_backtick_quoted_string( - token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int -) -> tuple[int, str]: - """ - Creates a token from a backtick quoted string. - - Moves the token_generator forwards till right after the next backtick. - - Parameters - ---------- - token_generator : Iterator[tokenize.TokenInfo] - The generator that yields the tokens of the source string (Tuple[int, str]). - The generator is at the first token after the backtick (`) - - source : str - The Python source code string. - - string_start : int - This is the start of backtick quoted string inside the source string. - - Returns - ------- - tok: Tuple[int, str] - The token that represents the backtick quoted string. - The integer is equal to BACKTICK_QUOTED_STRING (100). - """ - for _, tokval, start, _, _ in token_generator: - if tokval == "`": - string_end = start[1] - break - - return BACKTICK_QUOTED_STRING, source[string_start:string_end] - - class ParseState(Enum): DEFAULT = 0 IN_BACKTICK = 1 diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 8df4f7e3e08f9..ada492787a179 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -81,6 +81,10 @@ def array( """ Create an array. + This method constructs an array using pandas extension types when possible. + If `dtype` is specified, it determines the type of array returned. Otherwise, + pandas attempts to infer the appropriate dtype based on `data`. + Parameters ---------- data : Sequence of objects @@ -596,6 +600,8 @@ def sanitize_array( # create an extension array from its dtype _sanitize_non_ordered(data) cls = dtype.construct_array_type() + if not hasattr(data, "__array__"): + data = list(data) subarr = cls._from_sequence(data, dtype=dtype, copy=copy) # GH#846 diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 02b9291da9b31..f11aefeeaaa00 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1113,7 +1113,7 @@ def convert_dtypes( else: inferred_dtype = input_array.dtype - if dtype_backend == "pyarrow": + if dtype_backend == "pyarrow" and not isinstance(inferred_dtype, ArrowDtype): from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.string_ import StringDtype @@ -1651,7 +1651,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. 
warnings.filterwarnings( "ignore", - "NumPy will stop allowing conversion of " "out-of-bound Python int", + "NumPy will stop allowing conversion of out-of-bound Python int", DeprecationWarning, ) casted = np.asarray(arr, dtype=dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b0c8ec1ffc083..e92f2363b69f1 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1836,6 +1836,8 @@ def pandas_dtype(dtype) -> DtypeObj: # raise a consistent TypeError if failed try: with warnings.catch_warnings(): + # TODO: warnings.catch_warnings can be removed when numpy>2.3.0 + # is the minimum version # GH#51523 - Series.astype(np.integer) doesn't show # numpy deprecation warning of np.integer # Hence enabling DeprecationWarning diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1dd1b12d6ae95..6be6787862654 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -161,6 +161,11 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ Type for categorical data with the categories and orderedness. + It is a dtype representation for categorical data, which allows users to define + a fixed set of values and optionally impose an ordering. This is particularly + useful for handling categorical variables efficiently, as it can significantly + reduce memory usage compared to using object dtypes. + Parameters ---------- categories : sequence, optional @@ -605,8 +610,7 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: return self elif not self.is_dtype(dtype): raise ValueError( - f"a CategoricalDtype must be passed to perform an update, " - f"got {dtype!r}" + f"a CategoricalDtype must be passed to perform an update, got {dtype!r}" ) else: # from here on, dtype is a CategoricalDtype @@ -2339,7 +2343,7 @@ def construct_from_string(cls, string: str) -> ArrowDtype: ) if not string.endswith("[pyarrow]"): raise TypeError(f"'{string}' must end with '[pyarrow]'") - if string == "string[pyarrow]": + if string in ("string[pyarrow]", "str[pyarrow]"): # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") if pa_version_under10p1: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 34b448a0d8d1c..3199733cfb85f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -315,7 +315,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -328,6 +329,10 @@ join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use only keys from left frame that are not in right frame, similar + to SQL left anti join; preserve key order. + * right_anti: use only keys from right frame that are not in left frame, similar + to SQL right anti join; preserve key order. on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -1016,6 +1021,10 @@ def shape(self) -> tuple[int, int]: """ Return a tuple representing the dimensionality of the DataFrame. 
+        Unlike `len()`, which only returns the number of rows, `shape`
+        provides both row and column counts, making it a more informative
+        property for understanding dataset size.
+
         See Also
         --------
         numpy.ndarray.shape : Tuple of array dimensions.
@@ -2317,7 +2326,10 @@ def maybe_reorder(
             columns = columns.drop(exclude)
 
         mgr = arrays_to_mgr(arrays, columns, result_index)
-        return cls._from_mgr(mgr, axes=mgr.axes)
+        df = DataFrame._from_mgr(mgr, axes=mgr.axes)
+        if cls is not DataFrame:
+            return cls(df, copy=False)
+        return df
 
     def to_records(
         self, index: bool = True, column_dtypes=None, index_dtypes=None
@@ -3202,9 +3214,13 @@ def to_html(
             Convert the characters <, >, and & to HTML-safe sequences.
         notebook : {True, False}, default False
             Whether the generated HTML is for IPython Notebook.
-        border : int
-            A ``border=border`` attribute is included in the opening
-            ``<table>`` tag. Default ``pd.options.display.html.border``.
+        border : int or bool
+            When an integer value is provided, it sets the border attribute in
+            the opening tag, specifying the thickness of the border.
+            If ``False`` or ``0`` is passed, the border attribute will not
+            be present in the ``<table>`` tag.
+            The default value for this parameter is governed by
+            ``pd.options.display.html.border``.
         table_id : str, optional
             A css id is included in the opening ``<table>``
tag if specified.
     render_links : bool, default False
@@ -4476,8 +4492,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
         """
         Query the columns of a DataFrame with a boolean expression.
 
-        This method can run arbitrary code which can make you vulnerable to code
-        injection if you pass user input to this function.
+        .. warning::
+
+            This method can run arbitrary code which can make you vulnerable to code
+            injection if you pass user input to this function.
 
         Parameters
         ----------
@@ -4634,6 +4652,11 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
         """
         Evaluate a string describing operations on DataFrame columns.
 
+        .. warning::
+
+            This method can run arbitrary code which can make you vulnerable to code
+            injection if you pass user input to this function.
+
         Operates on columns only, not specific rows or elements.  This allows
         `eval` to run arbitrary code, which can make you vulnerable to code
         injection if you pass user input to this function.
@@ -4779,6 +4802,10 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
         """
         Return a subset of the DataFrame's columns based on the column dtypes.
 
+        This method allows for filtering columns based on their data types.
+        It is useful when working with heterogeneous DataFrames where operations
+        need to be performed on a specific subset of data types.
+
         Parameters
         ----------
         include, exclude : scalar or list-like
@@ -5009,7 +5036,7 @@ def assign(self, **kwargs) -> DataFrame:
 
         Parameters
         ----------
-        **kwargs : dict of {str: callable or Series}
+        **kwargs : callable or Series
             The column names are keywords. If the values are
             callable, they are computed on the DataFrame and
             assigned to the new columns. The callable must not
@@ -6880,7 +6907,8 @@ def sort_values(
             builtin :meth:`sorted` function, with the notable difference that
             this `key` function should be *vectorized*. It should expect a
             ``Series`` and return a Series with the same shape as the input.
-            It will be applied to each column in `by` independently.
+            It will be applied to each column in `by` independently. The values in the
+            returned Series will be used as the keys for sorting.
 
         Returns
         -------
@@ -7960,6 +7988,16 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
 
         new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer]
         new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer]
+
+        # GH#60498 For MultiIndex column alignment
+        if isinstance(cols, MultiIndex):
+            # When overwriting column names, make a shallow copy so as to not modify
+            # the input DFs
+            new_left = new_left.copy(deep=False)
+            new_right = new_right.copy(deep=False)
+            new_left.columns = cols
+            new_right.columns = cols
+
         result = op(new_left, new_right)
 
         # Do the join on the columns instead of using left._align_for_op
@@ -7990,6 +8028,18 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> b
         if not isinstance(right, DataFrame):
             return False
 
+        if (
+            (
+                isinstance(self.columns, MultiIndex)
+                or isinstance(right.columns, MultiIndex)
+            )
+            and not self.columns.equals(right.columns)
+            and fill_value is None
+        ):
+            # GH#60498 Reindex if MultiIndex columns are not matching
+            # GH#60903 Don't reindex if fill_value is provided
+            return True
+
         if fill_value is None and level is None and axis == 1:
             # TODO: any other cases we should handle here?
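The two hunks above route DataFrame arithmetic through the reindexing path whenever either operand has non-matching MultiIndex columns (GH#60498), unless a ``fill_value`` suppresses it (GH#60903). A minimal sketch of the user-facing effect, assuming a pandas build that includes this change; the frames and values are hypothetical:

```python
import pandas as pd

# Two frames whose MultiIndex columns only partially overlap (made-up data).
cols_a = pd.MultiIndex.from_tuples([("x", 1), ("x", 2)])
cols_b = pd.MultiIndex.from_tuples([("x", 2), ("y", 1)])
df1 = pd.DataFrame([[1, 2]], columns=cols_a)
df2 = pd.DataFrame([[10, 20]], columns=cols_b)

# With the reindex path enabled, the result is aligned on the joined
# MultiIndex columns; columns present on only one side come back as NaN
# because no fill_value is supplied.
print(df1 + df2)
```

Note that shallow copies are taken before the column labels are overwritten, so the operands passed in by the user are never mutated.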
@@ -8643,6 +8693,7 @@ def combine( 2 NaN 3.0 1.0 """ other_idxlen = len(other.index) # save for compare + other_columns = other.columns this, other = self.align(other) new_index = this.index @@ -8653,8 +8704,8 @@ def combine( if self.empty and len(other) == other_idxlen: return other.copy() - # sorts if possible; otherwise align above ensures that these are set-equal - new_columns = this.columns.union(other.columns) + # preserve column order + new_columns = self.columns.union(other_columns, sort=False) do_fill = fill_value is not None result = {} for col in new_columns: @@ -10576,7 +10627,8 @@ def join( values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. - how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left' + how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'left' How to handle the operation of the two objects. * left: use calling frame's index (or column if on is specified) @@ -10588,6 +10640,10 @@ def join( of the calling's one. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use set difference of calling frame's index and `other`'s + index. + * right_anti: use set difference of `other`'s index and calling frame's + index. lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' @@ -13644,6 +13700,10 @@ def isin_(x): doc=""" The column labels of the DataFrame. + This property holds the column names as a pandas ``Index`` object. + It provides an immutable sequence of column labels that can be + used for data selection, renaming, and alignment in DataFrame operations. + Returns ------- pandas.Index diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de7fb3682fb4f..ccd801e252f2c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -406,6 +406,12 @@ def set_flags( """ Return a new object with updated flags. + This method creates a shallow copy of the original object, preserving its + underlying data while modifying its global flags. In particular, it allows + you to update properties such as whether duplicate labels are permitted. This + behavior is especially useful in method chains, where one wishes to + adjust DataFrame or Series characteristics without altering the original object. + Parameters ---------- copy : bool, default False @@ -2795,6 +2801,12 @@ def to_sql( Databases supported by SQLAlchemy [1]_ are supported. Tables can be newly created, appended to, or overwritten. + .. warning:: + The pandas library does not attempt to sanitize inputs provided via a to_sql call. + Please refer to the documentation for the underlying database driver to see if it + will properly prevent injection, or alternatively be advised of a security risk when + executing arbitrary commands in a to_sql call. + Parameters ---------- name : str @@ -4884,7 +4896,8 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. + It will be applied to each column in `by` independently. The values in the + returned Series will be used as the keys for sorting. 
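The clarified `key` description is easiest to see on a tiny, made-up frame: the callable receives each `by` column as a ``Series``, and the values it returns become the sort keys.

```python
import pandas as pd

df = pd.DataFrame({"name": ["Bob", "alice", "Carol"]})

# `key` is vectorized: it gets the whole column as a Series and must return
# a Series of the same shape; sorting then compares the returned values.
print(df.sort_values("name", key=lambda s: s.str.lower()))
```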
Returns ------- @@ -5536,8 +5549,7 @@ def filter( nkw = common.count_not_none(items, like, regex) if nkw > 1: raise TypeError( - "Keyword arguments `items`, `like`, or `regex` " - "are mutually exclusive" + "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" ) if axis is None: diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index bad9749b5ecee..7699fb3d0f864 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -50,6 +50,7 @@ class OutputKey: "sem", "size", "skew", + "kurt", "std", "sum", "var", diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f4e3f3e8b1001..1251403db6ff3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1272,13 +1272,86 @@ def skew( Name: Max Speed, dtype: float64 """ + return self._cython_agg_general( + "skew", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series + Unbiased kurtosis within groups. + + See Also + -------- + Series.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> ser = pd.Series( + ... [390.0, 350.0, 357.0, 333.0, np.nan, 22.0, 20.0, 30.0, 40.0, 41.0], + ... index=[ + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... ], + ... name="Max Speed", + ... ) + >>> ser + Falcon 390.0 + Falcon 350.0 + Falcon 357.0 + Falcon 333.0 + Falcon NaN + Parrot 22.0 + Parrot 20.0 + Parrot 30.0 + Parrot 40.0 + Parrot 41.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt() + Falcon 1.622109 + Parrot -2.878714 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt(skipna=False) + Falcon NaN + Parrot -2.878714 + Name: Max Speed, dtype: float64 + """ + def alt(obj): # This should not be reached since the cython path should raise # TypeError and not NotImplementedError. - raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") + raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}") return self._cython_agg_general( - "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + "kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) @property @@ -2921,6 +2994,111 @@ def alt(obj): "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> DataFrame: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + DataFrame + Unbiased kurtosis within groups. + + See Also + -------- + DataFrame.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> arrays = [ + ... [ + ... "falcon", + ... "parrot", + ... "cockatoo", + ... "kiwi", + ... "eagle", + ... 
"lion", + ... "monkey", + ... "rabbit", + ... "dog", + ... "wolf", + ... ], + ... [ + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... ], + ... ] + >>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class")) + >>> df = pd.DataFrame( + ... { + ... "max_speed": [ + ... 389.0, + ... 24.0, + ... 70.0, + ... np.nan, + ... 350.0, + ... 80.5, + ... 21.5, + ... 15.0, + ... 40.0, + ... 50.0, + ... ] + ... }, + ... index=index, + ... ) + >>> df + max_speed + name class + falcon bird 389.0 + parrot bird 24.0 + cockatoo bird 70.0 + kiwi bird NaN + eagle bird 350.0 + lion mammal 80.5 + monkey mammal 21.5 + rabbit mammal 15.0 + dog mammal 40.0 + wolf mammal 50.0 + >>> gb = df.groupby(["class"]) + >>> gb.kurt() + max_speed + class + bird -5.493277 + mammal 0.204125 + >>> gb.kurt(skipna=False) + max_speed + class + bird NaN + mammal 0.204125 + """ + + return self._cython_agg_general( + "kurt", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + @property @doc(DataFrame.plot.__doc__) def plot(self) -> GroupByPlot: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f0513be3498d1..d0c0ed29b6d44 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -214,6 +214,61 @@ class providing the base-class of operations. {example} """ +_groupby_agg_method_skipna_engine_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +skipna : bool, default {s} + Exclude NA/null values. If the entire group is NA and ``skipna`` is + ``True``, the result will be NA. + + .. versionchanged:: 3.0.0 + +engine : str, default None {e} + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None {ek} + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. + +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. +DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + +Examples +-------- +{example} +""" + _pipe_template = """ Apply a ``func`` with arguments to this %(klass)s object and return its result. @@ -515,6 +570,13 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ Dict {group name -> group indices}. 
+ The dictionary keys represent the group labels (e.g., timestamps for a + time-based resampling operation), and the values are arrays of integer + positions indicating where the elements of each group are located in the + original data. This property is particularly useful when working with + resampled data, as it provides insight into how the original time-series data + has been grouped. + See Also -------- core.groupby.DataFrameGroupBy.indices : Provides a mapping of group rows to @@ -1393,7 +1455,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): # ----------------------------------------------------------------- # apply/agg/transform - def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: + def apply(self, func, *args, include_groups: bool = False, **kwargs) -> NDFrameT: """ Apply function ``func`` group-wise and combine the results together. @@ -1419,7 +1481,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: *args : tuple Optional positional arguments to pass to ``func``. - include_groups : bool, default True + include_groups : bool, default False When True, will attempt to apply ``func`` to the groupings in the case that they are columns of the DataFrame. If this raises a TypeError, the result will be computed with the groupings excluded. @@ -1427,10 +1489,9 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: .. versionadded:: 2.2.0 - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. + The default changed from True to False, and True is no longer allowed. **kwargs : dict Optional keyword arguments to pass to ``func``. @@ -1520,7 +1581,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) + >>> g1.apply(lambda x: x.C.max() - x.B.min()) A a 5 b 2 @@ -1529,11 +1590,13 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: Example 4: The function passed to ``apply`` returns ``None`` for one of the group. This group is filtered from the result: - >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x, include_groups=False) + >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x) B C 0 1 4 1 2 6 """ + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") if isinstance(func, str): if hasattr(self, func): res = getattr(self, func) @@ -1560,33 +1623,7 @@ def f(g): else: f = func - if not include_groups: - return self._python_apply_general(f, self._obj_with_exclusions) - - try: - result = self._python_apply_general(f, self._selected_obj) - if ( - not isinstance(self.obj, Series) - and self._selection is None - and self._selected_obj.shape != self._obj_with_exclusions.shape - ): - warnings.warn( - message=_apply_groupings_depr.format(type(self).__name__, "apply"), - category=DeprecationWarning, - stacklevel=find_stack_level(), - ) - except TypeError: - # gh-20949 - # try again, with .apply acting as a filtering - # operation, by excluding the grouping column - # This would normally not be triggered - # except if the udf is trying an operation that - # fails on *some* columns, e.g. 
a numeric operation - # on a string grouper column - - return self._python_apply_general(f, self._obj_with_exclusions) - - return result + return self._python_apply_general(f, self._obj_with_exclusions) @final def _python_apply_general( @@ -2116,6 +2153,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def mean( self, numeric_only: bool = False, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2131,6 +2169,11 @@ def mean( numeric_only no longer accepts ``None`` and defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. versionadded:: 3.0.0 + engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. * ``'numba'`` : Runs the operation through JIT compiled code from numba. @@ -2197,17 +2240,21 @@ def mean( executor.float_dtype_mapping, engine_kwargs, min_periods=0, + skipna=skipna, ) else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).mean( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @final - def median(self, numeric_only: bool = False) -> NDFrameT: + def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT: """ Compute median of groups, excluding missing values. @@ -2222,6 +2269,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT: numeric_only no longer accepts ``None`` and defaults to False. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2294,8 +2346,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT: """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2308,6 +2363,7 @@ def std( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. @@ -2346,6 +2402,11 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2400,14 +2461,16 @@ def std( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) ) else: return self._cython_agg_general( "std", - alt=lambda x: Series(x, copy=False).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2419,6 +2482,7 @@ def var( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -2456,6 +2520,11 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. 
versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2509,13 +2578,15 @@ def var( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) else: return self._cython_agg_general( "var", - alt=lambda x: Series(x, copy=False).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2557,8 +2628,7 @@ def _value_counts( doesnt_exist = subsetted - unique_cols if doesnt_exist: raise ValueError( - f"Keys {doesnt_exist} in subset do not " - f"exist in the DataFrame." + f"Keys {doesnt_exist} in subset do not exist in the DataFrame." ) else: subsetted = unique_cols @@ -2645,7 +2715,9 @@ def _value_counts( return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + def sem( + self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True + ) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -2665,6 +2737,11 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2739,9 +2816,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x, copy=False).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2842,10 +2920,11 @@ def size(self) -> DataFrame | Series: @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="sum", no=False, mc=0, + s=True, e=None, ek=None, example=dedent( @@ -2887,6 +2966,7 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2898,6 +2978,7 @@ def sum( executor.default_dtype_mapping, engine_kwargs, min_periods=min_count, + skipna=skipna, ) else: # If we are grouping on categoricals we want unobserved categories to @@ -2909,12 +2990,15 @@ def sum( min_count=min_count, alias="sum", npfunc=np.sum, + skipna=skipna, ) return result @final - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + def prod( + self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True + ) -> NDFrameT: """ Compute prod of group values. @@ -2931,6 +3015,11 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + .. 
versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2979,17 +3068,22 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: 2 30 72 """ return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + alias="prod", + npfunc=np.prod, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="min", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3029,6 +3123,7 @@ def min( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3041,23 +3136,26 @@ def min( engine_kwargs, min_periods=min_count, is_max=False, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="min", npfunc=np.min, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="max", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3097,6 +3195,7 @@ def max( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3109,11 +3208,13 @@ def max( engine_kwargs, min_periods=min_count, is_max=True, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="max", npfunc=np.max, ) @@ -3135,8 +3236,7 @@ def first( The required number of valid values to perform the operation. If fewer than ``min_count`` valid values are present the result will be NA. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 2.2.1 @@ -3222,8 +3322,7 @@ def last( The required number of valid values to perform the operation. If fewer than ``min_count`` valid values are present the result will be NA. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 2.2.1 @@ -3424,7 +3523,9 @@ def describe( return result @final - def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: + def resample( + self, rule, *args, include_groups: bool = False, **kwargs + ) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3449,10 +3550,9 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp .. versionadded:: 2.2.0 - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. + The default was changed to False, and True is no longer allowed. **kwargs Possible arguments are `how`, `fill_method`, `limit`, `kind` and @@ -3485,7 +3585,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. 
- >>> df.groupby("a").resample("3min", include_groups=False).sum() + >>> df.groupby("a").resample("3min").sum() b a 0 2000-01-01 00:00:00 2 @@ -3494,7 +3594,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Upsample the series into 30 second bins. - >>> df.groupby("a").resample("30s", include_groups=False).sum() + >>> df.groupby("a").resample("30s").sum() b a 0 2000-01-01 00:00:00 1 @@ -3508,7 +3608,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Resample by month. Values are assigned to the month of the period. - >>> df.groupby("a").resample("ME", include_groups=False).sum() + >>> df.groupby("a").resample("ME").sum() b a 0 2000-01-31 3 @@ -3517,11 +3617,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> ( - ... df.groupby("a") - ... .resample("3min", closed="right", include_groups=False) - ... .sum() - ... ) + >>> (df.groupby("a").resample("3min", closed="right").sum()) b a 0 1999-12-31 23:57:00 1 @@ -3532,11 +3628,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp the bin interval, but label each bin using the right edge instead of the left. - >>> ( - ... df.groupby("a") - ... .resample("3min", closed="right", label="right", include_groups=False) - ... .sum() - ... ) + >>> (df.groupby("a").resample("3min", closed="right", label="right").sum()) b a 0 2000-01-01 00:00:00 1 @@ -3545,11 +3637,10 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp """ from pandas.core.resample import get_resampler_for_grouping - # mypy flags that include_groups could be specified via `*args` or `**kwargs` - # GH#54961 would resolve. - return get_resampler_for_grouping( # type: ignore[misc] - self, rule, *args, include_groups=include_groups, **kwargs - ) + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") + + return get_resampler_for_grouping(self, rule, *args, **kwargs) @final def rolling( @@ -3568,10 +3659,10 @@ def rolling( Parameters ---------- window : int, timedelta, str, offset, or BaseIndexer subclass - Size of the moving window. + Interval of the moving window. - If an integer, the fixed number of observations used for - each window. + If an integer, the delta between the start and end of each window. + The number of points in the window depends on the ``closed`` argument. If a timedelta, str, or offset, the time period of each window. Each window will be a variable sized based on the observations included in @@ -3617,14 +3708,22 @@ def rolling( an integer index is not used to calculate the rolling window. closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. + Determines the inclusivity of points in the window + + If ``'right'``, uses the window (first, last] meaning the last point + is included in the calculations. + + If ``'left'``, uses the window [first, last) meaning the first point + is included in the calculations. - If ``'left'``, the last point in the window is excluded from calculations. + If ``'both'``, uses the window [first, last] meaning all points in + the window are included in the calculations. - If ``'both'``, no points in the window are excluded from calculations. + If ``'neither'``, uses the window (first, last) meaning the first + and last points in the window are excluded from calculations. 
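The interval semantics spelled out in the rewritten ``closed`` description can be checked on a small time-based window; a short sketch using only public ``rolling`` behavior, which this doc edit does not change:

```python
import pandas as pd

s = pd.Series(
    [1, 2, 3, 4],
    index=pd.date_range("2024-01-01", periods=4, freq="min"),
)

# Each 2-minute window is the interval (first, last], [first, last),
# [first, last], or (first, last), depending on `closed`; 'right' is
# the default.
for closed in ("right", "left", "both", "neither"):
    print(closed, s.rolling("2min", closed=closed).sum().tolist())
```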
-        If ``'neither'``, the first and last points in the window are excluded
-        from calculations.
+        () and [] denote open and closed interval
+        notation, respectively.
 
         Default ``None`` (``'right'``).
 
@@ -5424,8 +5523,7 @@ def _idxmax_idxmin(
         numeric_only : bool, default False
             Include only float, int, boolean columns.
         skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
+            Exclude NA/null values. If an entire group is NA, the result will be NA.
         ignore_unobserved : bool, default False
             When True and an unobserved group is encountered, do not raise. This used
             for transform where unobserved groups do not play an impact on the result.
@@ -5561,13 +5659,3 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde
 
     mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None])
     return mi
-
-
-# GH#7155
-_apply_groupings_depr = (
-    "{}.{} operated on the grouping columns. This behavior is deprecated, "
-    "and in a future version of pandas the grouping columns will be excluded "
-    "from the operation. Either pass `include_groups=False` to exclude the "
-    "groupings or explicitly select the grouping columns after groupby to silence "
-    "this warning."
-)
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 5f9ebdcea4a2d..c9d874fc08dbe 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -516,8 +516,7 @@ def __init__(
         ):
             grper = pprint_thing(grouping_vector)
             errmsg = (
-                "Grouper result violates len(labels) == "
-                f"len(data)\nresult: {grper}"
+                f"Grouper result violates len(labels) == len(data)\nresult: {grper}"
             )
             raise AssertionError(errmsg)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 4c7fe604e452d..c4c7f73ee166c 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -144,6 +144,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
         "std": functools.partial(libgroupby.group_var, name="std"),
         "sem": functools.partial(libgroupby.group_var, name="sem"),
         "skew": "group_skew",
+        "kurt": "group_kurt",
         "first": "group_nth",
         "last": "group_last",
         "ohlc": "group_ohlc",
@@ -193,7 +194,7 @@ def _get_cython_function(
         elif how in ["std", "sem", "idxmin", "idxmax"]:
             # We have a partial object that does not have __signatures__
             return f
-        elif how == "skew":
+        elif how in ["skew", "kurt"]:
             # _get_cython_vals will convert to float64
             pass
         elif "object" not in f.__signatures__:
@@ -224,7 +225,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
         """
         how = self.how
 
-        if how in ["median", "std", "sem", "skew"]:
+        if how in ["median", "std", "sem", "skew", "kurt"]:
             # median only has a float64 implementation
             # We should only get here with is_numeric, as non-numeric cases
             # should raise in _get_cython_function
@@ -453,7 +454,7 @@ def _call_cython_op(
                 **kwargs,
             )
             result = result.astype(bool, copy=False)
-        elif self.how in ["skew"]:
+        elif self.how in ["skew", "kurt"]:
             func(
                 out=result,
                 counts=counts,
@@ -1021,6 +1022,7 @@ def apply_groupwise(
         # getattr pattern for __name__ is needed for functools.partial objects
         if len(group_keys) == 0 and getattr(f, "__name__", None) in [
             "skew",
+            "kurt",
             "sum",
             "prod",
         ]:
diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py
index 0064aa91056e8..88379164534f2 100644
--- a/pandas/core/indexers/objects.py
+++ b/pandas/core/indexers/objects.py
@@ -478,9 +478,9 @@ def get_window_bounds(
         )
         start = start.astype(np.int64)
         end =
end.astype(np.int64) - assert len(start) == len( - end - ), "these should be equal in length from get_window_bounds" + assert len(start) == len(end), ( + "these should be equal in length from get_window_bounds" + ) # Cannot use groupby_indices as they might not be monotonic with the object # we're rolling over window_indices = np.arange( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 165fe109c4c94..c17c8c1d9a172 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -19,7 +19,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_string_dtype, +) from pandas._libs import ( NaT, @@ -5348,7 +5351,7 @@ def putmask(self, mask, value) -> Index: See Also -------- - numpy.ndarray.putmask : Changes elements of an array + numpy.putmask : Changes elements of an array based on conditional and input values. Examples @@ -6235,6 +6238,24 @@ def _find_common_type_compat(self, target) -> DtypeObj: """ target_dtype, _ = infer_dtype_from(target) + if using_string_dtype(): + # special case: if left or right is a zero-length RangeIndex or + # Index[object], those can be created by the default empty constructors + # -> for that case ignore this dtype and always return the other + # (https://github.com/pandas-dev/pandas/pull/60797) + from pandas.core.indexes.range import RangeIndex + + if len(self) == 0 and ( + isinstance(self, RangeIndex) or self.dtype == np.object_ + ): + return target_dtype + if ( + isinstance(target, Index) + and len(target) == 0 + and (isinstance(target, RangeIndex) or target_dtype == np.object_) + ): + return self.dtype + # special case: if one dtype is uint64 and the other a signed int, return object # See https://github.com/pandas-dev/pandas/issues/26778 for discussion # Now it's: @@ -6888,6 +6909,14 @@ def insert(self, loc: int, item) -> Index: arr = self._values + if using_string_dtype() and len(self) == 0 and self.dtype == np.object_: + # special case: if we are an empty object-dtype Index, also + # take into account the inserted item for the resulting dtype + # (https://github.com/pandas-dev/pandas/pull/60797) + dtype = self._find_common_type_compat(item) + if dtype != self.dtype: + return self.astype(dtype).insert(loc, item) + try: if isinstance(arr, ExtensionArray): res_values = arr.insert(loc, item) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dc48cd1ed958e..79eb1b693d866 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -212,8 +212,6 @@ class MultiIndex(Index): level). names : optional sequence of objects Names for each of the index levels. (name is accepted for compat). - dtype : Numpy dtype or pandas type, optional - Data type for the MultiIndex. copy : bool, default False Copy the meta-data. 
name : Label @@ -305,7 +303,6 @@ def __new__( codes=None, sortorder=None, names=None, - dtype=None, copy: bool = False, name=None, verify_integrity: bool = True, @@ -1760,7 +1757,7 @@ def fillna(self, value): """ fillna is not implemented for MultiIndex """ - raise NotImplementedError("isna is not defined for MultiIndex") + raise NotImplementedError("fillna is not defined for MultiIndex") @doc(Index.dropna) def dropna(self, how: AnyAll = "any") -> MultiIndex: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 935762d0455c5..2db50bbbdfa37 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -190,10 +190,31 @@ def from_range(cls, data: range, name=None, dtype: Dtype | None = None) -> Self: """ Create :class:`pandas.RangeIndex` from a ``range`` object. + This method provides a way to create a :class:`pandas.RangeIndex` directly + from a Python ``range`` object. The resulting :class:`RangeIndex` will have + the same start, stop, and step values as the input ``range`` object. + It is particularly useful for constructing indices in an efficient and + memory-friendly manner. + + Parameters + ---------- + data : range + The range object to be converted into a RangeIndex. + name : str, default None + Name to be stored in the index. + dtype : Dtype or None + Data type for the RangeIndex. If None, the default integer type will + be used. + Returns ------- RangeIndex + See Also + -------- + RangeIndex : Immutable Index implementing a monotonic integer range. + Index : Immutable sequence used for indexing and alignment. + Examples -------- >>> pd.RangeIndex.from_range(range(5)) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e0bc0a23acd9f..8a493fef54d3b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -160,7 +160,7 @@ def iloc(self) -> _iLocIndexer: .. versionchanged:: 3.0 - Returning a tuple from a callable is deprecated. + Callables which return a tuple are deprecated as input. 
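Since the reworded ``.iloc`` note is terse, here is a small illustration of the deprecated pattern it refers to, with made-up data; only the tuple-returning callable form is deprecated, not callables in general:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# Deprecated: one callable that returns a tuple of positional indexers.
# df.iloc[lambda d: (0, 1)]

# Still fine: pass the indexers directly, or one callable per axis.
print(df.iloc[0, 1])
print(df.iloc[lambda d: [0], lambda d: [1]])
```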
``.iloc[]`` is primarily integer position based (from ``0`` to ``length-1`` of the axis), but may also be used with a boolean @@ -975,8 +975,7 @@ def _validate_tuple_indexer(self, key: tuple) -> tuple: self._validate_key(k, i) except ValueError as err: raise ValueError( - "Location based indexing can only have " - f"[{self._valid_types}] types" + f"Location based indexing can only have [{self._valid_types}] types" ) from err return key @@ -1589,8 +1588,7 @@ def _validate_key(self, key, axis: AxisInt) -> None: "is not available" ) raise ValueError( - "iLocation based boolean indexing cannot use " - "an indexable as a mask" + "iLocation based boolean indexing cannot use an indexable as a mask" ) return @@ -1994,8 +1992,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): return self._setitem_with_indexer((pi, info_axis[0]), value[0]) raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" + "Must have equal len keys and value when setting with an iterable" ) elif lplane_indexer == 0 and len(value) == len(self.obj.index): @@ -2023,8 +2020,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): else: raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" + "Must have equal len keys and value when setting with an iterable" ) else: diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index 62bf396256f2a..8953360a91c8e 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -31,8 +31,7 @@ def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: x = x.copy() else: raise RuntimeError( - "Exports cannot be zero-copy in the case " - "of a non-contiguous buffer" + "Exports cannot be zero-copy in the case of a non-contiguous buffer" ) # Store the numpy array in which the data resides as a private diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 5c9b8ac8ea085..b990eca39b3dd 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -41,7 +41,9 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: .. note:: For new development, we highly recommend using the Arrow C Data Interface - alongside the Arrow PyCapsule Interface instead of the interchange protocol + alongside the Arrow PyCapsule Interface instead of the interchange protocol. + From pandas 3.0 onwards, `from_dataframe` uses the PyCapsule Interface, + only falling back to the interchange protocol if that fails. .. 
warning:: @@ -90,6 +92,18 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: if isinstance(df, pd.DataFrame): return df + if hasattr(df, "__arrow_c_stream__"): + try: + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + except ImportError: + # fallback to _from_dataframe + pass + else: + try: + return pa.table(df).to_pandas(zero_copy_only=not allow_copy) + except pa.ArrowInvalid as e: + raise RuntimeError(e) from e + if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f44ad926dda5c..d1a9081b234de 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2264,8 +2264,7 @@ def check_ndim(values, placement: BlockPlacement, ndim: int) -> None: if values.ndim > ndim: # Check for both np.ndarray and ExtensionArray raise ValueError( - "Wrong number of dimensions. " - f"values.ndim > ndim [{values.ndim} > {ndim}]" + f"Wrong number of dimensions. values.ndim > ndim [{values.ndim} > {ndim}]" ) if not is_1d_only_ea_dtype(values.dtype): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index dfff34656f82b..69da2be0306f6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -907,8 +907,7 @@ def _validate_or_indexify_columns( if not is_mi_list and len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... raise AssertionError( - f"{len(columns)} columns passed, passed data had " - f"{len(content)} columns" + f"{len(columns)} columns passed, passed data had {len(content)} columns" ) if is_mi_list: # check if nested list column, length of each sub-list should be equal diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d6154e2352c63..d1dc0ff809497 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1093,11 +1093,14 @@ def reduction( if values.size == 0: return _na_for_min_count(values, axis) + dtype = values.dtype values, mask = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask ) result = getattr(values, meth)(axis) - result = _maybe_null_out(result, axis, mask, values.shape) + result = _maybe_null_out( + result, axis, mask, values.shape, datetimelike=dtype.kind in "mM" + ) return result return reduction @@ -1499,6 +1502,7 @@ def _maybe_null_out( mask: npt.NDArray[np.bool_] | None, shape: tuple[int, ...], min_count: int = 1, + datetimelike: bool = False, ) -> np.ndarray | float | NaTType: """ Returns @@ -1520,7 +1524,10 @@ def _maybe_null_out( null_mask = np.broadcast_to(below_count, new_shape) if np.any(null_mask): - if is_numeric_dtype(result): + if datetimelike: + # GH#60646 For datetimelike, no need to cast to float + result[null_mask] = iNaT + elif is_numeric_dtype(result): if np.iscomplexobj(result): result = result.astype("c16") elif not is_float_dtype(result): diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 983a3df57e369..3a466b6fc7fc8 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -164,7 +164,7 @@ def _masked_arith_op(x: np.ndarray, y, op) -> np.ndarray: else: if not is_scalar(y): raise TypeError( - f"Cannot broadcast np.ndarray with operand of type { type(y) }" + f"Cannot broadcast np.ndarray with operand of type {type(y)}" ) # mask is only meaningful for x diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0d1541bbb3afa..1cfc75ea11725 100644 --- 
a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -31,10 +31,7 @@ Substitution, doc, ) -from pandas.util._exceptions import ( - find_stack_level, - rewrite_warning, -) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -59,7 +56,6 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, - _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -167,14 +163,15 @@ def __init__( gpr_index: Index, group_keys: bool = False, selection=None, - include_groups: bool = True, + include_groups: bool = False, ) -> None: + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") self._timegrouper = timegrouper self.keys = None self.sort = True self.group_keys = group_keys self.as_index = True - self.include_groups = include_groups self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index @@ -381,10 +378,20 @@ def transform(self, arg, *args, **kwargs): ---------- arg : function To apply to each group. Should return a Series with the same index. + *args, **kwargs + Additional arguments and keywords. Returns ------- Series + A Series with the transformed values, maintaining the same index as + the original object. + + See Also + -------- + core.resample.Resampler.apply : Apply a function along each group. + core.resample.Resampler.aggregate : Aggregate using one or more operations + over the specified axis. Examples -------- @@ -465,9 +472,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -479,21 +484,23 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) return self._wrap_result(result) @final def _get_resampler_for_grouping( - self, groupby: GroupBy, key, include_groups: bool = True + self, + groupby: GroupBy, + key, ): """ Return the correct class for resampling with groupby. """ return self._resampler_for_grouping( - groupby=groupby, key=key, parent=self, include_groups=include_groups + groupby=groupby, + key=key, + parent=self, ) def _wrap_result(self, result): @@ -935,7 +942,7 @@ def interpolate( "supported. If you tried to resample and interpolate on a " "grouped data frame, please use:\n" "`df.groupby(...).apply(lambda x: x.resample(...)." - "interpolate(...), include_groups=False)`" + "interpolate(...))`" "\ninstead, as resampling and interpolation has to be " "performed for each group independently." ) @@ -1099,6 +1106,13 @@ def prod( Series or DataFrame Computed prod of values within each group. + See Also + -------- + core.resample.Resampler.sum : Compute sum of groups, excluding missing values. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + Examples -------- >>> ser = pd.Series( @@ -1129,9 +1143,30 @@ def min( """ Compute min value of group. 
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None``.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer
+            than ``min_count`` non-NA values are present the result will be NA.
+
         Returns
         -------
         Series or DataFrame
+            Minimum of values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.max : Compute max value of group.
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
 
         Examples
         --------
@@ -1163,9 +1198,30 @@ def max(
         """
         Compute max value of group.
 
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None``.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer
+            than ``min_count`` non-NA values are present the result will be NA.
+
         Returns
         -------
         Series or DataFrame
+            Maximum of values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.min : Compute min value of group.
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
 
         Examples
         --------
@@ -1213,8 +1269,53 @@ def last(
         )
 
     @final
-    @doc(GroupBy.median)
     def median(self, numeric_only: bool = False):
+        """
+        Compute median of groups, excluding missing values.
+
+        For multiple groupings, the result index will be a MultiIndex.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None`` and defaults to False.
+
+        Returns
+        -------
+        Series or DataFrame
+            Median of values within each group.
+
+        See Also
+        --------
+        Series.groupby : Apply a function groupby to a Series.
+        DataFrame.groupby : Apply a function groupby to each row or column of a
+            DataFrame.
+
+        Examples
+        --------
+
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 3, 4, 5],
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2023-01-01",
+        ...             "2023-01-10",
+        ...             "2023-01-15",
+        ...             "2023-02-01",
+        ...             "2023-02-10",
+        ...             "2023-02-15",
+        ...         ]
+        ...     ),
+        ... )
+        >>> ser.resample("MS").median()
+        2023-01-01    2.0
+        2023-02-01    4.0
+        Freq: MS, dtype: float64
+        """
         return self._downsample("median", numeric_only=numeric_only)
 
     @final
@@ -1239,6 +1340,16 @@ def mean(
         DataFrame or Series
             Mean of values within each group.
 
+        See Also
+        --------
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
+        core.resample.Resampler.sum : Compute sum of groups, excluding missing values.
+        core.resample.Resampler.std : Compute standard deviation of groups, excluding
+            missing values.
+        core.resample.Resampler.var : Compute variance of groups, excluding missing
+            values.
+
         Examples
         --------
@@ -1288,6 +1399,14 @@ def std(
         DataFrame or Series
             Standard deviation of values within each group.
 
+        See Also
+        --------
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
+        core.resample.Resampler.var : Compute variance of groups, excluding missing
+            values.
+ Examples -------- @@ -1339,6 +1458,14 @@ def var( DataFrame or Series Variance of values within each group. + See Also + -------- + core.resample.Resampler.std : Compute standard deviation of groups, excluding + missing values. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.median : Compute median of groups, excluding missing + values. + Examples -------- @@ -1368,12 +1495,61 @@ def var( return self._downsample("var", ddof=ddof, numeric_only=numeric_only) @final - @doc(GroupBy.sem) def sem( self, ddof: int = 1, numeric_only: bool = False, ): + """ + Compute standard error of the mean of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard error of the mean of values within each group. + + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. + + Examples + -------- + + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").sem() + 2023-01-01 0.577350 + 2023-02-01 1.527525 + Freq: MS, dtype: float64 + """ return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) @final @@ -1541,7 +1717,6 @@ def __init__( groupby: GroupBy, key=None, selection: IndexLabel | None = None, - include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1564,7 +1739,6 @@ def __init__( self.ax = parent.ax self.obj = parent.obj - self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1581,7 +1755,7 @@ def func(x): return x.apply(f, *args, **kwargs) - result = _apply(self._groupby, func, include_groups=self.include_groups) + result = self._groupby.apply(func) return self._wrap_result(result) _upsample = _apply @@ -1937,7 +2111,6 @@ def get_resampler_for_grouping( fill_method=None, limit: int | None = None, on=None, - include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -1946,9 +2119,7 @@ def get_resampler_for_grouping( # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj) - return resampler._get_resampler_for_grouping( - groupby=groupby, include_groups=include_groups, key=tg.key - ) + return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) class TimeGrouper(Grouper): @@ -2727,18 +2898,3 @@ def _asfreq_compat(index: FreqIndexT, freq) -> FreqIndexT: else: # pragma: no cover raise TypeError(type(index)) return new_index - - -def _apply( - grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs -) -> DataFrame: - # GH#7155 - rewrite warning to appear as if it came from `.resample` - target_message = "DataFrameGroupBy.apply operated on the grouping columns" - new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") - with rewrite_warning( - target_message=target_message, - target_category=DeprecationWarning, - new_message=new_message, - ): - 
result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) - return result diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 33ff182f5baee..6a590ee5b227e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -495,8 +495,7 @@ def from_dummies( if col_isna_mask.any(): raise ValueError( - "Dummy DataFrame contains NA value in column: " - f"'{col_isna_mask.idxmax()}'" + f"Dummy DataFrame contains NA value in column: '{col_isna_mask.idxmax()}'" ) # index data with a list of all columns that are dummies diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5fddd9f9aca5b..09be82c59a5c6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -180,7 +180,8 @@ def merge( First pandas object to merge. right : DataFrame or named Series Second pandas object to merge. - how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti}, + default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -193,6 +194,10 @@ def merge( join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use only keys from left frame that are not in right frame, similar + to SQL left anti join; preserve key order. + * right_anti: use only keys from right frame that are not in left frame, similar + to SQL right anti join; preserve key order. on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -953,7 +958,7 @@ def __init__( self, left: DataFrame | Series, right: DataFrame | Series, - how: JoinHow | Literal["asof"] = "inner", + how: JoinHow | Literal["left_anti", "right_anti", "asof"] = "inner", on: IndexLabel | AnyArrayLike | None = None, left_on: IndexLabel | AnyArrayLike | None = None, right_on: IndexLabel | AnyArrayLike | None = None, @@ -968,7 +973,7 @@ def __init__( _right = _validate_operand(right) self.left = self.orig_left = _left self.right = self.orig_right = _right - self.how = how + self.how, self.anti_join = self._validate_how(how) self.on = com.maybe_make_list(on) @@ -998,14 +1003,6 @@ def __init__( ) raise MergeError(msg) - # GH 59435: raise when "how" is not a valid Merge type - merge_type = {"left", "right", "inner", "outer", "cross", "asof"} - if how not in merge_type: - raise ValueError( - f"'{how}' is not a valid Merge type: " - f"left, right, inner, outer, cross, asof" - ) - self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) ( @@ -1035,6 +1032,37 @@ def __init__( if validate is not None: self._validate_validate_kwd(validate) + @final + def _validate_how( + self, how: JoinHow | Literal["left_anti", "right_anti", "asof"] + ) -> tuple[JoinHow | Literal["asof"], bool]: + """ + Validate the 'how' parameter and return the actual join type and whether + this is an anti join. 
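The new anti-join modes above are easiest to read from the user-facing call. A minimal sketch, assuming a pandas build that includes this change (``how="left_anti"`` does not exist in released 2.x):

```python
import pandas as pd

left = pd.DataFrame({"key": [1, 2, 3], "lval": ["a", "b", "c"]})
right = pd.DataFrame({"key": [2, 3, 4], "rval": ["x", "y", "z"]})

# Keep only the left rows whose key has no match in `right`, preserving
# left key order; columns taken from `right` come back as all-NA.
result = left.merge(right, on="key", how="left_anti")
print(result)  # only the row with key == 1 survives
```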
+ """ + # GH 59435: raise when "how" is not a valid Merge type + merge_type = { + "left", + "right", + "inner", + "outer", + "left_anti", + "right_anti", + "cross", + "asof", + } + if how not in merge_type: + raise ValueError( + f"'{how}' is not a valid Merge type: " + f"left, right, inner, outer, left_anti, right_anti, cross, asof" + ) + anti_join = False + if how in {"left_anti", "right_anti"}: + how = how.split("_")[0] # type: ignore[assignment] + anti_join = True + how = cast(JoinHow | Literal["asof"], how) + return how, anti_join + def _maybe_require_matching_dtypes( self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike] ) -> None: @@ -1405,6 +1433,11 @@ def _get_join_info( n = len(left_ax) if left_indexer is None else len(left_indexer) join_index = default_index(n) + if self.anti_join: + join_index, left_indexer, right_indexer = self._handle_anti_join( + join_index, left_indexer, right_indexer + ) + return join_index, left_indexer, right_indexer @final @@ -1447,6 +1480,48 @@ def _create_join_index( return index.copy() return index.take(indexer) + @final + def _handle_anti_join( + self, + join_index: Index, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + """ + Handle anti join by returning the correct join index and indexers + + Parameters + ---------- + join_index : Index + join index + left_indexer : np.ndarray[np.intp] or None + left indexer + right_indexer : np.ndarray[np.intp] or None + right indexer + + Returns + ------- + Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None + """ + # Make sure indexers are not None + if left_indexer is None: + left_indexer = np.arange(len(self.left)) + if right_indexer is None: + right_indexer = np.arange(len(self.right)) + + assert self.how in {"left", "right"} + if self.how == "left": + # Filter to rows where left keys are not in right keys + filt = right_indexer == -1 + else: + # Filter to rows where right keys are not in left keys + filt = left_indexer == -1 + join_index = join_index[filt] + left_indexer = left_indexer[filt] + right_indexer = right_indexer[filt] + + return join_index, left_indexer, right_indexer + @final def _get_merge_keys( self, @@ -1929,9 +2004,9 @@ def get_join_indexers( np.ndarray[np.intp] or None Indexer into the right_keys. 
""" - assert len(left_keys) == len( - right_keys - ), "left_keys and right_keys must be the same length" + assert len(left_keys) == len(right_keys), ( + "left_keys and right_keys must be the same length" + ) # fast-path for empty left/right left_n = len(left_keys[0]) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9b7b768fe7adb..c60fe71a7ff28 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -929,6 +929,8 @@ def _reorder_for_extension_array_stack( def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") + if not len(level): + return frame set_levels = set(level) stack_cols = frame.columns._drop_level_numbers( [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index b3f946f289891..034b861a83f43 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -73,7 +73,7 @@ def cut( Parameters ---------- - x : array-like + x : 1d ndarray or Series The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. @@ -126,7 +126,7 @@ def cut( Categorical for all other inputs. The values stored within are whatever the type in the sequence is. - * False : returns an ndarray of integers. + * False : returns a 1d ndarray or Series of integers. bins : numpy.ndarray or IntervalIndex. The computed or specified bins. Only returned when `retbins=True`. diff --git a/pandas/core/series.py b/pandas/core/series.py index 4fa8b86fa4c16..351622135b31f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -500,7 +500,7 @@ def __init__( # create/copy the manager if isinstance(data, SingleBlockManager): if dtype is not None: - data = data.astype(dtype=dtype, errors="ignore") + data = data.astype(dtype=dtype) elif copy: data = data.copy() else: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c68b6303661b9..81f7441846589 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._typing import ( AlignJoin, @@ -29,8 +31,10 @@ is_extension_array_dtype, is_integer, is_list_like, + is_numeric_dtype, is_object_dtype, is_re, + is_string_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -399,7 +403,9 @@ def cons_row(x): # This is a mess. _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) - if self._is_string: + if _dtype is not None: + pass + elif self._is_string: if is_bool_dtype(vdtype): _dtype = result.dtype elif returns_string: @@ -2097,7 +2103,9 @@ def slice_replace(self, start=None, stop=None, repl=None): result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) - def decode(self, encoding, errors: str = "strict"): + def decode( + self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None + ): """ Decode character string in the Series/Index using indicated encoding. @@ -2111,6 +2119,12 @@ def decode(self, encoding, errors: str = "strict"): errors : str, optional Specifies the error handling scheme. Possible values are those supported by :meth:`bytes.decode`. + dtype : str or dtype, optional + The dtype of the result. 
When not ``None``, must be either a string or + object dtype. When ``None``, the dtype of the result is determined by + ``pd.options.future.infer_string``. + + .. versionadded:: 2.3.0 Returns ------- @@ -2132,6 +2146,10 @@ 2 () dtype: object """ + if dtype is not None and not is_string_dtype(dtype): + raise ValueError(f"dtype must be string or object, got {dtype=}") + if dtype is None and get_option("future.infer_string"): + dtype = "str" # TODO: Add a similar _bytes interface. if encoding in _cpython_optimized_decoders: # CPython optimized implementation decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] arr = self._data.array - # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) - return self._wrap_result(result) + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors: str = "strict"): @@ -2524,10 +2541,12 @@ def get_dummies( """ from pandas.core.frame import DataFrame + if dtype is not None and not (is_numeric_dtype(dtype) or is_bool_dtype(dtype)): + raise ValueError("Only numeric or boolean dtypes are supported for 'dtype'") # we need to cast to Series of strings as only that has all # methods available for making the dummies... result, name = self._data.array._str_get_dummies(sep, dtype) - if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): + if is_extension_array_dtype(dtype): return self._wrap_result( DataFrame(result, columns=name, dtype=dtype), name=name, @@ -3415,7 +3434,8 @@ def len(self): # cases: # upper, lower, title, capitalize, swapcase, casefold # boolean: - # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower + # isupper istitle isascii # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args: dict[str, dict[str, str]] = {} _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} @@ -3495,6 +3515,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3518,6 +3539,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3527,12 +3549,29 @@ def casefold(self): also includes other characters that can represent quantities such as unicode fractions. - >>> s1 = pd.Series(['one', 'one1', '1', '']) + >>> s1 = pd.Series(['one', 'one1', '1', '', '³', '⅕']) >>> s1.str.isnumeric() 0 False 1 False 2 True 3 False + 4 True + 5 True + dtype: bool + + For a string to be considered numeric, all its characters must have a Unicode + numeric property matching :py:meth:`str.isnumeric`.
As a consequence, + the following cases are **not** recognized as numeric: + + - **Decimal numbers** (e.g., "1.1"): due to period ``"."`` + - **Negative numbers** (e.g., "-5"): due to minus sign ``"-"`` + - **Scientific notation** (e.g., "1e3"): due to characters like ``"e"`` + + >>> s2 = pd.Series(["1.1", "-5", "1e3"]) + >>> s2.str.isnumeric() + 0 False + 1 False + 2 False dtype: bool """ _shared_docs["isalnum"] = """ @@ -3544,6 +3583,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3576,6 +3616,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3601,6 +3642,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3627,6 +3669,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3649,6 +3692,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3674,6 +3718,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.istitle : Check whether all characters are titlecase. Examples @@ -3697,10 +3742,11 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Examples - ------------ + -------- The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). 
Words are assumed to be as any sequence of non-numeric characters separated by @@ -3714,11 +3760,40 @@ def casefold(self): 3 False dtype: bool """ + _shared_docs["isascii"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.istitle : Check whether all characters are titlecase. + Series.str.isupper : Check whether all characters are uppercase. + + Examples + -------- + The ``s5.str.isascii`` method checks for whether all characters are ascii + characters, which includes digits 0-9, capital and lowercase letters A-Z, + and some other special characters. + + >>> s5 = pd.Series(['ö', 'see123', 'hello world', '']) + >>> s5.str.isascii() + 0 False + 1 True + 2 True + 3 True + dtype: bool + """ + _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"} _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"} _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"} _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"} _doc_args["islower"] = {"type": "lowercase", "method": "islower"} + _doc_args["isascii"] = {"type": "ascii", "method": "isascii"} _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"} _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"} _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"} @@ -3750,6 +3825,11 @@ def casefold(self): docstring=_shared_docs["ismethods"] % _doc_args["islower"] + _shared_docs["islower"], ) + isascii = _map_and_wrap( + "isascii", + docstring=_shared_docs["ismethods"] % _doc_args["isascii"] + + _shared_docs["isascii"], + ) isupper = _map_and_wrap( "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 4ed36f85167c9..78c4f3acbe1aa 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -179,6 +179,10 @@ def _str_isalnum(self): def _str_isalpha(self): pass + @abc.abstractmethod + def _str_isascii(self): + pass + @abc.abstractmethod def _str_isdecimal(self): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0268194e64d50..0adb7b51cf2b7 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -434,7 +434,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): dummies_dtype = _dtype else: dummies_dtype = np.bool_ - dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype, order="F") def _isin(test_elements: str, element: str) -> bool: return element in test_elements @@ -455,6 +455,9 @@ def _str_isalnum(self): def _str_isalpha(self): return self._str_map(str.isalpha, dtype="bool") + def _str_isascii(self): + return self._str_map(str.isascii, dtype="bool") + def _str_isdecimal(self): return self._str_map(str.isdecimal, dtype="bool") diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 30487de7bafd5..0a10001a3113f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -192,9 +192,9 @@
def should_cache( else: check_count = 500 else: - assert ( - 0 <= check_count <= len(arg) - ), "check_count must be in next bounds: [0; len(arg)]" + assert 0 <= check_count <= len(arg), ( + "check_count must be in next bounds: [0; len(arg)]" + ) if check_count == 0: return False diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index cdb670ee218b4..6dbc52a99e70c 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py @@ -85,6 +85,63 @@ def create_section_header(header: str) -> str: """ ).replace("\n", "", 1) +template_pipe = """ +Apply a ``func`` with arguments to this %(klass)s object and return its result. + +Use `.pipe` when you want to improve readability by chaining together +functions that expect Series, DataFrames, GroupBy, Rolling, Expanding or Resampler +objects. +Instead of writing + +>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 +>>> g = lambda x, arg1: x * 5 / arg1 +>>> f = lambda x: x ** 4 +>>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, index=pd.date_range('2012-08-02', periods=4)) +>>> h(g(f(df.rolling('2D')), arg1=1), arg2=2, arg3=3) # doctest: +SKIP + +You can write + +>>> (df.rolling('2D') +... .pipe(f) +... .pipe(g, arg1=1) +... .pipe(h, arg2=2, arg3=3)) # doctest: +SKIP + +which is much more readable. + +Parameters +---------- +func : callable or tuple of (callable, str) + Function to apply to this %(klass)s object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + %(klass)s object. +*args : iterable, optional + Positional arguments passed into `func`. +**kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + +Returns +------- +%(klass)s + The original object with the function `func` applied. + +See Also +-------- +Series.pipe : Apply a function with arguments to a series. +DataFrame.pipe: Apply a function with arguments to a dataframe. +apply : Apply function to each group instead of to the + full %(klass)s object. + +Notes +----- +See more `here +`_ + +Examples +-------- +%(examples)s +""" + numba_notes = ( "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for " "extended documentation and performance considerations for the Numba engine.\n\n" diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index bff3a1660eba9..81c89e1ef5428 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -5,9 +5,15 @@ TYPE_CHECKING, Any, Literal, + final, + overload, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) from pandas.core.indexers.objects import ( BaseIndexer, @@ -20,6 +26,7 @@ kwargs_numeric_only, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -34,7 +41,11 @@ from collections.abc import Callable from pandas._typing import ( + Concatenate, + P, QuantileInterpolation, + Self, + T, WindowingRankType, ) @@ -241,6 +252,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Expanding", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... 
index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each expanding window's maximum and minimum + value in one pass, you can do + + >>> df.expanding().pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 2.0 + 2012-08-05 3.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), @@ -664,6 +723,78 @@ def skew(self, numeric_only: bool = False): def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.first : Similar method for GroupBy objects. + Expanding.last : Method to get the last element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show an expanding calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.expanding(3).first() + 0 NaN + 1 NaN + 2 0.0 + 3 0.0 + 4 0.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="First (left-most) element of the window", + agg_method="first", + ) + def first(self, numeric_only: bool = False): + return super().first(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.last : Similar method for GroupBy objects. + Expanding.first : Method to get the first element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show an expanding calculation with a window size of + three. 
+ + >>> s = pd.Series(range(5)) + >>> s.expanding(3).last() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="Last (right-most) element of the window", + agg_method="last", + ) + def last(self, numeric_only: bool = False): + return super().last(numeric_only=numeric_only) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 385ffb901acf0..69fce8cf2137e 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,6 +14,8 @@ TYPE_CHECKING, Any, Literal, + final, + overload, ) import numpy as np @@ -26,7 +28,11 @@ import pandas._libs.window.aggregations as window_aggregations from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) from pandas.core.dtypes.common import ( ensure_float64, @@ -81,6 +87,7 @@ kwargs_scipy, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -102,8 +109,12 @@ from pandas._typing import ( ArrayLike, + Concatenate, NDFrameT, QuantileInterpolation, + P, + Self, + T, WindowingRankType, npt, ) @@ -870,10 +881,10 @@ class Window(BaseWindow): Parameters ---------- window : int, timedelta, str, offset, or BaseIndexer subclass - Size of the moving window. + Interval of the moving window. - If an integer, the fixed number of observations used for - each window. + If an integer, the delta between the start and end of each window. + The number of points in the window depends on the ``closed`` argument. If a timedelta, str, or offset, the time period of each window. Each window will be a variable sized based on the observations included in @@ -918,14 +929,22 @@ class Window(BaseWindow): an integer index is not used to calculate the rolling window. closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. + Determines the inclusivity of points in the window. + + If ``'right'``, uses the window (first, last] meaning the last point + is included in the calculations. + + If ``'left'``, uses the window [first, last) meaning the first point + is included in the calculations. - If ``'left'``, the last point in the window is excluded from calculations. + If ``'both'``, uses the window [first, last] meaning all points in + the window are included in the calculations. - If ``'both'``, no point in the window is excluded from calculations. + If ``'neither'``, uses the window (first, last) meaning the first + and last points in the window are excluded from calculations. - If ``'neither'``, the first and last points in the window are excluded - from calculations. + () and [] refer to open and closed set + notation, respectively. Default ``None`` (``'right'``). @@ -1529,6 +1548,30 @@ def apply_func(values, begin, end, min_periods, raw=raw): return apply_func + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ...
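The interval notation in the rewritten ``closed`` docs above is easy to confirm on a small datetime-indexed Series; this relies only on long-standing rolling behavior, so it runs on current pandas:

```python
import pandas as pd

s = pd.Series([1, 2, 3, 4], index=pd.date_range("2024-01-01", periods=4, freq="D"))

# closed="right" -> (first, last]: the point exactly 2 days back is excluded.
print(s.rolling("2D", closed="right").sum())  # 1.0, 3.0, 5.0, 7.0

# closed="both" -> [first, last]: both window endpoints are included.
print(s.rolling("2D", closed="both").sum())  # 1.0, 3.0, 6.0, 9.0
```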
+ + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return com.pipe(self, func, *args, **kwargs) + def sum( self, numeric_only: bool = False, @@ -1705,6 +1748,22 @@ def kurt(self, numeric_only: bool = False): numeric_only=numeric_only, ) + def first(self, numeric_only: bool = False): + window_func = window_aggregations.roll_first + return self._apply( + window_func, + name="first", + numeric_only=numeric_only, + ) + + def last(self, numeric_only: bool = False): + window_func = window_aggregations.roll_last + return self._apply( + window_func, + name="last", + numeric_only=numeric_only, + ) + def quantile( self, q: float, @@ -2044,6 +2103,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Rolling", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each rolling 2-day window's maximum and minimum + value in one pass, you can do + + >>> df.rolling('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 1.0 + 2012-08-05 1.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), @@ -2539,6 +2646,78 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): def kurt(self, numeric_only: bool = False): return super().kurt(numeric_only=numeric_only) + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.first : Similar method for GroupBy objects. + Rolling.last : Method to get the last element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + three. + + >>> s = pd.Series(range(5)) + >>> s.rolling(3).first() + 0 NaN + 1 NaN + 2 0.0 + 3 1.0 + 4 2.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="First (left-most) element of the window", + agg_method="first", + ) + def first(self, numeric_only: bool = False): + return super().first(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + GroupBy.last : Similar method for GroupBy objects. + Rolling.first : Method to get the first element in each window.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + three. 
+ + >>> s = pd.Series(range(5)) + >>> s.rolling(3).last() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="Last (right-most) element of the window", + agg_method="last", + ) + def last(self, numeric_only: bool = False): + return super().last(numeric_only=numeric_only) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index f150de3d217f2..2b5bc450e41d6 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -820,6 +820,16 @@ class ValueLabelTypeMismatch(Warning): """ Warning raised by to_stata on a category column that contains non-string values. + When exporting data to Stata format using the `to_stata` method, category columns + must have string values as labels. If a category column contains non-string values + (e.g., integers, floats, or other types), this warning is raised to indicate that + the Stata file may not correctly represent the data. + + See Also + -------- + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Series.cat : Accessor for categorical properties of the Series values. + Examples -------- >>> df = pd.DataFrame({"categories": pd.Series(["a", 2], dtype="category")}) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 9778a404e23e0..6827fbe9c998e 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -10,7 +10,10 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import pa_version_under18p0 +from pandas.compat import ( + pa_version_under18p0, + pa_version_under19p0, +) from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -77,7 +80,10 @@ def arrow_table_to_pandas( elif dtype_backend == "pyarrow": types_mapper = pd.ArrowDtype elif using_string_dtype(): - types_mapper = _arrow_string_types_mapper() + if pa_version_under19p0: + types_mapper = _arrow_string_types_mapper() + else: + types_mapper = None elif dtype_backend is lib.no_default or dtype_backend == "numpy": types_mapper = None else: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ced2ad91dba1e..460af65a60bf6 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -115,7 +115,7 @@ Strings are used for sheet names. Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. - Specify ``None`` to get all worksheets. + When ``None``, will return a dictionary containing DataFrames for each sheet. Available cases: * ``0``: 1st sheet as a `DataFrame` * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" as a dict of `DataFrame` - * ``None``: All worksheets. + * ``None``: Returns a dictionary containing DataFrames for each sheet. header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed @@ -1649,7 +1649,8 @@ def parse( Strings are used for sheet names. Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. - Specify ``None`` to get all worksheets. + When ``None``, will return a dictionary containing DataFrames for + each sheet. header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed DataFrame.
If a list of integers is passed those row positions will diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 10a06aec72a57..ba4919c9298ed 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -270,7 +270,7 @@ def _process_style(self, style: dict[str, Any] | None) -> str | None: style_key = json.dumps(style) if style_key in self._style_dict: return self._style_dict[style_key] - name = f"pd{len(self._style_dict)+1}" + name = f"pd{len(self._style_dict) + 1}" self._style_dict[style_key] = name odf_style = Style(name=name, family="table-cell") if "font" in style: diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 7b4c81853eba3..565c53f0f3fc5 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -78,6 +78,14 @@ def read_feather( """ Load a feather-format object from the file path. + Feather is particularly useful for scenarios that require efficient + serialization and deserialization of tabular data. It supports + schema preservation, making it a reliable choice for use cases + such as sharing data between Python and R, or persisting intermediate + results during data processing pipelines. This method provides additional + flexibility with options for selective column reading, thread parallelism, + and choosing the backend for data types. + Parameters ---------- path : str, path object, or file-like object diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 46ecb2b9a8f12..b7fbc4e5e22b7 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -897,9 +897,13 @@ def to_html( ``
<table>`` tag, in addition to the default "dataframe". notebook : {True, False}, optional, default False Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - ``<table>
`` tag. Default ``pd.options.display.html.border``. + border : int or bool + When an integer value is provided, it sets the border attribute in + the opening tag, specifying the thickness of the border. + If ``False`` or ``0`` is passed, the border attribute will not + be present in the ``<table>
`` tag. + The default value for this parameter is governed by + ``pd.options.display.html.border``. table_id : str, optional A css id is included in the opening `<table>
` tag if specified. render_links : bool, default False diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b4c6ff8792d52..c9a6e94a0c7c1 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -226,12 +226,17 @@ Series.describe: Generate descriptive statistics of Series. Series.memory_usage: Memory usage of Series.""" ) +series_max_cols_sub = dedent( + """\ + max_cols : int, optional + Unused, exists only for compatibility with DataFrame.info.""" +) series_sub_kwargs = { "klass": "Series", "type_sub": "", - "max_cols_sub": "", + "max_cols_sub": series_max_cols_sub, "show_counts_sub": show_counts_sub, "examples_sub": series_examples_sub, "see_also_sub": series_see_also_sub, diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index a9936ba8c8f2c..ab27321ffe83c 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -111,6 +111,8 @@ def _pprint_seq( """ if isinstance(seq, set): fmt = "{{{body}}}" + elif isinstance(seq, frozenset): + fmt = "frozenset({{{body}}})" else: fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})" @@ -336,8 +338,8 @@ def format_object_summary( if indent_for_name: name_len = len(name) - space1 = f'\n{(" " * (name_len + 1))}' - space2 = f'\n{(" " * (name_len + 2))}' + space1 = f"\n{(' ' * (name_len + 1))}" + space2 = f"\n{(' ' * (name_len + 2))}" else: space1 = "\n" space2 = "\n " # space for the opening '[' diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index eb6773310da69..0f734a81795c4 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -117,6 +117,12 @@ class Styler(StylerRenderer): r""" Helps style a DataFrame or Series according to the data with HTML and CSS. + This class provides methods for styling and formatting a Pandas DataFrame or Series. + The styled output can be rendered as HTML or LaTeX, and it supports CSS-based + styling, allowing users to control colors, font styles, and other visual aspects of + tabular data. It is particularly useful for presenting DataFrame objects in a + Jupyter Notebook environment or when exporting styled tables for reports and + presentations. + Parameters ---------- data : Series or DataFrame @@ -181,6 +187,8 @@ Attributes ---------- + index : data.index Index + columns : data.columns Index env : Jinja2 jinja2.Environment template_html : Jinja2 Template template_html_table : Jinja2 Template @@ -1228,6 +1236,111 @@ def to_latex( ) return save_to_buffer(latex, buf=buf, encoding=encoding) + @overload + def to_typst( + self, + buf: FilePath | WriteBuffer[str], + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + ) -> None: ... + + @overload + def to_typst( + self, + buf: None = ..., + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + ) -> str: ... + + @Substitution(buf=buffering_args, encoding=encoding_args) + def to_typst( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + encoding: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + max_rows: int | None = None, + max_columns: int | None = None, + ) -> str | None: + """ + Write Styler to a file, buffer or string in Typst format. + + ..
versionadded:: 3.0.0 + + Parameters + ---------- + %(buf)s + %(encoding)s + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.index`` value. + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns`` value. + max_rows : int, optional + The maximum number of rows that will be rendered. Defaults to + ``pandas.options.styler.render.max_rows``, which is None. + max_columns : int, optional + The maximum number of columns that will be rendered. Defaults to + ``pandas.options.styler.render.max_columns``, which is None. + + Rows and columns may be reduced if the number of total elements is + large. This value is set to ``pandas.options.styler.render.max_elements``, + which is 262144 (18 bit browser rendering). + + Returns + ------- + str or None + If `buf` is None, returns the result as a string. Otherwise returns `None`. + + See Also + -------- + DataFrame.to_typst : Write a DataFrame to a file, + buffer or string in Typst format. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> df.style.to_typst() # doctest: +SKIP + + .. code-block:: typst + + #table( + columns: 3, + [], [A], [B], + + [0], [1], [3], + [1], [2], [4], + ) + """ + obj = self._copy(deepcopy=True) + + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + + text = obj._render_typst( + sparse_columns=sparse_columns, + sparse_index=sparse_index, + max_rows=max_rows, + max_cols=max_columns, + ) + return save_to_buffer( + text, buf=buf, encoding=(encoding if buf is not None else None) + ) + @overload def to_html( self, @@ -1644,7 +1757,7 @@ def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None: for j in attrs.columns: ser = attrs[j] for i, c in ser.items(): - if not c: + if not c or pd.isna(c): continue css_list = maybe_convert_css_to_tuples(c) if axis == 0: @@ -1916,7 +2029,7 @@ def apply( more details. """ self._todo.append( - (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs) + (lambda instance: instance._apply, (func, axis, subset), kwargs) ) return self @@ -2023,7 +2136,7 @@ def apply_index( """ self._todo.append( ( - lambda instance: getattr(instance, "_apply_index"), + lambda instance: instance._apply_index, (func, axis, level, "apply"), kwargs, ) @@ -2052,7 +2165,7 @@ def map_index( ) -> Styler: self._todo.append( ( - lambda instance: getattr(instance, "_apply_index"), + lambda instance: instance._apply_index, (func, axis, level, "map"), kwargs, ) @@ -2125,9 +2238,7 @@ def map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. 
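The ``to_typst`` method wired up above mirrors the ``to_latex`` buffer-or-string convention. A minimal sketch, assuming a pandas build (3.0+) that ships this method:

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# With no buffer, the Typst markup is returned as a string, as in the
# docstring example: a #table(...) block with one cell per value.
markup = df.style.to_typst()
print(markup)
```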
""" - self._todo.append( - (lambda instance: getattr(instance, "_map"), (func, subset), kwargs) - ) + self._todo.append((lambda instance: instance._map, (func, subset), kwargs)) return self def set_table_attributes(self, attributes: str) -> Styler: @@ -2483,7 +2594,7 @@ def set_sticky( for i, level in enumerate(levels_): styles.append( { - "selector": f"thead tr:nth-child({level+1}) th", + "selector": f"thead tr:nth-child({level + 1}) th", "props": props + ( f"top:{i * pixel_size}px; height:{pixel_size}px; " @@ -2494,7 +2605,7 @@ def set_sticky( if not all(name is None for name in self.index.names): styles.append( { - "selector": f"thead tr:nth-child({obj.nlevels+1}) th", + "selector": f"thead tr:nth-child({obj.nlevels + 1}) th", "props": props + ( f"top:{(len(levels_)) * pixel_size}px; " @@ -2514,7 +2625,7 @@ def set_sticky( styles.extend( [ { - "selector": f"thead tr th:nth-child({level+1})", + "selector": f"thead tr th:nth-child({level + 1})", "props": props_ + "z-index:3 !important;", }, { @@ -4109,8 +4220,10 @@ def css_bar(start: float, end: float, color: str) -> str: if end > start: cell_css += "background: linear-gradient(90deg," if start > 0: - cell_css += f" transparent {start*100:.1f}%, {color} {start*100:.1f}%," - cell_css += f" {color} {end*100:.1f}%, transparent {end*100:.1f}%)" + cell_css += ( + f" transparent {start * 100:.1f}%, {color} {start * 100:.1f}%," + ) + cell_css += f" {color} {end * 100:.1f}%, transparent {end * 100:.1f}%)" return cell_css def css_calc(x, left: float, right: float, align: str, color: str | list | tuple): diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index c0f0608f1ab32..482ed316c7ce4 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -77,6 +77,7 @@ class StylerRenderer: template_html_table = env.get_template("html_table.tpl") template_html_style = env.get_template("html_style.tpl") template_latex = env.get_template("latex.tpl") + template_typst = env.get_template("typst.tpl") template_string = env.get_template("string.tpl") def __init__( @@ -232,6 +233,21 @@ def _render_latex( d.update(kwargs) return self.template_latex.render(**d) + def _render_typst( + self, + sparse_index: bool, + sparse_columns: bool, + max_rows: int | None = None, + max_cols: int | None = None, + **kwargs, + ) -> str: + """ + Render a Styler in typst format + """ + d = self._render(sparse_index, sparse_columns, max_rows, max_cols) + d.update(kwargs) + return self.template_typst.render(**d) + def _render_string( self, sparse_index: bool, @@ -834,10 +850,7 @@ def _generate_body_row( data_element = _element( "td", - ( - f"{self.css['data']} {self.css['row']}{r} " - f"{self.css['col']}{c}{cls}" - ), + (f"{self.css['data']} {self.css['row']}{r} {self.css['col']}{c}{cls}"), value, data_element_visible, attributes="", @@ -957,7 +970,7 @@ def concatenated_visible_rows(obj): idx_len = d["index_lengths"].get((lvl, r), None) if idx_len is not None: # i.e. not a sparsified entry d["clines"][rn + idx_len].append( - f"\\cline{{{lvln+1}-{len(visible_index_levels)+data_len}}}" + f"\\cline{{{lvln + 1}-{len(visible_index_levels) + data_len}}}" # noqa: E501 ) def format( @@ -1541,7 +1554,7 @@ def relabel_index( >>> df = pd.DataFrame({"samples": np.random.rand(10)}) >>> styler = df.loc[np.random.randint(0, 10, 3)].style - >>> styler.relabel_index([f"sample{i+1} ({{}})" for i in range(3)]) + >>> styler.relabel_index([f"sample{i + 1} ({{}})" for i in range(3)]) ... 
# doctest: +SKIP samples sample1 (5) 0.315811 @@ -2504,7 +2517,7 @@ def color(value, user_arg, command, comm_arg): if value[0] == "#" and len(value) == 7: # color is hex code return command, f"[HTML]{{{value[1:].upper()}}}{arg}" if value[0] == "#" and len(value) == 4: # color is short hex code - val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}" + val = f"{value[1].upper() * 2}{value[2].upper() * 2}{value[3].upper() * 2}" return command, f"[HTML]{{{val}}}{arg}" elif value[:3] == "rgb": # color is rgb or rgba r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip() diff --git a/pandas/io/formats/templates/typst.tpl b/pandas/io/formats/templates/typst.tpl new file mode 100644 index 0000000000000..66de8f31b405e --- /dev/null +++ b/pandas/io/formats/templates/typst.tpl @@ -0,0 +1,12 @@ +#table( + columns: {{ head[0] | length }}, +{% for r in head %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} + +{% for r in body %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} +) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 47f162e93216d..febf43b9a1018 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -260,7 +260,7 @@ def _other_namespaces(self) -> dict: nmsp_dict: dict[str, str] = {} if self.namespaces: nmsp_dict = { - f"xmlns{p if p=='' else f':{p}'}": n + f"xmlns{p if p == '' else f':{p}'}": n for p, n in self.namespaces.items() if n != self.prefix_uri[1:-1] } @@ -404,7 +404,7 @@ def _get_prefix_uri(self) -> str: f"{self.prefix} is not included in namespaces" ) from err elif "" in self.namespaces: - uri = f'{{{self.namespaces[""]}}}' + uri = f"{{{self.namespaces['']}}}" else: uri = "" @@ -502,7 +502,7 @@ def _get_prefix_uri(self) -> str: f"{self.prefix} is not included in namespaces" ) from err elif "" in self.namespaces: - uri = f'{{{self.namespaces[""]}}}' + uri = f"{{{self.namespaces['']}}}" else: uri = "" diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 237518b3c8d92..e032e26d771d7 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -520,6 +520,12 @@ def read_json( """ Convert a JSON string to pandas object. + This method reads JSON files or JSON-like data and converts them into pandas + objects. It supports a variety of input formats, including line-delimited JSON, + compressed files, and various data representations (table, records, index-based, + etc.). When `chunksize` is specified, an iterator is returned instead of loading + the entire data into memory. + Parameters ---------- path_or_buf : a str path, path object or file-like object @@ -917,7 +923,7 @@ def _combine_lines(self, lines) -> str: Combines a list of JSON objects into one JSON object. """ return ( - f'[{",".join([line for line in (line.strip() for line in lines) if line])}]' + f"[{','.join([line for line in (line.strip() for line in lines) if line])}]" ) @overload diff --git a/pandas/io/orc.py b/pandas/io/orc.py index a945f3dc38d35..1a2d564d5b44d 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -45,6 +45,13 @@ def read_orc( """ Load an ORC object from the file path, returning a DataFrame. + This method reads an ORC (Optimized Row Columnar) file into a pandas + DataFrame using the `pyarrow.orc` library. ORC is a columnar storage format + that provides efficient compression and fast retrieval for analytical workloads. 
+ It allows reading specific columns, handling different filesystem + types (such as local storage, cloud storage via fsspec, or pyarrow filesystem), + and supports different data type backends, including `numpy_nullable` and `pyarrow`. + Parameters ---------- path : str, path object, or file-like object diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 672672490996d..8cadde1ad6537 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -165,7 +165,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: # The only way self.names is not the same length as number of cols is # if we have int index_col. We should just pad the names(they will get # removed anyways) to expected length then. - self.names = list(range(num_cols - len(self.names))) + self.names + columns_prefix = [str(x) for x in range(num_cols - len(self.names))] + self.names = columns_prefix + self.names multi_index_named = False frame.columns = self.names diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e263c69376d05..c283f600eb971 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -112,8 +112,7 @@ def __init__(self, kwds) -> None: parse_dates = bool(parse_dates) elif not isinstance(parse_dates, list): raise TypeError( - "Only booleans and lists are accepted " - "for the 'parse_dates' parameter" + "Only booleans and lists are accepted for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates self.date_parser = kwds.pop("date_parser", lib.no_default) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index db9547a18b600..e7b5c7f06a79a 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -595,8 +595,7 @@ def _infer_columns( joi = list(map(str, header[:-1] if have_mi_columns else header)) msg = f"[{','.join(joi)}], len of {len(joi)}, " raise ValueError( - f"Passed header={msg}" - f"but only {self.line_pos} lines in file" + f"Passed header={msg}but only {self.line_pos} lines in file" ) from err # We have an empty file, so check @@ -1219,8 +1218,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: for row_num, actual_len in bad_lines: msg = ( - f"Expected {col_len} fields in line {row_num + 1}, saw " - f"{actual_len}" + f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}" ) if ( self.delimiter diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 54877017f76fc..67193f930b4dc 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1219,8 +1219,7 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: and value != getattr(value, "value", default) ): raise ValueError( - f"The {argname!r} option is not supported with the " - f"'pyarrow' engine" + f"The {argname!r} option is not supported with the 'pyarrow' engine" ) options[argname] = value @@ -1396,8 +1395,7 @@ def _clean_options( if not is_integer(skiprows) and skiprows is not None: # pyarrow expects skiprows to be passed as an integer raise ValueError( - "skiprows argument must be an integer when using " - "engine='pyarrow'" + "skiprows argument must be an integer when using engine='pyarrow'" ) else: if is_integer(skiprows): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7d265bc430125..a689cfbcb1418 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -86,12 
+86,16 @@ PeriodArray, ) from pandas.core.arrays.datetimes import tz_to_dtype +from pandas.core.arrays.string_ import BaseStringArray import pandas.core.common as com from pandas.core.computation.pytables import ( PyTablesExpr, maybe_expression, ) -from pandas.core.construction import extract_array +from pandas.core.construction import ( + array as pd_array, + extract_array, +) from pandas.core.indexes.api import ensure_index from pandas.io.common import stringify_path @@ -1127,6 +1131,12 @@ def put( """ Store object in HDFStore. + This method writes a pandas DataFrame or Series into an HDF5 file using + either the fixed or table format. The `table` format allows additional + operations like incremental appends and queries but may have performance + trade-offs. The `fixed` format provides faster read/write operations but + does not support appends or queries. + Parameters ---------- key : str @@ -3023,6 +3033,9 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if isinstance(node, tables.VLArray): ret = node[0][start:stop] + dtype = getattr(attrs, "value_type", None) + if dtype is not None: + ret = pd_array(ret, dtype=dtype) else: dtype = getattr(attrs, "value_type", None) shape = getattr(attrs, "shape", None) @@ -3262,6 +3275,11 @@ def write_array( elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" + elif isinstance(value, BaseStringArray): + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) + vlarr.append(value.to_numpy()) + node = getattr(self.group, key) + node._v_attrs.value_type = str(value.dtype) elif empty_array: self.write_array_empty(key, value) else: @@ -3294,7 +3312,11 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): result = result.astype(StringDtype(na_value=np.nan)) return result @@ -3363,7 +3385,11 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) @@ -3504,6 +3530,12 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: + if c == "values_axes" and sax.kind != oax.kind: + raise ValueError( + f"Cannot serialize the column [{oax.values[0]}] " + f"because its data contents are not [{sax.kind}] " + f"but [{oax.kind}] object dtype" + ) raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" @@ -4127,6 +4159,8 @@ def _create_axes( ordered = data_converted.ordered meta = "category" metadata = np.asarray(data_converted.categories).ravel() + elif isinstance(blk.dtype, StringDtype): + meta = str(blk.dtype) data, dtype_name = _get_data_and_dtype_name(data_converted) @@ -4387,7 +4421,8 @@ def read_column( errors=self.errors, ) cvs = col_values[1] - return Series(cvs, name=column, copy=False) + dtype = getattr(self.table.attrs, f"{column}_meta", 
None) + return Series(cvs, name=column, copy=False, dtype=dtype) raise KeyError(f"column [{column}] not found in the table") @@ -4737,9 +4772,23 @@ def read( df = DataFrame._from_arrays([values], columns=cols_, index=index_) if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_string_dtype() and is_string_array( - values, # type: ignore[arg-type] - skipna=True, + + # If str / string dtype is stored in meta, use that. + converted = False + for column in cols_: + dtype = getattr(self.table.attrs, f"{column}_meta", None) + if dtype in ["str", "string"]: + df[column] = df[column].astype(dtype) + converted = True + # Otherwise try inference. + if ( + not converted + and using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array( + values, + skipna=True, + ) ): df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) @@ -5088,6 +5137,9 @@ def _maybe_convert_for_string_atom( errors, columns: list[str], ): + if isinstance(bvalues.dtype, StringDtype): + # "ndarray[Any, Any]" has no attribute "to_numpy" + bvalues = bvalues.to_numpy() # type: ignore[union-attr] if bvalues.dtype != object: return bvalues @@ -5112,6 +5164,9 @@ def _maybe_convert_for_string_atom( data = bvalues.copy() data[mask] = nan_rep + if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize: + raise ValueError("NaN representation is too large for existing column size") + # see if we have a valid string type inferred_type = lib.infer_dtype(data, skipna=False) if inferred_type != "string": @@ -5209,7 +5264,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data, copy=False).str.decode(encoding, errors=errors)._values + ser = Series(data, copy=False).str.decode(encoding, errors=errors) + data = ser.to_numpy() + data.flags.writeable = True else: data = data.astype(dtype, copy=False).astype(object, copy=False) @@ -5297,6 +5354,8 @@ def _dtype_to_kind(dtype_str: str) -> str: kind = "integer" elif dtype_str == "object": kind = "object" + elif dtype_str == "str": + kind = "str" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c5aab4d967cd4..792af5ff713a3 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -22,6 +22,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs.byteswap import ( read_double_with_byteswap, read_float_with_byteswap, @@ -699,6 +701,7 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt = {} js, jb = 0, 0 + infer_string = get_option("future.infer_string") for j in range(self.column_count): name = self.column_names[j] @@ -715,6 +718,9 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): rslt[name] = self._decode_string(rslt[name].str) + if infer_string: + rslt[name] = rslt[name].astype("str") + js += 1 else: self.close() diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 89dbdab64c23c..a9c45e720fd56 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -33,19 +33,16 @@ ReadBuffer, ) _correct_line1 = ( - "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" 
- "000000000000000000000000000000 " + "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _correct_header1 = ( "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000" ) _correct_header2 = ( - "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _correct_obs_header = ( - "HEADER RECORD*******OBS HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******OBS HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _fieldkeys = [ "ntype", diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 5652d7fab0c7c..0e0f07c0f8ff3 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -76,6 +76,7 @@ from sqlalchemy import Table from sqlalchemy.sql.expression import ( + Delete, Select, TextClause, ) @@ -738,7 +739,7 @@ def to_sql( name: str, con, schema: str | None = None, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool = True, index_label: IndexLabel | None = None, chunksize: int | None = None, @@ -750,6 +751,12 @@ def to_sql( """ Write records stored in a DataFrame to a SQL database. + .. warning:: + The pandas library does not attempt to sanitize inputs provided via a to_sql call. + Please refer to the documentation for the underlying database driver to see if it + will properly prevent injection, or alternatively be advised of a security risk when + executing arbitrary commands in a to_sql call. + Parameters ---------- frame : DataFrame, Series @@ -764,10 +771,11 @@ def to_sql( schema : str, optional Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). - if_exists : {'fail', 'replace', 'append'}, default 'fail' + if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. - append: If table exists, insert data. Create if does not exist. + - delete_rows: If a table exists, delete all records and insert data. index : bool, default True Write DataFrame index as a column. 
index_label : str or sequence, optional @@ -818,7 +826,7 @@ def to_sql( `sqlite3 `__ or `SQLAlchemy `__ """ # noqa: E501 - if if_exists not in ("fail", "replace", "append"): + if if_exists not in ("fail", "replace", "append", "delete_rows"): raise ValueError(f"'{if_exists}' is not valid for if_exists") if isinstance(frame, Series): @@ -926,7 +934,7 @@ def __init__( pandas_sql_engine, frame=None, index: bool | str | list[str] | None = True, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", prefix: str = "pandas", index_label=None, schema=None, @@ -974,11 +982,13 @@ def create(self) -> None: if self.exists(): if self.if_exists == "fail": raise ValueError(f"Table '{self.name}' already exists.") - if self.if_exists == "replace": + elif self.if_exists == "replace": self.pd_sql.drop_table(self.name, self.schema) self._execute_create() elif self.if_exists == "append": pass + elif self.if_exists == "delete_rows": + self.pd_sql.delete_rows(self.name, self.schema) else: raise ValueError(f"'{self.if_exists}' is not valid for if_exists") else: @@ -997,7 +1007,7 @@ def _execute_insert(self, conn, keys: list[str], data_iter) -> int: Each item contains a list of values to be inserted """ data = [dict(zip(keys, row)) for row in data_iter] - result = conn.execute(self.table.insert(), data) + result = self.pd_sql.execute(self.table.insert(), data) return result.rowcount def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: @@ -1014,7 +1024,7 @@ def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: data = [dict(zip(keys, row)) for row in data_iter] stmt = insert(self.table).values(data) - result = conn.execute(stmt) + result = self.pd_sql.execute(stmt) return result.rowcount def insert_data(self) -> tuple[list[str], list[np.ndarray]]: @@ -1480,7 +1490,7 @@ def to_sql( self, frame, name: str, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool = True, index_label=None, schema=None, @@ -1649,12 +1659,20 @@ def run_transaction(self): else: yield self.con - def execute(self, sql: str | Select | TextClause, params=None): + def execute(self, sql: str | Select | TextClause | Delete, params=None): """Simple passthrough to SQLAlchemy connectable""" + from sqlalchemy.exc import SQLAlchemyError + args = [] if params is None else [params] if isinstance(sql, str): - return self.con.exec_driver_sql(sql, *args) - return self.con.execute(sql, *args) + execute_function = self.con.exec_driver_sql + else: + execute_function = self.con.execute + + try: + return execute_function(sql, *args) + except SQLAlchemyError as exc: + raise DatabaseError(f"Execution failed on sql '{sql}': {exc}") from exc def read_table( self, @@ -1866,7 +1884,7 @@ def prep_table( self, frame, name: str, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool | str | list[str] | None = True, index_label=None, schema=None, @@ -1943,7 +1961,7 @@ def to_sql( self, frame, name: str, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool = True, index_label=None, schema: str | None = None, @@ -1961,10 +1979,11 @@ def to_sql( frame : DataFrame name : string Name of SQL table. 
- if_exists : {'fail', 'replace', 'append'}, default 'fail' + if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. - append: If table exists, insert data. Create if does not exist. + - delete_rows: If a table exists, delete all records and insert data. index : boolean, default True Write DataFrame index as a column. index_label : string or sequence, default None @@ -2061,6 +2080,16 @@ def drop_table(self, table_name: str, schema: str | None = None) -> None: self.get_table(table_name, schema).drop(bind=self.con) self.meta.clear() + def delete_rows(self, table_name: str, schema: str | None = None) -> None: + schema = schema or self.meta.schema + if self.has_table(table_name, schema): + self.meta.reflect( + bind=self.con, only=[table_name], schema=schema, views=True + ) + table = self.get_table(table_name, schema) + self.execute(table.delete()).close() + self.meta.clear() + def _create_sql_schema( self, frame: DataFrame, @@ -2108,6 +2137,8 @@ def run_transaction(self): self.con.commit() def execute(self, sql: str | Select | TextClause, params=None): + from adbc_driver_manager import Error + if not isinstance(sql, str): raise TypeError("Query must be a string unless using sqlalchemy.") args = [] if params is None else [params] @@ -2115,10 +2146,10 @@ def execute(self, sql: str | Select | TextClause, params=None): try: cur.execute(sql, *args) return cur - except Exception as exc: + except Error as exc: try: self.con.rollback() - except Exception as inner_exc: # pragma: no cover + except Error as inner_exc: # pragma: no cover ex = DatabaseError( f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" ) @@ -2207,8 +2238,7 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - with self.con.cursor() as cur: - cur.execute(stmt) + with self.execute(stmt) as cur: pa_table = cur.fetch_arrow_table() df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) @@ -2278,8 +2308,7 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - with self.con.cursor() as cur: - cur.execute(sql) + with self.execute(sql) as cur: pa_table = cur.fetch_arrow_table() df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) @@ -2296,7 +2325,7 @@ def to_sql( self, frame, name: str, - if_exists: Literal["fail", "replace", "append"] = "fail", + if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", index: bool = True, index_label=None, schema: str | None = None, @@ -2318,6 +2347,7 @@ def to_sql( - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. - append: If table exists, insert data. Create if does not exist. + - delete_rows: If a table exists, delete all records and insert data. index : boolean, default True Write DataFrame index as a column. 
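With the `execute` wrapper above, failures on the SQLAlchemy path are re-raised as pandas' own `DatabaseError` instead of leaking driver-specific exceptions. A minimal sketch, assuming SQLAlchemy is installed (the URL and query are illustrative):

```python
# Sketch: after the execute() change above, SQLAlchemy errors surface as
# pandas.errors.DatabaseError. Assumes sqlalchemy is installed.
import pandas as pd
from pandas.errors import DatabaseError

try:
    pd.read_sql("SELECT * FROM missing_table", "sqlite:///:memory:")
except DatabaseError as exc:
    print(exc)  # Execution failed on sql 'SELECT * FROM missing_table': ...
```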
index_label : string or sequence, default None @@ -2335,6 +2365,9 @@ def to_sql( engine : {'auto', 'sqlalchemy'}, default 'auto' Raises NotImplementedError if not set to 'auto' """ + pa = import_optional_dependency("pyarrow") + from adbc_driver_manager import Error + if index_label: raise NotImplementedError( "'index_label' is not implemented for ADBC drivers" @@ -2364,12 +2397,13 @@ def to_sql( if if_exists == "fail": raise ValueError(f"Table '{table_name}' already exists.") elif if_exists == "replace": - with self.con.cursor() as cur: - cur.execute(f"DROP TABLE {table_name}") + sql_statement = f"DROP TABLE {table_name}" + self.execute(sql_statement).close() elif if_exists == "append": mode = "append" - - import pyarrow as pa + elif if_exists == "delete_rows": + mode = "append" + self.delete_rows(name, schema) try: tbl = pa.Table.from_pandas(frame, preserve_index=index) @@ -2377,9 +2411,14 @@ def to_sql( raise ValueError("datatypes not supported") from exc with self.con.cursor() as cur: - total_inserted = cur.adbc_ingest( - table_name=name, data=tbl, mode=mode, db_schema_name=schema - ) + try: + total_inserted = cur.adbc_ingest( + table_name=name, data=tbl, mode=mode, db_schema_name=schema + ) + except Error as exc: + raise DatabaseError( + f"Failed to insert records on table={name} with {mode=}" + ) from exc self.con.commit() return total_inserted @@ -2402,6 +2441,11 @@ def has_table(self, name: str, schema: str | None = None) -> bool: return False + def delete_rows(self, name: str, schema: str | None = None) -> None: + table_name = f"{schema}.{name}" if schema else name + if self.has_table(name, schema): + self.execute(f"DELETE FROM {table_name}").close() + def _create_sql_schema( self, frame: DataFrame, @@ -2496,9 +2540,9 @@ def sql_schema(self) -> str: return str(";\n".join(self.table)) def _execute_create(self) -> None: - with self.pd_sql.run_transaction() as conn: + with self.pd_sql.run_transaction() as cur: for stmt in self.table: - conn.execute(stmt) + cur.execute(stmt) def insert_statement(self, *, num_rows: int) -> str: names = list(map(str, self.frame.columns)) @@ -2520,8 +2564,13 @@ def insert_statement(self, *, num_rows: int) -> str: return insert_statement def _execute_insert(self, conn, keys, data_iter) -> int: + from sqlite3 import Error + data_list = list(data_iter) - conn.executemany(self.insert_statement(num_rows=1), data_list) + try: + conn.executemany(self.insert_statement(num_rows=1), data_list) + except Error as exc: + raise DatabaseError("Execution failed") from exc return conn.rowcount def _execute_insert_multi(self, conn, keys, data_iter) -> int: @@ -2643,6 +2692,8 @@ def run_transaction(self): cur.close() def execute(self, sql: str | Select | TextClause, params=None): + from sqlite3 import Error + if not isinstance(sql, str): raise TypeError("Query must be a string unless using sqlalchemy.") args = [] if params is None else [params] @@ -2650,10 +2701,10 @@ def execute(self, sql: str | Select | TextClause, params=None): try: cur.execute(sql, *args) return cur - except Exception as exc: + except Error as exc: try: self.con.rollback() - except Exception as inner_exc: # pragma: no cover + except Error as inner_exc: # pragma: no cover ex = DatabaseError( f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" ) @@ -2769,10 +2820,11 @@ def to_sql( frame: DataFrame name: string Name of SQL table. - if_exists: {'fail', 'replace', 'append'}, default 'fail' + if_exists: {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' fail: If table exists, do nothing. 
replace: If table exists, drop it, recreate it, and insert data. append: If table exists, insert data. Create if it does not exist. + delete_rows: If a table exists, delete all records and insert data. index : bool, default True Write DataFrame index as a column index_label : string or sequence, default None @@ -2846,7 +2898,12 @@ def get_table(self, table_name: str, schema: str | None = None) -> None: def drop_table(self, name: str, schema: str | None = None) -> None: drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}" - self.execute(drop_sql) + self.execute(drop_sql).close() + + def delete_rows(self, name: str, schema: str | None = None) -> None: + delete_sql = f"DELETE FROM {_get_valid_sqlite_name(name)}" + if self.has_table(name, schema): + self.execute(delete_sql).close() def _create_sql_schema( self, diff --git a/pandas/meson.build b/pandas/meson.build index 435103a954d86..840ac257bba09 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -1,7 +1,8 @@ -incdir_numpy = run_command(py, - [ - '-c', - ''' +incdir_numpy = run_command( + py, + [ + '-c', + ''' import os import numpy as np try: @@ -12,9 +13,9 @@ try: except Exception: incdir = np.get_include() print(incdir) - ''' - ], - check: true + ''', + ], + check: true, ).stdout().strip() inc_np = include_directories(incdir_numpy) @@ -36,9 +37,9 @@ subdirs_list = [ 'plotting', 'tests', 'tseries', - 'util' + 'util', ] -foreach subdir: subdirs_list +foreach subdir : subdirs_list install_subdir(subdir, install_dir: py.get_install_dir() / 'pandas') endforeach @@ -47,6 +48,6 @@ top_level_py_list = [ '_typing.py', '_version.py', 'conftest.py', - 'testing.py' + 'testing.py', ] py.install_sources(top_level_py_list, subdir: 'pandas') diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index aee872f9ae50a..9670b5439c87e 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -247,11 +247,14 @@ def hist_frame( .. plot:: :context: close-figs - >>> data = {"length": [1.5, 0.5, 1.2, 0.9, 3], "width": [0.7, 0.2, 0.15, 0.2, 1.1]} + >>> data = { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... } >>> index = ["pig", "rabbit", "duck", "chicken", "horse"] >>> df = pd.DataFrame(data, index=index) >>> hist = df.hist(bins=3) - """ # noqa: E501 + """ plot_backend = _get_plot_backend(backend) return plot_backend.hist_frame( data, @@ -845,7 +848,10 @@ class PlotAccessor(PandasObject): :context: close-figs >>> df = pd.DataFrame( - ... {"length": [1.5, 0.5, 1.2, 0.9, 3], "width": [0.7, 0.2, 0.15, 0.2, 1.1]}, + ... { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... }, ... index=["pig", "rabbit", "duck", "chicken", "horse"], ... ) >>> plot = df.plot(title="DataFrame Plot") @@ -866,7 +872,7 @@ class PlotAccessor(PandasObject): >>> df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": ["A", "B", "A", "B"]}) >>> plot = df.groupby("col2").plot(kind="bar", title="DataFrameGroupBy Plot") - """ # noqa: E501 + """ _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box") _series_kinds = ("pie",) @@ -993,8 +999,7 @@ def __call__(self, *args, **kwargs): if kind not in self._all_kinds: raise ValueError( - f"{kind} is not a valid plot kind " - f"Valid plot kinds: {self._all_kinds}" + f"{kind} is not a valid plot kind Valid plot kinds: {self._all_kinds}" ) data = self._parent @@ -1630,7 +1635,9 @@ def area( ... "signups": [5, 5, 6, 12, 14, 13], ... "visits": [20, 42, 28, 62, 81, 50], ... }, - ... 
index=pd.date_range(start="2018/01/01", end="2018/07/01", freq="ME"), + ... index=pd.date_range( + ... start="2018/01/01", end="2018/07/01", freq="ME" + ... ), ... ) >>> ax = df.plot.area() @@ -1662,7 +1669,7 @@ def area( ... } ... ) >>> ax = df.plot.area(x="day") - """ # noqa: E501 + """ return self(kind="area", x=x, y=y, stacked=stacked, **kwargs) def pie(self, y: IndexLabel | None = None, **kwargs) -> PlotAccessor: diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 68682344f98ca..af77972da8634 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -20,6 +20,7 @@ import pandas as pd import pandas.core.common as com +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( @@ -54,7 +55,8 @@ def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> ticks = ax.get_xticks() if is_vertical else ax.get_yticks() if len(ticks) != len(labels): i, remainder = divmod(len(ticks), len(labels)) - assert remainder == 0, remainder + if Version(mpl.__version__) < Version("3.10"): + assert remainder == 0, remainder labels *= i if is_vertical: ax.set_xticklabels(labels, **kwargs) @@ -121,8 +123,7 @@ def _validate_color_args(self, color, colormap): if colormap is not None: warnings.warn( - "'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'", + "'color' and 'colormap' cannot be used simultaneously. Using 'color'", stacklevel=find_stack_level(), ) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index d5624aecd1215..8ee75e7fe553e 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -56,7 +56,7 @@ def format_date_labels(ax: Axes, rot) -> None: fig = ax.get_figure() if fig is not None: # should always be a Figure but can technically be None - maybe_adjust_figure(fig, bottom=0.2) + maybe_adjust_figure(fig, bottom=0.2) # type: ignore[arg-type] def table( diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index b20f8ac5f4796..0e0fb23d924bc 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -30,6 +30,13 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: """ Helper function to convert DataFrame and Series to matplotlib.table. + This method provides an easy way to visualize tabular data within a Matplotlib + figure. It automatically extracts index and column labels from the DataFrame + or Series, unless explicitly specified. This function is particularly useful + when displaying summary tables alongside other plots or when creating static + reports. It utilizes the `matplotlib.pyplot.table` backend and allows + customization through various styling options available in Matplotlib. + Parameters ---------- ax : Matplotlib axes object @@ -389,6 +396,12 @@ def andrews_curves( Returns ------- :class:`matplotlib.axes.Axes` + The matplotlib Axes object with the plot. + + See Also + -------- + plotting.parallel_coordinates : Plot parallel coordinates chart. + DataFrame.plot : Make plots of Series or DataFrame. 
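Stepping back to the `pandas.plotting.table` helper whose docstring is expanded above, a minimal usage sketch (requires matplotlib; the sample data and the `loc` styling option are illustrative):

```python
# Sketch of the pandas.plotting.table helper documented above; requires
# matplotlib. The sample data and loc= option are illustrative.
import matplotlib.pyplot as plt

import pandas as pd
from pandas.plotting import table

df = pd.DataFrame(
    {"length": [1.5, 0.5], "width": [0.7, 0.2]}, index=["pig", "rabbit"]
)
fig, ax = plt.subplots()
ax.axis("off")  # hide the axes so only the table is drawn
table(ax, df, loc="center")  # index/column labels are picked up automatically
plt.show()
```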
Examples -------- diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c1d9f5ea4d25c..4a05259a98087 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -261,6 +261,7 @@ class TestApi(Base): "JsonReader", "NaTType", "NAType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index d36d723c4be6a..b9e407adc3051 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -16,6 +18,7 @@ ) import pandas._testing as tm from pandas.tests.frame.common import zip_frames +from pandas.util.version import Version @pytest.fixture @@ -65,6 +68,13 @@ def test_apply(float_frame, engine, request): @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("nopython", [True, False]) def test_apply_args(float_frame, axis, raw, engine, nopython): + numba = pytest.importorskip("numba") + if ( + engine == "numba" + and Version(numba.__version__) == Version("0.61") + and is_platform_arm() + ): + pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}") engine_kwargs = {"nopython": nopython} result = float_frame.apply( lambda x, y: x + y, diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index d6cd9c321ace6..75bc3f5b74b9d 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm import pandas.util._test_decorators as td import pandas as pd @@ -9,8 +10,17 @@ Index, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] +pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu, pytest.mark.skipif()] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.fixture(params=[0, 1]) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index c52168ae48ca8..e5a9492630b13 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,7 +4,9 @@ import numpy as np import pytest -from pandas.compat import WASM +from pandas.compat import ( + WASM, +) from pandas.core.dtypes.common import is_number @@ -159,17 +161,10 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(request, series, func, expected): +def test_agg_cython_table_transform_series(series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if series.dtype == "string" and func == "cumsum": - request.applymarker( - pytest.mark.xfail( - raises=(TypeError, NotImplementedError), - reason="TODO(infer_string) cumsum not yet implemented for string", - ) - ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/arrays/interval/test_formats.py b/pandas/tests/arrays/interval/test_formats.py index 535efee519374..88c9bf81d718c 100644 --- 
a/pandas/tests/arrays/interval/test_formats.py +++ b/pandas/tests/arrays/interval/test_formats.py @@ -6,8 +6,6 @@ def test_repr(): arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) result = repr(arr) expected = ( - "<IntervalArray>\n" - "[(0, 1], (1, 2]]\n" - "Length: 2, dtype: interval[int64, right]" + "<IntervalArray>\n[(0, 1], (1, 2]]\nLength: 2, dtype: interval[int64, right]" ) assert result == expected diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a32ac7db4656a..336a0fef69170 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -10,7 +10,10 @@ from pandas._config import using_string_dtype -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas.compat.pyarrow import ( + pa_version_under12p0, + pa_version_under19p0, +) from pandas.core.dtypes.common import is_dtype_equal @@ -539,7 +542,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage): result = table.to_pandas() - if dtype.na_value is np.nan and not using_string_dtype(): + if dtype.na_value is np.nan and not using_infer_string: assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) @@ -553,6 +556,21 @@ assert result.loc[2, "a"] is result["a"].dtype.na_value + +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_from_string(using_infer_string): + # not roundtrip, but starting with pyarrow table without pandas metadata + pa = pytest.importorskip("pyarrow") + table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())}) + + result = table.to_pandas() + + if using_infer_string and not pa_version_under19p0: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str") + else: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object") + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -740,3 +758,9 @@ def test_tolist(dtype): result = arr.tolist() expected = vals tm.assert_equal(result, expected) + + +def test_string_array_view_type_error(): + arr = pd.array(["a", "b", "c"], dtype="string") + with pytest.raises(TypeError, match="Cannot change data-type for string array."): + arr.view("i8") diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index c4b02423f8cf0..dffd2009ef373 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -179,3 +179,14 @@ def test_constructor_datetime_nonns(self, constructor): arr.flags.writeable = False result = constructor(arr) tm.assert_equal(result, expected) + + def test_constructor_from_dict_keys(self, constructor, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/60343 + d = {"a": 1, "b": 2} + result = constructor(d.keys(), dtype="str") + if using_infer_string: + assert result.dtype == "str" + else: + assert result.dtype == "object" + expected = constructor(list(d.keys()), dtype="str") + tm.assert_equal(result, expected) diff --git a/pandas/tests/base/test_fillna.py b/pandas/tests/base/test_fillna.py index 7300d3013305a..8c56bcc169d8e 100644 --- a/pandas/tests/base/test_fillna.py +++ b/pandas/tests/base/test_fillna.py @@ -16,7 +16,7 @@ def
test_fillna(index_or_series_obj): obj = index_or_series_obj if isinstance(obj, MultiIndex): - msg = "isna is not defined for MultiIndex" + msg = "fillna is not defined for MultiIndex" with pytest.raises(NotImplementedError, match=msg): obj.fillna(0) return diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 9430ba2c478ae..69200b2e5fc96 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -33,9 +33,9 @@ ( # This is a judgement call, but we do _not_ downcast Decimal # objects - np.array([decimal.Decimal(0.0)]), + np.array([decimal.Decimal("0.0")]), "int64", - np.array([decimal.Decimal(0.0)]), + np.array([decimal.Decimal("0.0")]), ), ( # GH#45837 diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 5a59617ce5bd3..d30fa9fc2ea0f 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -22,6 +22,7 @@ import pandas._testing as tm from pandas.api.types import pandas_dtype from pandas.arrays import SparseArray +from pandas.util.version import Version # EA & Actual Dtypes @@ -788,11 +789,18 @@ def test_validate_allhashable(): def test_pandas_dtype_numpy_warning(): # GH#51523 - with tm.assert_produces_warning( - DeprecationWarning, - check_stacklevel=False, - match="Converting `np.integer` or `np.signedinteger` to a dtype is deprecated", - ): + if Version(np.__version__) < Version("2.3.0.dev0"): + ctx = tm.assert_produces_warning( + DeprecationWarning, + check_stacklevel=False, + match=( + "Converting `np.integer` or `np.signedinteger` to a dtype is deprecated" + ), + ) + else: + ctx = tm.external_error_raised(TypeError) + + with ctx: pandas_dtype(np.integer) @@ -837,6 +845,26 @@ def test_pandas_dtype_string_dtypes(string_storage): assert result == pd.StringDtype(string_storage, na_value=pd.NA) +def test_pandas_dtype_string_dtype_alias_with_storage(): + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[python]") + + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[pyarrow]") + + result = pandas_dtype("string[python]") + assert result == pd.StringDtype("python", na_value=pd.NA) + + if HAS_PYARROW: + result = pandas_dtype("string[pyarrow]") + assert result == pd.StringDtype("pyarrow", na_value=pd.NA) + else: + with pytest.raises( + ImportError, match="required for PyArrow backed StringArray" + ): + pandas_dtype("string[pyarrow]") + + @td.skip_if_installed("pyarrow") def test_construct_from_string_without_pyarrow_installed(): # GH 57928 diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index d4fe6c5264007..571e12d0c3303 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -47,7 +47,7 @@ def test_concat_periodarray_2d(): _concat.concat_compat([arr[:2], arr[2:]], axis=1) -def test_concat_series_between_empty_and_tzaware_series(): +def test_concat_series_between_empty_and_tzaware_series(using_infer_string): tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00") ser1 = Series(index=[tzaware_time], data=0, dtype=float) ser2 = Series(dtype=float) @@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series(): data=[ (0.0, None), ], - index=pd.Index([tzaware_time], dtype=object), + index=[tzaware_time] + if using_infer_string + else pd.Index([tzaware_time], dtype=object), columns=[0, 1], dtype=float, ) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 
b7e37ff270e60..621217a8c9317 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -660,8 +660,7 @@ def test_construction_generic(self, subtype): def test_construction_not_supported(self, subtype): # GH 19016 msg = ( - "category, object, and string subtypes are not supported " - "for IntervalDtype" + "category, object, and string subtypes are not supported for IntervalDtype" ) with pytest.raises(TypeError, match=msg): IntervalDtype(subtype) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index da444b55490f0..db98751324ebc 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1582,6 +1582,31 @@ def test_is_string_array(self): ) assert not lib.is_string_array(np.array([1, 2])) + @pytest.mark.parametrize( + "func", + [ + "is_bool_array", + "is_date_array", + "is_datetime_array", + "is_datetime64_array", + "is_float_array", + "is_integer_array", + "is_interval_array", + "is_string_array", + "is_time_array", + "is_timedelta_or_timedelta64_array", + ], + ) + def test_is_dtype_array_empty_obj(self, func): + # https://github.com/pandas-dev/pandas/pull/60796 + func = getattr(lib, func) + + arr = np.empty((2, 0), dtype=object) + assert not func(arr) + + arr = np.empty((0, 2), dtype=object) + assert not func(arr) + def test_to_object_array_tuples(self): r = (5, 6) values = [r] diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 73c462d492d2d..c61cda83cf6e0 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -321,7 +321,7 @@ def test_period(self): def test_decimal(self): # scalars GH#23530 - a = Decimal(1.0) + a = Decimal("1.0") assert isna(a) is False assert notna(a) is True diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. 
Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 27fa1206f6f7f..1f3680bf67e90 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -139,8 +139,8 @@ def test_getitem_invalid(self, data): "index out of bounds", # pyarrow "Out of bounds access", # Sparse f"loc must be an integer between -{ub} and {ub}", # Sparse - f"index {ub+1} is out of bounds for axis 0 with size {ub}", - f"index -{ub+1} is out of bounds for axis 0 with size {ub}", + f"index {ub + 1} is out of bounds for axis 0 with size {ub}", + f"index -{ub + 1} is out of bounds for axis 0 with size {ub}", ] ) with pytest.raises(IndexError, match=msg): diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index bab8566a06dc2..60cade97ab528 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -113,13 +113,9 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 59f313b4c9edb..2ee6a73ec4054 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -125,7 +125,6 @@ def to_numpy( return result def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): - # if not all( isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs ): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index a68c8a06e1d18..b110911bda400 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -176,8 +176,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): # an ndarary. indexer = np.asarray(indexer) msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." + "Index is out of bounds or cannot do a non-empty take from an empty array." ) if allow_fill: diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index da53bdcb4e37e..8b4728c7d6292 100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -81,8 +81,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): # an ndarary. indexer = np.asarray(indexer) msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." 
+ "Index is out of bounds or cannot do a non-empty take from an empty array." ) if allow_fill: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6dd1f3f15bc15..f4a63ff4c92ec 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. + return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): @@ -959,8 +964,7 @@ def _get_arith_xfail_marker(self, opname, pa_dtype): mark = pytest.mark.xfail( raises=TypeError, reason=( - f"{opname} not supported between" - f"pd.NA and {pa_dtype} Python scalar" + f"{opname} not supported betweenpd.NA and {pa_dtype} Python scalar" ), ) elif opname == "__rfloordiv__" and ( @@ -3451,7 +3455,9 @@ def test_string_to_datetime_parsing_cast(): ) def test_interpolate_not_numeric(data): if not data.dtype._is_numeric: - with pytest.raises(ValueError, match="Values must be numeric."): + ser = pd.Series(data) + msg = re.escape(f"Cannot interpolate with {ser.dtype} dtype") + with pytest.raises(TypeError, match=msg): pd.Series(data).interpolate() @@ -3505,3 +3511,20 @@ def test_map_numeric_na_action(): result = ser.map(lambda x: 42, na_action="ignore") expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") tm.assert_series_equal(result, expected) + + +def test_categorical_from_arrow_dictionary(): + # GH 60563 + df = pd.DataFrame( + {"A": ["a1", "a2"]}, dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8())) + ) + result = df.value_counts(dropna=False) + expected = pd.Series( + [1, 1], + index=pd.MultiIndex.from_arrays( + [pd.Index(["a1", "a2"], dtype=ArrowDtype(pa.string()), name="A")] + ), + name="count", + dtype="int64", + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e19351b2ad058..25129111180d6 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -24,6 +24,8 @@ from pandas.compat import HAS_PYARROW +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -192,6 +194,10 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) + return op_name in ["cummin", "cummax", "cumsum"] + def _cast_pointwise_result(self, op_name: 
str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index ea8e2e8ecc194..b3140bad8276b 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -33,7 +33,7 @@ def float_string_frame(): df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), ) df["foo"] = "bar" return df diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index fc7c03dc25839..845174bbf600e 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -44,7 +42,6 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), @@ -108,7 +105,6 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient="index") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index 1a454351b7085..472bfb7772a80 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -88,12 +88,7 @@ def test_26395(indexer_al): df["D"] = 0 indexer_al(df)["C", "D"] = 2 - expected = DataFrame( - {"D": [0, 0, 2]}, - index=["A", "B", "C"], - columns=pd.Index(["D"], dtype=object), - dtype=np.int64, - ) + expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) tm.assert_frame_equal(df, expected) with pytest.raises(TypeError, match="Invalid value"): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 84c01e0be3b6f..0c99b08cb30c4 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import iNaT from pandas.errors import InvalidIndexError @@ -503,7 +501,6 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] @@ -1141,7 +1138,7 @@ def test_loc_setitem_datetimelike_with_inference(self): result = df.dtypes expected = Series( [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, - index=Index(list("ABCDEFGH"), dtype=object), + index=list("ABCDEFGH"), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index a1d60eb9626d6..b530cb98ef46c 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ 
-68,8 +68,7 @@ def test_insert_with_columns_dups(self): df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) exp = DataFrame( - [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], - columns=Index(["A", "A", "A"], dtype=object), + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] ) tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index cfd7e91c4ceab..20dd7b0c4d3e7 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -144,18 +144,32 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) - def test_setitem_empty_columns(self): - # GH 13522 + def test_setitem_overwrite_index(self): + # GH 13522 - assign the index as a column and then overwrite the values + # -> should not affect the index df = DataFrame(index=["A", "B", "C"]) df["X"] = df.index df["X"] = ["x", "y", "z"] exp = DataFrame( - data={"X": ["x", "y", "z"]}, - index=["A", "B", "C"], - columns=Index(["X"], dtype=object), + data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"] ) tm.assert_frame_equal(df, exp) + def test_setitem_empty_columns(self): + # Starting from an empty DataFrame and setting a column should result + # in a default string dtype for the columns' Index + # https://github.com/pandas-dev/pandas/issues/60338 + + df = DataFrame() + df["foo"] = [1, 2, 3] + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=Index([])) + df["foo"] = [1, 2, 3] + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(df, expected) + def test_setitem_dt64_index_empty_columns(self): rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") df = DataFrame(index=np.arange(len(rng))) @@ -169,9 +183,7 @@ def test_setitem_timestamp_empty_columns(self): df["now"] = Timestamp("20130101", tz="UTC") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, - index=range(3), - columns=Index(["now"], dtype=object), + [[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"] ) tm.assert_frame_equal(df, expected) @@ -210,7 +222,7 @@ def test_setitem_period_preserves_dtype(self): result = DataFrame([]) result["a"] = data - expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object)) + expected = DataFrame({"a": data}, columns=["a"]) tm.assert_frame_equal(result, expected) @@ -930,7 +942,7 @@ def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() df["foo"] = 1 - expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64) + expected = DataFrame(columns=["foo"]).astype(np.int64) tm.assert_frame_equal(df, expected) def test_setitem_newcol_tuple_key(self, float_frame): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index ab3743283ea13..eb1ee4e7b2970 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -745,7 +745,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) GH#60639") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, using_infer_string ): diff --git 
a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 87b7d5052a345..1e594043510ea 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal @@ -32,8 +30,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_combine_first(self, float_frame, using_infer_string): + def test_combine_first(self, float_frame): # disjoint head, tail = float_frame[:5], float_frame[5:] @@ -79,9 +76,7 @@ def test_combine_first(self, float_frame, using_infer_string): tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases - warning = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warning, match="empty entries"): - comb = float_frame.combine_first(DataFrame()) + comb = float_frame.combine_first(DataFrame()) tm.assert_frame_equal(comb, float_frame) comb = DataFrame().combine_first(float_frame) @@ -385,7 +380,7 @@ def test_combine_first_with_asymmetric_other(self, val): df2 = DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = DataFrame({"isBool": [True], "isNum": [val]}) + exp = DataFrame({"isNum": [val], "isBool": [True]}) tm.assert_frame_equal(res, exp) @@ -560,3 +555,13 @@ def test_combine_first_empty_columns(): result = left.combine_first(right) expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) + + +def test_combine_first_preserve_column_order(): + # GH#60427 + df1 = DataFrame({"B": [1, 2, 3], "A": [4, None, 6]}) + df2 = DataFrame({"A": [5]}, index=[1]) + + result = df1.combine_first(df2) + expected = DataFrame({"B": [1, 2, 3], "A": [4.0, 5.0, 6.0]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index e7f6e5d625d3e..d0f30204758d3 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -196,3 +196,18 @@ def test_convert_dtypes_from_arrow(self): result = df.convert_dtypes() expected = df.astype({"a": "string[python]"}) tm.assert_frame_equal(result, expected) + + def test_convert_dtype_pyarrow_timezone_preserve(self): + # GH 60237 + pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "timestamps": pd.Series( + pd.to_datetime(range(5), utc=True, unit="h"), + dtype="timestamp[ns, tz=UTC][pyarrow]", + ) + } + ) + result = df.convert_dtypes(dtype_backend="pyarrow") + expected = df.copy() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index c15952339ef18..d5e94382b8314 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -320,7 +318,6 @@ def test_corrwith_non_timeseries_data(self): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_corrwith_with_objects(self, 
using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -334,9 +331,8 @@ def test_corrwith_with_objects(self, using_infer_string): df2["obj"] = "bar" if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): df1.corrwith(df2) else: with pytest.raises(TypeError, match="Could not convert"): diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 4a60dc09cfe07..11893d7fac1a4 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -184,7 +182,6 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 1685f9ee331f5..bf01ec73cf72b 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -135,13 +133,9 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - if using_infer_string: - expected = Series([np.array(["bar"])]) - else: - expected = Series(["bar"]) + expected = Series(np.array("bar")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 74e4383950174..de6737ec3bc39 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -11,6 +11,7 @@ HAS_PYARROW, IS64, PYPY, + is_platform_arm, ) from pandas import ( @@ -23,6 +24,7 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version @pytest.fixture @@ -522,7 +524,7 @@ def test_info_int_columns(using_infer_string): 0 1 2 non-null int64 1 2 2 non-null int64 dtypes: int64(2) - memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes + memory usage: {"50.0" if using_infer_string and HAS_PYARROW else "48.0+"} bytes """ ) assert result == expected @@ -544,7 +546,9 @@ def test_memory_usage_empty_no_warning(using_infer_string): @pytest.mark.single_cpu def test_info_compute_numba(): # GH#51922 - pytest.importorskip("numba") + numba = pytest.importorskip("numba") + if Version(numba.__version__) == Version("0.61") and is_platform_arm(): + pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}") df = DataFrame([[1, 2], [3, 4]]) with option_context("compute.use_numba", True): diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index b8a34d5eaa226..09d1cc9a479b2 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ 
b/pandas/tests/frame/methods/test_interpolate.py @@ -64,11 +64,7 @@ def test_interpolate_inplace(self, frame_or_series, request): assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 - # TODO(infer_string) raise proper TypeError in case of string dtype - @pytest.mark.xfail( - using_string_dtype(), reason="interpolate doesn't work for string" - ) - def test_interp_basic(self): + def test_interp_basic(self, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -77,7 +73,8 @@ def test_interp_basic(self): "D": list("abcd"), } ) - msg = "DataFrame cannot interpolate with object dtype" + dtype = "str" if using_infer_string else "object" + msg = f"[Cc]annot interpolate with {dtype} dtype" with pytest.raises(TypeError, match=msg): df.interpolate() @@ -87,8 +84,8 @@ def test_interp_basic(self): df.interpolate(inplace=True) # check we DID operate inplace - assert np.shares_memory(df["C"]._values, cvalues) - assert np.shares_memory(df["D"]._values, dvalues) + assert tm.shares_memory(df["C"]._values, cvalues) + assert tm.shares_memory(df["D"]._values, dvalues) @pytest.mark.xfail( using_string_dtype(), reason="interpolate doesn't work for string" diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 479ea7d7ba692..aaa9485cab580 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -277,7 +277,20 @@ def test_join_index(float_frame): tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) - join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof" + # left anti + joined = f.join(f2, how="left_anti") + tm.assert_index_equal(joined.index, float_frame.index[:5]) + tm.assert_index_equal(joined.columns, expected_columns) + + # right anti + joined = f.join(f2, how="right_anti") + tm.assert_index_equal(joined.index, float_frame.index[10:][::-1]) + tm.assert_index_equal(joined.columns, expected_columns) + + join_msg = ( + "'foo' is not a valid Merge type: left, right, inner, outer, " + "left_anti, right_anti, cross, asof" + ) with pytest.raises(ValueError, match=re.escape(join_msg)): f.join(f2, how="foo") diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index b2320798ea9a2..9e302dc5f94ee 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -334,7 +334,6 @@ def test_regex_replace_str_to_numeric(self, mix_abc): return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) assert return_value is None expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) - # TODO(infer_string) expec["c"] = expec["c"].astype(object) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) @@ -713,6 +712,13 @@ def test_replace_with_None_keeps_categorical(self): ) tm.assert_frame_equal(result, expected) + def test_replace_all_NA(self): + # GH#60688 + df = DataFrame({"ticker": ["#1234#"], "name": [None]}) + result = df.replace({col: {r"^#": "$"} for col in df.columns}, regex=True) + expected = DataFrame({"ticker": ["$1234#"], "name": [None]}) + tm.assert_frame_equal(result, expected) + def test_replace_value_is_none(self, datetime_frame): orig_value = datetime_frame.iloc[0, 0] orig2 = datetime_frame.iloc[1, 0] @@ -1469,21 +1475,24 @@ def test_regex_replace_scalar( tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("regex", [False, True]) - def 
test_replace_regex_dtype_frame(self, regex): + @pytest.mark.parametrize("value", [1, "1"]) + def test_replace_regex_dtype_frame(self, regex, value): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) - expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=object) - result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + # When value is an integer, coerce result to object. + # When value is a string, infer the correct string dtype. + dtype = object if value == 1 else None + + expected_df1 = DataFrame({"A": [value], "B": [value]}, dtype=dtype) + result_df1 = df1.replace(to_replace="0", value=value, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) if regex: - # TODO(infer_string): both string columns get cast to object, - # while only needed for column A - expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=object) + expected_df2 = DataFrame({"A": [value], "B": ["1"]}, dtype=dtype) else: - expected_df2 = DataFrame({"A": Series([1], dtype=object), "B": ["1"]}) - result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + expected_df2 = DataFrame({"A": Series([value], dtype=dtype), "B": ["1"]}) + result_df2 = df2.replace(to_replace="0", value=value, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) def test_replace_with_value_also_being_replaced(self): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 88e43b678a7e4..80227c0462329 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -644,7 +642,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): tm.assert_frame_equal(res, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "array, dtype", [ @@ -781,3 +778,34 @@ def test_reset_index_false_index_name(): result_frame.reset_index() expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_frame_equal(result_frame, expected_frame) + + +@pytest.mark.parametrize("columns", [None, Index([])]) +def test_reset_index_with_empty_frame(columns): + # Currently empty DataFrame has RangeIndex or object dtype Index, but when + # resetting the index we still want to end up with the default string dtype + # https://github.com/pandas-dev/pandas/issues/60338 + + index = Index([], name="foo") + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame(columns=["foo"]) + tm.assert_frame_equal(result, expected) + + index = Index([1, 2, 3], name="foo") + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame({"foo": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([], names=["foo", "bar"]) + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame(columns=["foo", "bar"]) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"]) + df = DataFrame(index=index, columns=columns) + result = df.reset_index() + expected = DataFrame({"foo": [1, 2], "bar": [2, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 
91d735a8b2fa7..a9d56cbfd2b46 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -198,8 +198,7 @@ def test_sample_upsampling_without_replacement(self, frame_or_series): obj = tm.get_obj(obj, frame_or_series) msg = ( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1." + "Replace has to be set to `True` when upsampling the population `frac` > 1." ) with pytest.raises(ValueError, match=msg): obj.sample(frac=2, replace=False) diff --git a/pandas/tests/frame/methods/test_set_axis.py b/pandas/tests/frame/methods/test_set_axis.py index 1967941bca9f0..7b75bcf4f348d 100644 --- a/pandas/tests/frame/methods/test_set_axis.py +++ b/pandas/tests/frame/methods/test_set_axis.py @@ -93,7 +93,7 @@ def test_set_axis_setattr_index_wrong_length(self, obj): # wrong length msg = ( f"Length mismatch: Expected axis has {len(obj)} elements, " - f"new values have {len(obj)-1} elements" + f"new values have {len(obj) - 1} elements" ) with pytest.raises(ValueError, match=msg): obj.index = np.arange(len(obj) - 1) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 23377b7373987..9eafc69013ffe 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserError import pandas as pd @@ -438,20 +436,18 @@ def test_to_csv_empty(self): result, expected = self._return_result_expected(df, 1000) tm.assert_frame_equal(result, expected, check_column_type=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.slow def test_to_csv_chunksize(self): chunksize = 1000 rows = chunksize // 2 + 1 df = DataFrame( np.ones((rows, 2)), - columns=Index(list("ab"), dtype=object), + columns=Index(list("ab")), index=MultiIndex.from_arrays([range(rows) for _ in range(2)]), ) result, expected = self._return_result_expected(df, chunksize, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] @@ -480,7 +476,7 @@ def test_to_csv_params(self, nrows, df_params, func_params, ncols): for _ in range(df_params["c_idx_nlevels"]) ) else: - columns = Index([f"i-{i}" for i in range(ncols)], dtype=object) + columns = Index([f"i-{i}" for i in range(ncols)]) df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) result, expected = self._return_result_expected(df, 1000, **func_params) tm.assert_frame_equal(result, expected, check_names=False) @@ -738,7 +734,6 @@ def test_to_csv_withcommas(self, temp_file): df2 = self.read_csv(path) tm.assert_frame_equal(df2, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_mixed(self, temp_file): def create_cols(name): return [f"{name}{i:03d}" for i in range(5)] @@ -755,7 +750,7 @@ def create_cols(name): ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( - "foo", index=df_float.index, columns=create_cols("object") + "foo", index=df_float.index, columns=create_cols("object"), dtype="object" ) df_dt = DataFrame( Timestamp("20010101"), @@ -824,13 +819,12 @@ def test_to_csv_dups_cols(self, temp_file): result.columns = df.columns tm.assert_frame_equal(result, df) - 
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_dups_cols2(self, temp_file): # GH3457 df = DataFrame( np.ones((5, 3)), index=Index([f"i-{i}" for i in range(5)], name="foo"), - columns=Index(["a", "a", "b"], dtype=object), + columns=Index(["a", "a", "b"]), ) path = str(temp_file) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index de5029b9f18b2..43db234267f21 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd import pandas._testing as tm @@ -136,9 +132,6 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) def test_data_frame_value_counts_dropna_false(nulls_fixture): # GH 41334 df = pd.DataFrame( diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 6b61fe8b05219..8239de3f39c20 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas as pd from pandas import ( @@ -2033,6 +2033,51 @@ def test_arithmetic_multiindex_align(): tm.assert_frame_equal(result, expected) +def test_arithmetic_multiindex_column_align(): + # GH#60498 + df1 = DataFrame( + data=100, + columns=MultiIndex.from_product( + [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"] + ), + index=["C1", "C2"], + ) + df2 = DataFrame( + data=np.array([[0.1, 0.25], [0.2, 0.45]]), + columns=MultiIndex.from_product([["1A", "1B"]], names=["Lev1"]), + index=["C1", "C2"], + ) + expected = DataFrame( + data=np.array([[10.0, 10.0, 25.0, 25.0], [20.0, 20.0, 45.0, 45.0]]), + columns=MultiIndex.from_product( + [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"] + ), + index=["C1", "C2"], + ) + result = df1 * df2 + tm.assert_frame_equal(result, expected) + + +def test_arithmetic_multiindex_column_align_with_fillvalue(): + # GH#60903 + df1 = DataFrame( + data=[[1.0, 2.0]], + columns=MultiIndex.from_tuples([("A", "one"), ("A", "two")]), + ) + df2 = DataFrame( + data=[[3.0, 4.0]], + columns=MultiIndex.from_tuples([("B", "one"), ("B", "two")]), + ) + expected = DataFrame( + data=[[1.0, 2.0, 3.0, 4.0]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "two"), ("B", "one"), ("B", "two")] + ), + ) + result = df1.add(df2, fill_value=0) + tm.assert_frame_equal(result, expected) + + def test_bool_frame_mult_float(): # GH 18549 df = DataFrame(True, list("ab"), list("cd")) @@ -2101,12 +2146,19 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_mixed_col_index_dtype(): +def test_mixed_col_index_dtype(using_infer_string): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) df1.columns = df2.columns.astype("string") result = df1 + df2 expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) + if using_infer_string: + # df2.columns.dtype will be "str" instead of object, + # so the aligned result will be "string", not object + if HAS_PYARROW: + dtype = 
"string[pyarrow]" + else: + dtype = "string" + expected.columns = expected.columns.astype(dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 25e66a0e1c03d..6fdbfac8f4e0a 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -162,21 +160,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_mixed(self, float_string_frame, using_infer_string): - # test construction edge cases with mixed types - - # f7u12, this does not work without extensive workaround - data = [ - [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], - ] - df = DataFrame(data) - - # check dtypes - result = df.dtypes - expected = Series({"datetime64[us]": 3}) - # mixed-type frames float_string_frame["datetime"] = datetime.now() float_string_frame["timedelta"] = timedelta(days=1, seconds=1) @@ -196,13 +180,11 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert arr = np.array([1, 2, 3], dtype="timedelta64[s]") - df = DataFrame(index=range(3)) - df["A"] = arr + df = DataFrame({"A": arr}) expected = DataFrame( {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3) ) @@ -220,11 +202,11 @@ def test_construction_with_conversions(self): assert expected.dtypes["dt1"] == "M8[s]" assert expected.dtypes["dt2"] == "M8[s]" - df = DataFrame(index=range(3)) - df["dt1"] = np.datetime64("2013-01-01") - df["dt2"] = np.array( + dt1 = np.datetime64("2013-01-01") + dt2 = np.array( ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]" ) + df = DataFrame({"dt1": dt1, "dt2": dt2}) # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') @@ -401,14 +383,17 @@ def test_update_inplace_sets_valid_block_values(): assert isinstance(df._mgr.blocks[0].values, Categorical) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 # create non-consolidated dataframe with object dtype columns - df = DataFrame() - df["col1"] = Series(["a"], dtype=object) + df = DataFrame( + { + "col1": Series(["a"], dtype=object), + } + ) df["col2"] = Series([0], dtype=object) + assert not df._mgr.is_consolidated() # access column (item cache) df["col1"] == "A" diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 9b6080603f0c9..037a2ae294bb2 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,8 +21,6 @@ from numpy.ma import mrecords import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -1974,7 +1972,6 @@ def test_constructor_with_datetimes4(self): df = DataFrame({"value": 
dr}) assert str(df.iat[0, 0].tz) == "US/Eastern" - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_constructor_with_datetimes5(self): # GH 7822 # preserver an index with a tz on dict construction diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index ca572b1026526..375b9b00a4988 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -791,7 +791,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): tm.assert_frame_equal(result, expected) expected = DataFrame(df_index) - expected.columns = expected.columns.astype(object) result = df.reset_index().query('"2018-01-03 00:00:00+00" < time') tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index fde4dfeed9c55..64e686d25faa7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -672,23 +672,10 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self, using_infer_string): - # Check for the warning that is raised when the mode - # results cannot be sorted - + def test_mode_sort_with_na(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - - # TODO(infer_string) avoid this UserWarning for python storage - warning = ( - None - if using_infer_string and df.A.dtype.storage == "pyarrow" - else UserWarning - ) - with tm.assert_produces_warning(warning, match="Unable to sort modes"): - result = df.mode(dropna=False) - result = result.sort_values(by="A").reset_index(drop=True) - + result = df.mode(dropna=False) tm.assert_frame_equal(result, expected) def test_mode_empty_df(self): @@ -1557,6 +1544,44 @@ def test_min_max_dt64_with_NaT(self): exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_precision(self): + # GH#60646 Make sure the reduction doesn't cast input timestamps to + # float and lose precision. + df = DataFrame( + {"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01 09:20:00.123456789")]}, + dtype="datetime64[ns]", + ) + + res = df.min(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + res = df.max(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + def test_min_max_td64_with_NaT_precision(self): + # GH#60646 Make sure the reduction doesn't cast input timedeltas to + # float and lose precision. 
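+        # (Why precision is lost: a float64 significand holds 53 bits, so
+        # nanosecond counts above 2**53 ns, roughly 104 days, cannot round-trip
+        # through float exactly; the 10000-day timedelta below is such a value.)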
+ df = DataFrame( + { + "foo": [ + pd.NaT, + pd.NaT, + to_timedelta("10000 days 06:05:01.123456789"), + ], + }, + dtype="timedelta64[ns]", + ) + + res = df.min(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + res = df.max(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture): # GH#36907 tz = tz_naive_fixture diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 57c803c23b001..22fdfd3a01408 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib import pandas as pd @@ -1454,6 +1452,25 @@ def test_stack_empty_frame(dropna, future_stack): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") +@pytest.mark.parametrize("dropna", [True, False, lib.no_default]) +def test_stack_empty_level(dropna, future_stack, int_frame): + # GH 60740 + if future_stack and dropna is not lib.no_default: + with pytest.raises(ValueError, match="dropna must be unspecified"): + DataFrame(dtype=np.int64).stack(dropna=dropna, future_stack=future_stack) + else: + expected = int_frame + result = int_frame.copy().stack( + level=[], dropna=dropna, future_stack=future_stack + ) + tm.assert_frame_equal(result, expected) + + expected = DataFrame() + result = DataFrame().stack(level=[], dropna=dropna, future_stack=future_stack) + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) @pytest.mark.parametrize("fill_value", [None, 0]) @@ -1675,7 +1692,6 @@ def test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -1858,10 +1874,7 @@ def test_unstack_bug(self, future_stack): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) - + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) @@ -1926,7 +1939,6 @@ def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 7d18ef28a722d..cbd563a03b908 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -769,6 +769,13 @@ def test_constructor_with_metadata(): assert isinstance(subset, MySubclassWithMetadata) +def test_constructor_with_metadata_from_records(): + # GH#57008 + df = MySubclassWithMetadata.from_records([{"a": 1, "b": 2}]) + assert 
df.my_metadata is None + assert type(df) is MySubclassWithMetadata + + class SimpleDataFrameSubClass(DataFrame): """A subclass of DataFrame that does not define a constructor.""" diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 217255e73b450..652f52bd226af 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,9 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -122,9 +119,6 @@ def test_pos_object(self, df_data): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) @pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning") def test_pos_object_raises(self): # GH#21380 diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 433e559ef620e..a88090b00499d 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -644,7 +644,7 @@ def test_timedelta_methods(method): operator.methodcaller("add_categories", ["c"]), operator.methodcaller("as_ordered"), operator.methodcaller("as_unordered"), - lambda x: getattr(x, "codes"), + lambda x: x.codes, operator.methodcaller("remove_categories", "a"), operator.methodcaller("remove_unused_categories"), operator.methodcaller("rename_categories", {"a": "A", "b": "B"}), diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 15c1efe5fd1ff..afddc90fdd055 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError from pandas import ( @@ -11,8 +12,17 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) def test_correct_function_signature(): @@ -186,6 +196,23 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["sum", "mean", "var", "std", "min", "max"]) +def test_multifunc_numba_vs_cython_frame_noskipna(func): + pytest.importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], + 1: [1.0, np.nan, 3.0, 4.0, 5.0], + 2: [1, 2, 3, 4, 5], + }, + columns=[0, 1, 2], + ) + grouped = data.groupby(0) + result = grouped.agg(func, skipna=False, engine="numba") + expected = grouped.agg(func, skipna=False, engine="cython") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "agg_kwargs,expected_func", [ diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index ce78b58e5d8f4..1c016143d50c3 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -499,17 +499,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[0])["B"].iloc[0] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[-1])["B"].iloc[0] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py new file mode 100644 index 0000000000000..21b7c50c3c5aa --- /dev/null +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -0,0 +1,90 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + + +def test_groupby_kurt_equivalence(): + # GH#40139 + # Test that the groupby kurt method (which uses libgroupby.group_kurt) + # matches the results of operating group-by-group (which uses nanops.nankurt) + nrows = 1000 + ngroups = 3 + ncols = 2 + nan_frac = 0.05 + + arr = np.random.default_rng(2).standard_normal((nrows, ncols)) + arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan + + df = pd.DataFrame(arr) + grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) + gb = df.groupby(grps) + + result = gb.kurt() + + grpwise = [grp.kurt().to_frame(i).T for i, grp in gb] + expected = pd.concat(grpwise, axis=0) + expected.index = expected.index.astype("int64") # 32bit builds + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + "Float64", + ], +) +def test_groupby_kurt_arrow_float64(dtype): + # GH#40139 + # Test groupby.kurt() with float64[pyarrow] and Float64 dtypes + df = pd.DataFrame( + { + "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], + }, + dtype=dtype, + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt() + expected = pd.DataFrame({"x": [2.1644713], "y": [0.1513969]}, dtype=dtype) + tm.assert_almost_equal(result, expected) + + +def test_groupby_kurt_noskipna(): + # GH#40139 + # Test groupby.kurt() with skipna=False + df = pd.DataFrame( + { + "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], + } + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt(skipna=False) + expected = pd.DataFrame({"x": [np.nan], "y": [0.1513969]}) + tm.assert_almost_equal(result, expected) + + +def test_groupby_kurt_all_ones(): + # GH#40139 + # Test groupby.kurt() with constant values + df = pd.DataFrame( + { + "x": [1.0] * 10, + } + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt(skipna=False) + expected = pd.DataFrame( + { + "x": [0.0], # Same behavior as pd.DataFrame.kurt() + } + ) + tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 2dc89bc75746f..6664563bd2272 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -76,18 +74,16 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) -# TODO(infer_string) in case the column is
object dtype, it should preserve that dtype -# for the result's index -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_size_strings(any_string_dtype): +def test_size_strings(any_string_dtype, using_infer_string): # GH#55627 dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype expected = Series( [2, 1], - index=Index(["a", "b"], name="a", dtype=dtype), + index=Index(["a", "b"], name="a", dtype=exp_index_dtype), name="b", dtype=exp_dtype, ) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8ca6593a19f20..1050f8154572a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -324,12 +324,9 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - warn = DeprecationWarning if groupby == "column" else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) if as_index: tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 013b308cd14cd..215e627abb018 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -74,6 +74,7 @@ def test_tab_completion(multiindex_dataframe_random_data): "all", "shift", "skew", + "kurt", "take", "pct_change", "any", @@ -173,13 +174,13 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("nunique",): exclude_expected = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + elif groupby_func in ("sum", "mean", "std", "var"): + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): @@ -231,13 +232,13 @@ def test_series_consistency(request, groupby_func): if groupby_func in ("any", "all"): exclude_expected = {"kwargs", "bool_only", "axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + elif groupby_func in ("sum", "mean", "std", "var"): + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in 
("cummax", "cummin"): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1a4127ab49b0e..5bf16ee9ad0b8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -27,12 +27,9 @@ def test_apply_func_that_appends_group_to_list_without_copy(): def store(group): groups.append(group) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("index").apply(store) - expected_value = DataFrame( - {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) - ) + df.groupby("index").apply(store) + expected_value = DataFrame({0: [1] * 10}, index=pd.RangeIndex(0, 100, 10)) + expected_value.columns = expected_value.columns.astype(object) tm.assert_frame_equal(groups[0], expected_value) @@ -111,11 +108,7 @@ def test_apply_index_date_object(): ] exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) tm.assert_series_equal(result, expected) @@ -189,9 +182,7 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("a", group_keys=False).apply(func) + df.groupby("a").apply(func) assert names == group_names @@ -209,11 +200,9 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -233,15 +222,27 @@ def slow(group): def fast(group): return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - fast_df = df.groupby("A", group_keys=False).apply(fast) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - slow_df = df.groupby("A", group_keys=False).apply(slow) - + fast_df = df.groupby("A", group_keys=False).apply(fast) + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) +def test_apply_fast_slow_identical_index(): + # GH#44803 + df = DataFrame( + { + "name": ["Alice", "Bob", "Carl"], + "age": [20, 21, 20], + } + ).set_index("name") + + grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) + tm.assert_frame_equal(grp_by_same_value, grp_by_copy) + + @pytest.mark.parametrize( "func", [ @@ -258,11 +259,8 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): # transparent to the user df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - - msg = "DataFrameGroupBy.apply operated on the 
grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("g", group_keys=False).apply(func) - tm.assert_frame_equal(result, df) + result = df.groupby("g", group_keys=False).apply(func) + tm.assert_frame_equal(result, df[["a", "b"]]) def test_apply_with_mixed_dtype(): @@ -273,19 +271,19 @@ def test_apply_with_mixed_dtype(): "foo2": ["one", "two", "two", "three", "one", "two"], } ) - result = df.apply(lambda x: x, axis=1).dtypes - expected = df.dtypes - tm.assert_series_equal(result, expected) + result = df.apply(lambda x: x, axis=1) + expected = df + tm.assert_frame_equal(result, expected) # GH 3610 incorrect dtype conversion with as_index=False df = DataFrame({"c1": [1, 2, 6, 6, 8]}) df["c2"] = df.c1 / 2.0 - result1 = df.groupby("c2").mean().reset_index().c2 - result2 = df.groupby("c2", as_index=False).mean().c2 - tm.assert_series_equal(result1, result2) + result1 = df.groupby("c2").mean().reset_index() + result2 = df.groupby("c2", as_index=False).mean() + tm.assert_frame_equal(result1, result2) -def test_groupby_as_index_apply(): +def test_groupby_as_index_apply(as_index): # GH #4648 and #3417 df = DataFrame( { @@ -294,38 +292,41 @@ def test_groupby_as_index_apply(): "time": range(6), } ) + gb = df.groupby("user_id", as_index=as_index) - g_as = df.groupby("user_id", as_index=True) - g_not_as = df.groupby("user_id", as_index=False) - - res_as = g_as.head(2).index - res_not_as = g_not_as.head(2).index - exp = Index([0, 1, 2, 4]) - tm.assert_index_equal(res_as, exp) - tm.assert_index_equal(res_not_as, exp) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res_as_apply = g_as.apply(lambda x: x.head(2)).index - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + expected = DataFrame( + { + "item_id": ["b", "b", "a", "a"], + "user_id": [1, 2, 1, 3], + "time": [0, 1, 2, 4], + }, + index=[0, 1, 2, 4], + ) + result = gb.head(2) + tm.assert_frame_equal(result, expected) # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = Index([0, 2, 1, 4]) - tp = [(1, 0), (1, 2), (2, 1), (3, 4)] - exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) - - tm.assert_index_equal(res_as_apply, exp_as_apply) - tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) + if as_index: + tp = [(1, 0), (1, 2), (2, 1), (3, 4)] + index = MultiIndex.from_tuples(tp, names=["user_id", None]) + else: + index = Index([0, 2, 1, 4]) + expected = DataFrame( + { + "item_id": list("baba"), + "time": [0, 2, 1, 4], + }, + index=index, + ) + result = gb.apply(lambda x: x.head(2)) + tm.assert_frame_equal(result, expected) def test_groupby_as_index_apply_str(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -354,19 +355,13 @@ def desc3(group): # weirdo return result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(desc) + result = grouped.apply(desc) assert 
result.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = grouped.apply(desc2) + result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result3 = grouped.apply(desc3) + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -396,9 +391,7 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["A", "B"]).apply(len) + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -409,9 +402,7 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -420,9 +411,7 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -445,9 +434,7 @@ def trans2(group): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(trans) + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -476,10 +463,8 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) - expected = df.take([0, 1, 3, 4, 6, 7]) + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + expected = df[["value"]].take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None] @@ -499,9 +484,7 @@ def test_apply_no_name_column_conflict(): # it works! 
#2605 grouped = df.groupby(["name", "name2"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -518,11 +501,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) @@ -544,13 +525,10 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) - tm.assert_frame_equal(result, expected) @@ -584,11 +562,8 @@ def filt2(x): else: return x[x.category == "c"] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = data.groupby("id_field").apply(filt1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = data.groupby("id_field").apply(filt2) + expected = data.groupby("id_field").apply(filt1) + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -601,19 +576,11 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): if test_series: ser = df.set_index("Y")["X"] result = ser.groupby(level=0, group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_index() - expected = ser.sort_index() + expected = ser tm.assert_series_equal(result, expected) else: - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("Y", group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_values("Y") - expected = df.sort_values("Y") + result = df.groupby("Y", group_keys=False).apply(lambda x: x) + expected = df[["X"]] tm.assert_frame_equal(result, expected) @@ -654,9 +621,7 @@ def f(g): g["value3"] = g["value1"] * 2 return g - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert "value3" in result @@ -670,13 +635,9 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["Number"]).apply(lambda 
x: x.iloc[0]) + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) @@ -689,9 +650,7 @@ def test_apply_numeric_coercion_when_datetime_getitem(): def get_B(g): return g.iloc[0][["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(get_B)["B"] + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -718,11 +677,8 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df1.groupby("Key").apply(predictions).p1 - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df2.groupby("Key").apply(predictions).p1 + expected = df1.groupby("Key").apply(predictions).p1 + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -737,13 +693,11 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} - ) + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} ) + ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -786,15 +740,11 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1), "c": 2}, index=[1] ) @@ -838,11 +788,8 @@ def test_groupby_apply_all_none(): def test_func(x): pass - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = test_df.groupby("groups").apply(test_func) - expected = DataFrame(columns=test_df.columns) - expected = expected.astype(test_df.dtypes) + result = test_df.groupby("groups").apply(test_func) + expected = DataFrame(columns=["random_vars"], dtype="int64") tm.assert_frame_equal(result, expected) @@ -852,12 +799,12 @@ def test_func(x): [ {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}, [[1, 1], [0, 2]], - {"groups": [1, 1], "vars": [0, 2]}, + {"vars": [0, 2]}, ], [ {"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}, [[2, 2], [1, 3]], - {"groups": [2, 2], "vars": [1, 3]}, + {"vars": [1, 3]}, ], ], ) @@ -870,9 +817,7 @@ def test_func(x): return None return x.iloc[[0, -1]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, 
match=msg): - result1 = test_df1.groupby("groups").apply(test_func) + result1 = test_df1.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays(out_idx, names=["groups", None]) expected1 = DataFrame(out_data, index=index1) tm.assert_frame_equal(result1, expected1) @@ -882,9 +827,7 @@ def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = groups.apply(lambda group: group[group.value != 1]["value"]) + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -909,9 +852,7 @@ def test_apply_with_mixed_types(meth): def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("a").apply(lambda g: g.index) + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -928,9 +869,7 @@ def test_apply_datetime_issue(group_column_dtlike): # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -967,9 +906,7 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = tdf.groupby("day").apply(most_common_values)["userId"] + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -1010,13 +947,11 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): ], columns=["observation", "color", "mood", "intensity", "score"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( - [np.dtype("datetime64[us]"), dtype, dtype, np.int64, dtype], - index=["observation", "color", "mood", "intensity", "score"], + [np.dtype("datetime64[us]"), dtype, np.int64, dtype], + index=["observation", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @@ -1033,10 +968,8 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("group", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["value"]]) @pytest.mark.parametrize( @@ -1058,9 +991,7 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("groups").apply(function) + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -1072,9 +1003,7 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(fct) + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -1085,9 +1014,7 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("id").apply(function) + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1123,9 +1050,7 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. 
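    # (With group_keys=True, apply() prepends the group keys as an extra index
    # level on the result; with group_keys=False each piece keeps its original
    # index, whether or not the UDF is a transform.)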
df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1140,11 +1065,8 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A", group_keys=False).apply(lambda x: x) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + result = df.groupby("A", group_keys=False).apply(lambda x: x) + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1156,15 +1078,8 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df1.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df2.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) + result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) + result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) tm.assert_frame_equal(result1, result2) @@ -1187,7 +1102,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - result = grp.apply(np.sum, axis=0, include_groups=False) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() @@ -1201,7 +1116,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): msg = "" with tm.assert_produces_warning(warn, match=msg): _ = getattr(grp, reduction_func)(*args) - result = grp.apply(np.sum, axis=0, include_groups=False) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) @@ -1223,14 +1138,12 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grp.apply(lambda x: x.head(1)) + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) - expected = expected.drop(columns=["idx"]) + expected = expected.drop(columns=["A", "B", "idx"]) tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: @@ -1247,10 +1160,8 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group", 
dropna=dropna, group_keys=False).apply(lambda x: x) - expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + expected = df.dropna()[["col"]] if dropna else df[["col"]].iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1274,9 +1185,7 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1286,9 +1195,7 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1308,12 +1215,10 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) - tm.assert_frame_equal(result, expected) + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) + tm.assert_frame_equal(result, expected[["date", "vals"]]) def test_groupby_apply_shape_cache_safety(): @@ -1354,32 +1259,27 @@ def test_apply_na(dropna): {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } - ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) + ) expected = DataFrame( - [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], - columns=["a", "b", "c"], + [[pd.to_datetime(2, unit="s")], [pd.to_datetime(4, unit="s")]], + columns=["c"], index=MultiIndex.from_tuples([(1, ""), (2, "")], names=["a", "b"]), ) tm.assert_frame_equal(result, expected) @@ -1401,11 +1301,9 @@ def 
test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1452,10 +1350,9 @@ def test_apply_index_key_error_bug(index_values): ) def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 - expected = DataFrame({"col": arg}, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + df = DataFrame({"grp": arg, "col": arg}, index=idx) + result = df.groupby("grp", group_keys=False).apply(lambda x: x) + expected = df[["col"]] tm.assert_frame_equal(result, expected) @@ -1484,6 +1381,7 @@ def test_result_name_when_one_group(name): ("apply", lambda gb: gb.values[-1]), ("apply", lambda gb: gb["b"].iloc[0]), ("agg", "skew"), + ("agg", "kurt"), ("agg", "prod"), ("agg", "sum"), ], @@ -1492,7 +1390,7 @@ def test_empty_df(method, op): # GH 47985 empty_df = DataFrame({"a": [], "b": []}) gb = empty_df.groupby("a", group_keys=True) - group = getattr(gb, "b") + group = gb.b result = getattr(group, method)(op) expected = Series( @@ -1502,19 +1400,12 @@ def test_empty_df(method, op): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("include_groups", [True, False]) -def test_include_groups(include_groups): +def test_include_groups(): # GH#7155 df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) gb = df.groupby("a") - warn = DeprecationWarning if include_groups else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - result = gb.apply(lambda x: x.sum(), include_groups=include_groups) - expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) - if not include_groups: - expected = expected[["b"]] - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match="include_groups=True is no longer allowed"): + gb.apply(lambda x: x.sum(), include_groups=True) @pytest.mark.parametrize("func, value", [(max, 2), (min, 1), (sum, 3)]) @@ -1523,7 +1414,7 @@ def test_builtins_apply(func, value): # Builtins act as e.g. 
sum(group), which sums the column labels of group
     df = DataFrame({0: [1, 1, 2], 1: [3, 4, 5], 2: [3, 4, 5]})
     gb = df.groupby(0)
-    result = gb.apply(func, include_groups=False)
+    result = gb.apply(func)
     expected = Series([value, value], index=Index([1, 2], name=0))
     tm.assert_series_equal(result, expected)
@@ -1544,9 +1435,7 @@ def f_0(grp):
         return grp.iloc[0]

     expected = df.groupby("A").first()[["B"]]
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("A").apply(f_0)[["B"]]
+    result = df.groupby("A").apply(f_0)[["B"]]
     tm.assert_frame_equal(result, expected)

     def f_1(grp):
@@ -1554,9 +1443,7 @@ def f_1(grp):
             return None
         return grp.iloc[0]

-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("A").apply(f_1)[["B"]]
+    result = df.groupby("A").apply(f_1)[["B"]]
     e = expected.copy()
     e.loc["Tiger"] = np.nan
     tm.assert_frame_equal(result, e)
@@ -1566,9 +1453,7 @@ def f_2(grp):
             return None
         return grp.iloc[0]

-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("A").apply(f_2)[["B"]]
+    result = df.groupby("A").apply(f_2)[["B"]]
     e = expected.copy()
     e.loc["Pony"] = np.nan
     tm.assert_frame_equal(result, e)
@@ -1579,9 +1464,7 @@ def f_3(grp):
             return None
         return grp.iloc[0]

-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("A").apply(f_3)[["C"]]
+    result = df.groupby("A").apply(f_3)[["C"]]
     e = df.groupby("A").first()[["C"]]
     e.loc["Pony"] = pd.NaT
     tm.assert_frame_equal(result, e)
@@ -1592,10 +1475,42 @@ def f_4(grp):
             return None
         return grp.iloc[0].loc["C"]

-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("A").apply(f_4)
+    result = df.groupby("A").apply(f_4)
     e = df.groupby("A").first()["C"].copy()
     e.loc["Pony"] = np.nan
     e.name = None
     tm.assert_series_equal(result, e)
+
+
+def test_nonreducer_nontransform():
+    # GH3380, GH60619
+    # Was originally testing mutation inside a UDF; now kept as an example
+    # of using apply with a function that neither reduces nor transforms.
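+    # The UDF below returns a Series indexed by cat2 for each cat1 group:
+    # neither a scalar reduction nor an index-preserving transform, so the
+    # per-group results are stacked under a (cat1, cat2) MultiIndex.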
+ df = DataFrame( + { + "cat1": ["a"] * 8 + ["b"] * 6, + "cat2": ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2 + + ["f"] * 2 + + ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2, + "val": np.random.default_rng(2).integers(100, size=14), + } + ) + + def f(x): + x = x.copy() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() + + expected = DataFrame( + { + "cat1": list("aaaabbb"), + "cat2": list("cdefcde"), + "rank": [3.0, 2.0, 5.0, 1.0, 2.0, 4.0, 1.0], + } + ).set_index(["cat1", "cat2"])["rank"] + result = df.groupby("cat1").apply(f) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py deleted file mode 100644 index fa20efad4da77..0000000000000 --- a/pandas/tests/groupby/test_apply_mutate.py +++ /dev/null @@ -1,96 +0,0 @@ -import numpy as np - -import pandas as pd -import pandas._testing as tm - - -def test_group_by_copy(): - # GH#44803 - df = pd.DataFrame( - { - "name": ["Alice", "Bob", "Carl"], - "age": [20, 21, 20], - } - ).set_index("name") - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grp_by_same_value = df.groupby(["age"], group_keys=False).apply( - lambda group: group - ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) - tm.assert_frame_equal(grp_by_same_value, grp_by_copy) - - -def test_mutate_groups(): - # GH3380 - - df = pd.DataFrame( - { - "cat1": ["a"] * 8 + ["b"] * 6, - "cat2": ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2 - + ["f"] * 2 - + ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2, - "cat3": [f"g{x}" for x in range(1, 15)], - "val": np.random.default_rng(2).integers(100, size=14), - } - ) - - def f_copy(x): - x = x.copy() - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - def f_no_copy(x): - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grpby_copy = df.groupby("cat1").apply(f_copy) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) - tm.assert_series_equal(grpby_copy, grpby_no_copy) - - -def test_no_mutate_but_looks_like(): - # GH 8467 - # first show's mutation indicator - # second does not, but should yield the same results - df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) - tm.assert_series_equal(result1, result2) - - -def test_apply_function_with_indexing(): - # GH: 33058 - df = pd.DataFrame( - {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} - ) - - def fn(x): - x.loc[x.index[-1], "col2"] = 0 - return x.col2 - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["col1"], as_index=False).apply(fn) - expected = pd.Series( - [1, 2, 0, 4, 5, 0], - index=range(6), - name="col2", - ) 
- tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fffaee40a7d5c..e49be8c00b426 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -61,6 +61,7 @@ def f(a): "sem": np.nan, "size": 0, "skew": np.nan, + "kurt": np.nan, "std": np.nan, "sum": 0, "var": np.nan, @@ -127,10 +128,8 @@ def test_basic_string(using_infer_string): def f(x): return x.drop_duplicates("person_name").iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(f) - expected = x.iloc[[0, 1]].copy() + result = g.apply(f) + expected = x[["person_name"]].iloc[[0, 1]] expected.index = Index([1, 2], name="person_id") dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) @@ -314,9 +313,7 @@ def test_apply(ordered): # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(lambda x: 1) + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -993,7 +990,7 @@ def test_sort(): # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)}) - labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] + labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) @@ -1357,11 +1354,7 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - df.groupby("var").apply( - lambda rows: DataFrame( - {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} - ) - ) + df.groupby("var").apply(lambda rows: DataFrame({"val": [rows.iloc[-1]["vau"]]})) def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): @@ -2034,10 +2027,7 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = DeprecationWarning if method == "apply" and index_kind == "range" else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 47ad18c9ad2c8..679f7eb7f7f11 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -289,9 +289,7 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + right = 
df.groupby(key).apply(DataFrame.count) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e6c7eede1a401..e08ddf750b00c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas.util._test_decorators as td @@ -66,11 +64,9 @@ def test_groupby_nonobject_dtype_mixed(): def max_value(group): return group.loc[group["value"].idxmax()] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - applied = df.groupby("A").apply(max_value) + applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = df.dtypes + expected = df.drop(columns="A").dtypes tm.assert_series_equal(result, expected) @@ -229,11 +225,8 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) # correct result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df.groupby("a").apply(f1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df2.groupby("a").apply(f1) + result1 = df.groupby("a").apply(f1) + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -271,7 +264,7 @@ def test_attr_wrapper(ts): # make sure raises error msg = "'SeriesGroupBy' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): - getattr(grouped, "foo") + grouped.foo def test_frame_groupby(tsframe): @@ -1055,17 +1048,13 @@ def summarize_random_name(df): # Provide a different name for each Series. In this case, groupby # should not attempt to propagate the Series name since they are # inconsistent. 
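+        # The grouping column "A" is no longer passed to the UDF, so take
+        # the name from a column that is still present in the group frame.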
- return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) + return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["C"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize) + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize, "metrics") + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize_random_name) + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1289,7 +1278,7 @@ def test_groupby_2d_malformed(): d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean(numeric_only=True) res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) - tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object)) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -1361,10 +1350,8 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("key", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("key", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["name"]]) def test_skip_group_keys(): @@ -1441,9 +1428,7 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grouped.apply(f) + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1464,10 +1449,7 @@ def f(group): names.append(group.name) return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("a", sort=False, group_keys=False).apply(f) - + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1672,9 +1654,7 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - g.apply(test_sort) + g.apply(test_sort) def test_pivot_table_values_key_error(): @@ -1728,7 +1708,7 @@ def test_pivot_table_values_key_error(): ) @pytest.mark.parametrize("method", ["attr", "agg", "apply"]) @pytest.mark.parametrize( - "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] + "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew", "kurt"] ) def test_empty_groupby(columns, keys, values, method, op, dropna, using_infer_string): # GH8093 & GH26411 @@ -1804,7 +1784,7 @@ def get_categorical_invalid_expected(): tm.assert_equal(result, expected) return - if op in ["prod", "sum", "skew"]: + if op in ["prod", "sum", "skew", "kurt"]: # ops that require more than just ordered-ness if is_dt64 or is_cat or is_per or (is_str and op != 
"sum"): # GH#41291 @@ -1817,15 +1797,15 @@ def get_categorical_invalid_expected(): msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" - if op == "skew": - msg = "|".join([msg, "does not support operation 'skew'"]) + if op in ["skew", "kurt"]: + msg = "|".join([msg, f"does not support operation '{op}'"]) with pytest.raises(TypeError, match=msg): get_result() if not isinstance(columns, list): # i.e. SeriesGroupBy return - elif op == "skew": + elif op in ["skew", "kurt"]: # TODO: test the numeric_only=True case return else: @@ -1860,10 +1840,8 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = gb.apply(lambda x: x) - assert (res.dtypes == df.dtypes).all() + res = gb.apply(lambda x: x) + assert (res.dtypes == df.drop(columns=1).dtypes).all() def test_tuple_as_grouping(): @@ -2488,12 +2466,13 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_groupby_none_column_name(): +def test_groupby_none_column_name(using_infer_string): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) - result = df.groupby(by=[None]).sum() - expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None)) + by = [np.nan] if using_infer_string else [None] + gb = df.groupby(by=by) + result = gb.sum() + expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=by[0])) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 060a8b7fd3824..8c4ab42b7be7a 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -323,9 +323,7 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index c81e7ecb1446d..3ee9c9ea0c7fd 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -72,18 +72,11 @@ def func(group): assert group.testattr == "hello" return group.testattr - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - raise_on_extra_warnings=False, - check_stacklevel=False, - ): - result = custom_df.groupby("c").apply(func) + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) - result = custom_df.groupby("c").apply(func, include_groups=False) + result = custom_df.groupby("c").apply(func) tm.assert_series_equal(result, expected) # https://github.com/pandas-dev/pandas/pull/56761 
@@ -124,12 +117,5 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - raise_on_extra_warnings=False, - check_stacklevel=False, - ): - result = df.groupby("Buyer").resample("5D").sum() + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 4e7c0acb127ed..53e9c53efebf7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -233,11 +233,7 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(lambda x: x.sum()) - expected["A"] = [0, 2, 4] - expected = expected.loc[:, ["A", "B"]] + result = g.apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) def test_grouper_creation_bug2(self): @@ -788,7 +784,7 @@ def test_groupby_apply_empty_with_group_keys_false(self): # different index objects. df = DataFrame({"A": [], "B": [], "C": []}) g = df.groupby("A", group_keys=False) - result = g.apply(lambda x: x / x.sum(), include_groups=False) + result = g.apply(lambda x: x / x.sum()) expected = DataFrame({"B": [], "C": []}, index=None) tm.assert_frame_equal(result, expected) @@ -872,9 +868,7 @@ def test_groupby_tuple_keys_handle_multiindex(self): } ) expected = df.sort_values(by=["category_tuple", "num1"]) - result = df.groupby("category_tuple").apply( - lambda x: x.sort_values(by="num1"), include_groups=False - ) + result = df.groupby("category_tuple").apply(lambda x: x.sort_values(by="num1")) expected = expected[result.columns] tm.assert_frame_equal(result.reset_index(drop=True), expected) diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index 3e32031e51138..082319d8479f0 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -1,15 +1,24 @@ import pytest +from pandas.compat import is_platform_arm + from pandas import ( DataFrame, Series, option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] -pytest.importorskip("numba") +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.mark.filterwarnings("ignore") diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 0779faa8d8975..99a88a5d8fe7c 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -244,6 +244,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): ("quantile", True), ("sem", True), ("skew", True), + ("kurt", True), ("std", True), ("sum", True), ("var", True), @@ -378,6 +379,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "max", "prod", "skew", + "kurt", ) # Test default behavior; kernels that fail may be enabled in the future but kernels @@ -407,6 +409,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "quantile", "sem", "skew", + 
"kurt", "std", "sum", "var", diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 789105c275625..864b9e5d55991 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -171,6 +171,7 @@ def test_groupby_raises_string( "shift": (None, ""), "size": (None, ""), "skew": (ValueError, "could not convert string to float"), + "kurt": (ValueError, "could not convert string to float"), "std": (ValueError, "could not convert string to float"), "sum": (None, ""), "var": ( @@ -190,10 +191,11 @@ def test_groupby_raises_string( "sem", "var", "skew", + "kurt", "quantile", ]: msg = f"dtype 'str' does not support operation '{groupby_func}'" - if groupby_func in ["sem", "std", "skew"]: + if groupby_func in ["sem", "std", "skew", "kurt"]: # The object-dtype raises ValueError when trying to convert to numeric. klass = TypeError elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": @@ -261,10 +263,7 @@ def test_groupby_raises_string_np( if using_infer_string: if groupby_func_np is np.mean: klass = TypeError - msg = ( - f"Cannot perform reduction '{groupby_func_np.__name__}' " - "with string dtype" - ) + msg = f"Cannot perform reduction '{groupby_func_np.__name__}' with string dtype" _call_and_check(klass, msg, how, gb, groupby_func_np, ()) @@ -323,6 +322,15 @@ def test_groupby_raises_datetime( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + r"dtype datetime64\[ns\] does not support operation", + "datetime64 type does not support operation 'kurt'", + ] + ), + ), "std": (None, ""), "sum": (TypeError, "datetime64 type does not support operation 'sum"), "var": (TypeError, "datetime64 type does not support operation 'var'"), @@ -372,7 +380,7 @@ def test_groupby_raises_datetime_np( _call_and_check(klass, msg, how, gb, groupby_func_np, ()) -@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) +@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "kurt", "var"]) def test_groupby_raises_timedelta(func): df = DataFrame( { @@ -502,6 +510,15 @@ def test_groupby_raises_category( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "dtype category does not support operation 'kurt'", + "category type does not support kurt operations", + ] + ), + ), "std": ( TypeError, "|".join( @@ -676,6 +693,15 @@ def test_groupby_raises_category_on_category( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "category type does not support kurt operations", + "dtype category does not support operation 'kurt'", + ] + ), + ), "std": ( TypeError, "|".join( diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 51c7eab2bfa82..ea876cfdf4933 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -422,6 +422,239 @@ def test_mean_on_timedelta(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "values, dtype, result_dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64", "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64", "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]", "timedelta64[ns]"), + ( + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ], +) +def test_mean_skipna(values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + 
{ + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype because + # Series.mean() changes the dtype to float64/object depending on the input dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: x.mean(skipna=skipna)) + .astype(result_dtype) + ) + result = df.groupby("cat")["val"].mean(skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Float64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "Int64"), + ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]"), + ], +) +def test_sum_skipna(values, dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the original dtype because + # Series.sum() changes the dtype + expected = ( + df.groupby("cat")["val"].apply(lambda x: x.sum(skipna=skipna)).astype(dtype) + ) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + +def test_sum_skipna_object(skipna): + # GH#15675 + df = DataFrame( + { + "val": ["a", "b", np.nan, "d", "e", "f", "g", "h", "i", "j"], + "cat": ["A", "B"] * 5, + } + ).astype({"val": object}) + if skipna: + expected = Series( + ["aegi", "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + else: + expected = Series( + [np.nan, "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "func, values, dtype, result_dtype", + [ + ("prod", [0, 1, 3, np.nan, 4, 5, 6, 7, -8, 9], "float64", "float64"), + ("prod", [0, -1, 3, 4, 5, np.nan, 6, 7, 8, 9], "Float64", "Float64"), + ("prod", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Int64", "Int64"), + ("prod", [np.nan] * 10, "float64", "float64"), + ("prod", [np.nan] * 10, "Float64", "Float64"), + ("prod", [np.nan] * 10, "Int64", "Int64"), + ("var", [0, -1, 3, 4, np.nan, 5, 6, 7, 8, 9], "float64", "float64"), + ("var", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Float64", "Float64"), + ("var", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Int64", "Float64"), + ("var", [np.nan] * 10, "float64", "float64"), + ("var", [np.nan] * 10, "Float64", "Float64"), + ("var", [np.nan] * 10, "Int64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "float64", "float64"), + ("std", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Float64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Int64", "Float64"), + ("std", [np.nan] * 10, "float64", "float64"), + ("std", [np.nan] * 10, "Float64", "Float64"), + ("std", [np.nan] * 10, "Int64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("sem", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ("sem", [np.nan] * 10, "float64", "float64"), + ("sem", [np.nan] * 10, "Float64", "Float64"), + ("sem", [np.nan] * 10, "Int64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("min", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Int64"), + ( + "min", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "min", + pd.to_datetime( + [ 
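+                    # pd.NaT in the second position exercises skipna
+                    # handling for datetime64[ns] values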
+ "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("min", [np.nan] * 10, "float64", "float64"), + ("min", [np.nan] * 10, "Float64", "Float64"), + ("min", [np.nan] * 10, "Int64", "Int64"), + ("max", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("max", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("max", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Int64"), + ( + "max", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "max", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("max", [np.nan] * 10, "float64", "float64"), + ("max", [np.nan] * 10, "Float64", "Float64"), + ("max", [np.nan] * 10, "Int64", "Int64"), + ("median", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("median", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("median", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ( + "median", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "median", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("median", [np.nan] * 10, "float64", "float64"), + ("median", [np.nan] * 10, "Float64", "Float64"), + ("median", [np.nan] * 10, "Int64", "Float64"), + ], +) +def test_multifunc_skipna(func, values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype as some operations + # change the dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: getattr(x, func)(skipna=skipna)) + .astype(result_dtype) + ) + result = getattr(df.groupby("cat")["val"], func)(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cython_median(): arr = np.random.default_rng(2).standard_normal(1000) arr[::2] = np.nan @@ -1114,6 +1347,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): "median", "mean", "skew", + "kurt", "std", "var", "sem", @@ -1127,8 +1361,8 @@ def test_regression_allowlist_methods(op, skipna, sort): grouped = frame.groupby(level=0, sort=sort) - if op == "skew": - # skew has skipna + if op in ["skew", "kurt", "sum", "mean"]: + # skew, kurt, sum, mean have skipna result = getattr(grouped, op)(skipna=skipna) expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(skipna=skipna)) if sort: diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a7712d9dc6586..550efe9187fe8 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -483,12 +483,8 @@ def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - 
with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -504,11 +500,8 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -934,9 +927,7 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( assert gb._selected_obj.index.nlevels == 1 # function that returns a Series - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = gb.apply(lambda x: x["Quantity"] * 2) + res = gb.apply(lambda x: x["Quantity"] * 2) dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") expected = DataFrame( diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 969df8ef4c52b..e19b7592f75b3 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError from pandas import ( @@ -9,8 +10,17 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) def test_correct_function_signature(): diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index f506126f9cf6f..fecd20fd6cece 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -531,15 +531,13 @@ def f(group): return group[:1] grouped = df.groupby("c") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert result["d"].dtype == np.float64 # this is by definition a mutating operation! 
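+    # Iterating the groupby still yields the grouping column "c", while
+    # apply() no longer passes it to the UDF, hence drop(columns="c") below.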
for key, group in grouped: - res = f(group) + res = f(group.drop(columns="c")) tm.assert_frame_equal(res, result.loc[key]) @@ -685,18 +683,14 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): - warn = None - else: - warn = DeprecationWarning - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gb.apply(targop) + expected = gb.apply(targop) expected = expected.sort_index(axis=1) if op == "shift": expected["string_missing"] = expected["string_missing"].fillna(np.nan) - expected["string"] = expected["string"].fillna(np.nan) + by = gb_target.get("by") + if not isinstance(by, (str, list)) or (by != "string" and "string" not in by): + expected["string"] = expected["string"].fillna(np.nan) result = gb[expected.columns].transform(op, *args).sort_index(axis=1) tm.assert_frame_equal(result, expected) @@ -1094,13 +1088,13 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): func = reduction_func obj = DataFrame( - {"a": [0, 0, 0, 1, 1, 1], "b": range(6)}, - index=["A", "B", "C", "D", "E", "F"], + {"a": [0, 0, 0, 0, 1, 1, 1, 1], "b": range(8)}, + index=["A", "B", "C", "D", "E", "F", "G", "H"], ) if frame_or_series is Series: obj = obj["a"] - g = obj.groupby(np.repeat([0, 1], 3)) + g = obj.groupby(np.repeat([0, 1], 4)) if func == "corrwith" and isinstance(obj, Series): # GH#32293 # TODO: implement SeriesGroupBy.corrwith @@ -1125,7 +1119,7 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): tm.assert_index_equal(result.columns, obj.columns) # verify that values were broadcasted across each group - assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 + assert len(set(DataFrame(result).iloc[-4:, -1])) == 1 def test_transform_lambda_with_datetimetz(): diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 56cdca49cb2b0..d4932712de1bb 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -34,7 +34,7 @@ def test_insert(self): # test empty null_index = Index([]) - tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) + tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) def test_insert_missing(self, nulls_fixture, using_infer_string): # GH#22295 diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 0e9fb77d6e8dd..d57df82b2358c 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): - expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -251,7 +250,7 @@ def test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name, dtype=expected_dtype) + expected = Index(vals, name=expected_name) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff 
--git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 49eb79da616e7..25232075a07d9 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -64,8 +64,7 @@ def test_take_fill_value(self): tm.assert_categorical_equal(result.values, expected.values) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -103,8 +102,7 @@ def test_take_fill_value_datetime(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/datetimes/methods/test_round.py b/pandas/tests/indexes/datetimes/methods/test_round.py index cde4a3a65804d..b023542ba0a4c 100644 --- a/pandas/tests/indexes/datetimes/methods/test_round.py +++ b/pandas/tests/indexes/datetimes/methods/test_round.py @@ -216,6 +216,6 @@ def test_round_int64(self, start, index_freq, periods, round_freq): assert (mod == 0).all(), f"round not a {round_freq} multiple" assert (diff <= unit // 2).all(), "round error" if unit % 2 == 0: - assert ( - result.asi8[diff == unit // 2] % 2 == 0 - ).all(), "round half to even error" + assert (result.asi8[diff == unit // 2] % 2 == 0).all(), ( + "round half to even error" + ) diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 4551fdf073193..f4e0a63043335 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -205,12 +205,7 @@ def test_dti_representation_to_series(self, unit): exp3 = "0 2011-01-01\n1 2011-01-02\ndtype: datetime64[ns]" - exp4 = ( - "0 2011-01-01\n" - "1 2011-01-02\n" - "2 2011-01-03\n" - "dtype: datetime64[ns]" - ) + exp4 = "0 2011-01-01\n1 2011-01-02\n2 2011-01-03\ndtype: datetime64[ns]" exp5 = ( "0 2011-01-01 09:00:00+09:00\n" @@ -226,11 +221,7 @@ def test_dti_representation_to_series(self, unit): "dtype: datetime64[ns, US/Eastern]" ) - exp7 = ( - "0 2011-01-01 09:00:00\n" - "1 2011-01-02 10:15:00\n" - "dtype: datetime64[ns]" - ) + exp7 = "0 2011-01-01 09:00:00\n1 2011-01-02 10:15:00\ndtype: datetime64[ns]" with pd.option_context("display.width", 300): for idx, expected in zip( diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index bfbcdcff51ee6..c44345273466c 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -338,8 +338,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -375,8 +374,7 @@ def test_take_fill_value_with_timezone(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with 
pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/datetimes/test_join.py b/pandas/tests/indexes/datetimes/test_join.py index d0ac32939296c..abf6809d67f9c 100644 --- a/pandas/tests/indexes/datetimes/test_join.py +++ b/pandas/tests/indexes/datetimes/test_join.py @@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type): assert isinstance(result, DatetimeIndex) assert result.tz is timezone.utc - def test_datetimeindex_union_join_empty(self, sort): + def test_datetimeindex_union_join_empty(self, sort, using_infer_string): dti = date_range(start="1/1/2001", end="2/1/2001", freq="D") empty = Index([]) result = dti.union(empty, sort=sort) - expected = dti.astype("O") - tm.assert_index_equal(result, expected) + if using_infer_string: + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, dti) + else: + expected = dti.astype("O") + tm.assert_index_equal(result, expected) result = dti.join(empty) assert isinstance(result, DatetimeIndex) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 8db483751438c..90423149658ab 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -154,8 +154,7 @@ def test_constructor_empty(self, constructor, breaks, closed): def test_constructor_string(self, constructor, breaks): # GH 19016 msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): constructor(**self.get_kwargs_from_breaks(breaks)) @@ -224,8 +223,7 @@ def test_constructor_errors(self): # GH 19016: categorical data data = Categorical(list("01234abcde"), ordered=True) msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): IntervalIndex.from_arrays(data[:-1], data[1:]) @@ -297,8 +295,7 @@ def test_constructor_errors(self): # GH 19016: categorical data data = Categorical(list("01234abcde"), ordered=True) msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): IntervalIndex.from_breaks(data) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 73bbfc91028b3..d45d894c485c9 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -21,12 +21,7 @@ class TestIntervalIndexRendering: [ ( Series, - ( - "(0.0, 1.0] a\n" - "NaN b\n" - "(2.0, 3.0] c\n" - "dtype: object" - ), + ("(0.0, 1.0] a\nNaN b\n(2.0, 3.0] c\ndtype: object"), ), (DataFrame, (" 0\n(0.0, 1.0] a\nNaN b\n(2.0, 3.0] c")), ], diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index d82203a53a60f..f098690be2afa 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -259,8 +259,7 @@ def test_get_indexer(self): def test_get_indexer_nearest(self): midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) msg = ( - "method='nearest' not implemented yet for MultiIndex; " - "see GitHub issue 9365" + "method='nearest' not implemented yet for MultiIndex; see GitHub 
issue 9365" ) with pytest.raises(NotImplementedError, match=msg): midx.get_indexer(["a"], method="nearest") diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 14ffc42fb4b59..41cfa093ae53c 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -8,7 +8,7 @@ def test_fillna(idx): # GH 11343 - msg = "isna is not defined for MultiIndex" + msg = "fillna is not defined for MultiIndex" with pytest.raises(NotImplementedError, match=msg): idx.fillna(idx[0]) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 43adc09774914..3c1b98d57b2a0 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -479,8 +479,7 @@ def test_take_fill_value_float64(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 9f36eb1e7a1d1..dc95e19523842 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -63,8 +63,7 @@ def test_representation(self, method): exp3 = "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]')" exp4 = ( - "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='period[D]')" + "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='period[D]')" ) exp5 = "PeriodIndex(['2011', '2012', '2013'], dtype='period[Y-DEC]')" diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 2683e25eda618..00e8262ddfa4c 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -700,8 +700,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index d1a278af337b7..648ee47ddc34c 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -13,6 +13,15 @@ def _isnan(val): return False +def _equivalent_na(dtype, null): + if dtype.na_value is pd.NA and null is pd.NA: + return True + elif _isnan(dtype.na_value) and _isnan(null): + return True + else: + return False + + class TestGetLoc: def test_get_loc(self, any_string_dtype): index = Index(["a", "b", "c"], dtype=any_string_dtype) @@ -41,14 +50,7 @@ def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): def test_get_loc_missing(self, any_string_dtype, nulls_fixture): index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) - if any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) - ): - with pytest.raises(KeyError): - index.get_loc(nulls_fixture) - else: - assert index.get_loc(nulls_fixture) == 2 + assert index.get_loc(nulls_fixture) 
== 2 class TestGetIndexer: @@ -93,9 +95,8 @@ def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): result = index.get_indexer(["a", null, "c"]) if using_infer_string: expected = np.array([0, 2, -1], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null ): expected = np.array([0, -1, -1], dtype=np.intp) else: @@ -115,9 +116,8 @@ def test_get_indexer_non_unique_nas( if using_infer_string: expected_indexer = np.array([0, 2], dtype=np.intp) expected_missing = np.array([], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null ): expected_indexer = np.array([0, -1], dtype=np.intp) expected_missing = np.array([1], dtype=np.intp) @@ -133,9 +133,8 @@ def test_get_indexer_non_unique_nas( if using_infer_string: expected_indexer = np.array([0, 1, 3], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null ): pass else: diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 06df8902f319c..5b75bd9afd6df 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -351,14 +351,11 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "str" and not index.dtype.storage == "python": - # TODO(infer_string): Make the errors consistent - with pytest.raises(NotImplementedError, match="i8"): - index.view("i8") else: msg = ( - "Cannot change data-type for array of references.|" - "Cannot change data-type for object array.|" + r"Cannot change data-type for array of references\.|" + r"Cannot change data-type for object array\.|" + r"Cannot change data-type for array of strings\.|" ) with pytest.raises(TypeError, match=msg): index.view("i8") @@ -1115,8 +1112,7 @@ def test_take_fill_value(self): def test_take_fill_value_none_raises(self): index = Index(list("ABC"), name="xxx") msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 4a31ae88a757a..dd228e6b713b5 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -419,8 +419,7 @@ class TestIndexConstructionErrors: def test_constructor_overflow_int64(self): # see GH#15832 msg = ( - "The elements provided in the data cannot " - "all be casted to the dtype int64" + "The elements provided in the data cannot all be casted to the dtype int64" ) with pytest.raises(OverflowError, match=msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 65feb07e05d9f..5f36b8c3f5dbf 100644 --- a/pandas/tests/indexes/test_old_base.py +++ 
b/pandas/tests/indexes/test_old_base.py @@ -454,10 +454,12 @@ def test_insert_out_of_bounds(self, index, using_infer_string): else: msg = "slice indices must be integers or None or have an __index__ method" - if using_infer_string and ( - index.dtype == "string" or index.dtype == "category" - ): - msg = "loc must be an integer between" + if using_infer_string: + if index.dtype == "string" or index.dtype == "category": + msg = "loc must be an integer between" + elif index.dtype == "object" and len(index) == 0: + msg = "loc must be an integer between" + err = TypeError with pytest.raises(err, match=msg): index.insert(0.5, "foo") @@ -595,7 +597,7 @@ def test_fillna(self, index): pytest.skip(f"Not relevant for Index with {index.dtype}") elif isinstance(index, MultiIndex): idx = index.copy(deep=True) - msg = "isna is not defined for MultiIndex" + msg = "fillna is not defined for MultiIndex" with pytest.raises(NotImplementedError, match=msg): idx.fillna(idx[0]) else: diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 58b69d79c65ce..7cc74f4b3405c 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -530,7 +530,7 @@ def test_intersection_difference_match_empty(self, index, sort): @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) -def test_setop_with_categorical(index_flat, sort, method): +def test_setop_with_categorical(index_flat, sort, method, using_infer_string): # MultiIndex tested separately in tests.indexes.multi.test_setops index = index_flat @@ -539,10 +539,22 @@ def test_setop_with_categorical(index_flat, sort, method): result = getattr(index, method)(other, sort=sort) expected = getattr(index, method)(index, sort=sort) + if ( + using_infer_string + and index.empty + and method in ("union", "symmetric_difference") + ): + expected = expected.astype("category") tm.assert_index_equal(result, expected, exact=exact) result = getattr(index, method)(other[:5], sort=sort) expected = getattr(index, method)(index[:5], sort=sort) + if ( + using_infer_string + and index.empty + and method in ("union", "symmetric_difference") + ): + expected = expected.astype("category") tm.assert_index_equal(result, expected, exact=exact) diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index e411555c65bea..426083cb6b67c 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -262,8 +262,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index ec9767aa4bab4..1d3258ab18a61 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -105,7 +105,7 @@ def test_loc_getitem_series(self): empty = Series(data=[], dtype=np.float64) expected = Series( [], - index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + index=MultiIndex(levels=index.levels, codes=[[], []]), dtype=np.float64, ) result = x.loc[empty] @@ -129,7 +129,7 @@ def test_loc_getitem_array(self): empty = np.array([]) expected = Series( [], - 
index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + index=MultiIndex(levels=index.levels, codes=[[], []]), dtype="float64", ) result = x.loc[empty] diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 10a8fa88b4b5e..e80acc230a320 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -13,7 +13,6 @@ CategoricalIndex, DataFrame, DatetimeIndex, - Index, MultiIndex, Series, Timestamp, @@ -67,11 +66,7 @@ def test_at_setitem_item_cache_cleared(self): df.at[0, "x"] = 4 df.at[0, "cost"] = 789 - expected = DataFrame( - {"x": [4], "cost": 789}, - index=[0], - columns=Index(["x", "cost"], dtype=object), - ) + expected = DataFrame({"x": [4], "cost": 789}, index=[0]) tm.assert_frame_equal(df, expected) # And in particular, check that the _item_cache has updated correctly. diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index dc95e1bb1b8a0..2f6998a85c80b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -763,8 +763,7 @@ def test_iloc_mask(self): "(index of the boolean Series and of the " "indexed object do not match).", ("locs", ".iloc"): ( - "iLocation based boolean indexing on an " - "integer type is not available" + "iLocation based boolean indexing on an integer type is not available" ), } diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e0e9d4cfc5ccb..17e610bda93e4 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -766,9 +766,9 @@ def test_loc_setitem_empty_frame(self): # is inplace, so that dtype is retained sera = Series(val1, index=keys1, dtype=np.float64) serb = Series(val2, index=keys2) - expected = DataFrame( - {"A": sera, "B": serb}, columns=Index(["A", "B"], dtype=object) - ).reindex(index=index) + expected = DataFrame({"A": sera, "B": serb}, columns=Index(["A", "B"])).reindex( + index=index + ) tm.assert_frame_equal(df, expected) def test_loc_setitem_frame(self): @@ -966,7 +966,7 @@ def test_setitem_new_key_tz(self, indexer_sl): to_datetime(42).tz_localize("UTC"), to_datetime(666).tz_localize("UTC"), ] - expected = Series(vals, index=Index(["foo", "bar"], dtype=object)) + expected = Series(vals, index=Index(["foo", "bar"])) ser = Series(dtype=object) indexer_sl(ser)["foo"] = vals[0] @@ -1966,15 +1966,11 @@ def test_loc_setitem_empty_series_str_idx(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc["foo"] = 1 - tm.assert_series_equal(ser, Series([1], index=Index(["foo"], dtype=object))) + tm.assert_series_equal(ser, Series([1], index=Index(["foo"]))) ser.loc["bar"] = 3 - tm.assert_series_equal( - ser, Series([1, 3], index=Index(["foo", "bar"], dtype=object)) - ) + tm.assert_series_equal(ser, Series([1, 3], index=Index(["foo", "bar"]))) ser.loc[3] = 4 - tm.assert_series_equal( - ser, Series([1, 3, 4], index=Index(["foo", "bar", 3], dtype=object)) - ) + tm.assert_series_equal(ser, Series([1, 3, 4], index=Index(["foo", "bar", 3]))) def test_loc_setitem_incremental_with_dst(self): # GH#20724 @@ -1996,7 +1992,7 @@ def test_loc_setitem_incremental_with_dst(self): ], ids=["self", "to_datetime64", "to_pydatetime", "np.datetime64"], ) - def test_loc_setitem_datetime_keys_cast(self, conv): + def test_loc_setitem_datetime_keys_cast(self, conv, using_infer_string): # GH#9516, GH#51363 changed in 3.0 to not cast on Index.insert dt1 = Timestamp("20130101 09:00:00") dt2 = Timestamp("20130101 10:00:00") @@ -2006,8 +2002,10 @@ 
def test_loc_setitem_datetime_keys_cast(self, conv): expected = DataFrame( {"one": [100.0, 200.0]}, - index=Index([conv(dt1), conv(dt2)], dtype=object), - columns=Index(["one"], dtype=object), + index=Index( + [conv(dt1), conv(dt2)], dtype=None if using_infer_string else object + ), + columns=Index(["one"]), ) tm.assert_frame_equal(df, expected) @@ -3297,3 +3295,23 @@ def test_loc_reindexing_of_empty_index(self): df.loc[Series([False] * 4, index=df.index, name=0), 0] = df[0] expected = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) tm.assert_frame_equal(df, expected) + + def test_loc_setitem_matching_index(self): + # GH 25548 + s = Series(0.0, index=list("abcd")) + s1 = Series(1.0, index=list("ab")) + s2 = Series(2.0, index=list("xy")) + + # Test matching indices + s.loc[["a", "b"]] = s1 + + result = s[["a", "b"]] + expected = s1 + tm.assert_series_equal(result, expected) + + # Test unmatched indices + s.loc[["a", "b"]] = s2 + + result = s[["a", "b"]] + expected = Series([np.nan, np.nan], index=["a", "b"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 3dbdedbb94618..6f20d0e4e7cbf 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -30,7 +30,7 @@ def test_empty_frame_setitem_index_name_retained(self): expected = DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index"), - columns=Index(["series"], dtype=object), + columns=Index(["series"]), ) tm.assert_frame_equal(df, expected) @@ -43,7 +43,7 @@ def test_empty_frame_setitem_index_name_inherited(self): expected = DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index"), - columns=Index(["series"], dtype=object), + columns=Index(["series"]), ) tm.assert_frame_equal(df, expected) @@ -96,9 +96,7 @@ def test_partial_set_empty_frame2(self): # these work as they don't really change # anything but the index # GH#5632 - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="object") - ) + expected = DataFrame(columns=Index(["foo"]), index=Index([], dtype="object")) df = DataFrame(index=Index([], dtype="object")) df["foo"] = Series([], dtype="object") @@ -116,9 +114,7 @@ def test_partial_set_empty_frame2(self): tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame3(self): - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") - ) + expected = DataFrame(columns=Index(["foo"]), index=Index([], dtype="int64")) expected["foo"] = expected["foo"].astype("float64") df = DataFrame(index=Index([], dtype="int64")) @@ -135,9 +131,7 @@ def test_partial_set_empty_frame4(self): df = DataFrame(index=Index([], dtype="int64")) df["foo"] = range(len(df)) - expected = DataFrame( - columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") - ) + expected = DataFrame(columns=Index(["foo"]), index=Index([], dtype="int64")) # range is int-dtype-like, so we get int64 dtype expected["foo"] = expected["foo"].astype("int64") tm.assert_frame_equal(df, expected) @@ -210,7 +204,7 @@ def test_partial_set_empty_frame_empty_copy_assignment(self): df = DataFrame(index=[0]) df = df.copy() df["a"] = 0 - expected = DataFrame(0, index=[0], columns=Index(["a"], dtype=object)) + expected = DataFrame(0, index=[0], columns=Index(["a"])) tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): diff --git a/pandas/tests/interchange/test_impl.py 
b/pandas/tests/interchange/test_impl.py index b80b4b923c247..a41d7dec8b496 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -278,7 +278,7 @@ def test_empty_pyarrow(data): expected = pd.DataFrame(data) arrow_df = pa_from_dataframe(expected) result = from_dataframe(arrow_df) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=False) def test_multi_chunk_pyarrow() -> None: @@ -288,8 +288,7 @@ def test_multi_chunk_pyarrow() -> None: table = pa.table([n_legs], names=names) with pytest.raises( RuntimeError, - match="To join chunks a copy is required which is " - "forbidden by allow_copy=False", + match="Cannot do zero copy conversion into multi-column DataFrame block", ): pd.api.interchange.from_dataframe(table, allow_copy=False) @@ -641,3 +640,12 @@ def test_buffer_dtype_categorical( col = dfi.get_column_by_name("data") assert col.dtype == expected_dtype assert col.get_buffers()["data"][1] == expected_buffer_dtype + + +def test_from_dataframe_list_dtype(): + pa = pytest.importorskip("pyarrow", "14.0.0") + data = {"a": [[1, 2], [4, 5, 6]]} + tbl = pa.table(data) + result = from_dataframe(tbl) + expected = pd.DataFrame(data) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index bdefadf3dbec0..a5ddda9d66e7a 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -67,14 +67,13 @@ def s3_base(worker_id, monkeypatch): monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): - # NOT RUN on Windows/macOS/ARM, only Ubuntu + # NOT RUN on Windows/macOS, only Ubuntu # - subprocess in CI can cause timeouts # - GitHub Actions do not support # container services for the above OSs - # - CircleCI will probably hit the Docker rate pull limit pytest.skip( - "S3 tests do not have a corresponding service in " - "Windows, macOS or ARM platforms" + "S3 tests do not have a corresponding service on " + "Windows or macOS platforms" ) else: # set in .github/workflows/unit-tests.yml diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 34824f0a67985..140cf39b26556 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -910,8 +910,7 @@ def test_corrupt_bytes_raises(self, engine): error = XLRDError msg = ( - "Unsupported format, or corrupt file: Expected BOF " - "record; found b'foo'" + "Unsupported format, or corrupt file: Expected BOF record; found b'foo'" ) elif engine == "calamine": from python_calamine import CalamineError diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 71ef1201e523f..0e13b2f94ed58 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -356,6 +356,6 @@ def test_format_hierarchical_rows_periodindex(merge_cells): for cell in formatted_cells: if cell.row != 0 and cell.col == 0: - assert isinstance( - cell.val, Timestamp - ), "Period should be converted to Timestamp" + assert isinstance(cell.val, Timestamp), ( + "Period should be converted to Timestamp" + ) diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index ff8a1b9f570ab..b7dcfde327b83 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -933,7 +933,7 @@ def test_trim(self, df): def test_export(self, 
df, styler): f = lambda x: "color: red" if x > 0 else "color: blue" - g = lambda x, z: f"color: {z}" if x > 0 else f"color: {z}" + g = lambda x, z: f"color: {z}" style1 = styler style1.map(f).map(g, z="b").highlight_max()._compute() # = render result = style1.export() diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index 1abe6238d3922..eb221686dd165 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, MultiIndex, @@ -731,7 +729,6 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize( "columns, siunitx", diff --git a/pandas/tests/io/formats/style/test_to_typst.py b/pandas/tests/io/formats/style/test_to_typst.py new file mode 100644 index 0000000000000..2365119c9c4dc --- /dev/null +++ b/pandas/tests/io/formats/style/test_to_typst.py @@ -0,0 +1,96 @@ +from textwrap import dedent + +import pytest + +from pandas import ( + DataFrame, + Series, +) + +pytest.importorskip("jinja2") +from pandas.io.formats.style import Styler + + +@pytest.fixture +def df(): + return DataFrame( + {"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)} + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0, precision=2) + + +def test_basic_table(styler): + result = styler.to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + )""" + ) + assert result == expected + + +def test_concat(styler): + result = styler.concat(styler.data.agg(["sum"]).style).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830000], [abcd], + )""" + ) + assert result == expected + + +def test_concat_recursion(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3) + styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4) + result = styler1.concat(styler2.concat(styler3)).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830], [abcd], + [sum], [1], [-1.8300], [abcd], + )""" + ) + assert result == expected + + +def test_concat_chain(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3) + styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4) + result = styler1.concat(styler2).concat(styler3).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830], [abcd], + [sum], [1], [-1.8300], [abcd], + )""" + ) + assert result == expected diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index c4ecb48006cb1..642a562704344 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -193,8 +193,7 @@ def test_css_border_shorthands(prop, expected): ( "margin: 1px; margin-top: 2px", "", - "margin-left: 1px; margin-right: 1px; " - "margin-bottom: 1px; margin-top: 2px", + "margin-left: 1px; margin-right: 1px; 
margin-bottom: 1px; margin-top: 2px", ), ("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"), ("margin: 1px", "margin-top: 2px", "margin: 1px"), diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 3b63011bf862e..7d154235d2c4a 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -82,6 +82,9 @@ def test_repr_dict(self): def test_repr_mapping(self): assert printing.pprint_thing(MyMapping()) == "{'a': 4, 'b': 4}" + def test_repr_frozenset(self): + assert printing.pprint_thing(frozenset([1, 2])) == "frozenset({1, 2})" + class TestFormatBase: def test_adjoin(self): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 7bf041a50b745..6d762fdeb8d79 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -482,10 +482,7 @@ def test_to_csv_string_with_crlf(self): # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = ( - b"int,str_crlf\r\n" - b"1,abc\r\n" - b'2,"d\r\nef"\r\n' - b'3,"g\r\nh\r\n\r\ni"\r\n' + b'int,str_crlf\r\n1,abc\r\n2,"d\r\nef"\r\n3,"g\r\nh\r\n\r\ni"\r\n' ) df.to_csv(path, lineterminator="\r\n", index=False) with open(path, "rb") as f: diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index b1a437bfdbd8a..9c75314b66fa2 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -94,8 +94,7 @@ def test_to_html_with_column_specific_col_space_raises(): ) msg = ( - "Col_space length\\(\\d+\\) should match " - "DataFrame number of columns\\(\\d+\\)" + "Col_space length\\(\\d+\\) should match DataFrame number of columns\\(\\d+\\)" ) with pytest.raises(ValueError, match=msg): df.to_html(col_space=[30, 40]) diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py index 7aa7cebb5120f..f3d9b88cc91e2 100644 --- a/pandas/tests/io/formats/test_to_markdown.py +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -35,8 +35,7 @@ def test_empty_frame(): df.to_markdown(buf=buf) result = buf.getvalue() assert result == ( - "| id | first_name | last_name |\n" - "|------|--------------|-------------|" + "| id | first_name | last_name |\n|------|--------------|-------------|" ) @@ -65,8 +64,7 @@ def test_series(): s.to_markdown(buf=buf) result = buf.getvalue() assert result == ( - "| | foo |\n|---:|------:|\n| 0 | 1 " - "|\n| 1 | 2 |\n| 2 | 3 |" + "| | foo |\n|---:|------:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" ) diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index af3cdf2d44af3..63c975fd831e7 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -132,20 +132,17 @@ def test_to_string_with_formatters_unicode(self): ) assert result == expected - def test_to_string_index_formatter(self): - df = DataFrame([range(5), range(5, 10), range(10, 15)]) - - rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) - - xp = dedent( - """\ - 0 1 2 3 4 - a 0 1 2 3 4 - b 5 6 7 8 9 - c 10 11 12 13 14\ - """ - ) - assert rs == xp + def test_to_string_index_formatter(self): + df = DataFrame([range(5), range(5, 10), range(10, 15)]) + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) + xp = dedent( + """\ + 0 1 2 3 4 + a 0 1 2 3 4 + b 5 6 7 8 9 + c 10 11 12 13 14""" + ) + assert rs == xp def test_no_extra_space(self): # GH#52690: 
Check that no extra space is given @@ -380,17 +377,11 @@ def test_to_string_small_float_values(self): # sadness per above if _three_digit_exp(): expected = ( - " a\n" - "0 1.500000e+000\n" - "1 1.000000e-017\n" - "2 -5.500000e-007" + " a\n0 1.500000e+000\n1 1.000000e-017\n2 -5.500000e-007" ) else: expected = ( - " a\n" - "0 1.500000e+00\n" - "1 1.000000e-17\n" - "2 -5.500000e-07" + " a\n0 1.500000e+00\n1 1.000000e-17\n2 -5.500000e-07" ) assert result == expected @@ -1213,13 +1204,7 @@ def test_to_string_float_na_spacing(self): ser[::2] = np.nan result = ser.to_string() - expected = ( - "0 NaN\n" - "1 1.5678\n" - "2 NaN\n" - "3 -3.0000\n" - "4 NaN" - ) + expected = "0 NaN\n1 1.5678\n2 NaN\n3 -3.0000\n4 NaN" assert result == expected def test_to_string_with_datetimeindex(self): diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index ff7d34c85c015..953a9246da1cd 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -41,6 +41,7 @@ def test_read_zipped_json(datapath): @td.skip_if_not_us_locale @pytest.mark.single_cpu +@pytest.mark.network def test_with_s3_url(compression, s3_public_bucket, s3so): # Bucket created in tests/io/conftest.py df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}')) diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index 8de289afe9ff9..12ae24b064c9d 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -159,7 +159,7 @@ def test_build_decimal_series(self, dc): expected = OrderedDict( [ ("schema", schema), - ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), + ("data", [OrderedDict([("id", 0), ("a", "10")])]), ] ) @@ -245,7 +245,7 @@ def test_to_json(self, da, dc, sa, ia): [ ("idx", 0), ("A", "2021-10-10T00:00:00.000"), - ("B", 10.0), + ("B", "10"), ("C", "pandas"), ("D", 10), ] diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index ad9dbf7554a8b..144b36166261b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,6 +1,5 @@ import datetime from datetime import timedelta -from decimal import Decimal from io import StringIO import json import os @@ -1268,9 +1267,7 @@ def test_default_handler_numpy_unsupported_dtype(self): columns=["a", "b"], ) expected = ( - '[["(1+0j)","(nan+0j)"],' - '["(2.3+0j)","(nan+0j)"],' - '["(4-5j)","(1.2+0j)"]]' + '[["(1+0j)","(nan+0j)"],["(2.3+0j)","(nan+0j)"],["(4-5j)","(1.2+0j)"]]' ) assert df.to_json(default_handler=str, orient="values") == expected @@ -1373,11 +1370,7 @@ def test_tz_is_naive(self): ) def test_tz_range_is_utc(self, tz_range): exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' - dfexp = ( - '{"DT":{' - '"0":"2013-01-01T05:00:00.000Z",' - '"1":"2013-01-02T05:00:00.000Z"}}' - ) + dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000Z","1":"2013-01-02T05:00:00.000Z"}}' assert ujson_dumps(tz_range, iso_dates=True) == exp dti = DatetimeIndex(tz_range) @@ -1413,6 +1406,7 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu + @pytest.mark.network @td.skip_if_not_us_locale def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so): # GH17200 @@ -1775,7 +1769,7 @@ def test_read_json_with_url_value(self, url): ) def test_read_json_with_very_long_file_path(self, compression): # GH 46718 - long_json_path = f'{"a" * 1000}.json{compression}' + 
long_json_path = f"{'a' * 1000}.json{compression}" with pytest.raises( FileNotFoundError, match=f"File {long_json_path} does not exist" ): @@ -2012,6 +2006,7 @@ def test_json_multiindex(self): assert result == expected @pytest.mark.single_cpu + @pytest.mark.network def test_to_s3(self, s3_public_bucket, s3so): # GH 28375 mock_bucket_name, target_file = s3_public_bucket.name, "test.json" @@ -2025,12 +2020,8 @@ def test_to_s3(self, s3_public_bucket, s3so): timeout -= 0.1 assert timeout > 0, "Timed out waiting for file to appear on moto" - def test_json_pandas_nulls(self, nulls_fixture, request): + def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 - if isinstance(nulls_fixture, Decimal): - mark = pytest.mark.xfail(reason="not implemented") - request.applymarker(mark) - expected_warning = None msg = ( "The default 'epoch' date format is deprecated and will be removed " diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 3c843479b446a..d482eb5fa1a06 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -236,9 +236,9 @@ def test_readjson_chunks_closes(chunksize): ) with reader: reader.read() - assert ( - reader.handles.handle.closed - ), f"didn't close stream with chunksize = {chunksize}" + assert reader.handles.handle.closed, ( + f"didn't close stream with chunksize = {chunksize}" + ) @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) @@ -435,8 +435,7 @@ def test_to_json_append_mode(mode_): # Test ValueError when mode is not supported option df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - f"mode={mode_} is not a valid option." - "Only 'w' and 'a' are currently supported." + f"mode={mode_} is not a valid option.Only 'w' and 'a' are currently supported." 
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode=mode_, lines=False, orient="records")
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index 62118f1c82ebb..d2bf9bdb139bd 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -53,60 +53,24 @@ def orient(request):

 class TestUltraJSONTests:
     @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865")
-    def test_encode_decimal(self):
-        sut = decimal.Decimal("1337.1337")
-        encoded = ujson.ujson_dumps(sut, double_precision=15)
-        decoded = ujson.ujson_loads(encoded)
-        assert decoded == 1337.1337
-
-        sut = decimal.Decimal("0.95")
-        encoded = ujson.ujson_dumps(sut, double_precision=1)
-        assert encoded == "1.0"
-
-        decoded = ujson.ujson_loads(encoded)
-        assert decoded == 1.0
-
-        sut = decimal.Decimal("0.94")
-        encoded = ujson.ujson_dumps(sut, double_precision=1)
-        assert encoded == "0.9"
-
-        decoded = ujson.ujson_loads(encoded)
-        assert decoded == 0.9
-
-        sut = decimal.Decimal("1.95")
-        encoded = ujson.ujson_dumps(sut, double_precision=1)
-        assert encoded == "2.0"
-
-        decoded = ujson.ujson_loads(encoded)
-        assert decoded == 2.0
-
-        sut = decimal.Decimal("-1.95")
-        encoded = ujson.ujson_dumps(sut, double_precision=1)
-        assert encoded == "-2.0"
-
-        decoded = ujson.ujson_loads(encoded)
-        assert decoded == -2.0
-
-        sut = decimal.Decimal("0.995")
-        encoded = ujson.ujson_dumps(sut, double_precision=2)
-        assert encoded == "1.0"
-
-        decoded = ujson.ujson_loads(encoded)
-        assert decoded == 1.0
-
-        sut = decimal.Decimal("0.9995")
-        encoded = ujson.ujson_dumps(sut, double_precision=3)
-        assert encoded == "1.0"
-
-        decoded = ujson.ujson_loads(encoded)
-        assert decoded == 1.0
-
-        sut = decimal.Decimal("0.99999999999999944")
-        encoded = ujson.ujson_dumps(sut, double_precision=15)
-        assert encoded == "1.0"
-
+    @pytest.mark.parametrize(
+        "value, double_precision, expected",
+        [
+            ("1337.1337", 15, 1337.1337),
+            ("0.95", 1, 1.0),
+            ("0.94", 1, 0.9),
+            ("1.95", 1, 2.0),
+            ("-1.95", 1, -2.0),
+            ("0.995", 2, 1.0),
+            ("0.9995", 3, 1.0),
+            ("0.99999999999999944", 15, 1.0),
+        ],
+    )
+    def test_encode_decimal(self, value, double_precision, expected):
+        sut = decimal.Decimal(value)
+        encoded = ujson.ujson_dumps(sut, double_precision=double_precision)
         decoded = ujson.ujson_loads(encoded)
-        assert decoded == 1.0
+        assert decoded == expected

     @pytest.mark.parametrize("ensure_ascii", [True, False])
     def test_encode_string_conversion(self, ensure_ascii):
@@ -991,7 +955,7 @@ def test_decode_array(self, arr):
     def test_decode_extreme_numbers(self, extreme_num):
         assert extreme_num == ujson.ujson_loads(str(extreme_num))

-    @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"])
+    @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-(2**63) - 1}"])
     def test_decode_too_extreme_numbers(self, too_extreme_num):
         with pytest.raises(
             ValueError,
@@ -1006,7 +970,7 @@ def test_decode_with_trailing_non_whitespaces(self):
         with pytest.raises(ValueError, match="Trailing data"):
             ujson.ujson_loads("{}\n\t a")

-    @pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"])
+    @pytest.mark.parametrize("value", [f"{2**64}", f"{-(2**63) - 1}"])
     def test_decode_array_with_big_int(self, value):
         with pytest.raises(
             ValueError,
diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py
index 8352cc80f5e62..cfa8785b24bde 100644
--- a/pandas/tests/io/parser/common/test_index.py
+++ b/pandas/tests/io/parser/common/test_index.py
@@ -90,9 +90,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
def test_multi_index_no_level_names( request, all_parsers, index_col, using_infer_string ): - if using_infer_string and all_parsers.engine == "pyarrow": - # result should have string columns instead of object dtype - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index ed2e729430b01..a73327beea8bb 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -131,8 +131,7 @@ def test_catch_too_many_names(all_parsers): msg = ( "Too many columns specified: expected 4 and found 3" if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file" + else "Number of passed names did not match number of header fields in the file" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index d3789cd387c05..55c8bbc4bb9e1 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -136,7 +136,7 @@ def test_mangled_unnamed_placeholders(all_parsers): expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): - col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) + col_name = "Unnamed: 0" + f".{1 * j}" * min(j, 1) expected.insert(loc=0, column=col_name, value=[0, 1, 2]) expected[orig_key] = orig_value diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 1411ed5019766..9a15d9bc84a2e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -228,7 +228,7 @@ def test_parse_tz_aware(all_parsers): def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" + msg = "Only booleans and lists are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" @@ -239,7 +239,7 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" + msg = "Only booleans and lists are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 47658c0eb9012..479f2468a86ab 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp from pandas.compat import PY312 @@ -25,10 +23,7 @@ ensure_clean_store, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] tables = pytest.importorskip("tables") @@ -40,7 +35,7 @@ def test_append(setup_path): # tables.NaturalNameWarning): df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) 
_maybe_remove(store, "df1") @@ -203,7 +198,7 @@ def test_append_some_nans(setup_path): tm.assert_frame_equal(store["df3"], df3, check_index_type=True) -def test_append_all_nans(setup_path): +def test_append_all_nans(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df = DataFrame( { @@ -255,7 +250,13 @@ def test_append_all_nans(setup_path): _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df, check_index_type=True) + result = store["df"] + expected = df + if using_infer_string: + # TODO: Test is incorrect when not using_infer_string. + # Should take the last 4 rows uncondiationally. + expected = expected[-4:] + tm.assert_frame_equal(result, expected, check_index_type=True) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) @@ -294,7 +295,7 @@ def test_append_frame_column_oriented(setup_path, request): # column oriented df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.index = df.index._with_freq(None) # freq doesn't round-trip @@ -426,7 +427,7 @@ def check_col(key, name, size): { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), "D": date_range("20130101", periods=5), } ).set_index("C") @@ -453,7 +454,7 @@ def check_col(key, name, size): _maybe_remove(store, "df") df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -517,7 +518,7 @@ def test_append_with_data_columns(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.iloc[0, df.columns.get_loc("B")] = 1.0 @@ -693,8 +694,8 @@ def test_append_misc(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store.append("df", df, chunksize=1) result = store.select("df") @@ -710,8 +711,8 @@ def test_append_misc_chunksize(setup_path, chunksize): # more chunksize in append tests df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["string"] = "foo" df["float322"] = 1.0 @@ -747,15 +748,15 @@ def test_append_misc_empty_frame(setup_path): tm.assert_frame_equal(store.select("df2"), df) -def test_append_raise(setup_path): +def test_append_raise(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: # test append with invalid input to get good error messages # list in column df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), 
) df["invalid"] = [["a"]] * len(df) assert df.dtypes["invalid"] == np.object_ @@ -775,8 +776,8 @@ def test_append_raise(setup_path): # datetime with embedded nans as object df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) s = Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) @@ -803,8 +804,8 @@ def test_append_raise(setup_path): # appending an incompatible table df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store.append("df", df) @@ -823,12 +824,9 @@ def test_append_raise(setup_path): store.append("df", df) df["foo"] = "bar" msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64[s],kind->datetime64[s],shape->None]" + "Cannot serialize the column [foo] " + "because its data contents are not [string] " + "but [datetime64[s]] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df", df) @@ -884,7 +882,7 @@ def test_append_with_timedelta(setup_path): def test_append_to_multiple(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -921,12 +919,12 @@ def test_append_to_multiple(setup_path): def test_append_to_multiple_dropna(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ).rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan @@ -946,7 +944,7 @@ def test_append_to_multiple_dropna(setup_path): def test_append_to_multiple_dropna_false(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -997,3 +995,29 @@ def test_append_to_multiple_min_itemsize(setup_path): ) result = store.select_as_multiple(["index", "nums", "strs"]) tm.assert_frame_equal(result, expected, check_index_type=True) + + +def test_append_string_nan_rep(setup_path): + # GH 16300 + df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10)) + df_nan = df.copy() + df_nan.loc[0:4, :] = np.nan + msg = "NaN representation is too large for existing column size" + + with ensure_clean_store(setup_path) as store: + # string column too small + store.append("sa", df["A"]) + with pytest.raises(ValueError, match=msg): + store.append("sa", df_nan["A"]) + + # nan_rep too big + store.append("sb", df["B"], nan_rep="bars") + with pytest.raises(ValueError, match=msg): + store.append("sb", df_nan["B"]) + + # smaller modified 
nan_rep + store.append("sc", df["A"], nan_rep="n") + store.append("sc", df_nan["A"]) + result = store["sc"] + expected = concat([df["A"], df_nan["A"]]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 998021bad9001..ed2616b24cd71 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -16,10 +14,7 @@ ensure_clean_store, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_categorical(setup_path): diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index d140cfc941e16..c5cac5a5caf09 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -13,10 +11,6 @@ from pandas.io.pytables import read_hdf -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def test_complex_fixed(tmp_path, setup_path): df = DataFrame( diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index c31b9989ef35e..b28101c09820f 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( CategoricalIndex, DataFrame, @@ -24,10 +22,7 @@ _maybe_adjust_name, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_pass_spec_to_storer(setup_path): @@ -93,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path): with ensure_clean_store(setup_path) as store: # this fails because we have a date in the object block...... 
- msg = re.escape( - """Cannot serialize the column [datetime1] -because its data contents are not [string] but [date] object dtype""" + msg = "|".join( + [ + re.escape( + "Cannot serialize the column [datetime1]\nbecause its data " + "contents are not [string] but [date] object dtype" + ), + re.escape("[date] is not implemented as a table column"), + ] ) with pytest.raises(TypeError, match=msg): store.append("df_unimplemented", df) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 606b19ac0ed75..27b5d34146f85 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( PY311, is_ci_environment, @@ -35,14 +33,11 @@ from pandas.io import pytables from pandas.io.pytables import Term -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) -def test_mode(setup_path, tmp_path, mode): +def test_mode(setup_path, tmp_path, mode, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -91,10 +86,12 @@ def test_mode(setup_path, tmp_path, mode): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) + if using_infer_string: + df.columns = df.columns.astype("str") tm.assert_frame_equal(result, df) -def test_default_mode(tmp_path, setup_path): +def test_default_mode(tmp_path, setup_path, using_infer_string): # read_hdf uses default mode df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -104,7 +101,10 @@ def test_default_mode(tmp_path, setup_path): path = tmp_path / setup_path df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) def test_reopen_handle(tmp_path, setup_path): @@ -163,7 +163,7 @@ def test_reopen_handle(tmp_path, setup_path): assert not store.is_open -def test_open_args(setup_path): +def test_open_args(setup_path, using_infer_string): with tm.ensure_clean(setup_path) as path: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -178,8 +178,13 @@ def test_open_args(setup_path): store["df"] = df store.append("df2", df) - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + + tm.assert_frame_equal(store["df"], expected) + tm.assert_frame_equal(store["df2"], expected) store.close() @@ -194,7 +199,7 @@ def test_flush(setup_path): store.flush(fsync=True) -def test_complibs_default_settings(tmp_path, setup_path): +def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): # GH15943 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -207,7 +212,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complevel=9) result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = 
expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -218,7 +227,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complib="zlib") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -229,7 +242,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -339,7 +356,7 @@ def test_encoding(setup_path): [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], ], ) -@pytest.mark.parametrize("dtype", ["category", object]) +@pytest.mark.parametrize("dtype", ["category", None]) def test_latin_encoding(tmp_path, setup_path, dtype, val): enc = "latin-1" nan_rep = "" diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 7d0802dcf2e47..9c5fc8786c7c6 100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, HDFStore, @@ -15,10 +13,7 @@ tables, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index a4257b54dd6db..c9fe6070b34c3 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas as pd @@ -24,10 +22,7 @@ ) from pandas.util import _test_decorators as td -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_format_type(tmp_path, setup_path): @@ -54,8 +49,8 @@ def test_api_default_format(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -79,8 +74,8 @@ def test_api_default_format(tmp_path, setup_path): path = tmp_path / setup_path df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with 
pd.option_context("io.hdf.default_format", "fixed"): @@ -106,7 +101,7 @@ def test_put(setup_path): ) df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) store["a"] = ts @@ -166,7 +161,7 @@ def test_put_compression(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -183,7 +178,7 @@ def test_put_compression(setup_path): def test_put_compression_blosc(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -197,10 +192,20 @@ def test_put_compression_blosc(setup_path): tm.assert_frame_equal(store["c"], df) -def test_put_mixed_type(setup_path, performance_warning): +def test_put_datetime_ser(setup_path, performance_warning, using_infer_string): + # https://github.com/pandas-dev/pandas/pull/60663 + ser = Series(3 * [Timestamp("20010102").as_unit("ns")]) + with ensure_clean_store(setup_path) as store: + store.put("ser", ser) + expected = ser.copy() + result = store.get("ser") + tm.assert_series_equal(result, expected) + + +def test_put_mixed_type(setup_path, performance_warning, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["obj1"] = "foo" @@ -220,13 +225,42 @@ def test_put_mixed_type(setup_path, performance_warning): with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") - with tm.assert_produces_warning(performance_warning): + warning = None if using_infer_string else performance_warning + with tm.assert_produces_warning(warning): store.put("df", df) expected = store.get("df") tm.assert_frame_equal(expected, df) +def test_put_str_frame(setup_path, performance_warning, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + df = DataFrame({"a": pd.array(["x", pd.NA, "y"], dtype=dtype)}) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("df", df) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = df.astype(expected_dtype) + result = store.get("df") + tm.assert_frame_equal(result, expected) + + +def test_put_str_series(setup_path, performance_warning, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + ser = Series(["x", pd.NA, "y"], dtype=dtype) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("ser", ser) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = ser.astype(expected_dtype) + result = store.get("ser") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format", ["table", "fixed"]) @pytest.mark.parametrize( "index", @@ -253,7 +287,7 @@ def test_store_index_types(setup_path, format, index): tm.assert_frame_equal(df, store["df"]) -def test_column_multiindex(setup_path): +def test_column_multiindex(setup_path, using_infer_string): # GH 4710 # recreate multi-indexes properly @@ -264,6 
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index 8ae87d4bab52d..7a3a7339e7809 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas.compat import is_platform_windows

 import pandas as pd
@@ -26,10 +24,7 @@
 from pandas.io.pytables import TableIterator

-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]


 def test_read_missing_key_close_store(tmp_path, setup_path):
@@ -78,7 +73,7 @@ def test_read_missing_key_opened_store(tmp_path, setup_path):
 def test_read_column(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
@@ -175,7 +170,7 @@ def test_pytables_native2_read(datapath):
     assert isinstance(d1, DataFrame)


-def test_read_hdf_open_store(tmp_path, setup_path):
+def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
     # GH10330
     # No check for non-string path_or-buf, and no test of open store
@@ -187,6 +182,12 @@ def test_read_hdf_open_store(tmp_path, setup_path):
     df = df.set_index(keys="E", append=True)

     path = tmp_path / setup_path
+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            df.to_hdf(path, key="df", mode="w")
+        return
     df.to_hdf(path, key="df", mode="w")
     direct = read_hdf(path, "df")
     with HDFStore(path, mode="r") as store:
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 6b98a720e4299..409b92d2ddde1 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import is_platform_windows
@@ -26,10 +24,7 @@
 )
 from pandas.util import _test_decorators as td

-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]


 def test_conv_read_write():
@@ -49,8 +44,8 @@ def roundtrip(key, obj, **kwargs):

     o = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     tm.assert_frame_equal(o, roundtrip("frame", o))
@@ -150,8 +145,8 @@ def test_api_invalid(tmp_path, setup_path):
     # Invalid.
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )

     msg = "Can only append to Tables"
@@ -201,7 +196,7 @@ def test_put_integer(setup_path):
     _check_roundtrip(df, tm.assert_frame_equal, setup_path)


-def test_table_values_dtypes_roundtrip(setup_path):
+def test_table_values_dtypes_roundtrip(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
         store.append("df_f8", df1)
@@ -213,12 +208,9 @@ def test_table_values_dtypes_roundtrip(setup_path):

         # incompatible dtype
         msg = re.escape(
-            "invalid combination of [values_axes] on appending data "
-            "[name->values_block_0,cname->values_block_0,"
-            "dtype->float64,kind->float,shape->(1, 3)] vs "
-            "current table [name->values_block_0,"
-            "cname->values_block_0,dtype->int64,kind->integer,"
-            "shape->None]"
+            "Cannot serialize the column [a] "
+            "because its data contents are not [float] "
+            "but [integer] object dtype"
         )
         with pytest.raises(ValueError, match=msg):
             store.append("df_i8", df1)
@@ -249,6 +241,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
         store.append("df_mixed_dtypes1", df1)
         result = store.select("df_mixed_dtypes1").dtypes.value_counts()
         result.index = [str(i) for i in result.index]
+        str_dtype = "str" if using_infer_string else "object"
         expected = Series(
             {
                 "float32": 2,
@@ -258,7 +251,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
                 "int16": 1,
                 "int8": 1,
                 "int64": 1,
-                "object": 1,
+                str_dtype: 1,
                 "datetime64[s]": 2,
                 "datetime64[ms]": 1,
                 "datetime64[ns]": 1,
@@ -280,10 +273,10 @@ def test_series(setup_path):
     )
     _check_roundtrip(ts, tm.assert_series_equal, path=setup_path)

-    ts2 = Series(ts.index, Index(ts.index, dtype=object))
+    ts2 = Series(ts.index, Index(ts.index))
     _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)

-    ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
+    ts3 = Series(ts.values, Index(np.asarray(ts.index)))
     _check_roundtrip(
         ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
     )
@@ -373,8 +366,8 @@ def test_timeseries_preepoch(setup_path, request):
 def test_frame(compression, setup_path):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )

     # put in some random NAs
@@ -390,7 +383,7 @@ def test_frame(compression, setup_path):

     tdf = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     _check_roundtrip(
@@ -405,7 +398,10 @@ def test_frame(compression, setup_path):
         assert recons._mgr.is_consolidated()

     # empty
-    _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
+    df2 = df[:0]
+    # Prevent df2 from having index with inferred_type as string
+    df2.index = Index([])
+    _check_roundtrip(df2[:0], tm.assert_frame_equal, path=setup_path)


 def test_empty_series_frame(setup_path):
@@ -437,9 +433,17 @@ def test_can_serialize_dates(setup_path):
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)


-def test_store_hierarchical(setup_path, multiindex_dataframe_random_data):
+def test_store_hierarchical(
+    setup_path, using_infer_string, multiindex_dataframe_random_data
+):
     frame = multiindex_dataframe_random_data

+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
+        return
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
@@ -458,8 +462,8 @@ def test_store_mixed(compression, setup_path):
     def _make_one():
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df["obj1"] = "foo"
         df["obj2"] = "bar"
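Several of these pytables tests now short-circuit with the same NotImplementedError: under the inferred string dtype, HDF5 storage cannot yet serialize MultiIndex levels backed by an extension dtype. A sketch of the guarded behaviour (the store path is hypothetical; the index must be built while future.infer_string is active so its levels get the new str dtype):

    import pandas as pd
    import pytest

    with pd.option_context("future.infer_string", True):
        # Levels constructed here are string-dtype, which to_hdf rejects.
        df = pd.DataFrame(
            {"v": [1, 2]},
            index=pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]),
        )
        with pytest.raises(NotImplementedError):
            df.to_hdf("store.h5", key="df", mode="w")
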
diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py
index 4b20b929ef447..5e76aae28c147 100644
--- a/pandas/tests/io/pytables/test_select.py
+++ b/pandas/tests/io/pytables/test_select.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import PY312
@@ -27,10 +25,7 @@
 from pandas.io.pytables import Term

-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]


 def test_select_columns_in_where(setup_path):
@@ -138,7 +133,7 @@ def test_select(setup_path):
         # select with columns=
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         _maybe_remove(store, "df")
@@ -278,8 +273,8 @@ def test_select_dtypes(setup_path, request):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         expected = df[df["A"] > 0]
@@ -350,7 +345,7 @@ def test_select_iterator(tmp_path, setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         _maybe_remove(store, "df")
@@ -375,7 +370,7 @@ def test_select_iterator(tmp_path, setup_path):

         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.to_hdf(path, key="df_non_table")
@@ -391,7 +386,7 @@ def test_select_iterator(tmp_path, setup_path):

         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.to_hdf(path, key="df", format="table")
@@ -408,7 +403,7 @@ def test_select_iterator(tmp_path, setup_path):
     with ensure_clean_store(setup_path) as store:
         df1 = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store.append("df1", df1, data_columns=True)
@@ -436,7 +431,7 @@ def test_select_iterator_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -471,7 +466,7 @@ def test_select_iterator_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -513,7 +508,7 @@ def test_select_iterator_non_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -547,7 +542,7 @@ def test_select_iterator_non_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -571,7 +566,7 @@ def test_select_iterator_many_empty_frames(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -623,7 +618,7 @@ def test_select_iterator_many_empty_frames(setup_path):
 def test_frame_select(setup_path, request):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
@@ -655,7 +650,7 @@ def test_frame_select(setup_path, request):
         # invalid terms
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store.append("df_time", df)
@@ -674,7 +669,7 @@ def test_frame_select_complex(setup_path):

     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df["string"] = "foo"
@@ -791,7 +786,7 @@ def test_invalid_filtering(setup_path):

     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
@@ -813,7 +808,7 @@ def test_string_select(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
@@ -857,7 +852,7 @@ def test_string_select(setup_path):
 def test_select_as_multiple(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
@@ -1058,7 +1053,6 @@ def test_select_large_integer(tmp_path):
         ),
         columns=["x", "y"],
     )
-    result = None
     with HDFStore(path) as s:
         s.append("data", df, data_columns=True, index=False)
         result = s.select("data", where="y==-9223372036854775801").get("y").get(0)
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index a6fe9529c594a..bb2058c050f2a 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -25,6 +25,7 @@
     timedelta_range,
 )
 import pandas._testing as tm
+from pandas.conftest import has_pyarrow
 from pandas.tests.io.pytables.common import (
     _maybe_remove,
     ensure_clean_store,
@@ -35,10 +36,7 @@
     read_hdf,
 )

-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]

 tables = pytest.importorskip("tables")
@@ -110,7 +108,7 @@ def test_iter_empty(setup_path):
         assert list(store) == []


-def test_repr(setup_path, performance_warning):
+def test_repr(setup_path, performance_warning, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         repr(store)
         store.info()
@@ -145,7 +143,9 @@ def test_repr(setup_path, performance_warning):
         df.loc[df.index[3:6], ["obj1"]] = np.nan
         df = df._consolidate()

-        with tm.assert_produces_warning(performance_warning):
+        warning = None if using_infer_string else performance_warning
+        msg = "cannot\nmap directly to c-types .* dtype='object'"
+        with tm.assert_produces_warning(warning, match=msg):
             store["df"] = df

         # make a random group in hdf space
@@ -311,12 +311,12 @@ def test_getattr(setup_path):
         # test attribute access
         result = store.a
         tm.assert_series_equal(result, s)
-        result = getattr(store, "a")
+        result = store.a
         tm.assert_series_equal(result, s)

         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store["df"] = df
@@ -369,7 +369,7 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
         {
             "A": [0.0, 1.0, 2.0, 3.0, 4.0],
             "B": [0.0, 1.0, 0.0, 1.0, 0.0],
-            "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+            "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
             "D": date_range("20130101", periods=5),
         }
     ).set_index("C")
@@ -385,6 +385,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
     tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))


+@pytest.mark.xfail(
+    using_string_dtype() and has_pyarrow,
+    reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
+)
 @pytest.mark.parametrize("format", ["fixed", "table"])
 def test_to_hdf_errors(tmp_path, format, setup_path):
     data = ["\ud800foo"]
@@ -406,7 +410,7 @@ def col(t, column):
         # data columns
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -441,7 +445,7 @@ def col(t, column):
         # data columns
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -483,8 +487,8 @@ def test_table_mixed_dtypes(setup_path):
     # frame
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     df["obj1"] = "foo"
     df["obj2"] = "bar"
@@ -539,8 +543,8 @@ def test_remove(setup_path):
         )
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store["a"] = ts
         store["b"] = df
@@ -603,8 +607,8 @@ def test_same_name_scoping(setup_path):
 def test_store_index_name(setup_path):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     df.index.name = "foo"
@@ -650,8 +654,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz):
 def test_store_series_name(setup_path):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     series = df["A"]
@@ -665,7 +669,7 @@ def test_overwrite_node(setup_path):
     with ensure_clean_store(setup_path) as store:
         store["a"] = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         ts = Series(
@@ -679,7 +683,7 @@ def test_overwrite_node(setup_path):
 def test_coordinates(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
@@ -714,7 +718,7 @@ def test_coordinates(setup_path):
         _maybe_remove(store, "df2")
         df1 = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df2 = df1.copy().rename(columns="{}_2".format)
@@ -870,8 +874,8 @@ def test_start_stop_fixed(setup_path):
         # sparse; not implemented
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df.iloc[3:5, 1:3] = np.nan
         df.iloc[8:10, -2] = np.nan
@@ -904,8 +908,8 @@ def test_select_filter_corner(setup_path, request):
 def test_path_pathlib():
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )

     result = tm.round_trip_pathlib(
@@ -934,8 +938,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path):
 def test_path_pathlib_hdfstore():
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )

     def writer(path):
@@ -953,8 +957,8 @@ def reader(path):
 def test_pickle_path_localpath():
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     result = tm.round_trip_pathlib(
         lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df")
@@ -966,8 +970,8 @@ def test_copy(propindexes):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )

     with tm.ensure_clean() as path:
diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py
index bbe1cd77e0d9f..03622faa2b5a8 100644
--- a/pandas/tests/io/pytables/test_subclass.py
+++ b/pandas/tests/io/pytables/test_subclass.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Series,
@@ -19,7 +17,6 @@
 class TestHDFStoreSubclass:
     # GH 33748
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_supported_for_subclass_dataframe(self, tmp_path):
         data = {"a": [1, 2], "b": [3, 4]}
         sdf = tm.SubclassedDataFrame(data, dtype=np.intp)
diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py
index 8f179f844e4d0..9192804e49bd1 100644
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -6,8 +6,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs.timezones import maybe_get_tz
 import pandas.util._test_decorators as td
@@ -25,10 +23,6 @@
     ensure_clean_store,
 )

-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-

 def _compare_with_tz(a, b):
     tm.assert_frame_equal(a, b)
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
index 3f5b73f4aa8a4..a17cd27f8284e 100644
--- a/pandas/tests/io/sas/test_sas7bdat.py
+++ b/pandas/tests/io/sas/test_sas7bdat.py
@@ -7,8 +7,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas.compat._constants import (
     IS64,
     WASM,
@@ -20,10 +18,6 @@
 from pandas.io.sas.sas7bdat import SAS7BDATReader

-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-

 @pytest.fixture
 def dirpath(datapath):
@@ -246,11 +240,13 @@ def test_zero_variables(datapath):
         pd.read_sas(fname)


-def test_zero_rows(datapath):
+@pytest.mark.parametrize("encoding", [None, "utf8"])
+def test_zero_rows(datapath, encoding):
     # GH 18198
     fname = datapath("io", "sas", "data", "zero_rows.sas7bdat")
-    result = pd.read_sas(fname)
-    expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0]
+    result = pd.read_sas(fname, encoding=encoding)
+    str_value = b"a" if encoding is None else "a"
+    expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0]
     tm.assert_frame_equal(result, expected)
@@ -409,7 +405,7 @@ def test_0x40_control_byte(datapath):
     fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
     df = pd.read_sas(fname, encoding="ascii")
     fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
-    df0 = pd.read_csv(fname, dtype="object")
+    df0 = pd.read_csv(fname, dtype="str")
     tm.assert_frame_equal(df, df0)
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 70422a0ea6edc..e162815271ab3 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -19,12 +19,11 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas.compat import (
     WASM,
     is_platform_windows,
 )
+from pandas.compat.pyarrow import pa_version_under19p0
 import pandas.util._test_decorators as td

 import pandas as pd
@@ -154,8 +153,8 @@ def test_get_handle_pyarrow_compat(self):
         s = StringIO(data)
         with icom.get_handle(s, "rb", is_text=False) as handles:
             df = pa_csv.read_csv(handles.handle).to_pandas()
-            # TODO will have to update this when pyarrow' to_pandas() is fixed
-            expected = expected.astype("object")
+            if pa_version_under19p0:
+                expected = expected.astype("object")
             tm.assert_frame_equal(df, expected)
         assert not s.closed
@@ -365,7 +364,6 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module):
             expected = f_path.read()
             assert result == expected

-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support")
    def test_write_fspath_hdf5(self):
        # Same test as write_fspath_all, except HDF5 files aren't
        # necessarily byte-for-byte identical for a given dataframe, so we'll
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 69354066dd5ef..e778193c147c1 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -6,7 +6,10 @@
 import numpy as np
 import pytest

-from pandas.compat.pyarrow import pa_version_under18p0
+from pandas.compat.pyarrow import (
+    pa_version_under18p0,
+    pa_version_under19p0,
+)

 import pandas as pd
 import pandas._testing as tm
@@ -140,8 +143,8 @@ def test_rw_use_threads(self):
     def test_path_pathlib(self):
         df = pd.DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         ).reset_index()
         result = tm.round_trip_pathlib(df.to_feather, read_feather)
         tm.assert_frame_equal(df, result)
@@ -239,16 +242,27 @@ def test_invalid_dtype_backend(self):
         with pytest.raises(ValueError, match=msg):
             read_feather(path, dtype_backend="numpy")

-    def test_string_inference(self, tmp_path):
+    def test_string_inference(self, tmp_path, using_infer_string):
         # GH#54431
         path = tmp_path / "test_string_inference.p"
         df = pd.DataFrame(data={"a": ["x", "y"]})
         df.to_feather(path)
         with pd.option_context("future.infer_string", True):
             result = read_feather(path)
-        expected = pd.DataFrame(
-            data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
-        )
+        dtype = pd.StringDtype(na_value=np.nan)
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]},
+            dtype=dtype,
+            columns=pd.Index(
+                ["a"],
+                dtype=object
+                if pa_version_under19p0 and not using_infer_string
+                else dtype,
+            ),
+        )
         tm.assert_frame_equal(result, expected)

     @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 5340560884afe..2e3e74a9d31ff 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -209,7 +209,6 @@ def test_arrowparquet_options(fsspectest):
     assert fsspectest.test[0] == "parquet_read"


-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
 def test_fastparquet_options(fsspectest):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
     pytest.importorskip("fastparquet")
diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py
index b11fe931f46e5..3b9c8769ad9dc 100644
--- a/pandas/tests/io/test_http_headers.py
+++ b/pandas/tests/io/test_http_headers.py
@@ -86,7 +86,6 @@ def stata_responder(df):
     return bio.getvalue()


-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "responder, read_method",
     [
@@ -107,6 +106,7 @@ def stata_responder(df):
             marks=[
                 td.skip_if_no("fastparquet"),
                 td.skip_if_no("fsspec"),
+                pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)"),
             ],
         ),
         (pickle_respnder, pd.read_pickle),
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 7919bb956dc7a..56a8e4c439164 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -17,6 +17,7 @@
     pa_version_under13p0,
     pa_version_under15p0,
     pa_version_under17p0,
+    pa_version_under19p0,
 )

 import pandas as pd
@@ -103,10 +104,7 @@ def fp(request):

 @pytest.fixture
 def df_compat():
-    # TODO(infer_string) should this give str columns?
-    return pd.DataFrame(
-        {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object)
-    )
+    return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"]))


 @pytest.fixture
@@ -254,8 +252,10 @@ def test_invalid_engine(df_compat):
         check_round_trip(df_compat, "foo", "bar")


-def test_options_py(df_compat, pa):
+def test_options_py(df_compat, pa, using_infer_string):
     # use the set option
+    if using_infer_string and not pa_version_under19p0:
+        df_compat.columns = df_compat.columns.astype("str")
     with pd.option_context("io.parquet.engine", "pyarrow"):
         check_round_trip(df_compat)
@@ -683,7 +683,11 @@ def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
         with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
             httpserver.serve_content(content=f.read())
             df = read_parquet(httpserver.url, engine=engine)
-        tm.assert_frame_equal(df, df_compat)
+
+        expected = df_compat
+        if pa_version_under19p0:
+            expected.columns = expected.columns.astype(object)
+        tm.assert_frame_equal(df, expected)


 class TestParquetPyArrow(Base):
@@ -784,18 +788,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):

     def test_categorical(self, pa):
         # supported in >= 0.7.0
-        df = pd.DataFrame()
-        df["a"] = pd.Categorical(list("abcdef"))
-
-        # test for null, out-of-order values, and unobserved category
-        df["b"] = pd.Categorical(
-            ["bar", "foo", "foo", "bar", None, "bar"],
-            dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
-        )
-
-        # test for ordered flag
-        df["c"] = pd.Categorical(
-            ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
+        df = pd.DataFrame(
+            {
+                "a": pd.Categorical(list("abcdef")),
+                # test for null, out-of-order values, and unobserved category
+                "b": pd.Categorical(
+                    ["bar", "foo", "foo", "bar", None, "bar"],
+                    dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
+                ),
+                # test for ordered flag
+                "c": pd.Categorical(
+                    ["a", "b", "c", "a", "c", "b"],
+                    categories=["b", "c", "d"],
+                    ordered=True,
+                ),
+            }
         )
         check_round_trip(df, pa)
@@ -858,11 +865,13 @@ def test_s3_roundtrip_for_dir(
             repeat=1,
         )

-    def test_read_file_like_obj_support(self, df_compat):
+    def test_read_file_like_obj_support(self, df_compat, using_infer_string):
         pytest.importorskip("pyarrow")
         buffer = BytesIO()
         df_compat.to_parquet(buffer)
         df_from_buf = read_parquet(buffer)
+        if using_infer_string and not pa_version_under19p0:
+            df_compat.columns = df_compat.columns.astype("str")
         tm.assert_frame_equal(df_compat, df_from_buf)

     def test_expand_user(self, df_compat, monkeypatch):
@@ -929,7 +938,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
                 "c": pd.Series(["a", None, "c"], dtype="string"),
             }
         )
-        if using_infer_string:
+        if using_infer_string and pa_version_under19p0:
             check_round_trip(df, pa, expected=df.astype({"c": "str"}))
         else:
             check_round_trip(df, pa)
@@ -943,7 +952,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string):
         df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
         with pd.option_context("string_storage", string_storage):
             if using_infer_string:
-                expected = df.astype("str")
+                if pa_version_under19p0:
+                    expected = df.astype("str")
+                else:
+                    expected = df.astype(f"string[{string_storage}]")
                 expected.columns = expected.columns.astype("str")
             else:
                 expected = df.astype(f"string[{string_storage}]")
@@ -1099,17 +1111,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
         new_df = read_parquet(path, engine=pa)
         assert new_df.attrs == df.attrs

-    def test_string_inference(self, tmp_path, pa):
+    def test_string_inference(self, tmp_path, pa, using_infer_string):
         # GH#54431
         path = tmp_path / "test_string_inference.p"
         df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
-        df.to_parquet(path, engine="pyarrow")
+        df.to_parquet(path, engine=pa)
         with pd.option_context("future.infer_string", True):
-            result = read_parquet(path, engine="pyarrow")
+            result = read_parquet(path, engine=pa)
+        dtype = pd.StringDtype(na_value=np.nan)
         expected = pd.DataFrame(
             data={"a": ["x", "y"]},
-            dtype=pd.StringDtype(na_value=np.nan),
-            index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
+            dtype=dtype,
+            index=pd.Index(["a", "b"], dtype=dtype),
+            columns=pd.Index(
+                ["a"],
+                dtype=object
+                if pa_version_under19p0 and not using_infer_string
+                else dtype,
+            ),
         )
         tm.assert_frame_equal(result, expected)
@@ -1122,7 +1141,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
         df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]")
         df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))]))
         result = read_parquet(path)
-        expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
+        if pa_version_under19p0:
+            expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
+        else:
+            expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object")
         tm.assert_frame_equal(result, expected)

     def test_infer_string_large_string_type(self, tmp_path, pa):
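The recurring pa_version_under19p0 branches above all encode one behaviour change: from pyarrow 19 on, the Arrow-to-pandas conversion preserves string-typed column indexes instead of coercing them to object, so the object fallback is only needed on older pyarrow. A rough sketch of what the version gate is guarding (file path is illustrative):

    import pandas as pd
    from pandas.compat.pyarrow import pa_version_under19p0

    df = pd.DataFrame({"a": ["x", "y"]})
    df.to_parquet("t.parquet")
    with pd.option_context("future.infer_string", True):
        result = pd.read_parquet("t.parquet")
    # object under pyarrow < 19; pandas' NaN-backed string dtype from 19 on
    print(result.columns.dtype, pa_version_under19p0)
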
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 5fe0f1265edff..bab2c1561eb99 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -383,55 +383,6 @@ def test_pickle_buffer_roundtrip():
         tm.assert_frame_equal(df, result)


-# ---------------------
-# tests for URL I/O
-# ---------------------
-
-
-@pytest.mark.parametrize(
-    "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"]
-)
-def test_pickle_generalurl_read(monkeypatch, mockurl):
-    def python_pickler(obj, path):
-        with open(path, "wb") as fh:
-            pickle.dump(obj, fh, protocol=-1)
-
-    class MockReadResponse:
-        def __init__(self, path) -> None:
-            self.file = open(path, "rb")
-            if "gzip" in path:
-                self.headers = {"Content-Encoding": "gzip"}
-            else:
-                self.headers = {"Content-Encoding": ""}
-
-        def __enter__(self):
-            return self
-
-        def __exit__(self, *args):
-            self.close()
-
-        def read(self):
-            return self.file.read()
-
-        def close(self):
-            return self.file.close()
-
-    with tm.ensure_clean() as path:
-
-        def mock_urlopen_read(*args, **kwargs):
-            return MockReadResponse(path)
-
-        df = DataFrame(
-            1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
-        )
-        python_pickler(df, path)
-        monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read)
-        result = pd.read_pickle(mockurl)
-        tm.assert_frame_equal(df, result)
-
-
 def test_pickle_fsspec_roundtrip():
     pytest.importorskip("fsspec")
     with tm.ensure_clean():
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 7e1220ecee218..97c856d3b6c40 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -1068,7 +1068,9 @@ def test_to_sql(conn, method, test_frame1, request):

 @pytest.mark.parametrize("conn", all_connectable)
-@pytest.mark.parametrize("mode, num_row_coef", [("replace", 1), ("append", 2)])
+@pytest.mark.parametrize(
+    "mode, num_row_coef", [("replace", 1), ("append", 2), ("delete_rows", 1)]
+)
 def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request):
     conn = request.getfixturevalue(conn)
     with pandasSQL_builder(conn, need_transaction=True) as pandasSQL:
@@ -2698,6 +2700,58 @@ def test_drop_table(conn, request):
         assert not insp.has_table("temp_frame")


+@pytest.mark.parametrize("conn_name", all_connectable)
+def test_delete_rows_success(conn_name, test_frame1, request):
+    table_name = "temp_delete_rows_frame"
+    conn = request.getfixturevalue(conn_name)
+
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            assert pandasSQL.to_sql(test_frame1, table_name) == test_frame1.shape[0]
+
+        with pandasSQL.run_transaction():
+            assert pandasSQL.delete_rows(table_name) is None
+
+        assert count_rows(conn, table_name) == 0
+        assert pandasSQL.has_table(table_name)
+
+
+@pytest.mark.parametrize("conn_name", all_connectable)
+def test_delete_rows_is_atomic(conn_name, request):
+    sqlalchemy = pytest.importorskip("sqlalchemy")
+
+    table_name = "temp_delete_rows_atomic_frame"
+    table_stmt = f"CREATE TABLE {table_name} (a INTEGER, b INTEGER UNIQUE NOT NULL)"
+
+    if conn_name != "sqlite_buildin" and "adbc" not in conn_name:
+        table_stmt = sqlalchemy.text(table_stmt)
+
+    # setting dtype is mandatory for adbc related tests
+    original_df = DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="int32")
+    replacing_df = DataFrame({"a": [5, 6, 7], "b": [8, 8, 8]}, dtype="int32")
+
+    conn = request.getfixturevalue(conn_name)
+    pandasSQL = pandasSQL_builder(conn)
+
+    with pandasSQL.run_transaction() as cur:
+        cur.execute(table_stmt)
+
+    with pandasSQL.run_transaction():
+        pandasSQL.to_sql(original_df, table_name, if_exists="append", index=False)
+
+    # inserting duplicated values in a UNIQUE constraint column
+    with pytest.raises(pd.errors.DatabaseError):
+        with pandasSQL.run_transaction():
+            pandasSQL.to_sql(
+                replacing_df, table_name, if_exists="delete_rows", index=False
+            )
+
+    # failed "delete_rows" is rolled back preserving original data
+    with pandasSQL.run_transaction():
+        result_df = pandasSQL.read_query(f"SELECT * FROM {table_name}", dtype="int32")
+        tm.assert_frame_equal(result_df, original_df)
+
+
 @pytest.mark.parametrize("conn", all_connectable)
 def test_roundtrip(conn, request, test_frame1):
     if conn == "sqlite_str":
@@ -3409,8 +3463,8 @@ def test_to_sql_with_negative_npinf(conn, request, input):
         mark = pytest.mark.xfail(reason="GH 36465")
         request.applymarker(mark)

-        msg = "inf cannot be used with MySQL"
-        with pytest.raises(ValueError, match=msg):
+        msg = "Execution failed on sql"
+        with pytest.raises(pd.errors.DatabaseError, match=msg):
             df.to_sql(name="foobar", con=conn, index=False)
     else:
         assert df.to_sql(name="foobar", con=conn, index=False) == 1
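The new SQL tests above cover if_exists="delete_rows", which clears the target table inside a transaction instead of dropping it, so the schema and its constraints survive and a failed write rolls back. A minimal sketch against an in-memory SQLite connection (assuming a pandas build that includes this change):

    import sqlite3
    import pandas as pd

    conn = sqlite3.connect(":memory:")
    pd.DataFrame({"a": [1, 2]}).to_sql("t", conn, index=False)

    # Unlike if_exists="replace", the table definition is kept;
    # only its rows are removed before the new data is written.
    pd.DataFrame({"a": [3]}).to_sql("t", conn, if_exists="delete_rows", index=False)
    print(pd.read_sql("SELECT * FROM t", conn))  # one row: a == 3
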
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 5c07a56c9fb3f..d897d251909fe 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -1503,8 +1503,7 @@ def test_bad_xml(parser):
     with pytest.raises(
         SyntaxError,
         match=(
-            "Extra content at the end of the document|"
-            "junk after document element"
+            "Extra content at the end of the document|junk after document element"
         ),
     ):
         read_xml(
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index 845f369d3090f..d18f098267599 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -1070,28 +1070,43 @@ def test_boxplot_series_positions(self, hist_df):
         tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions)
         assert len(ax.lines) == 7 * len(numeric_cols)

+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
+    @pytest.mark.xfail(
+        Version(mpl.__version__) >= Version("3.10"),
+        reason="Fails starting with matplotlib 3.10",
+    )
     def test_boxplot_vertical(self, hist_df):
         df = hist_df
         numeric_cols = df._get_numeric_data().columns
         labels = [pprint_thing(c) for c in numeric_cols]

         # if horizontal, yticklabels are rotated
-        ax = df.plot.box(rot=50, fontsize=8, vert=False)
+        kwargs = (
+            {"vert": False}
+            if Version(mpl.__version__) < Version("3.10")
+            else {"orientation": "horizontal"}
+        )
+        ax = df.plot.box(rot=50, fontsize=8, **kwargs)
         _check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8)
         _check_text_labels(ax.get_yticklabels(), labels)
         assert len(ax.lines) == 7 * len(numeric_cols)

-    @pytest.mark.filterwarnings("ignore:Attempt:UserWarning")
+    @pytest.mark.filterwarnings("ignore::UserWarning")
+    @pytest.mark.xfail(
+        Version(mpl.__version__) >= Version("3.10"),
+        reason="Fails starting with matplotlib version 3.10",
+    )
     def test_boxplot_vertical_subplots(self, hist_df):
         df = hist_df
         numeric_cols = df._get_numeric_data().columns
         labels = [pprint_thing(c) for c in numeric_cols]
+        kwargs = (
+            {"vert": False}
+            if Version(mpl.__version__) < Version("3.10")
+            else {"orientation": "horizontal"}
+        )
         axes = _check_plot_works(
-            df.plot.box,
-            default_axes=True,
-            subplots=True,
-            vert=False,
-            logx=True,
+            df.plot.box, default_axes=True, subplots=True, logx=True, **kwargs
         )
         _check_axes_shape(axes, axes_num=3, layout=(1, 3))
         _check_ax_scales(axes, xaxis="log")
@@ -1099,12 +1114,22 @@ def test_boxplot_vertical_subplots(self, hist_df):
             _check_text_labels(ax.get_yticklabels(), [label])
             assert len(ax.lines) == 7

+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
+    @pytest.mark.xfail(
+        Version(mpl.__version__) >= Version("3.10"),
+        reason="Fails starting with matplotlib 3.10",
+    )
     def test_boxplot_vertical_positions(self, hist_df):
         df = hist_df
         numeric_cols = df._get_numeric_data().columns
         labels = [pprint_thing(c) for c in numeric_cols]
         positions = np.array([3, 2, 8])
-        ax = df.plot.box(positions=positions, vert=False)
+        kwargs = (
+            {"vert": False}
+            if Version(mpl.__version__) < Version("3.10")
+            else {"orientation": "horizontal"}
+        )
+        ax = df.plot.box(positions=positions, **kwargs)
         _check_text_labels(ax.get_yticklabels(), labels)
         tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions)
         assert len(ax.lines) == 7 * len(numeric_cols)
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index 4916963ab7c87..2267b6197cd80 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -1,5 +1,7 @@
 """Test cases for .boxplot method"""

+from __future__ import annotations
+
 import itertools
 import string
@@ -22,6 +24,7 @@
     _check_ticks_props,
     _check_visible,
 )
+from pandas.util.version import Version

 from pandas.io.formats.printing import pprint_thing
@@ -35,6 +38,17 @@ def _check_ax_limits(col, ax):
     assert y_max >= col.max()


+if Version(mpl.__version__) < Version("3.10"):
+    verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}]
+else:
+    verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}]
+
+
+@pytest.fixture(params=verts)
+def vert(request):
+    return request.param
+
+
 class TestDataFramePlots:
     def test_stacked_boxplot_set_axis(self):
         # GH2980
@@ -312,7 +326,7 @@ def test_specified_props_kwd(self, props, expected):

         assert result[expected][0].get_color() == "C1"

-    @pytest.mark.parametrize("vert", [True, False])
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
     def test_plot_xlabel_ylabel(self, vert):
         df = DataFrame(
             {
@@ -322,11 +336,11 @@ def test_plot_xlabel_ylabel(self, vert):
             }
         )
         xlabel, ylabel = "x", "y"
-        ax = df.plot(kind="box", vert=vert, xlabel=xlabel, ylabel=ylabel)
+        ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert)
         assert ax.get_xlabel() == xlabel
         assert ax.get_ylabel() == ylabel

-    @pytest.mark.parametrize("vert", [True, False])
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
     def test_plot_box(self, vert):
         # GH 54941
         rng = np.random.default_rng(2)
@@ -335,13 +349,13 @@ def test_plot_box(self, vert):

         xlabel, ylabel = "x", "y"
         _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True)
-        df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel)
-        df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel)
+        df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert)
+        df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert)
         for ax in axs:
             assert ax.get_xlabel() == xlabel
             assert ax.get_ylabel() == ylabel

-    @pytest.mark.parametrize("vert", [True, False])
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
     def test_boxplot_xlabel_ylabel(self, vert):
         df = DataFrame(
             {
@@ -351,11 +365,11 @@ def test_boxplot_xlabel_ylabel(self, vert):
             }
         )
         xlabel, ylabel = "x", "y"
-        ax = df.boxplot(vert=vert, xlabel=xlabel, ylabel=ylabel)
+        ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert)
         assert ax.get_xlabel() == xlabel
         assert ax.get_ylabel() == ylabel

-    @pytest.mark.parametrize("vert", [True, False])
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
     def test_boxplot_group_xlabel_ylabel(self, vert):
         df = DataFrame(
             {
@@ -365,13 +379,19 @@ def test_boxplot_group_xlabel_ylabel(self, vert):
             }
         )
         xlabel, ylabel = "x", "y"
-        ax = df.boxplot(by="group", vert=vert, xlabel=xlabel, ylabel=ylabel)
+        ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert)
         for subplot in ax:
             assert subplot.get_xlabel() == xlabel
             assert subplot.get_ylabel() == ylabel

-    @pytest.mark.parametrize("vert", [True, False])
-    def test_boxplot_group_no_xlabel_ylabel(self, vert):
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
+    def test_boxplot_group_no_xlabel_ylabel(self, vert, request):
+        if Version(mpl.__version__) >= Version("3.10") and vert == {
+            "orientation": "horizontal"
+        }:
+            request.applymarker(
+                pytest.mark.xfail(reason=f"{vert} fails starting with matplotlib 3.10")
+            )
         df = DataFrame(
             {
                 "a": np.random.default_rng(2).standard_normal(10),
@@ -379,9 +399,13 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert):
                 "group": np.random.default_rng(2).choice(["group1", "group2"], 10),
             }
         )
-        ax = df.boxplot(by="group", vert=vert)
+        ax = df.boxplot(by="group", **vert)
         for subplot in ax:
-            target_label = subplot.get_xlabel() if vert else subplot.get_ylabel()
+            target_label = (
+                subplot.get_xlabel()
+                if vert == {"vert": True} or vert == {"orientation": "vertical"}
+                else subplot.get_ylabel()
+            )
             assert target_label == pprint_thing(["group"])
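All of these plotting updates track the same upstream change: matplotlib 3.10 deprecates boxplot's vert keyword in favour of orientation, so the tests dispatch on the installed version. The idiom used throughout, as a standalone sketch:

    import matplotlib as mpl
    import pandas as pd
    from pandas.util.version import Version

    df = pd.DataFrame({"a": range(10), "b": range(10)})
    # Pre-3.10 matplotlib only understands vert=; 3.10+ prefers orientation=.
    kwargs = (
        {"vert": False}
        if Version(mpl.__version__) < Version("3.10")
        else {"orientation": "horizontal"}
    )
    ax = df.plot.box(**kwargs)
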
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index 9675b936c171e..c3b0219971446 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -427,7 +427,7 @@ def test_pie_series_autopct_and_fontsize(self):
         ax = _check_plot_works(
             series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7
         )
-        pcts = [f"{s*100:.2f}" for s in series.values / series.sum()]
+        pcts = [f"{s * 100:.2f}" for s in series.values / series.sum()]
         expected_texts = list(chain.from_iterable(zip(series.index, pcts)))
         _check_text_labels(ax.texts, expected_texts)
         for t in ax.texts:
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 476978aeab15a..a7bb80727206e 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1607,17 +1607,10 @@ def test_mode_intoverflow(self, dropna, expected1, expected2):
         expected2 = Series(expected2, dtype=np.uint64)
         tm.assert_series_equal(result, expected2)

-    def test_mode_sortwarning(self):
-        # Check for the warning that is raised when the mode
-        # results cannot be sorted
-
-        expected = Series(["foo", np.nan], dtype=object)
+    def test_mode_sort_with_na(self):
         s = Series([1, "foo", "foo", np.nan, np.nan])
-
-        with tm.assert_produces_warning(UserWarning, match="Unable to sort modes"):
-            result = s.mode(dropna=False)
-            result = result.sort_values().reset_index(drop=True)
-
+        expected = Series(["foo", np.nan], dtype=object)
+        result = s.mode(dropna=False)
         tm.assert_series_equal(result, expected)

     def test_mode_boolean_with_na(self):
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index 179f2c0e6cfa9..3a7fd548ca961 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -1022,12 +1022,8 @@ def test_resample_segfault(unit):
         all_wins_and_wagers, columns=("ID", "timestamp", "A", "B")
     ).set_index("timestamp")
     df.index = df.index.as_unit(unit)
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("ID").resample("5min").sum()
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum())
+    result = df.groupby("ID").resample("5min").sum()
+    expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum())
     tm.assert_frame_equal(result, expected)
@@ -1046,9 +1042,7 @@ def test_resample_dtype_preservation(unit):
     result = df.resample("1D").ffill()
     assert result.val.dtype == np.int32

-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("group").resample("1D").ffill()
+    result = df.groupby("group").resample("1D").ffill()
     assert result.val.dtype == np.int32
@@ -1821,12 +1815,8 @@ def f(data, add_arg):
     multiplier = 10

     df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10))
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("A").resample("D").agg(f, multiplier).astype(float)
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        expected = df.groupby("A").resample("D").mean().multiply(multiplier)
+    result = df.groupby("A").resample("D").agg(f, multiplier).astype(float)
+    expected = df.groupby("A").resample("D").mean().multiply(multiplier)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index b7b80b5e427ff..da1774cf22587 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -76,9 +76,7 @@ def test_groupby_resample_api():
     )
     index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"])
     expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index)
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]]
+    result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]]
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index ff1b82210e20d..e7850f96b3b0f 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -71,12 +71,8 @@ def test_deferred_with_groupby():
     def f_0(x):
         return x.set_index("date").resample("D").asfreq()

-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        expected = df.groupby("id").apply(f_0)
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.set_index("date").groupby("id").resample("D").asfreq()
+    expected = df.groupby("id").apply(f_0)
+    result = df.set_index("date").groupby("id").resample("D").asfreq()
     tm.assert_frame_equal(result, expected)

     df = DataFrame(
@@ -90,12 +86,8 @@ def f_0(x):
     def f_1(x):
         return x.resample("1D").ffill()

-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        expected = df.groupby("group").apply(f_1)
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("group").resample("1D").ffill()
+    expected = df.groupby("group").apply(f_1)
+    result = df.groupby("group").resample("1D").ffill()
     tm.assert_frame_equal(result, expected)
@@ -110,9 +102,7 @@ def test_getitem(test_frame):
     result = g.B.resample("2s").mean()
     tm.assert_series_equal(result, expected)

-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = g.resample("2s").mean().B
+    result = g.resample("2s").mean().B
     tm.assert_series_equal(result, expected)
@@ -236,12 +226,8 @@ def test_methods(f, test_frame):
     g = test_frame.groupby("A")
     r = g.resample("2s")

-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = getattr(r, f)()
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        expected = g.apply(lambda x: getattr(x.resample("2s"), f)())
+    result = getattr(r, f)()
+    expected = g.apply(lambda x: getattr(x.resample("2s"), f)())
     tm.assert_equal(result, expected)
@@ -258,12 +244,8 @@ def test_methods_nunique(test_frame):
 def test_methods_std_var(f, test_frame):
     g = test_frame.groupby("A")
     r = g.resample("2s")
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = getattr(r, f)(ddof=1)
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1))
+    result = getattr(r, f)(ddof=1)
+    expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1))
     tm.assert_frame_equal(result, expected)
@@ -272,24 +254,18 @@ def test_apply(test_frame):
     r = g.resample("2s")

     # reduction
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        expected = g.resample("2s").sum()
+    expected = g.resample("2s").sum()

     def f_0(x):
         return x.resample("2s").sum()

-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = r.apply(f_0)
+    result = r.apply(f_0)
     tm.assert_frame_equal(result, expected)

     def f_1(x):
         return x.resample("2s").apply(lambda y: y.sum())

-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = g.apply(f_1)
+    result = g.apply(f_1)
     # y.sum() results in int64 instead of int32 on 32-bit architectures
     expected = expected.astype("int64")
     tm.assert_frame_equal(result, expected)
@@ -357,9 +333,7 @@ def test_resample_groupby_with_label(unit):
     # GH 13235
     index = date_range("2000-01-01", freq="2D", periods=5, unit=unit)
     df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]})
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("col0").resample("1W", label="left").sum()
+    result = df.groupby("col0").resample("1W", label="left").sum()

     mi = [
         np.array([0, 0, 1, 2], dtype=np.int64),
@@ -369,9 +343,7 @@ def test_resample_groupby_with_label(unit):
         ),
     ]
     mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None])
-    expected = DataFrame(
-        data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex
-    )
+    expected = DataFrame(data={"col1": [1, 1, 2, 1]}, index=mindex)

     tm.assert_frame_equal(result, expected)
@@ -380,9 +352,7 @@ def test_consistency_with_window(test_frame):
     # consistent return values with window
     df = test_frame
     expected = Index([1, 2, 3], name="A")
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("A").resample("2s").mean()
+    result = df.groupby("A").resample("2s").mean()
     assert result.index.nlevels == 2
     tm.assert_index_equal(result.index.levels[0], expected)
@@ -479,13 +449,12 @@ def test_resample_groupby_agg_listlike():
 def test_empty(keys):
     # GH 26411
     df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([]))
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
+    result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
+    expected_columns = ["b"] if keys == ["a"] else []
     expected = (
         DataFrame(columns=["a", "b"])
         .set_index(keys, drop=False)
-        .set_index(TimedeltaIndex([]), append=True)
+        .set_index(TimedeltaIndex([]), append=True)[expected_columns]
     )
     if len(keys) == 1:
         expected.index.name = keys[0]
@@ -505,9 +474,7 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
     if consolidate:
         df = df._consolidate()

-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby(["key"]).resample("W", on="date").min()
+    result = df.groupby(["key"]).resample("W", on="date").min()
     idx = pd.MultiIndex.from_arrays(
         [
             ["A"] * 3 + ["B"] * 3,
@@ -519,7 +486,6 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
     )
     expected = DataFrame(
         {
-            "key": ["A"] * 3 + ["B"] * 3,
             "col1": [0, 5, 12] * 2,
             "col_object": ["val"] * 3 + [np.nan] * 3,
         },
@@ -557,12 +523,11 @@ def test_resample_no_index(keys):
     df = DataFrame([], columns=["a", "b", "date"])
     df["date"] = pd.to_datetime(df["date"])
     df = df.set_index("date")
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
+    result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
+    expected_columns = ["b"] if keys == ["a"] else []
     expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False)
     expected["date"] = pd.to_datetime(expected["date"])
-    expected = expected.set_index("date", append=True, drop=True)
+    expected = expected.set_index("date", append=True, drop=True)[expected_columns]
     if len(keys) == 1:
         expected.index.name = keys[0]
@@ -606,9 +571,7 @@ def test_groupby_resample_size_all_index_same():
         {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)},
         index=date_range("31/12/2000 18:00", freq="h", periods=12),
     )
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = df.groupby("A").resample("D").size()
+    result = df.groupby("A").resample("D").size()

     mi_exp = pd.MultiIndex.from_arrays(
         [
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
index f694b90a707c7..3cc95922e7f2f 100644
--- a/pandas/tests/resample/test_time_grouper.py
+++ b/pandas/tests/resample/test_time_grouper.py
@@ -351,14 +351,11 @@ def test_groupby_resample_interpolate_raises(groupy_test_df):

     dfs = [groupy_test_df, groupy_test_df_without_index_name]
     for df in dfs:
-        msg = "DataFrameGroupBy.resample operated on the grouping columns"
-        with tm.assert_produces_warning(DeprecationWarning, match=msg):
-            with pytest.raises(
-                NotImplementedError,
-                match="Direct interpolation of MultiIndex data frames is "
-                "not supported",
-            ):
-                df.groupby("volume").resample("1D").interpolate(method="linear")
+        with pytest.raises(
+            NotImplementedError,
+            match="Direct interpolation of MultiIndex data frames is not supported",
+        ):
+            df.groupby("volume").resample("1D").interpolate(method="linear")


 def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df):
@@ -373,7 +370,6 @@ def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df):
     for df in dfs:
         result = df.groupby("volume").apply(
             lambda x: x.resample("1D").interpolate(method="linear"),
-            include_groups=False,
         )

         volume = [50] * 15 + [60]
@@ -417,7 +413,7 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df):
     See GH#21351."""
     # GH#21351
     result = groupy_test_df.groupby("volume").apply(
-        lambda x: x.resample("265h").interpolate(method="linear"), include_groups=False
+        lambda x: x.resample("265h").interpolate(method="linear")
     )

     volume = [50, 50, 60]
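The wholesale removal of the assert_produces_warning(DeprecationWarning, ...) wrappers across the resample tests reflects the 3.0 enforcement: DataFrameGroupBy.resample and groupby().apply now exclude the grouping columns instead of warning about operating on them, which is also why several expected frames above drop their key columns. In brief:

    import pandas as pd

    df = pd.DataFrame(
        {"group": [1, 1, 2, 2], "val": [10, 20, 30, 40]},
        index=pd.date_range("2000-01-01", periods=4, freq="D"),
    )
    result = df.groupby("group").resample("2D").sum()
    # "group" now lives only in the MultiIndex, not as a value column.
    print(result.columns.tolist())  # ['val']
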
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index f0abc1afc6ab0..f0f67aebd85ec 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1464,7 +1464,10 @@ def test_merge_how_validation(self):
         data2 = DataFrame(
             np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
         )
-        msg = "'full' is not a valid Merge type: left, right, inner, outer, cross, asof"
+        msg = (
+            "'full' is not a valid Merge type: left, right, inner, outer, "
+            "left_anti, right_anti, cross, asof"
+        )
         with pytest.raises(ValueError, match=re.escape(msg)):
             data1.merge(data2, how="full")
diff --git a/pandas/tests/reshape/merge/test_merge_antijoin.py b/pandas/tests/reshape/merge/test_merge_antijoin.py
new file mode 100644
index 0000000000000..006622c6e5e94
--- /dev/null
+++ b/pandas/tests/reshape/merge/test_merge_antijoin.py
@@ -0,0 +1,280 @@
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    MultiIndex,
+)
+import pandas._testing as tm
+from pandas.core.reshape.merge import merge
+
+
+def test_merge_antijoin():
+    # GH#42916
+    left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"])
+    right = DataFrame({"B": [1, 2, 4]}, index=["a", "b", "d"])
+
+    result = merge(left, right, how="left_anti", left_index=True, right_index=True)
+    expected = DataFrame({"A": [3], "B": [np.nan]}, index=["c"])
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(left, right, how="right_anti", left_index=True, right_index=True)
+    expected = DataFrame({"A": [np.nan], "B": [4]}, index=["d"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_on_different_columns():
+    left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}).astype({"B": object})
+    right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}).astype(
+        {"D": object}
+    )
+
+    result = merge(left, right, how="left_anti", left_on="B", right_on="D")
+    expected = DataFrame(
+        {
+            "A": [3.0],
+            "B": ["c"],
+            "C": [np.nan],
+            "D": [np.nan],
+        },
+        index=[2],
+    ).astype({"B": object, "D": object})
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(left, right, how="right_anti", left_on="B", right_on="D")
+    expected = DataFrame(
+        {
+            "A": [np.nan],
+            "B": [np.nan],
+            "C": [2.0],
+            "D": ["d"],
+        },
+        index=[1],
+    ).astype({"B": object, "D": object})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_nonunique_keys():
+    left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}).astype({"B": object})
+    right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}).astype(
+        {"D": object}
+    )
+
+    result = merge(left, right, how="left_anti", left_on="B", right_on="D")
+    expected = DataFrame(
+        {
+            "A": [1.0],
+            "B": ["a"],
+            "C": [np.nan],
+            "D": [np.nan],
+        },
+        index=[0],
+    ).astype({"B": object, "D": object})
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(left, right, how="right_anti", left_on="B", right_on="D")
+    expected = DataFrame(
+        {
+            "A": [np.nan, np.nan],
+            "B": [np.nan, np.nan],
+            "C": [2.0, 4.0],
+            "D": ["d", "d"],
+        },
+        index=[2, 3],
+    ).astype({"B": object, "D": object})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_same_df():
+    left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"], dtype=np.int64)
+    result = merge(left, left, how="left_anti", left_index=True, right_index=True)
+    expected = DataFrame([], columns=["A_x", "A_y"], dtype=np.int64)
+    tm.assert_frame_equal(result, expected, check_index_type=False)
+
+
+def test_merge_antijoin_nans():
+    left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}).astype(
+        {"C": object}
+    )
+    right = DataFrame({"A": [3.0, 2.0, np.nan], "D": ["d", "e", "f"]}).astype(
+        {"D": object}
+    )
+    result = merge(left, right, how="left_anti", on="A")
+    expected = DataFrame({"A": [1.0], "C": ["a"], "D": [np.nan]}).astype(
+        {"C": object, "D": object}
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_on_datetime64tz():
+    # GH11405
+    left = DataFrame(
+        {
+            "key": pd.date_range("20151010", periods=2, tz="US/Eastern"),
+            "value": [1.0, 2.0],
+        }
+    )
+    right = DataFrame(
+        {
+            "key": pd.date_range("20151011", periods=3, tz="US/Eastern"),
+            "value": [1.0, 2.0, 3.0],
+        }
+    )
+
+    expected = DataFrame(
+        {
+            "key": pd.date_range("20151010", periods=1, tz="US/Eastern"),
+            "value_x": [1.0],
+            "value_y": [np.nan],
+        },
+        index=[0],
+    )
+    result = merge(left, right, on="key", how="left_anti")
+    tm.assert_frame_equal(result, expected)
+
+    expected = DataFrame(
+        {
+            "key": pd.date_range("20151012", periods=2, tz="US/Eastern"),
+            "value_x": [np.nan, np.nan],
+            "value_y": [2.0, 3.0],
+        },
+        index=[1, 2],
+    )
+    result = merge(left, right, on="key", how="right_anti")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_multiindex():
+    left = DataFrame(
+        {
+            "A": [1, 2, 3],
+            "B": [4, 5, 6],
+        },
+        index=MultiIndex.from_tuples(
+            [("a", "x"), ("b", "y"), ("c", "z")], names=["first", "second"]
+        ),
+    )
+    right = DataFrame(
+        {
+            "C": [7, 8, 9],
+            "D": [10, 11, 12],
+        },
+        index=MultiIndex.from_tuples(
+            [("a", "x"), ("b", "y"), ("c", "w")], names=["first", "second"]
+        ),
+    )
+
+    result = merge(left, right, how="left_anti", left_index=True, right_index=True)
+    expected = DataFrame(
+        {
+            "A": [3],
+            "B": [6],
+            "C": [np.nan],
+            "D": [np.nan],
+        },
+        index=MultiIndex.from_tuples([("c", "z")], names=["first", "second"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(left, right, how="right_anti", left_index=True, right_index=True)
+    expected = DataFrame(
+        {
+            "A": [np.nan],
+            "B": [np.nan],
+            "C": [9],
+            "D": [12],
+        },
+        index=MultiIndex.from_tuples([("c", "w")], names=["first", "second"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "Int64",
+        pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param("timestamp[s][pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+    ],
+)
+def test_merge_antijoin_extension_dtype(dtype):
+    left = DataFrame(
+        {
+            "join_col": [1, 3, 5],
+            "left_val": [1, 2, 3],
+        }
+    )
+    right = DataFrame(
+        {
+            "join_col": [2, 3, 4],
+            "right_val": [1, 2, 3],
+        }
+    )
+    left = left.astype({"join_col": dtype})
+    right = right.astype({"join_col": dtype})
+    result = merge(left, right, how="left_anti", on="join_col")
+    expected = DataFrame(
+        {
+            "join_col": [1, 5],
+            "left_val": [1, 3],
+            "right_val": [np.nan, np.nan],
+        },
+        index=[0, 2],
+    )
+    expected = expected.astype({"join_col": dtype})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_empty_dataframe():
+    left = DataFrame({"A": [], "B": []})
+    right = DataFrame({"C": [], "D": []})
+
+    result = merge(left, right, how="left_anti", left_on="A", right_on="C")
+    expected = DataFrame({"A": [], "B": [], "C": [], "D": []})
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(left, right, how="right_anti", left_on="A", right_on="C")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_no_common_elements():
+    left = DataFrame({"A": [1, 2, 3]})
+    right = DataFrame({"B": [4, 5, 6]})
+
+    result = merge(left, right, how="left_anti", left_on="A", right_on="B")
+    expected = DataFrame({"A": [1, 2, 3], "B": [np.nan, np.nan, np.nan]})
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(left, right, how="right_anti", left_on="A", right_on="B")
+    expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": [4, 5, 6]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_with_null_values():
+    left = DataFrame({"A": [1.0, 2.0, None, 4.0]})
+    right = DataFrame({"B": [2.0, None, 5.0]})
+
+    result = merge(left, right, how="left_anti", left_on="A", right_on="B")
+    expected = DataFrame({"A": [1.0, 4.0], "B": [np.nan, np.nan]}, index=[0, 3])
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(left, right, how="right_anti", left_on="A", right_on="B")
+    expected = DataFrame({"A": [np.nan], "B": [5.0]}, index=[2])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_merge_antijoin_with_mixed_dtypes():
+    left = DataFrame({"A": [1, "2", 3.0]})
+    right = DataFrame({"B": ["2", 3.0, 4]})
+
+    result = merge(left, right, how="left_anti", left_on="A", right_on="B")
+    expected = DataFrame({"A": [1], "B": [np.nan]}, dtype=object)
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(left, right, how="right_anti", left_on="A", right_on="B")
+    expected = DataFrame({"A": [np.nan], "B": [4]}, dtype=object, index=[2])
+    tm.assert_frame_equal(result, expected)
"left_val": [1, 3], + "right_val": [np.nan, np.nan], + }, + index=[0, 2], + ) + expected = expected.astype({"join_col": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_empty_dataframe(): + left = DataFrame({"A": [], "B": []}) + right = DataFrame({"C": [], "D": []}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="C") + expected = DataFrame({"A": [], "B": [], "C": [], "D": []}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="C") + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_no_common_elements(): + left = DataFrame({"A": [1, 2, 3]}) + right = DataFrame({"B": [4, 5, 6]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1, 2, 3], "B": [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": [4, 5, 6]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_with_null_values(): + left = DataFrame({"A": [1.0, 2.0, None, 4.0]}) + right = DataFrame({"B": [2.0, None, 5.0]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1.0, 4.0], "B": [np.nan, np.nan]}, index=[0, 3]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [5.0]}, index=[2]) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_with_mixed_dtypes(): + left = DataFrame({"A": [1, "2", 3.0]}) + right = DataFrame({"B": ["2", 3.0, 4]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1], "B": [np.nan]}, dtype=object) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [4]}, dtype=object, index=[2]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py index 14f9036e43fce..6ab80cf0e0823 100644 --- a/pandas/tests/reshape/merge/test_merge_cross.py +++ b/pandas/tests/reshape/merge/test_merge_cross.py @@ -42,8 +42,7 @@ def test_merge_cross_error_reporting(kwargs): left = DataFrame({"a": [1, 3]}) right = DataFrame({"b": [3, 4]}) msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" + "Can not pass on, right_on, left_on or set right_index=True or left_index=True" ) with pytest.raises(MergeError, match=msg): merge(left, right, how="cross", **kwargs) @@ -94,8 +93,7 @@ def test_join_cross_error_reporting(): left = DataFrame({"a": [1, 3]}) right = DataFrame({"a": [3, 4]}) msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" + "Can not pass on, right_on, left_on or set right_index=True or left_index=True" ) with pytest.raises(MergeError, match=msg): left.join(right, how="cross", on="a") diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index da1930323f464..c7b7992a78232 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -364,7 
+362,6 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ @@ -401,12 +398,14 @@ def test_with_prefix_contains_get_dummies_NaN_column(): ], ) def test_with_prefix_default_category( - dummies_with_unassigned, default_category, expected + dummies_with_unassigned, default_category, expected, using_infer_string ): result = from_dummies( dummies_with_unassigned, sep="_", default_category=default_category ) expected = DataFrame(expected) + if using_infer_string: + expected = expected.astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f42f7f8232229..374d236c8ff39 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2668,6 +2668,8 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() + # this still fails because columns=None gets passed down to unstack as level=None + # while at that point None was converted to NaN @pytest.mark.xfail( using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" ) @@ -2686,10 +2688,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) - def test_pivot_index_is_none(self): + def test_pivot_index_is_none(self, using_infer_string): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2700,11 +2699,10 @@ def test_pivot_index_is_none(self): result = df.pivot(columns="b", index=None, values="c") expected = DataFrame(3, index=[1], columns=Index([2], name="b")) + if using_infer_string: + expected.index.name = np.nan tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index fe51817a78be8..baaedaa853565 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -991,7 +991,6 @@ def test_properties_quarterly(self): qedec_date = Period(freq="Q-DEC", year=2007, quarter=1) qejan_date = Period(freq="Q-JAN", year=2007, quarter=1) qejun_date = Period(freq="Q-JUN", year=2007, quarter=1) - # for x in range(3): for qd in (qedec_date, qejan_date, qejun_date): assert (qd + x).qyear == 2007 @@ -1016,7 +1015,6 @@ def test_properties_monthly(self): def test_properties_weekly(self): # Test properties on Periods with daily frequency. w_date = Period(freq="W", year=2007, month=1, day=7) - # assert w_date.year == 2007 assert w_date.quarter == 1 assert w_date.month == 1 @@ -1046,7 +1044,6 @@ def test_properties_daily(self): # Test properties on Periods with daily frequency. with tm.assert_produces_warning(FutureWarning, match=bday_msg): b_date = Period(freq="B", year=2007, month=1, day=1) - # assert b_date.year == 2007 assert b_date.quarter == 1 assert b_date.month == 1 @@ -1089,7 +1086,6 @@ def test_properties_hourly(self): def test_properties_minutely(self): # Test properties on Periods with minutely frequency. 
t_date = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) - # assert t_date.quarter == 1 assert t_date.month == 1 assert t_date.day == 1 @@ -1108,7 +1104,6 @@ def test_properties_secondly(self): s_date = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) - # assert s_date.year == 2007 assert s_date.quarter == 1 assert s_date.month == 1 diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index e029dfc3b2703..45caeb1733590 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -353,8 +353,7 @@ def test_construction(): Timedelta("foo") msg = ( - "cannot construct a Timedelta from " - "the passed arguments, allowed keywords are " + "cannot construct a Timedelta from the passed arguments, allowed keywords are " ) with pytest.raises(ValueError, match=msg): Timedelta(day=10) diff --git a/pandas/tests/scalar/timestamp/methods/test_round.py b/pandas/tests/scalar/timestamp/methods/test_round.py index 944aa55727217..6b27e5e6c5554 100644 --- a/pandas/tests/scalar/timestamp/methods/test_round.py +++ b/pandas/tests/scalar/timestamp/methods/test_round.py @@ -165,7 +165,6 @@ def test_round_dst_border_ambiguous(self, method, unit): # GH 18946 round near "fall back" DST ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") ts = ts.as_unit(unit) - # result = getattr(ts, method)("h", ambiguous=True) assert result == ts assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index d3556b644c4bf..02efd850f9640 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -253,25 +253,17 @@ def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) s.loc["B"] = timedelta(1) - expected = Series( - Timedelta("1 days"), dtype="timedelta64[ns]", index=Index(["B"], dtype=object) - ) + expected = Series(Timedelta("1 days"), dtype="timedelta64[ns]", index=["B"]) tm.assert_series_equal(s, expected) s = s.reindex(s.index.insert(0, "A")) expected = Series( - [np.nan, Timedelta("1 days")], - dtype="timedelta64[ns]", - index=Index(["A", "B"], dtype=object), + [np.nan, Timedelta("1 days")], dtype="timedelta64[ns]", index=["A", "B"] ) tm.assert_series_equal(s, expected) s.loc["A"] = timedelta(1) - expected = Series( - Timedelta("1 days"), - dtype="timedelta64[ns]", - index=Index(["A", "B"], dtype=object), - ) + expected = Series(Timedelta("1 days"), dtype="timedelta64[ns]", index=["A", "B"]) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/indexing/test_set_value.py b/pandas/tests/series/indexing/test_set_value.py index 99e71fa4b804b..3bf89da53d923 100644 --- a/pandas/tests/series/indexing/test_set_value.py +++ b/pandas/tests/series/indexing/test_set_value.py @@ -3,17 +3,21 @@ import numpy as np from pandas import ( + DatetimeIndex, Index, Series, ) import pandas._testing as tm -def test_series_set_value(): +def test_series_set_value(using_infer_string): # GH#1561, GH#51363 as of 3.0 we do not do inference in Index.insert dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] - index = Index(dates, dtype=object) + if using_infer_string: + index = DatetimeIndex(dates) + else: + index = Index(dates, dtype=object) s = Series(dtype=object) s._set_value(dates[0], 1.0) diff --git a/pandas/tests/series/indexing/test_setitem.py 
b/pandas/tests/series/indexing/test_setitem.py index 158198239ba75..964f0b90b3c41 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -9,7 +9,6 @@ import numpy as np import pytest -from pandas.compat import WASM from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError @@ -475,12 +474,14 @@ def test_setitem_callable_other(self): class TestSetitemWithExpansion: - def test_setitem_empty_series(self): - # GH#10193, GH#51363 changed in 3.0 to not do inference in Index.insert + def test_setitem_empty_series(self, using_infer_string): + # GH#10193 key = Timestamp("2012-01-01") series = Series(dtype=object) series[key] = 47 - expected = Series(47, Index([key], dtype=object)) + expected = Series( + 47, index=[key] if using_infer_string else Index([key], dtype=object) + ) tm.assert_series_equal(series, expected) def test_setitem_empty_series_datetimeindex_preserves_freq(self): @@ -537,10 +538,7 @@ def test_setitem_with_expansion_type_promotion(self): ser["a"] = Timestamp("2016-01-01") ser["b"] = 3.0 ser["c"] = "foo" - expected = Series( - [Timestamp("2016-01-01"), 3.0, "foo"], - index=Index(["a", "b", "c"], dtype=object), - ) + expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) tm.assert_series_equal(ser, expected) def test_setitem_not_contained(self, string_series): @@ -1449,7 +1447,6 @@ def obj(self): np_version_gte1p24 and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak" ) - or WASM ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", diff --git a/pandas/tests/series/methods/test_between.py b/pandas/tests/series/methods/test_between.py index e67eafbd118ce..f035767e2ce0e 100644 --- a/pandas/tests/series/methods/test_between.py +++ b/pandas/tests/series/methods/test_between.py @@ -66,8 +66,7 @@ def test_between_error_args(self, inclusive): left, right = series[[2, 7]] value_error_msg = ( - "Inclusive has to be either string of 'both'," - "'left', 'right', or 'neither'." + "Inclusive has to be either string of 'both','left', 'right', or 'neither'." 
) series = Series(date_range("1/1/2000", periods=10)) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 293919173c2d5..51d6704e1905b 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -31,7 +31,7 @@ def test_combine_first_name(self, datetime_series): result = datetime_series.combine_first(datetime_series[:5]) assert result.name == datetime_series.name - def test_combine_first(self): + def test_combine_first(self, using_infer_string): values = np.arange(20, dtype=np.float64) series = Series(values, index=np.arange(20, dtype=np.int64)) @@ -64,7 +64,8 @@ def test_combine_first(self): ser = Series([1.0, 2, 3], index=[0, 1, 2]) empty = Series([], index=[], dtype=object) result = ser.combine_first(empty) - ser.index = ser.index.astype("O") + if not using_infer_string: + ser.index = ser.index.astype("O") tm.assert_series_equal(result, ser.astype(object)) def test_combine_first_dt64(self, unit): diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 90c4056a39e84..d373386108ff6 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -297,3 +297,14 @@ def test_convert_dtypes_pyarrow_null(self): result = ser.convert_dtypes(dtype_backend="pyarrow") expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) tm.assert_series_equal(result, expected) + + def test_convert_dtype_pyarrow_timezone_preserve(self): + # GH 60237 + pytest.importorskip("pyarrow") + ser = pd.Series( + pd.to_datetime(range(5), utc=True, unit="h"), + dtype="timestamp[ns, tz=UTC][pyarrow]", + ) + result = ser.convert_dtypes(dtype_backend="pyarrow") + expected = ser.copy() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index e997ae32cf2e2..4f8484252ba8f 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -211,6 +211,30 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "dtype, data, values, expected", + [ + ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]), + ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]), + ("boolean", [pd.NA, False, True], [pd.NA, True, "a", 20], [True, False, True]), + ("boolean", [pd.NA, False, True], [], [False, False, False]), + ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]), + ], +) +def test_isin_large_series_and_pdNA(dtype, data, values, expected, monkeypatch): + # https://github.com/pandas-dev/pandas/issues/60678 + # combination of large series (> _MINIMUM_COMP_ARR_LEN elements) and + # values contains pdNA + min_isin_comp = 2 + ser = Series(data, dtype=dtype) + expected = Series(expected, dtype="boolean") + + with monkeypatch.context() as m: + m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp) + result = ser.isin(values) + tm.assert_series_equal(result, expected) + + def test_isin_complex_numbers(): # GH 17927 array = [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j] diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index ecfe3d1b39d31..abd5d075ea3d5 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -708,3 +708,10 @@ def 
test_replace_ea_float_with_bool(self): expected = ser.copy() result = ser.replace(0.0, True) tm.assert_series_equal(result, expected) + + def test_replace_all_NA(self): + # GH#60688 + df = pd.Series([pd.NA, pd.NA]) + result = df.replace({r"^#": "$"}, regex=True) + expected = pd.Series([pd.NA, pd.NA]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 69f42b5e42878..a2be698c0ec28 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -90,6 +90,13 @@ def test_unparsable_strings_with_dt64_dtype(self): with pytest.raises(ValueError, match=msg): Series(np.array(vals, dtype=object), dtype="datetime64[ns]") + def test_invalid_dtype_conversion_datetime_to_timedelta(self): + # GH#60728 + vals = Series([NaT, Timestamp(2025, 1, 1)], dtype="datetime64[ns]") + msg = r"^Cannot cast DatetimeArray to dtype timedelta64\[ns\]$" + with pytest.raises(TypeError, match=msg): + Series(vals, dtype="timedelta64[ns]") + @pytest.mark.parametrize( "constructor", [ diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a9d5486139b46..db83cf1112e74 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -227,3 +229,56 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cum_methods_ea_strings( + self, string_dtype_no_object, data, op, skipna, expected_data + ): + # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow + # https://github.com/pandas-dev/pandas/pull/60938 - Python + 
ser = pd.Series(data, dtype=string_dtype_no_object) + method = getattr(ser, op) + expected = pd.Series(expected_data, dtype=string_dtype_no_object) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 92b7b16da3c1f..5bcbb16da3be9 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -68,6 +68,7 @@ "get_dummies", "isalnum", "isalpha", + "isascii", "isdecimal", "isdigit", "islower", diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 34a6377b5786f..30e6ebf0eed13 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -293,23 +293,12 @@ def test_startswith_endswith_validate_na(any_string_dtype): dtype=any_string_dtype, ) - dtype = ser.dtype - if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"): - msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.startswith("kapow", na="baz") - msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.endswith("bar", na="baz") - else: - # TODO(infer_string): don't surface pyarrow errors - import pyarrow as pa - - msg = "Could not convert 'baz' with type str: tried to convert to boolean" - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.startswith("kapow", na="baz") - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.endswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 3b989e284ca25..16e10c6fcdccd 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas import ( @@ -13,11 +11,6 @@ _testing as tm, ) -try: - import pyarrow as pa -except ImportError: - pa = None - def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) @@ -98,30 +91,12 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=str) - expected = DataFrame( - [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], - columns=list("abc"), - dtype=str, - ) - tm.assert_frame_equal(result, expected) + msg = "Only numeric or boolean dtypes are supported for 'dtype'" + with 
pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype=str) -# GH#47872 -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pa_str_dtype(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype="str[pyarrow]") - expected = DataFrame( - [ - ["true", "true", "false"], - ["true", "false", "true"], - ["false", "false", "false"], - ], - columns=list("abc"), - dtype="str[pyarrow]", - ) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + s.str.get_dummies("|", dtype="datetime64[ns]") diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index cd3c512328139..c5414022e664b 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -83,6 +83,7 @@ def test_string_array_numeric_integer_array(nullable_string_dtype, method, expec [ ("isdigit", [False, None, True]), ("isalpha", [True, None, False]), + ("isascii", [True, None, True]), ("isalnum", [True, None, True]), ("isnumeric", [False, None, True]), ], diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 75a2007b61640..025f837982595 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -95,6 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) + empty_inferred_str = Series(dtype="str") if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) @@ -154,11 +155,12 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) + tm.assert_series_equal(empty_bool, empty.str.isascii()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) @@ -177,6 +179,7 @@ def test_empty_str_methods(any_string_dtype): @pytest.mark.parametrize( "method, expected", [ + ("isascii", [True, True, True, True, True, True, True, True, True, True]), ("isalnum", [True, True, True, True, True, False, True, True, False, False]), ("isalpha", [True, True, True, False, False, False, True, False, False, False]), ( @@ -564,7 +567,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")).astype(object) + expected = Series(["a", "b", "a\xe4"], dtype="str") tm.assert_series_equal(result, expected) @@ -594,10 +597,34 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str") tm.assert_series_equal(result, expected) +def 
test_decode_string_dtype(string_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + result = ser.str.decode("utf-8", dtype=string_dtype) + expected = Series(["a", "b"], dtype=string_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_object_dtype(object_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", rb"\ud800"]) + result = ser.str.decode("utf-8", dtype=object_dtype) + expected = Series(["a", r"\ud800"], dtype=object_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_bad_dtype(): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + msg = "dtype must be string or object, got dtype='int64'" + with pytest.raises(ValueError, match=msg): + ser.str.decode("utf-8", dtype="int64") + + @pytest.mark.parametrize( "form, expected", [ @@ -749,5 +776,5 @@ def test_get_with_dict_label(): def test_series_str_decode(): # GH 22613 result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict") - expected = Series(["x", "y"], dtype="object") + expected = Series(["x", "y"], dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 18df76ddd8ed8..76fad35304fe6 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -20,6 +20,7 @@ TimedeltaIndex, ) import pandas._testing as tm +from pandas.util.version import Version @pytest.fixture @@ -222,7 +223,7 @@ def test_missing_required_dependency(): assert name in output -def test_frame_setitem_dask_array_into_new_col(): +def test_frame_setitem_dask_array_into_new_col(request): # GH#47128 # dask sets "compute.use_numexpr" to False, so catch the current value @@ -230,7 +231,14 @@ def test_frame_setitem_dask_array_into_new_col(): olduse = pd.get_option("compute.use_numexpr") try: + dask = pytest.importorskip("dask") da = pytest.importorskip("dask.array") + if Version(dask.__version__) <= Version("2025.1.0") and Version( + np.__version__ + ) >= Version("2.1"): + request.applymarker( + pytest.mark.xfail(reason="loc.__setitem__ incorrectly mutated column c") + ) dda = da.array([1, 2]) df = DataFrame({"a": ["a", "b"]}) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 74b051aec71a4..e039f54960389 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -794,12 +794,36 @@ def test_to_datetime_np_str(self): ["2015-1-1", "%G-%V-%u", datetime(2014, 12, 29, 0, 0)], ["2015-1-4", "%G-%V-%u", datetime(2015, 1, 1, 0, 0)], ["2015-1-7", "%G-%V-%u", datetime(2015, 1, 4, 0, 0)], + ["2024-52-1", "%G-%V-%u", datetime(2024, 12, 23, 0, 0)], + ["2024-52-7", "%G-%V-%u", datetime(2024, 12, 29, 0, 0)], + ["2025-1-1", "%G-%V-%u", datetime(2024, 12, 30, 0, 0)], + ["2020-53-1", "%G-%V-%u", datetime(2020, 12, 28, 0, 0)], ], ) def test_to_datetime_iso_week_year_format(self, s, _format, dt): # See GH#16607 assert to_datetime(s, format=_format) == dt + @pytest.mark.parametrize( + "msg, s, _format", + [ + [ + "Week 53 does not exist in ISO year 2024", + "2024 53 1", + "%G %V %u", + ], + [ + "Week 53 does not exist in ISO year 2023", + "2023 53 1", + "%G %V %u", + ], + ], + ) + def test_invalid_iso_week_53(self, msg, s, _format): + # See GH#60885 + with pytest.raises(ValueError, match=msg): + to_datetime(s, format=_format) + @pytest.mark.parametrize( "msg, s, _format", [ @@ -1935,7 +1959,7 @@ def test_to_datetime_unit_na_values(self): 
@pytest.mark.parametrize("bad_val", ["foo", 111111111]) def test_to_datetime_unit_invalid(self, bad_val): if bad_val == "foo": - msg = "Unknown datetime string format, unable to parse: " f"{bad_val}" + msg = f"Unknown datetime string format, unable to parse: {bad_val}" else: msg = "cannot convert input 111111111 with the unit 'D'" with pytest.raises(ValueError, match=msg): @@ -2258,7 +2282,7 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): [ '^unconverted data remains when parsing with format ".*": ".*". ' f"{PARSING_ERR_MSG}$", - f'^time data ".*" doesn\'t match format ".*". ' f"{PARSING_ERR_MSG}$", + f'^time data ".*" doesn\'t match format ".*". {PARSING_ERR_MSG}$', ] ) with pytest.raises( diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index f3645bf0649bd..893f526fb3eb0 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -192,7 +192,7 @@ def test_numeric_df_columns(columns): # see gh-14827 df = DataFrame( { - "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], + "a": [1.2, decimal.Decimal("3.14"), decimal.Decimal("infinity"), "0.1"], "b": [1.0, 2.0, 3.0, 4.0], } ) @@ -207,10 +207,10 @@ def test_numeric_df_columns(columns): "data,exp_data", [ ( - [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1], + [[decimal.Decimal("3.14"), 1.0], decimal.Decimal("1.6"), 0.1], [[3.14, 1.0], 1.6, 0.1], ), - ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]), + ([np.array([decimal.Decimal("3.14"), 1.0]), 0.1], [[3.14, 1.0], 0.1]), ], ) def test_numeric_embedded_arr_likes(data, exp_data): diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index d0192c12f9518..7480b99595066 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -798,9 +798,9 @@ def test_get_offset(): for name, expected in pairs: offset = _get_offset(name) - assert ( - offset == expected - ), f"Expected {name!r} to yield {expected!r} (actual: {offset!r})" + assert offset == expected, ( + f"Expected {name!r} to yield {expected!r} (actual: {offset!r})" + ) def test_get_offset_legacy(): diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index f91230e1460c4..46b6846ad1ec2 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -289,8 +289,7 @@ def test_tick_rdiv(cls): td64 = delta.to_timedelta64() instance__type = ".".join([cls.__module__, cls.__name__]) msg = ( - "unsupported operand type\\(s\\) for \\/: 'int'|'float' and " - f"'{instance__type}'" + f"unsupported operand type\\(s\\) for \\/: 'int'|'float' and '{instance__type}'" ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 07425af8ed37a..bc5cd5fcccbf8 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -134,10 +134,7 @@ def test_does_not_convert_mixed_integer(date_string, expected): ( "2013Q1", {"freq": "INVLD-L-DEC-SAT"}, - ( - "Unable to retrieve month information " - "from given freq: INVLD-L-DEC-SAT" - ), + ("Unable to retrieve month information from given freq: INVLD-L-DEC-SAT"), ), ], ) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 15eaa8c167487..877b50e37670c 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -177,6 +177,38 @@ 
def test_agg_nested_dicts(): r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) +@pytest.mark.parametrize( + "func,window_size", + [ + ( + "rolling", + 2, + ), + ( + "expanding", + None, + ), + ], +) +def test_pipe(func, window_size): + # Issue #57076 + df = DataFrame( + { + "B": np.random.default_rng(2).standard_normal(10), + "C": np.random.default_rng(2).standard_normal(10), + } + ) + r = getattr(df, func)(window_size) + + expected = r.max() - r.mean() + result = r.pipe(lambda x: x.max() - x.mean()) + tm.assert_frame_equal(result, expected) + + expected = r.max() - 2 * r.min() + result = r.pipe(lambda x, k: x.max() - k * x.min(), k=2) + tm.assert_frame_equal(result, expected) + + def test_count_nonnumeric_types(step): # GH12541 cols = [ diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py index c60cb6ea74ec0..feb25a294c540 100644 --- a/pandas/tests/window/test_cython_aggregations.py +++ b/pandas/tests/window/test_cython_aggregations.py @@ -30,6 +30,8 @@ def _get_rolling_aggregations(): ("roll_median_c", window_aggregations.roll_median_c), ("roll_max", window_aggregations.roll_max), ("roll_min", window_aggregations.roll_min), + ("roll_first", window_aggregations.roll_first), + ("roll_last", window_aggregations.roll_last), ] + [ ( diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index b2f76bdd0e2ad..39cedc3b692da 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -451,6 +451,8 @@ def test_moment_functions_zero_length_pairwise(f): lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), lambda x: x.expanding(min_periods=5).max(), lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).first(), + lambda x: x.expanding(min_periods=5).last(), lambda x: x.expanding(min_periods=5).sum(), lambda x: x.expanding(min_periods=5).mean(), lambda x: x.expanding(min_periods=5).std(), @@ -596,6 +598,104 @@ def test_expanding_corr_pairwise_diff_length(): tm.assert_frame_equal(result4, expected) +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [float("nan"), float("nan"), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [ + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [ + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 5.0, + 5.0, + 7.0, + 7.0, + 9.0, + 9.0, + ], + ), + ], +) +def test_expanding_first_last(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.expanding(3), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.expanding(3), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [1.0] * 10, + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [1.0] * 10, + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 
10.0], + "last", + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0], + ), + ], +) +def test_expanding_first_last_no_minp(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.expanding(min_periods=0), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.expanding(min_periods=0), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + def test_expanding_apply_args_kwargs(engine_and_raw): def mean_w_arg(x, const): return np.mean(x) + const diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index f8e804bf434e9..392239b8adadd 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -91,6 +91,8 @@ def test_getitem_multiple(self, roll_frame): "mean", "min", "max", + "first", + "last", "count", "kurt", "skew", @@ -101,11 +103,7 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -117,11 +115,7 @@ def test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -135,13 +129,9 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -182,9 +172,7 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -200,9 +188,7 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on 
the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -247,11 +233,7 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -826,13 +808,9 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).sum()).index + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(lambda x: x.rolling(4).sum()).index + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1008,13 +986,11 @@ def test_groupby_monotonic(self): df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -1033,13 +1009,9 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = ( - df.set_index("B") - .groupby("A") - .apply(lambda x: x.rolling("4s")["C"].mean()) - ) + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1062,18 +1034,14 @@ def frame(self): return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @pytest.mark.parametrize( - "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] + "f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"] ) def test_expanding(self, f, frame): g = frame.groupby("A", group_keys=False) r = g.expanding() result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) 
expected.index = expected_index @@ -1085,11 +1053,7 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1103,13 +1067,9 @@ def test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1125,9 +1085,7 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func_0) + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1142,9 +1100,7 @@ def func_0(x): def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func_1) + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1153,13 +1109,7 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 120dbe788a23f..887aeca6590dc 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td @@ -11,8 +12,17 @@ to_datetime, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) 
@pytest.fixture(params=["single", "table"]) diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index 14d3a39107bc4..43d55a7992b3c 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -1,15 +1,24 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm + from pandas import ( DataFrame, Series, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] -pytest.importorskip("numba") +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.mark.filterwarnings("ignore") diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index af3194b5085c4..2aaa35ec5ec2c 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1326,6 +1326,82 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [float("nan"), float("nan"), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [float("nan")] * 10, + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [float("nan")] * 10, + ), + ], +) +def test_rolling_first_last(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.rolling(3), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.rolling(3), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "values,method,expected", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "first", + [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "first", + [1.0, 1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "last", + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + ), + ( + [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan], + "last", + [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0], + ), + ], +) +def test_rolling_first_last_no_minp(values, method, expected): + # GH#33155 + x = Series(values) + result = getattr(x.rolling(3, min_periods=0), method)() + expected = Series(expected) + tm.assert_almost_equal(result, expected) + + x = DataFrame({"A": values}) + result = getattr(x.rolling(3, min_periods=0), method)() + expected = DataFrame({"A": expected}) + tm.assert_almost_equal(result, expected) + + def test_groupby_rolling_nan_included(): # GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index f77a98ae9a7d9..6820ab7332975 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -340,6 +340,8 @@ def 
test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).first(), + lambda x: x.rolling(window=10, min_periods=5).last(), lambda x: x.rolling(window=10, min_periods=5).quantile(q=0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), @@ -501,6 +503,8 @@ def test_rolling_min_max_numeric_types(any_real_numpy_dtype): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).first(), + lambda x: x.rolling(window=10, min_periods=5).last(), lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index eacdaddfa28b0..043f369566a5d 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -541,6 +541,42 @@ def test_ragged_max(self, ragged): expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) + def test_ragged_first(self, ragged): + df = ragged + + result = df.rolling(window="1s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).first() + expected = df.copy() + expected["B"] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_ragged_last(self, ragged): + df = ragged + + result = df.rolling(window="1s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).last() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "freq, op, result_data", [ @@ -586,6 +622,8 @@ def test_freqs_ops(self, freq, op, result_data): "skew", "min", "max", + "first", + "last", ], ) def test_all(self, f, regular): diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 9a01568971af8..88ea1bfa3c6ed 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -145,8 +145,7 @@ def infer_freq( pass elif isinstance(index.dtype, PeriodDtype): raise TypeError( - "PeriodIndex given. Check the `freq` attribute " - "instead of using infer_freq." + "PeriodIndex given. Check the `freq` attribute instead of using infer_freq." 
) elif lib.is_np_dtype(index.dtype, "m"): # Allow TimedeltaIndex and TimedeltaArray diff --git a/pyproject.toml b/pyproject.toml index 7ab9cd2c17669..b7d53b0d8934a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -272,8 +272,6 @@ ignore = [ "B007", # controversial "B008", - # setattr is used to side-step mypy - "B009", # getattr is used to side-step mypy "B010", # tests use comparisons but not their returned value @@ -362,8 +360,6 @@ ignore = [ "PLR1733", # 5 errors, it seems like we wannt to ignore these # Unnecessary lookup of list item by index "PLR1736", # 4 errors, we're currently having inline pylint ignore - # empty-comment - "PLR2044", # autofixable # Unpacking a dictionary in iteration without calling `.items()` "PLE1141", # autofixable # import-outside-toplevel @@ -746,5 +742,5 @@ exclude_lines = [ directory = "coverage_html_report" [tool.codespell] -ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru" +ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru, indx, abd, ABD" ignore-regex = 'https://([\w/\.])+' diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index 4cdbf8db0ba89..d326dd3637314 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Add 3rd party licenses, like numpy does for file in $PACKAGE_DIR/LICENSES/*; do cat $file >> $PACKAGE_DIR/LICENSE diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh index 5153ebd691f3b..f9e1e68d8efba 100644 --- a/scripts/cibw_before_build_windows.sh +++ b/scripts/cibw_before_build_windows.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Add 3rd party licenses, like numpy does for file in $PACKAGE_DIR/LICENSES/*; do cat $file >> $PACKAGE_DIR/LICENSE diff --git a/scripts/cibw_before_test_windows.sh b/scripts/cibw_before_test_windows.sh index dd02bc23dd5a1..8878e3950452f 100644 --- a/scripts/cibw_before_test_windows.sh +++ b/scripts/cibw_before_test_windows.sh @@ -1,3 +1,4 @@ +#!/bin/bash # TODO: Delete when there's a NumPy Windows wheel for the free-threaded build on PyPI. FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" if [[ $FREE_THREADED_BUILD == "True" ]]; then diff --git a/scripts/download_wheels.sh b/scripts/download_wheels.sh index 84279ac7a04d1..3dcae0cadcdcf 100755 --- a/scripts/download_wheels.sh +++ b/scripts/download_wheels.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Download all wheels for a pandas version. # @@ -11,11 +11,12 @@ # one by one to the dist/ directory where they would be generated. 
VERSION=$1
-mkdir -p $(dirname -- $0)/../dist
-DIST_DIR="$(realpath $(dirname -- $0)/../dist)"
+BASE_DIR=$(dirname -- $0)
+mkdir -p $BASE_DIR/../dist
+DIST_DIR="$(realpath $BASE_DIR/../dist)"

-if [ -z $VERSION ]; then
-    printf "Usage:\n\t$0 <version>\n\nWhere <version> is for example 1.5.3"
+if [ -z "$VERSION" ]; then
+    printf "Usage:\n\t%s <version>\n\nWhere <version> is for example 1.5.3" "$0"
    exit 1
fi

@@ -23,7 +24,7 @@ curl "https://anaconda.org/multibuild-wheels-staging/pandas/files?version=${VERS
    grep "href=\"/multibuild-wheels-staging/pandas/${VERSION}" | \
    sed -r 's/.*<a href="([^"]+)">.*/\1/g' | \
    awk '{print "https://anaconda.org" $0 }' | \
-   xargs wget -P $DIST_DIR
+   xargs wget -P "$DIST_DIR"

-printf "\nWheels downloaded to $DIST_DIR\nYou can upload them to PyPI using:\n\n"
-printf "\ttwine upload ${DIST_DIR}/pandas-${VERSION}*.{whl,tar.gz} --skip-existing"
+printf '\nWheels downloaded to %s\nYou can upload them to PyPI using:\n\n' "$DIST_DIR"
+printf "\ttwine upload %s/pandas-%s*.{whl,tar.gz} --skip-existing" "$DIST_DIR" "$VERSION"
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index 55acfaac4d843..944575dcc8659 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -45,6 +45,7 @@
    "Styler.template_html_style",
    "Styler.template_html_table",
    "Styler.template_latex",
+    "Styler.template_typst",
    "Styler.template_string",
    "Styler.loader",
    "errors.InvalidComparison",
diff --git a/web/pandas/about/team.md b/web/pandas/about/team.md
index b66e134fa5b2f..7a19fd7af6595 100644
--- a/web/pandas/about/team.md
+++ b/web/pandas/about/team.md
@@ -41,8 +41,6 @@ If you want to support pandas development, you can find information in the [dona

## Governance

-Wes McKinney is the Benevolent Dictator for Life (BDFL).
-
The project governance is available in the [project governance page]({{ base_url }}about/governance.html).

## Workgroups
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 6c69ff7602491..2ad8d6243db55 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -8,7 +8,7 @@ developers to build powerful and more focused data tools. The creation of
libraries that complement pandas' functionality also allows pandas
development to remain focused around its original requirements.

-This is an community-maintained list of projects that build on pandas in order
+This is a community-maintained list of projects that build on pandas in order
to provide tools in the PyData space. The pandas core development team does not
necessarily endorse any particular project on this list or have any knowledge of the maintenance
status of any particular library. For a more complete list of projects that
depend on pandas, see the [libraries.io usage page for
@@ -158,7 +158,7 @@ df = pd.read_csv("data.csv")
df # discover interesting insights!
```

-By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html>) that allow users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code.
+By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html) that allows users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code.

### [D-Tale](https://github.com/man-group/dtale)

@@ -205,7 +205,7 @@ standard output formats (HTML, HTML presentation slides, LaTeX, PDF,
ReStructuredText, Markdown, Python) through 'Download As' in the web
interface and `jupyter convert` in a shell.

-Pandas DataFrames implement `_repr_html_`and `_repr_latex` methods which
+pandas DataFrames implement `_repr_html_` and `_repr_latex_` methods which
are utilized by Jupyter Notebook for displaying (abbreviated) HTML or
LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may or
may not be compatible with non-HTML Jupyter output formats.)
@@ -342,7 +342,7 @@ It supports the following data types:

- pandas data types
- data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm)
-- data types defined in [Table Schema specification](http://dataprotocols.org/json-table-schema/#field-types-and-formats)
+- data types defined in [Table Schema specification](https://datapackage.org/standard/table-schema/)

The interface is always reversible (conversion round trip) with two formats
(JSON-NTV and JSON-TableSchema).
@@ -468,20 +468,57 @@ df.dtypes

ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/).

+### [Hugging Face](https://huggingface.co/datasets)
+
+The Hugging Face Dataset Hub provides a large collection of ready-to-use datasets for machine learning shared by the community. The platform offers a user-friendly interface to explore, discover and visualize datasets, and provides tools to easily load and work with these datasets in Python thanks to the [huggingface_hub](https://github.com/huggingface/huggingface_hub) library.
+
+You can access datasets on Hugging Face using `hf://` paths in pandas, in the form `hf://datasets/username/dataset_name/...`.
+
+For example, here is how to load the [stanfordnlp/imdb dataset](https://huggingface.co/datasets/stanfordnlp/imdb):
+
+```python
+import pandas as pd
+
+# Load the IMDB dataset
+df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet")
+```
+
+Tip: on a dataset page, click on "Use this dataset" to get the code to load it in pandas.
+
+To save a dataset on Hugging Face, you need to [create a public or private dataset](https://huggingface.co/new-dataset) and [log in](https://huggingface.co/docs/huggingface_hub/quick-start#login-command), and then you can use `df.to_csv/to_json/to_parquet`:
+
+```python
+# Save the dataset to my Hugging Face account
+df.to_parquet("hf://datasets/username/dataset_name/train.parquet")
+```
+
+You can find more information about the Hugging Face Dataset Hub in the [documentation](https://huggingface.co/docs/hub/en/datasets).
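+
+For private datasets, the same `hf://` paths work once you are authenticated. A minimal sketch, assuming the `huggingface_hub` login flow linked above; the `username/private_dataset` repository below is hypothetical, for illustration only:
+
+```python
+from huggingface_hub import login
+
+import pandas as pd
+
+# Prompts for an access token and caches it for later use
+login()
+
+# hf:// paths resolve through huggingface_hub's filesystem,
+# which should pick up the cached token
+df = pd.read_parquet("hf://datasets/username/private_dataset/train.parquet")
+```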

## Out-of-core

-### [Bodo](https://bodo.ai/)
+### [Bodo](https://github.com/bodo-ai/Bodo)
+
+Bodo is a high-performance compute engine for Python data processing.
+Using an auto-parallelizing just-in-time (JIT) compiler, Bodo simplifies scaling pandas
+workloads from laptops to clusters without major code changes.
+Under the hood, Bodo relies on MPI-based high-performance computing (HPC) technology, making it
+both easier to use and often much faster than alternatives.
+Bodo also provides a SQL engine that can query distributed pandas dataframes efficiently.

-Bodo is a high-performance Python computing engine that automatically parallelizes and
-optimizes your code through compilation using HPC (high-performance computing) techniques.
-Designed to operate with native pandas dataframes, Bodo compiles your pandas code to execute
-across multiple cores on a single machine or distributed clusters of multiple compute nodes efficiently.
-Bodo also makes distributed pandas dataframes queryable with SQL.
+```python
+import pandas as pd
+import bodo
+
+@bodo.jit
+def process_data():
+    df = pd.read_parquet("my_data.pq")
+    df2 = pd.DataFrame({"A": df.apply(lambda r: 0 if r.A == 0 else (r.B // r.A), axis=1)})
+    df2.to_parquet("out.pq")
+
+process_data()
+```

-The community edition of Bodo is free to use on up to 8 cores. Beyond that, Bodo offers a paid
-enterprise edition. Free licenses of Bodo (for more than 8 cores) are available
-[upon request](https://www.bodo.ai/contact) for academic and non-profit use.

### [Cylon](https://cylondata.org/)
@@ -651,7 +688,7 @@ units aware.

### [Text Extensions](https://ibm.biz/text-extensions-for-pandas)

-Text Extensions for Pandas provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into Pandas DataFrames.
+Text Extensions for Pandas provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into pandas DataFrames.

## Accessors
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index a49aadd45204a..41ba581852dd1 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -132,8 +132,7 @@ workgroups:
    contact: infrastructure@pandas.pydata.org
    responsibilities: "Keep the pandas infrastructure up and working. In particular the servers for the website, benchmarks, CI and others needed."
    members:
-      - Marc Garcia
-      - Matthew Roeschke
+      - William Ayd
      - Thomas Li
  communications:
    name: Communications
@@ -141,7 +140,6 @@ workgroups:
    responsibilities: "Share relevant information with the broader community, mainly via our social networks, as well as being the main point of contact between NumFOCUS and the core team."
    members:
      - Marco Gorelli
-      - Marc Garcia
sponsors:
  active:
    - name: "NumFOCUS"
diff --git a/web/pandas/index.html b/web/pandas/index.html
index 98628b856edb6..e8aab9e11144c 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -121,7 +121,7 @@

            Previous versions
            {% for release in releases[5:] %}
-                {{ release.name }} ({{ release.published.strftime("%Y-%m-%d") }})
+                {{ release.name }} ({{ release.published.strftime("%b %d, %Y") }})
                changelog | docs | code
diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/pdeps/0010-required-pyarrow-dependency.md
index d586c46e243f8..0c3bf3c776988 100644
--- a/web/pandas/pdeps/0010-required-pyarrow-dependency.md
+++ b/web/pandas/pdeps/0010-required-pyarrow-dependency.md
@@ -185,7 +185,6 @@ Additionally, if a user is installing pandas in an environment where wheels are
the user will need to also build Arrow C++ and related dependencies when installing from source. These environments include

- Alpine linux (commonly used as a base for Docker containers)
-- WASM (pyodide and pyscript)
- Python development versions

Lastly, pandas development and releases will need to be mindful of PyArrow's development and release cadance. For example when
diff --git a/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md b/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md
new file mode 100644
index 0000000000000..b8eba90f399c9
--- /dev/null
+++ b/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md
@@ -0,0 +1,74 @@
+# PDEP-17: Backwards compatibility and deprecation policy
+
+- Created: 27 June 2024
+- Status: Accepted
+- Discussion: [#59125](https://github.com/pandas-dev/pandas/issues/59125)
+- Author: [Abdulaziz Aloqeely](https://github.com/Aloqeely)
+- Revision: 1
+
+## Abstract
+
+This PDEP defines pandas' backwards compatibility and deprecation policy.
+
+The main additions to [pandas' current version policy](https://pandas.pydata.org/pandas-docs/version/2.2/development/policies.html) are:
+- Deprecated functionality should remain unchanged in at least 2 minor releases before being changed or removed.
+- Deprecations should initially use DeprecationWarning, and then be switched to FutureWarning in the last minor release before the major release in which they are planned to be removed.
+
+## Motivation
+
+Having a clear backwards compatibility and deprecation policy is crucial to a healthy ecosystem. We want to ensure users can rely on pandas being stable while still allowing the library to evolve.
+
+This policy will ensure that users have enough time to deal with deprecations while also minimizing disruption for users of downstream packages.
+
+## Scope
+
+This PDEP covers pandas' approach to backwards compatibility and the deprecation and removal process.
+
+## Background
+
+pandas uses a loose variant of semantic versioning.
+A pandas release number is written in the format of ``MAJOR.MINOR.PATCH``.
+
+## General policy
+
+This policy applies to the [public API][1]. Anything that is not part of the [public API][1] or is marked as "Experimental" may be changed or removed at any time.
+
+- Breaking backwards compatibility should benefit more than it harms users.
+- Breaking changes should go through a deprecation cycle before being implemented if possible.
+- Breaking changes should only occur in major releases.
+- No deprecations should be introduced in patch releases.
+- Deprecated functionality should remain unchanged in at least 2 minor releases before being changed or removed.
+
+Some bug fixes may require breaking backwards compatibility. In these cases, a deprecation cycle is not necessary. However, bug fixes that have a large impact on users might be treated as a breaking change. Whether a change is a bug fix or an API-breaking change is a judgement call.
+
+## Deprecation process
+
+Deprecation provides a way to warn developers and give them time to adapt their code to the new functionality before the old behavior is eventually removed.
+
+A deprecation's warning message should:
+- Provide information on what is changing.
+- Mention how to achieve similar behavior if an alternative is available.
+- For large-scale deprecations, include a reason for the deprecation, along with a discussion link for gathering user feedback.
+
+Additionally, when one introduces a deprecation, they should:
+- Use the appropriate warning class. More info on this can be found below.
+- Add the GitHub issue/PR number as a comment above the warning line.
+- Add an entry in the release notes.
+- Mention that the functionality is deprecated in the documentation using the ``.. deprecated::`` directive.
+
+### Which warning class to use
+
+Deprecations should initially use ``DeprecationWarning``, and then be switched to ``FutureWarning`` for broader visibility in the last minor release before the major release in which they are planned to be removed.
+This implementation detail can be ignored by using the appropriate ``PandasDeprecationWarning`` variable, which will be aliased to the proper warning class based on the pandas version.
+
+### Enforcement of deprecations
+
+When one enforces a deprecation, they should:
+- Add an entry in the release notes.
+- For API changes, replace the ``.. deprecated::`` directive in the documentation with a ``.. versionchanged::`` directive.
+
+### PDEP-17 History
+
+- 27 June 2024: Initial version.
+
+[1]: https://pandas.pydata.org/docs/reference/index.html
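+
+As an illustration of the process above, a deprecation introduced under this policy might look like the following sketch. The function names and issue number are hypothetical, and the import path for ``PandasDeprecationWarning`` is an assumption standing in for the version-dependent alias described in "Which warning class to use":
+
+```python
+import warnings
+
+try:
+    # Assumed location of the alias described above; adjust as needed.
+    from pandas.errors import PandasDeprecationWarning
+except ImportError:
+    PandasDeprecationWarning = DeprecationWarning
+
+
+def new_func(x):
+    return x * 2
+
+
+def old_func(x):
+    # GH#12345 (hypothetical issue/PR number, per the checklist above)
+    warnings.warn(
+        "old_func is deprecated and will be removed in pandas 4.0. "
+        "Use new_func instead.",
+        PandasDeprecationWarning,
+        stacklevel=2,  # report the warning at the caller's line
+    )
+    return new_func(x)
+```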