diff --git a/.github/workflows/benchmarks-last-release.yml b/.github/workflows/benchmarks-last-release.yml index 0621a59959f..e1ae9b1b62e 100644 --- a/.github/workflows/benchmarks-last-release.yml +++ b/.github/workflows/benchmarks-last-release.yml @@ -22,13 +22,13 @@ jobs: fetch-depth: 0 - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests - cache-env: true - cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" - extra-specs: | + cache-environment: true + cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" + create-args: >- asv - name: 'Get Previous tag' diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index b9a8d773c5a..ade00b942e7 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -22,13 +22,13 @@ jobs: fetch-depth: 0 - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests - cache-env: true - cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" - extra-specs: | + cache-environment: true + cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" + create-args: >- asv diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index f7097185224..99ebefd9338 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -53,15 +53,15 @@ jobs: echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Setup micromamba - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests - extra-specs: | + create-args: >- python=${{env.PYTHON_VERSION}} conda - cache-env: true - cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" + cache-environment: true + cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" - name: Install xarray run: | @@ -100,15 +100,15 @@ jobs: run: | echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Setup micromamba - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests - extra-specs: | + create-args: >- python=${{env.PYTHON_VERSION}} conda - cache-env: true - cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" + cache-environment: true + cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" - name: Install xarray run: | python -m pip install --no-deps -e . 
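The switch from `provision-with-micromamba` to `setup-micromamba` above recurs in every workflow touched by this changeset, always with the same input renames. A minimal sketch of the new step for reference (values are illustrative and mirror the hunks in this diff; this is not a complete list of the action's inputs):

```yaml
- name: Setup micromamba
  uses: mamba-org/setup-micromamba@v1
  with:
    environment-file: ${{ env.CONDA_ENV_FILE }}
    environment-name: xarray-tests
    create-args: >-   # previously `extra-specs`
      python=${{ env.PYTHON_VERSION }}
      conda
    cache-environment: true   # previously `cache-env`
    cache-environment-key: "${{ runner.os }}-${{ env.TODAY }}"   # previously `cache-env-key`
```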
@@ -126,7 +126,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v3.1.3 + uses: codecov/codecov-action@v3.1.4 with: file: mypy_report/cobertura.xml flags: mypy @@ -154,15 +154,15 @@ jobs: run: | echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Setup micromamba - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} environment-name: xarray-tests - extra-specs: | + create-args: >- python=${{env.PYTHON_VERSION}} conda - cache-env: true - cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" + cache-environment: true + cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" - name: Install xarray run: | python -m pip install --no-deps -e . @@ -180,7 +180,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v3.1.3 + uses: codecov/codecov-action@v3.1.4 with: file: mypy_report/cobertura.xml flags: mypy39 @@ -210,16 +210,14 @@ jobs: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Setup micromamba - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-name: xarray-tests - environment-file: false - extra-specs: | + create-args: >- python=3.10 pyyaml conda python-dateutil - channels: conda-forge - name: minimum versions policy run: | diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 457520a08e9..da4ad32b1f5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -92,13 +92,13 @@ jobs: echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV - name: Setup micromamba - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{ env.CONDA_ENV_FILE }} environment-name: xarray-tests - cache-env: true - cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{matrix.python-version}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" - extra-specs: | + cache-environment: true + cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{matrix.python-version}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" + create-args: >- python=${{matrix.python-version}} conda @@ -139,7 +139,7 @@ jobs: path: pytest.xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v3.1.3 + uses: codecov/codecov-action@v3.1.4 with: file: ./coverage.xml flags: unittests diff --git a/.github/workflows/nightly-wheels.yml b/.github/workflows/nightly-wheels.yml new file mode 100644 index 00000000000..562e442683e --- /dev/null +++ b/.github/workflows/nightly-wheels.yml @@ -0,0 +1,44 @@ +name: Upload nightly wheels +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" +jobs: + cron: + runs-on: ubuntu-latest + if: github.repository == 'pydata/xarray' + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - uses: actions/setup-python@v4 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install build twine + + - name: Build tarball and wheels + run: | + git clean -xdf + git restore -SW . 
+ python -m build + + - name: Check built artifacts + run: | + python -m twine check --strict dist/* + pwd + if [ -f dist/xarray-0.0.0.tar.gz ]; then + echo "❌ INVALID VERSION NUMBER" + exit 1 + else + echo "✅ Looks good" + fi + + - name: Upload wheel + uses: scientific-python/upload-nightly-action@main + with: + anaconda_nightly_upload_token: ${{ secrets.ANACONDA_NIGHTLY }} + artifacts_path: dist diff --git a/.github/workflows/parse_logs.py b/.github/workflows/parse_logs.py deleted file mode 100644 index c0674aeac0b..00000000000 --- a/.github/workflows/parse_logs.py +++ /dev/null @@ -1,102 +0,0 @@ -# type: ignore -import argparse -import functools -import json -import pathlib -import textwrap -from dataclasses import dataclass - -from pytest import CollectReport, TestReport - - -@dataclass -class SessionStart: - pytest_version: str - outcome: str = "status" - - @classmethod - def _from_json(cls, json): - json_ = json.copy() - json_.pop("$report_type") - return cls(**json_) - - -@dataclass -class SessionFinish: - exitstatus: str - outcome: str = "status" - - @classmethod - def _from_json(cls, json): - json_ = json.copy() - json_.pop("$report_type") - return cls(**json_) - - -def parse_record(record): - report_types = { - "TestReport": TestReport, - "CollectReport": CollectReport, - "SessionStart": SessionStart, - "SessionFinish": SessionFinish, - } - cls = report_types.get(record["$report_type"]) - if cls is None: - raise ValueError(f"unknown report type: {record['$report_type']}") - - return cls._from_json(record) - - -@functools.singledispatch -def format_summary(report): - return f"{report.nodeid}: {report}" - - -@format_summary.register -def _(report: TestReport): - message = report.longrepr.chain[0][1].message - return f"{report.nodeid}: {message}" - - -@format_summary.register -def _(report: CollectReport): - message = report.longrepr.split("\n")[-1].removeprefix("E").lstrip() - return f"{report.nodeid}: {message}" - - -def format_report(reports, py_version): - newline = "\n" - summaries = newline.join(format_summary(r) for r in reports) - message = textwrap.dedent( - """\ -
<details><summary>Python {py_version} Test Summary</summary> - - ``` - {summaries} - ``` - - </details> -
- """ - ).format(summaries=summaries, py_version=py_version) - return message - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("filepath", type=pathlib.Path) - args = parser.parse_args() - - py_version = args.filepath.stem.split("-")[1] - - print("Parsing logs ...") - - lines = args.filepath.read_text().splitlines() - reports = [parse_record(json.loads(line)) for line in lines] - - failed = [report for report in reports if report.outcome == "failed"] - - message = format_report(failed, py_version=py_version) - - output_file = pathlib.Path("pytest-logs.txt") - print(f"Writing output file to: {output_file.absolute()}") - output_file.write_text(message) diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml index 41957a941e2..5f4a2cd364c 100644 --- a/.github/workflows/pypi-release.yaml +++ b/.github/workflows/pypi-release.yaml @@ -70,12 +70,26 @@ jobs: python -m pip install dist/xarray*.whl python -m xarray.util.print_versions + upload-to-test-pypi: + needs: test-built-dist + if: github.event_name == 'push' + runs-on: ubuntu-latest + + environment: + name: pypi + url: https://test.pypi.org/p/xarray + permissions: + id-token: write + + steps: + - uses: actions/download-artifact@v3 + with: + name: releases + path: dist - name: Publish package to TestPyPI if: github.event_name == 'push' - uses: pypa/gh-action-pypi-publish@v1.8.5 + uses: pypa/gh-action-pypi-publish@v1.8.10 with: - user: __token__ - password: ${{ secrets.TESTPYPI_TOKEN }} repository_url: https://test.pypi.org/legacy/ verbose: true @@ -84,14 +98,19 @@ jobs: needs: test-built-dist if: github.event_name == 'release' runs-on: ubuntu-latest + + environment: + name: pypi + url: https://pypi.org/p/xarray + permissions: + id-token: write + steps: - uses: actions/download-artifact@v3 with: name: releases path: dist - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@v1.8.5 + uses: pypa/gh-action-pypi-publish@v1.8.10 with: - user: __token__ - password: ${{ secrets.PYPI_TOKEN }} verbose: true diff --git a/.github/workflows/testpypi-release.yaml b/.github/workflows/testpypi-release.yaml deleted file mode 100644 index ddc6a2bddf3..00000000000 --- a/.github/workflows/testpypi-release.yaml +++ /dev/null @@ -1,86 +0,0 @@ -name: Build and Upload xarray to PyPI -on: - push: - branches: - - 'main' - -# no need for concurrency limits - -jobs: - build-artifacts: - runs-on: ubuntu-latest - if: github.repository == 'pydata/xarray' - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - uses: actions/setup-python@v4 - name: Install Python - with: - python-version: "3.10" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install build twine - python -m pip install tomli tomli_w - - - name: Disable local versions - run: | - python .github/workflows/configure-testpypi-version.py pyproject.toml - git update-index --assume-unchanged pyproject.toml - cat pyproject.toml - - - name: Build tarball and wheels - run: | - git clean -xdf - python -m build - - - name: Check built artifacts - run: | - python -m twine check --strict dist/* - if [ -f dist/xarray-0.0.0.tar.gz ]; then - echo "❌ INVALID VERSION NUMBER" - exit 1 - else - echo "✅ Looks good" - fi - - - uses: actions/upload-artifact@v3 - with: - name: releases - path: dist - - test-built-dist: - needs: build-artifacts - runs-on: ubuntu-latest - steps: - - uses: actions/setup-python@v4 - name: Install Python - with: - python-version: "3.10" - - uses: 
actions/download-artifact@v3 - with: - name: releases - path: dist - - name: List contents of built dist - run: | - ls -ltrh - ls -ltrh dist - - - name: Verify the built dist/wheel is valid - if: github.event_name == 'push' - run: | - python -m pip install --upgrade pip - python -m pip install dist/xarray*.whl - python -m xarray.util.print_versions - - - name: Publish package to TestPyPI - if: github.event_name == 'push' - uses: pypa/gh-action-pypi-publish@v1.8.5 - with: - user: __token__ - password: ${{ secrets.TESTPYPI_TOKEN }} - repository_url: https://test.pypi.org/legacy/ - verbose: true diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index ff7b633fcff..7c60f20125e 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -56,11 +56,11 @@ jobs: with: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/requirements/environment.yml environment-name: xarray-tests - extra-specs: | + create-args: >- python=${{ matrix.python-version }} pytest-reportlog conda @@ -142,7 +142,7 @@ jobs: run: | python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v3.1.3 + uses: codecov/codecov-action@v3.1.4 with: file: mypy_report/cobertura.xml flags: mypy diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4ddd39ff490..e02b7d0bd08 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,6 @@ # https://pre-commit.com/ +ci: + autoupdate_schedule: monthly repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 @@ -14,15 +16,15 @@ repos: - id: absolufy-imports name: absolufy-imports files: ^xarray/ - - repo: https://github.com/charliermarsh/ruff-pre-commit + - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
- rev: 'v0.0.265' + rev: 'v0.0.282' hooks: - id: ruff args: ["--fix"] # https://github.com/python/black#version-control-integration - repo: https://github.com/psf/black - rev: 23.3.0 + rev: 23.7.0 hooks: - id: black-jupyter - repo: https://github.com/keewis/blackdoc @@ -30,10 +32,10 @@ repos: hooks: - id: blackdoc exclude: "generate_aggregations.py" - additional_dependencies: ["black==23.3.0"] + additional_dependencies: ["black==23.7.0"] - id: blackdoc-autoupdate-black - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.2.0 + rev: v1.4.1 hooks: - id: mypy # Copied from setup.cfg @@ -47,7 +49,7 @@ repos: types-pkg_resources, types-PyYAML, types-pytz, - typing-extensions==3.10.0.0, + typing-extensions>=4.1.0, numpy, ] - repo: https://github.com/citation-file-format/cff-converter-python diff --git a/.readthedocs.yaml b/.readthedocs.yaml index db2e1cd0b9a..55fea717f71 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,6 +7,7 @@ build: jobs: post_checkout: - (git --no-pager log --pretty="tformat:%s" -1 | grep -vqF "[skip-rtd]") || exit 183 + - git fetch --unshallow || true pre_install: - git update-index --assume-unchanged doc/conf.py ci/requirements/doc.yml diff --git a/CORE_TEAM_GUIDE.md b/CORE_TEAM_GUIDE.md new file mode 100644 index 00000000000..9eb91f4e586 --- /dev/null +++ b/CORE_TEAM_GUIDE.md @@ -0,0 +1,322 @@ +> **_Note:_** This Core Team Member Guide was adapted from the [napari project's Core Developer Guide](https://napari.org/stable/developers/core_dev_guide.html) and the [Pandas maintainers guide](https://pandas.pydata.org/docs/development/maintaining.html). + +# Core Team Member Guide + +Welcome, new core team member! We appreciate the quality of your work, and enjoy working with you! +Thank you for your numerous contributions to the project so far. + +By accepting the invitation to become a core team member you are **not required to commit to doing any more work** - +xarray is a volunteer project, and we value the contributions you have made already. + +You can see a list of all the current core team members on our +[@pydata/xarray](https://github.com/orgs/pydata/teams/xarray) +GitHub team. Once accepted, you should now be on that list too. +This document offers guidelines for your new role. + +## Tasks + +Xarray values a wide range of contributions, only some of which involve writing code. +As such, we do not currently make a distinction between a "core team member", "core developer", "maintainer", +or "triage team member" as some projects do (e.g. [pandas](https://pandas.pydata.org/docs/development/maintaining.html)). +That said, if you prefer to refer to your role as one of the other titles above then that is fine by us! + +Xarray is mostly a volunteer project, so these tasks shouldn’t be read as “expectations”. +**There are no strict expectations**, other than to adhere to our [Code of Conduct](https://github.com/pydata/xarray/tree/main/CODE_OF_CONDUCT.md). 
+Rather, the tasks that follow are general descriptions of what it might mean to be a core team member: + +- Facilitate a welcoming environment for those who file issues, make pull requests, and open discussion topics, +- Triage newly filed issues, +- Review newly opened pull requests, +- Respond to updates on existing issues and pull requests, +- Drive discussion and decisions on stalled issues and pull requests, +- Provide experience / wisdom on API design questions to ensure consistency and maintainability, +- Project organization (run developer meetings, coordinate with sponsors), +- Project evangelism (advertise xarray to new users), +- Community contact (represent xarray in user communities such as [Pangeo](https://pangeo.io/)), +- Key project contact (represent xarray's perspective within key related projects like NumPy, Zarr or Dask), +- Project fundraising (help write and administrate grants that will support xarray), +- Improve documentation or tutorials (especially on [`tutorial.xarray.dev`](https://tutorial.xarray.dev/)), +- Presenting or running tutorials (such as those we have given at the SciPy conference), +- Help maintain the [`xarray.dev`](https://xarray.dev/) landing page and website, the [code for which is here](https://github.com/xarray-contrib/xarray.dev), +- Write blog posts on the [xarray blog](https://xarray.dev/blog), +- Help maintain xarray's various Continuous Integration Workflows, +- Help maintain a regular release schedule (we aim for one or more releases per month), +- Attend the bi-weekly community meeting ([issue](https://github.com/pydata/xarray/issues/4001)), +- Contribute to the xarray codebase. + +(Matt Rocklin's post on [the role of a maintainer](https://matthewrocklin.com/blog/2019/05/18/maintainer) may be +interesting background reading, but should not be taken to strictly apply to the Xarray project.) + +Obviously you are not expected to contribute in all (or even more than one) of these ways! +They are listed so as to indicate the many types of work that go into maintaining xarray. + +It is natural that your available time and enthusiasm for the project will wax and wane - this is fine and expected! +It is also common for core team members to have a "niche" - a particular part of the codebase they have specific expertise +with, or certain types of the tasks above that they primarily perform. + +If, however, you feel that it is unlikely you will be able to actively contribute in the foreseeable future +(or especially if you won't be available to answer questions about pieces of code that you wrote previously) +then you may want to consider letting us know you would rather be listed as an "Emeritus Core Team Member", +as this would help us in evaluating the overall health of the project. + +## Issue triage + +One of the main ways you might spend your contribution time is by responding to or triaging new issues. +Here’s a typical workflow for triaging a newly opened issue or discussion: + +1. **Thank the reporter for opening an issue.** + + The issue tracker is many people’s first interaction with the xarray project itself, beyond just using the library. + It may also be their first open-source contribution of any kind. As such, we want it to be a welcoming, pleasant experience. + +2. **Is the necessary information provided?** + + Ideally reporters would fill out the issue template, but many don’t. If crucial information (like the version of xarray they used) is missing, feel free to ask for that and label the issue with “needs info”.
The report should follow the [guidelines for xarray discussions](https://github.com/pydata/xarray/discussions/5404). You may want to link to that if they didn’t follow the template. + + Make sure that the title accurately reflects the issue. Edit it yourself if it’s not clear. + Remember also that issues can be converted to discussions and vice versa if appropriate. + +3. **Is this a duplicate issue?** + + We have many open issues. If a new issue is clearly a duplicate, label the new issue as “duplicate”, and close the issue with a link to the original issue. + Make sure to still thank the reporter, and encourage them to chime in on the original issue, and perhaps try to fix it. + + If the new issue provides relevant information, such as a better or slightly different example, add it to the original issue as a comment or an edit to the original post. + +4. **Is the issue minimal and reproducible?** + + For bug reports, we ask that the reporter provide a minimal reproducible example. + See [minimal-bug-reports](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) for a good explanation. + If the example is not reproducible, or if it’s clearly not minimal, feel free to ask the reporter if they can provide an example or simplify the provided one. + Do acknowledge that writing minimal reproducible examples is hard work. If the reporter is struggling, you can try to write one yourself and we’ll edit the original post to include it. + + If a nice reproducible example has been provided, thank the reporter for that. + If a reproducible example can’t be provided, add the “needs mcve” label. + + If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. + +5. **Is this a clearly defined feature request?** + + Generally, xarray prefers to discuss and design new features in issues, before a pull request is made. + Encourage the submitter to include a proposed API for the new feature. Having them write a full docstring is a good way to pin down specifics. + + We may need a discussion from several xarray maintainers before deciding whether the proposal is in scope for xarray. + +6. **Is this a usage question?** + + We prefer that usage questions are asked on StackOverflow with the [`python-xarray` tag](https://stackoverflow.com/questions/tagged/python-xarray) or as a [GitHub discussion topic](https://github.com/pydata/xarray/discussions). + + If it’s easy to answer, feel free to link to the relevant documentation section, let them know that in the future this kind of question should be on StackOverflow, and close the issue. + +7. **What labels and milestones should I add?** + + Apply the relevant labels. This is a bit of an art, and comes with experience. Look at similar issues to get a feel for how things are labeled. + Labels used for labelling issues that relate to particular features or parts of the codebase normally have the form `topic-`. + + If the issue is clearly defined and the fix seems relatively straightforward, label the issue as `contrib-good-first-issue`. + You can also remove the `needs triage` label that is automatically applied to all newly-opened issues. + +8. **Where should the poster look to fix the issue?** + + If you can, it is very helpful to point to the approximate location in the codebase where a contributor might begin to fix the issue. + This helps ease the way in for new contributors to the repository.
+ +## Code review and contributions + +As a core team member, you are a representative of the project, +and trusted to make decisions that will serve the long term interests +of all users. You also gain the responsibility of shepherding +other contributors through the review process; here are some +guidelines for how to do that. + +### All contributors are treated the same + +You should now have gained the ability to merge or approve +other contributors' pull requests. Merging contributions is a shared power: +only merge contributions you yourself have carefully reviewed, and that are +clear improvements for the project. When in doubt, and especially for more +complex changes, wait until at least one other core team member has approved. +(See [Reviewing](#reviewing) and especially +[Merge Only Changes You Understand](#merge-only-changes-you-understand) below.) + +It should also be considered best practice to leave a reasonable (24hr) time window +after approval before merge to ensure that other core team members have a reasonable +chance to weigh in. +Adding the `plan-to-merge` label notifies developers of the imminent merge. + +We are also an international community, with contributors from many different time zones, +some of whom will only contribute during their working hours, others who might only be able +to contribute during nights and weekends. It is important to be respectful of other peoples +schedules and working habits, even if it slows the project down slightly - we are in this +for the long run. In the same vein you also shouldn't feel pressured to be constantly +available or online, and users or contributors who are overly demanding and unreasonable +to the point of harassment will be directed to our [Code of Conduct](https://github.com/pydata/xarray/tree/main/CODE_OF_CONDUCT.md). +We value sustainable development practices over mad rushes. + +When merging, we automatically use GitHub's +[Squash and Merge](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/merging-a-pull-request#merging-a-pull-request) +to ensure a clean git history. + +You should also continue to make your own pull requests as before and in accordance +with the [general contributing guide](https://docs.xarray.dev/en/stable/contributing.html). These pull requests still +require the approval of another core team member before they can be merged. + +### How to conduct a good review + +*Always* be kind to contributors. Contributors are often doing +volunteer work, for which we are tremendously grateful. Provide +constructive criticism on ideas and implementations, and remind +yourself of how it felt when your own work was being evaluated as a +novice. + +``xarray`` strongly values mentorship in code review. New users +often need more handholding, having little to no git +experience. Repeat yourself liberally, and, if you don’t recognize a +contributor, point them to our development guide, or other GitHub +workflow tutorials around the web. Do not assume that they know how +GitHub works (many don't realize that adding a commit +automatically updates a pull request, for example). Gentle, polite, kind +encouragement can make the difference between a new core team member and +an abandoned pull request. + +When reviewing, focus on the following: + +1. 
**Usability and generality:** `xarray` is a user-facing package that strives to be accessible +to both novice and advanced users, and new features should ultimately be +accessible to everyone using the package. `xarray` targets the scientific user +community broadly, and core features should be domain-agnostic and general purpose. +Custom functionality is meant to be provided through our various types of interoperability. + +2. **Performance and benchmarks:** As `xarray` targets scientific applications that often involve +large multidimensional datasets, high performance is a key value of `xarray`. While +every new feature won't scale equally to all sizes of data, keeping in mind performance +and our [benchmarks](https://github.com/pydata/xarray/tree/main/asv_bench) during a review may be important, and you may +need to ask for benchmarks to be run and reported or new benchmarks to be added. +You can run the CI benchmarking suite on any PR by tagging it with the ``run-benchmark`` label. + +3. **APIs and stability:** Coding users and developers will make +extensive use of our APIs. The foundation of a healthy ecosystem will be +a fully capable and stable set of APIs, so as `xarray` matures it will be +very important to ensure our APIs are stable. Spending the extra time to consider names of public facing +variables and methods, alongside function signatures, could save us considerable +trouble in the future. We do our best to provide [deprecation cycles](https://docs.xarray.dev/en/stable/contributing.html#backwards-compatibility) +when making backwards-incompatible changes. + +4. **Documentation and tutorials:** All new methods should have appropriate doc +strings following [PEP257](https://peps.python.org/pep-0257/) and the +[NumPy documentation guide](https://numpy.org/devdocs/dev/howto-docs.html#documentation-style). +For any major new features, accompanying changes should be made to our +[tutorials](https://tutorial.xarray.dev). These should not only +illustrate the new feature, but explain it. + +5. **Implementations and algorithms:** You should understand the code being modified +or added before approving it. (See [Merge Only Changes You Understand](#merge-only-changes-you-understand) +below.) Implementations should do what they claim and be simple, readable, and efficient +in that order. + +6. **Tests:** All contributions *must* be tested, and each added line of code +should be covered by at least one test. Good tests not only execute the code, +but explore corner cases. It can be tempting not to review tests, but please +do so. + +Other changes may be *nitpicky*: spelling mistakes, formatting, +etc. Do not insist contributors make these changes, but instead you should offer +to make these changes by [pushing to their branch](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/committing-changes-to-a-pull-request-branch-created-from-a-fork), +or using GitHub’s [suggestion](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/reviewing-changes-in-pull-requests/commenting-on-a-pull-request) +[feature](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/reviewing-changes-in-pull-requests/incorporating-feedback-in-your-pull-request), and +be prepared to make them yourself if needed. Using the suggestion feature is preferred because +it gives the contributor a choice in whether to accept the changes.
+ +Unless you know that a contributor is experienced with git, don’t +ask for a rebase when merge conflicts arise. Instead, rebase the +branch yourself, force-push to their branch, and advise the contributor to force-pull. If the contributor is +no longer active, you may take over their branch by submitting a new pull +request and closing the original, including a reference to the original pull +request. In doing so, ensure you communicate that you are not throwing the +contributor's work away! If appropriate it is a good idea to acknowledge other contributions +to the pull request using the `Co-authored-by` +[syntax](https://docs.github.com/en/pull-requests/committing-changes-to-your-project/creating-and-editing-commits/creating-a-commit-with-multiple-authors) in the commit message. + +### Merge only changes you understand + +*Long-term maintainability* is an important concern. Code doesn't +merely have to *work*, but should be *understood* by multiple core +developers. Changes will have to be made in the future, and the +original contributor may have moved on. + +Therefore, *do not merge a code change unless you understand it*. Ask +for help freely: we can consult community members, or even external developers, +for added insight where needed, and see this as a great learning opportunity. + +While we collectively "own" any patches (and bugs!) that become part +of the code base, you are vouching for changes you merge. Please take +that responsibility seriously. + +Feel free to ping other active maintainers with any questions you may have. + +## Further resources + +As a core member, you should be familiar with community and developer +resources such as: + +- Our [contributor guide](https://docs.xarray.dev/en/stable/contributing.html). +- Our [code of conduct](https://github.com/pydata/xarray/tree/main/CODE_OF_CONDUCT.md). +- Our [philosophy and development roadmap](https://docs.xarray.dev/en/stable/roadmap.html). +- [PEP8](https://peps.python.org/pep-0008/) for Python style. +- [PEP257](https://peps.python.org/pep-0257/) and the + [NumPy documentation guide](https://numpy.org/devdocs/dev/howto-docs.html#documentation-style) + for docstring conventions. +- [`pre-commit`](https://pre-commit.com) hooks for autoformatting. +- [`black`](https://github.com/psf/black) autoformatting. +- [`flake8`](https://github.com/PyCQA/flake8) linting. +- [python-xarray](https://stackoverflow.com/questions/tagged/python-xarray) on Stack Overflow. +- [@xarray_dev](https://twitter.com/xarray_dev) on Twitter. +- [xarray-dev](https://discord.gg/bsSGdwBn) discord community (normally only used for remote synchronous chat during sprints). + +You are not required to monitor any of the social resources. + +Where possible we prefer to point people towards asynchronous forms of communication +like github issues instead of realtime chat options as they are far easier +for a global community to consume and refer back to. + +We hold a [bi-weekly developers meeting](https://docs.xarray.dev/en/stable/developers-meeting.html) via video call. +This is a great place to bring up any questions you have, raise visibility of an issue and/or gather more perspectives. +Attendance is absolutely optional, and we keep the meeting to 30 minutes in respect of your valuable time. +This meeting is public, so we occasionally have non-core team members join us. 
+ +We also have a private mailing list for core team members +`xarray-core-team@googlegroups.com` which is sparingly used for discussions +that are required to be private, such as nominating new core members and discussing financial issues. + +## Inviting new core members + +Any core member may nominate other contributors to join the core team. +While there is no hard-and-fast rule about who can be nominated, ideally, +they should have: been part of the project for at least two months, contributed +significant changes of their own, contributed to the discussion and +review of others' work, and collaborated in a way befitting our +community values. **We strongly encourage nominating anyone who has made significant non-code contributions +to the Xarray community in any way**. After nomination, voting will happen on a private mailing list. +While it is expected that most votes will be unanimous, a two-thirds majority of +the cast votes is enough. + +Core team members can choose to become emeritus core team members and suspend +their approval and voting rights until they become active again. + +## Contribute to this guide (!) + +This guide reflects the experience of the current core team members. We +may well have missed things that, by now, have become second +nature—things that you, as a new team member, will spot more easily. +Please ask the other core team members if you have any questions, and +submit a pull request with insights gained. + +## Conclusion + +We are excited to have you on board! We look forward to your +contributions to the code base and the community. Thank you in +advance! diff --git a/HOW_TO_RELEASE.md b/HOW_TO_RELEASE.md index 3bbd551415b..34a63aad202 100644 --- a/HOW_TO_RELEASE.md +++ b/HOW_TO_RELEASE.md @@ -18,11 +18,16 @@ upstream https://github.com/pydata/xarray (push) git switch main git pull upstream main ``` - 2. Add a list of contributors with: + 2. Add a list of contributors. + First fetch all previous release tags so we can see what the version number of the last release was: + ```sh + git fetch upstream --tags + ``` + This will return a list of all the contributors since the last release: ```sh git log "$(git tag --sort=v:refname | tail -1).." --format=%aN | sort -u | perl -pe 's/\n/$1, /' ``` - This will return the number of contributors: + This will return the total number of contributors: ```sh git log "$(git tag --sort=v:refname | tail -1).." --format=%aN | sort -u | wc -l ``` @@ -54,7 +59,7 @@ upstream https://github.com/pydata/xarray (push) 10. This should automatically trigger an upload of the new build to PyPI via GitHub Actions. Check this has run [here](https://github.com/pydata/xarray/actions/workflows/pypi-release.yaml), and that the version number you expect is displayed [on PyPI](https://pypi.org/project/xarray/) -11. Add a section for the next release {YYYY.MM.X+1} to doc/whats-new.rst: +11. Add a section for the next release {YYYY.MM.X+1} to doc/whats-new.rst (we avoid doing this earlier so that it doesn't show up in the RTD build): ```rst .. _whats-new.YYYY.MM.X+1: diff --git a/README.md b/README.md index 41db66fd395..8035c9b901f 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ Thanks to our many contributors! ## License -Copyright 2014-2019, xarray Developers +Copyright 2014-2023, xarray Developers Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may @@ -125,12 +125,12 @@ limitations under the License.
Xarray bundles portions of pandas, NumPy and Seaborn, all of which are available under a "3-clause BSD" license: -- pandas: setup.py, xarray/util/print_versions.py -- NumPy: xarray/core/npcompat.py -- Seaborn: _determine_cmap_params in xarray/core/plot/utils.py +- pandas: `setup.py`, `xarray/util/print_versions.py` +- NumPy: `xarray/core/npcompat.py` +- Seaborn: `_determine_cmap_params` in `xarray/core/plot/utils.py` Xarray also bundles portions of CPython, which is available under the -"Python Software Foundation License" in xarray/core/pycompat.py. +"Python Software Foundation License" in `xarray/core/pycompat.py`. Xarray uses icons from the icomoon package (free version), which is available under the "CC BY 4.0" license. diff --git a/asv_bench/benchmarks/accessors.py b/asv_bench/benchmarks/accessors.py new file mode 100644 index 00000000000..f9eb95851cc --- /dev/null +++ b/asv_bench/benchmarks/accessors.py @@ -0,0 +1,25 @@ +import numpy as np + +import xarray as xr + +from . import parameterized + +NTIME = 365 * 30 + + +@parameterized(["calendar"], [("standard", "noleap")]) +class DateTimeAccessor: + def setup(self, calendar): + np.random.randn(NTIME) + time = xr.date_range("2000", periods=30 * 365, calendar=calendar) + data = np.ones((NTIME,)) + self.da = xr.DataArray(data, dims="time", coords={"time": time}) + + def time_dayofyear(self, calendar): + self.da.time.dt.dayofyear + + def time_year(self, calendar): + self.da.time.dt.year + + def time_floor(self, calendar): + self.da.time.dt.floor("D") diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index a4f8db2786b..772d888306c 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -2,8 +2,49 @@ import xarray as xr +from . import requires_dask -class Combine: + +class Combine1d: + """Benchmark concatenating and merging large datasets""" + + def setup(self) -> None: + """Create 2 datasets with two different variables""" + + t_size = 8000 + t = np.arange(t_size) + data = np.random.randn(t_size) + + self.dsA0 = xr.Dataset({"A": xr.DataArray(data, coords={"T": t}, dims=("T"))}) + self.dsA1 = xr.Dataset( + {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T"))} + ) + + def time_combine_by_coords(self) -> None: + """Also has to load and arrange t coordinate""" + datasets = [self.dsA0, self.dsA1] + + xr.combine_by_coords(datasets) + + +class Combine1dDask(Combine1d): + """Benchmark concatenating and merging large datasets""" + + def setup(self) -> None: + """Create 2 datasets with two different variables""" + requires_dask() + + t_size = 8000 + t = np.arange(t_size) + var = xr.Variable(dims=("T",), data=np.random.randn(t_size)).chunk() + + data_vars = {f"long_name_{v}": ("T", var) for v in range(500)} + + self.dsA0 = xr.Dataset(data_vars, coords={"T": t}) + self.dsA1 = xr.Dataset(data_vars, coords={"T": t + t_size}) + + +class Combine3d: """Benchmark concatenating and merging large datasets""" def setup(self): diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py index 2a296ecc4d0..9bda5970a4c 100644 --- a/asv_bench/benchmarks/pandas.py +++ b/asv_bench/benchmarks/pandas.py @@ -29,19 +29,20 @@ def time_from_series(self, dtype, subset): class ToDataFrame: def setup(self, *args, **kwargs): xp = kwargs.get("xp", np) + nvars = kwargs.get("nvars", 1) random_kws = kwargs.get("random_kws", {}) method = kwargs.get("method", "to_dataframe") dim1 = 10_000 dim2 = 10_000 + + var = xr.Variable( + dims=("dim1", "dim2"), data=xp.random.random((dim1, dim2), **random_kws) 
+ ) + data_vars = {f"long_name_{v}": (("dim1", "dim2"), var) for v in range(nvars)} + ds = xr.Dataset( - { - "x": xr.DataArray( - data=xp.random.random((dim1, dim2), **random_kws), - dims=["dim1", "dim2"], - coords={"dim1": np.arange(0, dim1), "dim2": np.arange(0, dim2)}, - ) - } + data_vars, coords={"dim1": np.arange(0, dim1), "dim2": np.arange(0, dim2)} ) self.to_frame = getattr(ds, method) @@ -58,4 +59,6 @@ def setup(self, *args, **kwargs): import dask.array as da - super().setup(xp=da, random_kws=dict(chunks=5000), method="to_dask_dataframe") + super().setup( + xp=da, random_kws=dict(chunks=5000), method="to_dask_dataframe", nvars=500 + ) diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index 39e04d04d47..41507fce13e 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -23,7 +23,7 @@ conda uninstall -y --force \ xarray # to limit the runtime of Upstream CI python -m pip install \ - -i https://pypi.anaconda.org/scipy-wheels-nightly/simple \ + -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ --no-deps \ --pre \ --upgrade \ diff --git a/ci/min_deps_check.py b/ci/min_deps_check.py index 0f7002ea513..bbaf440a9a0 100755 --- a/ci/min_deps_check.py +++ b/ci/min_deps_check.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """Fetch from conda database all available versions of the xarray dependencies and their -publication date. Compare it against requirements/py37-min-all-deps.yml to verify the +publication date. Compare it against requirements/min-all-deps.yml to verify the policy on obsolete dependencies is being followed. Print a pretty report :) """ import itertools @@ -46,7 +46,7 @@ def warning(msg: str) -> None: def parse_requirements(fname) -> Iterator[tuple[str, int, int, int | None]]: - """Load requirements/py37-min-all-deps.yml + """Load requirements/min-all-deps.yml Yield (package name, major version, minor version, [patch version]) """ diff --git a/ci/requirements/all-but-dask.yml b/ci/requirements/all-but-dask.yml index 74c0b72bd0d..4645be08b83 100644 --- a/ci/requirements/all-but-dask.yml +++ b/ci/requirements/all-but-dask.yml @@ -23,7 +23,7 @@ dependencies: - netcdf4 - numba - numbagg - - numpy<1.24 + - numpy - packaging - pandas - pint<0.21 diff --git a/ci/requirements/bare-minimum.yml b/ci/requirements/bare-minimum.yml index 0a36493fa07..e8a80fdba99 100644 --- a/ci/requirements/bare-minimum.yml +++ b/ci/requirements/bare-minimum.yml @@ -11,6 +11,6 @@ dependencies: - pytest-env - pytest-xdist - pytest-timeout - - numpy=1.21 + - numpy=1.22 - packaging=21.3 - pandas=1.4 diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index 115d7dfa533..fe1fe91bb51 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -7,9 +7,11 @@ dependencies: - python=3.10 - bottleneck - cartopy + - cfgrib - dask-core>=2022.1 - h5netcdf>=0.13 - ipykernel + - ipywidgets # silence nbsphinx warning - ipython - iris>=2.3 - jupyter_client @@ -17,14 +19,13 @@ dependencies: - nbsphinx - netcdf4>=1.5 - numba - - numpy>=1.21,<1.24 + - numpy>=1.21 - packaging>=21.3 - pandas>=1.4 - pooch - pip - pre-commit - pyproj - - rasterio>=1.1 - scipy!=1.10.0 - seaborn - setuptools diff --git a/ci/requirements/environment-py311.yml b/ci/requirements/environment-py311.yml index d3cf811bb4d..0b9817daef3 100644 --- a/ci/requirements/environment-py311.yml +++ b/ci/requirements/environment-py311.yml @@ -22,8 +22,8 @@ dependencies: - matplotlib-base - nc-time-axis - netcdf4 - # - numba - # - numbagg + - numba + - numbagg - numexpr - numpy - 
packaging @@ -42,7 +42,7 @@ dependencies: - rasterio - scipy - seaborn - # - sparse + - sparse - toolz - typing_extensions - zarr diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml index 6abee0b18c3..efa9ccb5a9a 100644 --- a/ci/requirements/environment-windows.yml +++ b/ci/requirements/environment-windows.yml @@ -22,7 +22,7 @@ dependencies: - netcdf4 - numba - numbagg - - numpy<1.24 + - numpy - packaging - pandas - pint<0.21 diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 7a478c35a58..dd73ef19658 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -25,7 +25,7 @@ dependencies: - numba - numbagg - numexpr - - numpy<1.24 + - numpy - packaging - pandas - pint<0.21 diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml index e50d08264b8..8400270ce1b 100644 --- a/ci/requirements/min-all-deps.yml +++ b/ci/requirements/min-all-deps.yml @@ -8,48 +8,48 @@ dependencies: # When upgrading python, numpy, or pandas, must also change # doc/user-guide/installing.rst, doc/user-guide/plotting.rst and setup.py. - python=3.9 - - boto3=1.20 + - boto3=1.24 - bottleneck=1.3 - cartopy=0.20 - cdms2=3.1 - - cftime=1.5 + - cftime=1.6 - coveralls - - dask-core=2022.1 - - distributed=2022.1 + - dask-core=2022.7 + - distributed=2022.7 - flox=0.5 - - h5netcdf=0.13 + - h5netcdf=1.0 # h5py and hdf5 tend to cause conflicts # for e.g. hdf5 1.12 conflicts with h5py=3.1 # prioritize bumping other packages instead - h5py=3.6 - hdf5=1.12 - hypothesis - - iris=3.1 - - lxml=4.7 # Optional dep of pydap + - iris=3.2 + - lxml=4.9 # Optional dep of pydap - matplotlib-base=3.5 - nc-time-axis=1.4 # netcdf follows a 1.major.minor[.patch] convention # (see https://github.com/Unidata/netcdf4-python/issues/1090) - - netcdf4=1.5.7 + - netcdf4=1.6.0 - numba=0.55 - - numpy=1.21 + - numpy=1.22 - packaging=21.3 - pandas=1.4 - - pint=0.18 + - pint=0.19 - pip - pseudonetcdf=3.2 - - pydap=3.2 + - pydap=3.3 - pytest - pytest-cov - pytest-env - pytest-xdist - pytest-timeout - - rasterio=1.2 - - scipy=1.7 + - rasterio=1.3 + - scipy=1.8 - seaborn=0.11 - sparse=0.13 - - toolz=0.11 - - typing_extensions=4.0 - - zarr=2.10 + - toolz=0.12 + - typing_extensions=4.3 + - zarr=2.12 - pip: - - numbagg==0.1 + - numbagg==0.2.1 diff --git a/design_notes/named_array_design_doc.md b/design_notes/named_array_design_doc.md new file mode 100644 index 00000000000..fc8129af6b0 --- /dev/null +++ b/design_notes/named_array_design_doc.md @@ -0,0 +1,338 @@ +# named-array Design Document + +## Abstract + +Despite the wealth of scientific libraries in the Python ecosystem, there is a gap for a lightweight, efficient array structure with named dimensions that can provide convenient broadcasting and indexing. + +Existing solutions like Xarray's Variable, [Pytorch Named Tensor](https://github.com/pytorch/pytorch/issues/60832), [Levanter](https://crfm.stanford.edu/2023/06/16/levanter-1_0-release.html), and [Larray](https://larray.readthedocs.io/en/stable/tutorial/getting_started.html) have their own strengths and weaknesses. Xarray's Variable is an efficient data structure, but it depends on the relatively heavy-weight library Pandas, which limits its use in other projects. Pytorch Named Tensor offers named dimensions, but it lacks support for many operations, making it less user-friendly. 
Levanter is a powerful tool with a named tensor module (Haliax) that makes deep learning code easier to read, understand, and write, but it is not as lightweight or generic as desired. Larray offers labeled N-dimensional arrays, but it may not provide the level of seamless interoperability with other scientific Python libraries that some users need. + +named-array aims to solve these issues by exposing the core functionality of Xarray's Variable class as a standalone package. + +## Motivation and Scope + +The Python ecosystem boasts a wealth of scientific libraries that enable efficient computations on large, multi-dimensional arrays. Libraries like PyTorch, Xarray, and NumPy have revolutionized scientific computing by offering robust data structures for array manipulations. Despite this wealth of tools, a gap exists in the Python landscape for a lightweight, efficient array structure with named dimensions that can provide convenient broadcasting and indexing. + +Xarray internally maintains a data structure that meets this need, referred to as [`xarray.Variable`](https://docs.xarray.dev/en/latest/generated/xarray.Variable.html). However, Xarray's dependency on Pandas, a relatively heavy-weight library, restricts other projects from leveraging this efficient data structure. + +We propose the creation of a standalone Python package, "named-array". This package is envisioned to be a version of the `xarray.Variable` data structure, cleanly separated from the heavier dependencies of Xarray. named-array will provide a lightweight, user-friendly array-like data structure with named dimensions, facilitating convenient indexing and broadcasting. The package will use existing scientific Python community standards such as established array protocols and the new [Python array API standard](https://data-apis.org/array-api/latest), allowing users to wrap multiple duck-array objects, including, but not limited to, NumPy, Dask, Sparse, Pint, CuPy, and Pytorch. + +The development of named-array is projected to meet a key community need and is expected to broaden Xarray's user base. By making the core `xarray.Variable` more accessible, we anticipate an increase in contributors and a reduction in the developer burden on current Xarray maintainers. + +### Goals + +1. **Simple and minimal**: named-array will expose Xarray's [Variable class](https://docs.xarray.dev/en/stable/internals/variable-objects.html) as a standalone object (`NamedArray`) with named axes (dimensions) and arbitrary metadata (attributes) but without coordinate labels. This will make it a lightweight, efficient array data structure that allows convenient broadcasting and indexing. + +2. **Interoperability**: named-array will follow established scientific Python community standards and in doing so will be able to wrap multiple duck-array objects, including but not limited to, NumPy, Dask, Sparse, Pint, CuPy, and Pytorch. + +3. **Community Engagement**: By making the core `xarray.Variable` more accessible, we open the door to increased adoption of this fundamental data structure. As such, we hope to see an increase in contributors and a reduction in the developer burden on current Xarray maintainers. + +### Non-Goals + +1. **Extensive Data Analysis**: named-array will not provide extensive data analysis features like statistical functions, data cleaning, or visualization. Its primary focus is on providing a data structure that allows users to use dimension names for descriptive array manipulations. + +2.
**Support for I/O**: named-array will not bundle file reading functions. Instead users will be expected to handle I/O and then wrap those arrays with the new named-array data structure. + +## Backward Compatibility + +The creation of named-array is intended to separate the `xarray.Variable` from Xarray into a standalone package. This allows it to be used independently, without the need for Xarray's dependencies, like Pandas. This separation has implications for backward compatibility. + +Since the new named-array is envisioned to contain the core features of Xarray's variable, existing code using Variable from Xarray should be able to switch to named-array with minimal changes. However, there are several potential issues related to backward compatibility: + +* **API Changes**: as the Variable is decoupled from Xarray and moved into named-array, some changes to the API may be necessary. These changes might include differences in function signature, etc. These changes could break existing code that relies on the current API and associated utility functions (e.g. `as_variable()`). The `xarray.Variable` object will subclass `NamedArray`, and provide the existing interface for compatibility. + +## Detailed Description + +named-array aims to provide a lightweight, efficient array structure with named dimensions, or axes, that enables convenient broadcasting and indexing. The primary component of named-array is a standalone version of the xarray.Variable data structure, which was previously a part of the Xarray library. +The xarray.Variable data structure in named-array will maintain the core features of its counterpart in Xarray, including: + +* **Named Axes (Dimensions)**: Each axis of the array can be given a name, providing a descriptive and intuitive way to reference the dimensions of the array. + +* **Arbitrary Metadata (Attributes)**: named-array will support the attachment of arbitrary metadata to arrays as a dict, providing a mechanism to store additional information about the data that the array represents. + +* **Convenient Broadcasting and Indexing**: With named dimensions, broadcasting and indexing operations become more intuitive and less error-prone. + +The named-array package is designed to be interoperable with other scientific Python libraries. It will follow established scientific Python community standards and use standard array protocols, as well as the new data-apis standard. This allows named-array to wrap multiple duck-array objects, including, but not limited to, NumPy, Dask, Sparse, Pint, CuPy, and Pytorch. + +## Implementation + +* **Decoupling**: making `variable.py` agnostic to Xarray internals by decoupling it from the rest of the library. This will make the code more modular and easier to maintain. However, this will also make the code more complex, as we will need to define a clear interface for how the functionality in `variable.py` interacts with the rest of the library, particularly the ExplicitlyIndexed subclasses used to enable lazy indexing of data on disk. +* **Move Xarray's internal lazy indexing classes to follow standard Array Protocols**: moving the lazy indexing classes like `ExplicitlyIndexed` to use standard array protocols will be a key step in decoupling. It will also potentially improve interoperability with other libraries that use these protocols, and prepare these classes [for eventual movement out](https://github.com/pydata/xarray/issues/5081) of the Xarray code base. 
However, this will also require significant changes to the code, and we will need to ensure that all existing functionality is preserved. + * Use [https://data-apis.org/array-api-compat/](https://data-apis.org/array-api-compat/) to handle compatibility issues? +* **Leave lazy indexing classes in Xarray for now** +* **Preserve support for Dask collection protocols**: named-array will preserve existing support for the dask collections protocol, namely the `__dask_***__` methods +* **Preserve support for ChunkManagerEntrypoint?** Opening variables backed by dask vs cubed arrays currently is [handled within Variable.chunk](https://github.com/pydata/xarray/blob/92c8b33eb464b09d6f8277265b16cae039ab57ee/xarray/core/variable.py#L1272C15-L1272C15). If we are preserving dask support it would be nice to preserve general chunked array type support, but this currently requires an entrypoint. + +### Plan + +1. Create a new baseclass for `xarray.Variable` in its own module, e.g. `xarray.core.base_variable` +2. Remove all imports of internal Xarray classes and utils from `base_variable.py`. `base_variable.Variable` should not depend on anything in xarray.core + * Will require moving the lazy indexing classes (subclasses of ExplicitlyIndexed) to be standards compliant containers. + * an array-api compliant container that provides `__array_namespace__` + * Support `.oindex` and `.vindex` for explicit indexing + * Potentially implement this by introducing a new compliant wrapper object? + * Delete the `NON_NUMPY_SUPPORTED_ARRAY_TYPES` variable which special-cases ExplicitlyIndexed and `pd.Index`. + * `ExplicitlyIndexed` class and subclasses should provide `.oindex` and `.vindex` for indexing by `Variable.__getitem__`: `oindex` and `vindex` were proposed in [NEP21](https://numpy.org/neps/nep-0021-advanced-indexing.html), but have not been implemented yet + * Delete the ExplicitIndexer objects (`BasicIndexer`, `VectorizedIndexer`, `OuterIndexer`) + * Remove explicit support for `pd.Index`. When provided with a `pd.Index` object, Variable will coerce to an array using `np.array(pd.Index)`. For Xarray's purposes, Xarray can use `as_variable` to explicitly wrap these in PandasIndexingAdapter and pass them to `Variable.__init__`. +3. Define a minimal variable interface that the rest of Xarray can use: + 1. `dims`: tuple of dimension names + 2. `data`: numpy/dask/duck arrays + 3. `attrs`: dictionary of attributes + +4. Implement basic functions & methods for manipulating these objects. These methods will be a cleaned-up subset (for now) of functionality on xarray.Variable, with adaptations inspired by the [Python array API](https://data-apis.org/array-api/2022.12/API_specification/index.html). +5. Existing Variable structures + 1. Keep Variable object which subclasses the new structure that adds the `.encoding` attribute and potentially other methods needed for easy refactoring. + 2. IndexVariable will remain in xarray.core.variable and subclass the new named-array data structure pending future deletion. +6. Docstrings and user-facing APIs will need to be updated to reflect the changed methods on Variable objects. + +Further implementation details are in Appendix: [Implementation Details](#appendix-implementation-details). + +## Project Timeline and Milestones + +We have identified the following milestones for the completion of this project: + +1. **Write and publish a design document**: this document will explain the purpose of named-array, the intended audience, and the features it will provide.
It will also describe the architecture of named-array and how it will be implemented. This will ensure early community awareness and engagement in the project to promote subsequent uptake. +2. **Refactor `variable.py` to `base_variable.py`** and remove internal Xarray imports. +3. **Break out the package and create continuous integration infrastructure**: this will entail breaking out the named-array project into a Python package and creating a continuous integration (CI) system. This will help to modularize the code and make it easier to manage. Building a CI system will help ensure that codebase changes do not break existing functionality. +4. Incrementally add new functions & methods to the new package, ported from xarray. This will start to make named-array useful on its own. +5. Refactor the existing Xarray codebase to rely on the newly created package (named-array): this will help to demonstrate the usefulness of the new package, and also provide an example for others who may want to use it. +6. Expand tests, add documentation, and write a blog post: expanding the test suite will help to ensure that the code is reliable and that changes do not introduce bugs. Adding documentation will make it easier for others to understand and use the project. +7. Finally, we will write a series of blog posts on [xarray.dev](https://xarray.dev/) to promote the project and attract more contributors. + * Toward the end of the process, write a few blog posts that demonstrate the use of the newly available data structure + * Pick the same example applications used by other implementations/applications (e.g. Pytorch, sklearn, and Levanter) to show how it can work. + +## Related Work + +1. [GitHub - deepmind/graphcast](https://github.com/deepmind/graphcast) +2. [Getting Started — LArray 0.34 documentation](https://larray.readthedocs.io/en/stable/tutorial/getting_started.html) +3. [Levanter — Legible, Scalable, Reproducible Foundation Models with JAX](https://crfm.stanford.edu/2023/06/16/levanter-1_0-release.html) +4. [google/xarray-tensorstore](https://github.com/google/xarray-tensorstore) +5. [State of Torch Named Tensors · Issue #60832 · pytorch/pytorch · GitHub](https://github.com/pytorch/pytorch/issues/60832) + * Incomplete support: many primitive operations result in errors, making it difficult to use NamedTensors in practice. Users often have to resort to removing the names from tensors to avoid these errors. + * Lack of active development: the development of the NamedTensor feature in PyTorch is not currently active due to a lack of bandwidth for resolving ambiguities in the design. + * Usability issues: the current form of NamedTensor is not user-friendly and sometimes raises errors, making it difficult for users to incorporate NamedTensors into their workflows. +6. [Scikit-learn Enhancement Proposals (SLEPs) 8, 12, 14](https://github.com/scikit-learn/enhancement_proposals/pull/18) + * Some of the key points and limitations discussed in these proposals are: + * Inconsistency in feature name handling: Scikit-learn currently lacks a consistent and comprehensive way to handle and propagate feature names through its pipelines and estimators ([SLEP 8](https://github.com/scikit-learn/enhancement_proposals/pull/18), [SLEP 12](https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep012/proposal.html)).
+ * Memory intensive for large feature sets: storing and propagating feature names can be memory intensive, particularly in cases where the entire "dictionary" becomes the features, such as in NLP use cases ([SLEP 8](https://github.com/scikit-learn/enhancement_proposals/pull/18), [GitHub issue #35](https://github.com/scikit-learn/enhancement_proposals/issues/35)) + * Sparse matrices: sparse data structures present a challenge for feature name propagation. For instance, the sparse data structure functionality in Pandas 1.0 only supports converting directly to the coordinate format (COO), which can be an issue with transformers such as `OneHotEncoder.transform`, which has been optimized to construct a CSR matrix ([SLEP 14](https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep014/proposal.html)) + * New data structures: the introduction of new data structures, such as "InputArray" or "DataArray", could lead to more burden for third-party estimator maintainers and increase the learning curve for users. Xarray's "DataArray" is mentioned as a potential alternative, but the proposal mentions that the conversion from a Pandas dataframe to a Dataset is not lossless ([SLEP 12](https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep012/proposal.html), [SLEP 14](https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep014/proposal.html), [GitHub issue #35](https://github.com/scikit-learn/enhancement_proposals/issues/35)). + * Dependency on other libraries: solutions that involve using Xarray and/or Pandas to handle feature names come with the challenge of managing dependencies. While a soft dependency approach is suggested, this means users would be able to have/enable the feature only if they have the dependency installed. named-array's integration with other scientific Python libraries could potentially help with this issue ([GitHub issue #35](https://github.com/scikit-learn/enhancement_proposals/issues/35)). + +## References and Previous Discussion + +* [[Proposal] Expose Variable without Pandas dependency · Issue #3981 · pydata/xarray · GitHub](https://github.com/pydata/xarray/issues/3981) +* [https://github.com/pydata/xarray/issues/3981#issuecomment-985051449](https://github.com/pydata/xarray/issues/3981#issuecomment-985051449) +* [Lazy indexing arrays as a stand-alone package · Issue #5081 · pydata/xarray · GitHub](https://github.com/pydata/xarray/issues/5081) + +### Appendix: Engagement with the Community + +We plan to publicize this document on: + +* [x] `Xarray dev call` +* [ ] `Scientific Python discourse` +* [ ] `Xarray Github` +* [ ] `Twitter` +* [ ] `Respond to NamedTensor and Scikit-Learn issues?` +* [ ] `Pangeo Discourse` +* [ ] `Numpy, SciPy email lists?` +* [ ] `Xarray blog` + +Additionally, we plan on writing a series of blog posts to effectively showcase the implementation and potential of the newly available functionality. To illustrate this, we will use the same example applications as other established libraries (such as Pytorch, sklearn), providing practical demonstrations of how these new data structures can be leveraged. + +### Appendix: API Surface + +Questions: + +1. Document Xarray indexing rules +2. Document use of .oindex and .vindex protocols +3. Do we use `.mean` and `.nanmean` or `.mean(skipna=...)`? + * Default behavior in named-array should mirror NumPy / the array API standard, not pandas. + * nanmean is not (yet) in the [array API](https://github.com/pydata/xarray/pull/7424#issuecomment-1373979208).
There are a handful of other key functions (e.g., median) that are also missing. I think that should be OK, as long as what we support is a strict superset of the array API. +4. What methods need to be exposed on Variable? + * `Variable.concat` classmethod: create two functions, one as the equivalent of `np.stack` and the other for `np.concat` + * `.rolling_window` and `.coarsen_reshape`? + * `named-array.apply_ufunc`: used in astype, clip, quantile, isnull, notnull + +#### methods to be preserved from xarray.Variable + +```python +# Sorting + Variable.argsort + Variable.searchsorted + +# NaN handling + Variable.fillna + Variable.isnull + Variable.notnull + +# Lazy data handling + Variable.chunk # Could instead have accessor interface and recommend users use `Variable.dask.chunk` and `Variable.cubed.chunk`? + Variable.to_numpy() + Variable.as_numpy() + +# Xarray-specific + Variable.get_axis_num + Variable.isel + Variable.to_dict + +# Reductions + Variable.reduce + Variable.all + Variable.any + Variable.argmax + Variable.argmin + Variable.count + Variable.max + Variable.mean + Variable.median + Variable.min + Variable.prod + Variable.quantile + Variable.std + Variable.sum + Variable.var + +# Accumulate + Variable.cumprod + Variable.cumsum + +# numpy-like Methods + Variable.astype + Variable.copy + Variable.clip + Variable.round + Variable.item + Variable.where + +# Reordering/Reshaping + Variable.squeeze + Variable.pad + Variable.roll + Variable.shift + +``` + +#### methods to be renamed from xarray.Variable + +```python +# Xarray-specific + Variable.concat # create two functions, one as the equivalent of `np.stack` and the other for `np.concat` + + # Given how niche these are, these would be better as functions than methods. + # We could also keep these in Xarray, at least for now. If we don't think people will use functionality outside of Xarray it probably is not worth the trouble of porting it (including documentation, etc). + Variable.coarsen # This should probably be called something like coarsen_reduce. + Variable.coarsen_reshape + Variable.rolling_window + + Variable.set_dims # split this into broadcast_to and expand_dims + + +# Reordering/Reshaping + Variable.stack # To avoid confusion with np.stack, let's call this stack_dims. + Variable.transpose # Could consider calling this permute_dims, like the [array API standard](https://data-apis.org/array-api/2022.12/API_specification/manipulation_functions.html#objects-in-api) + Variable.unstack # Likewise, maybe call this unstack_dims? +``` + +#### methods to be removed from xarray.Variable + +```python +# Testing + Variable.broadcast_equals + Variable.equals + Variable.identical + Variable.no_conflicts + +# Lazy data handling + Variable.compute # We can probably omit this method for now, too, given that dask.compute() uses a protocol. The other concern is that different array libraries have different notions of "compute" and this one is rather Dask specific, including conversion from Dask to NumPy arrays. For example, in JAX every operation executes eagerly, but in a non-blocking fashion, and you need to call jax.block_until_ready() to ensure computation is finished. + Variable.load # Could remove? compute vs load is a common source of confusion. + +# Xarray-specific + Variable.to_index + Variable.to_index_variable + Variable.to_variable + Variable.to_base_variable + Variable.to_coord + + Variable.rank # Uses bottleneck. Delete?
Could use https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html instead + + +# numpy-like Methods + Variable.conjugate # .conj is enough + Variable.__array_wrap__ # This is a very old NumPy protocol for duck arrays. We don't need it now that we have `__array_ufunc__` and `__array_function__` + +# Encoding + Variable.reset_encoding + +``` + +#### Attributes to be preserved from xarray.Variable + +```python +# Properties + Variable.attrs + Variable.chunks + Variable.data + Variable.dims + Variable.dtype + + Variable.nbytes + Variable.ndim + Variable.shape + Variable.size + Variable.sizes + + Variable.T + Variable.real + Variable.imag + Variable.conj +``` + +#### Attributes to be renamed from xarray.Variable + +```python +``` + +#### Attributes to be removed from xarray.Variable + +```python + + Variable.values # Probably also remove -- this is a legacy from before Xarray supported dask arrays. ".data" is enough. + +# Encoding + Variable.encoding + +``` + +### Appendix: Implementation Details + +* Merge in VariableArithmetic's parent classes: AbstractArray, NdimSizeLenMixin with the new data structure.. + +```python +class VariableArithmetic( + ImplementsArrayReduce, + IncludeReduceMethods, + IncludeCumMethods, + IncludeNumpySameMethods, + SupportsArithmetic, + VariableOpsMixin, +): + __slots__ = () + # prioritize our operations over those of numpy.ndarray (priority=0) + __array_priority__ = 50 + +``` + +* Move over `_typed_ops.VariableOpsMixin` +* Build a list of utility functions used elsewhere : Which of these should become public API? + * `broadcast_variables`: `dataset.py`, `dataarray.py`,`missing.py` + * This could be just called "broadcast" in named-array. + * `Variable._getitem_with_mask` : `alignment.py` + * keep this method/function as private and inside Xarray. +* The Variable constructor will need to be rewritten to no longer accept tuples, encodings, etc. These details should be handled at the Xarray data structure level. +* What happens to `duck_array_ops?` +* What about Variable.chunk and "chunk managers"? + * Could this functionality be left in Xarray proper for now? Alternative array types like JAX also have some notion of "chunks" for parallel arrays, but the details differ in a number of ways from the Dask/Cubed. + * Perhaps variable.chunk/load methods should become functions defined in xarray that convert Variable objects. This is easy so long as xarray can reach in and replace .data + +* Utility functions like `as_variable` should be moved out of `base_variable.py` so they can convert BaseVariable objects to/from DataArray or Dataset containing explicitly indexed arrays. diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 5d825be2e08..527bdcdede2 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -9,17 +9,40 @@ .. 
autosummary:: :toctree: generated/ + Coordinates.from_pandas_multiindex + Coordinates.get + Coordinates.items + Coordinates.keys + Coordinates.values + Coordinates.dims + Coordinates.dtypes + Coordinates.variables + Coordinates.xindexes + Coordinates.indexes + Coordinates.to_dataset + Coordinates.to_index + Coordinates.update + Coordinates.merge + Coordinates.copy + Coordinates.equals + Coordinates.identical + core.coordinates.DatasetCoordinates.get core.coordinates.DatasetCoordinates.items core.coordinates.DatasetCoordinates.keys - core.coordinates.DatasetCoordinates.merge - core.coordinates.DatasetCoordinates.to_dataset - core.coordinates.DatasetCoordinates.to_index - core.coordinates.DatasetCoordinates.update core.coordinates.DatasetCoordinates.values core.coordinates.DatasetCoordinates.dims - core.coordinates.DatasetCoordinates.indexes + core.coordinates.DatasetCoordinates.dtypes core.coordinates.DatasetCoordinates.variables + core.coordinates.DatasetCoordinates.xindexes + core.coordinates.DatasetCoordinates.indexes + core.coordinates.DatasetCoordinates.to_dataset + core.coordinates.DatasetCoordinates.to_index + core.coordinates.DatasetCoordinates.update + core.coordinates.DatasetCoordinates.merge + core.coordinates.DataArrayCoordinates.copy + core.coordinates.DatasetCoordinates.equals + core.coordinates.DatasetCoordinates.identical core.rolling.DatasetCoarsen.boundary core.rolling.DatasetCoarsen.coord_func @@ -47,14 +70,19 @@ core.coordinates.DataArrayCoordinates.get core.coordinates.DataArrayCoordinates.items core.coordinates.DataArrayCoordinates.keys - core.coordinates.DataArrayCoordinates.merge - core.coordinates.DataArrayCoordinates.to_dataset - core.coordinates.DataArrayCoordinates.to_index - core.coordinates.DataArrayCoordinates.update core.coordinates.DataArrayCoordinates.values core.coordinates.DataArrayCoordinates.dims - core.coordinates.DataArrayCoordinates.indexes + core.coordinates.DataArrayCoordinates.dtypes core.coordinates.DataArrayCoordinates.variables + core.coordinates.DataArrayCoordinates.xindexes + core.coordinates.DataArrayCoordinates.indexes + core.coordinates.DataArrayCoordinates.to_dataset + core.coordinates.DataArrayCoordinates.to_index + core.coordinates.DataArrayCoordinates.update + core.coordinates.DataArrayCoordinates.merge + core.coordinates.DataArrayCoordinates.copy + core.coordinates.DataArrayCoordinates.equals + core.coordinates.DataArrayCoordinates.identical core.rolling.DataArrayCoarsen.boundary core.rolling.DataArrayCoarsen.coord_func @@ -451,6 +479,21 @@ CFTimeIndex.values CFTimeIndex.year + Index.from_variables + Index.concat + Index.stack + Index.unstack + Index.create_variables + Index.to_pandas_index + Index.isel + Index.sel + Index.join + Index.reindex_like + Index.equals + Index.roll + Index.rename + Index.copy + backends.NetCDF4DataStore.close backends.NetCDF4DataStore.encode backends.NetCDF4DataStore.encode_attribute diff --git a/doc/api.rst b/doc/api.rst index 34d6558ed55..0cf07f91df8 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1085,12 +1085,14 @@ Advanced API .. 
autosummary:: :toctree: generated/ + Coordinates Dataset.variables DataArray.variable Variable IndexVariable as_variable - indexes.Index + Index + IndexSelResult Context register_dataset_accessor register_dataarray_accessor diff --git a/doc/conf.py b/doc/conf.py index eb861004e2f..6c6efb47f6b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -58,7 +58,7 @@ ] ) -nbsphinx_allow_errors = True +nbsphinx_allow_errors = False # -- General configuration ------------------------------------------------ @@ -238,7 +238,7 @@ extra_footer="""

Xarray is a fiscally sponsored project of NumFOCUS, a nonprofit dedicated to supporting the open-source scientific computing community.
Theme by the Executable Book Project

""", - twitter_url="https://twitter.com/xarray_devs", + twitter_url="https://twitter.com/xarray_dev", icon_links=[], # workaround for pydata/pydata-sphinx-theme#1220 ) @@ -323,6 +323,7 @@ "dask": ("https://docs.dask.org/en/latest", None), "cftime": ("https://unidata.github.io/cftime", None), "sparse": ("https://sparse.pydata.org/en/latest/", None), + "cubed": ("https://tom-e-white.com/cubed/", None), } diff --git a/doc/contributing.rst b/doc/contributing.rst index 3cc43314d9a..3cdd7dd9933 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -518,7 +518,7 @@ See the `Installation `_ Including figures and files ---------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~ Image files can be directly included in pages with the ``image::`` directive. diff --git a/doc/developers-meeting.rst b/doc/developers-meeting.rst index 1c49a900f66..153f3520f26 100644 --- a/doc/developers-meeting.rst +++ b/doc/developers-meeting.rst @@ -3,18 +3,18 @@ Developers meeting Xarray developers meet bi-weekly every other Wednesday. -The meeting occurs on `Zoom `__. +The meeting occurs on `Zoom `__. -Find the `notes for the meeting here `__. +Find the `notes for the meeting here `__. There is a :issue:`GitHub issue for changes to the meeting<4001>`. You can subscribe to this calendar to be notified of changes: -* `Google Calendar `__ -* `iCal `__ +* `Google Calendar `__ +* `iCal `__ .. raw:: html - + diff --git a/doc/examples/multidimensional-coords.ipynb b/doc/examples/multidimensional-coords.ipynb index f7471f05e5d..ce8a091a5da 100644 --- a/doc/examples/multidimensional-coords.ipynb +++ b/doc/examples/multidimensional-coords.ipynb @@ -56,7 +56,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this example, the _logical coordinates_ are `x` and `y`, while the _physical coordinates_ are `xc` and `yc`, which represent the latitudes and longitude of the data." + "In this example, the _logical coordinates_ are `x` and `y`, while the _physical coordinates_ are `xc` and `yc`, which represent the longitudes and latitudes of the data." ] }, { diff --git a/doc/examples/visualization_gallery.ipynb b/doc/examples/visualization_gallery.ipynb index e6fa564db0d..e7e9196a6f6 100644 --- a/doc/examples/visualization_gallery.ipynb +++ b/doc/examples/visualization_gallery.ipynb @@ -193,90 +193,6 @@ "# Show\n", "plt.tight_layout()" ] - }, - { - "cell_type": "markdown", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "## `imshow()` and rasterio map projections\n", - "\n", - "\n", - "Using rasterio's projection information for more accurate plots.\n", - "\n", - "This example extends `recipes.rasterio` and plots the image in the\n", - "original map projection instead of relying on pcolormesh and a map\n", - "transformation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "da = xr.tutorial.open_rasterio(\"RGB.byte\")\n", - "\n", - "# The data is in UTM projection. 
We have to set it manually until\n", - "# https://github.com/SciTools/cartopy/issues/813 is implemented\n", - "crs = ccrs.UTM(\"18\")\n", - "\n", - "# Plot on a map\n", - "ax = plt.subplot(projection=crs)\n", - "da.plot.imshow(ax=ax, rgb=\"band\", transform=crs)\n", - "ax.coastlines(\"10m\", color=\"r\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Parsing rasterio geocoordinates\n", - "\n", - "Converting a projection's cartesian coordinates into 2D longitudes and\n", - "latitudes.\n", - "\n", - "These new coordinates might be handy for plotting and indexing, but it should\n", - "be kept in mind that a grid which is regular in projection coordinates will\n", - "likely be irregular in lon/lat. It is often recommended to work in the data's\n", - "original map projection (see `recipes.rasterio_rgb`)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyproj import Transformer\n", - "import numpy as np\n", - "\n", - "da = xr.tutorial.open_rasterio(\"RGB.byte\")\n", - "\n", - "x, y = np.meshgrid(da[\"x\"], da[\"y\"])\n", - "transformer = Transformer.from_crs(da.crs, \"EPSG:4326\", always_xy=True)\n", - "lon, lat = transformer.transform(x, y)\n", - "da.coords[\"lon\"] = ((\"y\", \"x\"), lon)\n", - "da.coords[\"lat\"] = ((\"y\", \"x\"), lat)\n", - "\n", - "# Compute a greyscale out of the rgb image\n", - "greyscale = da.mean(dim=\"band\")\n", - "\n", - "# Plot on a map\n", - "ax = plt.subplot(projection=ccrs.PlateCarree())\n", - "greyscale.plot(\n", - " ax=ax,\n", - " x=\"lon\",\n", - " y=\"lat\",\n", - " transform=ccrs.PlateCarree(),\n", - " cmap=\"Greys_r\",\n", - " shading=\"auto\",\n", - " add_colorbar=False,\n", - ")\n", - "ax.coastlines(\"10m\", color=\"r\")" - ] } ], "metadata": { @@ -296,6 +212,13 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } } }, "nbformat": 4, diff --git a/doc/getting-started-guide/installing.rst b/doc/getting-started-guide/installing.rst index 9fee849a341..ff8650bc0ff 100644 --- a/doc/getting-started-guide/installing.rst +++ b/doc/getting-started-guide/installing.rst @@ -7,7 +7,7 @@ Required dependencies --------------------- - Python (3.9 or later) -- `numpy `__ (1.21 or later) +- `numpy `__ (1.22 or later) - `packaging `__ (21.3 or later) - `pandas `__ (1.4 or later) diff --git a/doc/howdoi.rst b/doc/howdoi.rst index b6374cc5100..8cc4e9939f2 100644 --- a/doc/howdoi.rst +++ b/doc/howdoi.rst @@ -42,7 +42,7 @@ How do I ... * - extract the underlying array (e.g. NumPy or Dask arrays) - :py:attr:`DataArray.data` * - convert to and extract the underlying NumPy array - - :py:attr:`DataArray.values` + - :py:attr:`DataArray.to_numpy` * - convert to a pandas DataFrame - :py:attr:`Dataset.to_dataframe` * - sort values diff --git a/doc/internals/chunked-arrays.rst b/doc/internals/chunked-arrays.rst new file mode 100644 index 00000000000..7192c3f0bc5 --- /dev/null +++ b/doc/internals/chunked-arrays.rst @@ -0,0 +1,102 @@ +.. currentmodule:: xarray + +.. _internals.chunkedarrays: + +Alternative chunked array types +=============================== + +.. warning:: + + This is a *highly* experimental feature. Please report any bugs or other difficulties on `xarray's issue tracker `_. 
+ In particular see discussion on `xarray issue #6807 `_ + +Xarray can wrap chunked dask arrays (see :ref:`dask`), but can also wrap any other chunked array type that exposes the correct interface. +This allows us to support using other frameworks for distributed and out-of-core processing, with user code still written as xarray commands. +In particular xarray also supports wrapping :py:class:`cubed.Array` objects +(see `Cubed's documentation `_ and the `cubed-xarray package `_). + +The basic idea is that by wrapping an array that has an explicit notion of ``.chunks``, xarray can expose control over +the choice of chunking scheme to users via methods like :py:meth:`DataArray.chunk` whilst the wrapped array actually +implements the handling of processing all of the chunks. + +Chunked array methods and "core operations" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A chunked array needs to meet all the :ref:`requirements for normal duck arrays `, but must also +implement additional features. + +Chunked arrays have additional attributes and methods, such as ``.chunks`` and ``.rechunk``. +Furthermore, Xarray dispatches chunk-aware computations across one or more chunked arrays using special functions known +as "core operations". Examples include ``map_blocks``, ``blockwise``, and ``apply_gufunc``. + +The core operations are generalizations of functions first implemented in :py:mod:`dask.array`. +The implementation of these functions is specific to the type of arrays passed to them. For example, when applying the +``map_blocks`` core operation, :py:class:`dask.array.Array` objects must be processed by :py:func:`dask.array.map_blocks`, +whereas :py:class:`cubed.Array` objects must be processed by :py:func:`cubed.map_blocks`. + +In order to use the correct implementation of a core operation for the array type encountered, xarray dispatches to the +corresponding subclass of :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint`, +also known as a "Chunk Manager". Therefore **a full list of the operations that need to be defined is set by the +API of the** :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint` **abstract base class**. Note that chunked array +methods are also currently dispatched using this class. + +Chunked array creation is also handled by this class. As chunked array objects have a one-to-one correspondence with +in-memory numpy arrays, it should be possible to create a chunked array from a numpy array by passing the desired +chunking pattern to an implementation of :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint.from_array``. + +.. note:: + + The :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint` abstract base class is mostly just acting as a + namespace for containing the chunked-aware function primitives. Ideally in the future we would have an API standard + for chunked array types which codified this structure, making the entrypoint system unnecessary. + +.. currentmodule:: xarray.core.parallelcompat + +.. autoclass:: xarray.core.parallelcompat.ChunkManagerEntrypoint + :members: + +Registering a new ChunkManagerEntrypoint subclass +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Rather than hard-coding various chunk managers to deal with specific chunked array implementations, xarray uses an +entrypoint system to allow developers of new chunked array implementations to register their corresponding subclass of +:py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint`. 
+ + +To register a new entrypoint you need to add an entry to the ``setup.cfg`` like this:: + + [options.entry_points] + xarray.chunkmanagers = + dask = xarray.core.daskmanager:DaskManager + +See also `cubed-xarray `_ for another example. + +To check that the entrypoint has worked correctly, you may find it useful to display the available chunkmanagers using +the internal function :py:func:`~xarray.core.parallelcompat.list_chunkmanagers`. + +.. autofunction:: list_chunkmanagers + + +User interface +~~~~~~~~~~~~~~ + +Once the chunkmanager subclass has been registered, xarray objects wrapping the desired array type can be created in 3 ways: + +#. By manually passing the array type to the :py:class:`~xarray.DataArray` constructor, see the examples for :ref:`numpy-like arrays `, + +#. Calling :py:meth:`~xarray.DataArray.chunk`, passing the keyword arguments ``chunked_array_type`` and ``from_array_kwargs``, + +#. Calling :py:func:`~xarray.open_dataset`, passing the keyword arguments ``chunked_array_type`` and ``from_array_kwargs``. + +The latter two methods ultimately call the chunkmanager's implementation of ``.from_array``, to which they pass the ``from_array_kwargs`` dict. +The ``chunked_array_type`` kwarg selects which registered chunkmanager subclass to dispatch to. It defaults to ``'dask'`` +if Dask is installed, otherwise it defaults to whichever chunkmanager is registered if only one is registered. +If multiple chunkmanagers are registered it will raise an error by default. + +Parallel processing without chunks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To use a parallel array type that does not expose a concept of chunks explicitly, none of the information on this page +is theoretically required. Such an array type (e.g. `Ramba `_ or +`Arkouda `_) could be wrapped using xarray's existing support for +:ref:`numpy-like "duck" arrays `. diff --git a/doc/internals/duck-arrays-integration.rst b/doc/internals/duck-arrays-integration.rst index d403328aa2f..1f1f57974df 100644 --- a/doc/internals/duck-arrays-integration.rst +++ b/doc/internals/duck-arrays-integration.rst @@ -1,23 +1,57 @@ -.. _internals.duck_arrays: +.. _internals.duckarrays: Integrating with duck arrays ============================= .. warning:: - This is a experimental feature. + This is an experimental feature. Please report any bugs or other difficulties on `xarray's issue tracker `_. -Xarray can wrap custom :term:`duck array` objects as long as they define numpy's -``shape``, ``dtype`` and ``ndim`` properties and the ``__array__``, -``__array_ufunc__`` and ``__array_function__`` methods. +Xarray can wrap custom numpy-like arrays (":term:`duck array`\s") - see the :ref:`user guide documentation `. +This page is intended for developers who are interested in wrapping a new custom array type with xarray. + +.. _internals.duckarrays.requirements: + +Duck array requirements +~~~~~~~~~~~~~~~~~~~~~~~ + +Xarray does not explicitly check that required methods are defined by the underlying duck array object before +attempting to wrap the given array. However, a wrapped array type should at a minimum define these attributes: + +* ``shape`` property, +* ``dtype`` property, +* ``ndim`` property, +* ``__array__`` method, +* ``__array_ufunc__`` method, +* ``__array_function__`` method. + +These need to be defined consistently with :py:class:`numpy.ndarray`, for example the array ``shape`` +property needs to obey `numpy's broadcasting rules `_ +(see also the `Python Array API standard's explanation `_ +of these same rules). 
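To make the list above concrete, here is a deliberately minimal sketch of what such a class could look like. It is purely illustrative (the class name and the decision to delegate everything to an underlying :py:class:`numpy.ndarray` are assumptions of this sketch, not part of xarray or of any real duck-array library), and a production implementation would handle nested arguments, ``out=`` keywords and non-array return values much more carefully:

.. code:: python

    import numpy as np


    class MinimalDuckArray:
        """Illustrative only: wraps a numpy array and forwards the attributes xarray looks for."""

        def __init__(self, value):
            self._value = np.asarray(value)

        # required properties
        @property
        def shape(self):
            return self._value.shape

        @property
        def dtype(self):
            return self._value.dtype

        @property
        def ndim(self):
            return self._value.ndim

        # required protocol methods
        def __array__(self, dtype=None):
            return np.asarray(self._value, dtype=dtype)

        def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
            # unwrap any MinimalDuckArray inputs, apply the ufunc, re-wrap the result
            inputs = tuple(x._value if isinstance(x, MinimalDuckArray) else x for x in inputs)
            return type(self)(getattr(ufunc, method)(*inputs, **kwargs))

        def __array_function__(self, func, types, args, kwargs):
            # simplified: only unwraps top-level positional arguments
            args = tuple(a._value if isinstance(a, MinimalDuckArray) else a for a in args)
            result = func(*args, **kwargs)
            return type(self)(result) if isinstance(result, np.ndarray) else result

Under the assumptions above, passing an instance of this class to the :py:class:`~xarray.DataArray` constructor should result in it being wrapped like any other duck array rather than being converted to numpy up front.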
+ +Python Array API standard support +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As an integration library xarray benefits greatly from the standardization of duck-array libraries' APIs, and so is a +big supporter of the `Python Array API Standard `_. . + +We aim to support any array libraries that follow the Array API standard out-of-the-box. However, xarray does occasionally +call some numpy functions which are not (yet) part of the standard (e.g. :py:meth:`xarray.DataArray.pad` calls :py:func:`numpy.pad`). +See `xarray issue #7848 `_ for a list of such functions. We can still support dispatching on these functions through +the array protocols above, it just means that if you exclusively implement the methods in the Python Array API standard +then some features in xarray will not work. + +Custom inline reprs +~~~~~~~~~~~~~~~~~~~ In certain situations (e.g. when printing the collapsed preview of variables of a ``Dataset``), xarray will display the repr of a :term:`duck array` in a single line, truncating it to a certain number of characters. If that would drop too much information, the :term:`duck array` may define a ``_repr_inline_`` method that takes ``max_width`` (number of characters) as an -argument: +argument .. code:: python diff --git a/doc/internals/extending-xarray.rst b/doc/internals/extending-xarray.rst index 56aeb8fa462..a180b85044f 100644 --- a/doc/internals/extending-xarray.rst +++ b/doc/internals/extending-xarray.rst @@ -1,4 +1,6 @@ +.. _internals.accessors: + Extending xarray using accessors ================================ diff --git a/doc/internals/how-to-create-custom-index.rst b/doc/internals/how-to-create-custom-index.rst new file mode 100644 index 00000000000..93805229db1 --- /dev/null +++ b/doc/internals/how-to-create-custom-index.rst @@ -0,0 +1,233 @@ +.. currentmodule:: xarray + +How to create a custom index +============================ + +.. warning:: + + This feature is highly experimental. Support for custom indexes has been + introduced in v2022.06.0 and is still incomplete. API is subject to change + without deprecation notice. However we encourage you to experiment and report issues that arise. + +Xarray's built-in support for label-based indexing (e.g. `ds.sel(latitude=40, method="nearest")`) and alignment operations +relies on :py:class:`pandas.Index` objects. Pandas Indexes are powerful and suitable for many +applications but also have some limitations: + +- it only works with 1-dimensional coordinates where explicit labels + are fully loaded in memory +- it is hard to reuse it with irregular data for which there exist more + efficient, tree-based structures to perform data selection +- it doesn't support extra metadata that may be required for indexing and + alignment (e.g., a coordinate reference system) + +Fortunately, Xarray now allows extending this functionality with custom indexes, +which can be implemented in 3rd-party libraries. + +The Index base class +-------------------- + +Every Xarray index must inherit from the :py:class:`Index` base class. It is for +example the case of Xarray built-in ``PandasIndex`` and ``PandasMultiIndex`` +subclasses, which wrap :py:class:`pandas.Index` and +:py:class:`pandas.MultiIndex` respectively. 
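Before walking through the requirements in detail, the skeleton below sketches the general shape of a third-party index. The class name and method bodies are placeholders for illustration only; a fully worked ``RasterIndex`` example is given further down this page.

.. code-block:: python

    from xarray import Index


    class MyCustomIndex(Index):
        """Placeholder skeleton: a real index overrides only the methods it needs."""

        @classmethod
        def from_variables(cls, variables):
            # build the index from one or more coordinate variables (required)
            raise NotImplementedError

        def sel(self, labels):
            # translate coordinate labels into positional indexers
            # (optional, needed for label-based selection)
            raise NotImplementedError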
+ +The ``Index`` API closely follows the :py:class:`Dataset` and +:py:class:`DataArray` API, e.g., for an index to support :py:meth:`DataArray.sel` it needs to +implement :py:meth:`Index.sel`, to support :py:meth:`DataArray.stack` and :py:meth:`DataArray.unstack` it +needs to implement :py:meth:`Index.stack` and :py:meth:`Index.unstack`, etc. + +Some guidelines and examples are given below. More details can be found in the +documented :py:class:`Index` API. + +Minimal requirements +-------------------- + +Every index must at least implement the :py:meth:`Index.from_variables` class +method, which is used by Xarray to build a new index instance from one or more +existing coordinates in a Dataset or DataArray. + +Since any collection of coordinates can be passed to that method (i.e., the +number, order and dimensions of the coordinates are all arbitrary), it is the +responsibility of the index to check the consistency and validity of those input +coordinates. + +For example, :py:class:`~xarray.core.indexes.PandasIndex` accepts only one coordinate and +:py:class:`~xarray.core.indexes.PandasMultiIndex` accepts one or more 1-dimensional coordinates that must all +share the same dimension. Other, custom indexes need not have the same +constraints, e.g., + +- a georeferenced raster index which only accepts two 1-d coordinates with + distinct dimensions +- a staggered grid index which takes coordinates with different dimension name + suffixes (e.g., "_c" and "_l" for center and left) + +Optional requirements +--------------------- + +Pretty much everything else is optional. Depending on the method, in the absence +of a (re)implementation, an index will either raise a `NotImplementedError` +or won't do anything specific (just drop, pass or copy itself +from/to the resulting Dataset or DataArray). + +For example, you can just skip re-implementing :py:meth:`Index.rename` if there +is no internal attribute or object to rename according to the new desired +coordinate or dimension names. In the case of ``PandasIndex``, we rename the +underlying ``pandas.Index`` object and/or update the ``PandasIndex.dim`` +attribute since the associated dimension name has been changed. + +Wrap index data as coordinate data +---------------------------------- + +In some cases it is possible to reuse the index's underlying object or structure +as coordinate data and hence avoid data duplication. + +For ``PandasIndex`` and ``PandasMultiIndex``, we +leverage the fact that ``pandas.Index`` objects expose some array-like API. In +Xarray we use some wrappers around those underlying objects as a thin +compatibility layer to preserve dtypes, handle explicit and n-dimensional +indexing, etc. + +Other structures like tree-based indexes (e.g., kd-tree) may differ too much +from arrays to reuse it as coordinate data. + +If the index data can be reused as coordinate data, the ``Index`` subclass +should implement :py:meth:`Index.create_variables`. This method accepts a +dictionary of variable names as keys and :py:class:`Variable` objects as values (used for propagating +variable metadata) and should return a dictionary of new :py:class:`Variable` or +:py:class:`IndexVariable` objects. + +Data selection +-------------- + +For an index to support label-based selection, it needs to at least implement +:py:meth:`Index.sel`. 
This method accepts a dictionary of labels where the keys +are coordinate names (already filtered for the current index) and the values can +be pretty much anything (e.g., a slice, a tuple, a list, a numpy array, a +:py:class:`Variable` or a :py:class:`DataArray`). It is the responsibility of +the index to properly handle those input labels. + +:py:meth:`Index.sel` must return an instance of :py:class:`IndexSelResult`. The +latter is a small data class that holds positional indexers (indices) and that +may also hold new variables, new indexes, names of variables or indexes to drop, +names of dimensions to rename, etc. For example, this is useful in the case of +``PandasMultiIndex`` as it allows Xarray to convert it into a single ``PandasIndex`` +when only one level remains after the selection. + +The :py:class:`IndexSelResult` class is also used to merge results from label-based +selection performed by different indexes. Note that it is now possible to have +two distinct indexes for two 1-d coordinates sharing the same dimension, but it +is not currently possible to use those two indexes in the same call to +:py:meth:`Dataset.sel`. + +Optionally, the index may also implement :py:meth:`Index.isel`. In the case of +``PandasIndex`` we use it to create a new index object by just indexing the +underlying ``pandas.Index`` object. In other cases this may not be possible, +e.g., a kd-tree object may not be easily indexed. If ``Index.isel()`` is not +implemented, the index is just dropped in the DataArray or Dataset resulting +from the selection. + +Alignment +--------- + +For an index to support alignment, it needs to implement: + +- :py:meth:`Index.equals`, which compares the index with another index and + returns either ``True`` or ``False`` +- :py:meth:`Index.join`, which combines the index with another index and returns + a new Index object +- :py:meth:`Index.reindex_like`, which queries the index with another index and + returns positional indexers that are used to re-index Dataset or DataArray + variables along one or more dimensions + +Xarray ensures that those three methods are called with an index of the same +type as the argument. + +Meta-indexes +------------ + +Nothing prevents writing a custom Xarray index that itself encapsulates other +Xarray index(es). We call such an index a "meta-index". + +Here is a small example of a meta-index for geospatial, raster datasets (i.e., +regularly spaced 2-dimensional data) that internally relies on two +``PandasIndex`` instances for the x and y dimensions respectively: + +..
code-block:: python + + from xarray import Index + from xarray.core.indexes import PandasIndex + from xarray.core.indexing import merge_sel_results + + + class RasterIndex(Index): + def __init__(self, xy_indexes): + assert len(xy_indexes) == 2 + + # must have two distinct dimensions + dim = [idx.dim for idx in xy_indexes.values()] + assert dim[0] != dim[1] + + self._xy_indexes = xy_indexes + + @classmethod + def from_variables(cls, variables): + assert len(variables) == 2 + + xy_indexes = { + k: PandasIndex.from_variables({k: v}) for k, v in variables.items() + } + + return cls(xy_indexes) + + def create_variables(self, variables): + idx_variables = {} + + for index in self._xy_indexes.values(): + idx_variables.update(index.create_variables(variables)) + + return idx_variables + + def sel(self, labels): + results = [] + + for k, index in self._xy_indexes.items(): + if k in labels: + results.append(index.sel({k: labels[k]})) + + return merge_sel_results(results) + + +This basic index only supports label-based selection. Providing a full-featured +index by implementing the other ``Index`` methods should be pretty +straightforward for this example, though. + +This example is also not very useful unless we add some extra functionality on +top of the two encapsulated ``PandasIndex`` objects, such as a coordinate +reference system. + +How to use a custom index +------------------------- + +You can use :py:meth:`Dataset.set_xindex` or :py:meth:`DataArray.set_xindex` to assign a +custom index to a Dataset or DataArray, e.g., using the ``RasterIndex`` above: + +.. code-block:: python + + import numpy as np + import xarray as xr + + da = xr.DataArray( + np.random.uniform(size=(100, 50)), + coords={"x": ("x", np.arange(50)), "y": ("y", np.arange(100))}, + dims=("y", "x"), + ) + + # Xarray create default indexes for the 'x' and 'y' coordinates + # we first need to explicitly drop it + da = da.drop_indexes(["x", "y"]) + + # Build a RasterIndex from the 'x' and 'y' coordinates + da_raster = da.set_xindex(["x", "y"], RasterIndex) + + # RasterIndex now takes care of label-based selection + selected = da_raster.sel(x=10, y=slice(20, 50)) diff --git a/doc/internals/index.rst b/doc/internals/index.rst index e4ca9779dd7..7e13f0cfe95 100644 --- a/doc/internals/index.rst +++ b/doc/internals/index.rst @@ -8,6 +8,12 @@ stack, NumPy and pandas. It is written in pure Python (no C or Cython extensions), which makes it easy to develop and extend. Instead, we push compiled code to :ref:`optional dependencies`. +The pages in this section are intended for: + +* Contributors to xarray who wish to better understand some of the internals, +* Developers who wish to extend xarray with domain-specific logic, perhaps to support a new scientific community of users, +* Developers who wish to interface xarray with their existing tooling, e.g. by creating a plugin for reading a new file format, or wrapping a custom array type. + .. toctree:: :maxdepth: 2 @@ -15,6 +21,8 @@ compiled code to :ref:`optional dependencies`. variable-objects duck-arrays-integration + chunked-arrays extending-xarray zarr-encoding-spec how-to-add-new-backend + how-to-create-custom-index diff --git a/doc/user-guide/data-structures.rst b/doc/user-guide/data-structures.rst index e0fd4bd0d25..64e7b3625ac 100644 --- a/doc/user-guide/data-structures.rst +++ b/doc/user-guide/data-structures.rst @@ -19,7 +19,8 @@ DataArray :py:class:`xarray.DataArray` is xarray's implementation of a labeled, multi-dimensional array. 
It has several key properties: -- ``values``: a :py:class:`numpy.ndarray` holding the array's values +- ``values``: a :py:class:`numpy.ndarray` or + :ref:`numpy-like array ` holding the array's values - ``dims``: dimension names for each axis (e.g., ``('x', 'y', 'z')``) - ``coords``: a dict-like container of arrays (*coordinates*) that label each point (e.g., 1-dimensional arrays of numbers, datetime objects or @@ -46,7 +47,8 @@ Creating a DataArray The :py:class:`~xarray.DataArray` constructor takes: - ``data``: a multi-dimensional array of values (e.g., a numpy ndarray, - :py:class:`~pandas.Series`, :py:class:`~pandas.DataFrame` or ``pandas.Panel``) + a :ref:`numpy-like array `, :py:class:`~pandas.Series`, + :py:class:`~pandas.DataFrame` or ``pandas.Panel``) - ``coords``: a list or dictionary of coordinates. If a list, it should be a list of tuples where the first element is the dimension name and the second element is the corresponding coordinate array_like object. diff --git a/doc/user-guide/duckarrays.rst b/doc/user-guide/duckarrays.rst index 78c7d1e572a..f0650ac61b5 100644 --- a/doc/user-guide/duckarrays.rst +++ b/doc/user-guide/duckarrays.rst @@ -1,30 +1,183 @@ .. currentmodule:: xarray +.. _userguide.duckarrays: + Working with numpy-like arrays ============================== +NumPy-like arrays (often known as :term:`duck array`\s) are drop-in replacements for the :py:class:`numpy.ndarray` +class but with different features, such as propagating physical units or a different layout in memory. +Xarray can often wrap these array types, allowing you to use labelled dimensions and indexes whilst benefiting from the +additional features of these array libraries. + +Some numpy-like array types that xarray already has some support for: + +* `Cupy `_ - GPU support (see `cupy-xarray `_), +* `Sparse `_ - for performant arrays with many zero elements, +* `Pint `_ - for tracking the physical units of your data (see `pint-xarray `_), +* `Dask `_ - parallel computing on larger-than-memory arrays (see :ref:`using dask with xarray `), +* `Cubed `_ - another parallel computing framework that emphasises reliability (see `cubed-xarray `_). + .. warning:: - This feature should be considered experimental. Please report any bug you may find on - xarray’s github repository. + This feature should be considered somewhat experimental. Please report any bugs you find on + `xarray’s issue tracker `_. + +.. note:: + + For information on wrapping dask arrays see :ref:`dask`. Whilst xarray wraps dask arrays in a similar way to that + described on this page, chunked array types like :py:class:`dask.array.Array` implement additional methods that require + slightly different user code (e.g. calling ``.chunk`` or ``.compute``). See the docs on :ref:`wrapping chunked arrays `. + +Why "duck"? +----------- + +Why is it also called a "duck" array? This comes from a common statement of object-oriented programming - +"If it walks like a duck, and quacks like a duck, treat it like a duck". In other words, a library like xarray that +is capable of using multiple different types of arrays does not have to explicitly check that each one it encounters is +permitted (e.g. ``if dask``, ``if numpy``, ``if sparse`` etc.). Instead xarray can take the more permissive approach of simply +treating the wrapped array as valid, attempting to call the relevant methods (e.g. ``.mean()``) and only raising an +error if a problem occurs (e.g. the method is not found on the wrapped class). 
This is much more flexible, and allows +objects and classes from different libraries to work together more easily. + +What is a numpy-like array? +--------------------------- + +A "numpy-like array" (also known as a "duck array") is a class that contains array-like data, and implements key +numpy-like functionality such as indexing, broadcasting, and computation methods. + +For example, the `sparse `_ library provides a sparse array type which is useful for representing nD array objects like sparse matrices +in a memory-efficient manner. We can create a sparse array object (of the :py:class:`sparse.COO` type) from a numpy array like this: + +.. ipython:: python + + from sparse import COO + + x = np.eye(4, dtype=np.uint8) # create diagonal identity matrix + s = COO.from_numpy(x) + s -NumPy-like arrays (:term:`duck array`) extend the :py:class:`numpy.ndarray` with -additional features, like propagating physical units or a different layout in memory. +This sparse object does not attempt to explicitly store every element in the array, only the non-zero elements. +This approach is much more efficient for large arrays with only a few non-zero elements (such as tri-diagonal matrices). +Sparse array objects can be converted back to a "dense" numpy array by calling :py:meth:`sparse.COO.todense`. -:py:class:`DataArray` and :py:class:`Dataset` objects can wrap these duck arrays, as -long as they satisfy certain conditions (see :ref:`internals.duck_arrays`). +Just like :py:class:`numpy.ndarray` objects, :py:class:`sparse.COO` arrays support indexing + +.. ipython:: python + + s[1, 1] # diagonal elements should be ones + s[2, 3] # off-diagonal elements should be zero + +broadcasting, + +.. ipython:: python + + x2 = np.zeros( + (4, 1), dtype=np.uint8 + ) # create second sparse array of different shape + s2 = COO.from_numpy(x2) + (s * s2) # multiplication requires broadcasting + +and various computation methods + +.. ipython:: python + + s.sum(axis=1) + +This numpy-like array also supports calling so-called `numpy ufuncs `_ +("universal functions") on it directly: + +.. ipython:: python + + np.sum(s, axis=1) + + +Notice that in each case the API for calling the operation on the sparse array is identical to that of calling it on the +equivalent numpy array - this is the sense in which the sparse array is "numpy-like". .. note:: - For ``dask`` support see :ref:`dask`. + For discussion on exactly which methods a class needs to implement to be considered "numpy-like", see :ref:`internals.duckarrays`. + +Wrapping numpy-like arrays in xarray +------------------------------------ + +:py:class:`DataArray`, :py:class:`Dataset`, and :py:class:`Variable` objects can wrap these numpy-like arrays. +Constructing xarray objects which wrap numpy-like arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Missing features ----------------- -Most of the API does support :term:`duck array` objects, but there are a few areas where -the code will still cast to ``numpy`` arrays: +The primary way to create an xarray object which wraps a numpy-like array is to pass that numpy-like array instance directly +to the constructor of the xarray class. The :ref:`page on xarray data structures ` shows how :py:class:`DataArray` and :py:class:`Dataset` +both accept data in various forms through their ``data`` argument, but in fact this data can also be any wrappable numpy-like array. 
-- dimension coordinates, and thus all indexing operations: +For example, we can wrap the sparse array we created earlier inside a new DataArray object: + +.. ipython:: python + + s_da = xr.DataArray(s, dims=["i", "j"]) + s_da + +We can see what's inside - the printable representation of our xarray object (the repr) automatically uses the printable +representation of the underlying wrapped array. + +Of course our sparse array object is still there underneath - it's stored under the ``.data`` attribute of the dataarray: + +.. ipython:: python + + s_da.data + +Array methods +~~~~~~~~~~~~~ + +We saw above that numpy-like arrays provide numpy methods. Xarray automatically uses these when you call the corresponding xarray method: + +.. ipython:: python + + s_da.sum(dim="j") + +Converting wrapped types +~~~~~~~~~~~~~~~~~~~~~~~~ + +If you want to change the type inside your xarray object you can use :py:meth:`DataArray.as_numpy`: + +.. ipython:: python + + s_da.as_numpy() + +This returns a new :py:class:`DataArray` object, but now wrapping a normal numpy array. + +If instead you want to convert to numpy and return that numpy array you can use either :py:meth:`DataArray.to_numpy` or +:py:meth:`DataArray.values`, where the former is strongly preferred. The difference is in the way they coerce to numpy - :py:meth:`~DataArray.values` +always uses :py:func:`numpy.asarray` which will fail for some array types (e.g. ``cupy``), whereas :py:meth:`~DataArray.to_numpy` +uses the correct method depending on the array type. + +.. ipython:: python + + s_da.to_numpy() + +.. ipython:: python + :okexcept: + + s_da.values + +This illustrates the difference between :py:meth:`~DataArray.data` and :py:meth:`~DataArray.values`, +which is sometimes a point of confusion for new xarray users. +Explicitly: :py:meth:`DataArray.data` returns the underlying numpy-like array, regardless of type, whereas +:py:meth:`DataArray.values` converts the underlying array to a numpy array before returning it. +(This is another reason to use :py:meth:`~DataArray.to_numpy` over :py:meth:`~DataArray.values` - the intention is clearer.) + +Conversion to numpy as a fallback +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a wrapped array does not implement the corresponding array method then xarray will often attempt to convert the +underlying array to a numpy array so that the operation can be performed. You may want to watch out for this behavior, +and report any instances in which it causes problems. 
+ +Most of xarray's API does support using :term:`duck array` objects, but there are a few areas where +the code will still convert to ``numpy`` arrays: + +- Dimension coordinates, and thus all indexing operations: * :py:meth:`Dataset.sel` and :py:meth:`DataArray.sel` * :py:meth:`Dataset.loc` and :py:meth:`DataArray.loc` @@ -33,7 +186,7 @@ the code will still cast to ``numpy`` arrays: :py:meth:`DataArray.reindex` and :py:meth:`DataArray.reindex_like`: duck arrays in data variables and non-dimension coordinates won't be casted -- functions and methods that depend on external libraries or features of ``numpy`` not +- Functions and methods that depend on external libraries or features of ``numpy`` not covered by ``__array_function__`` / ``__array_ufunc__``: * :py:meth:`Dataset.ffill` and :py:meth:`DataArray.ffill` (uses ``bottleneck``) @@ -49,17 +202,25 @@ the code will still cast to ``numpy`` arrays: :py:class:`numpy.vectorize`) * :py:func:`apply_ufunc` with ``vectorize=True`` (uses :py:class:`numpy.vectorize`) -- incompatibilities between different :term:`duck array` libraries: +- Incompatibilities between different :term:`duck array` libraries: * :py:meth:`Dataset.chunk` and :py:meth:`DataArray.chunk`: this fails if the data was not already chunked and the :term:`duck array` (e.g. a ``pint`` quantity) should - wrap the new ``dask`` array; changing the chunk sizes works. - + wrap the new ``dask`` array; changing the chunk sizes works however. Extensions using duck arrays ---------------------------- - -Here's a list of libraries extending ``xarray`` to make working with wrapped duck arrays -easier: + +Whilst the features above allow many numpy-like array libraries to be used pretty seamlessly with xarray, it often also +makes sense to use an interfacing package to make certain tasks easier. + +For example, the `pint-xarray package `_ offers a custom ``.pint`` accessor (see :ref:`internals.accessors`) which provides +convenient access to information stored within the wrapped array (e.g. ``.units`` and ``.magnitude``), and makes +creating wrapped pint arrays (and especially xarray-wrapping-pint-wrapping-dask arrays) simpler for the user. + +We maintain a list of libraries extending ``xarray`` to make working with particular wrapped duck arrays +easier. If you know of more that aren't on this list please raise an issue to add them! - `pint-xarray `_ - `cupy-xarray `_ +- `cubed-xarray `_ diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index dc495b9f285..c0e88634705 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -559,6 +559,67 @@ and currently raises a warning unless ``invalid_netcdf=True`` is set: Note that this produces a file that is likely to be not readable by other netCDF libraries! + +.. _io.hdf5: + +HDF5 +---- +`HDF5`_ is both a file format and a data model for storing information. HDF5 stores +data hierarchically, using groups to create a nested structure. HDF5 is a more +general version of the netCDF4 data model, so the nested structure is one of many +similarities between the two data formats. + +Reading HDF5 files in xarray requires the ``h5netcdf`` engine, which can be installed +with ``conda install h5netcdf``. Once installed we can use xarray to open HDF5 files: + +.. code:: python + + xr.open_dataset("/path/to/my/file.h5") + +The similarities between HDF5 and netCDF4 mean that HDF5 data can be written with the +same :py:meth:`Dataset.to_netcdf` method as used for netCDF4 data: + +..
ipython:: python + + ds = xr.Dataset( + {"foo": (("x", "y"), np.random.rand(4, 5))}, + coords={ + "x": [10, 20, 30, 40], + "y": pd.date_range("2000-01-01", periods=5), + "z": ("x", list("abcd")), + }, + ) + + ds.to_netcdf("saved_on_disk.h5") + +Groups +~~~~~~ + +If you have multiple or highly nested groups, xarray by default may not read the group +that you want. A particular group of an HDF5 file can be specified using the ``group`` +argument: + +.. code:: python + + xr.open_dataset("/path/to/my/file.h5", group="/my/group") + +While xarray cannot interrogate an HDF5 file to determine which groups are available, +the HDF5 Python reader `h5py`_ can be used instead. + +Natively the xarray data structures can only handle one level of nesting, organized as +DataArrays inside of Datasets. If your HDF5 file has additional levels of hierarchy you +can only access one group at a time and will need to specify group names. + +.. note:: + + For native handling of multiple HDF5 groups with xarray, including I/O, you might be + interested in the experimental + `xarray-datatree `_ package. + + +.. _HDF5: https://hdfgroup.github.io/hdf5/index.html +.. _h5py: https://www.h5py.org/ + + .. _io.zarr: Zarr diff --git a/doc/user-guide/terminology.rst b/doc/user-guide/terminology.rst index 24e6ab69927..d99312643aa 100644 --- a/doc/user-guide/terminology.rst +++ b/doc/user-guide/terminology.rst @@ -54,23 +54,22 @@ complete examples, please consult the relevant documentation.* Coordinate An array that labels a dimension or set of dimensions of another ``DataArray``. In the usual one-dimensional case, the coordinate array's - values can loosely be thought of as tick labels along a dimension. There - are two types of coordinate arrays: *dimension coordinates* and - *non-dimension coordinates* (see below). A coordinate named ``x`` can be - retrieved from ``arr.coords[x]``. A ``DataArray`` can have more - coordinates than dimensions because a single dimension can be labeled by - multiple coordinate arrays. However, only one coordinate array can be a - assigned as a particular dimension's dimension coordinate array. As a + values can loosely be thought of as tick labels along a dimension. We + distinguish :term:`Dimension coordinate` vs. :term:`Non-dimension + coordinate` and :term:`Indexed coordinate` vs. :term:`Non-indexed + coordinate`. A coordinate named ``x`` can be retrieved from + ``arr.coords[x]``. A ``DataArray`` can have more coordinates than + dimensions because a single dimension can be labeled by multiple + coordinate arrays. However, only one coordinate array can be assigned + as a particular dimension's dimension coordinate array. As a consequence, ``len(arr.dims) <= len(arr.coords)`` in general. Dimension coordinate A one-dimensional coordinate array assigned to ``arr`` with both a name - and dimension name in ``arr.dims``. Dimension coordinates are used for - label-based indexing and alignment, like the index found on a - :py:class:`pandas.DataFrame` or :py:class:`pandas.Series`. In fact, - dimension coordinates use :py:class:`pandas.Index` objects under the - hood for efficient computation. Dimension coordinates are marked by - ``*`` when printing a ``DataArray`` or ``Dataset``. + and dimension name in ``arr.dims``. Usually (but not always), a + dimension coordinate is also an :term:`Indexed coordinate` so that it can + be used for label-based indexing and alignment, like the index found on + a :py:class:`pandas.DataFrame` or :py:class:`pandas.Series`.
Non-dimension coordinate A coordinate array assigned to ``arr`` with a name in ``arr.coords`` but @@ -79,20 +78,40 @@ complete examples, please consult the relevant documentation.* example, multidimensional coordinates are often used in geoscience datasets when :doc:`the data's physical coordinates (such as latitude and longitude) differ from their logical coordinates - <../examples/multidimensional-coords>`. However, non-dimension coordinates - are not indexed, and any operation on non-dimension coordinates that - leverages indexing will fail. Printing ``arr.coords`` will print all of - ``arr``'s coordinate names, with the corresponding dimension(s) in - parentheses. For example, ``coord_name (dim_name) 1 2 3 ...``. + <../examples/multidimensional-coords>`. Printing ``arr.coords`` will + print all of ``arr``'s coordinate names, with the corresponding + dimension(s) in parentheses. For example, ``coord_name (dim_name) 1 2 3 + ...``. + + Indexed coordinate + A coordinate which has an associated :term:`Index`. Generally this means + that the coordinate labels can be used for indexing (selection) and/or + alignment. An indexed coordinate may have one or more arbitrary + dimensions although in most cases it is also a :term:`Dimension + coordinate`. It may or may not be grouped with other indexed coordinates + depending on whether they share the same index. Indexed coordinates are + marked by ``*`` when printing a ``DataArray`` or ``Dataset``. + + Non-indexed coordinate + A coordinate which has no associated :term:`Index`. It may still + represent fixed labels along one or more dimensions but it cannot be + used for label-based indexing and alignment. Index - An *index* is a data structure optimized for efficient selecting and - slicing of an associated array. Xarray creates indexes for dimension - coordinates so that operations along dimensions are fast, while - non-dimension coordinates are not indexed. Under the hood, indexes are - implemented as :py:class:`pandas.Index` objects. The index associated - with dimension name ``x`` can be retrieved by ``arr.indexes[x]``. By - construction, ``len(arr.dims) == len(arr.indexes)`` + An *index* is a data structure optimized for efficient data selection + and alignment within a discrete or continuous space that is defined by + coordinate labels (unless it is a functional index). By default, Xarray + creates a :py:class:`~xarray.indexes.PandasIndex` object (i.e., a + :py:class:`pandas.Index` wrapper) for each :term:`Dimension coordinate`. + For more advanced use cases (e.g., staggered or irregular grids, + geospatial indexes), Xarray also accepts any instance of a specialized + :py:class:`~xarray.indexes.Index` subclass that is associated to one or + more arbitrary coordinates. The index associated with the coordinate + ``x`` can be retrieved by ``arr.xindexes[x]`` (or ``arr.indexes["x"]`` + if the index is convertible to a :py:class:`pandas.Index` object). If + two coordinates ``x`` and ``y`` share the same index, + ``arr.xindexes[x]`` and ``arr.xindexes[y]`` both return the same + :py:class:`~xarray.indexes.Index` object. name The names of dimensions, coordinates, DataArray objects and data @@ -112,3 +131,128 @@ complete examples, please consult the relevant documentation.* ``__array_ufunc__`` and ``__array_function__`` protocols are also required. __ https://numpy.org/neps/nep-0022-ndarray-duck-typing-overview.html + + .. 
ipython:: python + :suppress: + + import numpy as np + import xarray as xr + + Aligning + Aligning refers to the process of ensuring that two or more DataArrays or Datasets + have the same dimensions and coordinates, so that they can be combined or compared properly. + + .. ipython:: python + + x = xr.DataArray( + [[25, 35], [10, 24]], + dims=("lat", "lon"), + coords={"lat": [35.0, 40.0], "lon": [100.0, 120.0]}, + ) + y = xr.DataArray( + [[20, 5], [7, 13]], + dims=("lat", "lon"), + coords={"lat": [35.0, 42.0], "lon": [100.0, 120.0]}, + ) + x + y + + Broadcasting + A technique that allows operations to be performed on arrays with different shapes and dimensions. + When performing operations on arrays with different shapes and dimensions, xarray will automatically attempt to broadcast the + arrays to a common shape before the operation is applied. + + .. ipython:: python + + # 'a' has shape (3,) and 'b' has shape (4,) + a = xr.DataArray(np.array([1, 2, 3]), dims=["x"]) + b = xr.DataArray(np.array([4, 5, 6, 7]), dims=["y"]) + + # 2D array with shape (3, 4) + a + b + + Merging + Merging is used to combine two or more Datasets or DataArrays that have different variables or coordinates along + the same dimensions. When merging, xarray aligns the variables and coordinates of the different datasets along + the specified dimensions and creates a new ``Dataset`` containing all the variables and coordinates. + + .. ipython:: python + + # create two 1D arrays with names + arr1 = xr.DataArray( + [1, 2, 3], dims=["x"], coords={"x": [10, 20, 30]}, name="arr1" + ) + arr2 = xr.DataArray( + [4, 5, 6], dims=["x"], coords={"x": [20, 30, 40]}, name="arr2" + ) + + # merge the two arrays into a new dataset + merged_ds = xr.Dataset({"arr1": arr1, "arr2": arr2}) + merged_ds + + Concatenating + Concatenating is used to combine two or more Datasets or DataArrays along a dimension. When concatenating, + xarray arranges the datasets or DataArrays along a new dimension, and the resulting ``Dataset`` or ``DataArray`` + will have the same variables and coordinates along the other dimensions. + + .. ipython:: python + + a = xr.DataArray([[1, 2], [3, 4]], dims=("x", "y")) + b = xr.DataArray([[5, 6], [7, 8]], dims=("x", "y")) + c = xr.concat([a, b], dim="c") + c + + Combining + Combining is the process of arranging two or more DataArrays or Datasets into a single ``DataArray`` or + ``Dataset`` using some combination of merging and concatenation operations. + + .. ipython:: python + + ds1 = xr.Dataset( + {"data": xr.DataArray([[1, 2], [3, 4]], dims=("x", "y"))}, + coords={"x": [1, 2], "y": [3, 4]}, + ) + ds2 = xr.Dataset( + {"data": xr.DataArray([[5, 6], [7, 8]], dims=("x", "y"))}, + coords={"x": [2, 3], "y": [4, 5]}, + ) + + # combine the datasets + combined_ds = xr.combine_by_coords([ds1, ds2]) + combined_ds + + lazy + Lazily-evaluated operations do not load data into memory until necessary. Instead of doing calculations + right away, xarray lets you plan what calculations you want to do, like finding the + average temperature in a dataset. This planning is called "lazy evaluation." Later, when + you're ready to see the final result, you tell xarray, "Okay, go ahead and do those calculations now!" + That's when xarray starts working through the steps you planned and gives you the answer you wanted. This + lazy approach helps save time and memory because xarray only does the work when you actually need the + results. + + labeled + Labeled data has metadata describing the context of the data, not just the raw data values. 
+ This contextual information can be labels for array axes (i.e. dimension names), tick labels along axes (stored as Coordinate variables), or unique names for each array. These labels + provide context and meaning to the data, making it easier to understand and work with. For example, if you have + temperature data for different cities over time, you can use xarray to label the dimensions: one for + cities and another for time. + + serialization + Serialization is the process of converting your data into a format that makes it easy to save and share. + When you serialize data in xarray, you're taking all those temperature measurements, along with their + labels and other information, and turning them into a format that can be stored in a file or sent over + the internet. xarray objects can be serialized into formats which store the labels alongside the data. + Some supported serialization formats are files that can then be stored or transferred (e.g. netCDF), + whilst others are protocols that allow for data access over a network (e.g. Zarr). + + indexing + :ref:`Indexing` is how you select the subsets of your data that you are interested in. + + - Label-based Indexing: Selecting data by passing a specific label and comparing it to the labels + stored in the associated coordinates. You can use labels to specify what you want, like "Give me the + temperature for New York on July 15th." + + - Positional Indexing: You can use numbers to refer to positions in the data, like "Give me the third temperature value." This is useful when you know the order of your data but don't need to remember the exact labels. + + - Slicing: You can take a "slice" of your data, like you might want all temperatures from July 1st + to July 10th. xarray supports slicing for both positional and label-based indexing. diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b94a8a2faa5..8e119361ba1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,11 +14,228 @@ What's New np.random.seed(123456) +.. _whats-new.2023.08.0: + +v2023.08.0 (Aug 18, 2023) +------------------------- + +This release brings changes to minimum dependencies, allows reading of datasets where a dimension name is +associated with a multidimensional variable (e.g. finite volume ocean model output), and introduces +a new :py:class:`xarray.Coordinates` object. + +Thanks to the 16 contributors to this release: Anderson Banihirwe, Articoking, Benoit Bovy, Deepak Cherian, Harshitha, Ian Carroll, +Joe Hamman, Justus Magin, Peter Hill, Rachel Wegener, Riley Kuttruff, Thomas Nicholas, Tom Nicholas, ilgast, quantsnus, vallirep + +Announcements +~~~~~~~~~~~~~ + +The :py:class:`xarray.Variable` class is being refactored out to a new project titled 'namedarray'. +See the `design doc `_ for more +details. Reach out to us on this `discussion topic <https://github.com/pydata/xarray/discussions/8080>`_ if you have any thoughts. + +New Features +~~~~~~~~~~~~ + +- :py:class:`Coordinates` can now be constructed independently of any Dataset or + DataArray (it is also returned by the :py:attr:`Dataset.coords` and + :py:attr:`DataArray.coords` properties). ``Coordinates`` objects are useful for + passing both coordinate variables and indexes to new Dataset / DataArray objects, + e.g., via their constructor or via :py:meth:`Dataset.assign_coords`. We may also + wrap coordinate variables in a ``Coordinates`` object in order to skip + the automatic creation of (pandas) indexes for dimension coordinates. 
+ The :py:class:`Coordinates.from_pandas_multiindex` constructor may be used to + create coordinates directly from a :py:class:`pandas.MultiIndex` object (it is + preferred over passing it directly as coordinate data, which may be deprecated soon). + Like Dataset and DataArray objects, ``Coordinates`` objects may now be used in + :py:func:`align` and :py:func:`merge`. + (:issue:`6392`, :pull:`7368`). + By `Benoît Bovy `_. +- Visually group together coordinates with the same indexes in the index section of the text repr (:pull:`7225`). + By `Justus Magin `_. +- Allow creating Xarray objects where a multidimensional variable shares its name + with a dimension. Examples include output from finite volume models like FVCOM. + (:issue:`2233`, :pull:`7989`) + By `Deepak Cherian `_ and `Benoit Bovy `_. +- When outputting :py:class:`Dataset` objects as Zarr via :py:meth:`Dataset.to_zarr`, + users can now specify that chunks that will contain no valid data will not be written. + Originally, this could be done by specifying ``"write_empty_chunks": True`` in the + ``encoding`` parameter; however, this setting would not carry over when appending new + data to an existing dataset. (:issue:`8009`) Requires ``zarr>=2.11``. + + +Breaking changes +~~~~~~~~~~~~~~~~ + +- The minimum versions of some dependencies were changed (:pull:`8022`): + + ===================== ========= ======== + Package Old New + ===================== ========= ======== + boto3 1.20 1.24 + cftime 1.5 1.6 + dask-core 2022.1 2022.7 + distributed 2022.1 2022.7 + h5netcdf 0.13 1.0 + iris 3.1 3.2 + lxml 4.7 4.9 + netcdf4 1.5.7 1.6.0 + numpy 1.21 1.22 + pint 0.18 0.19 + pydap 3.2 3.3 + rasterio 1.2 1.3 + scipy 1.7 1.8 + toolz 0.11 0.12 + typing_extensions 4.0 4.3 + zarr 2.10 2.12 + numbagg 0.1 0.2.1 + ===================== ========= ======== + + +Documentation +~~~~~~~~~~~~~ + +- Added examples to docstrings of :py:meth:`Dataset.assign_attrs`, :py:meth:`Dataset.broadcast_equals`, + :py:meth:`Dataset.equals`, :py:meth:`Dataset.identical`, :py:meth:`Dataset.expand_dims`, :py:meth:`Dataset.drop_vars` + (:issue:`6793`, :pull:`7937`) By `Harshitha `_. +- Add docstrings for the :py:class:`Index` base class and add some documentation on how to + create custom, Xarray-compatible indexes (:pull:`6975`) + By `Benoît Bovy `_. +- Added a page clarifying the role of Xarray core team members. + (:pull:`7999`) By `Tom Nicholas `_. +- Fixed broken links in "See also" section of :py:meth:`Dataset.count` (:issue:`8055`, :pull:`8057`) + By `Articoking `_. +- Extended the glossary by adding terms Aligning, Broadcasting, Merging, Concatenating, Combining, lazy, + labeled, serialization, indexing (:issue:`3355`, :pull:`7732`) + By `Harshitha `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + +- :py:func:`as_variable` now consistently includes the variable name in any exceptions + raised. (:pull:`7995`). By `Peter Hill `_ +- :py:func:`encode_dataset_coordinates` now sorts coordinates automatically assigned to + `coordinates` attributes during serialization (:issue:`8026`, :pull:`8034`). + By `Ian Carroll `_. + +.. _whats-new.2023.07.0: + +v2023.07.0 (July 17, 2023) +-------------------------- + +This release brings improvements to the documentation on wrapping numpy-like arrays, improved docstrings, and bug fixes. + +Deprecations +~~~~~~~~~~~~ + +- `hue_style` is being deprecated for scatter plots. (:issue:`7907`, :pull:`7925`). + By `Jimmy Westling `_. + +Bug fixes +~~~~~~~~~ + +- Ensure no forward slashes in variable and dimension names for HDF5-based engines. 
+ (:issue:`7943`, :pull:`7953`) By `Kai Mühlbauer `_. + +Documentation +~~~~~~~~~~~~~ + +- Added examples to docstrings of :py:meth:`Dataset.assign_attrs`, :py:meth:`Dataset.broadcast_equals`, + :py:meth:`Dataset.equals`, :py:meth:`Dataset.identical`, :py:meth:`Dataset.expand_dims`,:py:meth:`Dataset.drop_vars` + (:issue:`6793`, :pull:`7937`) By `Harshitha `_. +- Added page on wrapping chunked numpy-like arrays as alternatives to dask arrays. + (:pull:`7951`) By `Tom Nicholas `_. +- Expanded the page on wrapping numpy-like "duck" arrays. + (:pull:`7911`) By `Tom Nicholas `_. +- Added examples to docstrings of :py:meth:`Dataset.isel`, :py:meth:`Dataset.reduce`, :py:meth:`Dataset.argmin`, + :py:meth:`Dataset.argmax` (:issue:`6793`, :pull:`7881`) + By `Harshitha `_ . + +Internal Changes +~~~~~~~~~~~~~~~~ + +- Allow chunked non-dask arrays (i.e. Cubed arrays) in groupby operations. (:pull:`7941`) + By `Tom Nicholas `_. + + +.. _whats-new.2023.06.0: + +v2023.06.0 (June 21, 2023) +-------------------------- + +This release adds features to ``curvefit``, improves the performance of concatenation, and fixes various bugs. + +Thank to our 13 contributors to this release: +Anderson Banihirwe, Deepak Cherian, dependabot[bot], Illviljan, Juniper Tyree, Justus Magin, Martin Fleischmann, +Mattia Almansi, mgunyho, Rutger van Haasteren, Thomas Nicholas, Tom Nicholas, Tom White. + + +New Features +~~~~~~~~~~~~ + +- Added support for multidimensional initial guess and bounds in :py:meth:`DataArray.curvefit` (:issue:`7768`, :pull:`7821`). + By `András Gunyhó `_. +- Add an ``errors`` option to :py:meth:`Dataset.curve_fit` that allows + returning NaN for the parameters and covariances of failed fits, rather than + failing the whole series of fits (:issue:`6317`, :pull:`7891`). + By `Dominik Stańczak `_ and `András Gunyhó `_. + +Breaking changes +~~~~~~~~~~~~~~~~ + + +Deprecations +~~~~~~~~~~~~ +- Deprecate the `cdms2 `_ conversion methods (:pull:`7876`) + By `Justus Magin `_. + +Performance +~~~~~~~~~~~ +- Improve concatenation performance (:issue:`7833`, :pull:`7824`). + By `Jimmy Westling `_. + +Bug fixes +~~~~~~~~~ +- Fix bug where weighted ``polyfit`` were changing the original object (:issue:`5644`, :pull:`7900`). + By `Mattia Almansi `_. +- Don't call ``CachingFileManager.__del__`` on interpreter shutdown (:issue:`7814`, :pull:`7880`). + By `Justus Magin `_. +- Preserve vlen dtype for empty string arrays (:issue:`7328`, :pull:`7862`). + By `Tom White `_ and `Kai Mühlbauer `_. +- Ensure dtype of reindex result matches dtype of the original DataArray (:issue:`7299`, :pull:`7917`) + By `Anderson Banihirwe `_. +- Fix bug where a zero-length zarr ``chunk_store`` was ignored as if it was ``None`` (:pull:`7923`) + By `Juniper Tyree `_. + +Documentation +~~~~~~~~~~~~~ + +Internal Changes +~~~~~~~~~~~~~~~~ + +- Minor improvements to support of the python `array api standard `_, + internally using the function ``xp.astype()`` instead of the method ``arr.astype()``, as the latter is not in the standard. + (:pull:`7847`) By `Tom Nicholas `_. +- Xarray now uploads nightly wheels to https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/ (:issue:`7863`, :pull:`7865`). + By `Martin Fleischmann `_. +- Stop uploading development wheels to TestPyPI (:pull:`7889`) + By `Justus Magin `_. +- Added an exception catch for ``AttributeError`` along with ``ImportError`` when duck typing the dynamic imports in pycompat.py. This catches some name collisions between packages. (:issue:`7870`, :pull:`7874`) .. 
_whats-new.2023.05.0: -v2023.05.0 (unreleased) ------------------------ +v2023.05.0 (May 18, 2023) +------------------------- + +This release adds some new methods and operators, updates our deprecation policy for python versions, fixes some bugs with groupby, +and introduces experimental support for alternative chunked parallel array computation backends via a new plugin system! + +**Note:** If you are using a locally-installed development version of xarray then pulling the changes from this release may require you to re-install. +This avoids an error where xarray cannot detect dask via the new entrypoints system introduced in :pull:`7019`. See :issue:`7856` for details. + +Thanks to our 14 contributors: +Alan Brammer, crusaderky, David Stansby, dcherian, Deeksha, Deepak Cherian, Illviljan, James McCreight, +Joe Hamman, Justus Magin, Kyle Sunden, Max Hollmann, mgunyho, and Tom Nicholas + New Features ~~~~~~~~~~~~ @@ -27,28 +244,37 @@ New Features - Add support for lshift and rshift binary operators (``<<``, ``>>``) on :py:class:`xr.DataArray` of type :py:class:`int` (:issue:`7727` , :pull:`7741`). By `Alan Brammer `_. - +- Keyword argument `data='array'` to both :py:meth:`xarray.Dataset.to_dict` and + :py:meth:`xarray.DataArray.to_dict` will now return data as the underlying array type. + Python lists are returned for `data='list'` or `data=True`. Supplying `data=False` only returns the schema without data. + ``encoding=True`` returns the encoding dictionary for the underlying variable also. (:issue:`1599`, :pull:`7739`) . + By `James McCreight `_. Breaking changes ~~~~~~~~~~~~~~~~ - adjust the deprecation policy for python to once again align with NEP-29 (:issue:`7765`, :pull:`7793`) By `Justus Magin `_. -Deprecations -~~~~~~~~~~~~ - +Performance +~~~~~~~~~~~ +- Optimize ``.dt `` accessor performance with ``CFTimeIndex``. (:pull:`7796`) + By `Deepak Cherian `_. Bug fixes ~~~~~~~~~ +- Fix `as_compatible_data` for masked float arrays, now always creates a copy when mask is present (:issue:`2377`, :pull:`7788`). + By `Max Hollmann `_. - Fix groupby binary ops when grouped array is subset relative to other. (:issue:`7797`). By `Deepak Cherian `_. - -Documentation -~~~~~~~~~~~~~ - +- Fix groupby sum, prod for all-NaN groups with ``flox``. (:issue:`7808`). + By `Deepak Cherian `_. Internal Changes ~~~~~~~~~~~~~~~~ +- Experimental support for wrapping chunked array libraries other than dask. + A new ABC is defined - :py:class:`xr.core.parallelcompat.ChunkManagerEntrypoint` - which can be subclassed and then + registered by alternative chunked array implementations. (:issue:`6807`, :pull:`7019`) + By `Tom Nicholas `_. .. _whats-new.2023.04.2: @@ -109,10 +335,6 @@ New Features - Added ability to save ``DataArray`` objects directly to Zarr using :py:meth:`~xarray.DataArray.to_zarr`. (:issue:`7692`, :pull:`7693`) . By `Joe Hamman `_. -- Keyword argument `data='array'` to both :py:meth:`xarray.Dataset.to_dict` and - :py:meth:`xarray.DataArray.to_dict` will now return data as the underlying array type. Python lists are returned for `data='list'` or `data=True`. Supplying `data=False` only returns the schema without data. ``encoding=True`` returns the encoding dictionary for the underlying variable also. - (:issue:`1599`, :pull:`7739`) . - By `James McCreight `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -645,6 +867,7 @@ Bug fixes Documentation ~~~~~~~~~~~~~ + - Update merge docstrings. (:issue:`6935`, :pull:`7033`) By `Zach Moon `_. 
- Raise a more informative error when trying to open a non-existent zarr store. (:issue:`6484`, :pull:`7060`) diff --git a/pyproject.toml b/pyproject.toml index 88b34d002d5..4d63fd564ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ module = [ "cf_units.*", "cfgrib.*", "cftime.*", + "cubed.*", "cupy.*", "fsspec.*", "h5netcdf.*", diff --git a/setup.cfg b/setup.cfg index 81b7f1c4a0e..85ac8e259e5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -132,6 +132,10 @@ xarray = static/css/* static/html/* +[options.entry_points] +xarray.chunkmanagers = + dask = xarray.core.daskmanager:DaskManager + [tool:pytest] python_files = test_*.py testpaths = xarray/tests properties diff --git a/xarray/__init__.py b/xarray/__init__.py index 75a58053663..830bc254a71 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -26,16 +26,19 @@ where, ) from xarray.core.concat import concat +from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.extensions import ( register_dataarray_accessor, register_dataset_accessor, ) +from xarray.core.indexes import Index +from xarray.core.indexing import IndexSelResult from xarray.core.merge import Context, MergeError, merge from xarray.core.options import get_options, set_options from xarray.core.parallel import map_blocks -from xarray.core.variable import Coordinate, IndexVariable, Variable, as_variable +from xarray.core.variable import IndexVariable, Variable, as_variable from xarray.util.print_versions import show_versions try: @@ -98,8 +101,11 @@ "CFTimeIndex", "Context", "Coordinate", + "Coordinates", "DataArray", "Dataset", + "Index", + "IndexSelResult", "IndexVariable", "Variable", # Exceptions diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e5adedbb576..e35d85a1e2f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -3,16 +3,29 @@ import os from collections.abc import Hashable, Iterable, Mapping, MutableMapping, Sequence from functools import partial -from glob import glob from io import BytesIO from numbers import Number -from typing import TYPE_CHECKING, Any, Callable, Final, Literal, Union, cast, overload +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Final, + Literal, + Union, + cast, + overload, +) import numpy as np from xarray import backends, conventions from xarray.backends import plugins -from xarray.backends.common import AbstractDataStore, ArrayWriter, _normalize_path +from xarray.backends.common import ( + AbstractDataStore, + ArrayWriter, + _find_absolute_paths, + _normalize_path, +) from xarray.backends.locks import _get_scheduler from xarray.core import indexing from xarray.core.combine import ( @@ -20,9 +33,11 @@ _nested_combine, combine_by_coords, ) +from xarray.core.daskmanager import DaskManager from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk from xarray.core.indexes import Index +from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.utils import is_remote_uri if TYPE_CHECKING: @@ -38,6 +53,7 @@ CompatOptions, JoinOptions, NestedSequence, + T_Chunks, ) T_NetcdfEngine = Literal["netcdf4", "scipy", "h5netcdf"] @@ -48,7 +64,6 @@ str, # no nice typing support for custom backends None, ] - T_Chunks = Union[int, dict[Any, Any], Literal["auto"], None] T_NetcdfTypes = Literal[ "NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC" ] @@ -297,17 +312,27 @@ def _chunk_ds( chunks, overwrite_encoded_chunks, 
inline_array, + chunked_array_type, + from_array_kwargs, **extra_tokens, ): - from dask.base import tokenize + chunkmanager = guess_chunkmanager(chunked_array_type) - mtime = _get_mtime(filename_or_obj) - token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) - name_prefix = f"open_dataset-{token}" + # TODO refactor to move this dask-specific logic inside the DaskManager class + if isinstance(chunkmanager, DaskManager): + from dask.base import tokenize + + mtime = _get_mtime(filename_or_obj) + token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) + name_prefix = "open_dataset-" + else: + # not used + token = (None,) + name_prefix = None variables = {} for name, var in backend_ds.variables.items(): - var_chunks = _get_chunk(var, chunks) + var_chunks = _get_chunk(var, chunks, chunkmanager) variables[name] = _maybe_chunk( name, var, @@ -316,6 +341,8 @@ def _chunk_ds( name_prefix=name_prefix, token=token, inline_array=inline_array, + chunked_array_type=chunkmanager, + from_array_kwargs=from_array_kwargs.copy(), ) return backend_ds._replace(variables) @@ -328,6 +355,8 @@ def _dataset_from_backend_dataset( cache, overwrite_encoded_chunks, inline_array, + chunked_array_type, + from_array_kwargs, **extra_tokens, ): if not isinstance(chunks, (int, dict)) and chunks not in {None, "auto"}: @@ -346,6 +375,8 @@ def _dataset_from_backend_dataset( chunks, overwrite_encoded_chunks, inline_array, + chunked_array_type, + from_array_kwargs, **extra_tokens, ) @@ -373,6 +404,8 @@ def open_dataset( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, **kwargs, ) -> Dataset: @@ -465,6 +498,15 @@ def open_dataset( itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. + chunked_array_type: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. backend_kwargs: dict Additional keyword arguments passed on to the engine open function, equivalent to `**kwargs`. 
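As a minimal sketch of how the experimental ``chunked_array_type`` and ``from_array_kwargs`` parameters documented above might be used (the file name here is hypothetical, and the empty ``from_array_kwargs`` simply means no extra keyword arguments are forwarded to :py:func:`dask.array.from_array`):

.. code:: python

    import xarray as xr

    # Request lazy, chunked loading and name the chunk manager explicitly;
    # extra keyword arguments for dask.array.from_array could be supplied via
    # from_array_kwargs if they were needed.
    ds = xr.open_dataset(
        "example.nc",  # hypothetical file
        chunks={},
        chunked_array_type="dask",
        from_array_kwargs={},
    )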
@@ -508,6 +550,9 @@ def open_dataset( if engine is None: engine = plugins.guess_engine(filename_or_obj) + if from_array_kwargs is None: + from_array_kwargs = {} + backend = plugins.get_backend(engine) decoders = _resolve_decoders_kwargs( @@ -536,6 +581,8 @@ def open_dataset( cache, overwrite_encoded_chunks, inline_array, + chunked_array_type, + from_array_kwargs, drop_variables=drop_variables, **decoders, **kwargs, @@ -546,8 +593,8 @@ def open_dataset( def open_dataarray( filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, - engine: T_Engine = None, - chunks: T_Chunks = None, + engine: T_Engine | None = None, + chunks: T_Chunks | None = None, cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | None = None, @@ -558,6 +605,8 @@ def open_dataarray( decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataArray: @@ -652,6 +701,15 @@ def open_dataarray( itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. + chunked_array_type: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. backend_kwargs: dict Additional keyword arguments passed on to the engine open function, equivalent to `**kwargs`. @@ -695,6 +753,8 @@ def open_dataarray( cache=cache, drop_variables=drop_variables, inline_array=inline_array, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, use_cftime=use_cftime, decode_timedelta=decode_timedelta, @@ -726,7 +786,7 @@ def open_dataarray( def open_mfdataset( paths: str | NestedSequence[str | os.PathLike], - chunks: T_Chunks = None, + chunks: T_Chunks | None = None, concat_dim: str | DataArray | Index @@ -736,7 +796,7 @@ def open_mfdataset( | None = None, compat: CompatOptions = "no_conflicts", preprocess: Callable[[Dataset], Dataset] | None = None, - engine: T_Engine = None, + engine: T_Engine | None = None, data_vars: Literal["all", "minimal", "different"] | list[str] = "all", coords="different", combine: Literal["by_coords", "nested"] = "by_coords", @@ -911,37 +971,7 @@ def open_mfdataset( .. [1] https://docs.xarray.dev/en/stable/dask.html .. 
[2] https://docs.xarray.dev/en/stable/dask.html#chunking-and-performance """ - if isinstance(paths, str): - if is_remote_uri(paths) and engine == "zarr": - try: - from fsspec.core import get_fs_token_paths - except ImportError as e: - raise ImportError( - "The use of remote URLs for opening zarr requires the package fsspec" - ) from e - - fs, _, _ = get_fs_token_paths( - paths, - mode="rb", - storage_options=kwargs.get("backend_kwargs", {}).get( - "storage_options", {} - ), - expand=False, - ) - tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories - paths = [fs.get_mapper(path) for path in tmp_paths] - elif is_remote_uri(paths): - raise ValueError( - "cannot do wild-card matching for paths that are remote URLs " - f"unless engine='zarr' is specified. Got paths: {paths}. " - "Instead, supply paths as an explicit list of strings." - ) - else: - paths = sorted(glob(_normalize_path(paths))) - elif isinstance(paths, os.PathLike): - paths = [os.fspath(paths)] - else: - paths = [os.fspath(p) if isinstance(p, os.PathLike) else p for p in paths] + paths = _find_absolute_paths(paths, engine=engine, **kwargs) if not paths: raise OSError("no files to open") @@ -1490,6 +1520,8 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore: ... @@ -1512,6 +1544,8 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... @@ -1531,6 +1565,8 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore | Delayed: """This function creates an appropriate datastore for writing a dataset to a zarr ztore @@ -1623,6 +1659,7 @@ def to_zarr( safe_chunks=safe_chunks, stacklevel=4, # for Dataset.to_zarr() zarr_version=zarr_version, + write_empty=write_empty_chunks, ) if mode in ["a", "r+"]: @@ -1652,7 +1689,9 @@ def to_zarr( writer = ArrayWriter() # TODO: figure out how to properly handle unlimited_dims dump_to_store(dataset, zstore, writer, encoding=encoding) - writes = writer.sync(compute=compute) + writes = writer.sync( + compute=compute, chunkmanager_store_kwargs=chunkmanager_store_kwargs + ) if compute: _finalize_store(writes, zstore) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index bca8b7f668a..1ac988c6b4f 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -5,19 +5,22 @@ import time import traceback from collections.abc import Iterable +from glob import glob from typing import TYPE_CHECKING, Any, ClassVar import numpy as np from xarray.conventions import cf_encoder from xarray.core import indexing -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type +from xarray.core.pycompat import is_chunked_array from xarray.core.utils import FrozenDict, NdimSizeLenMixin, is_remote_uri if TYPE_CHECKING: from io import BufferedIOBase from xarray.core.dataset import Dataset + from xarray.core.types import NestedSequence # Create a logger object, but don't add any handlers. Leave that to user code. 
logger = logging.getLogger(__name__) @@ -27,6 +30,24 @@ def _normalize_path(path): + """ + Normalize pathlikes to string. + + Parameters + ---------- + path : + Path to file. + + Examples + -------- + >>> from pathlib import Path + + >>> directory = Path(xr.backends.common.__file__).parent + >>> paths_path = Path(directory).joinpath("comm*n.py") + >>> paths_str = xr.backends.common._normalize_path(paths_path) + >>> print([type(p) for p in (paths_str,)]) + [] + """ if isinstance(path, os.PathLike): path = os.fspath(path) @@ -36,6 +57,64 @@ def _normalize_path(path): return path +def _find_absolute_paths( + paths: str | os.PathLike | NestedSequence[str | os.PathLike], **kwargs +) -> list[str]: + """ + Find absolute paths from the pattern. + + Parameters + ---------- + paths : + Path(s) to file(s). Can include wildcards like * . + **kwargs : + Extra kwargs. Mainly for fsspec. + + Examples + -------- + >>> from pathlib import Path + + >>> directory = Path(xr.backends.common.__file__).parent + >>> paths = str(Path(directory).joinpath("comm*n.py")) # Find common with wildcard + >>> paths = xr.backends.common._find_absolute_paths(paths) + >>> [Path(p).name for p in paths] + ['common.py'] + """ + if isinstance(paths, str): + if is_remote_uri(paths) and kwargs.get("engine", None) == "zarr": + try: + from fsspec.core import get_fs_token_paths + except ImportError as e: + raise ImportError( + "The use of remote URLs for opening zarr requires the package fsspec" + ) from e + + fs, _, _ = get_fs_token_paths( + paths, + mode="rb", + storage_options=kwargs.get("backend_kwargs", {}).get( + "storage_options", {} + ), + expand=False, + ) + tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories + paths = [fs.get_mapper(path) for path in tmp_paths] + elif is_remote_uri(paths): + raise ValueError( + "cannot do wild-card matching for paths that are remote URLs " + f"unless engine='zarr' is specified. Got paths: {paths}. " + "Instead, supply paths as an explicit list of strings." + ) + else: + paths = sorted(glob(_normalize_path(paths))) + elif isinstance(paths, os.PathLike): + paths = [os.fspath(paths)] + else: + paths = [os.fspath(p) if isinstance(p, os.PathLike) else p for p in paths] + + return paths + + def _encode_variable_name(name): if name is None: name = NONE_VAR_NAME @@ -153,7 +232,7 @@ def __init__(self, lock=None): self.lock = lock def add(self, source, target, region=None): - if is_duck_dask_array(source): + if is_chunked_array(source): self.sources.append(source) self.targets.append(target) self.regions.append(region) @@ -163,21 +242,25 @@ def add(self, source, target, region=None): else: target[...] 
= source - def sync(self, compute=True): + def sync(self, compute=True, chunkmanager_store_kwargs=None): if self.sources: - import dask.array as da + chunkmanager = get_chunked_array_type(*self.sources) # TODO: consider wrapping targets with dask.delayed, if this makes # for any discernible difference in perforance, e.g., # targets = [dask.delayed(t) for t in self.targets] - delayed_store = da.store( + if chunkmanager_store_kwargs is None: + chunkmanager_store_kwargs = {} + + delayed_store = chunkmanager.store( self.sources, self.targets, lock=self.lock, compute=compute, flush=True, regions=self.regions, + **chunkmanager_store_kwargs, ) self.sources = [] self.targets = [] diff --git a/xarray/backends/file_manager.py b/xarray/backends/file_manager.py index 91fd15fcaa4..df901f9a1d9 100644 --- a/xarray/backends/file_manager.py +++ b/xarray/backends/file_manager.py @@ -1,5 +1,6 @@ from __future__ import annotations +import atexit import contextlib import io import threading @@ -289,6 +290,13 @@ def __repr__(self) -> str: ) +@atexit.register +def _remove_del_method(): + # We don't need to close unclosed files at program exit, and may not be able + # to, because Python is cleaning up imports / globals. + del CachingFileManager.__del__ + + class _RefCounter: """Class for keeping track of reference counts.""" diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 7389f6a2862..59f6c362491 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -6,8 +6,6 @@ from collections.abc import Iterable from typing import TYPE_CHECKING, Any -from packaging.version import Version - from xarray.backends.common import ( BACKEND_ENTRYPOINTS, BackendEntrypoint, @@ -20,6 +18,7 @@ from xarray.backends.netCDF4_ import ( BaseNetCDF4Array, _encode_nc4_variable, + _ensure_no_forward_slash_in_name, _extract_nc4_variable_encoding, _get_datatype, _nc4_require_group, @@ -232,30 +231,17 @@ def get_attrs(self): return FrozenDict(_read_attributes(self.ds)) def get_dimensions(self): - import h5netcdf - - if Version(h5netcdf.__version__) >= Version("0.14.0.dev0"): - return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items()) - else: - return self.ds.dimensions + return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items()) def get_encoding(self): - import h5netcdf - - if Version(h5netcdf.__version__) >= Version("0.14.0.dev0"): - return { - "unlimited_dims": { - k for k, v in self.ds.dimensions.items() if v.isunlimited() - } - } - else: - return { - "unlimited_dims": { - k for k, v in self.ds.dimensions.items() if v is None - } + return { + "unlimited_dims": { + k for k, v in self.ds.dimensions.items() if v.isunlimited() } + } def set_dimension(self, name, length, is_unlimited=False): + _ensure_no_forward_slash_in_name(name) if is_unlimited: self.ds.dimensions[name] = None self.ds.resize_dimension(name, length) @@ -273,6 +259,7 @@ def prepare_variable( ): import h5py + _ensure_no_forward_slash_in_name(name) attrs = variable.attrs.copy() dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index d3866e90de6..b5c3413e7f8 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -65,10 +65,12 @@ def __init__(self, variable_name, datastore): dtype = array.dtype if dtype is str: - # use object dtype because that's the only way in numpy to - # represent variable length strings; it also prevents automatic - # string concatenation via 
conventions.decode_cf_variable - dtype = np.dtype("O") + # use object dtype (with additional vlen string metadata) because that's + # the only way in numpy to represent variable length strings and to + # check vlen string dtype in further steps + # it also prevents automatic string concatenation via + # conventions.decode_cf_variable + dtype = coding.strings.create_vlen_dtype(str) self.dtype = dtype def __setitem__(self, key, value): @@ -192,6 +194,15 @@ def _nc4_require_group(ds, group, mode, create_group=_netcdf4_create_group): return ds +def _ensure_no_forward_slash_in_name(name): + if "/" in name: + raise ValueError( + f"Forward slashes '/' are not allowed in variable and dimension names (got {name!r}). " + "Forward slashes are used as hierarchy-separators for " + "HDF5-based files ('netcdf4'/'h5netcdf')." + ) + + def _ensure_fill_value_valid(data, attributes): # work around for netCDF4/scipy issue where _FillValue has the wrong type: # https://github.com/Unidata/netcdf4-python/issues/271 @@ -445,6 +456,7 @@ def get_encoding(self): } def set_dimension(self, name, length, is_unlimited=False): + _ensure_no_forward_slash_in_name(name) dim_length = length if not is_unlimited else None self.ds.createDimension(name, size=dim_length) @@ -468,6 +480,8 @@ def encode_variable(self, variable): def prepare_variable( self, name, variable, check_encoding=False, unlimited_dims=None ): + _ensure_no_forward_slash_in_name(name) + datatype = _get_datatype( variable, self.format, raise_on_invalid_encoding=check_encoding ) diff --git a/xarray/backends/plugins.py b/xarray/backends/plugins.py index 232c2300192..a62ca6c9862 100644 --- a/xarray/backends/plugins.py +++ b/xarray/backends/plugins.py @@ -146,7 +146,7 @@ def refresh_engines() -> None: def guess_engine( store_spec: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, -): +) -> str | type[BackendEntrypoint]: engines = list_engines() for engine, backend in engines.items(): diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 116c48f5692..9b5bcc82e6f 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any import numpy as np -from packaging.version import Version from xarray.backends.common import ( BACKEND_ENTRYPOINTS, @@ -123,11 +122,10 @@ def open( "output_grid": output_grid or True, "timeout": timeout, } - if Version(pydap.lib.__version__) >= Version("3.3.0"): - if verify is not None: - kwargs.update({"verify": verify}) - if user_charset is not None: - kwargs.update({"user_charset": user_charset}) + if verify is not None: + kwargs.update({"verify": verify}) + if user_charset is not None: + kwargs.update({"user_charset": user_charset}) ds = pydap.client.open_url(**kwargs) return cls(ds) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7d21c771e06..f88523422bb 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -19,6 +19,7 @@ ) from xarray.backends.store import StoreBackendEntrypoint from xarray.core import indexing +from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.pycompat import integer_types from xarray.core.utils import ( FrozenDict, @@ -69,7 +70,14 @@ def __init__(self, variable_name, datastore): array = self.get_array() self.shape = array.shape - dtype = array.dtype + # preserve vlen string object dtype (GH 7328) + if array.filters is not None and any( + [filt.codec_id == "vlen-utf8" for filt in array.filters] + ): + dtype = coding.strings.create_vlen_dtype(str) + else: + dtype = 
array.dtype + self.dtype = dtype def get_array(self): @@ -360,6 +368,7 @@ class ZarrStore(AbstractWritableDataStore): "_synchronizer", "_write_region", "_safe_chunks", + "_write_empty", ) @classmethod @@ -378,6 +387,7 @@ def open_group( safe_chunks=True, stacklevel=2, zarr_version=None, + write_empty: bool | None = None, ): import zarr @@ -409,7 +419,7 @@ def open_group( if consolidated is None: consolidated = False - if chunk_store: + if chunk_store is not None: open_kwargs["chunk_store"] = chunk_store if consolidated is None: consolidated = False @@ -449,6 +459,7 @@ def open_group( append_dim, write_region, safe_chunks, + write_empty, ) def __init__( @@ -459,6 +470,7 @@ def __init__( append_dim=None, write_region=None, safe_chunks=True, + write_empty: bool | None = None, ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -469,6 +481,7 @@ def __init__( self._append_dim = append_dim self._write_region = write_region self._safe_chunks = safe_chunks + self._write_empty = write_empty @property def ds(self): @@ -640,6 +653,8 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No dimensions. """ + import zarr + for vn, v in variables.items(): name = _encode_variable_name(vn) check = vn in check_encoding_set @@ -657,7 +672,14 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No # TODO: if mode="a", consider overriding the existing variable # metadata. This would need some case work properly with region # and append_dim. - zarr_array = self.zarr_group[name] + if self._write_empty is not None: + zarr_array = zarr.open( + store=self.zarr_group.store, + path=f"{self.zarr_group.name}/{name}", + write_empty_chunks=self._write_empty, + ) + else: + zarr_array = self.zarr_group[name] else: # new variable encoding = extract_zarr_variable_encoding( @@ -671,8 +693,25 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if coding.strings.check_vlen_dtype(dtype) == str: dtype = str + + if self._write_empty is not None: + if ( + "write_empty_chunks" in encoding + and encoding["write_empty_chunks"] != self._write_empty + ): + raise ValueError( + 'Differing "write_empty_chunks" values in encoding and parameters' + f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }' + ) + else: + encoding["write_empty_chunks"] = self._write_empty + zarr_array = self.zarr_group.create( - name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding + name, + shape=shape, + dtype=dtype, + fill_value=fill_value, + **encoding, ) zarr_array = _put_attrs(zarr_array, encoded_attrs) @@ -716,6 +755,8 @@ def open_zarr( decode_timedelta=None, use_cftime=None, zarr_version=None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, **kwargs, ): """Load and decode a dataset from a Zarr store. @@ -800,6 +841,15 @@ def open_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. + chunked_array_type: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. + Experimental API that should not be relied upon. 
+ from_array_kwargs: dict, optional + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- @@ -817,12 +867,17 @@ def open_zarr( """ from xarray.backends.api import open_dataset + if from_array_kwargs is None: + from_array_kwargs = {} + if chunks == "auto": try: - import dask.array # noqa + guess_chunkmanager( + chunked_array_type + ) # attempt to import that parallel backend chunks = {} - except ImportError: + except ValueError: chunks = None if kwargs: @@ -851,6 +906,8 @@ def open_zarr( engine="zarr", chunks=chunks, drop_variables=drop_variables, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, decode_timedelta=decode_timedelta, use_cftime=use_cftime, diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index c6a7b9f8763..8f3472dce19 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -470,13 +470,9 @@ def get_loc(self, key): else: return super().get_loc(key) - def _maybe_cast_slice_bound(self, label, side, kind=None): + def _maybe_cast_slice_bound(self, label, side): """Adapted from pandas.tseries.index.DatetimeIndex._maybe_cast_slice_bound - - Note that we have never used the kind argument in CFTimeIndex and it is - deprecated as of pandas version 1.3.0. It exists only for compatibility - reasons. We can remove it when our minimum version of pandas is 1.3.0. """ if not isinstance(label, str): return label diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index 61b3ab7c46c..d0bfb1a7a63 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -14,7 +14,7 @@ unpack_for_encoding, ) from xarray.core import indexing -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array from xarray.core.variable import Variable @@ -29,7 +29,8 @@ def check_vlen_dtype(dtype): if dtype.kind != "O" or dtype.metadata is None: return None else: - return dtype.metadata.get("element_type") + # check xarray (element_type) as well as h5py (vlen) + return dtype.metadata.get("element_type", dtype.metadata.get("vlen")) def is_unicode_dtype(dtype): @@ -134,10 +135,10 @@ def bytes_to_char(arr): if arr.dtype.kind != "S": raise ValueError("argument must have a fixed-width bytes dtype") - if is_duck_dask_array(arr): - import dask.array as da + if is_chunked_array(arr): + chunkmanager = get_chunked_array_type(arr) - return da.map_blocks( + return chunkmanager.map_blocks( _numpy_bytes_to_char, arr, dtype="S1", @@ -169,8 +170,8 @@ def char_to_bytes(arr): # can't make an S0 dtype return np.zeros(arr.shape[:-1], dtype=np.string_) - if is_duck_dask_array(arr): - import dask.array as da + if is_chunked_array(arr): + chunkmanager = get_chunked_array_type(arr) if len(arr.chunks[-1]) > 1: raise ValueError( @@ -179,7 +180,7 @@ def char_to_bytes(arr): ) dtype = np.dtype("S" + str(arr.shape[-1])) - return da.map_blocks( + return chunkmanager.map_blocks( _numpy_char_to_bytes, arr, dtype=dtype, diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 5c6e51c2215..8ba7dcbb0e2 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -10,7 +10,8 @@ import 
pandas as pd from xarray.core import dtypes, duck_array_ops, indexing -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type +from xarray.core.pycompat import is_chunked_array from xarray.core.variable import Variable if TYPE_CHECKING: @@ -57,7 +58,7 @@ class _ElementwiseFunctionArray(indexing.ExplicitlyIndexedNDArrayMixin): """ def __init__(self, array, func: Callable, dtype: np.typing.DTypeLike): - assert not is_duck_dask_array(array) + assert not is_chunked_array(array) self.array = indexing.as_indexable(array) self.func = func self._dtype = dtype @@ -158,10 +159,10 @@ def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): ------- Either a dask.array.Array or _ElementwiseFunctionArray. """ - if is_duck_dask_array(array): - import dask.array as da + if is_chunked_array(array): + chunkmanager = get_chunked_array_type(array) - return da.map_blocks(func, array, dtype=dtype) + return chunkmanager.map_blocks(func, array, dtype=dtype) else: return _ElementwiseFunctionArray(array, func, dtype) @@ -330,7 +331,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: if "scale_factor" in encoding or "add_offset" in encoding: dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) - data = data.astype(dtype=dtype, copy=True) + data = duck_array_ops.astype(data, dtype=dtype, copy=True) if "add_offset" in encoding: data -= pop_to(encoding, attrs, "add_offset", name=name) if "scale_factor" in encoding: @@ -377,7 +378,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: if "_FillValue" in attrs: new_fill = signed_dtype.type(attrs["_FillValue"]) attrs["_FillValue"] = new_fill - data = duck_array_ops.around(data).astype(signed_dtype) + data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) return Variable(dims, data, attrs, encoding, fastpath=True) else: diff --git a/xarray/conventions.py b/xarray/conventions.py index ea0787aa1a1..5dd2fbbde74 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -108,6 +108,10 @@ def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable: if var.dtype.kind == "O": dims, data, attrs, encoding = _var_as_tuple(var) + # leave vlen dtypes unchanged + if strings.check_vlen_dtype(data.dtype) is not None: + return var + if is_duck_dask_array(data): warnings.warn( "variable {} has data in the form of a dask array with " @@ -690,7 +694,7 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): if not coords_str and variable_coordinates[name]: coordinates_text = " ".join( str(coord_name) - for coord_name in variable_coordinates[name] + for coord_name in sorted(variable_coordinates[name]) if coord_name not in not_technically_coordinates ) if coordinates_text: @@ -715,7 +719,7 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): SerializationWarning, ) else: - attributes["coordinates"] = " ".join(map(str, global_coordinates)) + attributes["coordinates"] = " ".join(sorted(map(str, global_coordinates))) return variables, attributes diff --git a/xarray/core/_aggregations.py b/xarray/core/_aggregations.py index 3051502beba..d5070f97c6a 100644 --- a/xarray/core/_aggregations.py +++ b/xarray/core/_aggregations.py @@ -9,7 +9,7 @@ from xarray.core import duck_array_ops from xarray.core.options import OPTIONS from xarray.core.types import Dims -from xarray.core.utils import contains_only_dask_or_numpy, module_available +from xarray.core.utils import contains_only_chunked_or_numpy, 
module_available if TYPE_CHECKING: from xarray.core.dataarray import DataArray @@ -65,8 +65,8 @@ def count( See Also -------- - numpy.count - dask.array.count + pandas.DataFrame.count + dask.dataframe.DataFrame.count DataArray.count :ref:`agg` User guide on reduction or aggregation operations. @@ -74,7 +74,7 @@ def count( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -89,7 +89,7 @@ def count( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.count() @@ -296,7 +296,7 @@ def max( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -311,7 +311,7 @@ def max( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.max() @@ -383,7 +383,7 @@ def min( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -398,13 +398,13 @@ def min( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.min() Dimensions: () Data variables: - da float64 1.0 + da float64 0.0 Use ``skipna`` to control whether NaNs are ignored. @@ -474,7 +474,7 @@ def mean( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -489,13 +489,13 @@ def mean( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.mean() Dimensions: () Data variables: - da float64 1.8 + da float64 1.6 Use ``skipna`` to control whether NaNs are ignored. @@ -572,7 +572,7 @@ def prod( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -587,13 +587,13 @@ def prod( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.prod() Dimensions: () Data variables: - da float64 12.0 + da float64 0.0 Use ``skipna`` to control whether NaNs are ignored. @@ -609,7 +609,7 @@ def prod( Dimensions: () Data variables: - da float64 12.0 + da float64 0.0 """ return self.reduce( duck_array_ops.prod, @@ -679,7 +679,7 @@ def sum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -694,13 +694,13 @@ def sum( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.sum() Dimensions: () Data variables: - da float64 9.0 + da float64 8.0 Use ``skipna`` to control whether NaNs are ignored. @@ -716,7 +716,7 @@ def sum( Dimensions: () Data variables: - da float64 9.0 + da float64 8.0 """ return self.reduce( duck_array_ops.sum, @@ -783,7 +783,7 @@ def std( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... 
time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -798,13 +798,13 @@ def std( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.std() Dimensions: () Data variables: - da float64 0.7483 + da float64 1.02 Use ``skipna`` to control whether NaNs are ignored. @@ -820,7 +820,7 @@ def std( Dimensions: () Data variables: - da float64 0.8367 + da float64 1.14 """ return self.reduce( duck_array_ops.std, @@ -887,7 +887,7 @@ def var( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -902,13 +902,13 @@ def var( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.var() Dimensions: () Data variables: - da float64 0.56 + da float64 1.04 Use ``skipna`` to control whether NaNs are ignored. @@ -924,7 +924,7 @@ def var( Dimensions: () Data variables: - da float64 0.7 + da float64 1.3 """ return self.reduce( duck_array_ops.var, @@ -987,7 +987,7 @@ def median( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1002,7 +1002,7 @@ def median( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.median() @@ -1078,7 +1078,7 @@ def cumsum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1093,14 +1093,14 @@ def cumsum( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.cumsum() Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 3.0 6.0 7.0 9.0 9.0 + da (time) float64 1.0 3.0 6.0 6.0 8.0 8.0 Use ``skipna`` to control whether NaNs are ignored. @@ -1109,7 +1109,7 @@ def cumsum( Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 3.0 6.0 7.0 9.0 nan + da (time) float64 1.0 3.0 6.0 6.0 8.0 nan """ return self.reduce( duck_array_ops.cumsum, @@ -1171,7 +1171,7 @@ def cumprod( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1186,14 +1186,14 @@ def cumprod( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.cumprod() Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 6.0 6.0 12.0 12.0 + da (time) float64 1.0 2.0 6.0 0.0 0.0 0.0 Use ``skipna`` to control whether NaNs are ignored. @@ -1202,7 +1202,7 @@ def cumprod( Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 6.0 6.0 12.0 nan + da (time) float64 1.0 2.0 6.0 0.0 0.0 nan """ return self.reduce( duck_array_ops.cumprod, @@ -1261,8 +1261,8 @@ def count( See Also -------- - numpy.count - dask.array.count + pandas.DataFrame.count + dask.dataframe.DataFrame.count Dataset.count :ref:`agg` User guide on reduction or aggregation operations. @@ -1270,7 +1270,7 @@ def count( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... 
np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1279,7 +1279,7 @@ def count( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1483,7 +1483,7 @@ def max( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1562,14 +1562,14 @@ def min( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.min() - array(1.) + array(0.) Use ``skipna`` to control whether NaNs are ignored. @@ -1636,7 +1636,7 @@ def mean( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1645,14 +1645,14 @@ def mean( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.mean() - array(1.8) + array(1.6) Use ``skipna`` to control whether NaNs are ignored. @@ -1726,7 +1726,7 @@ def prod( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1735,14 +1735,14 @@ def prod( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.prod() - array(12.) + array(0.) Use ``skipna`` to control whether NaNs are ignored. @@ -1754,7 +1754,7 @@ def prod( >>> da.prod(skipna=True, min_count=2) - array(12.) + array(0.) """ return self.reduce( duck_array_ops.prod, @@ -1823,7 +1823,7 @@ def sum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1832,14 +1832,14 @@ def sum( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.sum() - array(9.) + array(8.) Use ``skipna`` to control whether NaNs are ignored. @@ -1851,7 +1851,7 @@ def sum( >>> da.sum(skipna=True, min_count=2) - array(9.) + array(8.) """ return self.reduce( duck_array_ops.sum, @@ -1917,7 +1917,7 @@ def std( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -1926,14 +1926,14 @@ def std( ... 
) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.std() - array(0.74833148) + array(1.0198039) Use ``skipna`` to control whether NaNs are ignored. @@ -1945,7 +1945,7 @@ def std( >>> da.std(skipna=True, ddof=1) - array(0.83666003) + array(1.14017543) """ return self.reduce( duck_array_ops.std, @@ -2011,7 +2011,7 @@ def var( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -2020,14 +2020,14 @@ def var( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.var() - array(0.56) + array(1.04) Use ``skipna`` to control whether NaNs are ignored. @@ -2039,7 +2039,7 @@ def var( >>> da.var(skipna=True, ddof=1) - array(0.7) + array(1.3) """ return self.reduce( duck_array_ops.var, @@ -2101,7 +2101,7 @@ def median( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -2110,7 +2110,7 @@ def median( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -2193,14 +2193,14 @@ def cumsum( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.cumsum() - array([1., 3., 6., 7., 9., 9.]) + array([1., 3., 6., 6., 8., 8.]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.cumsum(skipna=False) - array([ 1., 3., 6., 7., 9., nan]) + array([ 1., 3., 6., 6., 8., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -2282,14 +2282,14 @@ def cumprod( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.cumprod() - array([ 1., 2., 6., 6., 12., 12.]) + array([1., 2., 6., 0., 0., 0.]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.cumprod(skipna=False) - array([ 1., 2., 6., 6., 12., nan]) + array([ 1., 2., 6., 0., 0., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -2400,7 +2400,7 @@ def count( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 
2001-06-30 labels (time) >> ds.groupby("labels").count() @@ -2413,7 +2413,7 @@ def count( if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="count", @@ -2511,7 +2511,7 @@ def all( if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="all", @@ -2609,7 +2609,7 @@ def any( if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="any", @@ -2685,7 +2685,7 @@ def max( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -2700,7 +2700,7 @@ def max( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").max() @@ -2723,7 +2723,7 @@ def max( if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="max", @@ -2801,7 +2801,7 @@ def min( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -2816,7 +2816,7 @@ def min( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").min() @@ -2824,7 +2824,7 @@ def min( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 1.0 2.0 1.0 + da (labels) float64 1.0 2.0 0.0 Use ``skipna`` to control whether NaNs are ignored. @@ -2834,12 +2834,12 @@ def min( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 2.0 1.0 + da (labels) float64 nan 2.0 0.0 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="min", @@ -2919,7 +2919,7 @@ def mean( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -2934,7 +2934,7 @@ def mean( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").mean() @@ -2942,7 +2942,7 @@ def mean( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 1.0 2.0 2.0 + da (labels) float64 1.0 2.0 1.5 Use ``skipna`` to control whether NaNs are ignored. @@ -2952,12 +2952,12 @@ def mean( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 2.0 2.0 + da (labels) float64 nan 2.0 1.5 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="mean", @@ -3044,7 +3044,7 @@ def prod( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... 
time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -3059,7 +3059,7 @@ def prod( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").prod() @@ -3067,7 +3067,7 @@ def prod( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 1.0 4.0 3.0 + da (labels) float64 1.0 4.0 0.0 Use ``skipna`` to control whether NaNs are ignored. @@ -3077,7 +3077,7 @@ def prod( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 4.0 3.0 + da (labels) float64 nan 4.0 0.0 Specify ``min_count`` for finer control over when NaNs are ignored. @@ -3087,12 +3087,12 @@ def prod( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 4.0 3.0 + da (labels) float64 nan 4.0 0.0 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="prod", @@ -3181,7 +3181,7 @@ def sum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -3196,7 +3196,7 @@ def sum( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").sum() @@ -3204,7 +3204,7 @@ def sum( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 1.0 4.0 4.0 + da (labels) float64 1.0 4.0 3.0 Use ``skipna`` to control whether NaNs are ignored. @@ -3214,7 +3214,7 @@ def sum( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 4.0 4.0 + da (labels) float64 nan 4.0 3.0 Specify ``min_count`` for finer control over when NaNs are ignored. @@ -3224,12 +3224,12 @@ def sum( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 4.0 4.0 + da (labels) float64 nan 4.0 3.0 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="sum", @@ -3315,7 +3315,7 @@ def std( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -3330,7 +3330,7 @@ def std( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").std() @@ -3338,7 +3338,7 @@ def std( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 0.0 0.0 1.0 + da (labels) float64 0.0 0.0 1.5 Use ``skipna`` to control whether NaNs are ignored. @@ -3348,7 +3348,7 @@ def std( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 0.0 1.0 + da (labels) float64 nan 0.0 1.5 Specify ``ddof=1`` for an unbiased estimate. @@ -3358,12 +3358,12 @@ def std( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 0.0 1.414 + da (labels) float64 nan 0.0 2.121 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="std", @@ -3449,7 +3449,7 @@ def var( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... 
dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -3464,7 +3464,7 @@ def var( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").var() @@ -3472,7 +3472,7 @@ def var( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 0.0 0.0 1.0 + da (labels) float64 0.0 0.0 2.25 Use ``skipna`` to control whether NaNs are ignored. @@ -3482,7 +3482,7 @@ def var( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 0.0 1.0 + da (labels) float64 nan 0.0 2.25 Specify ``ddof=1`` for an unbiased estimate. @@ -3492,12 +3492,12 @@ def var( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 0.0 2.0 + da (labels) float64 nan 0.0 4.5 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="var", @@ -3579,7 +3579,7 @@ def median( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -3594,7 +3594,7 @@ def median( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").median() @@ -3602,7 +3602,7 @@ def median( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 1.0 2.0 2.0 + da (labels) float64 1.0 2.0 1.5 Use ``skipna`` to control whether NaNs are ignored. @@ -3612,7 +3612,7 @@ def median( Coordinates: * labels (labels) object 'a' 'b' 'c' Data variables: - da (labels) float64 nan 2.0 2.0 + da (labels) float64 nan 2.0 1.5 """ return self.reduce( duck_array_ops.median, @@ -3682,7 +3682,7 @@ def cumsum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -3697,14 +3697,14 @@ def cumsum( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").cumsum() Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 3.0 4.0 4.0 1.0 + da (time) float64 1.0 2.0 3.0 3.0 4.0 1.0 Use ``skipna`` to control whether NaNs are ignored. @@ -3713,7 +3713,7 @@ def cumsum( Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 3.0 4.0 4.0 nan + da (time) float64 1.0 2.0 3.0 3.0 4.0 nan """ return self.reduce( duck_array_ops.cumsum, @@ -3783,7 +3783,7 @@ def cumprod( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -3798,14 +3798,14 @@ def cumprod( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.groupby("labels").cumprod() Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 3.0 3.0 4.0 1.0 + da (time) float64 1.0 2.0 3.0 0.0 4.0 1.0 Use ``skipna`` to control whether NaNs are ignored. 
@@ -3814,7 +3814,7 @@ def cumprod( Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 3.0 3.0 4.0 nan + da (time) float64 1.0 2.0 3.0 0.0 4.0 nan """ return self.reduce( duck_array_ops.cumprod, @@ -3881,8 +3881,8 @@ def count( See Also -------- - numpy.count - dask.array.count + pandas.DataFrame.count + dask.dataframe.DataFrame.count Dataset.count :ref:`resampling` User guide on resampling operations. @@ -3899,7 +3899,7 @@ def count( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -3914,7 +3914,7 @@ def count( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").count() @@ -3927,7 +3927,7 @@ def count( if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="count", @@ -4025,7 +4025,7 @@ def all( if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="all", @@ -4123,7 +4123,7 @@ def any( if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="any", @@ -4199,7 +4199,7 @@ def max( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -4214,7 +4214,7 @@ def max( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").max() @@ -4237,7 +4237,7 @@ def max( if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="max", @@ -4315,7 +4315,7 @@ def min( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -4330,7 +4330,7 @@ def min( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").min() @@ -4338,7 +4338,7 @@ def min( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 1.0 1.0 2.0 + da (time) float64 1.0 0.0 2.0 Use ``skipna`` to control whether NaNs are ignored. @@ -4348,12 +4348,12 @@ def min( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 1.0 1.0 nan + da (time) float64 1.0 0.0 nan """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="min", @@ -4433,7 +4433,7 @@ def mean( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -4448,7 +4448,7 @@ def mean( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 
2001-06-30 labels (time) >> ds.resample(time="3M").mean() @@ -4456,7 +4456,7 @@ def mean( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 1.0 2.0 2.0 + da (time) float64 1.0 1.667 2.0 Use ``skipna`` to control whether NaNs are ignored. @@ -4466,12 +4466,12 @@ def mean( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 1.0 2.0 nan + da (time) float64 1.0 1.667 nan """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="mean", @@ -4558,7 +4558,7 @@ def prod( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -4573,7 +4573,7 @@ def prod( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").prod() @@ -4581,7 +4581,7 @@ def prod( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 1.0 6.0 2.0 + da (time) float64 1.0 0.0 2.0 Use ``skipna`` to control whether NaNs are ignored. @@ -4591,7 +4591,7 @@ def prod( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 1.0 6.0 nan + da (time) float64 1.0 0.0 nan Specify ``min_count`` for finer control over when NaNs are ignored. @@ -4601,12 +4601,12 @@ def prod( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 nan 6.0 nan + da (time) float64 nan 0.0 nan """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="prod", @@ -4695,7 +4695,7 @@ def sum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -4710,7 +4710,7 @@ def sum( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").sum() @@ -4718,7 +4718,7 @@ def sum( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 1.0 6.0 2.0 + da (time) float64 1.0 5.0 2.0 Use ``skipna`` to control whether NaNs are ignored. @@ -4728,7 +4728,7 @@ def sum( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 1.0 6.0 nan + da (time) float64 1.0 5.0 nan Specify ``min_count`` for finer control over when NaNs are ignored. @@ -4738,12 +4738,12 @@ def sum( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 nan 6.0 nan + da (time) float64 nan 5.0 nan """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="sum", @@ -4829,7 +4829,7 @@ def std( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -4844,7 +4844,7 @@ def std( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 
2001-06-30 labels (time) >> ds.resample(time="3M").std() @@ -4852,7 +4852,7 @@ def std( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 0.0 0.8165 0.0 + da (time) float64 0.0 1.247 0.0 Use ``skipna`` to control whether NaNs are ignored. @@ -4862,7 +4862,7 @@ def std( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 0.0 0.8165 nan + da (time) float64 0.0 1.247 nan Specify ``ddof=1`` for an unbiased estimate. @@ -4872,12 +4872,12 @@ def std( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 nan 1.0 nan + da (time) float64 nan 1.528 nan """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="std", @@ -4963,7 +4963,7 @@ def var( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -4978,7 +4978,7 @@ def var( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").var() @@ -4986,7 +4986,7 @@ def var( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 0.0 0.6667 0.0 + da (time) float64 0.0 1.556 0.0 Use ``skipna`` to control whether NaNs are ignored. @@ -4996,7 +4996,7 @@ def var( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 0.0 0.6667 nan + da (time) float64 0.0 1.556 nan Specify ``ddof=1`` for an unbiased estimate. @@ -5006,12 +5006,12 @@ def var( Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 nan 1.0 nan + da (time) float64 nan 2.333 nan """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="var", @@ -5093,7 +5093,7 @@ def median( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -5108,7 +5108,7 @@ def median( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").median() @@ -5196,7 +5196,7 @@ def cumsum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -5211,14 +5211,14 @@ def cumsum( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").cumsum() Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 5.0 6.0 2.0 2.0 + da (time) float64 1.0 2.0 5.0 5.0 2.0 2.0 Use ``skipna`` to control whether NaNs are ignored. @@ -5227,7 +5227,7 @@ def cumsum( Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 5.0 6.0 2.0 nan + da (time) float64 1.0 2.0 5.0 5.0 2.0 nan """ return self.reduce( duck_array_ops.cumsum, @@ -5297,7 +5297,7 @@ def cumprod( Examples -------- >>> da = xr.DataArray( - ... 
np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -5312,14 +5312,14 @@ def cumprod( * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> ds.resample(time="3M").cumprod() Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 6.0 6.0 2.0 2.0 + da (time) float64 1.0 2.0 6.0 0.0 2.0 2.0 Use ``skipna`` to control whether NaNs are ignored. @@ -5328,7 +5328,7 @@ def cumprod( Dimensions: (time: 6) Dimensions without coordinates: time Data variables: - da (time) float64 1.0 2.0 6.0 6.0 2.0 nan + da (time) float64 1.0 2.0 6.0 0.0 2.0 nan """ return self.reduce( duck_array_ops.cumprod, @@ -5395,8 +5395,8 @@ def count( See Also -------- - numpy.count - dask.array.count + pandas.DataFrame.count + dask.dataframe.DataFrame.count DataArray.count :ref:`groupby` User guide on groupby operations. @@ -5413,7 +5413,7 @@ def count( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -5422,7 +5422,7 @@ def count( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -5701,7 +5701,7 @@ def max( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -5808,14 +5808,14 @@ def min( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").min() - array([1., 2., 1.]) + array([1., 2., 0.]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -5823,14 +5823,14 @@ def min( >>> da.groupby("labels").min(skipna=False) - array([nan, 2., 1.]) + array([nan, 2., 0.]) Coordinates: * labels (labels) object 'a' 'b' 'c' """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="min", @@ -5908,7 +5908,7 @@ def mean( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -5917,14 +5917,14 @@ def mean( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").mean() - array([1., 2., 2.]) + array([1. , 2. , 1.5]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -5932,14 +5932,14 @@ def mean( >>> da.groupby("labels").mean(skipna=False) - array([nan, 2., 2.]) + array([nan, 2. 
, 1.5]) Coordinates: * labels (labels) object 'a' 'b' 'c' """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="mean", @@ -6024,7 +6024,7 @@ def prod( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -6033,14 +6033,14 @@ def prod( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").prod() - array([1., 4., 3.]) + array([1., 4., 0.]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6048,7 +6048,7 @@ def prod( >>> da.groupby("labels").prod(skipna=False) - array([nan, 4., 3.]) + array([nan, 4., 0.]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6056,14 +6056,14 @@ def prod( >>> da.groupby("labels").prod(skipna=True, min_count=2) - array([nan, 4., 3.]) + array([nan, 4., 0.]) Coordinates: * labels (labels) object 'a' 'b' 'c' """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="prod", @@ -6150,7 +6150,7 @@ def sum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -6159,14 +6159,14 @@ def sum( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").sum() - array([1., 4., 4.]) + array([1., 4., 3.]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6174,7 +6174,7 @@ def sum( >>> da.groupby("labels").sum(skipna=False) - array([nan, 4., 4.]) + array([nan, 4., 3.]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6182,14 +6182,14 @@ def sum( >>> da.groupby("labels").sum(skipna=True, min_count=2) - array([nan, 4., 4.]) + array([nan, 4., 3.]) Coordinates: * labels (labels) object 'a' 'b' 'c' """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="sum", @@ -6273,7 +6273,7 @@ def std( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -6282,14 +6282,14 @@ def std( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").std() - array([0., 0., 1.]) + array([0. , 0. , 1.5]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6297,7 +6297,7 @@ def std( >>> da.groupby("labels").std(skipna=False) - array([nan, 0., 1.]) + array([nan, 0. , 1.5]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6305,14 +6305,14 @@ def std( >>> da.groupby("labels").std(skipna=True, ddof=1) - array([ nan, 0. , 1.41421356]) + array([ nan, 0. 
, 2.12132034]) Coordinates: * labels (labels) object 'a' 'b' 'c' """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="std", @@ -6396,7 +6396,7 @@ def var( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -6405,14 +6405,14 @@ def var( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").var() - array([0., 0., 1.]) + array([0. , 0. , 2.25]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6420,7 +6420,7 @@ def var( >>> da.groupby("labels").var(skipna=False) - array([nan, 0., 1.]) + array([ nan, 0. , 2.25]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6428,14 +6428,14 @@ def var( >>> da.groupby("labels").var(skipna=True, ddof=1) - array([nan, 0., 2.]) + array([nan, 0. , 4.5]) Coordinates: * labels (labels) object 'a' 'b' 'c' """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="var", @@ -6515,7 +6515,7 @@ def median( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -6524,14 +6524,14 @@ def median( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").median() - array([1., 2., 2.]) + array([1. , 2. , 1.5]) Coordinates: * labels (labels) object 'a' 'b' 'c' @@ -6539,7 +6539,7 @@ def median( >>> da.groupby("labels").median(skipna=False) - array([nan, 2., 2.]) + array([nan, 2. , 1.5]) Coordinates: * labels (labels) object 'a' 'b' 'c' """ @@ -6610,7 +6610,7 @@ def cumsum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -6619,14 +6619,14 @@ def cumsum( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").cumsum() - array([1., 2., 3., 4., 4., 1.]) + array([1., 2., 3., 3., 4., 1.]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").cumsum(skipna=False) - array([ 1., 2., 3., 4., 4., nan]) + array([ 1., 2., 3., 3., 4., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -6716,14 +6716,14 @@ def cumprod( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 
2001-06-30 labels (time) >> da.groupby("labels").cumprod() - array([1., 2., 3., 3., 4., 1.]) + array([1., 2., 3., 0., 4., 1.]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.groupby("labels").cumprod(skipna=False) - array([ 1., 2., 3., 3., 4., nan]) + array([ 1., 2., 3., 0., 4., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -6828,7 +6828,7 @@ def count( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -7107,7 +7107,7 @@ def max( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -7214,14 +7214,14 @@ def min( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.resample(time="3M").min() - array([1., 1., 2.]) + array([1., 0., 2.]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7229,14 +7229,14 @@ def min( >>> da.resample(time="3M").min(skipna=False) - array([ 1., 1., nan]) + array([ 1., 0., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="min", @@ -7314,7 +7314,7 @@ def mean( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -7323,14 +7323,14 @@ def mean( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.resample(time="3M").mean() - array([1., 2., 2.]) + array([1. , 1.66666667, 2. ]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7338,14 +7338,14 @@ def mean( >>> da.resample(time="3M").mean(skipna=False) - array([ 1., 2., nan]) + array([1. , 1.66666667, nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="mean", @@ -7430,7 +7430,7 @@ def prod( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -7439,14 +7439,14 @@ def prod( ... 
) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.resample(time="3M").prod() - array([1., 6., 2.]) + array([1., 0., 2.]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7454,7 +7454,7 @@ def prod( >>> da.resample(time="3M").prod(skipna=False) - array([ 1., 6., nan]) + array([ 1., 0., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7462,14 +7462,14 @@ def prod( >>> da.resample(time="3M").prod(skipna=True, min_count=2) - array([nan, 6., nan]) + array([nan, 0., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="prod", @@ -7556,7 +7556,7 @@ def sum( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -7565,14 +7565,14 @@ def sum( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.resample(time="3M").sum() - array([1., 6., 2.]) + array([1., 5., 2.]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7580,7 +7580,7 @@ def sum( >>> da.resample(time="3M").sum(skipna=False) - array([ 1., 6., nan]) + array([ 1., 5., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7588,14 +7588,14 @@ def sum( >>> da.resample(time="3M").sum(skipna=True, min_count=2) - array([nan, 6., nan]) + array([nan, 5., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="sum", @@ -7679,7 +7679,7 @@ def std( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -7688,14 +7688,14 @@ def std( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.resample(time="3M").std() - array([0. , 0.81649658, 0. ]) + array([0. , 1.24721913, 0. ]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7703,7 +7703,7 @@ def std( >>> da.resample(time="3M").std(skipna=False) - array([0. , 0.81649658, nan]) + array([0. , 1.24721913, nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7711,14 +7711,14 @@ def std( >>> da.resample(time="3M").std(skipna=True, ddof=1) - array([nan, 1., nan]) + array([ nan, 1.52752523, nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="std", @@ -7802,7 +7802,7 @@ def var( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... 
dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -7811,14 +7811,14 @@ def var( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.resample(time="3M").var() - array([0. , 0.66666667, 0. ]) + array([0. , 1.55555556, 0. ]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7826,7 +7826,7 @@ def var( >>> da.resample(time="3M").var(skipna=False) - array([0. , 0.66666667, nan]) + array([0. , 1.55555556, nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 @@ -7834,14 +7834,14 @@ def var( >>> da.resample(time="3M").var(skipna=True, ddof=1) - array([nan, 1., nan]) + array([ nan, 2.33333333, nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-04-30 2001-07-31 """ if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="var", @@ -7921,7 +7921,7 @@ def median( Examples -------- >>> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -7930,7 +7930,7 @@ def median( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -8025,14 +8025,14 @@ def cumsum( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 2001-06-30 labels (time) >> da.resample(time="3M").cumsum() - array([1., 2., 5., 6., 2., 2.]) + array([1., 2., 5., 5., 2., 2.]) Coordinates: labels (time) >> da.resample(time="3M").cumsum(skipna=False) - array([ 1., 2., 5., 6., 2., nan]) + array([ 1., 2., 5., 5., 2., nan]) Coordinates: labels (time) >> da = xr.DataArray( - ... np.array([1, 2, 3, 1, 2, np.nan]), + ... np.array([1, 2, 3, 0, 2, np.nan]), ... dims="time", ... coords=dict( ... time=("time", pd.date_range("2001-01-01", freq="M", periods=6)), @@ -8122,14 +8122,14 @@ def cumprod( ... ) >>> da - array([ 1., 2., 3., 1., 2., nan]) + array([ 1., 2., 3., 0., 2., nan]) Coordinates: * time (time) datetime64[ns] 2001-01-31 2001-02-28 ... 
2001-06-30 labels (time) >> da.resample(time="3M").cumprod() - array([1., 2., 6., 6., 2., 2.]) + array([1., 2., 6., 0., 2., 2.]) Coordinates: labels (time) >> da.resample(time="3M").cumprod(skipna=False) - array([ 1., 2., 6., 6., 2., nan]) + array([ 1., 2., 6., 0., 2., nan]) Coordinates: labels (time) None: def _date_field(self, name: str, dtype: DTypeLike) -> T_DataArray: if dtype is None: dtype = self._obj.dtype - obj_type = type(self._obj) - result = _get_date_field(self._obj.data, name, dtype) - return obj_type(result, name=name, coords=self._obj.coords, dims=self._obj.dims) + result = _get_date_field(_index_or_data(self._obj), name, dtype) + newvar = self._obj.variable.copy(data=result, deep=False) + return self._obj._replace(newvar, name=name) def _tslib_round_accessor(self, name: str, freq: str) -> T_DataArray: - obj_type = type(self._obj) - result = _round_field(self._obj.data, name, freq) - return obj_type(result, name=name, coords=self._obj.coords, dims=self._obj.dims) + result = _round_field(_index_or_data(self._obj), name, freq) + newvar = self._obj.variable.copy(data=result, deep=False) + return self._obj._replace(newvar, name=name) def floor(self, freq: str) -> T_DataArray: """ diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index c6c4af87d1c..31028f10350 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -51,6 +51,7 @@ import numpy as np +from xarray.core import duck_array_ops from xarray.core.computation import apply_ufunc from xarray.core.types import T_DataArray @@ -2085,13 +2086,16 @@ def _get_res_multi(val, pat): else: # dtype MUST be object or strings can be truncated # See: https://github.com/numpy/numpy/issues/8352 - return self._apply( - func=_get_res_multi, - func_args=(pat,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: maxgroups}, - ).astype(self._obj.dtype.kind) + return duck_array_ops.astype( + self._apply( + func=_get_res_multi, + func_args=(pat,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: maxgroups}, + ), + self._obj.dtype.kind, + ) def extractall( self, @@ -2258,15 +2262,18 @@ def _get_res(val, ipat, imaxcount=maxcount, dtype=self._obj.dtype): return res - return self._apply( - # dtype MUST be object or strings can be truncated - # See: https://github.com/numpy/numpy/issues/8352 - func=_get_res, - func_args=(pat,), - dtype=np.object_, - output_core_dims=[[group_dim, match_dim]], - output_sizes={group_dim: maxgroups, match_dim: maxcount}, - ).astype(self._obj.dtype.kind) + return duck_array_ops.astype( + self._apply( + # dtype MUST be object or strings can be truncated + # See: https://github.com/numpy/numpy/issues/8352 + func=_get_res, + func_args=(pat,), + dtype=np.object_, + output_core_dims=[[group_dim, match_dim]], + output_sizes={group_dim: maxgroups, match_dim: maxcount}, + ), + self._obj.dtype.kind, + ) def findall( self, @@ -2385,13 +2392,16 @@ def _partitioner( # dtype MUST be object or strings can be truncated # See: https://github.com/numpy/numpy/issues/8352 - return self._apply( - func=arrfunc, - func_args=(sep,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: 3}, - ).astype(self._obj.dtype.kind) + return duck_array_ops.astype( + self._apply( + func=arrfunc, + func_args=(sep,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: 3}, + ), + self._obj.dtype.kind, + ) def partition( self, @@ -2510,13 +2520,16 @@ def _dosplit(mystr, sep, maxsplit=maxsplit, dtype=self._obj.dtype): # dtype MUST be object or strings 
can be truncated # See: https://github.com/numpy/numpy/issues/8352 - return self._apply( - func=_dosplit, - func_args=(sep,), - dtype=np.object_, - output_core_dims=[[dim]], - output_sizes={dim: maxsplit}, - ).astype(self._obj.dtype.kind) + return duck_array_ops.astype( + self._apply( + func=_dosplit, + func_args=(sep,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: maxsplit}, + ), + self._obj.dtype.kind, + ) def split( self, diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index edebccc2534..39ff878b56d 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -5,13 +5,12 @@ from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping from contextlib import suppress -from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar, cast +from typing import TYPE_CHECKING, Any, Callable, Generic, cast import numpy as np import pandas as pd from xarray.core import dtypes -from xarray.core.common import DataWithCoords from xarray.core.indexes import ( Index, Indexes, @@ -20,15 +19,14 @@ indexes_all_equal, safe_cast_to_index, ) +from xarray.core.types import T_Alignable from xarray.core.utils import is_dict_like, is_full_slice from xarray.core.variable import Variable, as_compatible_data, calculate_dimensions if TYPE_CHECKING: from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset - from xarray.core.types import JoinOptions, T_DataArray, T_Dataset, T_DataWithCoords - -DataAlignable = TypeVar("DataAlignable", bound=DataWithCoords) + from xarray.core.types import JoinOptions, T_DataArray, T_Dataset def reindex_variables( @@ -92,7 +90,7 @@ def reindex_variables( NormalizedIndexVars = dict[MatchingIndexKey, dict[Hashable, Variable]] -class Aligner(Generic[DataAlignable]): +class Aligner(Generic[T_Alignable]): """Implements all the complex logic for the re-indexing and alignment of Xarray objects. @@ -105,8 +103,8 @@ class Aligner(Generic[DataAlignable]): """ - objects: tuple[DataAlignable, ...] - results: tuple[DataAlignable, ...] + objects: tuple[T_Alignable, ...] + results: tuple[T_Alignable, ...] objects_matching_indexes: tuple[dict[MatchingIndexKey, Index], ...] 
join: str exclude_dims: frozenset[Hashable] @@ -127,7 +125,7 @@ class Aligner(Generic[DataAlignable]): def __init__( self, - objects: Iterable[DataAlignable], + objects: Iterable[T_Alignable], join: str = "inner", indexes: Mapping[Any, Any] | None = None, exclude_dims: Iterable = frozenset(), @@ -510,7 +508,7 @@ def _get_dim_pos_indexers( def _get_indexes_and_vars( self, - obj: DataAlignable, + obj: T_Alignable, matching_indexes: dict[MatchingIndexKey, Index], ) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]: new_indexes = {} @@ -533,13 +531,13 @@ def _get_indexes_and_vars( def _reindex_one( self, - obj: DataAlignable, + obj: T_Alignable, matching_indexes: dict[MatchingIndexKey, Index], - ) -> DataAlignable: + ) -> T_Alignable: new_indexes, new_variables = self._get_indexes_and_vars(obj, matching_indexes) dim_pos_indexers = self._get_dim_pos_indexers(matching_indexes) - new_obj = obj._reindex_callback( + return obj._reindex_callback( self, dim_pos_indexers, new_variables, @@ -548,8 +546,6 @@ def _reindex_one( self.exclude_dims, self.exclude_vars, ) - new_obj.encoding = obj.encoding - return new_obj def reindex_all(self) -> None: self.results = tuple( @@ -581,13 +577,13 @@ def align(self) -> None: def align( - *objects: DataAlignable, + *objects: T_Alignable, join: JoinOptions = "inner", copy: bool = True, indexes=None, exclude=frozenset(), fill_value=dtypes.NA, -) -> tuple[DataAlignable, ...]: +) -> tuple[T_Alignable, ...]: """ Given any number of Dataset and/or DataArray objects, returns new objects with aligned indexes and dimension sizes. @@ -801,6 +797,7 @@ def deep_align( This function is not public API. """ + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset @@ -808,7 +805,7 @@ def deep_align( indexes = {} def is_alignable(obj): - return isinstance(obj, (DataArray, Dataset)) + return isinstance(obj, (Coordinates, DataArray, Dataset)) positions = [] keys = [] @@ -866,7 +863,7 @@ def is_alignable(obj): def reindex( - obj: DataAlignable, + obj: T_Alignable, indexers: Mapping[Any, Any], method: str | None = None, tolerance: int | float | Iterable[int | float] | None = None, @@ -874,7 +871,7 @@ def reindex( fill_value: Any = dtypes.NA, sparse: bool = False, exclude_vars: Iterable[Hashable] = frozenset(), -) -> DataAlignable: +) -> T_Alignable: """Re-index either a Dataset or a DataArray. Not public API. @@ -905,13 +902,13 @@ def reindex( def reindex_like( - obj: DataAlignable, + obj: T_Alignable, other: Dataset | DataArray, method: str | None = None, tolerance: int | float | Iterable[int | float] | None = None, copy: bool = True, fill_value: Any = dtypes.NA, -) -> DataAlignable: +) -> T_Alignable: """Re-index either a Dataset or a DataArray like another Dataset/DataArray. Not public API. 
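The alignment changes above swap the module-local ``DataAlignable`` TypeVar for the shared ``T_Alignable`` bound, so ``align``/``reindex`` keep returning the same concrete types they were given. A minimal, editor-added sketch of that user-facing behaviour (not part of the diff; values and names are illustrative, assuming a standard xarray install):

    import numpy as np
    import xarray as xr

    a = xr.DataArray(np.arange(3), dims="x", coords={"x": [0, 1, 2]}, name="a")
    b = xr.DataArray(np.arange(4), dims="x", coords={"x": [1, 2, 3, 4]}, name="b")

    # align() is typed over a single TypeVar, so each returned object has the
    # same type as the corresponding input (DataArray in, DataArray out) and
    # shares the intersected index when join="inner".
    a2, b2 = xr.align(a, b, join="inner")
    assert list(a2.x.values) == [1, 2]
    assert isinstance(a2, xr.DataArray) and isinstance(b2, xr.DataArray)
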
@@ -953,8 +950,8 @@ def _get_broadcast_dims_map_common_coords(args, exclude): def _broadcast_helper( - arg: T_DataWithCoords, exclude, dims_map, common_coords -) -> T_DataWithCoords: + arg: T_Alignable, exclude, dims_map, common_coords +) -> T_Alignable: from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset @@ -984,16 +981,16 @@ def _broadcast_dataset(ds: T_Dataset) -> T_Dataset: # remove casts once https://github.com/python/mypy/issues/12800 is resolved if isinstance(arg, DataArray): - return cast("T_DataWithCoords", _broadcast_array(arg)) + return cast(T_Alignable, _broadcast_array(arg)) elif isinstance(arg, Dataset): - return cast("T_DataWithCoords", _broadcast_dataset(arg)) + return cast(T_Alignable, _broadcast_dataset(arg)) else: raise ValueError("all input must be Dataset or DataArray objects") # TODO: this typing is too restrictive since it cannot deal with mixed # DataArray and Dataset types...? Is this a problem? -def broadcast(*args: T_DataWithCoords, exclude=None) -> tuple[T_DataWithCoords, ...]: +def broadcast(*args: T_Alignable, exclude=None) -> tuple[T_Alignable, ...]: """Explicitly broadcast any number of DataArray or Dataset objects against one another. diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 8106c295f5a..1599fb60ddc 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -1,7 +1,6 @@ from __future__ import annotations import itertools -import warnings from collections import Counter from collections.abc import Iterable, Sequence from typing import TYPE_CHECKING, Literal, Union @@ -653,7 +652,6 @@ def _combine_single_variable_hypercube( return concatenated -# TODO remove empty list default param after version 0.21, see PR4696 def combine_by_coords( data_objects: Iterable[Dataset | DataArray] = [], compat: CompatOptions = "no_conflicts", @@ -662,7 +660,6 @@ def combine_by_coords( fill_value: object = dtypes.NA, join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "no_conflicts", - datasets: Iterable[Dataset] | None = None, ) -> Dataset | DataArray: """ @@ -760,8 +757,6 @@ def combine_by_coords( If a callable, it must expect a sequence of ``attrs`` dicts and a context object as its only parameters. - datasets : Iterable of Datasets - Returns ------- combined : xarray.Dataset or xarray.DataArray @@ -918,14 +913,6 @@ def combine_by_coords( DataArrays or Datasets, a ValueError will be raised (as this is an ambiguous operation). """ - # TODO remove after version 0.21, see PR4696 - if datasets is not None: - warnings.warn( - "The datasets argument has been renamed to `data_objects`." - " From 0.21 on passing a value for datasets will raise an error." 
- ) - data_objects = datasets - if not data_objects: return Dataset() @@ -970,10 +957,9 @@ def combine_by_coords( # Perform the multidimensional combine on each group of data variables # before merging back together - concatenated_grouped_by_data_vars = [] - for vars, datasets_with_same_vars in grouped_by_vars: - concatenated = _combine_single_variable_hypercube( - list(datasets_with_same_vars), + concatenated_grouped_by_data_vars = tuple( + _combine_single_variable_hypercube( + tuple(datasets_with_same_vars), fill_value=fill_value, data_vars=data_vars, coords=coords, @@ -981,7 +967,8 @@ def combine_by_coords( join=join, combine_attrs=combine_attrs, ) - concatenated_grouped_by_data_vars.append(concatenated) + for vars, datasets_with_same_vars in grouped_by_vars + ) return merge( concatenated_grouped_by_data_vars, diff --git a/xarray/core/common.py b/xarray/core/common.py index f6abcba1ff0..ade701457c6 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -13,8 +13,8 @@ from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.pdcompat import _convert_base_to_offset -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type, guess_chunkmanager +from xarray.core.pycompat import is_chunked_array from xarray.core.utils import ( Frozen, either_dict_or_kwargs, @@ -46,6 +46,7 @@ DTypeLikeSave, ScalarOrArray, SideOptions, + T_Chunks, T_DataWithCoords, T_Variable, ) @@ -159,7 +160,7 @@ def __int__(self: Any) -> int: def __complex__(self: Any) -> complex: return complex(self.values) - def __array__(self: Any, dtype: DTypeLike = None) -> np.ndarray: + def __array__(self: Any, dtype: DTypeLike | None = None) -> np.ndarray: return np.asarray(self.values, dtype=dtype) def __repr__(self) -> str: @@ -209,7 +210,7 @@ def get_axis_num(self, dim: Hashable | Iterable[Hashable]) -> int | tuple[int, . int or tuple of int Axis number or numbers corresponding to the given dimensions. """ - if isinstance(dim, Iterable) and not isinstance(dim, str): + if not isinstance(dim, str) and isinstance(dim, Iterable): return tuple(self._get_axis_num(d) for d in dim) else: return self._get_axis_num(dim) @@ -305,9 +306,7 @@ def __setattr__(self, name: str, value: Any) -> None: except AttributeError as e: # Don't accidentally shadow custom AttributeErrors, e.g. # DataArray.dims.setter - if str(e) != "{!r} object has no attribute {!r}".format( - type(self).__name__, name - ): + if str(e) != f"{type(self).__name__!r} object has no attribute {name!r}": raise raise AttributeError( f"cannot set attribute {name!r} on a {type(self).__name__!r} object. Use __setitem__ style" @@ -607,9 +606,17 @@ def assign_coords( Dataset.swap_dims Dataset.set_coords """ + from xarray.core.coordinates import Coordinates + coords_combined = either_dict_or_kwargs(coords, coords_kwargs, "assign_coords") data = self.copy(deep=False) - results: dict[Hashable, Any] = self._calc_assign_results(coords_combined) + + results: Coordinates | dict[Hashable, Any] + if isinstance(coords, Coordinates): + results = coords + else: + results = self._calc_assign_results(coords_combined) + data.coords.update(results) return data @@ -627,6 +634,36 @@ def assign_attrs( **kwargs keyword arguments passed into ``attrs.update``. 
+ Examples + -------- + >>> dataset = xr.Dataset({"temperature": [25, 30, 27]}) + >>> dataset + + Dimensions: (temperature: 3) + Coordinates: + * temperature (temperature) int64 25 30 27 + Data variables: + *empty* + + >>> new_dataset = dataset.assign_attrs( + ... units="Celsius", description="Temperature data" + ... ) + >>> new_dataset + + Dimensions: (temperature: 3) + Coordinates: + * temperature (temperature) int64 25 30 27 + Data variables: + *empty* + Attributes: + units: Celsius + description: Temperature data + + # Attributes of the new dataset + + >>> new_dataset.attrs + {'units': 'Celsius', 'description': 'Temperature data'} + Returns ------- assigned : same type as caller @@ -950,6 +987,7 @@ def _resample( from xarray.core.dataarray import DataArray from xarray.core.groupby import ResolvedTimeResampleGrouper, TimeResampleGrouper + from xarray.core.pdcompat import _convert_base_to_offset from xarray.core.resample import RESAMPLE_DIM if keep_attrs is not None: @@ -1291,9 +1329,7 @@ def isin(self: T_DataWithCoords, test_elements: Any) -> T_DataWithCoords: if isinstance(test_elements, Dataset): raise TypeError( - "isin() argument must be convertible to an array: {}".format( - test_elements - ) + f"isin() argument must be convertible to an array: {test_elements}" ) elif isinstance(test_elements, (Variable, DataArray)): # need to explicitly pull out data to support dask arrays as the @@ -1396,28 +1432,52 @@ def __getitem__(self, value): @overload def full_like( - other: DataArray, fill_value: Any, dtype: DTypeLikeSave = None + other: DataArray, + fill_value: Any, + dtype: DTypeLikeSave | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> DataArray: ... @overload def full_like( - other: Dataset, fill_value: Any, dtype: DTypeMaybeMapping = None + other: Dataset, + fill_value: Any, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset: ... @overload def full_like( - other: Variable, fill_value: Any, dtype: DTypeLikeSave = None + other: Variable, + fill_value: Any, + dtype: DTypeLikeSave | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: ... @overload def full_like( - other: Dataset | DataArray, fill_value: Any, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray, + fill_value: Any, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = {}, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @@ -1426,7 +1486,11 @@ def full_like( def full_like( other: Dataset | DataArray | Variable, fill_value: Any, - dtype: DTypeMaybeMapping = None, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... @@ -1434,9 +1498,16 @@ def full_like( def full_like( other: Dataset | DataArray | Variable, fill_value: Any, - dtype: DTypeMaybeMapping = None, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: - """Return a new object with the same shape and type as a given object. 
+ """ + Return a new object with the same shape and type as a given object. + + Returned object will be chunked if if the given object is chunked, or if chunks or chunked_array_type are specified. Parameters ---------- @@ -1449,6 +1520,18 @@ def full_like( dtype : dtype or dict-like of dtype, optional dtype of the new array. If a dict-like, maps dtypes to variables. If omitted, it defaults to other.dtype. + chunks : int, "auto", tuple of int or mapping of Hashable to int, optional + Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or + ``{"x": 5, "y": 5}``. + chunked_array_type: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict, optional + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns ------- @@ -1562,7 +1645,12 @@ def full_like( data_vars = { k: _full_like_variable( - v.variable, fill_value.get(k, dtypes.NA), dtype_.get(k, None) + v.variable, + fill_value.get(k, dtypes.NA), + dtype_.get(k, None), + chunks, + chunked_array_type, + from_array_kwargs, ) for k, v in other.data_vars.items() } @@ -1571,7 +1659,14 @@ def full_like( if isinstance(dtype, Mapping): raise ValueError("'dtype' cannot be dict-like when passing a DataArray") return DataArray( - _full_like_variable(other.variable, fill_value, dtype), + _full_like_variable( + other.variable, + fill_value, + dtype, + chunks, + chunked_array_type, + from_array_kwargs, + ), dims=other.dims, coords=other.coords, attrs=other.attrs, @@ -1580,13 +1675,20 @@ def full_like( elif isinstance(other, Variable): if isinstance(dtype, Mapping): raise ValueError("'dtype' cannot be dict-like when passing a Variable") - return _full_like_variable(other, fill_value, dtype) + return _full_like_variable( + other, fill_value, dtype, chunks, chunked_array_type, from_array_kwargs + ) else: raise TypeError("Expected DataArray, Dataset, or Variable") def _full_like_variable( - other: Variable, fill_value: Any, dtype: DTypeLike = None + other: Variable, + fill_value: Any, + dtype: DTypeLike | None = None, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Variable: """Inner function of full_like, where other must be a variable""" from xarray.core.variable import Variable @@ -1594,13 +1696,28 @@ def _full_like_variable( if fill_value is dtypes.NA: fill_value = dtypes.get_fill_value(dtype if dtype is not None else other.dtype) - if is_duck_dask_array(other.data): - import dask.array + if ( + is_chunked_array(other.data) + or chunked_array_type is not None + or chunks is not None + ): + if chunked_array_type is None: + chunkmanager = get_chunked_array_type(other.data) + else: + chunkmanager = guess_chunkmanager(chunked_array_type) if dtype is None: dtype = other.dtype - data = dask.array.full( - other.shape, fill_value, dtype=dtype, chunks=other.data.chunks + + if from_array_kwargs is None: + from_array_kwargs = {} + + data = chunkmanager.array_api.full( + other.shape, + fill_value, + dtype=dtype, + chunks=chunks if 
chunks else other.data.chunks, + **from_array_kwargs, ) else: data = np.full_like(other.data, fill_value, dtype=dtype) @@ -1609,36 +1726,72 @@ def _full_like_variable( @overload -def zeros_like(other: DataArray, dtype: DTypeLikeSave = None) -> DataArray: +def zeros_like( + other: DataArray, + dtype: DTypeLikeSave | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, +) -> DataArray: ... @overload -def zeros_like(other: Dataset, dtype: DTypeMaybeMapping = None) -> Dataset: +def zeros_like( + other: Dataset, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, +) -> Dataset: ... @overload -def zeros_like(other: Variable, dtype: DTypeLikeSave = None) -> Variable: +def zeros_like( + other: Variable, + dtype: DTypeLikeSave | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, +) -> Variable: ... @overload def zeros_like( - other: Dataset | DataArray, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @overload def zeros_like( - other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray | Variable, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... def zeros_like( - other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray | Variable, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object of zeros with the same shape and type as a given dataarray or dataset. @@ -1649,6 +1802,18 @@ def zeros_like( The reference object. The output will have the same dimensions and coordinates as this object. dtype : dtype, optional dtype of the new array. If omitted, it defaults to other.dtype. + chunks : int, "auto", tuple of int or mapping of Hashable to int, optional + Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or + ``{"x": 5, "y": 5}``. + chunked_array_type: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict, optional + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
Returns ------- @@ -1692,40 +1857,83 @@ def zeros_like( full_like """ - return full_like(other, 0, dtype) + return full_like( + other, + 0, + dtype, + chunks=chunks, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, + ) @overload -def ones_like(other: DataArray, dtype: DTypeLikeSave = None) -> DataArray: +def ones_like( + other: DataArray, + dtype: DTypeLikeSave | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, +) -> DataArray: ... @overload -def ones_like(other: Dataset, dtype: DTypeMaybeMapping = None) -> Dataset: +def ones_like( + other: Dataset, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, +) -> Dataset: ... @overload -def ones_like(other: Variable, dtype: DTypeLikeSave = None) -> Variable: +def ones_like( + other: Variable, + dtype: DTypeLikeSave | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, +) -> Variable: ... @overload def ones_like( - other: Dataset | DataArray, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray: ... @overload def ones_like( - other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray | Variable, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: ... def ones_like( - other: Dataset | DataArray | Variable, dtype: DTypeMaybeMapping = None + other: Dataset | DataArray | Variable, + dtype: DTypeMaybeMapping | None = None, + *, + chunks: T_Chunks = None, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, ) -> Dataset | DataArray | Variable: """Return a new object of ones with the same shape and type as a given dataarray or dataset. @@ -1736,6 +1944,18 @@ def ones_like( The reference object. The output will have the same dimensions and coordinates as this object. dtype : dtype, optional dtype of the new array. If omitted, it defaults to other.dtype. + chunks : int, "auto", tuple of int or mapping of Hashable to int, optional + Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, ``(5, 5)`` or + ``{"x": 5, "y": 5}``. + chunked_array_type: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict, optional + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. 
Returns ------- @@ -1771,7 +1991,14 @@ def ones_like( full_like """ - return full_like(other, 1, dtype) + return full_like( + other, + 1, + dtype, + chunks=chunks, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, + ) def get_chunksizes( diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 356f1029192..685307fc8c3 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -20,7 +20,8 @@ from xarray.core.indexes import Index, filter_indexes_from_coords from xarray.core.merge import merge_attrs, merge_coordinates_without_align from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type +from xarray.core.pycompat import is_chunked_array, is_duck_dask_array from xarray.core.types import Dims, T_DataArray from xarray.core.utils import is_dict_like, is_scalar from xarray.core.variable import Variable @@ -675,16 +676,18 @@ def apply_variable_ufunc( for arg, core_dims in zip(args, signature.input_core_dims) ] - if any(is_duck_dask_array(array) for array in input_data): + if any(is_chunked_array(array) for array in input_data): if dask == "forbidden": raise ValueError( - "apply_ufunc encountered a dask array on an " - "argument, but handling for dask arrays has not " + "apply_ufunc encountered a chunked array on an " + "argument, but handling for chunked arrays has not " "been enabled. Either set the ``dask`` argument " "or load your data into memory first with " "``.load()`` or ``.compute()``" ) elif dask == "parallelized": + chunkmanager = get_chunked_array_type(*input_data) + numpy_func = func if dask_gufunc_kwargs is None: @@ -697,7 +700,7 @@ def apply_variable_ufunc( for n, (data, core_dims) in enumerate( zip(input_data, signature.input_core_dims) ): - if is_duck_dask_array(data): + if is_chunked_array(data): # core dimensions cannot span multiple chunks for axis, dim in enumerate(core_dims, start=-len(core_dims)): if len(data.chunks[axis]) != 1: @@ -705,7 +708,7 @@ def apply_variable_ufunc( f"dimension {dim} on {n}th function argument to " "apply_ufunc with dask='parallelized' consists of " "multiple chunks, but is also a core dimension. To " - "fix, either rechunk into a single dask array chunk along " + "fix, either rechunk into a single array chunk along " f"this dimension, i.e., ``.chunk(dict({dim}=-1))``, or " "pass ``allow_rechunk=True`` in ``dask_gufunc_kwargs`` " "but beware that this may significantly increase memory usage." 
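To make the rechunking hint in the error message above concrete, here is a short, editor-added sketch (not part of the diff) of ``apply_ufunc`` with ``dask="parallelized"`` on a chunked array; the function and dimension names are illustrative and it assumes dask is installed:

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.random.rand(4, 6), dims=("x", "time")).chunk({"time": 3})

    # "time" is used as a core dimension below, so it must not span multiple
    # chunks; rechunking with -1 collapses it into a single chunk, as the
    # error message above recommends.
    da = da.chunk({"time": -1})

    def mean_over_last_axis(arr, axis=-1):
        # core dimensions are moved to the end, so reduce over the last axis
        return arr.mean(axis=axis)

    result = xr.apply_ufunc(
        mean_over_last_axis,
        da,
        input_core_dims=[["time"]],
        dask="parallelized",
        output_dtypes=[da.dtype],
    )
    result.compute()
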
@@ -732,9 +735,7 @@ def apply_variable_ufunc( ) def func(*arrays): - import dask.array as da - - res = da.apply_gufunc( + res = chunkmanager.apply_gufunc( numpy_func, signature.to_gufunc_string(exclude_dims), *arrays, @@ -749,8 +750,7 @@ def func(*arrays): pass else: raise ValueError( - "unknown setting for dask array handling in " - "apply_ufunc: {}".format(dask) + "unknown setting for chunked array handling in " f"apply_ufunc: {dask}" ) else: if vectorize: @@ -812,7 +812,7 @@ def func(*arrays): def apply_array_ufunc(func, *args, dask="forbidden"): """Apply a ndarray level function over ndarray objects.""" - if any(is_duck_dask_array(arg) for arg in args): + if any(is_chunked_array(arg) for arg in args): if dask == "forbidden": raise ValueError( "apply_ufunc encountered a dask array on an " @@ -2013,7 +2013,7 @@ def to_floatable(x: DataArray) -> DataArray: ) elif x.dtype.kind == "m": # timedeltas - return x.astype(float) + return duck_array_ops.astype(x, dtype=float) return x if isinstance(data, Dataset): @@ -2061,12 +2061,11 @@ def _calc_idxminmax( # This will run argmin or argmax. indx = func(array, dim=dim, axis=None, keep_attrs=keep_attrs, skipna=skipna) - # Handle dask arrays. - if is_duck_dask_array(array.data): - import dask.array - + # Handle chunked arrays (e.g. dask). + if is_chunked_array(array.data): + chunkmanager = get_chunked_array_type(array.data) chunks = dict(zip(array.dims, array.chunks)) - dask_coord = dask.array.from_array(array[dim].data, chunks=chunks[dim]) + dask_coord = chunkmanager.from_array(array[dim].data, chunks=chunks[dim]) res = indx.copy(data=dask_coord[indx.data.ravel()].reshape(indx.shape)) # we need to attach back the dim name res.name = dim @@ -2153,16 +2152,14 @@ def unify_chunks(*objects: Dataset | DataArray) -> tuple[Dataset | DataArray, .. 
if not unify_chunks_args: return objects - # Run dask.array.core.unify_chunks - from dask.array.core import unify_chunks - - _, dask_data = unify_chunks(*unify_chunks_args) - dask_data_iter = iter(dask_data) + chunkmanager = get_chunked_array_type(*[arg for arg in unify_chunks_args]) + _, chunked_data = chunkmanager.unify_chunks(*unify_chunks_args) + chunked_data_iter = iter(chunked_data) out: list[Dataset | DataArray] = [] for obj, ds in zip(objects, datasets): for k, v in ds._variables.items(): if v.chunks is not None: - ds._variables[k] = v.copy(data=next(dask_data_iter)) + ds._variables[k] = v.copy(data=next(chunked_data_iter)) out.append(obj._from_temp_dataset(ds) if isinstance(obj, DataArray) else ds) return tuple(out) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index dcf2a23d311..d7aad8c7188 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -3,6 +3,7 @@ from collections.abc import Hashable, Iterable from typing import TYPE_CHECKING, Any, Union, cast, overload +import numpy as np import pandas as pd from xarray.core import dtypes, utils @@ -517,7 +518,7 @@ def _dataset_concat( if variables_to_merge: grouped = { k: v - for k, v in collect_variables_and_indexes(list(datasets)).items() + for k, v in collect_variables_and_indexes(datasets).items() if k in variables_to_merge } merged_vars, merged_indexes = merge_collected( @@ -543,7 +544,7 @@ def ensure_common_dims(vars, concat_dim_lengths): # ensure each variable with the given name shares the same # dimensions and the same shape for all of them except along the # concat dimension - common_dims = tuple(pd.unique([d for v in vars for d in v.dims])) + common_dims = tuple(utils.OrderedSet(d for v in vars for d in v.dims)) if dim not in common_dims: common_dims = (dim,) + common_dims for var, dim_len in zip(vars, concat_dim_lengths): @@ -568,38 +569,45 @@ def get_indexes(name): yield PandasIndex(data, dim, coord_dtype=var.dtype) # create concatenation index, needed for later reindexing - concat_index = list(range(sum(concat_dim_lengths))) + file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths)) + concat_index = np.arange(file_start_indexes[-1]) + concat_index_size = concat_index.size + variable_index_mask = np.ones(concat_index_size, dtype=bool) # stack up each variable and/or index to fill-out the dataset (in order) # n.b. this loop preserves variable order, needed for groupby. + ndatasets = len(datasets) for name in vars_order: if name in concat_over and name not in result_indexes: variables = [] - variable_index = [] + # Initialize the mask to all True then set False if any name is missing in + # the datasets: + variable_index_mask.fill(True) var_concat_dim_length = [] for i, ds in enumerate(datasets): if name in ds.variables: variables.append(ds[name].variable) - # add to variable index, needed for reindexing - var_idx = [ - sum(concat_dim_lengths[:i]) + k - for k in range(concat_dim_lengths[i]) - ] - variable_index.extend(var_idx) - var_concat_dim_length.append(len(var_idx)) + var_concat_dim_length.append(concat_dim_lengths[i]) else: # raise if coordinate not in all datasets if name in coord_names: raise ValueError( f"coordinate {name!r} not present in all datasets." 
) + + # Mask out the indexes without the name: + start = file_start_indexes[i] + end = file_start_indexes[i + 1] + variable_index_mask[slice(start, end)] = False + + variable_index = concat_index[variable_index_mask] vars = ensure_common_dims(variables, var_concat_dim_length) # Try to concatenate the indexes, concatenate the variables when no index # is found on all datasets. indexes: list[Index] = list(get_indexes(name)) if indexes: - if len(indexes) < len(datasets): + if len(indexes) < ndatasets: raise ValueError( f"{name!r} must have either an index or no index in all datasets, " f"found {len(indexes)}/{len(datasets)} datasets with an index." @@ -623,7 +631,7 @@ def get_indexes(name): vars, dim, positions, combine_attrs=combine_attrs ) # reindex if variable is not present in all datasets - if len(variable_index) < len(concat_index): + if len(variable_index) < concat_index_size: combined_var = reindex_variables( variables={name: combined_var}, dim_pos_indexers={ diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 32809a54ddd..f03d98f781a 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -3,49 +3,53 @@ import warnings from collections.abc import Hashable, Iterator, Mapping, Sequence from contextlib import contextmanager -from typing import TYPE_CHECKING, Any +from typing import ( + TYPE_CHECKING, + Any, + Generic, + cast, +) import numpy as np import pandas as pd from xarray.core import formatting +from xarray.core.alignment import Aligner from xarray.core.indexes import ( Index, Indexes, PandasMultiIndex, assert_no_index_corrupted, + create_default_index_implicit, ) from xarray.core.merge import merge_coordinates_without_align, merge_coords +from xarray.core.types import Self, T_DataArray from xarray.core.utils import Frozen, ReprObject -from xarray.core.variable import Variable, calculate_dimensions +from xarray.core.variable import Variable, as_variable, calculate_dimensions if TYPE_CHECKING: from xarray.core.common import DataWithCoords from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset - from xarray.core.types import T_DataArray # Used as the key corresponding to a DataArray's variable when converting # arbitrary DataArray objects to datasets _THIS_ARRAY = ReprObject("") -class Coordinates(Mapping[Hashable, "T_DataArray"]): +class AbstractCoordinates(Mapping[Hashable, "T_DataArray"]): _data: DataWithCoords __slots__ = ("_data",) def __getitem__(self, key: Hashable) -> T_DataArray: raise NotImplementedError() - def __setitem__(self, key: Hashable, value: Any) -> None: - self.update({key: value}) - @property def _names(self) -> set[Hashable]: raise NotImplementedError() @property - def dims(self) -> Mapping[Hashable, int] | tuple[Hashable, ...]: + def dims(self) -> Frozen[Hashable, int] | tuple[Hashable, ...]: raise NotImplementedError() @property @@ -54,10 +58,22 @@ def dtypes(self) -> Frozen[Hashable, np.dtype]: @property def indexes(self) -> Indexes[pd.Index]: + """Mapping of pandas.Index objects used for label based indexing. + + Raises an error if this Coordinates object has indexes that cannot + be coerced to pandas.Index objects. + + See Also + -------- + Coordinates.xindexes + """ return self._data.indexes @property def xindexes(self) -> Indexes[Index]: + """Mapping of :py:class:`~xarray.indexes.Index` objects + used for label based indexing. 
+ """ return self._data.xindexes @property @@ -125,7 +141,7 @@ def to_index(self, ordered_dims: Sequence[Hashable] | None = None) -> pd.Index: index_lengths = np.fromiter( (len(index) for index in indexes), dtype=np.intp ) - cumprod_lengths = np.cumproduct(index_lengths) + cumprod_lengths = np.cumprod(index_lengths) if cumprod_lengths[-1] == 0: # if any factor is empty, the cartesian product is empty @@ -163,13 +179,209 @@ def to_index(self, ordered_dims: Sequence[Hashable] | None = None) -> pd.Index: return pd.MultiIndex(level_list, code_list, names=names) - def update(self, other: Mapping[Any, Any]) -> None: - other_vars = getattr(other, "variables", other) - self._maybe_drop_multiindex_coords(set(other_vars)) - coords, indexes = merge_coords( - [self.variables, other_vars], priority_arg=1, indexes=self.xindexes + +class Coordinates(AbstractCoordinates): + """Dictionary like container for Xarray coordinates (variables + indexes). + + This collection is a mapping of coordinate names to + :py:class:`~xarray.DataArray` objects. + + It can be passed directly to the :py:class:`~xarray.Dataset` and + :py:class:`~xarray.DataArray` constructors via their `coords` argument. This + will add both the coordinates variables and their index. + + Coordinates are either: + + - returned via the :py:attr:`Dataset.coords` and :py:attr:`DataArray.coords` + properties. + - built from index objects (e.g., :py:meth:`Coordinates.from_pandas_multiindex`). + - built directly from coordinate data and index objects (beware that no consistency + check is done on those inputs). + + In the latter case, no default (pandas) index is created. + + Parameters + ---------- + coords: dict-like + Mapping where keys are coordinate names and values are objects that + can be converted into a :py:class:`~xarray.Variable` object + (see :py:func:`~xarray.as_variable`). + indexes: dict-like + Mapping of where keys are coordinate names and values are + :py:class:`~xarray.indexes.Index` objects. + + """ + + _data: DataWithCoords + + __slots__ = ("_data",) + + def __init__( + self, + coords: Mapping[Any, Any] | None = None, + indexes: Mapping[Any, Index] | None = None, + ) -> None: + # When coordinates are constructed directly, an internal Dataset is + # created so that it is compatible with the DatasetCoordinates and + # DataArrayCoordinates classes serving as a proxy for the data. + # TODO: refactor DataArray / Dataset so that Coordinates store the data. 
+ from xarray.core.dataset import Dataset + + if coords is None: + variables = {} + elif isinstance(coords, Coordinates): + variables = {k: v.copy() for k, v in coords.variables.items()} + else: + variables = {k: as_variable(v) for k, v in coords.items()} + + if indexes is None: + indexes = {} + else: + indexes = dict(indexes) + + no_coord_index = set(indexes) - set(variables) + if no_coord_index: + raise ValueError( + f"no coordinate variables found for these indexes: {no_coord_index}" + ) + + for k, idx in indexes.items(): + if not isinstance(idx, Index): + raise TypeError(f"'{k}' is not an `xarray.indexes.Index` object") + + # maybe convert to base variable + for k, v in variables.items(): + if k not in indexes: + variables[k] = v.to_base_variable() + + self._data = Dataset._construct_direct( + coord_names=set(variables), variables=variables, indexes=indexes ) - self._update_coords(coords, indexes) + + @classmethod + def _construct_direct( + cls, + coords: dict[Any, Variable], + indexes: dict[Any, Index], + dims: dict[Any, int] | None = None, + ) -> Self: + from xarray.core.dataset import Dataset + + obj = object.__new__(cls) + obj._data = Dataset._construct_direct( + coord_names=set(coords), + variables=coords, + indexes=indexes, + dims=dims, + ) + return obj + + @classmethod + def from_pandas_multiindex(cls, midx: pd.MultiIndex, dim: str) -> Self: + """Wrap a pandas multi-index as Xarray coordinates (dimension + levels). + + The returned coordinates can be directly assigned to a + :py:class:`~xarray.Dataset` or :py:class:`~xarray.DataArray` via the + ``coords`` argument of their constructor. + + Parameters + ---------- + midx : :py:class:`pandas.MultiIndex` + Pandas multi-index object. + dim : str + Dimension name. + + Returns + ------- + coords : Coordinates + A collection of Xarray indexed coordinates created from the multi-index. + + """ + xr_idx = PandasMultiIndex(midx, dim) + + variables = xr_idx.create_variables() + indexes = {k: xr_idx for k in variables} + + return cls(coords=variables, indexes=indexes) + + @property + def _names(self) -> set[Hashable]: + return self._data._coord_names + + @property + def dims(self) -> Frozen[Hashable, int] | tuple[Hashable, ...]: + """Mapping from dimension names to lengths or tuple of dimension names.""" + return self._data.dims + + @property + def sizes(self) -> Frozen[Hashable, int]: + """Mapping from dimension names to lengths.""" + return self._data.sizes + + @property + def dtypes(self) -> Frozen[Hashable, np.dtype]: + """Mapping from coordinate names to dtypes. + + Cannot be modified directly. + + See Also + -------- + Dataset.dtypes + """ + return Frozen({n: v.dtype for n, v in self._data.variables.items()}) + + @property + def variables(self) -> Mapping[Hashable, Variable]: + """Low level interface to Coordinates contents as dict of Variable objects. + + This dictionary is frozen to prevent mutation. + """ + return self._data.variables + + def to_dataset(self) -> Dataset: + """Convert these coordinates into a new Dataset.""" + names = [name for name in self._data._variables if name in self._names] + return self._data._copy_listed(names) + + def __getitem__(self, key: Hashable) -> DataArray: + return self._data[key] + + def __delitem__(self, key: Hashable) -> None: + # redirect to DatasetCoordinates.__delitem__ + del self._data.coords[key] + + def equals(self, other: Coordinates) -> bool: + """Two Coordinates objects are equal if they have matching variables, + all of which are equal. 
+ + See Also + -------- + Coordinates.identical + """ + if not isinstance(other, Coordinates): + return False + return self.to_dataset().equals(other.to_dataset()) + + def identical(self, other: Coordinates) -> bool: + """Like equals, but also checks all variable attributes. + + See Also + -------- + Coordinates.equals + """ + if not isinstance(other, Coordinates): + return False + return self.to_dataset().identical(other.to_dataset()) + + def _update_coords( + self, coords: dict[Hashable, Variable], indexes: Mapping[Any, Index] + ) -> None: + # redirect to DatasetCoordinates._update_coords + self._data.coords._update_coords(coords, indexes) + + def _maybe_drop_multiindex_coords(self, coords: set[Hashable]) -> None: + # redirect to DatasetCoordinates._maybe_drop_multiindex_coords + self._data.coords._maybe_drop_multiindex_coords(coords) def _merge_raw(self, other, reflexive): """For use with binary arithmetic.""" @@ -200,7 +412,7 @@ def _merge_inplace(self, other): yield self._update_coords(variables, indexes) - def merge(self, other: Coordinates | None) -> Dataset: + def merge(self, other: Mapping[Any, Any] | None) -> Dataset: """Merge two sets of coordinates to create a new Dataset The method implements the logic used for joining coordinates in the @@ -214,8 +426,9 @@ def merge(self, other: Coordinates | None) -> Dataset: Parameters ---------- - other : DatasetCoordinates or DataArrayCoordinates - The coordinates from another dataset or data array. + other : dict-like, optional + A :py:class:`Coordinates` object or any mapping that can be turned + into coordinates. Returns ------- @@ -236,13 +449,92 @@ def merge(self, other: Coordinates | None) -> Dataset: variables=coords, coord_names=coord_names, indexes=indexes ) + def __setitem__(self, key: Hashable, value: Any) -> None: + self.update({key: value}) + + def update(self, other: Mapping[Any, Any]) -> None: + """Update this Coordinates variables with other coordinate variables.""" + other_obj: Coordinates | Mapping[Hashable, Variable] + + if isinstance(other, Coordinates): + # special case: default indexes won't be created + other_obj = other + else: + other_obj = getattr(other, "variables", other) + + self._maybe_drop_multiindex_coords(set(other_obj)) + + coords, indexes = merge_coords( + [self.variables, other_obj], + priority_arg=1, + indexes=self.xindexes, + ) + + self._update_coords(coords, indexes) + + def _overwrite_indexes( + self, + indexes: Mapping[Any, Index], + variables: Mapping[Any, Variable] | None = None, + ) -> Self: + results = self.to_dataset()._overwrite_indexes(indexes, variables) + + # TODO: remove cast once we get rid of DatasetCoordinates + # and DataArrayCoordinates (i.e., Dataset and DataArray encapsulate Coordinates) + return cast(Self, results.coords) + + def _reindex_callback( + self, + aligner: Aligner, + dim_pos_indexers: dict[Hashable, Any], + variables: dict[Hashable, Variable], + indexes: dict[Hashable, Index], + fill_value: Any, + exclude_dims: frozenset[Hashable], + exclude_vars: frozenset[Hashable], + ) -> Self: + """Callback called from ``Aligner`` to create a new reindexed Coordinate.""" + aligned = self.to_dataset()._reindex_callback( + aligner, + dim_pos_indexers, + variables, + indexes, + fill_value, + exclude_dims, + exclude_vars, + ) + + # TODO: remove cast once we get rid of DatasetCoordinates + # and DataArrayCoordinates (i.e., Dataset and DataArray encapsulate Coordinates) + return cast(Self, aligned.coords) + + def _ipython_key_completions_(self): + """Provide method for the 
key-autocompletions in IPython.""" + return self._data._ipython_key_completions_() + + def copy( + self, + deep: bool = False, + memo: dict[int, Any] | None = None, + ) -> Coordinates: + """Return a copy of this Coordinates object.""" + # do not copy indexes (may corrupt multi-coordinate indexes) + # TODO: disable variables deepcopy? it may also be problematic when they + # encapsulate index objects like pd.Index + variables = { + k: v._copy(deep=deep, memo=memo) for k, v in self.variables.items() + } + return Coordinates._construct_direct( + coords=variables, indexes=dict(self.xindexes), dims=dict(self.sizes) + ) + class DatasetCoordinates(Coordinates): - """Dictionary like container for Dataset coordinates. + """Dictionary like container for Dataset coordinates (variables + indexes). - Essentially an immutable dictionary with keys given by the array's - dimensions and the values given by the corresponding xarray.Coordinate - objects. + This collection can be passed directly to the :py:class:`~xarray.Dataset` + and :py:class:`~xarray.DataArray` constructors via their `coords` argument. + This will add both the coordinates variables and their index. """ _data: Dataset @@ -257,7 +549,7 @@ def _names(self) -> set[Hashable]: return self._data._coord_names @property - def dims(self) -> Mapping[Hashable, int]: + def dims(self) -> Frozen[Hashable, int]: return self._data.dims @property @@ -343,11 +635,12 @@ def _ipython_key_completions_(self): ] -class DataArrayCoordinates(Coordinates["T_DataArray"]): - """Dictionary like container for DataArray coordinates. +class DataArrayCoordinates(Coordinates, Generic[T_DataArray]): + """Dictionary like container for DataArray coordinates (variables + indexes). - Essentially a dict with keys given by the array's - dimensions and the values given by corresponding DataArray objects. + This collection can be passed directly to the :py:class:`~xarray.Dataset` + and :py:class:`~xarray.DataArray` constructors via their `coords` argument. + This will add both the coordinates variables and their index. """ _data: T_DataArray @@ -477,3 +770,77 @@ def assert_coordinate_consistent( f"dimension coordinate {k!r} conflicts between " f"indexed and indexing objects:\n{obj[k]}\nvs.\n{coords[k]}" ) + + +def create_coords_with_default_indexes( + coords: Mapping[Any, Any], data_vars: Mapping[Any, Variable] | None = None +) -> Coordinates: + """Maybe create default indexes from a mapping of coordinates.""" + + # Note: data_vars are needed here only because a pd.MultiIndex object + # can be promoted as coordinates. + # TODO: It won't be relevant anymore when this behavior will be dropped + # in favor of the more explicit ``Coordinates.from_pandas_multiindex()``. + + from xarray.core.dataarray import DataArray + + all_variables = dict(coords) + if data_vars is not None: + all_variables.update(data_vars) + + indexes: dict[Hashable, Index] = {} + variables: dict[Hashable, Variable] = {} + + maybe_index_vars: dict[Hashable, Variable] = {} + mindex_data_vars: list[Hashable] = [] + + for k, v in all_variables.items(): + if k in coords: + maybe_index_vars[k] = v + elif isinstance(v, pd.MultiIndex): + # TODO: eventually stop promoting multi-index passed via data variables + mindex_data_vars.append(k) + maybe_index_vars[k] = v + + if mindex_data_vars: + warnings.warn( + f"passing one or more `pandas.MultiIndex` via data variable(s) {mindex_data_vars} " + "will no longer create indexed coordinates in the future. 
" + "If you want to keep this behavior, pass it as coordinates instead.", + FutureWarning, + ) + + maybe_index_vars = { + k: v + for k, v in all_variables.items() + if k in coords or isinstance(v, pd.MultiIndex) + } + + dataarray_coords: list[DataArrayCoordinates] = [] + + for name, obj in maybe_index_vars.items(): + if isinstance(obj, DataArray): + dataarray_coords.append(obj.coords) + + variable = as_variable(obj, name=name) + + if variable.dims == (name,): + idx, idx_vars = create_default_index_implicit(variable, all_variables) + indexes.update({k: idx for k in idx_vars}) + variables.update(idx_vars) + all_variables.update(idx_vars) + else: + variables[name] = variable + + new_coords = Coordinates._construct_direct(coords=variables, indexes=indexes) + + # extract and merge coordinates and indexes from input DataArrays + if dataarray_coords: + prioritized = {k: (v, indexes.get(k, None)) for k, v in variables.items()} + variables, indexes = merge_coordinates_without_align( + dataarray_coords + [new_coords], + prioritized=prioritized, + ) + new_coords = Coordinates._construct_direct(coords=variables, indexes=indexes) + + return new_coords diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 24c5f698a27..d2d3e4a6d1c 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -1,9 +1,5 @@ from __future__ import annotations -from functools import partial - -from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] - from xarray.core import dtypes, nputils @@ -96,36 +92,3 @@ def _fill_with_last_one(a, b): axis=axis, dtype=array.dtype, ) - - -def _first_last_wrapper(array, *, axis, op, keepdims): - return op(array, axis, keepdims=keepdims) - - -def _first_or_last(darray, axis, op): - import dask.array - - # This will raise the same error message seen for numpy - axis = normalize_axis_index(axis, darray.ndim) - - wrapped_op = partial(_first_last_wrapper, op=op) - return dask.array.reduction( - darray, - chunk=wrapped_op, - aggregate=wrapped_op, - axis=axis, - dtype=darray.dtype, - keepdims=False, # match numpy version - ) - - -def nanfirst(darray, axis): - from xarray.core.duck_array_ops import nanfirst - - return _first_or_last(darray, axis, op=nanfirst) - - -def nanlast(darray, axis): - from xarray.core.duck_array_ops import nanlast - - return _first_or_last(darray, axis, op=nanlast) diff --git a/xarray/core/daskmanager.py b/xarray/core/daskmanager.py new file mode 100644 index 00000000000..56d8dc9e23a --- /dev/null +++ b/xarray/core/daskmanager.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +from collections.abc import Iterable, Sequence +from typing import TYPE_CHECKING, Any, Callable + +import numpy as np +from packaging.version import Version + +from xarray.core.duck_array_ops import dask_available +from xarray.core.indexing import ImplicitToExplicitIndexingAdapter +from xarray.core.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray +from xarray.core.pycompat import is_duck_dask_array + +if TYPE_CHECKING: + from xarray.core.types import DaskArray, T_Chunks, T_NormalizedChunks + + +class DaskManager(ChunkManagerEntrypoint["DaskArray"]): + array_cls: type[DaskArray] + available: bool = dask_available + + def __init__(self) -> None: + # TODO can we replace this with a class attribute instead? 
+ + from dask.array import Array + + self.array_cls = Array + + def is_chunked_array(self, data: Any) -> bool: + return is_duck_dask_array(data) + + def chunks(self, data: DaskArray) -> T_NormalizedChunks: + return data.chunks + + def normalize_chunks( + self, + chunks: T_Chunks | T_NormalizedChunks, + shape: tuple[int, ...] | None = None, + limit: int | None = None, + dtype: np.dtype | None = None, + previous_chunks: T_NormalizedChunks | None = None, + ) -> T_NormalizedChunks: + """Called by open_dataset""" + from dask.array.core import normalize_chunks + + return normalize_chunks( + chunks, + shape=shape, + limit=limit, + dtype=dtype, + previous_chunks=previous_chunks, + ) + + def from_array(self, data: Any, chunks, **kwargs) -> DaskArray: + import dask.array as da + + if isinstance(data, ImplicitToExplicitIndexingAdapter): + # lazily loaded backend array classes should use NumPy array operations. + kwargs["meta"] = np.ndarray + + return da.from_array( + data, + chunks, + **kwargs, + ) + + def compute(self, *data: DaskArray, **kwargs) -> tuple[np.ndarray, ...]: + from dask.array import compute + + return compute(*data, **kwargs) + + @property + def array_api(self) -> Any: + from dask import array as da + + return da + + def reduction( + self, + arr: T_ChunkedArray, + func: Callable, + combine_func: Callable | None = None, + aggregate_func: Callable | None = None, + axis: int | Sequence[int] | None = None, + dtype: np.dtype | None = None, + keepdims: bool = False, + ) -> T_ChunkedArray: + from dask.array import reduction + + return reduction( + arr, + chunk=func, + combine=combine_func, + aggregate=aggregate_func, + axis=axis, + dtype=dtype, + keepdims=keepdims, + ) + + def apply_gufunc( + self, + func: Callable, + signature: str, + *args: Any, + axes: Sequence[tuple[int, ...]] | None = None, + axis: int | None = None, + keepdims: bool = False, + output_dtypes: Sequence[np.typing.DTypeLike] | None = None, + output_sizes: dict[str, int] | None = None, + vectorize: bool | None = None, + allow_rechunk: bool = False, + meta: tuple[np.ndarray, ...] | None = None, + **kwargs, + ): + from dask.array.gufunc import apply_gufunc + + return apply_gufunc( + func, + signature, + *args, + axes=axes, + axis=axis, + keepdims=keepdims, + output_dtypes=output_dtypes, + output_sizes=output_sizes, + vectorize=vectorize, + allow_rechunk=allow_rechunk, + meta=meta, + **kwargs, + ) + + def map_blocks( + self, + func: Callable, + *args: Any, + dtype: np.typing.DTypeLike | None = None, + chunks: tuple[int, ...] 
| None = None, + drop_axis: int | Sequence[int] | None = None, + new_axis: int | Sequence[int] | None = None, + **kwargs, + ): + import dask + from dask.array import map_blocks + + if drop_axis is None and Version(dask.__version__) < Version("2022.9.1"): + # See https://github.com/pydata/xarray/pull/7019#discussion_r1196729489 + # TODO remove once dask minimum version >= 2022.9.1 + drop_axis = [] + + # pass through name, meta, token as kwargs + return map_blocks( + func, + *args, + dtype=dtype, + chunks=chunks, + drop_axis=drop_axis, + new_axis=new_axis, + **kwargs, + ) + + def blockwise( + self, + func: Callable, + out_ind: Iterable, + *args: Any, + # can't type this as mypy assumes args are all same type, but dask blockwise args alternate types + name: str | None = None, + token=None, + dtype: np.dtype | None = None, + adjust_chunks: dict[Any, Callable] | None = None, + new_axes: dict[Any, int] | None = None, + align_arrays: bool = True, + concatenate: bool | None = None, + meta=None, + **kwargs, + ): + from dask.array import blockwise + + return blockwise( + func, + out_ind, + *args, + name=name, + token=token, + dtype=dtype, + adjust_chunks=adjust_chunks, + new_axes=new_axes, + align_arrays=align_arrays, + concatenate=concatenate, + meta=meta, + **kwargs, + ) + + def unify_chunks( + self, + *args: Any, # can't type this as mypy assumes args are all same type, but dask unify_chunks args alternate types + **kwargs, + ) -> tuple[dict[str, T_NormalizedChunks], list[DaskArray]]: + from dask.array.core import unify_chunks + + return unify_chunks(*args, **kwargs) + + def store( + self, + sources: DaskArray | Sequence[DaskArray], + targets: Any, + **kwargs, + ): + from dask.array import store + + return store( + sources=sources, + targets=targets, + **kwargs, + ) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 2f663c4936a..df57ad898e4 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -23,7 +23,12 @@ from xarray.core.arithmetic import DataArrayArithmetic from xarray.core.common import AbstractArray, DataWithCoords, get_chunksizes from xarray.core.computation import unify_chunks -from xarray.core.coordinates import DataArrayCoordinates, assert_coordinate_consistent +from xarray.core.coordinates import ( + Coordinates, + DataArrayCoordinates, + assert_coordinate_consistent, + create_coords_with_default_indexes, +) from xarray.core.dataset import Dataset from xarray.core.formatting import format_item from xarray.core.indexes import ( @@ -34,7 +39,7 @@ isel_indexes, ) from xarray.core.indexing import is_fancy_indexer, map_index_queries -from xarray.core.merge import PANDAS_TYPES, MergeError, _create_indexes_from_coords +from xarray.core.merge import PANDAS_TYPES, MergeError from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.utils import ( Default, @@ -42,6 +47,7 @@ ReprObject, _default, either_dict_or_kwargs, + emit_user_level_warning, ) from xarray.core.variable import ( IndexVariable, @@ -77,6 +83,7 @@ from xarray.backends import ZarrStore from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes from xarray.core.groupby import DataArrayGroupBy + from xarray.core.parallelcompat import ChunkManagerEntrypoint from xarray.core.resample import DataArrayResample from xarray.core.rolling import DataArrayCoarsen, DataArrayRolling from xarray.core.types import ( @@ -102,9 +109,35 @@ T_XarrayOther = TypeVar("T_XarrayOther", bound=Union["DataArray", Dataset]) +def _check_coords_dims(shape, coords, dims): + sizes = dict(zip(dims, 
shape)) + for k, v in coords.items(): + if any(d not in dims for d in v.dims): + raise ValueError( + f"coordinate {k} has dimensions {v.dims}, but these " + "are not a subset of the DataArray " + f"dimensions {dims}" + ) + + for d, s in zip(v.dims, v.shape): + if s != sizes[d]: + raise ValueError( + f"conflicting sizes for dimension {d!r}: " + f"length {sizes[d]} on the data but length {s} on " + f"coordinate {k!r}" + ) + + if k in sizes and v.shape != (sizes[k],): + raise ValueError( + f"coordinate {k!r} is a DataArray dimension, but " + f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} " + "matching the dimension size" + ) + + def _infer_coords_and_dims( shape, coords, dims -) -> tuple[dict[Hashable, Variable], tuple[Hashable, ...]]: +) -> tuple[Mapping[Hashable, Any], tuple[Hashable, ...]]: """All the logic for creating a new DataArray""" if ( @@ -142,40 +175,22 @@ def _infer_coords_and_dims( if not isinstance(d, str): raise TypeError(f"dimension {d} is not a string") - new_coords: dict[Hashable, Variable] = {} - - if utils.is_dict_like(coords): - for k, v in coords.items(): - new_coords[k] = as_variable(v, name=k) - elif coords is not None: - for dim, coord in zip(dims, coords): - var = as_variable(coord, name=dim) - var.dims = (dim,) - new_coords[dim] = var.to_index_variable() - - sizes = dict(zip(dims, shape)) - for k, v in new_coords.items(): - if any(d not in dims for d in v.dims): - raise ValueError( - f"coordinate {k} has dimensions {v.dims}, but these " - "are not a subset of the DataArray " - f"dimensions {dims}" - ) + new_coords: Mapping[Hashable, Any] - for d, s in zip(v.dims, v.shape): - if s != sizes[d]: - raise ValueError( - f"conflicting sizes for dimension {d!r}: " - f"length {sizes[d]} on the data but length {s} on " - f"coordinate {k!r}" - ) + if isinstance(coords, Coordinates): + new_coords = coords + else: + new_coords = {} + if utils.is_dict_like(coords): + for k, v in coords.items(): + new_coords[k] = as_variable(v, name=k) + elif coords is not None: + for dim, coord in zip(dims, coords): + var = as_variable(coord, name=dim) + var.dims = (dim,) + new_coords[dim] = var.to_index_variable() - if k in sizes and v.shape != (sizes[k],): - raise ValueError( - f"coordinate {k!r} is a DataArray dimension, but " - f"it has shape {v.shape!r} rather than expected shape {sizes[k]!r} " - "matching the dimension size" - ) + _check_coords_dims(shape, new_coords, dims) return new_coords, dims @@ -264,7 +279,7 @@ class DataArray( or pandas object, attempts are made to use this array's metadata to fill in other unspecified arguments. A view of the array's data is used instead of a copy if possible. - coords : sequence or dict of array_like, optional + coords : sequence or dict of array_like or :py:class:`~xarray.Coordinates`, optional Coordinates (tick labels) to use for indexing along each dimension. The following notations are accepted: @@ -284,6 +299,10 @@ class DataArray( - mapping {coord name: (dimension name, array-like)} - mapping {coord name: (tuple of dimension names, array-like)} + Alternatively, a :py:class:`~xarray.Coordinates` object may be used in + order to explicitly pass indexes (e.g., a multi-index or any custom + Xarray index) or to bypass the creation of a default index for any + :term:`Dimension coordinate` included in that object. dims : Hashable or sequence of Hashable, optional Name(s) of the data dimension(s). 
Must be either a Hashable (only for 1D data) or a sequence of Hashables with length equal @@ -295,6 +314,11 @@ class DataArray( attrs : dict_like or None, optional Attributes to assign to the new instance. By default, an empty attribute dictionary is initialized. + indexes : py:class:`~xarray.Indexes` or dict-like, optional + For internal use only. For passing indexes objects to the + new DataArray, use the ``coords`` argument instead with a + :py:class:`~xarray.Coordinate` object (both coordinate variables + and indexes will be extracted from the latter). Examples -------- @@ -384,7 +408,7 @@ def __init__( name: Hashable | None = None, attrs: Mapping | None = None, # internal parameters - indexes: dict[Hashable, Index] | None = None, + indexes: Mapping[Any, Index] | None = None, fastpath: bool = False, ) -> None: if fastpath: @@ -393,10 +417,11 @@ def __init__( assert attrs is None assert indexes is not None else: - # TODO: (benbovy - explicit indexes) remove - # once it becomes part of the public interface if indexes is not None: - raise ValueError("Providing explicit indexes is not supported yet") + raise ValueError( + "Explicitly passing indexes via the `indexes` argument is not supported " + "when `fastpath=False`. Use the `coords` argument instead." + ) # try to fill in arguments from data if they weren't supplied if coords is None: @@ -420,17 +445,18 @@ def __init__( data = as_compatible_data(data) coords, dims = _infer_coords_and_dims(data.shape, coords, dims) variable = Variable(dims, data, attrs, fastpath=True) - indexes, coords = _create_indexes_from_coords(coords) + + if not isinstance(coords, Coordinates): + coords = create_coords_with_default_indexes(coords) + indexes = dict(coords.xindexes) + coords = {k: v.copy() for k, v in coords.variables.items()} # These fully describe a DataArray self._variable = variable assert isinstance(coords, dict) self._coords = coords self._name = name - - # TODO(shoyer): document this argument, once it becomes part of the - # public interface. - self._indexes = indexes + self._indexes = indexes # type: ignore[assignment] self._close = None @@ -498,7 +524,7 @@ def _replace_maybe_drop_dims( def _overwrite_indexes( self: T_DataArray, indexes: Mapping[Any, Index], - coords: Mapping[Any, Variable] | None = None, + variables: Mapping[Any, Variable] | None = None, drop_coords: list[Hashable] | None = None, rename_dims: Mapping[Any, Any] | None = None, ) -> T_DataArray: @@ -506,8 +532,8 @@ def _overwrite_indexes( if not indexes: return self - if coords is None: - coords = {} + if variables is None: + variables = {} if drop_coords is None: drop_coords = [] @@ -516,7 +542,7 @@ def _overwrite_indexes( new_indexes = dict(self._indexes) for name in indexes: - new_coords[name] = coords[name] + new_coords[name] = variables[name] new_indexes[name] = indexes[name] for name in drop_coords: @@ -904,12 +930,20 @@ def indexes(self) -> Indexes: @property def xindexes(self) -> Indexes: - """Mapping of xarray Index objects used for label based indexing.""" + """Mapping of :py:class:`~xarray.indexes.Index` objects + used for label based indexing. + """ return Indexes(self._indexes, {k: self._coords[k] for k in self._indexes}) @property def coords(self) -> DataArrayCoordinates: - """Dictionary-like container of coordinate arrays.""" + """Mapping of :py:class:`~xarray.DataArray` objects corresponding to + coordinate variables. 
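For illustration of the coordinate/index distinction documented above (a minimal sketch, not part of this patch; it assumes a recent xarray in which a default index is created only for dimension coordinates):

    import xarray as xr

    # "x" is a dimension coordinate, so it gets a default pandas-backed index;
    # "label" is a non-dimension coordinate and stays unindexed.
    da = xr.DataArray(
        [1, 2, 3],
        dims="x",
        coords={"x": [10, 20, 30], "label": ("x", ["a", "b", "c"])},
    )
    print(list(da.coords))    # ['x', 'label']
    print(list(da.xindexes))  # ['x']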
+ + See Also + -------- + Coordinates + """ return DataArrayCoordinates(self) @overload @@ -1264,6 +1298,8 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, + chunked_array_type: str | ChunkManagerEntrypoint | None = None, + from_array_kwargs=None, **chunks_kwargs: Any, ) -> T_DataArray: """Coerce this array's data into a dask arrays with the given chunks. @@ -1285,12 +1321,21 @@ def chunk( Prefix for the name of the new dask array. token : str, optional Token uniquely identifying this array. - lock : optional + lock : bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. - inline_array: optional + inline_array: bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. + chunked_array_type: str, optional + Which chunked array type to coerce the underlying data array to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict, optional + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided. @@ -1328,6 +1373,8 @@ def chunk( token=token, lock=lock, inline_array=inline_array, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, ) return self._from_temp_dataset(ds) @@ -1788,7 +1835,11 @@ def _reindex_callback( exclude_dims, exclude_vars, ) - return self._from_temp_dataset(reindexed) + + da = self._from_temp_dataset(reindexed) + da.encoding = self.encoding + + return da def reindex_like( self: T_DataArray, @@ -4317,16 +4368,44 @@ def from_series(cls, series: pd.Series, sparse: bool = False) -> DataArray: return result def to_cdms2(self) -> cdms2_Variable: - """Convert this array into a cdms2.Variable""" + """Convert this array into a cdms2.Variable + + .. deprecated:: 2023.06.0 + The `cdms2`_ library has been deprecated. Please consider using the + `xcdat`_ library instead. + + .. _cdms2: https://github.com/CDAT/cdms + .. _xcdat: https://github.com/xCDAT/xcdat + """ from xarray.convert import to_cdms2 + emit_user_level_warning( + "The cdms2 library has been deprecated." + " Please consider using the xcdat library instead.", + DeprecationWarning, + ) + return to_cdms2(self) @classmethod def from_cdms2(cls, variable: cdms2_Variable) -> DataArray: - """Convert a cdms2.Variable into an xarray.DataArray""" + """Convert a cdms2.Variable into an xarray.DataArray + + .. deprecated:: 2023.06.0 + The `cdms2`_ library has been deprecated. Please consider using the + `xcdat`_ library instead. + + .. _cdms2: https://github.com/CDAT/cdms + .. _xcdat: https://github.com/xCDAT/xcdat + """ from xarray.convert import from_cdms2 + emit_user_level_warning( + "The cdms2 library has been deprecated." 
+ " Please consider using the xcdat library instead.", + DeprecationWarning, + ) + return from_cdms2(variable) def to_iris(self) -> iris_Cube: @@ -4633,11 +4712,7 @@ def _title_for_slice(self, truncate: int = 50) -> str: for dim, coord in self.coords.items(): if coord.size == 1: one_dims.append( - "{dim} = {v}{unit}".format( - dim=dim, - v=format_item(coord.values), - unit=_get_units_from_attrs(coord), - ) + f"{dim} = {format_item(coord.values)}{_get_units_from_attrs(coord)}" ) title = ", ".join(one_dims) @@ -4961,15 +5036,15 @@ def quantile( desired quantile lies between two data points. The options sorted by their R type as summarized in the H&F paper [1]_ are: - 1. "inverted_cdf" (*) - 2. "averaged_inverted_cdf" (*) - 3. "closest_observation" (*) - 4. "interpolated_inverted_cdf" (*) - 5. "hazen" (*) - 6. "weibull" (*) + 1. "inverted_cdf" + 2. "averaged_inverted_cdf" + 3. "closest_observation" + 4. "interpolated_inverted_cdf" + 5. "hazen" + 6. "weibull" 7. "linear" (default) - 8. "median_unbiased" (*) - 9. "normal_unbiased" (*) + 8. "median_unbiased" + 9. "normal_unbiased" The first three methods are discontiuous. The following discontinuous variations of the default "linear" (7.) option are also available: @@ -4983,8 +5058,6 @@ def quantile( was previously called "interpolation", renamed in accordance with numpy version 1.22.0. - (*) These methods require numpy version 1.22 or newer. - keep_attrs : bool or None, optional If True, the dataset's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new @@ -5461,6 +5534,7 @@ def polyfit( numpy.polyfit numpy.polyval xarray.polyval + DataArray.curvefit """ return self._to_temp_dataset().polyfit( dim, deg, skipna=skipna, rcond=rcond, w=w, full=full, cov=cov @@ -6115,9 +6189,10 @@ def curvefit( func: Callable[..., Any], reduce_dims: Dims = None, skipna: bool = True, - p0: dict[str, Any] | None = None, - bounds: dict[str, Any] | None = None, + p0: dict[str, float | DataArray] | None = None, + bounds: dict[str, tuple[float | DataArray, float | DataArray]] | None = None, param_names: Sequence[str] | None = None, + errors: ErrorOptions = "raise", kwargs: dict[str, Any] | None = None, ) -> Dataset: """ @@ -6147,17 +6222,25 @@ def curvefit( Whether to skip missing values when fitting. Default is True. p0 : dict-like or None, optional Optional dictionary of parameter names to initial guesses passed to the - `curve_fit` `p0` arg. If none or only some parameters are passed, the rest will - be assigned initial values following the default scipy behavior. - bounds : dict-like or None, optional - Optional dictionary of parameter names to bounding values passed to the - `curve_fit` `bounds` arg. If none or only some parameters are passed, the rest - will be unbounded following the default scipy behavior. + `curve_fit` `p0` arg. If the values are DataArrays, they will be appropriately + broadcast to the coordinates of the array. If none or only some parameters are + passed, the rest will be assigned initial values following the default scipy + behavior. + bounds : dict-like, optional + Optional dictionary of parameter names to tuples of bounding values passed to the + `curve_fit` `bounds` arg. If any of the bounds are DataArrays, they will be + appropriately broadcast to the coordinates of the array. If none or only some + parameters are passed, the rest will be unbounded following the default scipy + behavior. 
param_names : sequence of Hashable or None, optional Sequence of names for the fittable parameters of `func`. If not supplied, this will be automatically determined by arguments of `func`. `param_names` should be manually supplied when fitting a function that takes a variable number of parameters. + errors : {"raise", "ignore"}, default: "raise" + If 'raise', any errors from the `scipy.optimize_curve_fit` optimization will + raise an exception. If 'ignore', the coefficients and covariances for the + coordinates where the fitting failed will be NaN. **kwargs : optional Additional keyword arguments to passed to scipy curve_fit. @@ -6171,6 +6254,86 @@ def curvefit( [var]_curvefit_covariance The covariance matrix of the coefficient estimates. + Examples + -------- + Generate some exponentially decaying data, where the decay constant and amplitude are + different for different values of the coordinate ``x``: + + >>> rng = np.random.default_rng(seed=0) + >>> def exp_decay(t, time_constant, amplitude): + ... return np.exp(-t / time_constant) * amplitude + ... + >>> t = np.linspace(0, 10, 11) + >>> da = xr.DataArray( + ... np.stack( + ... [ + ... exp_decay(t, 1, 0.1), + ... exp_decay(t, 2, 0.2), + ... exp_decay(t, 3, 0.3), + ... ] + ... ) + ... + rng.normal(size=(3, t.size)) * 0.01, + ... coords={"x": [0, 1, 2], "time": t}, + ... ) + >>> da + + array([[ 0.1012573 , 0.0354669 , 0.01993775, 0.00602771, -0.00352513, + 0.00428975, 0.01328788, 0.009562 , -0.00700381, -0.01264187, + -0.0062282 ], + [ 0.20041326, 0.09805582, 0.07138797, 0.03216692, 0.01974438, + 0.01097441, 0.00679441, 0.01015578, 0.01408826, 0.00093645, + 0.01501222], + [ 0.29334805, 0.21847449, 0.16305984, 0.11130396, 0.07164415, + 0.04744543, 0.03602333, 0.03129354, 0.01074885, 0.01284436, + 0.00910995]]) + Coordinates: + * x (x) int64 0 1 2 + * time (time) float64 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 + + Fit the exponential decay function to the data along the ``time`` dimension: + + >>> fit_result = da.curvefit("time", exp_decay) + >>> fit_result["curvefit_coefficients"].sel( + ... param="time_constant" + ... ) # doctest: +NUMBER + + array([1.0569203, 1.7354963, 2.9421577]) + Coordinates: + * x (x) int64 0 1 2 + param >> fit_result["curvefit_coefficients"].sel(param="amplitude") + + array([0.1005489 , 0.19631423, 0.30003579]) + Coordinates: + * x (x) int64 0 1 2 + param >> fit_result = da.curvefit( + ... "time", + ... exp_decay, + ... p0={ + ... "amplitude": 0.2, + ... "time_constant": xr.DataArray([1, 2, 3], coords=[da.x]), + ... }, + ... ) + >>> fit_result["curvefit_coefficients"].sel(param="time_constant") + + array([1.0569213 , 1.73550052, 2.94215733]) + Coordinates: + * x (x) int64 0 1 2 + param >> fit_result["curvefit_coefficients"].sel(param="amplitude") + + array([0.10054889, 0.1963141 , 0.3000358 ]) + Coordinates: + * x (x) int64 0 1 2 + param None: raise ValueError(msg % args) -def _get_chunk(var, chunks): +def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): """ Return map from each dim to chunk sizes, accounting for backend's preferred chunks. 
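The dask-specific call that the chunk manager abstracts over here behaves roughly as follows (a small illustration, not part of this patch; it assumes dask is installed):

    import dask.array as da

    # Expand a per-dimension chunk spec into explicit chunk tuples for a given shape.
    print(da.core.normalize_chunks(3, shape=(10,), dtype="f8"))
    # ((3, 3, 3, 1),)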
""" - import dask.array as da - if isinstance(var, IndexVariable): return {} dims = var.dims @@ -225,7 +238,8 @@ def _get_chunk(var, chunks): chunks.get(dim, None) or preferred_chunk_sizes for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape) ) - chunk_shape = da.core.normalize_chunks( + + chunk_shape = chunkmanager.normalize_chunks( chunk_shape, shape=shape, dtype=var.dtype, previous_chunks=preferred_chunk_shape ) @@ -242,7 +256,7 @@ def _get_chunk(var, chunks): # expresses the preferred chunks, the sequence sums to the size. preferred_stops = ( range(preferred_chunk_sizes, size, preferred_chunk_sizes) - if isinstance(preferred_chunk_sizes, Number) + if isinstance(preferred_chunk_sizes, int) else itertools.accumulate(preferred_chunk_sizes[:-1]) ) # Gather any stop indices of the specified chunks that are not a stop index @@ -253,7 +267,7 @@ def _get_chunk(var, chunks): ) if breaks: warnings.warn( - "The specified Dask chunks separate the stored chunks along " + "The specified chunks separate the stored chunks along " f'dimension "{dim}" starting at index {min(breaks)}. This could ' "degrade performance. Instead, consider rechunking after loading." ) @@ -270,18 +284,37 @@ def _maybe_chunk( name_prefix="xarray-", overwrite_encoded_chunks=False, inline_array=False, + chunked_array_type: str | ChunkManagerEntrypoint | None = None, + from_array_kwargs=None, ): - from dask.base import tokenize - if chunks is not None: chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} + if var.ndim: - # when rechunking by different amounts, make sure dask names change - # by provinding chunks as an input to tokenize. - # subtle bugs result otherwise. see GH3350 - token2 = tokenize(name, token if token else var._data, chunks) - name2 = f"{name_prefix}{name}-{token2}" - var = var.chunk(chunks, name=name2, lock=lock, inline_array=inline_array) + chunked_array_type = guess_chunkmanager( + chunked_array_type + ) # coerce string to ChunkManagerEntrypoint type + if isinstance(chunked_array_type, DaskManager): + from dask.base import tokenize + + # when rechunking by different amounts, make sure dask names change + # by providing chunks as an input to tokenize. + # subtle bugs result otherwise. see GH3350 + token2 = tokenize(name, token if token else var._data, chunks) + name2 = f"{name_prefix}{name}-{token2}" + + from_array_kwargs = utils.consolidate_dask_from_array_kwargs( + from_array_kwargs, + name=name2, + lock=lock, + inline_array=inline_array, + ) + + var = var.chunk( + chunks, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, + ) if overwrite_encoded_chunks and var.chunks is not None: var.encoding["chunks"] = tuple(x[0] for x in var.chunks) @@ -332,17 +365,24 @@ def _initialize_curvefit_params(params, p0, bounds, func_args): """Set initial guess and bounds for curvefit. 
Priority: 1) passed args 2) func signature 3) scipy defaults """ + from xarray.core.computation import where def _initialize_feasible(lb, ub): # Mimics functionality of scipy.optimize.minpack._initialize_feasible lb_finite = np.isfinite(lb) ub_finite = np.isfinite(ub) - p0 = np.nansum( - [ - 0.5 * (lb + ub) * int(lb_finite & ub_finite), - (lb + 1) * int(lb_finite & ~ub_finite), - (ub - 1) * int(~lb_finite & ub_finite), - ] + p0 = where( + lb_finite, + where( + ub_finite, + 0.5 * (lb + ub), # both bounds finite + lb + 1, # lower bound finite, upper infinite + ), + where( + ub_finite, + ub - 1, # lower bound infinite, upper finite + 0, # both bounds infinite + ), ) return p0 @@ -352,14 +392,38 @@ def _initialize_feasible(lb, ub): if p in func_args and func_args[p].default is not func_args[p].empty: param_defaults[p] = func_args[p].default if p in bounds: - bounds_defaults[p] = tuple(bounds[p]) - if param_defaults[p] < bounds[p][0] or param_defaults[p] > bounds[p][1]: - param_defaults[p] = _initialize_feasible(bounds[p][0], bounds[p][1]) + lb, ub = bounds[p] + bounds_defaults[p] = (lb, ub) + param_defaults[p] = where( + (param_defaults[p] < lb) | (param_defaults[p] > ub), + _initialize_feasible(lb, ub), + param_defaults[p], + ) if p in p0: param_defaults[p] = p0[p] return param_defaults, bounds_defaults +def merge_data_and_coords(data_vars, coords): + """Used in Dataset.__init__.""" + if isinstance(coords, Coordinates): + coords = coords.copy() + else: + coords = create_coords_with_default_indexes(coords, data_vars) + + # exclude coords from alignment (all variables in a Coordinates object should + # already be aligned together) and use coordinates' indexes to align data_vars + return merge_core( + [data_vars, coords], + compat="broadcast_equals", + join="outer", + explicit_coords=tuple(coords), + indexes=coords.xindexes, + priority_arg=1, + skip_align_args=[1], + ) + + class DataVariables(Mapping[Any, "DataArray"]): __slots__ = ("_dataset",) @@ -451,8 +515,11 @@ class Dataset( Dataset implements the mapping interface with keys given by variable names and values given by DataArray objects for each variable name. - One dimensional variables with name equal to their dimension are - index coordinates used for label based indexing. + By default, pandas indexes are created for one dimensional variables with + name equal to their dimension (i.e., :term:`Dimension coordinate`) so those + variables can be readily used as coordinates for label based indexing. When a + :py:class:`~xarray.Coordinates` object is passed to ``coords``, any existing + index(es) built from those coordinates will be added to the Dataset. To load data from a file or file-like object, use the `open_dataset` function. @@ -473,22 +540,21 @@ class Dataset( - mapping {var name: (dimension name, array-like)} - mapping {var name: (tuple of dimension names, array-like)} - mapping {dimension name: array-like} - (it will be automatically moved to coords, see below) + (if array-like is not a scalar it will be automatically moved to coords, + see below) Each dimension must have the same length in all variables in which it appears. - coords : dict-like, optional - Another mapping in similar form as the `data_vars` argument, - except the each item is saved on the dataset as a "coordinate". + coords : :py:class:`~xarray.Coordinates` or dict-like, optional + A :py:class:`~xarray.Coordinates` object or another mapping in + similar form as the `data_vars` argument, except that each item + is saved on the dataset as a "coordinate". 
These variables have an associated meaning: they describe constant/fixed/independent quantities, unlike the varying/measured/dependent quantities that belong in - `variables`. Coordinates values may be given by 1-dimensional - arrays or scalars, in which case `dims` do not need to be - supplied: 1D arrays will be assumed to give index values along - the dimension with the same name. + `variables`. - The following notations are accepted: + The following notations are accepted for arbitrary mappings: - mapping {coord name: DataArray} - mapping {coord name: Variable} @@ -498,8 +564,16 @@ class Dataset( (the dimension name is implicitly set to be the same as the coord name) - The last notation implies that the coord name is the same as - the dimension name. + The last notation implies either that the coordinate value is a scalar + or that it is a 1-dimensional array and the coord name is the same as + the dimension name (i.e., a :term:`Dimension coordinate`). In the latter + case, the 1-dimensional array will be assumed to give index values + along the dimension with the same name. + + Alternatively, a :py:class:`~xarray.Coordinates` object may be used in + order to explicitly pass indexes (e.g., a multi-index or any custom + Xarray index) or to bypass the creation of a default index for any + :term:`Dimension coordinate` included in that object. attrs : dict-like, optional Global attributes to save on this dataset. @@ -562,6 +636,7 @@ class Dataset( precipitation float64 8.326 Attributes: description: Weather related data. + """ _attrs: dict[Hashable, Any] | None @@ -593,8 +668,6 @@ def __init__( coords: Mapping[Any, Any] | None = None, attrs: Mapping[Any, Any] | None = None, ) -> None: - # TODO(shoyer): expose indexes as a public argument in __init__ - if data_vars is None: data_vars = {} if coords is None: @@ -607,10 +680,10 @@ def __init__( ) if isinstance(coords, Dataset): - coords = coords.variables + coords = coords._variables variables, coord_names, dims, indexes, _ = merge_data_and_coords( - data_vars, coords, compat="broadcast_equals" + data_vars, coords ) self._attrs = dict(attrs) if attrs is not None else None @@ -743,13 +816,13 @@ def load(self: T_Dataset, **kwargs) -> T_Dataset: """ # access .data to coerce everything to numpy or dask arrays lazy_data = { - k: v._data for k, v in self.variables.items() if is_duck_dask_array(v._data) + k: v._data for k, v in self.variables.items() if is_chunked_array(v._data) } if lazy_data: - import dask.array as da + chunkmanager = get_chunked_array_type(*lazy_data.values()) - # evaluate all the dask arrays simultaneously - evaluated_data = da.compute(*lazy_data.values(), **kwargs) + # evaluate all the chunked arrays simultaneously + evaluated_data = chunkmanager.compute(*lazy_data.values(), **kwargs) for k, data in zip(lazy_data, evaluated_data): self.variables[k].data = data @@ -1359,8 +1432,8 @@ def _construct_dataarray(self, name: Hashable) -> DataArray: coords: dict[Hashable, Variable] = {} # preserve ordering for k in self._variables: - if k in self._coord_names and set(self.variables[k].dims) <= needed_dims: - coords[k] = self.variables[k] + if k in self._coord_names and set(self._variables[k].dims) <= needed_dims: + coords[k] = self._variables[k] indexes = filter_indexes_from_coords(self._indexes, set(coords)) @@ -1575,7 +1648,7 @@ def _setitem_check(self, key, value): val = np.array(val) # type conversion - new_value[name] = val.astype(var_k.dtype, copy=False) + new_value[name] = duck_array_ops.astype(val, dtype=var_k.dtype, copy=False) 
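Routing the cast through a helper rather than calling ``val.astype`` directly keeps the assignment working for non-NumPy duck arrays. A simplified sketch of such a dispatcher (an illustration only, not xarray's actual ``duck_array_ops.astype``):

    import numpy as np

    def astype_sketch(data, dtype, **kwargs):
        # Prefer the array's own array-API namespace when it exposes one
        # (e.g. NumPy >= 2.0 or array-api wrappers), otherwise fall back to
        # the object's .astype method, and finally to NumPy.
        if hasattr(data, "__array_namespace__"):
            xp = data.__array_namespace__()
            return xp.astype(data, dtype)
        if hasattr(data, "astype"):
            return data.astype(dtype, **kwargs)
        return np.asarray(data).astype(dtype, **kwargs)

    astype_sketch(np.array([1, 2, 3]), np.float64, copy=False)  # array([1., 2., 3.])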
# check consistency of dimension sizes and dimension coordinates if isinstance(value, DataArray) or isinstance(value, Dataset): @@ -1617,10 +1690,59 @@ def broadcast_equals(self, other: Dataset) -> bool: the other dataset can still be broadcast equal if the the non-scalar variable is a constant. + Examples + -------- + + # 2D array with shape (1, 3) + + >>> data = np.array([[1, 2, 3]]) + >>> a = xr.Dataset( + ... {"variable_name": (("space", "time"), data)}, + ... coords={"space": [0], "time": [0, 1, 2]}, + ... ) + >>> a + + Dimensions: (space: 1, time: 3) + Coordinates: + * space (space) int64 0 + * time (time) int64 0 1 2 + Data variables: + variable_name (space, time) int64 1 2 3 + + # 2D array with shape (3, 1) + + >>> data = np.array([[1], [2], [3]]) + >>> b = xr.Dataset( + ... {"variable_name": (("time", "space"), data)}, + ... coords={"time": [0, 1, 2], "space": [0]}, + ... ) + >>> b + + Dimensions: (time: 3, space: 1) + Coordinates: + * time (time) int64 0 1 2 + * space (space) int64 0 + Data variables: + variable_name (time, space) int64 1 2 3 + + .equals returns True if two Datasets have the same values, dimensions, and coordinates. .broadcast_equals returns True if the + results of broadcasting two Datasets against each other have the same values, dimensions, and coordinates. + + >>> a.equals(b) + False + + >>> a.broadcast_equals(b) + True + + >>> a2, b2 = xr.broadcast(a, b) + >>> a2.equals(b2) + True + See Also -------- Dataset.equals Dataset.identical + Dataset.broadcast """ try: return self._all_compat(other, "broadcast_equals") @@ -1637,6 +1759,67 @@ def equals(self, other: Dataset) -> bool: This method is necessary because `v1 == v2` for ``Dataset`` does element-wise comparisons (like numpy.ndarrays). + Examples + -------- + + # 2D array with shape (1, 3) + + >>> data = np.array([[1, 2, 3]]) + >>> dataset1 = xr.Dataset( + ... {"variable_name": (("space", "time"), data)}, + ... coords={"space": [0], "time": [0, 1, 2]}, + ... ) + >>> dataset1 + + Dimensions: (space: 1, time: 3) + Coordinates: + * space (space) int64 0 + * time (time) int64 0 1 2 + Data variables: + variable_name (space, time) int64 1 2 3 + + # 2D array with shape (3, 1) + + >>> data = np.array([[1], [2], [3]]) + >>> dataset2 = xr.Dataset( + ... {"variable_name": (("time", "space"), data)}, + ... coords={"time": [0, 1, 2], "space": [0]}, + ... ) + >>> dataset2 + + Dimensions: (time: 3, space: 1) + Coordinates: + * time (time) int64 0 1 2 + * space (space) int64 0 + Data variables: + variable_name (time, space) int64 1 2 3 + >>> dataset1.equals(dataset2) + False + + >>> dataset1.broadcast_equals(dataset2) + True + + .equals returns True if two Datasets have the same values, dimensions, and coordinates. .broadcast_equals returns True if the + results of broadcasting two Datasets against each other have the same values, dimensions, and coordinates. + + Similar for missing values too: + + >>> ds1 = xr.Dataset( + ... { + ... "temperature": (["x", "y"], [[1, np.nan], [3, 4]]), + ... }, + ... coords={"x": [0, 1], "y": [0, 1]}, + ... ) + + >>> ds2 = xr.Dataset( + ... { + ... "temperature": (["x", "y"], [[1, np.nan], [3, 4]]), + ... }, + ... coords={"x": [0, 1], "y": [0, 1]}, + ... ) + >>> ds1.equals(ds2) + True + See Also -------- Dataset.broadcast_equals @@ -1651,6 +1834,66 @@ def identical(self, other: Dataset) -> bool: """Like equals, but also checks all dataset attributes and the attributes on all variables and coordinates. + Example + ------- + + >>> a = xr.Dataset( + ... {"Width": ("X", [1, 2, 3])}, + ... 
coords={"X": [1, 2, 3]}, + ... attrs={"units": "m"}, + ... ) + >>> b = xr.Dataset( + ... {"Width": ("X", [1, 2, 3])}, + ... coords={"X": [1, 2, 3]}, + ... attrs={"units": "m"}, + ... ) + >>> c = xr.Dataset( + ... {"Width": ("X", [1, 2, 3])}, + ... coords={"X": [1, 2, 3]}, + ... attrs={"units": "ft"}, + ... ) + >>> a + + Dimensions: (X: 3) + Coordinates: + * X (X) int64 1 2 3 + Data variables: + Width (X) int64 1 2 3 + Attributes: + units: m + + >>> b + + Dimensions: (X: 3) + Coordinates: + * X (X) int64 1 2 3 + Data variables: + Width (X) int64 1 2 3 + Attributes: + units: m + + >>> c + + Dimensions: (X: 3) + Coordinates: + * X (X) int64 1 2 3 + Data variables: + Width (X) int64 1 2 3 + Attributes: + units: ft + + >>> a.equals(b) + True + + >>> a.identical(b) + True + + >>> a.equals(c) + True + + >>> a.identical(c) + False + See Also -------- Dataset.broadcast_equals @@ -1679,13 +1922,19 @@ def indexes(self) -> Indexes[pd.Index]: @property def xindexes(self) -> Indexes[Index]: - """Mapping of xarray Index objects used for label based indexing.""" + """Mapping of :py:class:`~xarray.indexes.Index` objects + used for label based indexing. + """ return Indexes(self._indexes, {k: self._variables[k] for k in self._indexes}) @property def coords(self) -> DatasetCoordinates: - """Dictionary of xarray.DataArray objects corresponding to coordinate - variables + """Mapping of :py:class:`~xarray.DataArray` objects corresponding to + coordinate variables. + + See Also + -------- + Coordinates """ return DatasetCoordinates(self) @@ -1702,6 +1951,33 @@ def set_coords(self: T_Dataset, names: Hashable | Iterable[Hashable]) -> T_Datas names : hashable or iterable of hashable Name(s) of variables in this dataset to convert into coordinates. + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "pressure": ("time", [1.013, 1.2, 3.5]), + ... "time": pd.date_range("2023-01-01", periods=3), + ... } + ... ) + >>> dataset + + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 2023-01-03 + Data variables: + pressure (time) float64 1.013 1.2 3.5 + + >>> dataset.set_coords("pressure") + + Dimensions: (time: 3) + Coordinates: + pressure (time) float64 1.013 1.2 3.5 + * time (time) datetime64[ns] 2023-01-01 2023-01-02 2023-01-03 + Data variables: + *empty* + + On calling ``set_coords`` , these data variables are converted to coordinates, as shown in the final dataset. + Returns ------- Dataset @@ -1740,9 +2016,66 @@ def reset_coords( If True, remove coordinates instead of converting them into variables. + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "temperature": ( + ... ["time", "lat", "lon"], + ... [[[25, 26], [27, 28]], [[29, 30], [31, 32]]], + ... ), + ... "precipitation": ( + ... ["time", "lat", "lon"], + ... [[[0.5, 0.8], [0.2, 0.4]], [[0.3, 0.6], [0.7, 0.9]]], + ... ), + ... }, + ... coords={ + ... "time": pd.date_range(start="2023-01-01", periods=2), + ... "lat": [40, 41], + ... "lon": [-80, -79], + ... "altitude": 1000, + ... }, + ... 
) + + # Dataset before resetting coordinates + + >>> dataset + + Dimensions: (time: 2, lat: 2, lon: 2) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 + * lat (lat) int64 40 41 + * lon (lon) int64 -80 -79 + altitude int64 1000 + Data variables: + temperature (time, lat, lon) int64 25 26 27 28 29 30 31 32 + precipitation (time, lat, lon) float64 0.5 0.8 0.2 0.4 0.3 0.6 0.7 0.9 + + # Reset the 'altitude' coordinate + + >>> dataset_reset = dataset.reset_coords("altitude") + + # Dataset after resetting coordinates + + >>> dataset_reset + + Dimensions: (time: 2, lat: 2, lon: 2) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 + * lat (lat) int64 40 41 + * lon (lon) int64 -80 -79 + Data variables: + temperature (time, lat, lon) int64 25 26 27 28 29 30 31 32 + precipitation (time, lat, lon) float64 0.5 0.8 0.2 0.4 0.3 0.6 0.7 0.9 + altitude int64 1000 + Returns ------- Dataset + + See Also + -------- + Dataset.set_coords """ if names is None: names = self._coord_names - set(self._indexes) @@ -1879,7 +2212,9 @@ def to_netcdf( Nested dictionary with variable names as keys and dictionaries of variable specific encodings as values, e.g., ``{"my_variable": {"dtype": "int16", "scale_factor": 0.1, - "zlib": True}, ...}`` + "zlib": True}, ...}``. + If ``encoding`` is specified the original encoding of the variables of + the dataset is ignored. The `h5netcdf` engine supports both the NetCDF4-style compression encoding parameters ``{"zlib": True, "complevel": 9}`` and the h5py @@ -1945,6 +2280,8 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore: ... @@ -1966,6 +2303,8 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... @@ -1984,6 +2323,8 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + write_empty_chunks: bool | None = None, + chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore | Delayed: """Write dataset contents to a zarr group. @@ -2072,6 +2413,21 @@ def to_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. + write_empty_chunks : bool or None, optional + If True, all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill value + prior to storing. If a chunk is uniformly equal to the fill value, then + that chunk is not be stored, and the store entry for that chunk's key + is deleted. This setting enables sparser storage, as only chunks with + non-fill-value data are stored, at the expense of overhead associated + with checking the data of each chunk. If None (default) fall back to + specification(s) in ``encoding`` or Zarr defaults. A ``ValueError`` + will be raised if the value of this (if not None) differs with + ``encoding``. + chunkmanager_store_kwargs : dict, optional + Additional keyword arguments passed on to the `ChunkManager.store` method used to store + chunked arrays. For example for a dask array additional kwargs will be passed eventually to + :py:func:`dask.array.store()`. 
Experimental API that should not be relied upon. Returns ------- @@ -2117,6 +2473,8 @@ def to_zarr( region=region, safe_chunks=safe_chunks, zarr_version=zarr_version, + write_empty_chunks=write_empty_chunks, + chunkmanager_store_kwargs=chunkmanager_store_kwargs, ) def __repr__(self) -> str: @@ -2205,6 +2563,8 @@ def chunk( token: str | None = None, lock: bool = False, inline_array: bool = False, + chunked_array_type: str | ChunkManagerEntrypoint | None = None, + from_array_kwargs=None, **chunks_kwargs: None | int | str | tuple[int, ...], ) -> T_Dataset: """Coerce all arrays in this dataset into dask arrays with the given @@ -2232,6 +2592,15 @@ def chunk( inline_array: bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. + chunked_array_type: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict, optional + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided @@ -2266,8 +2635,22 @@ def chunk( f"some chunks keys are not dimensions on this object: {bad_dims}" ) + chunkmanager = guess_chunkmanager(chunked_array_type) + if from_array_kwargs is None: + from_array_kwargs = {} + variables = { - k: _maybe_chunk(k, v, chunks, token, lock, name_prefix) + k: _maybe_chunk( + k, + v, + chunks, + token, + lock, + name_prefix, + inline_array=inline_array, + chunked_array_type=chunkmanager, + from_array_kwargs=from_array_kwargs.copy(), + ) for k, v in self.variables.items() } return self._replace(variables) @@ -2305,7 +2688,7 @@ def _validate_indexers( if v.dtype.kind in "US": index = self._indexes[k].to_pandas_index() if isinstance(index, pd.DatetimeIndex): - v = v.astype("datetime64[ns]") + v = duck_array_ops.astype(v, dtype="datetime64[ns]") elif isinstance(index, CFTimeIndex): v = _parse_array_of_cftime_strings(v, index.date_type) @@ -2424,6 +2807,63 @@ def isel( in this dataset, unless vectorized indexing was triggered by using an array indexer, in which case the data will be a copy. + Examples + -------- + + >>> dataset = xr.Dataset( + ... { + ... "math_scores": ( + ... ["student", "test"], + ... [[90, 85, 92], [78, 80, 85], [95, 92, 98]], + ... ), + ... "english_scores": ( + ... ["student", "test"], + ... [[88, 90, 92], [75, 82, 79], [93, 96, 91]], + ... ), + ... }, + ... coords={ + ... "student": ["Alice", "Bob", "Charlie"], + ... "test": ["Test 1", "Test 2", "Test 3"], + ... }, + ... 
) + + # A specific element from the dataset is selected + + >>> dataset.isel(student=1, test=0) + + Dimensions: () + Coordinates: + student >> slice_of_data = dataset.isel(student=slice(0, 2), test=slice(0, 2)) + >>> slice_of_data + + Dimensions: (student: 2, test: 2) + Coordinates: + * student (student) >> index_array = xr.DataArray([0, 2], dims="student") + >>> indexed_data = dataset.isel(student=index_array) + >>> indexed_data + + Dimensions: (student: 2, test: 3) + Coordinates: + * student (student) >> dates = pd.date_range(start="2023-01-01", periods=5) + >>> pageviews = [1200, 1500, 900, 1800, 2000] + >>> visitors = [800, 1000, 600, 1200, 1500] + >>> dataset = xr.Dataset( + ... { + ... "pageviews": (("date"), pageviews), + ... "visitors": (("date"), visitors), + ... }, + ... coords={"date": dates}, + ... ) + >>> busiest_days = dataset.sortby("pageviews", ascending=False) + >>> busiest_days.head() + + Dimensions: (date: 5) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 ... 2023-01-03 + Data variables: + pageviews (date) int64 2000 1800 1500 1200 900 + visitors (date) int64 1500 1200 1000 800 600 + + # Retrieve the 3 most busiest days in terms of pageviews + + >>> busiest_days.head(3) + + Dimensions: (date: 3) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 2023-01-02 + Data variables: + pageviews (date) int64 2000 1800 1500 + visitors (date) int64 1500 1200 1000 + + # Using a dictionary to specify the number of elements for specific dimensions + + >>> busiest_days.head({"date": 3}) + + Dimensions: (date: 3) + Coordinates: + * date (date) datetime64[ns] 2023-01-05 2023-01-04 2023-01-02 + Data variables: + pageviews (date) int64 2000 1800 1500 + visitors (date) int64 1500 1200 1000 + See Also -------- Dataset.tail @@ -2658,6 +3142,48 @@ def tail( The keyword arguments form of ``indexers``. One of indexers or indexers_kwargs must be provided. + Examples + -------- + >>> activity_names = ["Walking", "Running", "Cycling", "Swimming", "Yoga"] + >>> durations = [30, 45, 60, 45, 60] # in minutes + >>> energies = [150, 300, 250, 400, 100] # in calories + >>> dataset = xr.Dataset( + ... { + ... "duration": (["activity"], durations), + ... "energy_expenditure": (["activity"], energies), + ... }, + ... coords={"activity": activity_names}, + ... ) + >>> sorted_dataset = dataset.sortby("energy_expenditure", ascending=False) + >>> sorted_dataset + + Dimensions: (activity: 5) + Coordinates: + * activity (activity) >> sorted_dataset.tail(3) + + Dimensions: (activity: 3) + Coordinates: + * activity (activity) >> sorted_dataset.tail({"activity": 3}) + + Dimensions: (activity: 3) + Coordinates: + * activity (activity) Dataset: + ) -> T_Dataset: """Callback called from ``Aligner`` to create a new reindexed Dataset.""" new_variables = variables.copy() @@ -2859,6 +3385,8 @@ def _reindex_callback( new_variables, new_coord_names, indexes=new_indexes ) + reindexed.encoding = self.encoding + return reindexed def reindex_like( @@ -3907,6 +4435,64 @@ def expand_dims( expanded : Dataset This object, but with additional dimension(s). 
+ Examples + -------- + >>> dataset = xr.Dataset({"temperature": ([], 25.0)}) + >>> dataset + + Dimensions: () + Data variables: + temperature float64 25.0 + + # Expand the dataset with a new dimension called "time" + + >>> dataset.expand_dims(dim="time") + + Dimensions: (time: 1) + Dimensions without coordinates: time + Data variables: + temperature (time) float64 25.0 + + # 1D data + + >>> temperature_1d = xr.DataArray([25.0, 26.5, 24.8], dims="x") + >>> dataset_1d = xr.Dataset({"temperature": temperature_1d}) + >>> dataset_1d + + Dimensions: (x: 3) + Dimensions without coordinates: x + Data variables: + temperature (x) float64 25.0 26.5 24.8 + + # Expand the dataset with a new dimension called "time" using axis argument + + >>> dataset_1d.expand_dims(dim="time", axis=0) + + Dimensions: (time: 1, x: 3) + Dimensions without coordinates: time, x + Data variables: + temperature (time, x) float64 25.0 26.5 24.8 + + # 2D data + + >>> temperature_2d = xr.DataArray(np.random.rand(3, 4), dims=("y", "x")) + >>> dataset_2d = xr.Dataset({"temperature": temperature_2d}) + >>> dataset_2d + + Dimensions: (y: 3, x: 4) + Dimensions without coordinates: y, x + Data variables: + temperature (y, x) float64 0.5488 0.7152 0.6028 ... 0.3834 0.7917 0.5289 + + # Expand the dataset with a new dimension called "time" using axis argument + + >>> dataset_2d.expand_dims(dim="time", axis=2) + + Dimensions: (y: 3, x: 4, time: 1) + Dimensions without coordinates: y, x, time + Data variables: + temperature (y, x, time) float64 0.5488 0.7152 0.6028 ... 0.7917 0.5289 + See Also -------- DataArray.expand_dims @@ -5081,10 +5667,100 @@ def drop_vars( passed are not in the dataset. If 'ignore', any given names that are in the dataset are dropped and no error is raised. + Examples + -------- + + >>> dataset = xr.Dataset( + ... { + ... "temperature": ( + ... ["time", "latitude", "longitude"], + ... [[[25.5, 26.3], [27.1, 28.0]]], + ... ), + ... "humidity": ( + ... ["time", "latitude", "longitude"], + ... [[[65.0, 63.8], [58.2, 59.6]]], + ... ), + ... "wind_speed": ( + ... ["time", "latitude", "longitude"], + ... [[[10.2, 8.5], [12.1, 9.8]]], + ... ), + ... }, + ... coords={ + ... "time": pd.date_range("2023-07-01", periods=1), + ... "latitude": [40.0, 40.2], + ... "longitude": [-75.0, -74.8], + ... }, + ... 
) + >>> dataset + + Dimensions: (time: 1, latitude: 2, longitude: 2) + Coordinates: + * time (time) datetime64[ns] 2023-07-01 + * latitude (latitude) float64 40.0 40.2 + * longitude (longitude) float64 -75.0 -74.8 + Data variables: + temperature (time, latitude, longitude) float64 25.5 26.3 27.1 28.0 + humidity (time, latitude, longitude) float64 65.0 63.8 58.2 59.6 + wind_speed (time, latitude, longitude) float64 10.2 8.5 12.1 9.8 + + # Drop the 'humidity' variable + + >>> dataset.drop_vars(["humidity"]) + + Dimensions: (time: 1, latitude: 2, longitude: 2) + Coordinates: + * time (time) datetime64[ns] 2023-07-01 + * latitude (latitude) float64 40.0 40.2 + * longitude (longitude) float64 -75.0 -74.8 + Data variables: + temperature (time, latitude, longitude) float64 25.5 26.3 27.1 28.0 + wind_speed (time, latitude, longitude) float64 10.2 8.5 12.1 9.8 + + # Drop the 'humidity', 'temperature' variables + + >>> dataset.drop_vars(["humidity", "temperature"]) + + Dimensions: (time: 1, latitude: 2, longitude: 2) + Coordinates: + * time (time) datetime64[ns] 2023-07-01 + * latitude (latitude) float64 40.0 40.2 + * longitude (longitude) float64 -75.0 -74.8 + Data variables: + wind_speed (time, latitude, longitude) float64 10.2 8.5 12.1 9.8 + + # Attempt to drop non-existent variable with errors="ignore" + + >>> dataset.drop_vars(["pressure"], errors="ignore") + + Dimensions: (time: 1, latitude: 2, longitude: 2) + Coordinates: + * time (time) datetime64[ns] 2023-07-01 + * latitude (latitude) float64 40.0 40.2 + * longitude (longitude) float64 -75.0 -74.8 + Data variables: + temperature (time, latitude, longitude) float64 25.5 26.3 27.1 28.0 + humidity (time, latitude, longitude) float64 65.0 63.8 58.2 59.6 + wind_speed (time, latitude, longitude) float64 10.2 8.5 12.1 9.8 + + # Attempt to drop non-existent variable with errors="raise" + + >>> dataset.drop_vars(["pressure"], errors="raise") + Traceback (most recent call last): + ValueError: These variables cannot be found in this dataset: ['pressure'] + + Raises + ------ + ValueError + Raised if you attempt to drop a variable which is not present, and the kwarg ``errors='raise'``. + Returns ------- dropped : Dataset + See Also + -------- + DataArray.drop_vars + """ # the Iterable check is required for mypy if is_scalar(names) or not isinstance(names, Iterable): @@ -5487,6 +6163,70 @@ def dropna( Which variables to check for missing values. By default, all variables in the dataset are checked. + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "temperature": ( + ... ["time", "location"], + ... [[23.4, 24.1], [np.nan, 22.1], [21.8, 24.2], [20.5, 25.3]], + ... ) + ... }, + ... coords={"time": [1, 2, 3, 4], "location": ["A", "B"]}, + ... 
) + >>> dataset + + Dimensions: (time: 4, location: 2) + Coordinates: + * time (time) int64 1 2 3 4 + * location (location) >> dataset.dropna(dim="time") + + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) >> dataset.dropna(dim="time", how="any") + + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) >> dataset.dropna(dim="time", how="all") + + Dimensions: (time: 4, location: 2) + Coordinates: + * time (time) int64 1 2 3 4 + * location (location) >> dataset.dropna(dim="time", thresh=2) + + Dimensions: (time: 3, location: 2) + Coordinates: + * time (time) int64 1 3 4 + * location (location) T_Dataset Parameters ---------- dim : Hashable - Specifies the dimension along which to propagate values when - filling. + Specifies the dimension along which to propagate values when filling. limit : int or None, optional The maximum number of consecutive NaN values to forward fill. In other words, if there is a gap with more than this number of @@ -5756,9 +6500,48 @@ def ffill(self: T_Dataset, dim: Hashable, limit: int | None = None) -> T_Dataset than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). + Examples + -------- + >>> time = pd.date_range("2023-01-01", periods=10, freq="D") + >>> data = np.array( + ... [1, np.nan, np.nan, np.nan, 5, np.nan, np.nan, 8, np.nan, 10] + ... ) + >>> dataset = xr.Dataset({"data": (("time",), data)}, coords={"time": time}) + >>> dataset + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 nan nan nan 5.0 nan nan 8.0 nan 10.0 + + # Perform forward fill (ffill) on the dataset + + >>> dataset.ffill(dim="time") + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 1.0 1.0 1.0 5.0 5.0 5.0 8.0 8.0 10.0 + + # Limit the forward filling to a maximum of 2 consecutive NaN values + + >>> dataset.ffill(dim="time", limit=2) + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 1.0 1.0 nan 5.0 5.0 5.0 8.0 8.0 10.0 + Returns ------- Dataset + + See Also + -------- + Dataset.bfill """ from xarray.core.missing import _apply_over_vars_with_dim, ffill @@ -5782,9 +6565,48 @@ def bfill(self: T_Dataset, dim: Hashable, limit: int | None = None) -> T_Dataset than 0 or None for no limit. Must be None or greater than or equal to axis length if filling along chunked axes (dimensions). + Examples + -------- + >>> time = pd.date_range("2023-01-01", periods=10, freq="D") + >>> data = np.array( + ... [1, np.nan, np.nan, np.nan, 5, np.nan, np.nan, 8, np.nan, 10] + ... ) + >>> dataset = xr.Dataset({"data": (("time",), data)}, coords={"time": time}) + >>> dataset + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 nan nan nan 5.0 nan nan 8.0 nan 10.0 + + # filled dataset, fills NaN values by propagating values backward + + >>> dataset.bfill(dim="time") + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 
2023-01-10 + Data variables: + data (time) float64 1.0 5.0 5.0 5.0 5.0 8.0 8.0 8.0 10.0 10.0 + + # Limit the backward filling to a maximum of 2 consecutive NaN values + + >>> dataset.bfill(dim="time", limit=2) + + Dimensions: (time: 10) + Coordinates: + * time (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-10 + Data variables: + data (time) float64 1.0 nan 5.0 5.0 5.0 8.0 8.0 8.0 10.0 10.0 + Returns ------- Dataset + + See Also + -------- + Dataset.ffill """ from xarray.core.missing import _apply_over_vars_with_dim, bfill @@ -5849,6 +6671,38 @@ def reduce( reduced : Dataset Dataset with this object's DataArrays replaced with new DataArrays of summarized data and the indicated dimension(s) removed. + + Examples + -------- + + >>> dataset = xr.Dataset( + ... { + ... "math_scores": ( + ... ["student", "test"], + ... [[90, 85, 92], [78, 80, 85], [95, 92, 98]], + ... ), + ... "english_scores": ( + ... ["student", "test"], + ... [[88, 90, 92], [75, 82, 79], [93, 96, 91]], + ... ), + ... }, + ... coords={ + ... "student": ["Alice", "Bob", "Charlie"], + ... "test": ["Test 1", "Test 2", "Test 3"], + ... }, + ... ) + + # Calculate the 75th percentile of math scores for each student using np.percentile + + >>> percentile_scores = dataset.reduce(np.percentile, q=75, dim="test") + >>> percentile_scores + + Dimensions: (student: 3) + Coordinates: + * student (student) T_Dataset: ------- result : Dataset + Examples + -------- + >>> dataset = xr.Dataset( + ... { + ... "math_scores": ( + ... ["student", "test"], + ... [[90, 85, 79], [78, 80, 85], [95, 92, 98]], + ... ), + ... "english_scores": ( + ... ["student", "test"], + ... [[88, 90, 92], [75, 82, 79], [39, 96, 78]], + ... ), + ... }, + ... coords={ + ... "student": ["Alice", "Bob", "Charlie"], + ... "test": ["Test 1", "Test 2", "Test 3"], + ... }, + ... ) + + # Indices of the minimum values along the 'student' dimension are calculated + + >>> argmin_indices = dataset.argmin(dim="student") + + >>> min_score_in_math = dataset["student"].isel( + ... student=argmin_indices["math_scores"] + ... ) + >>> min_score_in_math + + array(['Bob', 'Bob', 'Alice'], dtype='>> min_score_in_english = dataset["student"].isel( + ... student=argmin_indices["english_scores"] + ... ) + >>> min_score_in_english + + array(['Charlie', 'Bob', 'Charlie'], dtype=' T_Dataset: ------- result : Dataset + Examples + -------- + + >>> dataset = xr.Dataset( + ... { + ... "math_scores": ( + ... ["student", "test"], + ... [[90, 85, 92], [78, 80, 85], [95, 92, 98]], + ... ), + ... "english_scores": ( + ... ["student", "test"], + ... [[88, 90, 92], [75, 82, 79], [93, 96, 91]], + ... ), + ... }, + ... coords={ + ... "student": ["Alice", "Bob", "Charlie"], + ... "test": ["Test 1", "Test 2", "Test 3"], + ... }, + ... ) + + # Indices of the maximum values along the 'student' dimension are calculated + + >>> argmax_indices = dataset.argmax(dim="test") + + >>> argmax_indices + + Dimensions: (student: 3) + Coordinates: + * student (student) T_Dataset: """ @@ -8581,17 +9517,25 @@ def curvefit( Whether to skip missing values when fitting. Default is True. p0 : dict-like, optional Optional dictionary of parameter names to initial guesses passed to the - `curve_fit` `p0` arg. If none or only some parameters are passed, the rest will - be assigned initial values following the default scipy behavior. + `curve_fit` `p0` arg. If the values are DataArrays, they will be appropriately + broadcast to the coordinates of the array. 
If none or only some parameters are + passed, the rest will be assigned initial values following the default scipy + behavior. bounds : dict-like, optional - Optional dictionary of parameter names to bounding values passed to the - `curve_fit` `bounds` arg. If none or only some parameters are passed, the rest - will be unbounded following the default scipy behavior. + Optional dictionary of parameter names to tuples of bounding values passed to the + `curve_fit` `bounds` arg. If any of the bounds are DataArrays, they will be + appropriately broadcast to the coordinates of the array. If none or only some + parameters are passed, the rest will be unbounded following the default scipy + behavior. param_names : sequence of hashable, optional Sequence of names for the fittable parameters of `func`. If not supplied, this will be automatically determined by arguments of `func`. `param_names` should be manually supplied when fitting a function that takes a variable number of parameters. + errors : {"raise", "ignore"}, default: "raise" + If 'raise', any errors from the `scipy.optimize_curve_fit` optimization will + raise an exception. If 'ignore', the coefficients and covariances for the + coordinates where the fitting failed will be NaN. **kwargs : optional Additional keyword arguments to passed to scipy curve_fit. @@ -8653,29 +9597,56 @@ def curvefit( "in fitting on scalar data." ) + # Check that initial guess and bounds only contain coordinates that are in preserved_dims + for param, guess in p0.items(): + if isinstance(guess, DataArray): + unexpected = set(guess.dims) - set(preserved_dims) + if unexpected: + raise ValueError( + f"Initial guess for '{param}' has unexpected dimensions " + f"{tuple(unexpected)}. It should only have dimensions that are in data " + f"dimensions {preserved_dims}." + ) + for param, (lb, ub) in bounds.items(): + for label, bound in zip(("Lower", "Upper"), (lb, ub)): + if isinstance(bound, DataArray): + unexpected = set(bound.dims) - set(preserved_dims) + if unexpected: + raise ValueError( + f"{label} bound for '{param}' has unexpected dimensions " + f"{tuple(unexpected)}. It should only have dimensions that are in data " + f"dimensions {preserved_dims}." 
+ ) + + if errors not in ["raise", "ignore"]: + raise ValueError('errors must be either "raise" or "ignore"') + # Broadcast all coords with each other coords_ = broadcast(*coords_) coords_ = [ coord.broadcast_like(self, exclude=preserved_dims) for coord in coords_ ] + n_coords = len(coords_) params, func_args = _get_func_args(func, param_names) param_defaults, bounds_defaults = _initialize_curvefit_params( params, p0, bounds, func_args ) n_params = len(params) - kwargs.setdefault("p0", [param_defaults[p] for p in params]) - kwargs.setdefault( - "bounds", - [ - [bounds_defaults[p][0] for p in params], - [bounds_defaults[p][1] for p in params], - ], - ) - def _wrapper(Y, *coords_, **kwargs): + def _wrapper(Y, *args, **kwargs): # Wrap curve_fit with raveled coordinates and pointwise NaN handling - x = np.vstack([c.ravel() for c in coords_]) + # *args contains: + # - the coordinates + # - initial guess + # - lower bounds + # - upper bounds + coords__ = args[:n_coords] + p0_ = args[n_coords + 0 * n_params : n_coords + 1 * n_params] + lb = args[n_coords + 1 * n_params : n_coords + 2 * n_params] + ub = args[n_coords + 2 * n_params :] + + x = np.vstack([c.ravel() for c in coords__]) y = Y.ravel() if skipna: mask = np.all([np.any(~np.isnan(x), axis=0), ~np.isnan(y)], axis=0) @@ -8686,7 +9657,15 @@ def _wrapper(Y, *coords_, **kwargs): pcov = np.full([n_params, n_params], np.nan) return popt, pcov x = np.squeeze(x) - popt, pcov = curve_fit(func, x, y, **kwargs) + + try: + popt, pcov = curve_fit(func, x, y, p0=p0_, bounds=(lb, ub), **kwargs) + except RuntimeError: + if errors == "raise": + raise + popt = np.full([n_params], np.nan) + pcov = np.full([n_params, n_params], np.nan) + return popt, pcov result = type(self)() @@ -8696,13 +9675,21 @@ def _wrapper(Y, *coords_, **kwargs): else: name = f"{str(name)}_" + input_core_dims = [reduce_dims_ for _ in range(n_coords + 1)] + input_core_dims.extend( + [[] for _ in range(3 * n_params)] + ) # core_dims for p0 and bounds + popt, pcov = apply_ufunc( _wrapper, da, *coords_, + *param_defaults.values(), + *[b[0] for b in bounds_defaults.values()], + *[b[1] for b in bounds_defaults.values()], vectorize=True, dask="parallelized", - input_core_dims=[reduce_dims_ for d in range(len(coords_) + 1)], + input_core_dims=input_core_dims, output_core_dims=[["param"], ["cov_i", "cov_j"]], dask_gufunc_kwargs={ "output_sizes": { diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 4d8583cfe65..7ac342e3d52 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -37,11 +37,11 @@ def __eq__(self, other): # instead of following NumPy's own type-promotion rules. These type promotion # rules match pandas instead. For reference, see the NumPy type hierarchy: # https://numpy.org/doc/stable/reference/arrays.scalars.html -PROMOTE_TO_OBJECT = [ - {np.number, np.character}, # numpy promotes to character - {np.bool_, np.character}, # numpy promotes to character - {np.bytes_, np.unicode_}, # numpy promotes to unicode -] +PROMOTE_TO_OBJECT: tuple[tuple[type[np.generic], type[np.generic]], ...] 
= ( + (np.number, np.character), # numpy promotes to character + (np.bool_, np.character), # numpy promotes to character + (np.bytes_, np.unicode_), # numpy promotes to unicode +) def maybe_promote(dtype): @@ -74,7 +74,10 @@ def maybe_promote(dtype): else: dtype = object fill_value = np.nan - return np.dtype(dtype), fill_value + + dtype = np.dtype(dtype) + fill_value = dtype.type(fill_value) + return dtype, fill_value NAT_TYPES = {np.datetime64("NaT").dtype, np.timedelta64("NaT").dtype} @@ -156,7 +159,9 @@ def is_datetime_like(dtype): return np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64) -def result_type(*arrays_and_dtypes): +def result_type( + *arrays_and_dtypes: np.typing.ArrayLike | np.typing.DTypeLike, +) -> np.dtype: """Like np.result_type, but with type promotion rules matching pandas. Examples of changed behavior: diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 84e66803fe8..4f245e59f73 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -9,6 +9,7 @@ import datetime import inspect import warnings +from functools import partial from importlib import import_module import numpy as np @@ -29,10 +30,11 @@ zeros_like, # noqa ) from numpy import concatenate as _concatenate +from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] from numpy.lib.stride_tricks import sliding_window_view # noqa from xarray.core import dask_array_ops, dtypes, nputils -from xarray.core.nputils import nanfirst, nanlast +from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array from xarray.core.pycompat import array_type, is_duck_dask_array from xarray.core.utils import is_duck_array, module_available @@ -192,7 +194,10 @@ def asarray(data, xp=np): def as_shared_dtype(scalars_or_arrays, xp=np): """Cast a arrays to a shared dtype using xarray's type promotion rules.""" - if any(isinstance(x, array_type("cupy")) for x in scalars_or_arrays): + array_type_cupy = array_type("cupy") + if array_type_cupy and any( + isinstance(x, array_type_cupy) for x in scalars_or_arrays + ): import cupy as cp arrays = [asarray(x, xp=cp) for x in scalars_or_arrays] @@ -640,10 +645,10 @@ def first(values, axis, skipna=None): """Return the first non-NA elements in this array along the given axis""" if (skipna or skipna is None) and values.dtype.kind not in "iSU": # only bother for dtypes that can hold NaN - if is_duck_dask_array(values): - return dask_array_ops.nanfirst(values, axis) + if is_chunked_array(values): + return chunked_nanfirst(values, axis) else: - return nanfirst(values, axis) + return nputils.nanfirst(values, axis) return take(values, 0, axis=axis) @@ -651,10 +656,10 @@ def last(values, axis, skipna=None): """Return the last non-NA elements in this array along the given axis""" if (skipna or skipna is None) and values.dtype.kind not in "iSU": # only bother for dtypes that can hold NaN - if is_duck_dask_array(values): - return dask_array_ops.nanlast(values, axis) + if is_chunked_array(values): + return chunked_nanlast(values, axis) else: - return nanlast(values, axis) + return nputils.nanlast(values, axis) return take(values, -1, axis=axis) @@ -673,3 +678,32 @@ def push(array, n, axis): return dask_array_ops.push(array, n, axis) else: return push(array, n, axis) + + +def _first_last_wrapper(array, *, axis, op, keepdims): + return op(array, axis, keepdims=keepdims) + + +def _chunked_first_or_last(darray, axis, op): + chunkmanager = get_chunked_array_type(darray) + + # This will raise the 
same error message seen for numpy + axis = normalize_axis_index(axis, darray.ndim) + + wrapped_op = partial(_first_last_wrapper, op=op) + return chunkmanager.reduction( + darray, + func=wrapped_op, + aggregate_func=wrapped_op, + axis=axis, + dtype=darray.dtype, + keepdims=False, # match numpy version + ) + + +def chunked_nanfirst(darray, axis): + return _chunked_first_or_last(darray, axis, op=nputils.nanfirst) + + +def chunked_nanlast(darray, axis): + return _chunked_first_or_last(darray, axis, op=nputils.nanlast) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 7f93706c74c..1f2bf720a10 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -424,21 +424,37 @@ def inline_index_repr(index, max_width=None): def summarize_index( - name: Hashable, index, col_width: int, max_width: int | None = None -): + names: tuple[Hashable, ...], + index, + col_width: int, + max_width: int | None = None, +) -> str: if max_width is None: max_width = OPTIONS["display_width"] - preformatted = pretty_print(f" {name} ", col_width) + def prefixes(length: int) -> list[str]: + if length in (0, 1): + return [" "] + + return ["┌"] + ["│"] * max(length - 2, 0) + ["└"] + + preformatted = [ + pretty_print(f" {prefix} {name}", col_width) + for prefix, name in zip(prefixes(len(names)), names) + ] - index_width = max_width - len(preformatted) + head, *tail = preformatted + index_width = max_width - len(head) repr_ = inline_index_repr(index, max_width=index_width) - return preformatted + repr_ + return "\n".join([head + repr_] + [line.rstrip() for line in tail]) -def nondefault_indexes(indexes): +def filter_nondefault_indexes(indexes, filter_indexes: bool): from xarray.core.indexes import PandasIndex, PandasMultiIndex + if not filter_indexes: + return indexes + default_indexes = (PandasIndex, PandasMultiIndex) return { @@ -448,7 +464,9 @@ def nondefault_indexes(indexes): } -def indexes_repr(indexes, col_width=None, max_rows=None): +def indexes_repr(indexes, max_rows: int | None = None) -> str: + col_width = _calculate_col_width(chain.from_iterable(indexes)) + return _mapping_repr( indexes, "Indexes", @@ -599,6 +617,12 @@ def short_data_repr(array): return f"[{array.size} values with dtype={array.dtype}]" +def _get_indexes_dict(indexes): + return { + tuple(index_vars.keys()): idx for idx, index_vars in indexes.group_by_index() + } + + @recursive_repr("") def array_repr(arr): from xarray.core.variable import Variable @@ -643,15 +667,13 @@ def array_repr(arr): display_default_indexes = _get_boolean_with_default( "display_default_indexes", False ) - if display_default_indexes: - xindexes = arr.xindexes - else: - xindexes = nondefault_indexes(arr.xindexes) + + xindexes = filter_nondefault_indexes( + _get_indexes_dict(arr.xindexes), not display_default_indexes + ) if xindexes: - summary.append( - indexes_repr(xindexes, col_width=col_width, max_rows=max_rows) - ) + summary.append(indexes_repr(xindexes, max_rows=max_rows)) if arr.attrs: summary.append(attrs_repr(arr.attrs, max_rows=max_rows)) @@ -682,12 +704,11 @@ def dataset_repr(ds): display_default_indexes = _get_boolean_with_default( "display_default_indexes", False ) - if display_default_indexes: - xindexes = ds.xindexes - else: - xindexes = nondefault_indexes(ds.xindexes) + xindexes = filter_nondefault_indexes( + _get_indexes_dict(ds.xindexes), not display_default_indexes + ) if xindexes: - summary.append(indexes_repr(xindexes, col_width=col_width, max_rows=max_rows)) + summary.append(indexes_repr(xindexes, max_rows=max_rows)) if 
ds.attrs: summary.append(attrs_repr(ds.attrs, max_rows=max_rows)) @@ -697,9 +718,7 @@ def dataset_repr(ds): def diff_dim_summary(a, b): if a.dims != b.dims: - return "Differing dimensions:\n ({}) != ({})".format( - dim_summary(a), dim_summary(b) - ) + return f"Differing dimensions:\n ({dim_summary(a)}) != ({dim_summary(b)})" else: return "" @@ -826,9 +845,7 @@ def _compat_to_str(compat): def diff_array_repr(a, b, compat): # used for DataArray, Variable and IndexVariable summary = [ - "Left and right {} objects are not {}".format( - type(a).__name__, _compat_to_str(compat) - ) + f"Left and right {type(a).__name__} objects are not {_compat_to_str(compat)}" ] summary.append(diff_dim_summary(a, b)) @@ -859,9 +876,7 @@ def diff_array_repr(a, b, compat): def diff_dataset_repr(a, b, compat): summary = [ - "Left and right {} objects are not {}".format( - type(a).__name__, _compat_to_str(compat) - ) + f"Left and right {type(a).__name__} objects are not {_compat_to_str(compat)}" ] col_width = _calculate_col_width(set(list(a.variables) + list(b.variables))) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 55fe103d41e..9894a4a4daf 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -236,6 +236,10 @@ def __getitem__(self, key): key = key[0] return self.values[key] + def to_index(self) -> pd.Index: + # could be pd.RangeIndex? + return pd.Index(np.arange(self.size)) + def copy(self, deep: bool = True, data: Any = None): raise NotImplementedError @@ -381,7 +385,7 @@ def is_unique_and_monotonic(self) -> bool: @property def group_as_index(self) -> pd.Index: if self._group_as_index is None: - self._group_as_index = safe_cast_to_index(self.group1d) + self._group_as_index = self.group1d.to_index() return self._group_as_index @@ -622,6 +626,7 @@ def _resolve_group(obj: T_Xarray, group: T_Group | Hashable) -> T_Group: (group_dim,) = group.dims if len(group) != obj.sizes[group_dim]: raise ValueError(error_msg) + newgroup = DataArray(group) else: if not hashable(group): @@ -883,6 +888,21 @@ def _binary_op(self, other, f, reflexive=False): group = group.where(~mask, drop=True) codes = codes.where(~mask, drop=True).astype(int) + # if other is dask-backed, that's a hint that the + # "expanded" dataset is too big to hold in memory. + # this can be the case when `other` was read from disk + # and contains our lazy indexing classes + # We need to check for dask-backed Datasets + # so utils.is_duck_dask_array does not work for this check + if obj.chunks and not other.chunks: + # TODO: What about datasets with some dask vars, and others not? 
+ # This handles dims other than `name`` + chunks = {k: v for k, v in obj.chunksizes.items() if k in other.dims} + # a chunk size of 1 seems reasonable since we expect individual elements of + # other to be repeated multiple times across the reduced dimension(s) + chunks[name] = 1 + other = other.chunk(chunks) + # codes are defined for coord, so we align `other` with `coord` # before indexing other, _ = align(other, coord, join="right", copy=False) @@ -964,6 +984,13 @@ def _flox_reduce( else: non_numeric = {} + if "min_count" in kwargs: + if kwargs["func"] not in ["sum", "prod"]: + raise TypeError("Received an unexpected keyword argument 'min_count'") + elif kwargs["min_count"] is None: + # set explicitly to avoid unncessarily accumulating count + kwargs["min_count"] = 0 + # weird backcompat # reducing along a unique indexed dimension with squeeze=True # should raise an error @@ -1090,15 +1117,15 @@ def quantile( desired quantile lies between two data points. The options sorted by their R type as summarized in the H&F paper [1]_ are: - 1. "inverted_cdf" (*) - 2. "averaged_inverted_cdf" (*) - 3. "closest_observation" (*) - 4. "interpolated_inverted_cdf" (*) - 5. "hazen" (*) - 6. "weibull" (*) + 1. "inverted_cdf" + 2. "averaged_inverted_cdf" + 3. "closest_observation" + 4. "interpolated_inverted_cdf" + 5. "hazen" + 6. "weibull" 7. "linear" (default) - 8. "median_unbiased" (*) - 9. "normal_unbiased" (*) + 8. "median_unbiased" + 9. "normal_unbiased" The first three methods are discontiuous. The following discontinuous variations of the default "linear" (7.) option are also available: @@ -1108,9 +1135,8 @@ def quantile( * "midpoint" * "nearest" - See :py:func:`numpy.quantile` or [1]_ for details. Methods marked with - an asterisk require numpy version 1.22 or newer. The "method" argument was - previously called "interpolation", renamed in accordance with numpy + See :py:func:`numpy.quantile` or [1]_ for details. The "method" argument + was previously called "interpolation", renamed in accordance with numpy version 1.22.0. keep_attrs : bool or None, default: None If True, the dataarray's attributes (`attrs`) will be copied from diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 93e9e535fe3..7de290f4e14 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -24,26 +24,66 @@ ) if TYPE_CHECKING: - from xarray.core.types import ErrorOptions, T_Index + from xarray.core.types import ErrorOptions, JoinOptions, T_Index from xarray.core.variable import Variable + IndexVars = dict[Any, "Variable"] class Index: - """Base class inherited by all xarray-compatible indexes. + """ + Base class inherited by all xarray-compatible indexes. - Do not use this class directly for creating index objects. + Do not use this class directly for creating index objects. Xarray indexes + are created exclusively from subclasses of ``Index``, mostly via Xarray's + public API like ``Dataset.set_xindex``. + + Every subclass must at least implement :py:meth:`Index.from_variables`. The + (re)implementation of the other methods of this base class is optional but + mostly required in order to support operations relying on indexes such as + label-based selection or alignment. + + The ``Index`` API closely follows the :py:meth:`Dataset` and + :py:meth:`DataArray` API, e.g., for an index to support ``.sel()`` it needs + to implement :py:meth:`Index.sel`, to support ``.stack()`` and + ``.unstack()`` it needs to implement :py:meth:`Index.stack` and + :py:meth:`Index.unstack`, etc. 
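To make the `Index` contract described above concrete, here is a deliberately small, hypothetical subclass. It is not part of this change; the class name and behaviour are illustrative only, implementing just the required `from_variables` factory plus `to_pandas_index`.

```python
# Minimal, illustrative sketch of a custom Index subclass (not part of this
# patch): it wraps a pandas.Index and implements only the required pieces.
import pandas as pd

from xarray.core.indexes import Index


class ToyPandasBackedIndex(Index):
    def __init__(self, index: pd.Index, dim):
        self.index = index
        self.dim = dim

    @classmethod
    def from_variables(cls, variables, *, options):
        # this toy index expects exactly one 1-D coordinate variable
        ((name, var),) = variables.items()
        return cls(pd.Index(var.values, name=name), var.dims[0])

    def to_pandas_index(self) -> pd.Index:
        return self.index
```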
+ When a method is not (re)implemented, depending on the case the + corresponding operation on a :py:meth:`Dataset` or :py:meth:`DataArray` + either will raise a ``NotImplementedError`` or will simply drop/pass/copy + the index from/to the result. + + Do not use this class directly for creating index objects. """ @classmethod def from_variables( - cls, + cls: type[T_Index], variables: Mapping[Any, Variable], *, options: Mapping[str, Any], - ) -> Index: + ) -> T_Index: + """Create a new index object from one or more coordinate variables. + + This factory method must be implemented in all subclasses of Index. + + The coordinate variables may be passed here in an arbitrary number and + order and each with arbitrary dimensions. It is the responsibility of + the index to check the consistency and validity of these coordinates. + + Parameters + ---------- + variables : dict-like + Mapping of :py:class:`Variable` objects holding the coordinate labels + to index. + + Returns + ------- + index : Index + A new Index object. + """ raise NotImplementedError() @classmethod @@ -53,20 +93,102 @@ def concat( dim: Hashable, positions: Iterable[Iterable[int]] | None = None, ) -> T_Index: + """Create a new index by concatenating one or more indexes of the same + type. + + Implementation is optional but required in order to support + ``concat``. Otherwise it will raise an error if the index needs to be + updated during the operation. + + Parameters + ---------- + indexes : sequence of Index objects + Indexes objects to concatenate together. All objects must be of the + same type. + dim : Hashable + Name of the dimension to concatenate along. + positions : None or list of integer arrays, optional + List of integer arrays which specifies the integer positions to which + to assign each dataset along the concatenated dimension. If not + supplied, objects are concatenated in the provided order. + + Returns + ------- + index : Index + A new Index object. + """ raise NotImplementedError() @classmethod - def stack(cls, variables: Mapping[Any, Variable], dim: Hashable) -> Index: + def stack( + cls: type[T_Index], variables: Mapping[Any, Variable], dim: Hashable + ) -> T_Index: + """Create a new index by stacking coordinate variables into a single new + dimension. + + Implementation is optional but required in order to support ``stack``. + Otherwise it will raise an error when trying to pass the Index subclass + as argument to :py:meth:`Dataset.stack`. + + Parameters + ---------- + variables : dict-like + Mapping of :py:class:`Variable` objects to stack together. + dim : Hashable + Name of the new, stacked dimension. + + Returns + ------- + index + A new Index object. + """ raise NotImplementedError( f"{cls!r} cannot be used for creating an index of stacked coordinates" ) def unstack(self) -> tuple[dict[Hashable, Index], pd.MultiIndex]: + """Unstack a (multi-)index into multiple (single) indexes. + + Implementation is optional but required in order to support unstacking + the coordinates from which this index has been built. + + Returns + ------- + indexes : tuple + A 2-length tuple where the 1st item is a dictionary of unstacked + Index objects and the 2nd item is a :py:class:`pandas.MultiIndex` + object used to unstack unindexed coordinate variables or data + variables. + """ raise NotImplementedError() def create_variables( self, variables: Mapping[Any, Variable] | None = None ) -> IndexVars: + """Maybe create new coordinate variables from this index. 
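A hedged usage sketch: `from_variables` is normally invoked for you when an index class is attached through the public API mentioned above. `ToyPandasBackedIndex` is the illustrative class from the previous sketch, and the `drop_indexes`/`set_xindex` calls are assumed to be the relevant public entry points.

```python
# Usage sketch, assuming the toy index class defined in the sketch above.
import xarray as xr

ds = xr.Dataset(coords={"x": [10, 20, 30]})
# replace the default index with the custom one; set_xindex calls
# ToyPandasBackedIndex.from_variables under the hood
ds2 = ds.drop_indexes("x").set_xindex("x", ToyPandasBackedIndex)
print(type(ds2.xindexes["x"]).__name__)  # ToyPandasBackedIndex
```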
+ + This method is useful if the index data can be reused as coordinate + variable data. It is often the case when the underlying index structure + has an array-like interface, like :py:class:`pandas.Index` objects. + + The variables given as argument (if any) are either returned as-is + (default behavior) or can be used to copy their metadata (attributes and + encoding) into the new returned coordinate variables. + + Note: the input variables may or may not have been filtered for this + index. + + Parameters + ---------- + variables : dict-like, optional + Mapping of :py:class:`Variable` objects. + + Returns + ------- + index_variables : dict-like + Dictionary of :py:class:`Variable` or :py:class:`IndexVariable` + objects. + """ if variables is not None: # pass through return dict(**variables) @@ -74,51 +196,212 @@ def create_variables( return {} def to_pandas_index(self) -> pd.Index: - """Cast this xarray index to a pandas.Index object or raise a TypeError - if this is not supported. + """Cast this xarray index to a pandas.Index object or raise a + ``TypeError`` if this is not supported. - This method is used by all xarray operations that expect/require a - pandas.Index object. + This method is used by all xarray operations that still rely on + pandas.Index objects. + By default it raises a ``TypeError``, unless it is re-implemented in + subclasses of Index. """ raise TypeError(f"{self!r} cannot be cast to a pandas.Index object") def isel( - self, indexers: Mapping[Any, int | slice | np.ndarray | Variable] - ) -> Index | None: + self: T_Index, indexers: Mapping[Any, int | slice | np.ndarray | Variable] + ) -> T_Index | None: + """Maybe returns a new index from the current index itself indexed by + positional indexers. + + This method should be re-implemented in subclasses of Index if the + wrapped index structure supports indexing operations. For example, + indexing a ``pandas.Index`` is pretty straightforward as it behaves very + much like an array. By contrast, it may be harder doing so for a + structure like a kd-tree that differs much from a simple array. + + If not re-implemented in subclasses of Index, this method returns + ``None``, i.e., calling :py:meth:`Dataset.isel` will either drop the + index in the resulting dataset or pass it unchanged if its corresponding + coordinate(s) are not indexed. + + Parameters + ---------- + indexers : dict + A dictionary of positional indexers as passed from + :py:meth:`Dataset.isel` and where the entries have been filtered + for the current index. + + Returns + ------- + maybe_index : Index + A new Index object or ``None``. + """ return None def sel(self, labels: dict[Any, Any]) -> IndexSelResult: + """Query the index with arbitrary coordinate label indexers. + + Implementation is optional but required in order to support label-based + selection. Otherwise it will raise an error when trying to call + :py:meth:`Dataset.sel` with labels for this index coordinates. + + Coordinate label indexers can be of many kinds, e.g., scalar, list, + tuple, array-like, slice, :py:class:`Variable`, :py:class:`DataArray`, etc. + It is the responsibility of the index to handle those indexers properly. + + Parameters + ---------- + labels : dict + A dictionary of coordinate label indexers passed from + :py:meth:`Dataset.sel` and where the entries have been filtered + for the current index. + + Returns + ------- + sel_results : :py:class:`IndexSelResult` + An index query result object that contains dimension positional indexers. 
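For the common pandas-backed case the cast described here is a cheap passthrough; a short usage sketch:

```python
# Usage sketch for to_pandas_index on a default (pandas-backed) index.
import xarray as xr

ds = xr.Dataset(coords={"x": [1, 2, 3]})
idx = ds.xindexes["x"]           # xarray PandasIndex object
pd_idx = idx.to_pandas_index()   # the wrapped pandas.Index
print(pd_idx)                    # e.g. Index([1, 2, 3], dtype='int64', name='x')
```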
+ It may also contain new indexes, coordinate variables, etc. + """ raise NotImplementedError(f"{self!r} doesn't support label-based selection") - def join(self: T_Index, other: T_Index, how: str = "inner") -> T_Index: + def join(self: T_Index, other: T_Index, how: JoinOptions = "inner") -> T_Index: + """Return a new index from the combination of this index with another + index of the same type. + + Implementation is optional but required in order to support alignment. + + Parameters + ---------- + other : Index + The other Index object to combine with this index. + join : str, optional + Method for joining the two indexes (see :py:func:`~xarray.align`). + + Returns + ------- + joined : Index + A new Index object. + """ raise NotImplementedError( f"{self!r} doesn't support alignment with inner/outer join method" ) def reindex_like(self: T_Index, other: T_Index) -> dict[Hashable, Any]: + """Query the index with another index of the same type. + + Implementation is optional but required in order to support alignment. + + Parameters + ---------- + other : Index + The other Index object used to query this index. + + Returns + ------- + dim_positional_indexers : dict + A dictionary where keys are dimension names and values are positional + indexers. + """ raise NotImplementedError(f"{self!r} doesn't support re-indexing labels") - def equals(self, other): # pragma: no cover + def equals(self: T_Index, other: T_Index) -> bool: + """Compare this index with another index of the same type. + + Implemenation is optional but required in order to support alignment. + + Parameters + ---------- + other : Index + The other Index object to compare with this object. + + Returns + ------- + is_equal : bool + ``True`` if the indexes are equal, ``False`` otherwise. + """ raise NotImplementedError() - def roll(self, shifts: Mapping[Any, int]) -> Index | None: + def roll(self: T_Index, shifts: Mapping[Any, int]) -> T_Index | None: + """Roll this index by an offset along one or more dimensions. + + This method can be re-implemented in subclasses of Index, e.g., when the + index can be itself indexed. + + If not re-implemented, this method returns ``None``, i.e., calling + :py:meth:`Dataset.roll` will either drop the index in the resulting + dataset or pass it unchanged if its corresponding coordinate(s) are not + rolled. + + Parameters + ---------- + shifts : mapping of hashable to int, optional + A dict with keys matching dimensions and values given + by integers to rotate each of the given dimensions, as passed + :py:meth:`Dataset.roll`. + + Returns + ------- + rolled : Index + A new index with rolled data. + """ return None def rename( - self, name_dict: Mapping[Any, Hashable], dims_dict: Mapping[Any, Hashable] - ) -> Index: - return self + self: T_Index, + name_dict: Mapping[Any, Hashable], + dims_dict: Mapping[Any, Hashable], + ) -> T_Index: + """Maybe update the index with new coordinate and dimension names. - def __copy__(self) -> Index: - return self._copy(deep=False) + This method should be re-implemented in subclasses of Index if it has + attributes that depend on coordinate or dimension names. - def __deepcopy__(self, memo: dict[int, Any] | None = None) -> Index: - return self._copy(deep=True, memo=memo) + By default (if not re-implemented), it returns the index itself. + + Warning: the input names are not filtered for this method, they may + correspond to any variable or dimension of a Dataset or a DataArray. 
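The `join`, `reindex_like` and `equals` hooks documented in this block are what alignment exercises; a small sketch using the default pandas-backed indexes:

```python
# Usage sketch: xr.align drives Index.join / Index.reindex_like / Index.equals.
import xarray as xr

a = xr.DataArray([1, 2, 3], dims="x", coords={"x": [0, 1, 2]})
b = xr.DataArray([10, 20, 30], dims="x", coords={"x": [1, 2, 3]})
a2, b2 = xr.align(a, b, join="inner")
print(a2.x.values)  # [1 2] -- the intersection of the two indexes
```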
+ + Parameters + ---------- + name_dict : dict-like + Mapping of current variable or coordinate names to the desired names, + as passed from :py:meth:`Dataset.rename_vars`. + dims_dict : dict-like + Mapping of current dimension names to the desired names, as passed + from :py:meth:`Dataset.rename_dims`. + + Returns + ------- + renamed : Index + Index with renamed attributes. + """ + return self def copy(self: T_Index, deep: bool = True) -> T_Index: + """Return a (deep) copy of this index. + + Implementation in subclasses of Index is optional. The base class + implements the default (deep) copy semantics. + + Parameters + ---------- + deep : bool, optional + If true (default), a copy of the internal structures + (e.g., wrapped index) is returned with the new object. + + Returns + ------- + index : Index + A new Index object. + """ return self._copy(deep=deep) + def __copy__(self: T_Index) -> T_Index: + return self.copy(deep=False) + + def __deepcopy__(self, memo: dict[int, Any] | None = None) -> Index: + return self._copy(deep=True, memo=memo) + def _copy( self: T_Index, deep: bool = True, memo: dict[int, Any] | None = None ) -> T_Index: @@ -131,7 +414,7 @@ def _copy( copied.__dict__.update(self.__dict__) return copied - def __getitem__(self, indexer: Any): + def __getitem__(self: T_Index, indexer: Any) -> T_Index: raise NotImplementedError() def _repr_inline_(self, max_width): @@ -1098,19 +1381,22 @@ def create_default_index_implicit( class Indexes(collections.abc.Mapping, Generic[T_PandasOrXarrayIndex]): - """Immutable proxy for Dataset or DataArrary indexes. + """Immutable proxy for Dataset or DataArray indexes. - Keys are coordinate names and values may correspond to either pandas or - xarray indexes. + It is a mapping where keys are coordinate names and values are either pandas + or xarray indexes. - Also provides some utility methods. + It also contains the indexed coordinate variables and provides some utility + methods. """ + _index_type: type[Index] | type[pd.Index] _indexes: dict[Any, T_PandasOrXarrayIndex] _variables: dict[Any, Variable] __slots__ = ( + "_index_type", "_indexes", "_variables", "_dims", @@ -1121,8 +1407,9 @@ class Indexes(collections.abc.Mapping, Generic[T_PandasOrXarrayIndex]): def __init__( self, - indexes: dict[Any, T_PandasOrXarrayIndex], - variables: dict[Any, Variable], + indexes: Mapping[Any, T_PandasOrXarrayIndex] | None = None, + variables: Mapping[Any, Variable] | None = None, + index_type: type[Index] | type[pd.Index] = Index, ): """Constructor not for public consumption. @@ -1131,11 +1418,33 @@ def __init__( indexes : dict Indexes held by this object. variables : dict - Indexed coordinate variables in this object. + Indexed coordinate variables in this object. Entries must + match those of `indexes`. + index_type : type + The type of all indexes, i.e., either :py:class:`xarray.indexes.Index` + or :py:class:`pandas.Index`. 
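As a usage sketch, the `Indexes` mapping can be grouped by the underlying index object, which is what the refreshed repr code earlier in this change relies on:

```python
# Usage sketch: Indexes maps coordinate names to index objects, and
# group_by_index() yields each index once together with its coordinate variables.
import xarray as xr

ds = xr.Dataset(coords={"x": [1, 2], "y": [3.0, 4.0, 5.0]})
for index, index_vars in ds.xindexes.group_by_index():
    print(tuple(index_vars), type(index).__name__)
# ('x',) PandasIndex
# ('y',) PandasIndex
```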
""" - self._indexes = indexes - self._variables = variables + if indexes is None: + indexes = {} + if variables is None: + variables = {} + + unmatched_keys = set(indexes) ^ set(variables) + if unmatched_keys: + raise ValueError( + f"unmatched keys found in indexes and variables: {unmatched_keys}" + ) + + if any(not isinstance(idx, index_type) for idx in indexes.values()): + index_type_str = f"{index_type.__module__}.{index_type.__name__}" + raise TypeError( + f"values of indexes must all be instances of {index_type_str}" + ) + + self._index_type = index_type + self._indexes = dict(**indexes) + self._variables = dict(**variables) self._dims: Mapping[Hashable, int] | None = None self.__coord_name_id: dict[Any, int] | None = None @@ -1283,7 +1592,7 @@ def to_pandas_indexes(self) -> Indexes[pd.Index]: elif isinstance(idx, Index): indexes[k] = idx.to_pandas_index() - return Indexes(indexes, self._variables) + return Indexes(indexes, self._variables, index_type=pd.Index) def copy_indexes( self, deep: bool = True, memo: dict[int, Any] | None = None @@ -1338,7 +1647,8 @@ def __getitem__(self, key) -> T_PandasOrXarrayIndex: return self._indexes[key] def __repr__(self): - return formatting.indexes_repr(self) + indexes = formatting._get_indexes_dict(self) + return formatting.indexes_repr(indexes) def default_indexes( @@ -1362,7 +1672,7 @@ def default_indexes( coord_names = set(coords) for name, var in coords.items(): - if name in dims: + if name in dims and var.ndim == 1: index, index_vars = create_default_index_implicit(var, coords) if set(index_vars) <= coord_names: indexes.update({k: index for k in index_vars}) @@ -1495,7 +1805,7 @@ def filter_indexes_from_coords( of coordinate names. """ - filtered_indexes: dict[Any, Index] = dict(**indexes) + filtered_indexes: dict[Any, Index] = dict(indexes) index_coord_names: dict[Hashable, set[Hashable]] = defaultdict(set) for name, idx in indexes.items(): diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 35a5261f248..acab9ccc60b 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -17,6 +17,7 @@ from xarray.core import duck_array_ops from xarray.core.nputils import NumpyVIndexAdapter from xarray.core.options import OPTIONS +from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array from xarray.core.pycompat import ( array_type, integer_types, @@ -1142,16 +1143,15 @@ def _arrayize_vectorized_indexer(indexer, shape): return VectorizedIndexer(tuple(new_key)) -def _dask_array_with_chunks_hint(array, chunks): - """Create a dask array using the chunks hint for dimensions of size > 1.""" - import dask.array as da +def _chunked_array_with_chunks_hint(array, chunks, chunkmanager): + """Create a chunked array using the chunks hint for dimensions of size > 1.""" if len(chunks) < array.ndim: raise ValueError("not enough chunks in hint") new_chunks = [] for chunk, size in zip(chunks, array.shape): new_chunks.append(chunk if size > 1 else (1,)) - return da.from_array(array, new_chunks) + return chunkmanager.from_array(array, new_chunks) def _logical_any(args): @@ -1165,8 +1165,11 @@ def _masked_result_drop_slice(key, data=None): new_keys = [] for k in key: if isinstance(k, np.ndarray): - if is_duck_dask_array(data): - new_keys.append(_dask_array_with_chunks_hint(k, chunks_hint)) + if is_chunked_array(data): + chunkmanager = get_chunked_array_type(data) + new_keys.append( + _chunked_array_with_chunks_hint(k, chunks_hint, chunkmanager) + ) elif isinstance(data, array_type("sparse")): import sparse diff --git 
a/xarray/core/merge.py b/xarray/core/merge.py index bf7288ad7ed..24b6ed0ba43 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -11,7 +11,6 @@ from xarray.core.duck_array_ops import lazy_array_equiv from xarray.core.indexes import ( Index, - Indexes, create_default_index_implicit, filter_indexes_from_coords, indexes_equal, @@ -34,7 +33,7 @@ tuple[DimsLike, ArrayLike, Mapping, Mapping], ] XarrayValue = Union[DataArray, Variable, VariableLike] - DatasetLike = Union[Dataset, Mapping[Any, XarrayValue]] + DatasetLike = Union[Dataset, Coordinates, Mapping[Any, XarrayValue]] CoercibleValue = Union[XarrayValue, pd.Series, pd.DataFrame] CoercibleMapping = Union[Dataset, Mapping[Any, CoercibleValue]] @@ -195,11 +194,11 @@ def _assert_prioritized_valid( def merge_collected( - grouped: dict[Hashable, list[MergeElement]], + grouped: dict[Any, list[MergeElement]], prioritized: Mapping[Any, MergeElement] | None = None, compat: CompatOptions = "minimal", combine_attrs: CombineAttrsOptions = "override", - equals: dict[Hashable, bool] | None = None, + equals: dict[Any, bool] | None = None, ) -> tuple[dict[Hashable, Variable], dict[Hashable, Index]]: """Merge dicts of variables, while resolving conflicts appropriately. @@ -306,22 +305,27 @@ def merge_collected( def collect_variables_and_indexes( - list_of_mappings: list[DatasetLike], + list_of_mappings: Iterable[DatasetLike], indexes: Mapping[Any, Any] | None = None, ) -> dict[Hashable, list[MergeElement]]: """Collect variables and indexes from list of mappings of xarray objects. - Mappings must either be Dataset objects, or have values of one of the - following types: + Mappings can be Dataset or Coordinates objects, in which case both + variables and indexes are extracted from it. + + It can also have values of one of the following types: - an xarray.Variable - a tuple `(dims, data[, attrs[, encoding]])` that can be converted in an xarray.Variable - or an xarray.DataArray If a mapping of indexes is given, those indexes are assigned to all variables - with a matching key/name. + with a matching key/name. For dimension variables with no matching index, a + default (pandas) index is assigned. DataArray indexes that don't match mapping + keys are also extracted. """ + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset @@ -338,8 +342,8 @@ def append_all(variables, indexes): append(name, variable, indexes.get(name)) for mapping in list_of_mappings: - if isinstance(mapping, Dataset): - append_all(mapping.variables, mapping._indexes) + if isinstance(mapping, (Coordinates, Dataset)): + append_all(mapping.variables, mapping.xindexes) continue for name, variable in mapping.items(): @@ -466,12 +470,13 @@ def coerce_pandas_values(objects: Iterable[CoercibleMapping]) -> list[DatasetLik List of Dataset or dictionary objects. Any inputs or values in the inputs that were pandas objects have been converted into native xarray objects. 
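The accepted mapping-value forms described above can be exercised through the public merge entry point; a hedged sketch:

```python
# Usage sketch of the value forms accepted by merge: a (dims, data) tuple,
# a named DataArray, and a Dataset contributing the indexed coordinate.
import numpy as np
import xarray as xr

merged = xr.merge(
    [
        {"a": ("x", np.arange(3))},                    # (dims, data) tuple
        xr.DataArray(np.ones(3), dims="x", name="b"),  # named DataArray
        xr.Dataset(coords={"x": [10, 20, 30]}),        # provides the "x" index
    ]
)
print(list(merged.data_vars), list(merged.indexes))  # ['a', 'b'] ['x']
```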
""" + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset out = [] for obj in objects: - if isinstance(obj, Dataset): + if isinstance(obj, (Dataset, Coordinates)): variables: DatasetLike = obj else: variables = {} @@ -556,56 +561,11 @@ def merge_coords( return variables, out_indexes -def merge_data_and_coords(data_vars, coords, compat="broadcast_equals", join="outer"): - """Used in Dataset.__init__.""" - indexes, coords = _create_indexes_from_coords(coords, data_vars) - objects = [data_vars, coords] - explicit_coords = coords.keys() - return merge_core( - objects, - compat, - join, - explicit_coords=explicit_coords, - indexes=Indexes(indexes, coords), - ) - - -def _create_indexes_from_coords(coords, data_vars=None): - """Maybe create default indexes from a mapping of coordinates. - - Return those indexes and updated coordinates. - """ - all_variables = dict(coords) - if data_vars is not None: - all_variables.update(data_vars) - - indexes = {} - updated_coords = {} - - # this is needed for backward compatibility: when a pandas multi-index - # is given as data variable, it is promoted as index / level coordinates - # TODO: depreciate this implicit behavior - index_vars = { - k: v - for k, v in all_variables.items() - if k in coords or isinstance(v, pd.MultiIndex) - } - - for name, obj in index_vars.items(): - variable = as_variable(obj, name=name) - - if variable.dims == (name,): - idx, idx_vars = create_default_index_implicit(variable, all_variables) - indexes.update({k: idx for k in idx_vars}) - updated_coords.update(idx_vars) - all_variables.update(idx_vars) - else: - updated_coords[name] = obj - - return indexes, updated_coords - - -def assert_valid_explicit_coords(variables, dims, explicit_coords): +def assert_valid_explicit_coords( + variables: Mapping[Any, Any], + dims: Mapping[Any, int], + explicit_coords: Iterable[Hashable], +) -> None: """Validate explicit coordinate names/dims. Raise a MergeError if an explicit coord shares a name with a dimension @@ -688,9 +648,10 @@ def merge_core( join: JoinOptions = "outer", combine_attrs: CombineAttrsOptions = "override", priority_arg: int | None = None, - explicit_coords: Sequence | None = None, + explicit_coords: Iterable[Hashable] | None = None, indexes: Mapping[Any, Any] | None = None, fill_value: object = dtypes.NA, + skip_align_args: list[int] | None = None, ) -> _MergeResult: """Core logic for merging labeled objects. @@ -716,6 +677,8 @@ def merge_core( may be cast to pandas.Index objects. fill_value : scalar, optional Value to use for newly missing values + skip_align_args : list of int, optional + Optional arguments in `objects` that are not included in alignment. 
Returns ------- @@ -737,10 +700,20 @@ def merge_core( _assert_compat_valid(compat) + objects = list(objects) + if skip_align_args is None: + skip_align_args = [] + + skip_align_objs = [(pos, objects.pop(pos)) for pos in skip_align_args] + coerced = coerce_pandas_values(objects) aligned = deep_align( coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value ) + + for pos, obj in skip_align_objs: + aligned.insert(pos, obj) + collected = collect_variables_and_indexes(aligned, indexes=indexes) prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat) variables, out_indexes = merge_collected( @@ -997,18 +970,23 @@ def merge( combine_nested combine_by_coords """ + + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset dict_like_objects = [] for obj in objects: - if not isinstance(obj, (DataArray, Dataset, dict)): + if not isinstance(obj, (DataArray, Dataset, Coordinates, dict)): raise TypeError( "objects must be an iterable containing only " "Dataset(s), DataArray(s), and dictionaries." ) - obj = obj.to_dataset(promote_attrs=True) if isinstance(obj, DataArray) else obj + if isinstance(obj, DataArray): + obj = obj.to_dataset(promote_attrs=True) + elif isinstance(obj, Coordinates): + obj = obj.to_dataset() dict_like_objects.append(obj) merge_result = merge_core( @@ -1035,7 +1013,7 @@ def dataset_merge_method( # method due for backwards compatibility # TODO: consider deprecating it? - if isinstance(overwrite_vars, Iterable) and not isinstance(overwrite_vars, str): + if not isinstance(overwrite_vars, str) and isinstance(overwrite_vars, Iterable): overwrite_vars = set(overwrite_vars) else: overwrite_vars = {overwrite_vars} diff --git a/xarray/core/missing.py b/xarray/core/missing.py index d7f0be5fa08..c6efaebc04c 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -15,7 +15,7 @@ from xarray.core.computation import apply_ufunc from xarray.core.duck_array_ops import datetime_to_numeric, push, timedelta_to_numeric from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.pycompat import is_duck_dask_array +from xarray.core.parallelcompat import get_chunked_array_type, is_chunked_array from xarray.core.types import Interp1dOptions, InterpOptions from xarray.core.utils import OrderedSet, is_scalar from xarray.core.variable import Variable, broadcast_variables @@ -66,9 +66,7 @@ def __call__(self, x): return self.f(x, **self.call_kwargs) def __repr__(self): - return "{type}: method={method}".format( - type=self.__class__.__name__, method=self.method - ) + return f"{self.__class__.__name__}: method={self.method}" class NumpyInterpolator(BaseInterpolator): @@ -639,7 +637,7 @@ def interp(var, indexes_coords, method: InterpOptions, **kwargs): var.transpose(*original_dims).data, x, destination, method, kwargs ) - result = Variable(new_dims, interped, attrs=var.attrs) + result = Variable(new_dims, interped, attrs=var.attrs, fastpath=True) # dimension of the output array out_dims: OrderedSet = OrderedSet() @@ -648,7 +646,8 @@ def interp(var, indexes_coords, method: InterpOptions, **kwargs): out_dims.update(indexes_coords[d][1].dims) else: out_dims.add(d) - result = result.transpose(*out_dims) + if len(out_dims) > 1: + result = result.transpose(*out_dims) return result @@ -693,8 +692,8 @@ def interp_func(var, x, new_x, method: InterpOptions, kwargs): else: func, kwargs = _get_interpolator_nd(method, **kwargs) - if is_duck_dask_array(var): - import dask.array as da + 
if is_chunked_array(var): + chunkmanager = get_chunked_array_type(var) ndim = var.ndim nconst = ndim - len(x) @@ -709,28 +708,24 @@ def interp_func(var, x, new_x, method: InterpOptions, kwargs): ] new_x_arginds = [item for pair in new_x_arginds for item in pair] - args = ( - var, - range(ndim), - *x_arginds, - *new_x_arginds, - ) + args = (var, range(ndim), *x_arginds, *new_x_arginds) - _, rechunked = da.unify_chunks(*args) + _, rechunked = chunkmanager.unify_chunks(*args) args = tuple(elem for pair in zip(rechunked, args[1::2]) for elem in pair) new_x = rechunked[1 + (len(rechunked) - 1) // 2 :] + new_x0_chunks = new_x[0].chunks + new_x0_shape = new_x[0].shape + new_x0_chunks_is_not_none = new_x0_chunks is not None new_axes = { - ndim + i: new_x[0].chunks[i] - if new_x[0].chunks is not None - else new_x[0].shape[i] + ndim + i: new_x0_chunks[i] if new_x0_chunks_is_not_none else new_x0_shape[i] for i in range(new_x[0].ndim) } # if useful, re-use localize for each chunk of new_x - localize = (method in ["linear", "nearest"]) and (new_x[0].chunks is not None) + localize = (method in ["linear", "nearest"]) and new_x0_chunks_is_not_none # scipy.interpolate.interp1d always forces to float. # Use the same check for blockwise as well: @@ -741,8 +736,8 @@ def interp_func(var, x, new_x, method: InterpOptions, kwargs): meta = var._meta - return da.blockwise( - _dask_aware_interpnd, + return chunkmanager.blockwise( + _chunked_aware_interpnd, out_ind, *args, interp_func=func, @@ -785,8 +780,8 @@ def _interpnd(var, x, new_x, func, kwargs): return rslt.reshape(rslt.shape[:-1] + new_x[0].shape) -def _dask_aware_interpnd(var, *coords, interp_func, interp_kwargs, localize=True): - """Wrapper for `_interpnd` through `blockwise` +def _chunked_aware_interpnd(var, *coords, interp_func, interp_kwargs, localize=True): + """Wrapper for `_interpnd` through `blockwise` for chunked arrays. 
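From the user's side, the chunked interpolation path above is reached through ordinary `interp` calls on chunked data; a hedged sketch (assuming dask and scipy are installed):

```python
# Usage sketch: interp on chunked data is routed through the chunk manager's
# unify_chunks/blockwise hooks rather than calling dask.array directly.
import numpy as np
import xarray as xr

da = xr.DataArray(
    np.sin(np.linspace(0, 10, 100)),
    dims="x",
    coords={"x": np.arange(100)},
).chunk({"x": 25})

out = da.interp(x=np.linspace(0, 99, 10))  # still a chunked (lazy) result
```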
The first half arrays in `coords` are original coordinates, the other half are destination coordinates diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py index 022de845c4c..3b8ddfe032d 100644 --- a/xarray/core/nanops.py +++ b/xarray/core/nanops.py @@ -6,6 +6,7 @@ from xarray.core import dtypes, nputils, utils from xarray.core.duck_array_ops import ( + astype, count, fillna, isnull, @@ -22,7 +23,7 @@ def _maybe_null_out(result, axis, mask, min_count=1): if axis is not None and getattr(result, "ndim", False): null_mask = (np.take(mask.shape, axis).prod() - mask.sum(axis) - min_count) < 0 dtype, fill_value = dtypes.maybe_promote(result.dtype) - result = where(null_mask, fill_value, result.astype(dtype)) + result = where(null_mask, fill_value, astype(result, dtype)) elif getattr(result, "dtype", None) not in dtypes.NAT_TYPES: null_mask = mask.size - mask.sum() @@ -140,7 +141,7 @@ def _nanvar_object(value, axis=None, ddof=0, keepdims=False, **kwargs): value_mean = _nanmean_ddof_object( ddof=0, value=value, axis=axis, keepdims=True, **kwargs ) - squared = (value.astype(value_mean.dtype) - value_mean) ** 2 + squared = (astype(value, value_mean.dtype) - value_mean) ** 2 return _nanmean_ddof_object(ddof, squared, axis=axis, keepdims=keepdims, **kwargs) diff --git a/xarray/core/parallel.py b/xarray/core/parallel.py index 2f8612c5a9b..07c3c606bf2 100644 --- a/xarray/core/parallel.py +++ b/xarray/core/parallel.py @@ -406,9 +406,7 @@ def _wrapper( new_layers: collections.defaultdict[str, dict[Any, Any]] = collections.defaultdict( dict ) - gname = "{}-{}".format( - dask.utils.funcname(func), dask.base.tokenize(npargs[0], args, kwargs) - ) + gname = f"{dask.utils.funcname(func)}-{dask.base.tokenize(npargs[0], args, kwargs)}" # map dims to list of chunk indexes ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()} diff --git a/xarray/core/parallelcompat.py b/xarray/core/parallelcompat.py new file mode 100644 index 00000000000..26efc5fc412 --- /dev/null +++ b/xarray/core/parallelcompat.py @@ -0,0 +1,645 @@ +""" +The code in this module is an experiment in going from N=1 to N=2 parallel computing frameworks in xarray. +It could later be used as the basis for a public interface allowing any N frameworks to interoperate with xarray, +but for now it is just a private experiment. +""" +from __future__ import annotations + +import functools +import sys +from abc import ABC, abstractmethod +from collections.abc import Iterable, Sequence +from importlib.metadata import EntryPoint, entry_points +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + TypeVar, +) + +import numpy as np + +from xarray.core.pycompat import is_chunked_array + +T_ChunkedArray = TypeVar("T_ChunkedArray") + +if TYPE_CHECKING: + from xarray.core.types import T_Chunks, T_NormalizedChunks + + +@functools.lru_cache(maxsize=1) +def list_chunkmanagers() -> dict[str, ChunkManagerEntrypoint]: + """ + Return a dictionary of available chunk managers and their ChunkManagerEntrypoint subclass objects. + + Returns + ------- + chunkmanagers : dict + Dictionary whose values are registered ChunkManagerEntrypoint subclass instances, and whose values + are the strings under which they are registered. + + Notes + ----- + # New selection mechanism introduced with Python 3.10. See GH6514. 
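The entry-point query performed by `list_chunkmanagers` above can be reproduced directly with `importlib.metadata`; the `"dask"` name shown below is only an example of what may be registered on a given installation.

```python
# Hedged sketch of the same entry-point discovery used above.
import sys
from importlib.metadata import entry_points

if sys.version_info >= (3, 10):
    eps = entry_points(group="xarray.chunkmanagers")
else:
    eps = entry_points().get("xarray.chunkmanagers", ())
print([ep.name for ep in eps])  # e.g. ['dask'] when dask is installed
```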
+ """ + if sys.version_info >= (3, 10): + entrypoints = entry_points(group="xarray.chunkmanagers") + else: + entrypoints = entry_points().get("xarray.chunkmanagers", ()) + + return load_chunkmanagers(entrypoints) + + +def load_chunkmanagers( + entrypoints: Sequence[EntryPoint], +) -> dict[str, ChunkManagerEntrypoint]: + """Load entrypoints and instantiate chunkmanagers only once.""" + + loaded_entrypoints = { + entrypoint.name: entrypoint.load() for entrypoint in entrypoints + } + + available_chunkmanagers = { + name: chunkmanager() + for name, chunkmanager in loaded_entrypoints.items() + if chunkmanager.available + } + return available_chunkmanagers + + +def guess_chunkmanager( + manager: str | ChunkManagerEntrypoint | None, +) -> ChunkManagerEntrypoint: + """ + Get namespace of chunk-handling methods, guessing from what's available. + + If the name of a specific ChunkManager is given (e.g. "dask"), then use that. + Else use whatever is installed, defaulting to dask if there are multiple options. + """ + + chunkmanagers = list_chunkmanagers() + + if manager is None: + if len(chunkmanagers) == 1: + # use the only option available + manager = next(iter(chunkmanagers.keys())) + else: + # default to trying to use dask + manager = "dask" + + if isinstance(manager, str): + if manager not in chunkmanagers: + raise ValueError( + f"unrecognized chunk manager {manager} - must be one of: {list(chunkmanagers)}" + ) + + return chunkmanagers[manager] + elif isinstance(manager, ChunkManagerEntrypoint): + # already a valid ChunkManager so just pass through + return manager + else: + raise TypeError( + f"manager must be a string or instance of ChunkManagerEntrypoint, but received type {type(manager)}" + ) + + +def get_chunked_array_type(*args) -> ChunkManagerEntrypoint: + """ + Detects which parallel backend should be used for given set of arrays. + + Also checks that all arrays are of same chunking type (i.e. not a mix of cubed and dask). + """ + + # TODO this list is probably redundant with something inside xarray.apply_ufunc + ALLOWED_NON_CHUNKED_TYPES = {int, float, np.ndarray} + + chunked_arrays = [ + a + for a in args + if is_chunked_array(a) and type(a) not in ALLOWED_NON_CHUNKED_TYPES + ] + + # Asserts all arrays are the same type (or numpy etc.) + chunked_array_types = {type(a) for a in chunked_arrays} + if len(chunked_array_types) > 1: + raise TypeError( + f"Mixing chunked array types is not supported, but received multiple types: {chunked_array_types}" + ) + elif len(chunked_array_types) == 0: + raise TypeError("Expected a chunked array but none were found") + + # iterate over defined chunk managers, seeing if each recognises this array type + chunked_arr = chunked_arrays[0] + chunkmanagers = list_chunkmanagers() + selected = [ + chunkmanager + for chunkmanager in chunkmanagers.values() + if chunkmanager.is_chunked_array(chunked_arr) + ] + if not selected: + raise TypeError( + f"Could not find a Chunk Manager which recognises type {type(chunked_arr)}" + ) + elif len(selected) >= 2: + raise TypeError(f"Multiple ChunkManagers recognise type {type(chunked_arr)}") + else: + return selected[0] + + +class ChunkManagerEntrypoint(ABC, Generic[T_ChunkedArray]): + """ + Interface between a particular parallel computing framework and xarray. + + This abstract base class must be subclassed by libraries implementing chunked array types, and + registered via the ``chunkmanagers`` entrypoint. 
+ + Abstract methods on this class must be implemented, whereas non-abstract methods are only required in order to + enable a subset of xarray functionality, and by default will raise a ``NotImplementedError`` if called. + + Attributes + ---------- + array_cls + Type of the array class this parallel computing framework provides. + + Parallel frameworks need to provide an array class that supports the array API standard. + This attribute is used for array instance type checking at runtime. + """ + + array_cls: type[T_ChunkedArray] + available: bool = True + + @abstractmethod + def __init__(self) -> None: + """Used to set the array_cls attribute at import time.""" + raise NotImplementedError() + + def is_chunked_array(self, data: Any) -> bool: + """ + Check if the given object is an instance of this type of chunked array. + + Compares against the type stored in the array_cls attribute by default. + + Parameters + ---------- + data : Any + + Returns + ------- + is_chunked : bool + + See Also + -------- + dask.is_dask_collection + """ + return isinstance(data, self.array_cls) + + @abstractmethod + def chunks(self, data: T_ChunkedArray) -> T_NormalizedChunks: + """ + Return the current chunks of the given array. + + Returns chunks explicitly as a tuple of tuple of ints. + + Used internally by xarray objects' .chunks and .chunksizes properties. + + Parameters + ---------- + data : chunked array + + Returns + ------- + chunks : tuple[tuple[int, ...], ...] + + See Also + -------- + dask.array.Array.chunks + cubed.Array.chunks + """ + raise NotImplementedError() + + @abstractmethod + def normalize_chunks( + self, + chunks: T_Chunks | T_NormalizedChunks, + shape: tuple[int, ...] | None = None, + limit: int | None = None, + dtype: np.dtype | None = None, + previous_chunks: T_NormalizedChunks | None = None, + ) -> T_NormalizedChunks: + """ + Normalize given chunking pattern into an explicit tuple of tuples representation. + + Exposed primarily because different chunking backends may want to make different decisions about how to + automatically chunk along dimensions not given explicitly in the input chunks. + + Called internally by xarray.open_dataset. + + Parameters + ---------- + chunks : tuple, int, dict, or string + The chunks to be normalized. + shape : Tuple[int] + The shape of the array + limit : int (optional) + The maximum block size to target in bytes, + if freedom is given to choose + dtype : np.dtype + previous_chunks : Tuple[Tuple[int]], optional + Chunks from a previous array that we should use for inspiration when + rechunking dimensions automatically. + + See Also + -------- + dask.array.core.normalize_chunks + """ + raise NotImplementedError() + + @abstractmethod + def from_array( + self, data: np.ndarray, chunks: T_Chunks, **kwargs + ) -> T_ChunkedArray: + """ + Create a chunked array from a non-chunked numpy-like array. + + Generally input should have a ``.shape``, ``.ndim``, ``.dtype`` and support numpy-style slicing. + + Called when the .chunk method is called on an xarray object that is not already chunked. + Also called within open_dataset (when chunks is not None) to create a chunked array from + an xarray lazily indexed array. + + Parameters + ---------- + data : array_like + chunks : int, tuple + How to chunk the array. + + See Also + -------- + dask.array.from_array + cubed.from_array + """ + raise NotImplementedError() + + def rechunk( + self, + data: T_ChunkedArray, + chunks: T_NormalizedChunks | tuple[int, ...] 
| T_Chunks, + **kwargs, + ) -> T_ChunkedArray: + """ + Changes the chunking pattern of the given array. + + Called when the .chunk method is called on an xarray object that is already chunked. + + Parameters + ---------- + data : dask array + Array to be rechunked. + chunks : int, tuple, dict or str, optional + The new block dimensions to create. -1 indicates the full size of the + corresponding dimension. Default is "auto" which automatically + determines chunk sizes. + + Returns + ------- + chunked array + + See Also + -------- + dask.array.Array.rechunk + cubed.Array.rechunk + """ + return data.rechunk(chunks, **kwargs) # type: ignore[attr-defined] + + @abstractmethod + def compute(self, *data: T_ChunkedArray | Any, **kwargs) -> tuple[np.ndarray, ...]: + """ + Computes one or more chunked arrays, returning them as eager numpy arrays. + + Called anytime something needs to computed, including multiple arrays at once. + Used by `.compute`, `.persist`, `.values`. + + Parameters + ---------- + *data : object + Any number of objects. If an object is an instance of the chunked array type, it is computed + and the in-memory result returned as a numpy array. All other types should be passed through unchanged. + + Returns + ------- + objs + The input, but with all chunked arrays now computed. + + See Also + -------- + dask.compute + cubed.compute + """ + raise NotImplementedError() + + @property + def array_api(self) -> Any: + """ + Return the array_api namespace following the python array API standard. + + See https://data-apis.org/array-api/latest/ . Currently used to access the array API function + ``full_like``, which is called within the xarray constructors ``xarray.full_like``, ``xarray.ones_like``, + ``xarray.zeros_like``, etc. + + See Also + -------- + dask.array + cubed.array_api + """ + raise NotImplementedError() + + def reduction( + self, + arr: T_ChunkedArray, + func: Callable, + combine_func: Callable | None = None, + aggregate_func: Callable | None = None, + axis: int | Sequence[int] | None = None, + dtype: np.dtype | None = None, + keepdims: bool = False, + ) -> T_ChunkedArray: + """ + A general version of array reductions along one or more axes. + + Used inside some reductions like nanfirst, which is used by ``groupby.first``. + + Parameters + ---------- + arr : chunked array + Data to be reduced along one or more axes. + func : Callable(x_chunk, axis, keepdims) + First function to be executed when resolving the dask graph. + This function is applied in parallel to all original chunks of x. + See below for function parameters. + combine_func : Callable(x_chunk, axis, keepdims), optional + Function used for intermediate recursive aggregation (see + split_every below). If omitted, it defaults to aggregate_func. + aggregate_func : Callable(x_chunk, axis, keepdims) + Last function to be executed, producing the final output. It is always invoked, even when the reduced + Array counts a single chunk along the reduced axes. + axis : int or sequence of ints, optional + Axis or axes to aggregate upon. If omitted, aggregate along all axes. + dtype : np.dtype + data type of output. This argument was previously optional, but + leaving as ``None`` will now raise an exception. + keepdims : boolean, optional + Whether the reduction function should preserve the reduced axes, + leaving them at size ``output_size``, or remove them. 
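As a hedged sketch (not xarray's actual DaskManager), one way a dask-backed manager could satisfy the `reduction` hook documented here is by delegating to `dask.array.reduction`:

```python
# Illustrative only: a dask-flavoured implementation of the reduction hook.
import dask.array as da


def dask_style_reduction(arr, func, combine_func=None, aggregate_func=None,
                         axis=None, dtype=None, keepdims=False):
    return da.reduction(
        arr,
        chunk=func,                # applied to every original chunk
        combine=combine_func,      # intermediate tree-combine step
        aggregate=aggregate_func,  # final step producing the result
        axis=axis,
        dtype=dtype,
        keepdims=keepdims,
    )
```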
+ + Returns + ------- + chunked array + + See Also + -------- + dask.array.reduction + cubed.core.reduction + """ + raise NotImplementedError() + + @abstractmethod + def apply_gufunc( + self, + func: Callable, + signature: str, + *args: Any, + axes: Sequence[tuple[int, ...]] | None = None, + keepdims: bool = False, + output_dtypes: Sequence[np.typing.DTypeLike] | None = None, + vectorize: bool | None = None, + **kwargs, + ): + """ + Apply a generalized ufunc or similar python function to arrays. + + ``signature`` determines if the function consumes or produces core + dimensions. The remaining dimensions in given input arrays (``*args``) + are considered loop dimensions and are required to broadcast + naturally against each other. + + In other terms, this function is like ``np.vectorize``, but for + the blocks of chunked arrays. If the function itself shall also + be vectorized use ``vectorize=True`` for convenience. + + Called inside ``xarray.apply_ufunc``, which is called internally for most xarray operations. + Therefore this method must be implemented for the vast majority of xarray computations to be supported. + + Parameters + ---------- + func : callable + Function to call like ``func(*args, **kwargs)`` on input arrays + (``*args``) that returns an array or tuple of arrays. If multiple + arguments with non-matching dimensions are supplied, this function is + expected to vectorize (broadcast) over axes of positional arguments in + the style of NumPy universal functions [1]_ (if this is not the case, + set ``vectorize=True``). If this function returns multiple outputs, + ``output_core_dims`` has to be set as well. + signature: string + Specifies what core dimensions are consumed and produced by ``func``. + According to the specification of numpy.gufunc signature [2]_ + *args : numeric + Input arrays or scalars to the callable function. + axes: List of tuples, optional, keyword only + A list of tuples with indices of axes a generalized ufunc should operate on. + For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for + matrix multiplication, the base elements are two-dimensional matrices + and these are taken to be stored in the two last axes of each argument. The + corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``. + For simplicity, for generalized ufuncs that operate on 1-dimensional arrays + (vectors), a single integer is accepted instead of a single-element tuple, + and for generalized ufuncs for which all outputs are scalars, the output + tuples can be omitted. + keepdims: bool, optional, keyword only + If this is set to True, axes which are reduced over will be left in the result as + a dimension with size one, so that the result will broadcast correctly against the + inputs. This option can only be used for generalized ufuncs that operate on inputs + that all have the same number of core dimensions and with outputs that have no core + dimensions , i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``. + If used, the location of the dimensions in the output can be controlled with axes + and axis. + output_dtypes : Optional, dtype or list of dtypes, keyword only + Valid numpy dtype specification or list thereof. + If not given, a call of ``func`` with a small set of data + is performed in order to try to automatically determine the + output dtypes. + vectorize: bool, keyword only + If set to ``True``, ``np.vectorize`` is applied to ``func`` for + convenience. Defaults to ``False``. 
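From user code, the most common way to reach the `apply_gufunc` hook being documented here is `xr.apply_ufunc` with `dask="parallelized"` on chunked inputs; a hedged sketch:

```python
# Usage sketch: apply_ufunc with dask="parallelized" dispatches to the
# chunk manager's apply_gufunc hook. Assumes dask is installed.
import numpy as np
import xarray as xr

arr = xr.DataArray(np.random.rand(4, 6), dims=("x", "y")).chunk({"x": 2})
out = xr.apply_ufunc(
    np.mean,
    arr,
    input_core_dims=[["y"]],
    kwargs={"axis": -1},
    dask="parallelized",
    output_dtypes=[float],
)
print(out.dims, out.chunks)  # ('x',) ((2, 2),)
```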
+ **kwargs : dict + Extra keyword arguments to pass to `func` + + Returns + ------- + Single chunked array or tuple of chunked arrays + + See Also + -------- + dask.array.gufunc.apply_gufunc + cubed.apply_gufunc + + References + ---------- + .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html + .. [2] https://docs.scipy.org/doc/numpy/reference/c-api/generalized-ufuncs.html + """ + raise NotImplementedError() + + def map_blocks( + self, + func: Callable, + *args: Any, + dtype: np.typing.DTypeLike | None = None, + chunks: tuple[int, ...] | None = None, + drop_axis: int | Sequence[int] | None = None, + new_axis: int | Sequence[int] | None = None, + **kwargs, + ): + """ + Map a function across all blocks of a chunked array. + + Called in elementwise operations, but notably not (currently) called within xarray.map_blocks. + + Parameters + ---------- + func : callable + Function to apply to every block in the array. + If ``func`` accepts ``block_info=`` or ``block_id=`` + as keyword arguments, these will be passed dictionaries + containing information about input and output chunks/arrays + during computation. See examples for details. + args : dask arrays or other objects + dtype : np.dtype, optional + The ``dtype`` of the output array. It is recommended to provide this. + If not provided, will be inferred by applying the function to a small + set of fake data. + chunks : tuple, optional + Chunk shape of resulting blocks if the function does not preserve + shape. If not provided, the resulting array is assumed to have the same + block structure as the first input array. + drop_axis : number or iterable, optional + Dimensions lost by the function. + new_axis : number or iterable, optional + New dimensions created by the function. Note that these are applied + after ``drop_axis`` (if present). + **kwargs : + Other keyword arguments to pass to function. Values must be constants + (not dask.arrays) + + See Also + -------- + dask.array.map_blocks + cubed.map_blocks + """ + raise NotImplementedError() + + def blockwise( + self, + func: Callable, + out_ind: Iterable, + *args: Any, # can't type this as mypy assumes args are all same type, but dask blockwise args alternate types + adjust_chunks: dict[Any, Callable] | None = None, + new_axes: dict[Any, int] | None = None, + align_arrays: bool = True, + **kwargs, + ): + """ + Tensor operation: Generalized inner and outer products. + + A broad class of blocked algorithms and patterns can be specified with a + concise multi-index notation. The ``blockwise`` function applies an in-memory + function across multiple blocks of multiple inputs in a variety of ways. + Many chunked array operations are special cases of blockwise including + elementwise, broadcasting, reductions, tensordot, and transpose. + + Currently only called explicitly in xarray when performing multidimensional interpolation. + + Parameters + ---------- + func : callable + Function to apply to individual tuples of blocks + out_ind : iterable + Block pattern of the output, something like 'ijk' or (1, 2, 3) + *args : sequence of Array, index pairs + You may also pass literal arguments, accompanied by None index + e.g. 
(x, 'ij', y, 'jk', z, 'i', some_literal, None) + **kwargs : dict + Extra keyword arguments to pass to function + adjust_chunks : dict + Dictionary mapping index to function to be applied to chunk sizes + new_axes : dict, keyword only + New indexes and their dimension lengths + align_arrays: bool + Whether or not to align chunks along equally sized dimensions when + multiple arrays are provided. This allows for larger chunks in some + arrays to be broken into smaller ones that match chunk sizes in other + arrays such that they are compatible for block function mapping. If + this is false, then an error will be thrown if arrays do not already + have the same number of blocks in each dimension. + + See Also + -------- + dask.array.blockwise + cubed.core.blockwise + """ + raise NotImplementedError() + + def unify_chunks( + self, + *args: Any, # can't type this as mypy assumes args are all same type, but dask unify_chunks args alternate types + **kwargs, + ) -> tuple[dict[str, T_NormalizedChunks], list[T_ChunkedArray]]: + """ + Unify chunks across a sequence of arrays. + + Called by xarray.unify_chunks. + + Parameters + ---------- + *args: sequence of Array, index pairs + Sequence like (x, 'ij', y, 'jk', z, 'i') + + See Also + -------- + dask.array.core.unify_chunks + cubed.core.unify_chunks + """ + raise NotImplementedError() + + def store( + self, + sources: T_ChunkedArray | Sequence[T_ChunkedArray], + targets: Any, + **kwargs: dict[str, Any], + ): + """ + Store chunked arrays in array-like objects, overwriting data in target. + + This stores chunked arrays into object that supports numpy-style setitem + indexing (e.g. a Zarr Store). Allows storing values chunk by chunk so that it does not have to + fill up memory. For best performance you likely want to align the block size of + the storage target with the block size of your array. + + Used when writing to any registered xarray I/O backend. + + Parameters + ---------- + sources: Array or collection of Arrays + targets: array-like or collection of array-likes + These should support setitem syntax ``target[10:20] = ...``. + If sources is a single item, targets must be a single item; if sources is a + collection of arrays, targets must be a matching collection. + kwargs: + Parameters passed to compute/persist (only used if compute=True) + + See Also + -------- + dask.array.store + cubed.store + """ + raise NotImplementedError() diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 4a3f3638d14..9af5d693170 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -12,7 +12,7 @@ integer_types = (int, np.integer) if TYPE_CHECKING: - ModType = Literal["dask", "pint", "cupy", "sparse"] + ModType = Literal["dask", "pint", "cupy", "sparse", "cubed"] DuckArrayTypes = tuple[type[Any], ...] # TODO: improve this? 
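As a rough illustration of the ``store`` pattern described above (dask shown; any object supporting numpy-style ``__setitem__`` can act as the target):

import dask.array as da
import numpy as np

source = da.arange(12, chunks=4)
target = np.zeros(12)
# values are written chunk by chunk, so the whole array never has to fit in memory
da.store(source, target)
assert (target == np.arange(12)).all()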
maybe Generic @@ -30,7 +30,7 @@ class DuckArrayModule: available: bool def __init__(self, mod: ModType) -> None: - duck_array_module: ModuleType | None = None + duck_array_module: ModuleType | None duck_array_version: Version duck_array_type: DuckArrayTypes try: @@ -45,10 +45,12 @@ def __init__(self, mod: ModType) -> None: duck_array_type = (duck_array_module.ndarray,) elif mod == "sparse": duck_array_type = (duck_array_module.SparseArray,) + elif mod == "cubed": + duck_array_type = (duck_array_module.Array,) else: raise NotImplementedError - except ImportError: # pragma: no cover + except (ImportError, AttributeError): # pragma: no cover duck_array_module = None duck_array_version = Version("0.0.0") duck_array_type = () @@ -59,14 +61,26 @@ def __init__(self, mod: ModType) -> None: self.available = duck_array_module is not None +_cached_duck_array_modules: dict[ModType, DuckArrayModule] = {} + + +def _get_cached_duck_array_module(mod: ModType) -> DuckArrayModule: + if mod not in _cached_duck_array_modules: + duckmod = DuckArrayModule(mod) + _cached_duck_array_modules[mod] = duckmod + return duckmod + else: + return _cached_duck_array_modules[mod] + + def array_type(mod: ModType) -> DuckArrayTypes: """Quick wrapper to get the array class of the module.""" - return DuckArrayModule(mod).type + return _get_cached_duck_array_module(mod).type def mod_version(mod: ModType) -> Version: """Quick wrapper to get the version of the module.""" - return DuckArrayModule(mod).version + return _get_cached_duck_array_module(mod).version def is_dask_collection(x): @@ -81,5 +95,9 @@ def is_duck_dask_array(x): return is_duck_array(x) and is_dask_collection(x) +def is_chunked_array(x) -> bool: + return is_duck_dask_array(x) or (is_duck_array(x) and hasattr(x, "chunks")) + + def is_0d_dask_array(x): return is_duck_dask_array(x) and is_scalar(x) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 7eb4e9c7687..916fabe42ac 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -158,9 +158,9 @@ def method(self, keep_attrs=None, **kwargs): return method def _mean(self, keep_attrs, **kwargs): - result = self.sum(keep_attrs=False, **kwargs) / self.count( - keep_attrs=False - ).astype(self.obj.dtype, copy=False) + result = self.sum(keep_attrs=False, **kwargs) / duck_array_ops.astype( + self.count(keep_attrs=False), dtype=self.obj.dtype, copy=False + ) if keep_attrs: result.attrs = self.obj.attrs return result diff --git a/xarray/core/rolling_exp.py b/xarray/core/rolling_exp.py index 91edd3acb7c..587fed6617d 100644 --- a/xarray/core/rolling_exp.py +++ b/xarray/core/rolling_exp.py @@ -4,7 +4,6 @@ from typing import Any, Generic import numpy as np -from packaging.version import Version from xarray.core.options import _get_keep_attrs from xarray.core.pdcompat import count_not_none @@ -37,10 +36,6 @@ def move_exp_nansum(array, *, axis, alpha): raise TypeError("rolling_exp is not currently supported for dask-like arrays") import numbagg - # numbagg <= 0.2.0 did not have a __version__ attribute - if Version(getattr(numbagg, "__version__", "0.1.0")) < Version("0.2.0"): - raise ValueError("`rolling_exp(...).sum() requires numbagg>=0.2.1.") - return numbagg.move_exp_nansum(array, axis=axis, alpha=alpha) diff --git a/xarray/core/types.py b/xarray/core/types.py index 0f11b16b003..8f00ed05cdb 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -1,12 +1,14 @@ from __future__ import annotations import datetime -from collections.abc import Hashable, Iterable, Sequence +import sys +from 
collections.abc import Hashable, Iterable, Iterator, Mapping, Sequence from typing import ( TYPE_CHECKING, Any, Callable, Literal, + Protocol, SupportsIndex, TypeVar, Union, @@ -14,18 +16,31 @@ import numpy as np import pandas as pd -from packaging.version import Version + +try: + if sys.version_info >= (3, 11): + from typing import Self + else: + from typing_extensions import Self +except ImportError: + if TYPE_CHECKING: + raise + else: + Self: Any = None if TYPE_CHECKING: from numpy._typing import _SupportsDType from numpy.typing import ArrayLike from xarray.backends.common import BackendEntrypoint + from xarray.core.alignment import Aligner from xarray.core.common import AbstractArray, DataWithCoords + from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.groupby import DataArrayGroupBy, GroupBy - from xarray.core.indexes import Index + from xarray.core.indexes import Index, Indexes + from xarray.core.utils import Frozen from xarray.core.variable import Variable try: @@ -33,18 +48,15 @@ except ImportError: DaskArray = np.ndarray # type: ignore - # TODO: Turn on when https://github.com/python/mypy/issues/11871 is fixed. - # Can be uncommented if using pyright though. - # import sys + try: + from cubed import Array as CubedArray + except ImportError: + CubedArray = np.ndarray - # try: - # if sys.version_info >= (3, 11): - # from typing import Self - # else: - # from typing_extensions import Self - # except ImportError: - # Self: Any = None - Self: Any = None + try: + from zarr.core import Array as ZarrArray + except ImportError: + ZarrArray = np.ndarray # Anything that can be coerced to a shape tuple _ShapeLike = Union[SupportsIndex, Sequence[SupportsIndex]] @@ -79,14 +91,66 @@ CFTimeDatetime = Any DatetimeLike = Union[pd.Timestamp, datetime.datetime, np.datetime64, CFTimeDatetime] else: - Self: Any = None DTypeLikeSave: Any = None +class Alignable(Protocol): + """Represents any Xarray type that supports alignment. + + It may be ``Dataset``, ``DataArray`` or ``Coordinates``. This protocol class + is needed since those types do not all have a common base class. + + """ + + @property + def dims(self) -> Frozen[Hashable, int] | tuple[Hashable, ...]: + ... + + @property + def sizes(self) -> Frozen[Hashable, int]: + ... + + @property + def xindexes(self) -> Indexes[Index]: + ... + + def _reindex_callback( + self, + aligner: Aligner, + dim_pos_indexers: dict[Hashable, Any], + variables: dict[Hashable, Variable], + indexes: dict[Hashable, Index], + fill_value: Any, + exclude_dims: frozenset[Hashable], + exclude_vars: frozenset[Hashable], + ) -> Self: + ... + + def _overwrite_indexes( + self, + indexes: Mapping[Any, Index], + variables: Mapping[Any, Variable] | None = None, + ) -> Self: + ... + + def __len__(self) -> int: + ... + + def __iter__(self) -> Iterator[Hashable]: + ... + + def copy( + self, + deep: bool = False, + ) -> Self: + ... + + T_Backend = TypeVar("T_Backend", bound="BackendEntrypoint") T_Dataset = TypeVar("T_Dataset", bound="Dataset") T_DataArray = TypeVar("T_DataArray", bound="DataArray") T_Variable = TypeVar("T_Variable", bound="Variable") +T_Coordinates = TypeVar("T_Coordinates", bound="Coordinates") T_Array = TypeVar("T_Array", bound="AbstractArray") T_Index = TypeVar("T_Index", bound="Index") @@ -95,6 +159,7 @@ # Maybe we rename this to T_Data or something less Fortran-y? 
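The ``Self`` import above follows the usual backport pattern; a small self-contained sketch of how such a return annotation behaves (toy class, assuming ``typing_extensions`` is available on Python < 3.11):

import sys

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self

class Box:
    def __init__(self, value: int) -> None:
        self.value = value

    def doubled(self) -> Self:
        # ``Self`` keeps the return type tied to the subclass for callers of subclasses
        return type(self)(self.value * 2)

assert Box(2).doubled().value == 4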
T_Xarray = TypeVar("T_Xarray", "DataArray", "Dataset") T_DataWithCoords = TypeVar("T_DataWithCoords", bound="DataWithCoords") +T_Alignable = TypeVar("T_Alignable", bound="Alignable") ScalarOrArray = Union["ArrayLike", np.generic, np.ndarray, "DaskArray"] DsCompatible = Union["Dataset", "DataArray", "Variable", "GroupBy", "ScalarOrArray"] @@ -105,6 +170,9 @@ Dims = Union[str, Iterable[Hashable], "ellipsis", None] OrderedDims = Union[str, Sequence[Union[Hashable, "ellipsis"]], "ellipsis", None] +T_Chunks = Union[int, dict[Any, Any], Literal["auto"], None] +T_NormalizedChunks = tuple[tuple[int, ...], ...] + ErrorOptions = Literal["raise", "ignore"] ErrorOptionsWithWarn = Literal["raise", "warn", "ignore"] @@ -179,27 +247,18 @@ ] -if Version(np.__version__) >= Version("1.22.0"): - QuantileMethods = Literal[ - "inverted_cdf", - "averaged_inverted_cdf", - "closest_observation", - "interpolated_inverted_cdf", - "hazen", - "weibull", - "linear", - "median_unbiased", - "normal_unbiased", - "lower", - "higher", - "midpoint", - "nearest", - ] -else: - QuantileMethods = Literal[ # type: ignore[misc] - "linear", - "lower", - "higher", - "midpoint", - "nearest", - ] +QuantileMethods = Literal[ + "inverted_cdf", + "averaged_inverted_cdf", + "closest_observation", + "interpolated_inverted_cdf", + "hazen", + "weibull", + "linear", + "median_unbiased", + "normal_unbiased", + "lower", + "higher", + "midpoint", + "nearest", +] diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 1c90a2410f2..bd0ca57f33c 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -536,8 +536,7 @@ def discard(self, value: T) -> None: # Additional methods def update(self, values: Iterable[T]) -> None: - for v in values: - self._d[v] = None + self._d.update(dict.fromkeys(values)) def __repr__(self) -> str: return f"{type(self).__name__}({list(self)!r})" @@ -1125,19 +1124,19 @@ def iterate_nested(nested_list): yield item -def contains_only_dask_or_numpy(obj) -> bool: - """Returns True if xarray object contains only numpy or dask arrays. +def contains_only_chunked_or_numpy(obj) -> bool: + """Returns True if xarray object contains only numpy arrays or chunked arrays (i.e. pure dask or cubed). Expects obj to be Dataset or DataArray""" from xarray.core.dataarray import DataArray - from xarray.core.pycompat import is_duck_dask_array + from xarray.core.pycompat import is_chunked_array if isinstance(obj, DataArray): obj = obj._to_temp_dataset() return all( [ - isinstance(var.data, np.ndarray) or is_duck_dask_array(var.data) + isinstance(var.data, np.ndarray) or is_chunked_array(var.data) for var in obj.variables.values() ] ) @@ -1202,3 +1201,66 @@ def emit_user_level_warning(message, category=None): """Emit a warning at the user level by inspecting the stack trace.""" stacklevel = find_stack_level() warnings.warn(message, category=category, stacklevel=stacklevel) + + +def consolidate_dask_from_array_kwargs( + from_array_kwargs: dict, + name: str | None = None, + lock: bool | None = None, + inline_array: bool | None = None, +) -> dict: + """ + Merge dask-specific kwargs with arbitrary from_array_kwargs dict. + + Temporary function, to be deleted once explicitly passing dask-specific kwargs to .chunk() is deprecated. 
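A short usage sketch of the consolidation rules this helper (together with ``_resolve_doubly_passed_kwarg`` below) implements; it is an internal utility and the values here are illustrative only.

from xarray.core.utils import consolidate_dask_from_array_kwargs

# explicitly passed kwargs are folded into the dict; unset ones receive their defaults
merged = consolidate_dask_from_array_kwargs({}, name="foo", inline_array=True)
assert merged == {"name": "foo", "lock": False, "inline_array": True}

# passing the same option both ways is ambiguous and raises
try:
    consolidate_dask_from_array_kwargs({"name": "foo"}, name="bar")
except ValueError:
    pass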
+ """ + + from_array_kwargs = _resolve_doubly_passed_kwarg( + from_array_kwargs, + kwarg_name="name", + passed_kwarg_value=name, + default=None, + err_msg_dict_name="from_array_kwargs", + ) + from_array_kwargs = _resolve_doubly_passed_kwarg( + from_array_kwargs, + kwarg_name="lock", + passed_kwarg_value=lock, + default=False, + err_msg_dict_name="from_array_kwargs", + ) + from_array_kwargs = _resolve_doubly_passed_kwarg( + from_array_kwargs, + kwarg_name="inline_array", + passed_kwarg_value=inline_array, + default=False, + err_msg_dict_name="from_array_kwargs", + ) + + return from_array_kwargs + + +def _resolve_doubly_passed_kwarg( + kwargs_dict: dict, + kwarg_name: str, + passed_kwarg_value: str | bool | None, + default: bool | None, + err_msg_dict_name: str, +) -> dict: + # if in kwargs_dict but not passed explicitly then just pass kwargs_dict through unaltered + if kwarg_name in kwargs_dict and passed_kwarg_value is None: + pass + # if passed explicitly but not in kwargs_dict then use that + elif kwarg_name not in kwargs_dict and passed_kwarg_value is not None: + kwargs_dict[kwarg_name] = passed_kwarg_value + # if in neither then use default + elif kwarg_name not in kwargs_dict and passed_kwarg_value is None: + kwargs_dict[kwarg_name] = default + # if in both then raise + else: + raise ValueError( + f"argument {kwarg_name} cannot be passed both as a keyword argument and within " + f"the {err_msg_dict_name} dictionary" + ) + + return kwargs_dict diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c19cb21cba2..c89545c43ae 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -7,12 +7,12 @@ import warnings from collections.abc import Hashable, Iterable, Mapping, Sequence from datetime import timedelta +from functools import partial from typing import TYPE_CHECKING, Any, Callable, Literal, NoReturn import numpy as np import pandas as pd from numpy.typing import ArrayLike -from packaging.version import Version import xarray as xr # only for Dataset and DataArray from xarray.core import common, dtypes, duck_array_ops, indexing, nputils, ops, utils @@ -26,10 +26,15 @@ as_indexable, ) from xarray.core.options import OPTIONS, _get_keep_attrs +from xarray.core.parallelcompat import ( + get_chunked_array_type, + guess_chunkmanager, +) from xarray.core.pycompat import ( array_type, integer_types, is_0d_dask_array, + is_chunked_array, is_duck_dask_array, ) from xarray.core.utils import ( @@ -54,6 +59,7 @@ BASIC_INDEXING_TYPES = integer_types + (slice,) if TYPE_CHECKING: + from xarray.core.parallelcompat import ChunkManagerEntrypoint from xarray.core.types import ( Dims, ErrorOptionsWithWarn, @@ -119,17 +125,15 @@ def as_variable(obj, name=None) -> Variable | IndexVariable: elif isinstance(obj, tuple): if isinstance(obj[1], DataArray): raise TypeError( - "Using a DataArray object to construct a variable is" + f"Variable {name!r}: Using a DataArray object to construct a variable is" " ambiguous, please extract the data using the .data property." ) try: obj = Variable(*obj) except (TypeError, ValueError) as error: - # use .format() instead of % because it handles tuples consistently raise error.__class__( - "Could not convert tuple of form " - "(dims, data[, attrs, encoding]): " - "{} to Variable.".format(obj) + f"Variable {name!r}: Could not convert tuple of form " + f"(dims, data[, attrs, encoding]): {obj} to Variable." 
) elif utils.is_scalar(obj): obj = Variable([], obj) @@ -148,18 +152,12 @@ def as_variable(obj, name=None) -> Variable | IndexVariable: obj = Variable(name, data, fastpath=True) else: raise TypeError( - "unable to convert object into a variable without an " + f"Variable {name!r}: unable to convert object into a variable without an " f"explicit list of dimensions: {obj!r}" ) - if name is not None and name in obj.dims: - # convert the Variable into an Index - if obj.ndim != 1: - raise MissingDimensionsError( - f"{name!r} has more than 1-dimension and the same name as one of its " - f"dimensions {obj.dims!r}. xarray disallows such variables because they " - "conflict with the coordinates used to label dimensions." - ) + if name is not None and name in obj.dims and obj.ndim == 1: + # automatically convert the Variable into an Index obj = obj.to_index_variable() return obj @@ -194,10 +192,10 @@ def _as_nanosecond_precision(data): nanosecond_precision_dtype = pd.DatetimeTZDtype("ns", dtype.tz) else: nanosecond_precision_dtype = "datetime64[ns]" - return data.astype(nanosecond_precision_dtype) + return duck_array_ops.astype(data, nanosecond_precision_dtype) elif dtype.kind == "m" and dtype != np.dtype("timedelta64[ns]"): utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="timedelta")) - return data.astype("timedelta64[ns]") + return duck_array_ops.astype(data, "timedelta64[ns]") else: return data @@ -232,7 +230,7 @@ def _possibly_convert_datetime_or_timedelta_index(data): return data -def as_compatible_data(data, fastpath=False): +def as_compatible_data(data, fastpath: bool = False): """Prepare and wrap data to put in a Variable. - If data does not have the necessary attributes, convert it to ndarray. @@ -274,8 +272,7 @@ def as_compatible_data(data, fastpath=False): mask = np.ma.getmaskarray(data) if mask.any(): dtype, fill_value = dtypes.maybe_promote(data.dtype) - data = np.asarray(data, dtype=dtype) - data[mask] = fill_value + data = duck_array_ops.where_method(data, ~mask, fill_value) else: data = np.asarray(data) @@ -368,7 +365,7 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): self.encoding = encoding @property - def dtype(self): + def dtype(self) -> np.dtype: """ Data-type of the array’s elements. @@ -380,7 +377,7 @@ def dtype(self): return self._data.dtype @property - def shape(self): + def shape(self) -> tuple[int, ...]: """ Tuple of array dimensions. 
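Several ``.astype`` method calls in this file are routed through ``duck_array_ops.astype`` so the cast dispatches on whatever duck array is wrapped instead of assuming numpy's method; a tiny sketch of the call shape (plain numpy shown, values illustrative):

import numpy as np
from xarray.core import duck_array_ops

data = np.arange(3, dtype="int32")
# equivalent to data.astype(...), but also works for other duck arrays xarray wraps
out = duck_array_ops.astype(data, dtype="float64", copy=False)
assert out.dtype == np.dtype("float64")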
@@ -533,8 +530,10 @@ def load(self, **kwargs): -------- dask.array.compute """ - if is_duck_dask_array(self._data): - self._data = as_compatible_data(self._data.compute(**kwargs)) + if is_chunked_array(self._data): + chunkmanager = get_chunked_array_type(self._data) + loaded_data, *_ = chunkmanager.compute(self._data, **kwargs) + self._data = as_compatible_data(loaded_data) elif isinstance(self._data, indexing.ExplicitlyIndexed): self._data = self._data.get_duck_array() elif not is_duck_array(self._data): @@ -670,7 +669,8 @@ def dims(self, value: str | Iterable[Hashable]) -> None: def _parse_dimensions(self, dims: str | Iterable[Hashable]) -> tuple[Hashable, ...]: if isinstance(dims, str): dims = (dims,) - dims = tuple(dims) + else: + dims = tuple(dims) if len(dims) != self.ndim: raise ValueError( f"dimensions {dims} must have the same length as the " @@ -1077,9 +1077,7 @@ def _copy( ndata = as_compatible_data(data) if self.shape != ndata.shape: raise ValueError( - "Data shape {} must match shape of object {}".format( - ndata.shape, self.shape - ) + f"Data shape {ndata.shape} must match shape of object {self.shape}" ) attrs = copy.deepcopy(self._attrs, memo) if deep else copy.copy(self._attrs) @@ -1166,8 +1164,10 @@ def chunk( | Mapping[Any, None | int | tuple[int, ...]] ) = {}, name: str | None = None, - lock: bool = False, - inline_array: bool = False, + lock: bool | None = None, + inline_array: bool | None = None, + chunked_array_type: str | ChunkManagerEntrypoint | None = None, + from_array_kwargs=None, **chunks_kwargs: Any, ) -> Variable: """Coerce this array's data into a dask array with the given chunks. @@ -1188,12 +1188,21 @@ def chunk( name : str, optional Used to generate the name for this array in the internal dask graph. Does not need not be unique. - lock : optional + lock : bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. - inline_array: optional + inline_array : bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. + chunked_array_type: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntrypoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict, optional + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. **chunks_kwargs : {dim: chunks, ...}, optional The keyword arguments form of ``chunks``. One of chunks or chunks_kwargs must be provided. @@ -1209,7 +1218,6 @@ def chunk( xarray.unify_chunks dask.array.from_array """ - import dask.array as da if chunks is None: warnings.warn( @@ -1220,6 +1228,8 @@ def chunk( chunks = {} if isinstance(chunks, (float, str, int, tuple, list)): + # TODO we shouldn't assume here that other chunkmanagers can handle these types + # TODO should we call normalize_chunks here? 
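From the user side, the new keywords would be exercised roughly like this, assuming dask is installed (the option values are illustrative):

import numpy as np
import xarray as xr

var = xr.Variable(("x", "y"), np.arange(12).reshape(3, 4))
# pick the chunk manager explicitly and forward extra options to its from_array method
chunked = var.chunk(
    {"x": 1},
    chunked_array_type="dask",
    from_array_kwargs={"inline_array": True},
)
assert chunked.chunks == ((1, 1, 1), (4,))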
pass # dask.array.from_array can handle these directly else: chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") @@ -1227,9 +1237,22 @@ def chunk( if utils.is_dict_like(chunks): chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()} + chunkmanager = guess_chunkmanager(chunked_array_type) + + if from_array_kwargs is None: + from_array_kwargs = {} + + # TODO deprecate passing these dask-specific arguments explicitly. In future just pass everything via from_array_kwargs + _from_array_kwargs = utils.consolidate_dask_from_array_kwargs( + from_array_kwargs, + name=name, + lock=lock, + inline_array=inline_array, + ) + data = self._data - if is_duck_dask_array(data): - data = data.rechunk(chunks) + if chunkmanager.is_chunked_array(data): + data = chunkmanager.rechunk(data, chunks) # type: ignore[arg-type] else: if isinstance(data, indexing.ExplicitlyIndexed): # Unambiguously handle array storage backends (like NetCDF4 and h5py) @@ -1244,17 +1267,13 @@ def chunk( data, indexing.OuterIndexer ) - # All of our lazily loaded backend array classes should use NumPy - # array operations. - kwargs = {"meta": np.ndarray} - else: - kwargs = {} - if utils.is_dict_like(chunks): - chunks = tuple(chunks.get(n, s) for n, s in enumerate(self.shape)) + chunks = tuple(chunks.get(n, s) for n, s in enumerate(data.shape)) - data = da.from_array( - data, chunks, name=name, lock=lock, inline_array=inline_array, **kwargs + data = chunkmanager.from_array( + data, + chunks, # type: ignore[arg-type] + **_from_array_kwargs, ) return self._replace(data=data) @@ -1266,7 +1285,8 @@ def to_numpy(self) -> np.ndarray: # TODO first attempt to call .to_numpy() once some libraries implement it if hasattr(data, "chunks"): - data = data.compute() + chunkmanager = get_chunked_array_type(data) + data, *_ = chunkmanager.compute(data) if isinstance(data, array_type("cupy")): data = data.get() # pint has to be imported dynamically as pint imports xarray @@ -1392,7 +1412,7 @@ def _shift_one_dim(self, dim, count, fill_value=dtypes.NA): pads = [(0, 0) if d != dim else dim_pad for d in self.dims] data = np.pad( - trimmed_data.astype(dtype), + duck_array_ops.astype(trimmed_data, dtype), pads, mode="constant", constant_values=fill_value, @@ -1541,7 +1561,7 @@ def pad( pad_option_kwargs["reflect_type"] = reflect_type array = np.pad( - self.data.astype(dtype, copy=False), + duck_array_ops.astype(self.data, dtype, copy=False), pad_width_by_index, mode=mode, **pad_option_kwargs, @@ -1817,15 +1837,20 @@ def _unstack_once( new_shape = tuple(list(reordered.shape[: len(other_dims)]) + new_dim_sizes) new_dims = reordered.dims[: len(other_dims)] + new_dim_names + create_template: Callable if fill_value is dtypes.NA: is_missing_values = math.prod(new_shape) > math.prod(self.shape) if is_missing_values: dtype, fill_value = dtypes.maybe_promote(self.dtype) + + create_template = partial(np.full_like, fill_value=fill_value) else: dtype = self.dtype fill_value = dtypes.get_fill_value(dtype) + create_template = np.empty_like else: dtype = self.dtype + create_template = partial(np.full_like, fill_value=fill_value) if sparse: # unstacking a dense multitindexed array to a sparse array @@ -1848,12 +1873,7 @@ def _unstack_once( ) else: - data = np.full_like( - self.data, - fill_value=fill_value, - shape=new_shape, - dtype=dtype, - ) + data = create_template(self.data, shape=new_shape, dtype=dtype) # Indexer is a list of lists of locations. Each list is the locations # on the new dimension. 
This is robust to the data being sparse; in that @@ -2073,12 +2093,13 @@ def concat( # twice variables = list(variables) first_var = variables[0] + first_var_dims = first_var.dims - arrays = [v.data for v in variables] + arrays = [v._data for v in variables] - if dim in first_var.dims: + if dim in first_var_dims: axis = first_var.get_axis_num(dim) - dims = first_var.dims + dims = first_var_dims data = duck_array_ops.concatenate(arrays, axis=axis) if positions is not None: # TODO: deprecate this option -- we don't need it for groupby @@ -2087,7 +2108,7 @@ def concat( data = duck_array_ops.take(data, indices, axis=axis) else: axis = 0 - dims = (dim,) + first_var.dims + dims = (dim,) + first_var_dims data = duck_array_ops.stack(arrays, axis=axis) attrs = merge_attrs( @@ -2096,12 +2117,12 @@ def concat( encoding = dict(first_var.encoding) if not shortcut: for var in variables: - if var.dims != first_var.dims: + if var.dims != first_var_dims: raise ValueError( - f"Variable has dimensions {list(var.dims)} but first Variable has dimensions {list(first_var.dims)}" + f"Variable has dimensions {list(var.dims)} but first Variable has dimensions {list(first_var_dims)}" ) - return cls(dims, data, attrs, encoding) + return cls(dims, data, attrs, encoding, fastpath=True) def equals(self, other, equiv=duck_array_ops.array_equiv): """True if two Variables have the same dimensions and values; @@ -2177,15 +2198,15 @@ def quantile( desired quantile lies between two data points. The options sorted by their R type as summarized in the H&F paper [1]_ are: - 1. "inverted_cdf" (*) - 2. "averaged_inverted_cdf" (*) - 3. "closest_observation" (*) - 4. "interpolated_inverted_cdf" (*) - 5. "hazen" (*) - 6. "weibull" (*) + 1. "inverted_cdf" + 2. "averaged_inverted_cdf" + 3. "closest_observation" + 4. "interpolated_inverted_cdf" + 5. "hazen" + 6. "weibull" 7. "linear" (default) - 8. "median_unbiased" (*) - 9. "normal_unbiased" (*) + 8. "median_unbiased" + 9. "normal_unbiased" The first three methods are discontiuous. The following discontinuous variations of the default "linear" (7.) option are also available: @@ -2199,8 +2220,6 @@ def quantile( was previously called "interpolation", renamed in accordance with numpy version 1.22.0. - (*) These methods require numpy version 1.22 or newer. - keep_attrs : bool, optional If True, the variable's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new @@ -2268,14 +2287,7 @@ def _wrapper(npa, **kwargs): axis = np.arange(-1, -1 * len(dim) - 1, -1) - if Version(np.__version__) >= Version("1.22.0"): - kwargs = {"q": q, "axis": axis, "method": method} - else: - if method not in ("linear", "lower", "higher", "midpoint", "nearest"): - raise ValueError( - f"Interpolation method '{method}' requires numpy >= 1.22 or is not supported." 
- ) - kwargs = {"q": q, "axis": axis, "interpolation": method} + kwargs = {"q": q, "axis": axis, "method": method} result = apply_ufunc( _wrapper, @@ -2409,7 +2421,7 @@ def rolling_window( """ if fill_value is dtypes.NA: # np.nan is passed dtype, fill_value = dtypes.maybe_promote(self.dtype) - var = self.astype(dtype, copy=False) + var = duck_array_ops.astype(self, dtype, copy=False) else: dtype = self.dtype var = self @@ -2453,7 +2465,7 @@ def rolling_window( pads[d] = (win - 1, 0) padded = var.pad(pads, mode="constant", constant_values=fill_value) - axis = [self.get_axis_num(d) for d in dim] + axis = tuple(self.get_axis_num(d) for d in dim) new_dims = self.dims + tuple(window_dim) return Variable( new_dims, @@ -2903,7 +2915,15 @@ def values(self, values): f"Please use DataArray.assign_coords, Dataset.assign_coords or Dataset.assign as appropriate." ) - def chunk(self, chunks={}, name=None, lock=False, inline_array=False): + def chunk( + self, + chunks={}, + name=None, + lock=False, + inline_array=False, + chunked_array_type=None, + from_array_kwargs=None, + ): # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk() return self.copy(deep=False) @@ -3007,9 +3027,7 @@ def copy(self, deep: bool = True, data: ArrayLike | None = None): ndata = as_compatible_data(data) if self.shape != ndata.shape: raise ValueError( - "Data shape {} must match shape of object {}".format( - ndata.shape, self.shape - ) + f"Data shape {ndata.shape} must match shape of object {self.shape}" ) attrs = copy.deepcopy(self._attrs) if deep else copy.copy(self._attrs) @@ -3099,10 +3117,6 @@ def _inplace_binary_op(self, other, f): ) -# for backwards compatibility -Coordinate = utils.alias(IndexVariable, "Coordinate") - - def _unified_dims(variables): # validate dimensions all_dims = {} diff --git a/xarray/core/weighted.py b/xarray/core/weighted.py index 904c6a4d980..e21091fad6b 100644 --- a/xarray/core/weighted.py +++ b/xarray/core/weighted.py @@ -238,7 +238,10 @@ def _sum_of_weights(self, da: DataArray, dim: Dims = None) -> DataArray: # (and not 2); GH4074 if self.weights.dtype == bool: sum_of_weights = self._reduce( - mask, self.weights.astype(int), dim=dim, skipna=False + mask, + duck_array_ops.astype(self.weights, dtype=int), + dim=dim, + skipna=False, ) else: sum_of_weights = self._reduce(mask, self.weights, dim=dim, skipna=False) diff --git a/xarray/plot/dataarray_plot.py b/xarray/plot/dataarray_plot.py index 7f11ddac0a6..d2c0a8e2af6 100644 --- a/xarray/plot/dataarray_plot.py +++ b/xarray/plot/dataarray_plot.py @@ -733,15 +733,6 @@ def _plot1d(plotfunc): If specified plot 3D and use this coordinate for *z* axis. hue : Hashable or None, optional Dimension or coordinate for which you want multiple lines plotted. - hue_style: {'discrete', 'continuous'} or None, optional - How to use the ``hue`` variable: - - - ``'continuous'`` -- continuous color scale - (default for numeric ``hue`` variables) - - ``'discrete'`` -- a color for each unique value, - using the default color cycle - (default for non-numeric ``hue`` variables) - markersize: Hashable or None, optional scatter only. Variable by which to vary size of scattered points. linewidth: Hashable or None, optional @@ -935,6 +926,19 @@ def newplotfunc( warnings.warn(msg, DeprecationWarning, stacklevel=2) del args + if hue_style is not None: + # TODO: Not used since 2022.10. Deprecated since 2023.07. + warnings.warn( + ( + "hue_style is no longer used for plot1d plots " + "and the argument will eventually be removed. 
" + "Convert numbers to string for a discrete hue " + "and use add_legend or add_colorbar to control which guide to display." + ), + DeprecationWarning, + stacklevel=2, + ) + _is_facetgrid = kwargs.pop("_is_facetgrid", False) if plotfunc.__name__ == "scatter": diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index e807081f838..2c58fe83cef 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -1438,6 +1438,16 @@ def data_is_numeric(self) -> bool: >>> a = xr.DataArray([0.5, 0, 0, 0.5, 2, 3]) >>> _Normalize(a).data_is_numeric True + + >>> # TODO: Datetime should be numeric right? + >>> a = xr.DataArray(pd.date_range("2000-1-1", periods=4)) + >>> _Normalize(a).data_is_numeric + False + + # TODO: Timedelta should be numeric right? + >>> a = xr.DataArray(pd.timedelta_range("-1D", periods=4, freq="D")) + >>> _Normalize(a).data_is_numeric + True """ return self._data_is_numeric diff --git a/xarray/testing.py b/xarray/testing.py index b6a88135ee1..6a8bb04f170 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -8,6 +8,7 @@ import pandas as pd from xarray.core import duck_array_ops, formatting, utils +from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.indexes import Index, PandasIndex, PandasMultiIndex, default_indexes @@ -68,9 +69,9 @@ def assert_equal(a, b): Parameters ---------- - a : xarray.Dataset, xarray.DataArray or xarray.Variable + a : xarray.Dataset, xarray.DataArray, xarray.Variable or xarray.Coordinates The first object to compare. - b : xarray.Dataset, xarray.DataArray or xarray.Variable + b : xarray.Dataset, xarray.DataArray, xarray.Variable or xarray.Coordinates The second object to compare. See Also @@ -79,11 +80,15 @@ def assert_equal(a, b): numpy.testing.assert_array_equal """ __tracebackhide__ = True - assert type(a) == type(b) + assert ( + type(a) == type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) + ) if isinstance(a, (Variable, DataArray)): assert a.equals(b), formatting.diff_array_repr(a, b, "equals") elif isinstance(a, Dataset): assert a.equals(b), formatting.diff_dataset_repr(a, b, "equals") + elif isinstance(a, Coordinates): + assert a.equals(b), formatting.diff_coords_repr(a, b, "equals") else: raise TypeError(f"{type(a)} not supported by assertion comparison") @@ -97,9 +102,9 @@ def assert_identical(a, b): Parameters ---------- - a : xarray.Dataset, xarray.DataArray or xarray.Variable + a : xarray.Dataset, xarray.DataArray, xarray.Variable or xarray.Coordinates The first object to compare. - b : xarray.Dataset, xarray.DataArray or xarray.Variable + b : xarray.Dataset, xarray.DataArray, xarray.Variable or xarray.Coordinates The second object to compare. 
See Also @@ -107,7 +112,9 @@ def assert_identical(a, b): assert_equal, assert_allclose, Dataset.equals, DataArray.equals """ __tracebackhide__ = True - assert type(a) == type(b) + assert ( + type(a) == type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) + ) if isinstance(a, Variable): assert a.identical(b), formatting.diff_array_repr(a, b, "identical") elif isinstance(a, DataArray): @@ -115,6 +122,8 @@ def assert_identical(a, b): assert a.identical(b), formatting.diff_array_repr(a, b, "identical") elif isinstance(a, (Dataset, Variable)): assert a.identical(b), formatting.diff_dataset_repr(a, b, "identical") + elif isinstance(a, Coordinates): + assert a.identical(b), formatting.diff_coords_repr(a, b, "identical") else: raise TypeError(f"{type(a)} not supported by assertion comparison") @@ -364,14 +373,13 @@ def _assert_dataset_invariants(ds: Dataset, check_default_indexes: bool): assert all( ds._dims[k] == v.sizes[k] for v in ds._variables.values() for k in v.sizes ), (ds._dims, {k: v.sizes for k, v in ds._variables.items()}) - assert all( - isinstance(v, IndexVariable) - for (k, v) in ds._variables.items() - if v.dims == (k,) - ), {k: type(v) for k, v in ds._variables.items() if v.dims == (k,)} - assert all(v.dims == (k,) for (k, v) in ds._variables.items() if k in ds._dims), { - k: v.dims for k, v in ds._variables.items() if k in ds._dims - } + + if check_default_indexes: + assert all( + isinstance(v, IndexVariable) + for (k, v) in ds._variables.items() + if v.dims == (k,) + ), {k: type(v) for k, v in ds._variables.items() if v.dims == (k,)} if ds._indexes is not None: _assert_indexes_invariants_checks( @@ -401,9 +409,11 @@ def _assert_internal_invariants( _assert_dataset_invariants( xarray_obj, check_default_indexes=check_default_indexes ) + elif isinstance(xarray_obj, Coordinates): + _assert_dataset_invariants( + xarray_obj.to_dataset(), check_default_indexes=check_default_indexes + ) else: raise TypeError( - "{} is not a supported type for xarray invariant checks".format( - type(xarray_obj) - ) + f"{type(xarray_obj)} is not a supported type for xarray invariant checks" ) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7d58c5bfed2..d54e1004f08 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -16,6 +16,7 @@ from collections.abc import Iterator from contextlib import ExitStack from io import BytesIO +from os import listdir from pathlib import Path from typing import TYPE_CHECKING, Any, Final, cast @@ -46,6 +47,7 @@ ) from xarray.backends.pydap_ import PydapDataStore from xarray.backends.scipy_ import ScipyBackendEntrypoint +from xarray.coding.strings import check_vlen_dtype, create_vlen_dtype from xarray.coding.variables import SerializationWarning from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing @@ -859,6 +861,20 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None: with self.roundtrip(original) as actual: assert_identical(expected, actual) + def test_roundtrip_empty_vlen_string_array(self) -> None: + # checks preserving vlen dtype for empty arrays GH7862 + dtype = create_vlen_dtype(str) + original = Dataset({"a": np.array([], dtype=dtype)}) + assert check_vlen_dtype(original["a"].dtype) == str + with self.roundtrip(original) as actual: + assert_identical(original, actual) + assert object == actual["a"].dtype + assert actual["a"].dtype == original["a"].dtype + # only check metadata for capable backends + # eg. 
NETCDF3 based backends do not roundtrip metadata + if actual["a"].dtype.metadata is not None: + assert check_vlen_dtype(actual["a"].dtype) == str + @pytest.mark.parametrize( "decoded_fn, encoded_fn", [ @@ -1596,6 +1612,20 @@ def test_encoding_unlimited_dims(self) -> None: assert actual.encoding["unlimited_dims"] == set("y") assert_equal(ds, actual) + def test_raise_on_forward_slashes_in_names(self) -> None: + # test for forward slash in variable names and dimensions + # see GH 7943 + data_vars: list[dict[str, Any]] = [ + {"PASS/FAIL": (["PASSFAIL"], np.array([0]))}, + {"PASS/FAIL": np.array([0])}, + {"PASSFAIL": (["PASS/FAIL"], np.array([0]))}, + ] + for dv in data_vars: + ds = Dataset(data_vars=dv) + with pytest.raises(ValueError, match="Forward slashes '/' are not allowed"): + with self.roundtrip(ds): + pass + @requires_netCDF4 class TestNetCDF4Data(NetCDF4Base): @@ -1834,6 +1864,8 @@ def test_with_chunkstore(self) -> None: with self.create_zarr_target() as store_target, self.create_zarr_target() as chunk_store: save_kwargs = {"chunk_store": chunk_store} self.save(expected, store_target, **save_kwargs) + # the chunk store must have been populated with some entries + assert len(chunk_store) > 0 open_kwargs = {"backend_kwargs": {"chunk_store": chunk_store}} with self.open(store_target, **open_kwargs) as ds: assert_equal(ds, expected) @@ -2584,10 +2616,10 @@ def test_write_read_select_write(self) -> None: ds.to_zarr(initial_store, mode="w", **self.version_kwargs) ds1 = xr.open_zarr(initial_store, **self.version_kwargs) - # Combination of where+squeeze triggers error on write. - ds_sel = ds1.where(ds1.coords["dim3"] == "a", drop=True).squeeze("dim3") - with self.create_zarr_target() as final_store: - ds_sel.to_zarr(final_store, mode="w", **self.version_kwargs) + # Combination of where+squeeze triggers error on write. 
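The vlen round-trip test above relies on the dtype metadata surviving array creation; a standalone sketch of the helpers involved:

import numpy as np
from xarray.coding.strings import check_vlen_dtype, create_vlen_dtype

dtype = create_vlen_dtype(str)
arr = np.array([], dtype=dtype)
# the vlen marker lives in dtype.metadata and is preserved even for empty arrays
assert check_vlen_dtype(arr.dtype) is str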
+ ds_sel = ds1.where(ds1.coords["dim3"] == "a", drop=True).squeeze("dim3") + with self.create_zarr_target() as final_store: + ds_sel.to_zarr(final_store, mode="w", **self.version_kwargs) @pytest.mark.parametrize("obj", [Dataset(), DataArray(name="foo")]) def test_attributes(self, obj) -> None: @@ -2634,6 +2666,86 @@ def create_store(self): yield group +@requires_zarr +class TestZarrWriteEmpty(TestZarrDirectoryStore): + @contextlib.contextmanager + def temp_dir(self) -> Iterator[tuple[str, str]]: + with tempfile.TemporaryDirectory() as d: + store = os.path.join(d, "test.zarr") + yield d, store + + @contextlib.contextmanager + def roundtrip_dir( + self, + data, + store, + save_kwargs=None, + open_kwargs=None, + allow_cleanup_failure=False, + ) -> Iterator[Dataset]: + if save_kwargs is None: + save_kwargs = {} + if open_kwargs is None: + open_kwargs = {} + + data.to_zarr(store, **save_kwargs, **self.version_kwargs) + with xr.open_dataset( + store, engine="zarr", **open_kwargs, **self.version_kwargs + ) as ds: + yield ds + + @pytest.mark.parametrize("write_empty", [True, False]) + def test_write_empty(self, write_empty: bool) -> None: + if not write_empty: + expected = ["0.1.0", "1.1.0"] + else: + expected = [ + "0.0.0", + "0.0.1", + "0.1.0", + "0.1.1", + "1.0.0", + "1.0.1", + "1.1.0", + "1.1.1", + ] + + ds = xr.Dataset( + data_vars={ + "test": ( + ("Z", "Y", "X"), + np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2)), + ) + } + ) + + if has_dask: + ds["test"] = ds["test"].chunk((1, 1, 1)) + encoding = None + else: + encoding = {"test": {"chunks": (1, 1, 1)}} + + with self.temp_dir() as (d, store): + ds.to_zarr( + store, + mode="w", + encoding=encoding, + write_empty_chunks=write_empty, + ) + + with self.roundtrip_dir( + ds, + store, + {"mode": "a", "append_dim": "Z", "write_empty_chunks": write_empty}, + ) as a_ds: + expected_ds = xr.concat([ds, ds], dim="Z") + + assert_identical(a_ds, expected_ds) + + ls = listdir(os.path.join(store, "test")) + assert set(expected) == set([file for file in ls if file[0] != "."]) + + class ZarrBaseV3(ZarrBase): zarr_version = 3 diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index a676b1f07f1..f58a6490632 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd import pytest -from packaging.version import Version import xarray as xr from xarray.coding.cftimeindex import ( @@ -33,12 +32,7 @@ # cftime 1.5.2 renames "gregorian" to "standard" standard_or_gregorian = "" if has_cftime: - import cftime - - if Version(cftime.__version__) >= Version("1.5.2"): - standard_or_gregorian = "standard" - else: - standard_or_gregorian = "gregorian" + standard_or_gregorian = "standard" def date_dict(year=None, month=None, day=None, hour=None, minute=None, second=None): diff --git a/xarray/tests/test_coding_strings.py b/xarray/tests/test_coding_strings.py index cb9595f4a64..f1eca00f9a1 100644 --- a/xarray/tests/test_coding_strings.py +++ b/xarray/tests/test_coding_strings.py @@ -32,6 +32,10 @@ def test_vlen_dtype() -> None: assert strings.is_bytes_dtype(dtype) assert strings.check_vlen_dtype(dtype) is bytes + # check h5py variant ("vlen") + dtype = np.dtype("O", metadata={"vlen": str}) # type: ignore[call-overload,unused-ignore] + assert strings.check_vlen_dtype(dtype) is str + assert strings.check_vlen_dtype(np.dtype(object)) is None diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index acdf9c8846e..4dae7809be9 100644 --- 
a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -129,9 +129,9 @@ def test_multidimensional_coordinates(self) -> None: foo1_coords = enc["foo1"].attrs.get("coordinates", "") foo2_coords = enc["foo2"].attrs.get("coordinates", "") foo3_coords = enc["foo3"].attrs.get("coordinates", "") - assert set(foo1_coords.split()) == {"lat1", "lon1"} - assert set(foo2_coords.split()) == {"lat2", "lon2"} - assert set(foo3_coords.split()) == {"lat3", "lon3"} + assert foo1_coords == "lon1 lat1" + assert foo2_coords == "lon2 lat2" + assert foo3_coords == "lon3 lat3" # Should not have any global coordinates. assert "coordinates" not in attrs @@ -150,11 +150,12 @@ def test_var_with_coord_attr(self) -> None: enc, attrs = conventions.encode_dataset_coordinates(orig) # Make sure we have the right coordinates for each variable. values_coords = enc["values"].attrs.get("coordinates", "") - assert set(values_coords.split()) == {"time", "lat", "lon"} + assert values_coords == "time lon lat" # Should not have any global coordinates. assert "coordinates" not in attrs def test_do_not_overwrite_user_coordinates(self) -> None: + # don't overwrite user-defined "coordinates" encoding orig = Dataset( coords={"x": [0, 1, 2], "y": ("x", [5, 6, 7]), "z": ("x", [8, 9, 10])}, data_vars={"a": ("x", [1, 2, 3]), "b": ("x", [3, 5, 6])}, @@ -168,6 +169,18 @@ def test_do_not_overwrite_user_coordinates(self) -> None: with pytest.raises(ValueError, match=r"'coordinates' found in both attrs"): conventions.encode_dataset_coordinates(orig) + def test_deterministic_coords_encoding(self) -> None: + # the coordinates attribute is sorted when set by xarray.conventions ... + # ... on a variable's coordinates attribute + ds = Dataset({"foo": 0}, coords={"baz": 0, "bar": 0}) + vars, attrs = conventions.encode_dataset_coordinates(ds) + assert vars["foo"].attrs["coordinates"] == "bar baz" + assert attrs.get("coordinates") is None + # ... 
on the global coordinates attribute + ds = ds.drop_vars("foo") + vars, attrs = conventions.encode_dataset_coordinates(ds) + assert attrs["coordinates"] == "bar baz" + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_emit_coordinates_attribute_in_attrs(self) -> None: orig = Dataset( @@ -487,3 +500,18 @@ def test_decode_cf_error_includes_variable_name(): ds = Dataset({"invalid": ([], 1e36, {"units": "days since 2000-01-01"})}) with pytest.raises(ValueError, match="Failed to decode variable 'invalid'"): decode_cf(ds) + + +def test_encode_cf_variable_with_vlen_dtype() -> None: + v = Variable( + ["x"], np.array(["a", "b"], dtype=coding.strings.create_vlen_dtype(str)) + ) + encoded_v = conventions.encode_cf_variable(v) + assert encoded_v.data.dtype.kind == "O" + assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str + + # empty array + v = Variable(["x"], np.array([], dtype=coding.strings.create_vlen_dtype(str))) + encoded_v = conventions.encode_cf_variable(v) + assert encoded_v.data.dtype.kind == "O" + assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str diff --git a/xarray/tests/test_coordinates.py b/xarray/tests/test_coordinates.py new file mode 100644 index 00000000000..bf68a5c1838 --- /dev/null +++ b/xarray/tests/test_coordinates.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +import pandas as pd +import pytest + +from xarray.core.alignment import align +from xarray.core.coordinates import Coordinates +from xarray.core.dataarray import DataArray +from xarray.core.dataset import Dataset +from xarray.core.indexes import PandasIndex, PandasMultiIndex +from xarray.tests import assert_identical, source_ndarray + + +class TestCoordinates: + def test_init_noindex(self) -> None: + coords = Coordinates(coords={"foo": ("x", [0, 1, 2])}) + expected = Dataset(coords={"foo": ("x", [0, 1, 2])}) + assert_identical(coords.to_dataset(), expected) + + def test_init_from_coords(self) -> None: + expected = Dataset(coords={"foo": ("x", [0, 1, 2])}) + coords = Coordinates(coords=expected.coords) + assert_identical(coords.to_dataset(), expected) + + # test variables copied + assert coords.variables["foo"] is not expected.variables["foo"] + + # default index + expected = Dataset(coords={"x": ("x", [0, 1, 2])}) + coords = Coordinates(coords=expected.coords, indexes=expected.xindexes) + assert_identical(coords.to_dataset(), expected) + + def test_init_empty(self) -> None: + coords = Coordinates() + assert len(coords) == 0 + + def test_init_index_error(self) -> None: + idx = PandasIndex([1, 2, 3], "x") + with pytest.raises(ValueError, match="no coordinate variables found"): + Coordinates(indexes={"x": idx}) + + with pytest.raises(TypeError, match=".* is not an `xarray.indexes.Index`"): + Coordinates(coords={"x": ("x", [1, 2, 3])}, indexes={"x": "not_an_xarray_index"}) # type: ignore + + def test_init_dim_sizes_conflict(self) -> None: + with pytest.raises(ValueError): + Coordinates(coords={"foo": ("x", [1, 2]), "bar": ("x", [1, 2, 3, 4])}) + + def test_from_pandas_multiindex(self) -> None: + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) + coords = Coordinates.from_pandas_multiindex(midx, "x") + + assert isinstance(coords.xindexes["x"], PandasMultiIndex) + assert coords.xindexes["x"].index.equals(midx) + assert coords.xindexes["x"].dim == "x" + + expected = PandasMultiIndex(midx, "x").create_variables() + assert list(coords.variables) == list(expected) + for name in ("x", "one", "two"): + assert_identical(expected[name], 
coords.variables[name]) + + def test_dims(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + assert coords.dims == {"x": 3} + + def test_sizes(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + assert coords.sizes == {"x": 3} + + def test_dtypes(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + assert coords.dtypes == {"x": int} + + def test_getitem(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + assert_identical( + coords["x"], + DataArray([0, 1, 2], coords={"x": [0, 1, 2]}, name="x"), + ) + + def test_delitem(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + del coords["x"] + assert "x" not in coords + + def test_update(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + + coords.update({"y": ("y", [4, 5, 6])}) + assert "y" in coords + assert "y" in coords.xindexes + expected = DataArray([4, 5, 6], coords={"y": [4, 5, 6]}, name="y") + assert_identical(coords["y"], expected) + + def test_equals(self): + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + + assert coords.equals(coords) + assert not coords.equals("no_a_coords") + + def test_identical(self): + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + + assert coords.identical(coords) + assert not coords.identical("no_a_coords") + + def test_copy(self) -> None: + no_index_coords = Coordinates({"foo": ("x", [1, 2, 3])}) + copied = no_index_coords.copy() + assert_identical(no_index_coords, copied) + v0 = no_index_coords.variables["foo"] + v1 = copied.variables["foo"] + assert v0 is not v1 + assert source_ndarray(v0.data) is source_ndarray(v1.data) + + deep_copied = no_index_coords.copy(deep=True) + assert_identical(no_index_coords.to_dataset(), deep_copied.to_dataset()) + v0 = no_index_coords.variables["foo"] + v1 = deep_copied.variables["foo"] + assert v0 is not v1 + assert source_ndarray(v0.data) is not source_ndarray(v1.data) + + def test_align(self) -> None: + _ds = Dataset(coords={"x": [0, 1, 2]}) + coords = Coordinates(coords=_ds.coords, indexes=_ds.xindexes) + + left = coords + + # test Coordinates._reindex_callback + right = coords.to_dataset().isel(x=[0, 1]).coords + left2, right2 = align(left, right, join="inner") + assert_identical(left2, right2) + + # test Coordinates._overwrite_indexes + right.update({"x": ("x", [4, 5, 6])}) + left2, right2 = align(left, right, join="override") + assert_identical(left2, left) + assert_identical(left2, right2) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 1171464a962..6e65d52fdb5 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -193,11 +193,9 @@ def test_binary_op_bitshift(self) -> None: def test_repr(self): expected = dedent( - """\ + f"""\ - {!r}""".format( - self.lazy_var.data - ) + {self.lazy_var.data!r}""" ) assert expected == repr(self.lazy_var) @@ -656,14 +654,12 @@ def test_dataarray_repr(self): nonindex_coord = build_dask_array("coord") a = DataArray(data, dims=["x"], coords={"y": ("x", nonindex_coord)}) expected = dedent( - """\ + f"""\ - {!r} + {data!r} Coordinates: y (x) int64 dask.array - Dimensions 
without coordinates: x""".format( - data - ) + Dimensions without coordinates: x""" ) assert expected == repr(a) assert kernel_call_count == 0 # should not evaluate dask array @@ -904,13 +900,12 @@ def test_to_dask_dataframe_dim_order(self): @pytest.mark.parametrize("method", ["load", "compute"]) def test_dask_kwargs_variable(method): - x = Variable("y", da.from_array(np.arange(3), chunks=(2,))) - # args should be passed on to da.Array.compute() - with mock.patch.object( - da.Array, "compute", return_value=np.arange(3) - ) as mock_compute: + chunked_array = da.from_array(np.arange(3), chunks=(2,)) + x = Variable("y", chunked_array) + # args should be passed on to dask.compute() (via DaskManager.compute()) + with mock.patch.object(da, "compute", return_value=(np.arange(3),)) as mock_compute: getattr(x, method)(foo="bar") - mock_compute.assert_called_with(foo="bar") + mock_compute.assert_called_with(chunked_array, foo="bar") @pytest.mark.parametrize("method", ["load", "compute", "persist"]) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 263653e992e..183c0ad7371 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -27,6 +27,7 @@ from xarray.convert import from_cdms2 from xarray.core import dtypes from xarray.core.common import full_like +from xarray.core.coordinates import Coordinates from xarray.core.indexes import Index, PandasIndex, filter_indexes_from_coords from xarray.core.types import QueryEngineOptions, QueryParserOptions from xarray.core.utils import is_scalar @@ -486,6 +487,32 @@ def test_constructor_dask_coords(self) -> None: expected = DataArray(data, coords={"x": ecoord, "y": ecoord}, dims=["x", "y"]) assert_equal(actual, expected) + def test_constructor_no_default_index(self) -> None: + # explicitly passing a Coordinates object skips the creation of default index + da = DataArray(range(3), coords=Coordinates({"x": ("x", [1, 2, 3])})) + assert "x" in da.coords + assert "x" not in da.xindexes + + def test_constructor_multiindex(self) -> None: + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) + coords = Coordinates.from_pandas_multiindex(midx, "x") + + da = DataArray(range(4), coords=coords, dims="x") + assert_identical(da.coords, coords) + + def test_constructor_custom_index(self) -> None: + class CustomIndex(Index): + ... 
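The multi-index constructor test above corresponds to roughly this user-facing pattern (assuming the ``Coordinates`` API from this patch):

import pandas as pd
import xarray as xr
from xarray.core.coordinates import Coordinates

midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two"))
coords = Coordinates.from_pandas_multiindex(midx, "x")
da = xr.DataArray(range(4), coords=coords, dims="x")
# the multi-index and its level coordinates are attached as-is
assert "one" in da.coords and "two" in da.coords
assert da.sel(one="a").sizes["x"] == 2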
+ + coords = Coordinates( + coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} + ) + da = DataArray(range(3), coords=coords) + assert isinstance(da.xindexes["x"], CustomIndex) + + # test coordinate variables copied + assert da.coords["x"] is not coords.variables["x"] + def test_equals_and_identical(self) -> None: orig = DataArray(np.arange(5.0), {"a": 42}, dims="x") @@ -1546,6 +1573,24 @@ def test_assign_coords_existing_multiindex(self) -> None: with pytest.warns(FutureWarning, match=r"Updating MultiIndexed coordinate"): data.assign_coords(x=range(4)) + def test_assign_coords_custom_index(self) -> None: + class CustomIndex(Index): + pass + + coords = Coordinates( + coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} + ) + da = xr.DataArray([0, 1, 2], dims="x") + actual = da.assign_coords(coords) + assert isinstance(actual.xindexes["x"], CustomIndex) + + def test_assign_coords_no_default_index(self) -> None: + coords = Coordinates({"y": ("y", [1, 2, 3])}) + da = DataArray([1, 2, 3], dims="y") + actual = da.assign_coords(coords) + assert_identical(actual.coords, coords, check_default_indexes=False) + assert "y" not in actual.xindexes + def test_coords_alignment(self) -> None: lhs = DataArray([1, 2, 3], [("x", [0, 1, 2])]) rhs = DataArray([2, 3, 4], [("x", [1, 2, 3])]) @@ -1698,6 +1743,19 @@ def test_reindex_str_dtype(self, dtype) -> None: assert_identical(expected, actual) assert actual.dtype == expected.dtype + def test_reindex_empty_array_dtype(self) -> None: + # Dtype of reindex result should match dtype of the original DataArray. + # See GH issue #7299 + x = xr.DataArray([], dims=("x",), coords={"x": []}).astype("float32") + y = x.reindex(x=[1.0, 2.0]) + + assert ( + x.dtype == y.dtype + ), "Dtype of reindexed DataArray should match dtype of the original DataArray" + assert ( + y.dtype == np.float32 + ), "Dtype of reindexed DataArray should remain float32" + def test_rename(self) -> None: da = xr.DataArray( [1, 2, 3], dims="dim", name="name", coords={"coord": ("dim", [5, 6, 7])} @@ -2432,6 +2490,19 @@ def test_unstack_pandas_consistency(self) -> None: actual = DataArray(s, dims="z").unstack("z") assert_identical(expected, actual) + @pytest.mark.filterwarnings("error") + def test_unstack_roundtrip_integer_array(self) -> None: + arr = xr.DataArray( + np.arange(6).reshape(2, 3), + coords={"x": ["a", "b"], "y": [0, 1, 2]}, + dims=["x", "y"], + ) + + stacked = arr.stack(z=["x", "y"]) + roundtripped = stacked.unstack() + + assert_identical(arr, roundtripped) + def test_stack_nonunique_consistency(self, da) -> None: da = da.isel(time=0, drop=True) # 2D actual = da.stack(z=["a", "x"]) @@ -2790,10 +2861,7 @@ def test_quantile_method(self, method) -> None: q = [0.25, 0.5, 0.75] actual = DataArray(self.va).quantile(q, method=method) - if Version(np.__version__) >= Version("1.22.0"): - expected = np.nanquantile(self.dv.values, np.array(q), method=method) - else: - expected = np.nanquantile(self.dv.values, np.array(q), interpolation=method) + expected = np.nanquantile(self.dv.values, np.array(q), method=method) np.testing.assert_allclose(actual.values, expected) @@ -3548,6 +3616,10 @@ def test_to_masked_array(self) -> None: ma = da.to_masked_array() assert len(ma.mask) == N + @pytest.mark.skipif( + Version(np.__version__) > Version("1.24") or sys.version_info[:2] > (3, 10), + reason="cdms2 is unmaintained and does not support newer `numpy` or python versions", + ) def test_to_and_from_cdms2_classic(self) -> None: """Classic with 1D axes""" pytest.importorskip("cdms2") @@ -3565,7 
+3637,8 @@ def test_to_and_from_cdms2_classic(self) -> None: IndexVariable("distance", [-2, 2]), IndexVariable("time", [0, 1, 2]), ] - actual = original.to_cdms2() + with pytest.deprecated_call(match=".*cdms2"): + actual = original.to_cdms2() assert_array_equal(actual.asma(), original) assert actual.id == original.name assert tuple(actual.getAxisIds()) == original.dims @@ -3578,7 +3651,8 @@ def test_to_and_from_cdms2_classic(self) -> None: assert len(component_times) == 3 assert str(component_times[0]) == "2000-1-1 0:0:0.0" - roundtripped = DataArray.from_cdms2(actual) + with pytest.deprecated_call(match=".*cdms2"): + roundtripped = DataArray.from_cdms2(actual) assert_identical(original, roundtripped) back = from_cdms2(actual) @@ -3587,6 +3661,10 @@ def test_to_and_from_cdms2_classic(self) -> None: for coord_name in original.coords.keys(): assert_array_equal(original.coords[coord_name], back.coords[coord_name]) + @pytest.mark.skipif( + Version(np.__version__) > Version("1.24") or sys.version_info[:2] > (3, 10), + reason="cdms2 is unmaintained and does not support newer `numpy` or python versions", + ) def test_to_and_from_cdms2_sgrid(self) -> None: """Curvilinear (structured) grid @@ -3605,7 +3683,8 @@ def test_to_and_from_cdms2_sgrid(self) -> None: coords=dict(x=x, y=y, lon=lon, lat=lat), name="sst", ) - actual = original.to_cdms2() + with pytest.deprecated_call(): + actual = original.to_cdms2() assert tuple(actual.getAxisIds()) == original.dims assert_array_equal(original.coords["lon"], actual.getLongitude().asma()) assert_array_equal(original.coords["lat"], actual.getLatitude().asma()) @@ -3616,6 +3695,10 @@ def test_to_and_from_cdms2_sgrid(self) -> None: assert_array_equal(original.coords["lat"], back.coords["lat"]) assert_array_equal(original.coords["lon"], back.coords["lon"]) + @pytest.mark.skipif( + Version(np.__version__) > Version("1.24") or sys.version_info[:2] > (3, 10), + reason="cdms2 is unmaintained and does not support newer `numpy` or python versions", + ) def test_to_and_from_cdms2_ugrid(self) -> None: """Unstructured grid""" pytest.importorskip("cdms2") @@ -3626,7 +3709,8 @@ def test_to_and_from_cdms2_ugrid(self) -> None: original = DataArray( np.arange(5), dims=["cell"], coords={"lon": lon, "lat": lat, "cell": cell} ) - actual = original.to_cdms2() + with pytest.deprecated_call(match=".*cdms2"): + actual = original.to_cdms2() assert tuple(actual.getAxisIds()) == original.dims assert_array_equal(original.coords["lon"], actual.getLongitude().getValue()) assert_array_equal(original.coords["lat"], actual.getLatitude().getValue()) @@ -4399,7 +4483,7 @@ def exp_decay(t, n0, tau=1): da = da.chunk({"x": 1}) fit = da.curvefit( - coords=[da.t], func=exp_decay, p0={"n0": 4}, bounds={"tau": [2, 6]} + coords=[da.t], func=exp_decay, p0={"n0": 4}, bounds={"tau": (2, 6)} ) assert_allclose(fit.curvefit_coefficients, expected, rtol=1e-3) @@ -4420,12 +4504,183 @@ def exp_decay(t, n0, tau=1): assert param_defaults == {"n0": 4, "tau": 6} assert bounds_defaults == {"n0": (-np.inf, np.inf), "tau": (5, np.inf)} + # DataArray as bound + param_defaults, bounds_defaults = xr.core.dataset._initialize_curvefit_params( + params=params, + p0={"n0": 4}, + bounds={"tau": [DataArray([3, 4], coords=[("x", [1, 2])]), np.inf]}, + func_args=func_args, + ) + assert param_defaults["n0"] == 4 + assert ( + param_defaults["tau"] == xr.DataArray([4, 5], coords=[("x", [1, 2])]) + ).all() + assert bounds_defaults["n0"] == (-np.inf, np.inf) + assert ( + bounds_defaults["tau"][0] == DataArray([3, 4], coords=[("x", [1, 
2])]) + ).all() + assert bounds_defaults["tau"][1] == np.inf + param_names = ["a"] params, func_args = xr.core.dataset._get_func_args(np.power, param_names) assert params == param_names with pytest.raises(ValueError): xr.core.dataset._get_func_args(np.power, []) + @requires_scipy + @pytest.mark.parametrize("use_dask", [True, False]) + def test_curvefit_multidimensional_guess(self, use_dask: bool) -> None: + if use_dask and not has_dask: + pytest.skip("requires dask") + + def sine(t, a, f, p): + return a * np.sin(2 * np.pi * (f * t + p)) + + t = np.arange(0, 2, 0.02) + da = DataArray( + np.stack([sine(t, 1.0, 2, 0), sine(t, 1.0, 2, 0)]), + coords={"x": [0, 1], "t": t}, + ) + + # Fitting to a sine curve produces a different result depending on the + # initial guess: either the phase is zero and the amplitude is positive + # or the phase is 0.5 * 2pi and the amplitude is negative. + + expected = DataArray( + [[1, 2, 0], [-1, 2, 0.5]], + coords={"x": [0, 1], "param": ["a", "f", "p"]}, + ) + + # Different initial guesses for different values of x + a_guess = DataArray([1, -1], coords=[da.x]) + p_guess = DataArray([0, 0.5], coords=[da.x]) + + if use_dask: + da = da.chunk({"x": 1}) + + fit = da.curvefit( + coords=[da.t], + func=sine, + p0={"a": a_guess, "p": p_guess, "f": 2}, + ) + assert_allclose(fit.curvefit_coefficients, expected) + + with pytest.raises( + ValueError, + match=r"Initial guess for 'a' has unexpected dimensions .* should only have " + "dimensions that are in data dimensions", + ): + # initial guess with additional dimensions should be an error + da.curvefit( + coords=[da.t], + func=sine, + p0={"a": DataArray([1, 2], coords={"foo": [1, 2]})}, + ) + + @requires_scipy + @pytest.mark.parametrize("use_dask", [True, False]) + def test_curvefit_multidimensional_bounds(self, use_dask: bool) -> None: + if use_dask and not has_dask: + pytest.skip("requires dask") + + def sine(t, a, f, p): + return a * np.sin(2 * np.pi * (f * t + p)) + + t = np.arange(0, 2, 0.02) + da = xr.DataArray( + np.stack([sine(t, 1.0, 2, 0), sine(t, 1.0, 2, 0)]), + coords={"x": [0, 1], "t": t}, + ) + + # Fit a sine with different bounds: positive amplitude should result in a fit with + # phase 0 and negative amplitude should result in phase 0.5 * 2pi. 
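A condensed, runnable sketch of what the multidimensional guess/bounds tests exercise; it assumes scipy is installed and reuses the parameter values from the tests:

import numpy as np
import xarray as xr

def sine(t, a, f, p):
    return a * np.sin(2 * np.pi * (f * t + p))

t = np.arange(0, 2, 0.02)
da = xr.DataArray(
    np.stack([sine(t, 1.0, 2, 0), sine(t, 1.0, 2, 0)]),
    coords={"x": [0, 1], "t": t},
)

# Initial guesses may vary along the non-fit dimensions: each x gets its own
# guess for amplitude "a" and phase "p", selecting one of the two equivalent fits.
fit = da.curvefit(
    coords=[da.t],
    func=sine,
    p0={
        "a": xr.DataArray([1, -1], coords=[da.x]),
        "p": xr.DataArray([0, 0.5], coords=[da.x]),
        "f": 2,
    },
)

# Bounds accept DataArrays too, or a mix of scalars and DataArrays per parameter.
fit2 = da.curvefit(
    coords=[da.t],
    func=sine,
    p0={"f": 2, "p": 0.25},
    bounds={"a": (-2, xr.DataArray([2, 0], coords=[da.x]))},
)
print(fit.curvefit_coefficients, fit2.curvefit_coefficients)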
+ + expected = DataArray( + [[1, 2, 0], [-1, 2, 0.5]], + coords={"x": [0, 1], "param": ["a", "f", "p"]}, + ) + + if use_dask: + da = da.chunk({"x": 1}) + + fit = da.curvefit( + coords=[da.t], + func=sine, + p0={"f": 2, "p": 0.25}, # this guess is needed to get the expected result + bounds={ + "a": ( + DataArray([0, -2], coords=[da.x]), + DataArray([2, 0], coords=[da.x]), + ), + }, + ) + assert_allclose(fit.curvefit_coefficients, expected) + + # Scalar lower bound with array upper bound + fit2 = da.curvefit( + coords=[da.t], + func=sine, + p0={"f": 2, "p": 0.25}, # this guess is needed to get the expected result + bounds={ + "a": (-2, DataArray([2, 0], coords=[da.x])), + }, + ) + assert_allclose(fit2.curvefit_coefficients, expected) + + with pytest.raises( + ValueError, + match=r"Upper bound for 'a' has unexpected dimensions .* should only have " + "dimensions that are in data dimensions", + ): + # bounds with additional dimensions should be an error + da.curvefit( + coords=[da.t], + func=sine, + bounds={"a": (0, DataArray([1], coords={"foo": [1]}))}, + ) + + @requires_scipy + @pytest.mark.parametrize("use_dask", [True, False]) + def test_curvefit_ignore_errors(self, use_dask: bool) -> None: + if use_dask and not has_dask: + pytest.skip("requires dask") + + # nonsense function to make the optimization fail + def line(x, a, b): + if a > 10: + return 0 + return a * x + b + + da = DataArray( + [[1, 3, 5], [0, 20, 40]], + coords={"i": [1, 2], "x": [0.0, 1.0, 2.0]}, + ) + + if use_dask: + da = da.chunk({"i": 1}) + + expected = DataArray( + [[2, 1], [np.nan, np.nan]], coords={"i": [1, 2], "param": ["a", "b"]} + ) + + with pytest.raises(RuntimeError, match="calls to function has reached maxfev"): + da.curvefit( + coords="x", + func=line, + # limit maximum number of calls so the optimization fails + kwargs=dict(maxfev=5), + ).compute() # have to compute to raise the error + + fit = da.curvefit( + coords="x", + func=line, + errors="ignore", + # limit maximum number of calls so the optimization fails + kwargs=dict(maxfev=5), + ).compute() + + assert_allclose(fit.curvefit_coefficients, expected) + class TestReduce: @pytest.fixture(autouse=True) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index cc9220dfe33..5304c54971a 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -31,10 +31,11 @@ from xarray.coding.cftimeindex import CFTimeIndex from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like -from xarray.core.coordinates import DatasetCoordinates +from xarray.core.coordinates import Coordinates, DatasetCoordinates from xarray.core.indexes import Index, PandasIndex from xarray.core.pycompat import array_type, integer_types from xarray.core.utils import is_scalar +from xarray.testing import _assert_internal_invariants from xarray.tests import ( DuckArrayWrapper, InaccessibleArray, @@ -467,13 +468,16 @@ def test_constructor(self) -> None: with pytest.raises(ValueError, match=r"conflicting sizes"): Dataset({"a": x1, "b": x2}) - with pytest.raises(ValueError, match=r"disallows such variables"): - Dataset({"a": x1, "x": z}) with pytest.raises(TypeError, match=r"tuple of form"): Dataset({"x": (1, 2, 3, 4, 5, 6, 7)}) with pytest.raises(ValueError, match=r"already exists as a scalar"): Dataset({"x": 0, "y": ("x", [1, 2, 3])}) + # nD coordinate variable "x" sharing name with dimension + actual = Dataset({"a": x1, "x": z}) + assert "x" not in actual.xindexes + _assert_internal_invariants(actual, 
check_default_indexes=True) + # verify handling of DataArrays expected = Dataset({"x": x1, "z": z}) actual = Dataset({"z": expected["z"]}) @@ -630,6 +634,37 @@ def test_constructor_with_coords(self) -> None: Dataset({}, {"x": mindex, "y": mindex}) Dataset({}, {"x": mindex, "level_1": range(4)}) + def test_constructor_no_default_index(self) -> None: + # explicitly passing a Coordinates object skips the creation of default index + ds = Dataset(coords=Coordinates({"x": ("x", [1, 2, 3])})) + assert "x" in ds + assert "x" not in ds.xindexes + + def test_constructor_multiindex(self) -> None: + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("one", "two")) + coords = Coordinates.from_pandas_multiindex(midx, "x") + + ds = Dataset(coords=coords) + assert_identical(ds, coords.to_dataset()) + + with pytest.warns( + FutureWarning, match=".*`pandas.MultiIndex` via data variable.*" + ): + Dataset(data_vars={"x": midx}) + + def test_constructor_custom_index(self) -> None: + class CustomIndex(Index): + ... + + coords = Coordinates( + coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} + ) + ds = Dataset(coords=coords) + assert isinstance(ds.xindexes["x"], CustomIndex) + + # test coordinate variables copied + assert ds.variables["x"] is not coords.variables["x"] + def test_properties(self) -> None: ds = create_test_data() @@ -4251,6 +4286,25 @@ class CustomIndex(PandasIndex): actual = ds.assign_coords(y=[4, 5, 6]) assert isinstance(actual.xindexes["x"], CustomIndex) + def test_assign_coords_custom_index(self) -> None: + class CustomIndex(Index): + pass + + coords = Coordinates( + coords={"x": ("x", [1, 2, 3])}, indexes={"x": CustomIndex()} + ) + ds = Dataset() + actual = ds.assign_coords(coords) + assert isinstance(actual.xindexes["x"], CustomIndex) + + def test_assign_coords_no_default_index(self) -> None: + coords = Coordinates({"y": ("y", [1, 2, 3])}) + ds = Dataset() + actual = ds.assign_coords(coords) + expected = coords.to_dataset() + assert_identical(expected, actual, check_default_indexes=False) + assert "y" not in actual.xindexes + def test_merge_multiindex_level(self) -> None: data = create_test_multiindex() @@ -6197,6 +6251,13 @@ def test_ipython_key_completion(self) -> None: ds["var3"].coords[item] # should not raise assert sorted(actual) == sorted(expected) + coords = Coordinates(ds.coords) + actual = coords._ipython_key_completions_() + expected = ["time", "dim2", "dim3", "numbers"] + for item in actual: + coords[item] # should not raise + assert sorted(actual) == sorted(expected) + # data_vars actual = ds.data_vars._ipython_key_completions_() expected = ["var1", "var2", "var3", "dim1"] @@ -6217,6 +6278,14 @@ def test_polyfit_output(self) -> None: out = ds.polyfit("time", 2) assert len(out.data_vars) == 0 + def test_polyfit_weighted(self) -> None: + # Make sure weighted polyfit does not change the original object (issue #5644) + ds = create_test_data(seed=1) + ds_copy = ds.copy(deep=True) + + ds.polyfit("dim2", 2, w=np.arange(ds.sizes["dim2"])) + xr.testing.assert_identical(ds, ds_copy) + def test_polyfit_warnings(self) -> None: ds = create_test_data(seed=1) diff --git a/xarray/tests/test_distributed.py b/xarray/tests/test_distributed.py index a57f8728ef0..fb917dfb254 100644 --- a/xarray/tests/test_distributed.py +++ b/xarray/tests/test_distributed.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from packaging.version import Version if TYPE_CHECKING: import dask @@ -194,10 +193,6 @@ def test_dask_distributed_zarr_integration_test( assert_allclose(original, 
computed) -@pytest.mark.xfail( - condition=Version(distributed.__version__) < Version("2022.02.0"), - reason="https://github.com/dask/distributed/pull/5739", -) @gen_cluster(client=True) async def test_async(c, s, a, b) -> None: x = create_test_data() @@ -230,10 +225,6 @@ def test_hdf5_lock() -> None: assert isinstance(HDF5_LOCK, dask.utils.SerializableLock) -@pytest.mark.xfail( - condition=Version(distributed.__version__) < Version("2022.02.0"), - reason="https://github.com/dask/distributed/pull/5739", -) @gen_cluster(client=True) async def test_serializable_locks(c, s, a, b) -> None: def f(x, lock=None): diff --git a/xarray/tests/test_dtypes.py b/xarray/tests/test_dtypes.py index 1c942a1e6c8..490520c8f54 100644 --- a/xarray/tests/test_dtypes.py +++ b/xarray/tests/test_dtypes.py @@ -18,17 +18,17 @@ ([np.bytes_, np.unicode_], np.object_), ], ) -def test_result_type(args, expected): +def test_result_type(args, expected) -> None: actual = dtypes.result_type(*args) assert actual == expected -def test_result_type_scalar(): +def test_result_type_scalar() -> None: actual = dtypes.result_type(np.arange(3, dtype=np.float32), np.nan) assert actual == np.float32 -def test_result_type_dask_array(): +def test_result_type_dask_array() -> None: # verify it works without evaluating dask arrays da = pytest.importorskip("dask.array") dask = pytest.importorskip("dask") @@ -50,7 +50,7 @@ def error(): @pytest.mark.parametrize("obj", [1.0, np.inf, "ab", 1.0 + 1.0j, True]) -def test_inf(obj): +def test_inf(obj) -> None: assert dtypes.INF > obj assert dtypes.NINF < obj @@ -85,7 +85,7 @@ def test_inf(obj): ("V", (np.dtype("O"), "nan")), # dtype('V') ], ) -def test_maybe_promote(kind, expected): +def test_maybe_promote(kind, expected) -> None: # 'g': np.float128 is not tested : not available on all platforms # 'G': np.complex256 is not tested : not available on all platforms @@ -94,7 +94,7 @@ def test_maybe_promote(kind, expected): assert str(actual[1]) == expected[1] -def test_nat_types_membership(): +def test_nat_types_membership() -> None: assert np.datetime64("NaT").dtype in dtypes.NAT_TYPES assert np.timedelta64("NaT").dtype in dtypes.NAT_TYPES assert np.float64 not in dtypes.NAT_TYPES diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index bf5f7d0bdc5..7670b77322c 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -218,31 +218,70 @@ def test_attribute_repr(self) -> None: assert "\n" not in newlines assert "\t" not in tabs - def test_index_repr(self): + def test_index_repr(self) -> None: from xarray.core.indexes import Index class CustomIndex(Index): - def __init__(self, names): + names: tuple[str, ...] 
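A short sketch of the reworked summarize_index signature that these formatting tests cover; it now receives the tuple of coordinate names backed by a single index (internal API, call copied from the tests):

from xarray.core import formatting
from xarray.core.indexes import Index

class CustomIndex(Index):
    def __init__(self, names):
        self.names = names

    def __repr__(self):
        return f"CustomIndex(coords={self.names})"

# One summary line per coordinate name; when an index spans several coordinates
# the lines are grouped with the ┌/│/└ markers checked in test_index_repr_grouping.
print(formatting.summarize_index(("x", "y"), CustomIndex(("x", "y")), col_width=20))

# A custom index may also define _repr_inline_(self, max_width) to provide the
# compact single-line form, as IndexWithInlineRepr does in the test below.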
+ + def __init__(self, names: tuple[str, ...]): self.names = names def __repr__(self): return f"CustomIndex(coords={self.names})" - coord_names = ["x", "y"] + coord_names = ("x", "y") index = CustomIndex(coord_names) - name = "x" + names = ("x",) - normal = formatting.summarize_index(name, index, col_width=20) - assert name in normal + normal = formatting.summarize_index(names, index, col_width=20) + assert names[0] in normal + assert len(normal.splitlines()) == len(names) assert "CustomIndex" in normal - CustomIndex._repr_inline_ = ( - lambda self, max_width: f"CustomIndex[{', '.join(self.names)}]" - ) - inline = formatting.summarize_index(name, index, col_width=20) - assert name in inline + class IndexWithInlineRepr(CustomIndex): + def _repr_inline_(self, max_width: int): + return f"CustomIndex[{', '.join(self.names)}]" + + index = IndexWithInlineRepr(coord_names) + inline = formatting.summarize_index(names, index, col_width=20) + assert names[0] in inline assert index._repr_inline_(max_width=40) in inline + @pytest.mark.parametrize( + "names", + ( + ("x",), + ("x", "y"), + ("x", "y", "z"), + ("x", "y", "z", "a"), + ), + ) + def test_index_repr_grouping(self, names) -> None: + from xarray.core.indexes import Index + + class CustomIndex(Index): + def __init__(self, names): + self.names = names + + def __repr__(self): + return f"CustomIndex(coords={self.names})" + + index = CustomIndex(names) + + normal = formatting.summarize_index(names, index, col_width=20) + assert all(name in normal for name in names) + assert len(normal.splitlines()) == len(names) + assert "CustomIndex" in normal + + hint_chars = [line[2] for line in normal.splitlines()] + + if len(names) <= 1: + assert hint_chars == [" "] + else: + assert hint_chars[0] == "┌" and hint_chars[-1] == "└" + assert len(names) == 2 or hint_chars[1:-1] == ["│"] * (len(names) - 2) + def test_diff_array_repr(self) -> None: da_a = xr.DataArray( np.array([[1, 2, 3], [4, 5, 6]], dtype="int64"), diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index a8530d85235..5d99eda1e88 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -1,6 +1,7 @@ from __future__ import annotations import datetime +import operator import warnings import numpy as np @@ -11,12 +12,14 @@ from xarray import DataArray, Dataset, Variable from xarray.core.groupby import _consolidate_slices from xarray.tests import ( + InaccessibleArray, assert_allclose, assert_array_equal, assert_equal, assert_identical, create_test_data, has_cftime, + has_flox, has_pandas_version_two, requires_dask, requires_flox, @@ -135,6 +138,18 @@ def test_groupby_input_mutation() -> None: assert_identical(array, array_copy) # should not modify inputs +@pytest.mark.parametrize("use_flox", [True, False]) +def test_groupby_indexvariable(use_flox: bool) -> None: + # regression test for GH7919 + array = xr.DataArray([1, 2, 3], [("x", [2, 2, 1])]) + iv = xr.IndexVariable(dims="x", data=pd.Index(array.x.values)) + with xr.set_options(use_flox=use_flox): + actual = array.groupby(iv).sum() + actual = array.groupby(iv).sum() + expected = xr.DataArray([3, 3], [("x", [1, 2])]) + assert_identical(expected, actual) + + @pytest.mark.parametrize( "obj", [ @@ -713,7 +728,7 @@ def test_groupby_dataset_iter() -> None: def test_groupby_dataset_errors() -> None: data = create_test_data() with pytest.raises(TypeError, match=r"`group` must be"): - data.groupby(np.arange(10)) + data.groupby(np.arange(10)) # type: ignore with pytest.raises(ValueError, match=r"length does not 
match"): data.groupby(data["dim1"][:3]) with pytest.raises(TypeError, match=r"`group` must be"): @@ -2336,3 +2351,59 @@ def test_groupby_binary_op_regression() -> None: anom_gb = x_slice.groupby("time.month") - clim assert_identical(xr.zeros_like(anom_gb), anom_gb) + + +def test_groupby_multiindex_level() -> None: + # GH6836 + midx = pd.MultiIndex.from_product([list("abc"), [0, 1]], names=("one", "two")) + mda = xr.DataArray(np.random.rand(6, 3), [("x", midx), ("y", range(3))]) + groups = mda.groupby("one").groups + assert groups == {"a": [0, 1], "b": [2, 3], "c": [4, 5]} + + +@requires_flox +@pytest.mark.parametrize("func", ["sum", "prod"]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [None, 1]) +def test_min_count_vs_flox(func: str, min_count: int | None, skipna: bool) -> None: + da = DataArray( + data=np.array([np.nan, 1, 1, np.nan, 1, 1]), + dims="x", + coords={"labels": ("x", np.array([1, 2, 3, 1, 2, 3]))}, + ) + + gb = da.groupby("labels") + method = operator.methodcaller(func, min_count=min_count, skipna=skipna) + with xr.set_options(use_flox=True): + actual = method(gb) + with xr.set_options(use_flox=False): + expected = method(gb) + assert_identical(actual, expected) + + +@pytest.mark.parametrize("use_flox", [True, False]) +def test_min_count_error(use_flox: bool) -> None: + if use_flox and not has_flox: + pytest.skip() + da = DataArray( + data=np.array([np.nan, 1, 1, np.nan, 1, 1]), + dims="x", + coords={"labels": ("x", np.array([1, 2, 3, 1, 2, 3]))}, + ) + with xr.set_options(use_flox=use_flox): + with pytest.raises(TypeError): + da.groupby("labels").mean(min_count=1) + + +@requires_dask +def test_groupby_math_auto_chunk(): + da = xr.DataArray( + [[1, 2, 3], [1, 2, 3], [1, 2, 3]], + dims=("y", "x"), + coords={"label": ("x", [2, 2, 1])}, + ) + sub = xr.DataArray( + InaccessibleArray(np.array([1, 2])), dims="label", coords={"label": [1, 2]} + ) + actual = da.chunk(x=1, y=2).groupby("label") - sub + assert actual.chunksizes == {"x": (1, 1, 1), "y": (2, 1)} diff --git a/xarray/tests/test_indexes.py b/xarray/tests/test_indexes.py index 27b5cf2119c..ebe9f3fb932 100644 --- a/xarray/tests/test_indexes.py +++ b/xarray/tests/test_indexes.py @@ -582,7 +582,12 @@ def indexes( _, variables = indexes_and_vars - return Indexes(indexes, variables) + if isinstance(x_idx, Index): + index_type = Index + else: + index_type = pd.Index + + return Indexes(indexes, variables, index_type=index_type) def test_interface(self, unique_indexes, indexes) -> None: x_idx = unique_indexes[0] diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index 8957f9c829a..63449708a79 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -235,6 +235,13 @@ def test_merge_dicts_dims(self): expected = xr.Dataset({"x": [12], "y": ("x", [13])}) assert_identical(actual, expected) + def test_merge_coordinates(self): + coords1 = xr.Coordinates({"x": ("x", [0, 1, 2])}) + coords2 = xr.Coordinates({"y": ("y", [3, 4, 5])}) + expected = xr.Dataset(coords={"x": [0, 1, 2], "y": [3, 4, 5]}) + actual = xr.merge([coords1, coords2]) + assert_identical(actual, expected) + def test_merge_error(self): ds = xr.Dataset({"x": 0}) with pytest.raises(xr.MergeError): diff --git a/xarray/tests/test_parallelcompat.py b/xarray/tests/test_parallelcompat.py new file mode 100644 index 00000000000..2c3378a2816 --- /dev/null +++ b/xarray/tests/test_parallelcompat.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +from typing import Any + +import numpy as np +import 
pytest + +from xarray.core.daskmanager import DaskManager +from xarray.core.parallelcompat import ( + ChunkManagerEntrypoint, + get_chunked_array_type, + guess_chunkmanager, + list_chunkmanagers, +) +from xarray.core.types import T_Chunks, T_NormalizedChunks +from xarray.tests import has_dask, requires_dask + + +class DummyChunkedArray(np.ndarray): + """ + Mock-up of a chunked array class. + + Adds a (non-functional) .chunks attribute by following this example in the numpy docs + https://numpy.org/doc/stable/user/basics.subclassing.html#simple-example-adding-an-extra-attribute-to-ndarray + """ + + chunks: T_NormalizedChunks + + def __new__( + cls, + shape, + dtype=float, + buffer=None, + offset=0, + strides=None, + order=None, + chunks=None, + ): + obj = super().__new__(cls, shape, dtype, buffer, offset, strides, order) + obj.chunks = chunks + return obj + + def __array_finalize__(self, obj): + if obj is None: + return + self.chunks = getattr(obj, "chunks", None) + + def rechunk(self, chunks, **kwargs): + copied = self.copy() + copied.chunks = chunks + return copied + + +class DummyChunkManager(ChunkManagerEntrypoint): + """Mock-up of ChunkManager class for DummyChunkedArray""" + + def __init__(self): + self.array_cls = DummyChunkedArray + + def is_chunked_array(self, data: Any) -> bool: + return isinstance(data, DummyChunkedArray) + + def chunks(self, data: DummyChunkedArray) -> T_NormalizedChunks: + return data.chunks + + def normalize_chunks( + self, + chunks: T_Chunks | T_NormalizedChunks, + shape: tuple[int, ...] | None = None, + limit: int | None = None, + dtype: np.dtype | None = None, + previous_chunks: T_NormalizedChunks | None = None, + ) -> T_NormalizedChunks: + from dask.array.core import normalize_chunks + + return normalize_chunks(chunks, shape, limit, dtype, previous_chunks) + + def from_array( + self, data: np.ndarray, chunks: T_Chunks, **kwargs + ) -> DummyChunkedArray: + from dask import array as da + + return da.from_array(data, chunks, **kwargs) + + def rechunk(self, data: DummyChunkedArray, chunks, **kwargs) -> DummyChunkedArray: + return data.rechunk(chunks, **kwargs) + + def compute(self, *data: DummyChunkedArray, **kwargs) -> tuple[np.ndarray, ...]: + from dask.array import compute + + return compute(*data, **kwargs) + + def apply_gufunc( + self, + func, + signature, + *args, + axes=None, + axis=None, + keepdims=False, + output_dtypes=None, + output_sizes=None, + vectorize=None, + allow_rechunk=False, + meta=None, + **kwargs, + ): + from dask.array.gufunc import apply_gufunc + + return apply_gufunc( + func, + signature, + *args, + axes=axes, + axis=axis, + keepdims=keepdims, + output_dtypes=output_dtypes, + output_sizes=output_sizes, + vectorize=vectorize, + allow_rechunk=allow_rechunk, + meta=meta, + **kwargs, + ) + + +@pytest.fixture +def register_dummy_chunkmanager(monkeypatch): + """ + Mocks the registering of an additional ChunkManagerEntrypoint. + + This preserves the presence of the existing DaskManager, so a test that relies on this and DaskManager both being + returned from list_chunkmanagers() at once would still work. + + The monkeypatching changes the behavior of list_chunkmanagers when called inside xarray.core.parallelcompat, + but not when called from this tests file. 
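For orientation, a small sketch of the two helpers these tests target, guess_chunkmanager and get_chunked_array_type; it assumes dask is installed so the built-in DaskManager is registered:

import numpy as np
import dask.array as da

from xarray.core.daskmanager import DaskManager
from xarray.core.parallelcompat import get_chunked_array_type, guess_chunkmanager

# With dask installed, the default chunk manager resolves to the built-in DaskManager.
assert isinstance(guess_chunkmanager(None), DaskManager)

# get_chunked_array_type() ignores in-memory arrays and scalars and returns the
# ChunkManagerEntrypoint that recognises the chunked arguments.
dask_arr = da.from_array([1, 2, 3], chunks=(1,))
assert isinstance(get_chunked_array_type(dask_arr, 1.0, np.array([5, 6])), DaskManager)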
+ """ + # Should include DaskManager iff dask is available to be imported + preregistered_chunkmanagers = list_chunkmanagers() + + monkeypatch.setattr( + "xarray.core.parallelcompat.list_chunkmanagers", + lambda: {"dummy": DummyChunkManager()} | preregistered_chunkmanagers, + ) + yield + + +class TestGetChunkManager: + def test_get_chunkmanger(self, register_dummy_chunkmanager) -> None: + chunkmanager = guess_chunkmanager("dummy") + assert isinstance(chunkmanager, DummyChunkManager) + + def test_fail_on_nonexistent_chunkmanager(self) -> None: + with pytest.raises(ValueError, match="unrecognized chunk manager foo"): + guess_chunkmanager("foo") + + @requires_dask + def test_get_dask_if_installed(self) -> None: + chunkmanager = guess_chunkmanager(None) + assert isinstance(chunkmanager, DaskManager) + + @pytest.mark.skipif(has_dask, reason="requires dask not to be installed") + def test_dont_get_dask_if_not_installed(self) -> None: + with pytest.raises(ValueError, match="unrecognized chunk manager dask"): + guess_chunkmanager("dask") + + @requires_dask + def test_choose_dask_over_other_chunkmanagers( + self, register_dummy_chunkmanager + ) -> None: + chunk_manager = guess_chunkmanager(None) + assert isinstance(chunk_manager, DaskManager) + + +class TestGetChunkedArrayType: + def test_detect_chunked_arrays(self, register_dummy_chunkmanager) -> None: + dummy_arr = DummyChunkedArray([1, 2, 3]) + + chunk_manager = get_chunked_array_type(dummy_arr) + assert isinstance(chunk_manager, DummyChunkManager) + + def test_ignore_inmemory_arrays(self, register_dummy_chunkmanager) -> None: + dummy_arr = DummyChunkedArray([1, 2, 3]) + + chunk_manager = get_chunked_array_type(*[dummy_arr, 1.0, np.array([5, 6])]) + assert isinstance(chunk_manager, DummyChunkManager) + + with pytest.raises(TypeError, match="Expected a chunked array"): + get_chunked_array_type(5.0) + + def test_raise_if_no_arrays_chunked(self, register_dummy_chunkmanager) -> None: + with pytest.raises(TypeError, match="Expected a chunked array "): + get_chunked_array_type(*[1.0, np.array([5, 6])]) + + def test_raise_if_no_matching_chunkmanagers(self) -> None: + dummy_arr = DummyChunkedArray([1, 2, 3]) + + with pytest.raises( + TypeError, match="Could not find a Chunk Manager which recognises" + ): + get_chunked_array_type(dummy_arr) + + @requires_dask + def test_detect_dask_if_installed(self) -> None: + import dask.array as da + + dask_arr = da.from_array([1, 2, 3], chunks=(1,)) + + chunk_manager = get_chunked_array_type(dask_arr) + assert isinstance(chunk_manager, DaskManager) + + @requires_dask + def test_raise_on_mixed_array_types(self, register_dummy_chunkmanager) -> None: + import dask.array as da + + dummy_arr = DummyChunkedArray([1, 2, 3]) + dask_arr = da.from_array([1, 2, 3], chunks=(1,)) + + with pytest.raises(TypeError, match="received multiple types"): + get_chunked_array_type(*[dask_arr, dummy_arr]) diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 18ca49670ba..8b2dfbdec41 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -2708,23 +2708,32 @@ def test_bad_args( x=x, y=y, hue=hue, add_legend=add_legend, add_colorbar=add_colorbar ) - @pytest.mark.xfail(reason="datetime,timedelta hue variable not supported.") - @pytest.mark.parametrize("hue_style", ["discrete", "continuous"]) - def test_datetime_hue(self, hue_style: Literal["discrete", "continuous"]) -> None: + def test_datetime_hue(self) -> None: ds2 = self.ds.copy() + + # TODO: Currently plots as categorical, should it behave as numerical? 
ds2["hue"] = pd.date_range("2000-1-1", periods=4) - ds2.plot.scatter(x="A", y="B", hue="hue", hue_style=hue_style) + ds2.plot.scatter(x="A", y="B", hue="hue") ds2["hue"] = pd.timedelta_range("-1D", periods=4, freq="D") - ds2.plot.scatter(x="A", y="B", hue="hue", hue_style=hue_style) + ds2.plot.scatter(x="A", y="B", hue="hue") - @pytest.mark.parametrize("hue_style", ["discrete", "continuous"]) - def test_facetgrid_hue_style( - self, hue_style: Literal["discrete", "continuous"] - ) -> None: - g = self.ds.plot.scatter( - x="A", y="B", row="row", col="col", hue="hue", hue_style=hue_style - ) + def test_facetgrid_hue_style(self) -> None: + ds2 = self.ds.copy() + + # Numbers plots as continous: + g = ds2.plot.scatter(x="A", y="B", row="row", col="col", hue="hue") + assert isinstance(g._mappables[-1], mpl.collections.PathCollection) + + # Datetimes plots as categorical: + # TODO: Currently plots as categorical, should it behave as numerical? + ds2["hue"] = pd.date_range("2000-1-1", periods=4) + g = ds2.plot.scatter(x="A", y="B", row="row", col="col", hue="hue") + assert isinstance(g._mappables[-1], mpl.collections.PathCollection) + + # Strings plots as categorical: + ds2["hue"] = ["a", "a", "b", "b"] + g = ds2.plot.scatter(x="A", y="B", row="row", col="col", hue="hue") assert isinstance(g._mappables[-1], mpl.collections.PathCollection) @pytest.mark.parametrize( diff --git a/xarray/tests/test_plugins.py b/xarray/tests/test_plugins.py index 0882bc1b570..441f16f4dca 100644 --- a/xarray/tests/test_plugins.py +++ b/xarray/tests/test_plugins.py @@ -236,6 +236,7 @@ def test_lazy_import() -> None: "sparse", "cupy", "pint", + "cubed", ] # ensure that none of the above modules has been imported before modules_backup = {} diff --git a/xarray/tests/test_rolling.py b/xarray/tests/test_rolling.py index ddc193712ae..73aebc1b1f0 100644 --- a/xarray/tests/test_rolling.py +++ b/xarray/tests/test_rolling.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd import pytest -from packaging.version import Version import xarray as xr from xarray import DataArray, Dataset, set_options @@ -391,14 +390,6 @@ class TestDataArrayRollingExp: @pytest.mark.parametrize("backend", ["numpy"], indirect=True) @pytest.mark.parametrize("func", ["mean", "sum"]) def test_rolling_exp_runs(self, da, dim, window_type, window, func) -> None: - import numbagg - - if ( - Version(getattr(numbagg, "__version__", "0.1.0")) < Version("0.2.1") - and func == "sum" - ): - pytest.skip("rolling_exp.sum requires numbagg 0.2.1") - da = da.where(da > 0.2) rolling_exp = da.rolling_exp(window_type=window_type, **{dim: window}) @@ -430,14 +421,6 @@ def test_rolling_exp_mean_pandas(self, da, dim, window_type, window) -> None: @pytest.mark.parametrize("backend", ["numpy"], indirect=True) @pytest.mark.parametrize("func", ["mean", "sum"]) def test_rolling_exp_keep_attrs(self, da, func) -> None: - import numbagg - - if ( - Version(getattr(numbagg, "__version__", "0.1.0")) < Version("0.2.1") - and func == "sum" - ): - pytest.skip("rolling_exp.sum requires numbagg 0.2.1") - attrs = {"attrs": "da"} da.attrs = attrs diff --git a/xarray/tests/test_ufuncs.py b/xarray/tests/test_ufuncs.py index 6cd73e9cfb7..6b4c3f38ee9 100644 --- a/xarray/tests/test_ufuncs.py +++ b/xarray/tests/test_ufuncs.py @@ -4,7 +4,7 @@ import pytest import xarray as xr -from xarray.tests import assert_array_equal, mock +from xarray.tests import assert_allclose, assert_array_equal, mock from xarray.tests import assert_identical as assert_identical_ @@ -16,16 +16,16 @@ def assert_identical(a, 
b): assert_array_equal(a, b) -def test_unary(): - args = [ - 0, - np.zeros(2), +@pytest.mark.parametrize( + "a", + [ xr.Variable(["x"], [0, 0]), xr.DataArray([0, 0], dims="x"), xr.Dataset({"y": ("x", [0, 0])}), - ] - for a in args: - assert_identical(a + 1, np.cos(a)) + ], +) +def test_unary(a): + assert_allclose(a + 1, np.cos(a)) def test_binary(): diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 9e872c93c0c..addd7587544 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -2,6 +2,7 @@ import functools import operator +import sys import numpy as np import pandas as pd @@ -1508,6 +1509,10 @@ def test_dot_dataarray(dtype): class TestVariable: + @pytest.mark.skipif( + (sys.version_info >= (3, 11)) and sys.platform.startswith("win"), + reason="fails for some reason on win and 3.11, GH7971", + ) @pytest.mark.parametrize( "func", ( @@ -2339,6 +2344,10 @@ def test_repr(self, func, variant, dtype): # warnings or errors, but does not check the result func(data_array) + @pytest.mark.skipif( + (sys.version_info >= (3, 11)) and sys.platform.startswith("win"), + reason="fails for some reason on win and 3.11, GH7971", + ) @pytest.mark.parametrize( "func", ( @@ -2416,6 +2425,10 @@ def test_aggregation(self, func, dtype): assert_units_equal(expected, actual) assert_allclose(expected, actual) + @pytest.mark.skipif( + (sys.version_info >= (3, 11)) and sys.platform.startswith("win"), + reason="fails for some reason on win and 3.11, GH7971", + ) @pytest.mark.parametrize( "func", ( @@ -4069,6 +4082,10 @@ def test_repr(self, func, variant, dtype): # warnings or errors, but does not check the result func(ds) + @pytest.mark.skipif( + (sys.version_info >= (3, 11)) and sys.platform.startswith("win"), + reason="fails for some reason on win and 3.11, GH7971", + ) @pytest.mark.parametrize( "func", ( @@ -5627,16 +5644,20 @@ def test_merge(self, variant, unit, error, dtype): @requires_dask class TestPintWrappingDask: + @pytest.mark.skipif( + version.parse(pint.__version__) <= version.parse("0.21"), + reason="pint didn't support dask properly before 0.21", + ) def test_duck_array_ops(self): import dask.array d = dask.array.array([1, 2, 3]) - q = pint.Quantity(d, units="m") + q = unit_registry.Quantity(d, units="m") da = xr.DataArray(q, dims="x") actual = da.mean().compute() actual.name = None - expected = xr.DataArray(pint.Quantity(np.array(2.0), units="m")) + expected = xr.DataArray(unit_registry.Quantity(np.array(2.0), units="m")) assert_units_equal(expected, actual) # Don't use isinstance b/c we don't want to allow subclasses through diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index bef5efc15cc..f30cdcf3f73 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -10,9 +10,8 @@ import pandas as pd import pytest import pytz -from packaging.version import Version -from xarray import Coordinate, DataArray, Dataset, IndexVariable, Variable, set_options +from xarray import DataArray, Dataset, IndexVariable, Variable, set_options from xarray.core import dtypes, duck_array_ops, indexing from xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( @@ -1247,8 +1246,10 @@ def test_as_variable(self): expected = Variable(("x", "y"), data) with pytest.raises(ValueError, match=r"without explicit dimension names"): as_variable(data, name="x") - with pytest.raises(ValueError, match=r"has more than 1-dimension"): - as_variable(expected, name="x") + + # name of nD variable matches dimension 
name + actual = as_variable(expected, name="x") + assert_identical(expected, actual) # test datetime, timedelta conversion dt = np.array([datetime(1999, 1, 1) + timedelta(days=x) for x in range(10)]) @@ -1706,6 +1707,15 @@ def test_stack_unstack_consistency(self): actual = v.stack(z=("x", "y")).unstack(z={"x": 2, "y": 2}) assert_identical(actual, v) + @pytest.mark.filterwarnings("error::RuntimeWarning") + def test_unstack_without_missing(self): + v = Variable(["z"], [0, 1, 2, 3]) + expected = Variable(["x", "y"], [[0, 1], [2, 3]]) + + actual = v.unstack(z={"x": 2, "y": 2}) + + assert_identical(actual, expected) + def test_broadcasting_math(self): x = np.random.randn(2, 3) v = Variable(["a", "b"], x) @@ -1830,10 +1840,7 @@ def test_quantile_method(self, method, use_dask) -> None: q = np.array([0.25, 0.5, 0.75]) actual = v.quantile(q, dim="y", method=method) - if Version(np.__version__) >= Version("1.22"): - expected = np.nanquantile(self.d, q, axis=1, method=method) - else: - expected = np.nanquantile(self.d, q, axis=1, interpolation=method) + expected = np.nanquantile(self.d, q, axis=1, method=method) if use_dask: assert isinstance(actual.data, dask_array_type) @@ -2445,11 +2452,6 @@ def test_concat_str_dtype(self, dtype): assert actual.identical(expected) assert np.issubdtype(actual.dtype, dtype) - def test_coordinate_alias(self): - with pytest.warns(Warning, match="deprecated"): - x = Coordinate("x", [1, 2, 3]) - assert isinstance(x, IndexVariable) - def test_datetime64(self): # GH:1932 Make sure indexing keeps precision t = np.array([1518418799999986560, 1518418799999996560], dtype="datetime64[ns]") @@ -2560,6 +2562,19 @@ def test_masked_array(self): assert_array_equal(expected, actual) assert np.dtype(float) == actual.dtype + original = np.ma.MaskedArray([1.0, 2.0], mask=[True, False]) + original.flags.writeable = False + expected = [np.nan, 2.0] + actual = as_compatible_data(original) + assert_array_equal(expected, actual) + assert np.dtype(float) == actual.dtype + + # GH2377 + actual = Variable(dims=tuple(), data=np.ma.masked) + expected = Variable(dims=tuple(), data=np.nan) + assert_array_equal(expected, actual) + assert actual.dtype == expected.dtype + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_datetime(self): expected = np.datetime64("2000-01-01") diff --git a/xarray/util/generate_aggregations.py b/xarray/util/generate_aggregations.py index efc69c46947..312f5722f8e 100644 --- a/xarray/util/generate_aggregations.py +++ b/xarray/util/generate_aggregations.py @@ -22,12 +22,13 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, Sequence +from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, Callable from xarray.core import duck_array_ops from xarray.core.options import OPTIONS from xarray.core.types import Dims -from xarray.core.utils import contains_only_dask_or_numpy, module_available +from xarray.core.utils import contains_only_chunked_or_numpy, module_available if TYPE_CHECKING: from xarray.core.dataarray import DataArray @@ -185,6 +186,15 @@ def {method}( function for calculating ``{method}`` on this object's data. These could include dask-specific kwargs like ``split_every``.""" +_COUNT_SEE_ALSO = """ + See Also + -------- + pandas.DataFrame.{method} + dask.dataframe.DataFrame.{method} + {see_also_obj}.{method} + :ref:`{docref}` + User guide on {docref_description}.""" + _NUMERIC_ONLY_NOTES = "Non-numeric variables will be removed prior to reducing." 
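The change in this file swaps the flox gate from contains_only_dask_or_numpy to contains_only_chunked_or_numpy, so data chunked through any registered chunk manager (for example dask, or prospectively cubed) keeps the fast path. A small illustration of the predicate, assuming dask is installed for the .chunk() call:

import numpy as np
import xarray as xr
from xarray.core.utils import contains_only_chunked_or_numpy

ds = xr.Dataset({"a": ("x", np.arange(4.0))})

# Plain numpy-backed data passes the check ...
assert contains_only_chunked_or_numpy(ds)

# ... and so does data chunked through a registered chunk manager, so the
# generated aggregations can keep dispatching to flox in both cases.
assert contains_only_chunked_or_numpy(ds.chunk({"x": 2}))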
_FLOX_NOTES_TEMPLATE = """Use the ``flox`` package to significantly speed up {kind} computations, @@ -248,7 +258,7 @@ def __init__( else: self.array_method = name self.np_example_array = """ - ... np.array([1, 2, 3, 1, 2, np.nan])""" + ... np.array([1, 2, 3, 0, 2, np.nan])""" class AggregationGenerator: @@ -311,7 +321,9 @@ def generate_method(self, method): yield TEMPLATE_RETURNS.format(**template_kwargs) - yield TEMPLATE_SEE_ALSO.format( + see_also = _COUNT_SEE_ALSO if method.name == "count" else TEMPLATE_SEE_ALSO + # Fixes broken links mentioned in #8055 + yield see_also.format( **template_kwargs, docref=self.docref, docref_description=self.docref_description, @@ -394,7 +406,7 @@ def generate_code(self, method): if ( flox_available and OPTIONS["use_flox"] - and contains_only_dask_or_numpy(self._obj) + and contains_only_chunked_or_numpy(self._obj) ): return self._flox_reduce( func="{method.name}",