Merge branch 'main' into bump/optional
mroeschke authored Feb 14, 2025
2 parents 7dfe5bc + 6bcd303 commit 7252460
Showing 35 changed files with 263 additions and 101 deletions.
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -106,6 +106,11 @@ repos:
hooks:
- id: meson-fmt
args: ['--inplace']
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.10.0.1
hooks:
- id: shellcheck
args: ["--severity=warning"]
- repo: local
hooks:
- id: pyright
18 changes: 9 additions & 9 deletions ci/code_checks.sh
@@ -24,15 +24,15 @@ else
fi

[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; }

BASE_DIR="$(dirname $0)/.."
BASE_DIR="$(dirname "$0")/.."
RET=0

### CODE ###
if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then

MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo $MSG
MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo "$MSG"
python -W error -c "
import sys
import pandas
@@ -49,24 +49,24 @@ if mods:
sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods)))
sys.exit(len(mods))
"
RET=$(($RET + $?)) ; echo $MSG "DONE"
RET=$(($RET + $?)) ; echo "$MSG" "DONE"

fi

### DOCTESTS ###
if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then

MSG='Python and Cython Doctests' ; echo $MSG
MSG='Python and Cython Doctests' ; echo "$MSG"
python -c 'import pandas as pd; pd.test(run_doctests=True)'
RET=$(($RET + $?)) ; echo $MSG "DONE"
RET=$(($RET + $?)) ; echo "$MSG" "DONE"

fi

### DOCSTRINGS ###
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then

MSG='Validate Docstrings' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py \
MSG='Validate Docstrings' ; echo "$MSG"
"$BASE_DIR"/scripts/validate_docstrings.py \
--format=actions \
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
@@ -265,7 +265,7 @@ fi
if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then

MSG='Notebooks' ; echo $MSG
jupyter nbconvert --execute $(find doc/source -name '*.ipynb') --to notebook
jupyter nbconvert --execute "$(find doc/source -name '*.ipynb')" --to notebook
RET=$(($RET + $?)) ; echo $MSG "DONE"

fi
8 changes: 3 additions & 5 deletions ci/run_tests.sh
@@ -3,10 +3,8 @@
# Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set)
# https://github.com/pytest-dev/pytest/issues/920
# https://github.com/pytest-dev/pytest/issues/1075
export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')

# May help reproduce flaky CI builds if set in subsequent runs
echo PYTHONHASHSEED=$PYTHONHASHSEED
PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
export PYTHONHASHSEED

COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml"

@@ -16,5 +14,5 @@ if [[ "$PATTERN" ]]; then
PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
fi

echo $PYTEST_CMD
echo "$PYTEST_CMD"
sh -c "$PYTEST_CMD"
9 changes: 5 additions & 4 deletions ci/upload_wheels.sh
@@ -1,3 +1,4 @@
#!/bin/bash
# Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh

set_upload_vars() {
@@ -19,20 +20,20 @@ set_upload_vars() {
fi
}
upload_wheels() {
echo ${PWD}
echo "${PWD}"
if [[ ${ANACONDA_UPLOAD} == true ]]; then
if [ -z ${TOKEN} ]; then
if [ -z "${TOKEN}" ]; then
echo no token set, not uploading
else
# sdists are located under dist folder when built through setup.py
if compgen -G "./dist/*.gz"; then
echo "Found sdist"
anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz
anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz
echo "Uploaded sdist"
fi
if compgen -G "./wheelhouse/*.whl"; then
echo "Found wheel"
anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl
anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl
echo "Uploaded wheel"
fi
echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple"
2 changes: 1 addition & 1 deletion doc/source/development/contributing_codebase.rst
@@ -344,7 +344,7 @@ be located.
- tests.scalar
- tests.tseries.offsets

2. Does your test depend only on code in pd._libs?
2. Does your test depend only on code in ``pd._libs``?
This test likely belongs in one of:

- tests.libs
2 changes: 1 addition & 1 deletion doc/source/development/developer.rst
@@ -99,7 +99,7 @@ Column metadata
* Boolean: ``'bool'``
* Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'``
* Floats: ``'float16', 'float32', 'float64'``
* Date and Time Types: ``'datetime', 'datetimetz'``, ``'timedelta'``
* Date and Time Types: ``'datetime', 'datetimetz', 'timedelta'``
* String: ``'unicode', 'bytes'``
* Categorical: ``'categorical'``
* Other Python objects: ``'object'``
1 change: 1 addition & 0 deletions doc/source/reference/series.rst
@@ -25,6 +25,7 @@ Attributes
Series.array
Series.values
Series.dtype
Series.info
Series.shape
Series.nbytes
Series.ndim
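The entry added above points at the existing ``Series.info`` method. For context, a quick sketch of what it reports:

import pandas as pd

s = pd.Series([1, 2, None], name="x")
s.info()  # prints the class, index type, non-null count, dtype, and memory usage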
4 changes: 2 additions & 2 deletions doc/source/user_guide/window.rst
@@ -356,11 +356,11 @@ See :ref:`enhancing performance with Numba <enhancingperf.numba>` for general us

Numba will be applied in potentially two routines:

#. If ``func`` is a standard Python function, the engine will `JIT <https://numba.pydata.org/numba-doc/latest/user/overview.html>`__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
#. If ``func`` is a standard Python function, the engine will `JIT <https://numba.readthedocs.io/en/stable/user/overview.html>`__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
#. The engine will JIT the for loop where the apply function is applied to each window.

The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
`numba.jit decorator <https://numba.pydata.org/numba-doc/latest/reference/jit-compilation.html#numba.jit>`__.
`numba.jit decorator <https://numba.readthedocs.io/en/stable/user/jit.html>`__.
These keyword arguments will be applied to *both* the passed function (if a standard Python function)
and the apply for loop over each window.

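To make the two JIT targets concrete, a minimal sketch follows (assuming numba is installed; note the numba engine requires ``raw=True``):

import numpy as np
import pandas as pd

s = pd.Series(np.arange(100.0))

# The lambda is JITed once; the loop applying it to each window is JITed too.
# engine_kwargs is forwarded to numba.jit for both compilations.
result = s.rolling(10).apply(
    lambda x: np.mean(x),
    raw=True,
    engine="numba",
    engine_kwargs={"nopython": True, "nogil": True, "parallel": False},
)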
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -32,6 +32,7 @@ Other enhancements
- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limit of 32767 characters (:issue:`56954`)
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
@@ -70,6 +71,7 @@ Other enhancements
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
97 changes: 63 additions & 34 deletions pandas/_libs/groupby.pyx
@@ -753,16 +753,20 @@ def group_sum(

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if not skipna and isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue
if not skipna:
if uses_mask:
isna_result = result_mask[lab, j]
else:
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
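The refactor above computes ``isna_result`` only on the ``skipna=False`` path, the one case where an already-NA accumulator must short-circuit the loop. At the API level the behavior looks like this (a sketch, assuming a pandas build from the 3.0 development line, where groupby reductions accept ``skipna``):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 2.0]})

df.groupby("key")["val"].sum()              # skipna=True (default): a -> 1.0, b -> 2.0
df.groupby("key")["val"].sum(skipna=False)  # NaN poisons group a: a -> NaN, b -> 2.0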
@@ -845,14 +849,18 @@ def group_prod(

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, False)
isna_result = _treat_as_na(prodx[lab, j], False)

if not skipna and isna_result:
# If prod is already NA, no need to update it
continue
if not skipna:
if uses_mask:
isna_result = result_mask[lab, j]
else:
isna_result = _treat_as_na(prodx[lab, j], False)

if isna_result:
# If prod is already NA, no need to update it
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -919,22 +927,30 @@ def group_var(

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_var, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = out[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(out[lab, j], is_datetimelike)

if not skipna and isna_result:
# If aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue
if not skipna:
if uses_mask:
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_var, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_result = out[lab, j] == NPY_NAT
else:
isna_result = _treat_as_na(out[lab, j], is_datetimelike)

if isna_result:
# If aggregate is already NA, don't add to it. This is
# important for datetimelike because adding a value to NPY_NAT
# may not result in a NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -1232,22 +1248,30 @@ def group_mean(

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_mean, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = sumx[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if not skipna and isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue
if not skipna:
if uses_mask:
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_mean, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_result = sumx[lab, j] == NPY_NAT
else:
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -1909,15 +1933,20 @@ cdef group_min_max(

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(group_min_or_max[lab, j],
is_datetimelike)

if not skipna and isna_result:
# If current min/max is already NA, it will always be NA
continue
if not skipna:
if uses_mask:
isna_result = result_mask[lab, j]
else:
isna_result = _treat_as_na(
group_min_or_max[lab, j], is_datetimelike
)

if isna_result:
# If current min/max is already NA, it will always be NA
continue

if not isna_entry:
nobs[lab, j] += 1
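The same "once NA, always NA" rule is what the NPY_NAT comments address for datetimelike values: with ``skipna=False``, a missing timestamp pins the group result at NaT (same pandas-3.0 assumption as above):

import pandas as pd

s = pd.Series(pd.to_datetime(["2025-01-01", None, "2025-02-01"]))
key = ["a", "a", "b"]

s.groupby(key).min()              # NaT skipped: a -> 2025-01-01
s.groupby(key).min(skipna=False)  # a -> NaT, b -> 2025-02-01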
5 changes: 5 additions & 0 deletions pandas/_libs/lib.pyx
@@ -1522,6 +1522,11 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
"""
Return a string label of the type of a scalar or list-like of values.

This method inspects the elements of the provided input and determines the
classification of its data type. It is particularly useful for
handling heterogeneous data inputs where explicit dtype conversion may not
be possible or necessary.

Parameters
----------
value : scalar, list, ndarray, or pandas type
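The added summary is easy to exercise through the public alias ``pandas.api.types.infer_dtype``, which wraps this function:

import numpy as np
from pandas.api.types import infer_dtype

infer_dtype([1, 2, 3])                    # 'integer'
infer_dtype([1.0, 2, 3])                  # 'mixed-integer-float'
infer_dtype(["a", np.nan], skipna=True)   # 'string'
infer_dtype(["a", np.nan], skipna=False)  # 'mixed'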
7 changes: 7 additions & 0 deletions pandas/_libs/tslibs/strptime.pyx
@@ -924,6 +924,13 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday)

correction = date(iso_year, 1, 4).isoweekday() + 3
ordinal = (iso_week * 7) + iso_weekday - correction

if iso_week == 53:
now = date.fromordinal(date(iso_year, 1, 1).toordinal() + ordinal - iso_weekday)
jan_4th = date(iso_year+1, 1, 4)
if (jan_4th - now).days < 7:
raise ValueError(f"Week 53 does not exist in ISO year {iso_year}.")

# ordinal may be negative or 0 now, which means the date is in the previous
# calendar year
if ordinal < 1:
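The new guard encodes the standard ISO-calendar rule: an ISO year has a week 53 only if December 28 of that year falls in week 53. A stand-alone check of the same rule (the helper name is illustrative, not part of the patch):

from datetime import date

def iso_year_has_week_53(iso_year: int) -> bool:
    # December 28 always falls in the last ISO week of its ISO year
    return date(iso_year, 12, 28).isocalendar()[1] == 53

iso_year_has_week_53(2020)  # True: "2020 week 53" is a valid ISO week
iso_year_has_week_53(2021)  # False: parsing week 53 of 2021 now raises ValueError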
2 changes: 1 addition & 1 deletion pandas/core/_numba/kernels/min_max_.py
@@ -98,7 +98,7 @@ def grouped_min_max(
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
if lab < 0 or (not skipna and nobs[lab] >= 1 and np.isnan(output[lab])):
continue

if values.dtype.kind == "i" or not np.isnan(val):
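This kernel backs the ``engine="numba"`` path for grouped min/max, so the added ``not skipna`` test keeps the NA short-circuit from firing while NAs are being skipped. A sketch of the user-visible behavior (assuming numba is installed and, as above, a build whose groupby reductions accept both ``skipna`` and ``engine``):

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0])
key = [0, 0, 1]

s.groupby(key).min(engine="numba", skipna=False)  # group 0 -> NaN, group 1 -> 3.0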
5 changes: 5 additions & 0 deletions pandas/core/arrays/sparse/accessor.py
@@ -279,6 +279,11 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate):
"""
DataFrame accessor for sparse data.
It allows users to interact with a `DataFrame` that contains sparse data types
(`SparseDtype`). It provides methods and attributes to efficiently work with sparse
storage, reducing memory usage while maintaining compatibility with standard pandas
operations.
Parameters
----------
data : scipy.sparse.spmatrix
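For context, a minimal use of the accessor whose docstring is extended here:

import pandas as pd

df = pd.DataFrame({"a": [0, 0, 1, 0]}, dtype=pd.SparseDtype("int64", 0))

df.sparse.density     # 0.25: fraction of values stored explicitly
df.sparse.to_dense()  # back to an ordinary dense int64 column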
2 changes: 1 addition & 1 deletion pandas/core/dtypes/common.py
@@ -1836,7 +1836,7 @@ def pandas_dtype(dtype) -> DtypeObj:
# raise a consistent TypeError if failed
try:
with warnings.catch_warnings():
# TODO: warnings.catch_warnings can be removed when numpy>2.2.2
# TODO: warnings.catch_warnings can be removed when numpy>2.3.0
# is the minimum version
# GH#51523 - Series.astype(np.integer) doesn't show
# numpy deprecation warning of np.integer