Skip to content

Commit ec9659c

Browse files
committed
Merge remote-tracking branch 'upstream' into perf/groupby-arrow-native
2 parents e350cfd + 9694eb1 commit ec9659c

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

45 files changed

+447
-173
lines changed

.github/workflows/unit-tests.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ jobs:
3131
# Prevent the include jobs from overriding other jobs
3232
pattern: [""]
3333
pandas_future_infer_string: ["1"]
34+
pandas_future_python_scalars: ["0"]
3435
include:
3536
- name: "Downstream Compat"
3637
env_file: actions-313-downstream_compat.yaml
@@ -75,6 +76,10 @@ jobs:
7576
env_file: actions-313.yaml
7677
pandas_future_infer_string: "0"
7778
platform: ubuntu-24.04
79+
- name: "PANDAS_FUTURE_PYTHON_SCALARS=1"
80+
env_file: actions-313.yaml
81+
pandas_future_python_scalars: "1"
82+
platform: ubuntu-24.04
7883
- name: "Numpy Dev"
7984
env_file: actions-313-numpydev.yaml
8085
pattern: "not slow and not network and not single_cpu"
@@ -92,6 +97,7 @@ jobs:
9297
LC_ALL: ${{ matrix.lc_all || '' }}
9398
PANDAS_CI: '1'
9499
PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '1' }}
100+
PANDAS_FUTURE_PYTHON_SCALARS: ${{ matrix.pandas_future_python_scalars || '0' }}
95101
TEST_ARGS: ${{ matrix.test_args || '' }}
96102
PYTEST_WORKERS: 'auto'
97103
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,6 +1160,7 @@ Timedelta
11601160
- Accuracy improvement in :meth:`Timedelta.to_pytimedelta` to round microseconds consistently for large nanosecond-based Timedelta (:issue:`57841`)
11611161
- Bug in :class:`Timedelta` constructor failing to raise when passed an invalid keyword (:issue:`53801`)
11621162
- Bug in :meth:`DataFrame.cumsum` which was raising ``IndexError`` if dtype is ``timedelta64[ns]`` (:issue:`57956`)
1163+
- Bug in adding or subtracting a :class:`Timedelta` object with a non-nanosecond unit to or from a Python ``datetime.datetime`` object giving incorrect results; this now works correctly for Timedeltas within the bounds supported by ``datetime.timedelta`` (:issue:`53643`)
11631164
- Bug in multiplication operations with ``timedelta64`` dtype failing to raise ``TypeError`` when multiplying by ``bool`` objects or dtypes (:issue:`58054`)
11641165
- Bug in multiplication operations with ``timedelta64`` dtype incorrectly raising when multiplying by numpy-nullable dtypes or pyarrow integer dtypes (:issue:`58054`)
11651166

pandas/_config/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ def using_string_dtype() -> bool:
3535
return _mode_options["infer_string"]
3636

3737

38+
def using_python_scalars() -> bool:
39+
_mode_options = _global_config["future"]
40+
return _mode_options["python_scalars"]
41+
42+
3843
def is_nan_na() -> bool:
3944
_mode_options = _global_config["future"]
4045
return not _mode_options["distinguish_nan_and_na"]

pandas/_libs/meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ foreach ext_name, ext_dict : libs_sources
160160
ext_dict.get('sources'),
161161
cython_args: cython_args,
162162
include_directories: [inc_np, inc_pd],
163-
dependencies: ext_dict.get('deps', ''),
163+
dependencies: ext_dict.get('deps', []),
164164
subdir: 'pandas/_libs',
165165
install: true,
166166
)

pandas/_libs/tslibs/meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ foreach ext_name, ext_dict : tslibs_sources
4040
ext_dict.get('sources'),
4141
cython_args: cython_args,
4242
include_directories: [inc_np, inc_pd],
43-
dependencies: ext_dict.get('deps', ''),
43+
dependencies: ext_dict.get('deps', []),
4444
subdir: 'pandas/_libs/tslibs',
4545
install: true,
4646
)

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,9 +1022,23 @@ cdef _timedelta_from_value_and_reso(cls, int64_t value, NPY_DATETIMEUNIT reso):
10221022
elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
10231023
td_base = _Timedelta.__new__(cls, microseconds=int(value))
10241024
elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
1025-
td_base = _Timedelta.__new__(cls, milliseconds=0)
1025+
if -86_399_999_913_600_000 <= value < 86_400_000_000_000_000:
1026+
# i.e. we are in range for pytimedelta. By passing the
1027+
# 'correct' value here we can
1028+
# make pydatetime + Timedelta operations work correctly,
1029+
# xref GH#53643
1030+
td_base = _Timedelta.__new__(cls, milliseconds=value)
1031+
else:
1032+
td_base = _Timedelta.__new__(cls, milliseconds=0)
10261033
elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
1027-
td_base = _Timedelta.__new__(cls, seconds=0)
1034+
if -86_399_999_913_600 <= value < 86_400_000_000_000:
1035+
# i.e. we are in range for pytimedelta. By passing the
1036+
# 'correct' value here we can
1037+
# make pydatetime + Timedelta operations work correctly,
1038+
# xref GH#53643
1039+
td_base = _Timedelta.__new__(cls, seconds=value)
1040+
else:
1041+
td_base = _Timedelta.__new__(cls, seconds=0)
10281042
# Other resolutions are disabled but could potentially be implemented here:
10291043
# elif reso == NPY_DATETIMEUNIT.NPY_FR_m:
10301044
# td_base = _Timedelta.__new__(Timedelta, minutes=int(value))

pandas/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2094,6 +2094,11 @@ def using_infer_string() -> bool:
20942094
return pd.options.future.infer_string is True
20952095

20962096

2097+
@pytest.fixture
2098+
def using_python_scalars() -> bool:
2099+
return pd.options.future.python_scalars is True
2100+
2101+
20972102
_warsaws: list[Any] = ["Europe/Warsaw", "dateutil/Europe/Warsaw"]
20982103
if pytz is not None:
20992104
_warsaws.append(pytz.timezone("Europe/Warsaw"))

pandas/core/algorithms.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -898,18 +898,15 @@ def value_counts_internal(
898898
result = result.iloc[0:0]
899899

900900
# normalizing is by len of all (regardless of dropna)
901-
counts = np.array([len(ii)])
901+
normalize_denominator = len(ii)
902902

903903
else:
904+
normalize_denominator = None
904905
if is_extension_array_dtype(values):
905906
# handle Categorical and sparse,
906907
result = Series(values, copy=False)._values.value_counts(dropna=dropna)
907908
result.name = name
908909
result.index.name = index_name
909-
counts = result._values
910-
if not isinstance(counts, np.ndarray):
911-
# e.g. ArrowExtensionArray
912-
counts = np.asarray(counts)
913910

914911
elif isinstance(values, ABCMultiIndex):
915912
# GH49558
@@ -920,10 +917,6 @@ def value_counts_internal(
920917
.size()
921918
)
922919
result.index.names = values.names
923-
# error: Incompatible types in assignment (expression has type
924-
# "ndarray[Any, Any] | DatetimeArray | TimedeltaArray | PeriodArray | Any",
925-
# variable has type "ndarray[tuple[int, ...], dtype[Any]]")
926-
counts = result._values # type: ignore[assignment]
927920

928921
else:
929922
values = _ensure_arraylike(values, func_name="value_counts")
@@ -936,8 +929,7 @@ def value_counts_internal(
936929
idx = Index(keys, dtype=keys.dtype, name=index_name)
937930

938931
if (
939-
bins is None
940-
and not sort
932+
not sort
941933
and isinstance(values, (DatetimeIndex, TimedeltaIndex))
942934
and idx.equals(values)
943935
and values.inferred_freq is not None
@@ -951,7 +943,10 @@ def value_counts_internal(
951943
result = result.sort_values(ascending=ascending, kind="stable")
952944

953945
if normalize:
954-
result = result / counts.sum()
946+
if normalize_denominator is not None:
947+
result = result / normalize_denominator
948+
else:
949+
result = result / result.sum()
955950

956951
return result
957952

pandas/core/arraylike.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from pandas._libs import lib
1616
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
1717

18+
from pandas.core.dtypes.cast import maybe_unbox_numpy_scalar
1819
from pandas.core.dtypes.generic import ABCNDFrame
1920

2021
from pandas.core import roperator
@@ -529,4 +530,6 @@ def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwar
529530

530531
# By default, numpy's reductions do not skip NaNs, so we have to
531532
# pass skipna=False
532-
return getattr(self, method_name)(skipna=False, **kwargs)
533+
result = getattr(self, method_name)(skipna=False, **kwargs)
534+
result = maybe_unbox_numpy_scalar(result)
535+
return result

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,12 +203,16 @@ def _str_swapcase(self) -> Self:
203203
return self._from_pyarrow_array(pc.utf8_swapcase(self._pa_array))
204204

205205
def _str_removeprefix(self, prefix: str):
206+
if prefix == "":
207+
return self._from_pyarrow_array(self._pa_array)
206208
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
207209
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
208210
result = pc.if_else(starts_with, removed, self._pa_array)
209211
return self._from_pyarrow_array(result)
210212

211213
def _str_removesuffix(self, suffix: str):
214+
if suffix == "":
215+
return self._from_pyarrow_array(self._pa_array)
212216
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
213217
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
214218
result = pc.if_else(ends_with, removed, self._pa_array)

0 commit comments

Comments
 (0)