Skip to content

Commit 30151cd

Browse files
authored
Merge branch 'pandas-dev:main' into main
2 parents c8b7a05 + 9ee361b commit 30151cd

30 files changed

+261
-66
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,7 @@ Other API changes
820820
:meth:`~DataFrame.ffill`, :meth:`~DataFrame.bfill`, :meth:`~DataFrame.interpolate`,
821821
:meth:`~DataFrame.where`, :meth:`~DataFrame.mask`, :meth:`~DataFrame.clip`) now return
822822
the modified DataFrame or Series (``self``) instead of ``None`` when ``inplace=True`` (:issue:`63207`)
823+
- All Index constructors now copy ``numpy.ndarray`` and ``ExtensionArray`` inputs by default when ``copy=None``, consistent with :class:`Series` behavior (:issue:`63388`)
823824

824825
.. ---------------------------------------------------------------------------
825826
.. _whatsnew_300.deprecations:

pandas/_libs/meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ foreach ext_name, ext_dict : libs_sources
160160
ext_dict.get('sources'),
161161
cython_args: cython_args,
162162
include_directories: [inc_np, inc_pd],
163-
dependencies: ext_dict.get('deps', ''),
163+
dependencies: ext_dict.get('deps', []),
164164
subdir: 'pandas/_libs',
165165
install: true,
166166
)

pandas/_libs/tslibs/meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ foreach ext_name, ext_dict : tslibs_sources
4040
ext_dict.get('sources'),
4141
cython_args: cython_args,
4242
include_directories: [inc_np, inc_pd],
43-
dependencies: ext_dict.get('deps', ''),
43+
dependencies: ext_dict.get('deps', []),
4444
subdir: 'pandas/_libs/tslibs',
4545
install: true,
4646
)

pandas/core/algorithms.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -878,18 +878,15 @@ def value_counts_internal(
878878
result = result.iloc[0:0]
879879

880880
# normalize by the length of all values (regardless of dropna)
881-
counts = np.array([len(ii)])
881+
normalize_denominator = len(ii)
882882

883883
else:
884+
normalize_denominator = None
884885
if is_extension_array_dtype(values):
885886
# handle Categorical and sparse,
886887
result = Series(values, copy=False)._values.value_counts(dropna=dropna)
887888
result.name = name
888889
result.index.name = index_name
889-
counts = result._values
890-
if not isinstance(counts, np.ndarray):
891-
# e.g. ArrowExtensionArray
892-
counts = np.asarray(counts)
893890

894891
elif isinstance(values, ABCMultiIndex):
895892
# GH49558
@@ -900,10 +897,6 @@ def value_counts_internal(
900897
.size()
901898
)
902899
result.index.names = values.names
903-
# error: Incompatible types in assignment (expression has type
904-
# "ndarray[Any, Any] | DatetimeArray | TimedeltaArray | PeriodArray | Any",
905-
# variable has type "ndarray[tuple[int, ...], dtype[Any]]")
906-
counts = result._values # type: ignore[assignment]
907900

908901
else:
909902
values = _ensure_arraylike(values, func_name="value_counts")
@@ -916,8 +909,7 @@ def value_counts_internal(
916909
idx = Index(keys, dtype=keys.dtype, name=index_name)
917910

918911
if (
919-
bins is None
920-
and not sort
912+
not sort
921913
and isinstance(values, (DatetimeIndex, TimedeltaIndex))
922914
and idx.equals(values)
923915
and values.inferred_freq is not None
@@ -931,7 +923,10 @@ def value_counts_internal(
931923
result = result.sort_values(ascending=ascending, kind="stable")
932924

933925
if normalize:
934-
result = result / counts.sum()
926+
if normalize_denominator is not None:
927+
result = result / normalize_denominator
928+
else:
929+
result = result / result.sum()
935930

936931
return result
937932

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,12 +203,16 @@ def _str_swapcase(self) -> Self:
203203
return self._from_pyarrow_array(pc.utf8_swapcase(self._pa_array))
204204

205205
def _str_removeprefix(self, prefix: str):
206+
if prefix == "":
207+
return self._from_pyarrow_array(self._pa_array)
206208
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
207209
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
208210
result = pc.if_else(starts_with, removed, self._pa_array)
209211
return self._from_pyarrow_array(result)
210212

211213
def _str_removesuffix(self, suffix: str):
214+
if suffix == "":
215+
return self._from_pyarrow_array(self._pa_array)
212216
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
213217
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
214218
result = pc.if_else(ends_with, removed, self._pa_array)

pandas/core/arrays/datetimes.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,16 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
226226
"""
227227

228228
_typ = "datetimearray"
229-
_internal_fill_value = np.datetime64("NaT", "ns")
230229
_recognized_scalars = (datetime, np.datetime64)
231230
_is_recognized_dtype: Callable[[DtypeObj], bool] = lambda x: lib.is_np_dtype(
232231
x, "M"
233232
) or isinstance(x, DatetimeTZDtype)
234233
_infer_matches = ("datetime", "datetime64", "date")
235234

235+
@property
236+
def _internal_fill_value(self) -> np.datetime64:
237+
return np.datetime64("NaT", self.unit)
238+
236239
@property
237240
def _scalar_type(self) -> type[Timestamp]:
238241
return Timestamp

pandas/core/arrays/timedeltas.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,14 @@ class TimedeltaArray(dtl.TimelikeOps):
154154
"""
155155

156156
_typ = "timedeltaarray"
157-
_internal_fill_value = np.timedelta64("NaT", "ns")
158157
_recognized_scalars = (timedelta, np.timedelta64, Tick)
159158
_is_recognized_dtype: Callable[[DtypeObj], bool] = lambda x: lib.is_np_dtype(x, "m")
160159
_infer_matches = ("timedelta", "timedelta64")
161160

161+
@property
162+
def _internal_fill_value(self) -> np.timedelta64:
163+
return np.timedelta64("NaT", self.unit)
164+
162165
@property
163166
def _scalar_type(self) -> type[Timedelta]:
164167
return Timedelta

pandas/core/indexes/base.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -505,12 +505,8 @@ def __new__(
505505
if not copy and isinstance(data, (ABCSeries, Index)):
506506
refs = data._references
507507

508-
if isinstance(data, (ExtensionArray, np.ndarray)):
509-
# GH 63306
510-
if copy is not False:
511-
if dtype is None or astype_is_view(data.dtype, dtype):
512-
data = data.copy()
513-
copy = False
508+
# GH 63306, GH 63388
509+
data, copy = cls._maybe_copy_array_input(data, copy, dtype)
514510

515511
# range
516512
if isinstance(data, (range, RangeIndex)):
@@ -5197,6 +5193,21 @@ def _raise_scalar_data_error(cls, data):
51975193
"was passed"
51985194
)
51995195

5196+
@classmethod
5197+
def _maybe_copy_array_input(
5198+
cls, data, copy: bool | None, dtype
5199+
) -> tuple[Any, bool]:
5200+
"""
5201+
Ensure that the input data is copied if necessary.
5202+
GH#63388
5203+
"""
5204+
if isinstance(data, (ExtensionArray, np.ndarray)):
5205+
if copy is not False:
5206+
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
5207+
data = data.copy()
5208+
copy = False
5209+
return data, bool(copy)
5210+
52005211
def _validate_fill_value(self, value):
52015212
"""
52025213
Check if the value can be inserted into our array without casting,
@@ -6865,12 +6876,15 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int:
68656876
# we need to look up the label
68666877
try:
68676878
slc = self.get_loc(label)
6868-
except KeyError as err:
6879+
except KeyError:
68696880
try:
68706881
return self._searchsorted_monotonic(label, side)
68716882
except ValueError:
6872-
# raise the original KeyError
6873-
raise err from None
6883+
raise KeyError(
6884+
f"Cannot get {side} slice bound for non-monotonic index "
6885+
f"with a missing label {original_label!r}. "
6886+
"Either sort the index or specify an existing label."
6887+
) from None
68746888

68756889
if isinstance(slc, np.ndarray):
68766890
# get_loc may return a boolean array, which

pandas/core/indexes/datetimes.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,13 @@ class DatetimeIndex(DatetimeTimedeltaMixin):
181181
If True parse dates in `data` with the year first order.
182182
dtype : numpy.dtype or DatetimeTZDtype or str, default None
183183
Note that the only NumPy dtype allowed is `datetime64[ns]`.
184-
copy : bool, default False
185-
Make a copy of input ndarray.
184+
copy : bool, default None
185+
Whether to copy input data, only relevant for array, Series, and Index
186+
inputs (for other input, e.g. a list, a new array is created anyway).
187+
Defaults to True for array input and False for Index/Series.
188+
Set to False to avoid copying array input at your own risk (if you
189+
know the input data won't be modified elsewhere).
190+
Set to True to force copying Series/Index input up front.
186191
name : label, default None
187192
Name to be stored in the index.
188193
@@ -669,7 +674,7 @@ def __new__(
669674
dayfirst: bool = False,
670675
yearfirst: bool = False,
671676
dtype: Dtype | None = None,
672-
copy: bool = False,
677+
copy: bool | None = None,
673678
name: Hashable | None = None,
674679
) -> Self:
675680
if is_scalar(data):
@@ -679,6 +684,9 @@ def __new__(
679684

680685
name = maybe_extract_name(name, data, cls)
681686

687+
# GH#63388
688+
data, copy = cls._maybe_copy_array_input(data, copy, dtype)
689+
682690
if (
683691
isinstance(data, DatetimeArray)
684692
and freq is lib.no_default

pandas/core/indexes/interval.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,13 @@ class IntervalIndex(ExtensionIndex):
169169
neither.
170170
dtype : dtype or None, default None
171171
If None, dtype will be inferred.
172-
copy : bool, default False
173-
Copy the input data.
172+
copy : bool, default None
173+
Whether to copy input data, only relevant for array, Series, and Index
174+
inputs (for other input, e.g. a list, a new array is created anyway).
175+
Defaults to True for array input and False for Index/Series.
176+
Set to False to avoid copying array input at your own risk (if you
177+
know the input data won't be modified elsewhere).
178+
Set to True to force copying Series/Index input up front.
174179
name : object, optional
175180
Name to be stored in the index.
176181
verify_integrity : bool, default True
@@ -252,12 +257,15 @@ def __new__(
252257
data,
253258
closed: IntervalClosedType | None = None,
254259
dtype: Dtype | None = None,
255-
copy: bool = False,
260+
copy: bool | None = None,
256261
name: Hashable | None = None,
257262
verify_integrity: bool = True,
258263
) -> Self:
259264
name = maybe_extract_name(name, data, cls)
260265

266+
# GH#63388
267+
data, copy = cls._maybe_copy_array_input(data, copy, dtype)
268+
261269
with rewrite_exception("IntervalArray", cls.__name__):
262270
array = IntervalArray(
263271
data,

0 commit comments

Comments
 (0)