diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index b2a146367aed0..c694061904ba4 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -57,7 +57,7 @@ jobs:
       run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0

     - name: Run doctests
-      run: cd ci && ./code_checks.sh doctests
+      run: cd ci && PANDAS_FUTURE_PYTHON_SCALARS="1" ./code_checks.sh doctests
       if: ${{ steps.build.outcome == 'success' && always() }}

     - name: Install pandas in editable mode
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 74c79c7025ec6..a17ec1bf456d6 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -109,6 +109,20 @@ def pytest_addoption(parser) -> None:
     )


+def pytest_sessionstart(session):
+    import doctest
+    import inspect
+
+    orig = doctest.DocTestFinder._from_module  # type: ignore[attr-defined]
+
+    def _from_module(self, module, object):
+        if inspect.isfunction(object) and "." in object.__qualname__:
+            return True
+        return orig(self, module, object)
+
+    doctest.DocTestFinder._from_module = _from_module  # type: ignore[attr-defined]
+
+
 def ignore_doctest_warning(item: pytest.Item, path: str, message: str) -> None:
     """Ignore doctest warning.

@@ -135,14 +149,15 @@ def pytest_collection_modifyitems(items, config) -> None:
     # Warnings from doctests that can be ignored; place reason in comment above.
     # Each entry specifies (path, message) - see the ignore_doctest_warning function
     ignored_doctest_warnings = [
-        ("api.interchange.from_dataframe", ".*Interchange Protocol is deprecated"),
+        ("api.interchange.from_dataframe", "The DataFrame Interchange Protocol"),
         ("is_int64_dtype", "is_int64_dtype is deprecated"),
         ("is_interval_dtype", "is_interval_dtype is deprecated"),
         ("is_period_dtype", "is_period_dtype is deprecated"),
         ("is_datetime64tz_dtype", "is_datetime64tz_dtype is deprecated"),
         ("is_categorical_dtype", "is_categorical_dtype is deprecated"),
         ("is_sparse", "is_sparse is deprecated"),
-        ("DataFrame.__dataframe__", "Interchange Protocol is deprecated"),
+        ("CategoricalDtype._from_values_or_dtype", "Constructing a Categorical"),
+        ("DataFrame.__dataframe__", "The DataFrame Interchange Protocol"),
         ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"),
         ("DataFrameGroupBy.corrwith", "DataFrameGroupBy.corrwith is deprecated"),
         ("NDFrame.replace", "Series.replace without 'value'"),
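
Note on the conftest.py hook above: the stock doctest.DocTestFinder._from_module() only keeps objects it can prove belong to the module being scanned. Anything whose __module__ has been re-pointed (as pandas' set_module decorator does for public API objects — visible in the accessor.py hunks below) stops looking like a member of its defining module, and its doctests are silently skipped. The hook forces collection of any function nested in a class (dotted __qualname__). A self-contained sketch of the failure mode and the fix; Greeter is a hypothetical stand-in, not pandas code:

    import doctest
    import inspect
    import json  # any already-imported module works for the simulation

    class Greeter:
        def hello(self):
            """
            >>> Greeter().hello()
            'hi'
            """
            return "hi"

    # Simulate a set_module-style decorator: the method now claims to live in
    # another module, so the stock finder no longer ties it to this one.
    Greeter.hello.__module__ = "json"

    finder = doctest.DocTestFinder()
    print([t.name for t in finder.find(Greeter)])  # [] -- doctest skipped

    orig = doctest.DocTestFinder._from_module

    def _from_module(self, module, object):
        # Same shape as the conftest hook: trust any class-nested function.
        if inspect.isfunction(object) and "." in object.__qualname__:
            return True
        return orig(self, module, object)

    doctest.DocTestFinder._from_module = _from_module
    print([t.name for t in finder.find(Greeter)])  # ['Greeter.hello']
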
diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
index 01cdc417742bd..4163de0d2cf01 100644
--- a/pandas/core/accessor.py
+++ b/pandas/core/accessor.py
@@ -416,7 +416,7 @@ def register_dataframe_accessor(name: str) -> Callable[[TypeT], TypeT]:
    AttributeError: The series must contain integer data only.
    >>> df = pd.Series([1, 2, 3])
    >>> df.int_accessor.sum()
-   np.int64(6)"""
+   6"""


 @set_module("pandas.api.extensions")
@@ -481,7 +481,7 @@ def register_series_accessor(name: str) -> Callable[[TypeT], TypeT]:
    AttributeError: The series must contain integer data only.
    >>> df = pd.Series([1, 2, 3])
    >>> df.int_accessor.sum()
-   np.int64(6)
+   6
    """

    from pandas import Series
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index d3c0bbdf53310..d99b0457c2057 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -894,13 +894,13 @@ def tz_convert(self, tz) -> Self:
         DatetimeIndex(['2014-08-01 09:00:00+02:00',
                        '2014-08-01 10:00:00+02:00',
                        '2014-08-01 11:00:00+02:00'],
-                      dtype='datetime64[ns, Europe/Berlin]', freq='h')
+                      dtype='datetime64[us, Europe/Berlin]', freq='h')

         >>> dti.tz_convert("US/Central")
         DatetimeIndex(['2014-08-01 02:00:00-05:00',
                        '2014-08-01 03:00:00-05:00',
                        '2014-08-01 04:00:00-05:00'],
-                      dtype='datetime64[ns, US/Central]', freq='h')
+                      dtype='datetime64[us, US/Central]', freq='h')

         With the ``tz=None``, we can remove the timezone (after converting
         to UTC if necessary):
@@ -1047,7 +1047,7 @@ def tz_localize(
         4   2018-10-28 02:30:00+01:00
         5   2018-10-28 03:00:00+01:00
         6   2018-10-28 03:30:00+01:00
-        dtype: datetime64[s, CET]
+        dtype: datetime64[us, CET]

         In some cases, inferring the DST is impossible. In such cases, you can
         pass an ndarray to the ambiguous parameter to set the DST explicitly
@@ -1059,7 +1059,7 @@ def tz_localize(
         0   2018-10-28 01:20:00+02:00
         1   2018-10-28 02:36:00+02:00
         2   2018-10-28 03:46:00+01:00
-        dtype: datetime64[s, CET]
+        dtype: datetime64[us, CET]

         If the DST transition causes nonexistent times, you can shift these
         dates forward or backwards with a timedelta object or `'shift_forward'`
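
Aside: the ns-to-us flips in the datetimes doctests above (and the matching updates throughout this patch) reflect the resolution the targeted build infers for datetimes parsed from strings; they are not typos. A quick way to see what a given installation does — pd.date_range, .unit, and .as_unit are standard pandas API, and the units printed in the comments are the assumption here:

    import pandas as pd

    dti = pd.date_range("2014-08-01 09:00", periods=3, freq="h", tz="Europe/Berlin")
    print(dti.dtype)                # datetime64[us, Europe/Berlin] on such builds; [ns] on older ones
    print(dti.unit)                 # 'us' or 'ns' accordingly
    print(dti.as_unit("ns").dtype)  # a specific resolution can always be forced
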
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index c15a196dc6727..8ee8fe83852f8 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -1288,7 +1288,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
         IntIndex
         Indices: array([2, 3], dtype=int32)

-        >>> arr.astype(SparseDtype(np.dtype("int32")))
+        >>> arr.astype(pd.SparseDtype(np.dtype("int32")))
         [0, 0, 1, 2]
         Fill: 0
         IntIndex
@@ -1297,7 +1297,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
         Using a NumPy dtype with a different kind (e.g. float) will coerce
         just ``self.sp_values``.

-        >>> arr.astype(SparseDtype(np.dtype("float64")))
+        >>> arr.astype(pd.SparseDtype(np.dtype("float64")))
         ...  # doctest: +NORMALIZE_WHITESPACE
         [nan, nan, 1.0, 2.0]
         Fill: nan
@@ -1306,7 +1306,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
         Using a SparseDtype, you can also change the fill value as well.

-        >>> arr.astype(SparseDtype("float64", fill_value=0.0))
+        >>> arr.astype(pd.SparseDtype("float64", fill_value=0.0))
         ...  # doctest: +NORMALIZE_WHITESPACE
         [0.0, 0.0, 1.0, 2.0]
         Fill: 0.0
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 514ad28f698d6..af9770fff3234 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -813,7 +813,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]:
         >>> idx = pd.to_timedelta(np.arange(5), unit="D")
         >>> idx
         TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
-                       dtype='timedelta64[us]', freq=None)
+                       dtype='timedelta64[ns]', freq=None)

         >>> idx.total_seconds()
         Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64')
@@ -855,7 +855,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]:
         >>> tidx = pd.TimedeltaIndex(data=["1 days 02:30:45", "3 days 04:15:10"])
         >>> tidx
         TimedeltaIndex(['1 days 02:30:45', '3 days 04:15:10'],
-                       dtype='timedelta64[ns]', freq=None)
+                       dtype='timedelta64[us]', freq=None)

         >>> tidx.to_pytimedelta()
         array([datetime.timedelta(days=1, seconds=9045),
                datetime.timedelta(days=3, seconds=15310)], dtype=object)
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 3b14c5348f9d6..cfcd34561515a 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -2052,7 +2052,7 @@ def update_dtype(self, dtype) -> SparseDtype:
         Examples
         --------
         >>> SparseDtype(int, 0).update_dtype(float)
-        Sparse[float64, 0.0]
+        Sparse[float64, np.float64(0.0)]

         >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
         Sparse[float64, nan]
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9d6af3c7b9917..e68bb9f33b231 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -90,6 +90,7 @@
     infer_dtype_from_scalar,
     invalidate_string_dtypes,
     maybe_downcast_to_dtype,
+    maybe_unbox_numpy_scalar,
 )
 from pandas.core.dtypes.common import (
     infer_dtype_from_object,
@@ -967,7 +968,7 @@ def __dataframe__(
         >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
         >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
         >>> interchange_object.column_names()
-        Index(['A', 'B'], dtype='object')
+        Index(['A', 'B'], dtype='str')
         >>> df_pandas = pd.api.interchange.from_dataframe(
         ...     interchange_object.select_columns_by_name(["A"])
         ... )
@@ -1479,7 +1480,7 @@ def style(self) -> Styler:
         panda         bear
         polar         bear
         koala    marsupial
-        Name: species, dtype: object
+        Name: species, dtype: str
         label: population
         content:
         panda     1864
@@ -1531,7 +1532,7 @@ def items(self) -> Iterable[tuple[Hashable, Series]]:
         panda         bear
         polar         bear
         koala    marsupial
-        Name: species, dtype: object
+        Name: species, dtype: str
         label: population
         content:
         panda     1864
@@ -3835,7 +3836,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
         4      1    1.0    1.0+0.0j  1   True

         >>> df.memory_usage()
-        Index           128
+        Index           132
         int64         40000
         float64       40000
         complex128    80000
@@ -3854,7 +3855,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
         The memory footprint of `object` dtype columns is ignored by default:

         >>> df.memory_usage(deep=True)
-        Index           128
+        Index           132
         int64         40000
         float64       40000
         complex128    80000
@@ -3866,7 +3867,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
         many repeated values.

>>> df["object"].astype("category").memory_usage(deep=True) - 5136 + 5140 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], @@ -3982,7 +3983,7 @@ def transpose( the `object` dtype: >>> df2.dtypes - name object + name str score float64 employed bool kids int64 @@ -4419,10 +4420,10 @@ def __setitem__(self, key, value) -> None: >>> s = pd.Series([10, 20], index=[1, 3]) # Note: index 3 doesn't exist in df >>> df["B"] = s # Assigns by index label, not position >>> df - A B - 0 1 NaN - 1 2 10 - 2 3 NaN + A B + 0 1 NaN + 1 2 10.0 + 2 3 NaN Series assignment with partial index match: @@ -4430,11 +4431,11 @@ def __setitem__(self, key, value) -> None: >>> s = pd.Series([100, 200], index=["b", "d"]) >>> df["B"] = s >>> df - A B - a 1 NaN - b 2 100 - c 3 NaN - d 4 200 + A B + a 1 NaN + b 2 100.0 + c 3 NaN + d 4 200.0 Series index labels NOT in DataFrame, ignored: @@ -4446,7 +4447,6 @@ def __setitem__(self, key, value) -> None: x 1 10 y 2 20 z 3 50 - # Values for 'a' and 'b' are completely ignored! """ if not CHAINED_WARNING_DISABLED: if sys.getrefcount(self) <= REF_COUNT and not com.is_local_in_caller_frame( @@ -5155,6 +5155,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: 2 48 3 28 4 12 + dtype: int64 Local variables shall be explicitly referenced using ``@`` character in front of the name: @@ -5166,6 +5167,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: 2 6 3 8 4 10 + Name: A, dtype: int64 """ from pandas.core.computation.eval import eval as _eval @@ -6013,7 +6015,7 @@ def rename( >>> df.index RangeIndex(start=0, stop=3, step=1) >>> df.rename(index=str).index - Index(['0', '1', '2'], dtype='object') + Index(['0', '1', '2'], dtype='str') >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") Traceback (most recent call last): @@ -6086,7 +6088,7 @@ def pop(self, item: Hashable) -> Series: 1 bird 2 mammal 3 mammal - Name: class, dtype: object + Name: class, dtype: str >>> df name max_speed @@ -7833,22 +7835,22 @@ def value_counts( >>> df first_name middle_name 0 John Smith - 1 Anne - 2 John + 1 Anne NaN + 2 John NaN 3 Beth Louise >>> df.value_counts() first_name middle_name - Beth Louise 1 John Smith 1 + Beth Louise 1 Name: count, dtype: int64 >>> df.value_counts(dropna=False) first_name middle_name + John Smith 1 Anne NaN 1 + John NaN 1 Beth Louise 1 - John Smith 1 - NaN 1 Name: count, dtype: int64 >>> df.value_counts("first_name") @@ -9125,16 +9127,16 @@ def combine( ... index=[1, 2], ... ) >>> df2.combine(df1, take_smaller) - A B C - 0 0.0 NaN NaN - 1 0.0 3.0 NaN - 2 NaN 3.0 NaN + B C A + 0 NaN NaN 0.0 + 1 3.0 NaN 0.0 + 2 3.0 NaN NaN >>> df2.combine(df1, take_smaller, overwrite=False) - A B C - 0 0.0 NaN NaN - 1 0.0 3.0 1.0 - 2 NaN 3.0 1.0 + B C A + 0 NaN NaN 0.0 + 1 3.0 1.0 0.0 + 2 3.0 1.0 NaN """ other_idxlen = len(other.index) # save for compare other_columns = other.columns @@ -10992,8 +10994,8 @@ def apply( ``apply`` has type stability (variables in the function do not change their type during the execution). - >>> import bodo - >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit) + >>> import bodo # doctest: +SKIP + >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit) # doctest: +SKIP Note that JIT compilation is only recommended for functions that take a significant amount of time to run. 
         significant amount of time to run. Fast functions are unlikely to run faster

@@ -12169,7 +12171,7 @@ def _get_data() -> DataFrame:
                 df = df.astype(dtype)
                 arr = concat_compat(list(df._iter_column_arrays()))
                 return arr._reduce(name, skipna=skipna, keepdims=False, **kwds)
-            return func(df.values)
+            return maybe_unbox_numpy_scalar(func(df.values))
         elif axis == 1:
             if len(df.index) == 0:
                 # Taking a transpose would result in no columns, losing the dtype.
@@ -12814,7 +12816,7 @@ def sem(
         Examples
         --------
         >>> s = pd.Series([1, 2, 3])
-        >>> s.sem().round(6)
+        >>> round(s.sem(), 6)
         0.57735

         With a DataFrame
@@ -13320,8 +13322,8 @@ def kurt(
         With axis=None

-        >>> df.kurt(axis=None).round(6)
-        -0.988693
+        >>> df.kurt(axis=None)
+        -0.9886927196984727

         Using axis=1
@@ -13477,10 +13479,8 @@ def idxmin(
         >>> df = pd.DataFrame(
         ...     {
-        ...         {
-        ...             "consumption": [10.51, 103.11, 55.48],
-        ...             "co2_emissions": [37.2, 19.66, 1712],
-        ...         }
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
         ...     },
         ...     index=["Pork", "Wheat Products", "Beef"],
         ... )
@@ -13496,7 +13496,7 @@ def idxmin(
         >>> df.idxmin()
         consumption                Pork
         co2_emissions    Wheat Products
-        dtype: object
+        dtype: str

         To return the index for the minimum value in each row, use ``axis="columns"``.

@@ -13504,7 +13504,7 @@ def idxmin(
         Pork                consumption
         Wheat Products    co2_emissions
         Beef                consumption
-        dtype: object
+        dtype: str
         """
         axis = self._get_axis_number(axis)
@@ -13580,10 +13580,8 @@ def idxmax(
         >>> df = pd.DataFrame(
         ...     {
-        ...         {
-        ...             "consumption": [10.51, 103.11, 55.48],
-        ...             "co2_emissions": [37.2, 19.66, 1712],
-        ...         }
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
         ...     },
         ...     index=["Pork", "Wheat Products", "Beef"],
         ... )
@@ -13597,9 +13595,9 @@ def idxmax(
         By default, it returns the index for the maximum value in each column.

         >>> df.idxmax()
-        consumption     Wheat Products
-        co2_emissions             Beef
-        dtype: object
+        consumption      Wheat Products
+        co2_emissions              Beef
+        dtype: str

         To return the index for the maximum value in each row, use ``axis="columns"``.
@@ -13607,7 +13605,7 @@ def idxmax(
         Pork              co2_emissions
         Wheat Products      consumption
         Beef              co2_emissions
-        dtype: object
+        dtype: str
         """
         axis = self._get_axis_number(axis)
@@ -14097,7 +14095,7 @@ def to_period(
         >>> idx
         DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
-                      dtype='datetime64[s]', freq=None)
+                      dtype='datetime64[us]', freq=None)

         >>> idx.to_period("M")
         PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
@@ -14326,7 +14324,7 @@ def isin_(x):
         0  1  3
         1  2  4
         >>> df.columns
-        Index(['A', 'B'], dtype='object')
+        Index(['A', 'B'], dtype='str')
         """,
     )
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index c0be5568170d0..3499ddb0ff246 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -12684,7 +12684,7 @@ def make_doc(name: str, ndim: int) -> str:
         Examples
         --------
         >>> s = pd.Series([1, 2, 3])
-        >>> s.sem().round(6)
+        >>> round(s.sem(), 6)
         0.57735

         With a DataFrame
@@ -12802,8 +12802,8 @@ def make_doc(name: str, ndim: int) -> str:
         With axis=None

-        >>> df.kurt(axis=None).round(6)
-        -0.988693
+        >>> df.kurt(axis=None)
+        -0.9886927196984727

         Using axis=1
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 07168331b9ac7..4825ec894b871 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1351,7 +1351,7 @@ def take(
              3    parrot
         2    2      lion
              1    monkey
-        Name: name, dtype: object
+        Name: name, dtype: str

         We may take elements using negative integers for positive indices,
         starting from the end of the object, just like with Python lists.

@@ -1361,7 +1361,7 @@ def take(
              4    falcon
         2    0    rabbit
              1    monkey
-        Name: name, dtype: object
+        Name: name, dtype: str
         """
         result = self._op_via_apply("take", indices=indices, **kwargs)
         return result
@@ -3044,19 +3044,19 @@ def idxmax(
         ...     )

         >>> df
-                        consumption  co2_emissions
-        Pork                  10.51         37.20
-        Wheat Products       103.11         19.66
-        Beef                  55.48       1712.00
+                        consumption  co2_emissions food_type
+        Pork                  10.51         37.20       meat
+        Wheat Products       103.11         19.66      plant
+        Beef                  55.48       1712.00       meat

         By default, it returns the index for the maximum value in each column
         according to the group.

         >>> df.groupby("food_type").idxmax()
-                      consumption   co2_emissions
+                     consumption   co2_emissions
         food_type
-        animal            Beef            Beef
-        plant     Wheat Products  Wheat Products
+        meat              Beef            Beef
+        plant     Wheat Products  Wheat Products
         """
         return self._idxmax_idxmin("idxmax", numeric_only=numeric_only, skipna=skipna)
@@ -3116,19 +3116,19 @@ def idxmin(
         ...     )

         >>> df
-                        consumption  co2_emissions
-        Pork                  10.51         37.20
-        Wheat Products       103.11         19.66
-        Beef                  55.48       1712.00
+                        consumption  co2_emissions food_type
+        Pork                  10.51         37.20       meat
+        Wheat Products       103.11         19.66      plant
+        Beef                  55.48       1712.00       meat

         By default, it returns the index for the minimum value in each column
         according to the group.

>>> df.groupby("food_type").idxmin() - consumption co2_emissions + consumption co2_emissions food_type - animal Pork Pork - plant Wheat Products Wheat Products + meat Pork Pork + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmin", numeric_only=numeric_only, skipna=skipna) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dc37f3bfa6a4c..15fac68a51ebd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1027,7 +1027,7 @@ def ravel(self, order: str_t = "C") -> Self: -------- >>> s = pd.Series([1, 2, 3], index=["a", "b", "c"]) >>> s.index.ravel() - Index(['a', 'b', 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='str') """ return self[:] @@ -1357,11 +1357,11 @@ def repeat(self, repeats, axis: None = None) -> Self: -------- >>> idx = pd.Index(["a", "b", "c"]) >>> idx - Index(['a', 'b', 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='str') >>> idx.repeat(2) - Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object') + Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='str') >>> idx.repeat([1, 2, 3]) - Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object') + Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='str') """ repeats = ensure_platform_int(repeats) nv.validate_repeat((), {"axis": axis}) @@ -1682,7 +1682,7 @@ def to_series(self, index=None, name: Hashable | None = None) -> Series: Ant Ant Bear Bear Cow Cow - Name: animal, dtype: object + Name: animal, dtype: str To enforce a new index, specify new labels to ``index``: @@ -1690,7 +1690,7 @@ def to_series(self, index=None, name: Hashable | None = None) -> Series: 0 Ant 1 Bear 2 Cow - Name: animal, dtype: object + Name: animal, dtype: str To override the name of the resulting column, specify ``name``: @@ -1699,7 +1699,7 @@ def to_series(self, index=None, name: Hashable | None = None) -> Series: Ant Ant Bear Bear Cow Cow - Name: zoo, dtype: object + Name: zoo, dtype: str """ from pandas import Series @@ -2088,7 +2088,7 @@ def rename(self, name, *, inplace: bool = False) -> Self | None: -------- >>> idx = pd.Index(["A", "C", "A", "B"], name="score") >>> idx.rename("grade") - Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') + Index(['A', 'C', 'A', 'B'], dtype='str', name='grade') >>> idx = pd.MultiIndex.from_product( ... [["python", "cobra"], [2018, 2019]], names=["kind", "year"] @@ -2230,12 +2230,12 @@ def _get_level_values(self, level) -> Index: -------- >>> idx = pd.Index(list("abc")) >>> idx - Index(['a', 'b', 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='str') Get level values by supplying `level` as integer: >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='str') """ self._validate_index_level(level) return self @@ -2676,7 +2676,7 @@ def isna(self) -> npt.NDArray[np.bool_]: >>> idx = pd.Index(["black", "", "red", None]) >>> idx - Index(['black', '', 'red', None], dtype='object') + Index(['black', '', 'red', nan], dtype='str') >>> idx.isna() array([False, False, False, True]) @@ -2687,7 +2687,7 @@ def isna(self) -> npt.NDArray[np.bool_]: ... 
         ... )
         >>> idx
         DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'],
-                      dtype='datetime64[s]', freq=None)
+                      dtype='datetime64[us]', freq=None)
         >>> idx.isna()
         array([False,  True,  True,  True])
         """
@@ -2733,7 +2733,7 @@ def notna(self) -> npt.NDArray[np.bool_]:
         >>> idx = pd.Index(["black", "", "red", None])
         >>> idx
-        Index(['black', '', 'red', None], dtype='object')
+        Index(['black', '', 'red', nan], dtype='str')

         >>> idx.notna()
         array([ True,  True,  True, False])
         """
@@ -2886,18 +2886,18 @@ def drop_duplicates(self, *, keep: DropKeep = "first") -> Self:
         set of duplicated entries. The default value of keep is 'first'.

         >>> idx.drop_duplicates(keep="first")
-        Index(['llama', 'cow', 'beetle', 'hippo'], dtype='object')
+        Index(['llama', 'cow', 'beetle', 'hippo'], dtype='str')

         The value 'last' keeps the last occurrence for each
         set of duplicated entries.

         >>> idx.drop_duplicates(keep="last")
-        Index(['cow', 'beetle', 'llama', 'hippo'], dtype='object')
+        Index(['cow', 'beetle', 'llama', 'hippo'], dtype='str')

         The value ``False`` discards all sets of duplicated entries.

         >>> idx.drop_duplicates(keep=False)
-        Index(['cow', 'beetle', 'hippo'], dtype='object')
+        Index(['cow', 'beetle', 'hippo'], dtype='str')
         """
         if self.is_unique:
             return self._view()
@@ -4209,9 +4209,9 @@ def reindex(
         --------
         >>> idx = pd.Index(["car", "bike", "train", "tractor"])
         >>> idx
-        Index(['car', 'bike', 'train', 'tractor'], dtype='object')
+        Index(['car', 'bike', 'train', 'tractor'], dtype='str')
         >>> idx.reindex(["car", "bike"])
-        (Index(['car', 'bike'], dtype='object'), array([0, 1]))
+        (Index(['car', 'bike'], dtype='str'), array([0, 1]))
         """
         # GH6552: preserve names when reindexing to non-named target
         # (i.e. neither Index nor Series).
@@ -5170,9 +5170,9 @@ def where(self, cond, other=None) -> Index:
         --------
         >>> idx = pd.Index(["car", "bike", "train", "tractor"])
         >>> idx
-        Index(['car', 'bike', 'train', 'tractor'], dtype='object')
+        Index(['car', 'bike', 'train', 'tractor'], dtype='str')
         >>> idx.where(idx.isin(["car", "train"]), "other")
-        Index(['car', 'other', 'train', 'other'], dtype='object')
+        Index(['car', 'other', 'train', 'other'], dtype='str')
         """
         if isinstance(self, ABCMultiIndex):
             raise NotImplementedError(
@@ -5529,7 +5529,7 @@ def equals(self, other: Any) -> bool:
         >>> idx2 = pd.Index(["1", "2", "3"])
         >>> idx2
-        Index(['1', '2', '3'], dtype='object')
+        Index(['1', '2', '3'], dtype='str')

         >>> idx1.equals(idx2)
         False
@@ -5936,14 +5936,14 @@ def shift(self, periods: int = 1, freq=None) -> Self:
         >>> month_starts
         DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01',
                        '2011-05-01'],
-                      dtype='datetime64[ns]', freq='MS')
+                      dtype='datetime64[us]', freq='MS')

         Shift the index by 10 days.

         >>> month_starts.shift(10, freq="D")
         DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11',
                        '2011-05-11'],
-                      dtype='datetime64[ns]', freq=None)
+                      dtype='datetime64[us]', freq=None)

         The default value of `freq` is the `freq` attribute of the index,
         which is 'MS' (month start) in this example.
@@ -5951,7 +5951,7 @@ def shift(self, periods: int = 1, freq=None) -> Self:
         >>> month_starts.shift(10)
         DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01',
                        '2012-03-01'],
-                      dtype='datetime64[ns]', freq='MS')
+                      dtype='datetime64[us]', freq='MS')
         """
         raise NotImplementedError(
             f"This method is only implemented for DatetimeIndex, PeriodIndex and "
@@ -5984,14 +5984,14 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
         --------
         >>> idx = pd.Index(["b", "a", "d", "c"])
         >>> idx
-        Index(['b', 'a', 'd', 'c'], dtype='object')
+        Index(['b', 'a', 'd', 'c'], dtype='str')

         >>> order = idx.argsort()
         >>> order
         array([1, 0, 3, 2])

         >>> idx[order]
-        Index(['a', 'b', 'c', 'd'], dtype='object')
+        Index(['a', 'b', 'c', 'd'], dtype='str')
         """
         # This works for either ndarray or EA, is overridden
         # by RangeIndex, MultIIndex
@@ -6514,17 +6514,17 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None):
         --------
         >>> idx = pd.Index([1, 2, 3])
         >>> idx.map({1: "a", 2: "b", 3: "c"})
-        Index(['a', 'b', 'c'], dtype='object')
+        Index(['a', 'b', 'c'], dtype='str')

         Using `map` with a function:

         >>> idx = pd.Index([1, 2, 3])
         >>> idx.map("I am a {}".format)
-        Index(['I am a 1', 'I am a 2', 'I am a 3'], dtype='object')
+        Index(['I am a 1', 'I am a 2', 'I am a 3'], dtype='str')

         >>> idx = pd.Index(["a", "b", "c"])
         >>> idx.map(lambda x: x.upper())
-        Index(['A', 'B', 'C'], dtype='object')
+        Index(['A', 'B', 'C'], dtype='str')
         """
         from pandas.core.indexes.multi import MultiIndex
@@ -6815,7 +6815,7 @@ def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"
             pos = self[::-1].searchsorted(
                 label, side="right" if side == "left" else "left"
             )
-            return len(self) - pos
+            return maybe_unbox_numpy_scalar(len(self) - pos)

         raise ValueError("index must be monotonic increasing or decreasing")
@@ -7005,6 +7005,8 @@ def slice_locs(
         if start_slice == -1:
             start_slice -= len(self)

+        start_slice = maybe_unbox_numpy_scalar(start_slice)
+        end_slice = maybe_unbox_numpy_scalar(end_slice)
         return start_slice, end_slice

     def delete(
@@ -7163,7 +7165,7 @@ def drop(
         --------
         >>> idx = pd.Index(["a", "b", "c"])
         >>> idx.drop(["a"])
-        Index(['b', 'c'], dtype='object')
+        Index(['b', 'c'], dtype='str')
         """
         if not isinstance(labels, Index):
             # avoid materializing e.g. RangeIndex
@@ -7201,7 +7203,7 @@ def infer_objects(self, copy: bool = True) -> Index:
         Examples
         --------
         >>> pd.Index(["a", 1]).infer_objects()
-        Index(['a', '1'], dtype='object')
+        Index(['a', 1], dtype='object')
         >>> pd.Index([1, 2], dtype="object").infer_objects()
         Index([1, 2], dtype='int64')
         """
@@ -7412,7 +7414,7 @@ def any(self, *args, **kwargs):
             # i.e. EA, call _reduce instead of "any" to get TypeError instead
             # of AttributeError
             return vals._reduce("any")
-        return np.any(vals)
+        return maybe_unbox_numpy_scalar(np.any(vals))

     def all(self, *args, **kwargs):
         """
@@ -7460,7 +7462,7 @@ def all(self, *args, **kwargs):
             # i.e. EA, call _reduce instead of "all" to get TypeError instead
             # of AttributeError
             return vals._reduce("all")
-        return np.all(vals)
+        return maybe_unbox_numpy_scalar(np.all(vals))

     @final
     def _maybe_disable_logical_methods(self, opname: str_t) -> None:
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index cbefaac77dd82..3e8fbce5ebada 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -508,13 +508,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None):
         If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

         >>> idx.map({"a": "first", "b": "second", "c": "first"})
-        Index(['first', 'second', 'first'], dtype='object')
+        Index(['first', 'second', 'first'], dtype='str')

         If a `dict` is used, all unmapped categories are mapped to `NaN` and
         the result is an :class:`~pandas.Index`:

         >>> idx.map({"a": "first", "b": "second"})
-        Index(['first', 'second', nan], dtype='object')
+        Index(['first', 'second', nan], dtype='str')
         """
         mapped = self._values.map(mapper, na_action=na_action)
         return Index(mapped, name=self.name)
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 72b009a344193..ed5e335c3eac9 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -365,13 +365,13 @@ def tz_convert(self, tz) -> Self:
         DatetimeIndex(['2014-08-01 09:00:00+02:00',
                        '2014-08-01 10:00:00+02:00',
                        '2014-08-01 11:00:00+02:00'],
-                      dtype='datetime64[ns, Europe/Berlin]', freq='h')
+                      dtype='datetime64[us, Europe/Berlin]', freq='h')

         >>> dti.tz_convert("US/Central")
         DatetimeIndex(['2014-08-01 02:00:00-05:00',
                        '2014-08-01 03:00:00-05:00',
                        '2014-08-01 04:00:00-05:00'],
-                      dtype='datetime64[ns, US/Central]', freq='h')
+                      dtype='datetime64[us, US/Central]', freq='h')

         With the ``tz=None``, we can remove the timezone (after converting
         to UTC if necessary):
@@ -384,13 +384,13 @@ def tz_convert(self, tz) -> Self:
         DatetimeIndex(['2014-08-01 09:00:00+02:00',
                        '2014-08-01 10:00:00+02:00',
                        '2014-08-01 11:00:00+02:00'],
-                      dtype='datetime64[ns, Europe/Berlin]', freq='h')
+                      dtype='datetime64[us, Europe/Berlin]', freq='h')

         >>> dti.tz_convert(None)
         DatetimeIndex(['2014-08-01 07:00:00',
                        '2014-08-01 08:00:00',
                        '2014-08-01 09:00:00'],
-                      dtype='datetime64[ns]', freq='h')
+                      dtype='datetime64[us]', freq='h')
         """  # noqa: E501
         arr = self._data.tz_convert(tz)
         return type(self)._simple_new(arr, name=self.name, refs=self._references)
@@ -468,7 +468,7 @@ def tz_localize(
         >>> tz_naive
         DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
                        '2018-03-03 09:00:00'],
-                      dtype='datetime64[ns]', freq='D')
+                      dtype='datetime64[us]', freq='D')

         Localize DatetimeIndex in US/Eastern time zone:

@@ -477,7 +477,7 @@ def tz_localize(
         DatetimeIndex(['2018-03-01 09:00:00-05:00',
                        '2018-03-02 09:00:00-05:00',
                        '2018-03-03 09:00:00-05:00'],
-                      dtype='datetime64[ns, US/Eastern]', freq=None)
+                      dtype='datetime64[us, US/Eastern]', freq=None)

         With the ``tz=None``, we can remove the time zone information while
         keeping the local time (not converted to UTC):

@@ -485,7 +485,7 @@ def tz_localize(
         >>> tz_aware.tz_localize(None)
         DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
                        '2018-03-03 09:00:00'],
-                      dtype='datetime64[ns]', freq=None)
+                      dtype='datetime64[us]', freq=None)

         Be careful with DST changes. When there is sequential data, pandas
         can infer the DST time:

@@ -505,7 +505,7 @@ def tz_localize(
         4   2018-10-28 02:30:00+01:00
         5   2018-10-28 03:00:00+01:00
         6   2018-10-28 03:30:00+01:00
-        dtype: datetime64[s, CET]
+        dtype: datetime64[us, CET]

         In some cases, inferring the DST is impossible. In such cases, you can
         pass an ndarray to the ambiguous parameter to set the DST explicitly

@@ -517,7 +517,7 @@ def tz_localize(
         0   2018-10-28 01:20:00+02:00
         1   2018-10-28 02:36:00+02:00
         2   2018-10-28 03:46:00+01:00
-        dtype: datetime64[s, CET]
+        dtype: datetime64[us, CET]

         If the DST transition causes nonexistent times, you can shift these
         dates forward or backwards with a timedelta object or `'shift_forward'`
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 1def317bc1a88..55c5002265b48 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -40,6 +40,7 @@
     infer_dtype_from_scalar,
     maybe_box_datetimelike,
     maybe_downcast_numeric,
+    maybe_unbox_numpy_scalar,
     maybe_upcast_numeric_to_64bit,
 )
 from pandas.core.dtypes.common import (
@@ -812,7 +813,7 @@ def get_loc(self, key) -> int | slice | np.ndarray:
         if matches == 0:
             raise KeyError(key)
         if matches == 1:
-            return mask.argmax()
+            return maybe_unbox_numpy_scalar(mask.argmax())

         res = lib.maybe_booleans_to_slice(mask.view("u1"))
         if isinstance(res, slice) and res.stop is None:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 90f710b0de4de..bd459c0ba0e0a 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -58,7 +58,10 @@
 )
 from pandas.util._exceptions import find_stack_level

-from pandas.core.dtypes.cast import coerce_indexer_dtype
+from pandas.core.dtypes.cast import (
+    coerce_indexer_dtype,
+    maybe_unbox_numpy_scalar,
+)
 from pandas.core.dtypes.common import (
     ensure_int64,
     ensure_platform_int,
@@ -1851,9 +1854,9 @@ def get_level_values(self, level) -> Index:
         Get level values by supplying level as either integer or name:

         >>> mi.get_level_values(0)
-        Index(['a', 'b', 'c'], dtype='object', name='level_1')
+        Index(['a', 'b', 'c'], dtype='str', name='level_1')
         >>> mi.get_level_values("level_2")
-        Index(['d', 'e', 'f'], dtype='object', name='level_2')
+        Index(['d', 'e', 'f'], dtype='str', name='level_2')

         If a level contains missing values, the return type of the level
         may be cast to ``float``.
@@ -2481,7 +2484,9 @@ def argsort(
         --------
         >>> midx = pd.MultiIndex.from_arrays([[3, 2], ["e", "c"]])
         >>> midx
-        MultiIndex([(3, 'e'), (2, 'c')])
+        MultiIndex([(3, 'e'),
+                    (2, 'c')],
+                   )

         >>> order = midx.argsort()
         >>> order
@@ -3130,7 +3135,9 @@ def get_slice_bound(
         """
         if not isinstance(label, tuple):
             label = (label,)
-        return self._partial_tup_index(label, side=side)
+        result = self._partial_tup_index(label, side=side)
+        result = maybe_unbox_numpy_scalar(result)
+        return result

     def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]:
         """
@@ -3435,10 +3442,10 @@ def get_loc_level(self, key, level: IndexLabel = 0, drop_level: bool = True):
         >>> mi = pd.MultiIndex.from_arrays([list("abb"), list("def")], names=["A", "B"])

         >>> mi.get_loc_level("b")
-        (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B'))
+        (slice(1, 3, None), Index(['e', 'f'], dtype='str', name='B'))

         >>> mi.get_loc_level("e", level="B")
-        (array([False,  True, False]), Index(['b'], dtype='object', name='A'))
+        (array([False,  True, False]), Index(['b'], dtype='str', name='A'))

         >>> mi.get_loc_level(["b", "e"])
         (1, None)
@@ -3717,7 +3724,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
             if start == end:
                 # The label is present in self.levels[level] but unused:
                 raise KeyError(key)
-            return slice(start, end)
+            return slice(maybe_unbox_numpy_scalar(start), maybe_unbox_numpy_scalar(end))

     def get_locs(self, seq) -> npt.NDArray[np.intp]:
         """
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index f7fd4da2968a7..3244c20ea3a38 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -324,7 +324,7 @@ def merge(
     Traceback (most recent call last):
        ...
     ValueError: columns overlap but no suffix specified:
-        Index(['value'], dtype='object')
+        Index(['value'], dtype='str')

     >>> df1 = pd.DataFrame({"a": ["foo", "bar"], "b": [1, 2]})
     >>> df2 = pd.DataFrame({"a": ["foo", "baz"], "c": [3, 4]})
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 512c24cc02f60..4fab5d74aa170 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1234,7 +1234,7 @@ def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series:
         0    a
         1    b
         2    c
-        dtype: object
+        dtype: str
         >>> s.repeat(2)
         0    a
         0    a
@@ -1242,7 +1242,7 @@ def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series:
         1    b
         2    c
         2    c
-        dtype: object
+        dtype: str
         >>> s.repeat([1, 2, 3])
         0    a
         1    b
@@ -1250,7 +1250,7 @@ def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series:
         2    c
         2    c
         2    c
-        dtype: object
+        dtype: str
         """
         nv.validate_repeat((), {"axis": axis})
         new_index = self.index.repeat(repeats)
@@ -2292,7 +2292,7 @@ def drop_duplicates(
         3    beetle
         4     llama
         5     hippo
-        Name: animal, dtype: object
+        Name: animal, dtype: str

         With the 'keep' parameter, the selection behavior of duplicated values
         can be changed. The value 'first' keeps the first occurrence for each
@@ -2303,7 +2303,7 @@ def drop_duplicates(
         1       cow
         3    beetle
         5     hippo
-        Name: animal, dtype: object
+        Name: animal, dtype: str

         The value 'last' for parameter 'keep' keeps the last occurrence for
         each set of duplicated entries.
@@ -2313,7 +2313,7 @@ def drop_duplicates(
         3    beetle
         4     llama
         5     hippo
-        Name: animal, dtype: object
+        Name: animal, dtype: str

         The value ``False`` for parameter 'keep' discards all sets of
         duplicated entries.
@@ -2322,7 +2322,7 @@ def drop_duplicates(
         1       cow
         3    beetle
         5     hippo
-        Name: animal, dtype: object
+        Name: animal, dtype: str
         """
         inplace = validate_bool_kwarg(inplace, "inplace")
         result = super().drop_duplicates(keep=keep)
@@ -2667,7 +2667,7 @@ def quantile(
             return self._constructor(result, index=idx, name=self.name)
         else:
             # scalar
-            return result.iloc[0]
+            return maybe_unbox_numpy_scalar(result.iloc[0])

     def corr(
         self,
@@ -2754,9 +2754,11 @@ def corr(
         other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

         if method in ["pearson", "spearman", "kendall"] or callable(method):
-            return nanops.nancorr(
+            result = nanops.nancorr(
                 this_values, other_values, method=method, min_periods=min_periods
             )
+            result = maybe_unbox_numpy_scalar(result)
+            return result

         raise ValueError(
             "method must be either 'pearson', "
@@ -2808,9 +2810,11 @@ def cov(
             return np.nan
         this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
         other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)
-        return nanops.nancov(
+        result = nanops.nancov(
             this_values, other_values, min_periods=min_periods, ddof=ddof
         )
+        result = maybe_unbox_numpy_scalar(result)
+        return result

     @doc(
         klass="Series",
@@ -3023,11 +3027,12 @@ def dot(self, other: AnyArrayLike | DataFrame) -> Series | np.ndarray:
                 np.dot(lvals, rvals), index=other.columns, copy=False, dtype=common_type
             ).__finalize__(self, method="dot")
         elif isinstance(other, Series):
-            return np.dot(lvals, rvals)
+            result = np.dot(lvals, rvals)
         elif isinstance(rvals, np.ndarray):
-            return np.dot(lvals, rvals)
+            result = np.dot(lvals, rvals)
         else:  # pragma: no cover
             raise TypeError(f"unsupported type: {type(other)}")
+        return maybe_unbox_numpy_scalar(result)

     def __matmul__(self, other):
         """
@@ -3131,7 +3136,7 @@ def compare(
         other    a
         3  self     d
         other    b
-        dtype: object
+        dtype: str

         Keep all original rows
@@ -3377,7 +3382,7 @@ def update(self, other: Series | Sequence | Mapping) -> None:
         0    d
         1    b
         2    e
-        dtype: object
+        dtype: str

         >>> s = pd.Series([1, 2, 3])
         >>> s.update(pd.Series([4, 5, 6, 7, 8]))
@@ -3578,7 +3583,7 @@ def sort_values(
         2    d
         3    a
         4    c
-        dtype: object
+        dtype: str

         >>> s.sort_values()
         3    a
@@ -3586,7 +3591,7 @@ def sort_values(
         4    c
         2    d
         0    z
-        dtype: object
+        dtype: str

         Sort using a key function. Your `key` function will be
         given the ``Series`` of values and should return an array-like.
@@ -3598,14 +3603,14 @@ def sort_values(
         0    a
         2    c
         4    e
-        dtype: object
+        dtype: str
         >>> s.sort_values(key=lambda x: x.str.lower())
         0    a
         1    B
         2    c
         3    D
         4    e
-        dtype: object
+        dtype: str

         NumPy ufuncs work well here. For example, we can
         sort by the ``sin`` of the value
@@ -3785,7 +3790,7 @@ def sort_index(
         2    b
         3    a
         4    d
-        dtype: object
+        dtype: str

         Sort Descending

@@ -3794,7 +3799,7 @@ def sort_index(
         3    a
         2    b
         1    c
-        dtype: object
+        dtype: str

         By default NaNs are put at the end, but use `na_position` to place
         them at the beginning

@@ -3805,7 +3810,7 @@ def sort_index(
         1.0    c
         2.0    b
         3.0    a
-        dtype: object
+        dtype: str

         Specify index level to sort
@@ -4176,11 +4181,11 @@ def swaplevel(
         ...     ],
         ... )
         >>> s
-        Final exam  History     January    A
-                    Geography   February   B
-        Coursework  History     March      A
-                    Geography   April      C
-        dtype: object
+        Final exam  History     January     A
+                    Geography   February    B
+        Coursework  History     March       A
+                    Geography   April       C
+        dtype: str

         In the following example, we will swap the levels of the indices. Here, we
         will swap the levels column-wise, but levels can be swapped row-wise
@@ -4189,11 +4194,11 @@ def swaplevel(
         last indices.

         >>> s.swaplevel()
-        Final exam  January     History     A
-                    February    Geography   B
-        Coursework  March       History     A
-                    April       Geography   C
-        dtype: object
+        Final exam  January     History      A
+                    February    Geography    B
+        Coursework  March       History      A
+                    April       Geography    C
+        dtype: str

         By supplying one argument, we can choose which index to swap the last
         index with. We can for example swap the first index with the last one as
@@ -4204,7 +4209,7 @@ def swaplevel(
         February    Geography   Final exam    B
         March       History     Coursework    A
         April       Geography   Coursework    C
-        dtype: object
+        dtype: str

         We can also define explicitly which indices we want to swap by supplying
         values for both i and j. Here, we for example swap the first and second
         indices.
@@ -4214,7 +4219,7 @@ def swaplevel(
         Geography   Final exam  February    B
         History     Coursework  March       A
         Geography   Coursework  April       C
-        dtype: object
+        dtype: str
         """
         self._check_copy_deprecation(copy)
         assert isinstance(self.index, MultiIndex)
@@ -4478,7 +4483,7 @@ def map(
         1      dog
         2      NaN
         3   rabbit
-        dtype: object
+        dtype: str

         ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
         in the ``dict`` are converted to ``NaN``, unless the dict has a default
         value (e.g. ``defaultdict``):

@@ -4489,7 +4494,7 @@ def map(
         1    puppy
         2      NaN
         3      NaN
-        dtype: object
+        dtype: str

         It also accepts a function:

@@ -4498,7 +4503,7 @@ def map(
         1       I am a dog
         2       I am a nan
         3    I am a rabbit
-        dtype: object
+        dtype: str

         To avoid applying the function to missing values (and keep them as
         ``NaN``) ``na_action='ignore'`` can be used:

@@ -4508,7 +4513,7 @@ def map(
         1       I am a dog
         2              NaN
         3    I am a rabbit
-        dtype: object
+        dtype: str

         For categorical data, the function is only applied to the categories:
@@ -4761,7 +4766,7 @@ def transform(
         Examples
         --------
-        >>> df = pd.DataFrame({{"A": range(3), "B": range(1, 4)}})
+        >>> df = pd.DataFrame({"A": range(3), "B": range(1, 4)})
         >>> df
            A  B
         0  0  1
@@ -4792,19 +4797,17 @@ def transform(

         >>> df = pd.DataFrame(
         ...     {
-        ...         {
-        ...             "Date": [
-        ...                 "2015-05-08",
-        ...                 "2015-05-07",
-        ...                 "2015-05-06",
-        ...                 "2015-05-05",
-        ...                 "2015-05-08",
-        ...                 "2015-05-07",
-        ...                 "2015-05-06",
-        ...                 "2015-05-05",
-        ...             ],
-        ...             "Data": [5, 8, 6, 1, 50, 100, 60, 120],
-        ...         }
+        ...         "Date": [
+        ...             "2015-05-08",
+        ...             "2015-05-07",
+        ...             "2015-05-06",
+        ...             "2015-05-05",
+        ...             "2015-05-08",
+        ...             "2015-05-07",
+        ...             "2015-05-06",
+        ...             "2015-05-05",
+        ...         ],
+        ...         "Data": [5, 8, 6, 1, 50, 100, 60, 120],
         ...     }
         ... )
         >>> df
@@ -4830,10 +4833,8 @@ def transform(
         >>> df = pd.DataFrame(
         ...     {
-        ...         {
-        ...             "c": [1, 1, 1, 2, 2, 2, 2],
-        ...             "type": ["m", "n", "o", "m", "m", "n", "n"],
-        ...         }
+        ...         "c": [1, 1, 1, 2, 2, 2, 2],
+        ...         "type": ["m", "n", "o", "m", "m", "n", "n"],
         ...     }
         ... )
         >>> df
@@ -5347,7 +5348,7 @@ def reindex(  # type: ignore[override]

         >>> date_index = pd.date_range("1/1/2010", periods=6, freq="D")
         >>> df2 = pd.DataFrame(
-        ...     {{"prices": [100, 101, np.nan, 100, 89, 88]}}, index=date_index
+        ...     {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index
         ... )
         >>> df2
                     prices
@@ -5509,13 +5510,13 @@ def rename_axis(
         0       dog
         1       cat
         2    monkey
-        dtype: object
+        dtype: str
         >>> s.rename_axis("animal")
         animal
         0       dog
         1       cat
         2    monkey
-        dtype: object
+        dtype: str
         """
         return super().rename_axis(
             mapper=mapper,
@@ -5701,7 +5702,7 @@ def pop(self, item: Hashable) -> Any:
         2    3
         dtype: int64
         """
-        return super().pop(item=item)
+        return maybe_unbox_numpy_scalar(super().pop(item=item))

     def info(
         self,
@@ -5771,17 +5772,17 @@ def info(
         Series name: None
         Non-Null Count  Dtype
         --------------  -----
-        5 non-null      object
-        dtypes: object(1)
-        memory usage: 80.0+ bytes
+        5 non-null      str
+        dtypes: str(1)
+        memory usage: 106.0 bytes

         Prints a summary excluding information about its values:

         >>> s.info(verbose=False)
         <class 'pandas.Series'>
         Index: 5 entries, 1 to 5
-        dtypes: object(1)
-        memory usage: 80.0+ bytes
+        dtypes: str(1)
+        memory usage: 106.0 bytes

         Pipe output of Series.info to buffer instead of sys.stdout, get
         buffer content and writes to a text file:
@@ -5805,9 +5806,9 @@ def info(
         Series name: None
         Non-Null Count    Dtype
         --------------    -----
-        1000000 non-null  object
-        dtypes: object(1)
-        memory usage: 7.6+ MB
+        1000000 non-null  str
+        dtypes: str(1)
+        memory usage: 8.6 MB

         >>> s.info(memory_usage="deep")
         <class 'pandas.Series'>
         Series name: None
         Non-Null Count    Dtype
         --------------    -----
-        1000000 non-null  object
-        dtypes: object(1)
-        memory usage: 55.3 MB
+        1000000 non-null  str
+        dtypes: str(1)
+        memory usage: 8.6 MB
         """
         return SeriesInfo(self, memory_usage).render(
             buf=buf,
@@ -5857,7 +5858,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> int:
         --------
         >>> s = pd.Series(range(3))
         >>> s.memory_usage()
-        152
+        156

         Not including the index gives the size of the rest of the data, which
         is necessarily smaller:
@@ -5869,11 +5870,13 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> int:

         >>> s = pd.Series(["a", "b"])
         >>> s.values
-        array(['a', 'b'], dtype=object)
+        <ArrowStringArray>
+        ['a', 'b']
+        Length: 2, dtype: str
         >>> s.memory_usage()
-        144
+        150
         >>> s.memory_usage(deep=True)
-        244
+        150
         """
         v = self._memory_usage(deep=deep)
         if index:
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
index 48650620f9143..ae49f16a4256e 100644
--- a/pandas/core/window/expanding.py
+++ b/pandas/core/window/expanding.py
@@ -900,9 +900,9 @@ def sem(self, ddof: int = 1, numeric_only: bool = False):
         >>> s.expanding().sem()
         0         NaN
-        1    0.707107
-        2    0.707107
-        3    0.745356
+        1    0.500000
+        2    0.577350
+        3    0.645497
         dtype: float64
         """
         return super().sem(ddof=ddof, numeric_only=numeric_only)
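
The corrected expanding-sem column above can be checked by hand: the standard error of the mean over each expanding window is std(ddof=1) / sqrt(n). Assuming the docstring's setup line (which falls outside this hunk's context) is s = pd.Series([0, 1, 2, 3]):

    import numpy as np
    import pandas as pd

    s = pd.Series([0, 1, 2, 3])
    # Standard error of the mean per expanding window: std(ddof=1) / sqrt(n).
    manual = [s.iloc[: n + 1].std(ddof=1) / np.sqrt(n + 1) for n in range(len(s))]
    print(manual)                # [nan, 0.5, 0.57735..., 0.64549...]
    print(s.expanding().sem())   # matches the updated doctest values

The old column (0.707107, 0.707107, 0.745356) corresponds to dividing by sqrt(n - 1) instead of sqrt(n), which is what the doctest fix corrects.
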
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index dac523898092a..7b6600e699f63 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -78,10 +78,10 @@
     #   Column     Non-Null Count  Dtype
    ---  ------     --------------  -----
     0   int_col    5 non-null      int64
-    1   text_col   5 non-null      object
+    1   text_col   5 non-null      str
     2   float_col  5 non-null      float64
-    dtypes: float64(1), int64(1), object(1)
-    memory usage: 248.0+ bytes
+    dtypes: float64(1), int64(1), str(1)
+    memory usage: 278.0 bytes

    Prints a summary of columns count and its dtypes but not per column
    information:
@@ -90,8 +90,8 @@
    <class 'pandas.DataFrame'>
    RangeIndex: 5 entries, 0 to 4
    Columns: 3 entries, int_col to float_col
-    dtypes: float64(1), int64(1), object(1)
-    memory usage: 248.0+ bytes
+    dtypes: float64(1), int64(1), str(1)
+    memory usage: 278.0 bytes

    Pipe output of DataFrame.info to buffer instead of sys.stdout, get
    buffer content and writes to a text file:
@@ -120,11 +120,11 @@
    Data columns (total 3 columns):
     #   Column    Non-Null Count    Dtype
    ---  ------    --------------    -----
-    0   column_1  1000000 non-null  object
-    1   column_2  1000000 non-null  object
-    2   column_3  1000000 non-null  object
-    dtypes: object(3)
-    memory usage: 22.9+ MB
+    0   column_1  1000000 non-null  str
+    1   column_2  1000000 non-null  str
+    2   column_3  1000000 non-null  str
+    dtypes: str(3)
+    memory usage: 25.7 MB

    >>> df.info(memory_usage='deep')
@@ -132,11 +132,11 @@
    Data columns (total 3 columns):
     #   Column    Non-Null Count    Dtype
    ---  ------    --------------    -----
-    0   column_1  1000000 non-null  object
-    1   column_2  1000000 non-null  object
-    2   column_3  1000000 non-null  object
-    dtypes: object(3)
-    memory usage: 165.9 MB"""
+    0   column_1  1000000 non-null  str
+    1   column_2  1000000 non-null  str
+    2   column_3  1000000 non-null  str
+    dtypes: str(3)
+    memory usage: 25.7 MB"""
 )
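
Taken together, the runtime changes thread a single helper, pandas.core.dtypes.cast.maybe_unbox_numpy_scalar, through the scalar-returning paths (reductions, quantile, corr/cov, dot, pop, index lookups and slice bounds), while the workflow change runs the doctest suite with PANDAS_FUTURE_PYTHON_SCALARS="1" so the updated outputs (6 instead of np.int64(6), and so on) are exercised in CI. The helper's body is not part of this diff; a minimal sketch of what such a function plausibly does — the option gate is elided and the exact unboxing rules are an assumption, not the PR's actual implementation:

    import numpy as np

    def maybe_unbox_numpy_scalar(value):
        # Sketch: turn a NumPy scalar (np.int64, np.float64, np.bool_, ...)
        # into the matching Python scalar; leave everything else untouched.
        # The real helper is presumably gated on the future option that
        # PANDAS_FUTURE_PYTHON_SCALARS enables.
        if isinstance(value, np.generic):
            return value.item()
        return value

    print(maybe_unbox_numpy_scalar(np.int64(6)))   # 6, a plain Python int
    print(maybe_unbox_numpy_scalar([1, 2, 3]))     # passed through unchanged

Centralizing the conversion in one cast-layer helper keeps each call site a one-line change and makes the future behaviour easy to toggle from a single place.
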