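Finish up: run doctests with PANDAS_FUTURE_PYTHON_SCALARS, unbox NumPy scalar returns, and refresh doctest output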

rhshadrach · rhshadrach · commit 82a5fbae9081 · 2025-11-20T16:45:13.000-05:00
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
@@ -57,7 +57,7 @@ jobs:
       run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
 
     - name: Run doctests
-      run: cd ci && ./code_checks.sh doctests
+      run: cd ci && PANDAS_FUTURE_PYTHON_SCALARS="1" ./code_checks.sh doctests
       if: ${{ steps.build.outcome == 'success' && always() }}
 
     - name: Install pandas in editable mode
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -149,14 +149,15 @@ def pytest_collection_modifyitems(items, config) -> None:
     # Warnings from doctests that can be ignored; place reason in comment above.
     # Each entry specifies (path, message) - see the ignore_doctest_warning function
     ignored_doctest_warnings = [
-        ("api.interchange.from_dataframe", ".*Interchange Protocol is deprecated"),
+        ("api.interchange.from_dataframe", "The DataFrame Interchange Protocol"),
         ("is_int64_dtype", "is_int64_dtype is deprecated"),
         ("is_interval_dtype", "is_interval_dtype is deprecated"),
         ("is_period_dtype", "is_period_dtype is deprecated"),
         ("is_datetime64tz_dtype", "is_datetime64tz_dtype is deprecated"),
         ("is_categorical_dtype", "is_categorical_dtype is deprecated"),
         ("is_sparse", "is_sparse is deprecated"),
-        ("DataFrame.__dataframe__", "Interchange Protocol is deprecated"),
+        ("CategoricalDtype._from_values_or_dtype", "Constructing a Categorical"),
+        ("DataFrame.__dataframe__", "The DataFrame Interchange Protocol"),
         ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"),
         ("DataFrameGroupBy.corrwith", "DataFrameGroupBy.corrwith is deprecated"),
         ("NDFrame.replace", "Series.replace without 'value'"),
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -89,6 +89,7 @@
     infer_dtype_from_scalar,
     invalidate_string_dtypes,
     maybe_downcast_to_dtype,
+    maybe_unbox_numpy_scalar,
 )
 from pandas.core.dtypes.common import (
     infer_dtype_from_object,
@@ -3822,7 +3823,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
         many repeated values.
 
         >>> df["object"].astype("category").memory_usage(deep=True)
-        5136
+        5140
         """
         result = self._constructor_sliced(
             [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
@@ -4392,11 +4393,11 @@ def _setitem(self, key, value) -> None:
         >>> s = pd.Series([100, 200], index=["b", "d"])
         >>> df["B"] = s
         >>> df
-            A    B
-        a  1  NaN
-        b  2  100
-        c  3  NaN
-        d  4  200
+           A      B
+        a  1    NaN
+        b  2  100.0
+        c  3    NaN
+        d  4  200.0
 
         Series index labels NOT in DataFrame, ignored:
 
@@ -4408,7 +4409,6 @@ def _setitem(self, key, value) -> None:
         x  1  10
         y  2  20
         z  3  50
-        # Values for 'a' and 'b' are completely ignored!
         """
         key = com.apply_if_callable(key, self)
 
@@ -5121,6 +5121,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
         2     6
         3     8
         4    10
+        Name: A, dtype: int64
         """
         from pandas.core.computation.eval import eval as _eval
 
@@ -7810,10 +7811,10 @@ def value_counts(
 
         >>> df.value_counts(dropna=False)
         first_name  middle_name
+        John        Smith          1
         Anne        NaN            1
+        John        NaN            1
         Beth        Louise         1
-        John        Smith          1
-                    NaN            1
         Name: count, dtype: int64
 
         >>> df.value_counts("first_name")
@@ -9095,10 +9096,10 @@ def combine(
         2  3.0  NaN NaN
 
         >>> df2.combine(df1, take_smaller, overwrite=False)
-             A    B   C
-        0  0.0  NaN NaN
-        1  0.0  3.0 1.0
-        2  NaN  3.0 1.0
+             B    C   A
+        0  NaN  NaN 0.0
+        1  3.0  1.0 0.0
+        2  3.0  1.0 NaN
         """
         other_idxlen = len(other.index)  # save for compare
         other_columns = other.columns
@@ -10954,8 +10955,8 @@ def apply(
         ``apply`` has type stability (variables in the function do not change their
         type during the execution).
 
-        >>> import bodo
-        >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit)
+        >>> import bodo  # doctest: +SKIP
+        >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit)  # doctest: +SKIP
 
         Note that JIT compilation is only recommended for functions that take a
         significant amount of time to run. Fast functions are unlikely to run faster
@@ -12131,7 +12132,7 @@ def _get_data() -> DataFrame:
                 df = df.astype(dtype)
                 arr = concat_compat(list(df._iter_column_arrays()))
                 return arr._reduce(name, skipna=skipna, keepdims=False, **kwds)
-            return func(df.values)
+            return maybe_unbox_numpy_scalar(func(df.values))
         elif axis == 1:
             if len(df.index) == 0:
                 # Taking a transpose would result in no columns, losing the dtype.
@@ -13283,8 +13284,8 @@ def kurt(
 
         With axis=None
 
-        >>> df.kurt(axis=None).round(6)
-        -0.988693
+        >>> df.kurt(axis=None)
+        -0.9886927196984727
 
         Using axis=1
 
@@ -13465,7 +13466,7 @@ def idxmin(
         Pork                consumption
         Wheat Products    co2_emissions
         Beef                consumption
-        dtype: object
+        dtype: str
         """
         axis = self._get_axis_number(axis)
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -13116,8 +13116,8 @@ def make_doc(name: str, ndim: int) -> str:
 
             With axis=None
 
-            >>> df.kurt(axis=None).round(6)
-            -0.988693
+            >>> df.kurt(axis=None)
+            -0.9886927196984727
 
             Using axis=1
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -6792,7 +6792,7 @@ def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"
             pos = self[::-1].searchsorted(
                 label, side="right" if side == "left" else "left"
             )
-            return len(self) - pos
+            return maybe_unbox_numpy_scalar(len(self) - pos)
 
         raise ValueError("index must be monotonic increasing or decreasing")
 
@@ -6979,6 +6979,8 @@ def slice_locs(
             if start_slice == -1:
                 start_slice -= len(self)
 
+        start_slice = maybe_unbox_numpy_scalar(start_slice)
+        end_slice = maybe_unbox_numpy_scalar(end_slice)
         return start_slice, end_slice
 
     def delete(
@@ -7398,7 +7400,7 @@ def any(self, *args, **kwargs):
             # i.e. EA, call _reduce instead of "any" to get TypeError instead
             #  of AttributeError
             return vals._reduce("any")
-        return np.any(vals)
+        return maybe_unbox_numpy_scalar(np.any(vals))
 
     def all(self, *args, **kwargs):
         """
@@ -7446,7 +7448,7 @@ def all(self, *args, **kwargs):
             # i.e. EA, call _reduce instead of "all" to get TypeError instead
             #  of AttributeError
             return vals._reduce("all")
-        return np.all(vals)
+        return maybe_unbox_numpy_scalar(np.all(vals))
 
     @final
     def _maybe_disable_logical_methods(self, opname: str_t) -> None:
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -40,6 +40,7 @@
     infer_dtype_from_scalar,
     maybe_box_datetimelike,
     maybe_downcast_numeric,
+    maybe_unbox_numpy_scalar,
     maybe_upcast_numeric_to_64bit,
 )
 from pandas.core.dtypes.common import (
@@ -804,7 +805,7 @@ def get_loc(self, key) -> int | slice | np.ndarray:
         if matches == 0:
             raise KeyError(key)
         if matches == 1:
-            return mask.argmax()
+            return maybe_unbox_numpy_scalar(mask.argmax())
 
         res = lib.maybe_booleans_to_slice(mask.view("u1"))
         if isinstance(res, slice) and res.stop is None:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -58,7 +58,10 @@
 )
 from pandas.util._exceptions import find_stack_level
 
-from pandas.core.dtypes.cast import coerce_indexer_dtype
+from pandas.core.dtypes.cast import (
+    coerce_indexer_dtype,
+    maybe_unbox_numpy_scalar,
+)
 from pandas.core.dtypes.common import (
     ensure_int64,
     ensure_platform_int,
@@ -3115,7 +3118,9 @@ def get_slice_bound(
         """
         if not isinstance(label, tuple):
             label = (label,)
-        return self._partial_tup_index(label, side=side)
+        result = self._partial_tup_index(label, side=side)
+        result = maybe_unbox_numpy_scalar(result)
+        return result
 
     def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]:
         """
@@ -3702,7 +3707,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
             if start == end:
                 # The label is present in self.levels[level] but unused:
                 raise KeyError(key)
-            return slice(start, end)
+            return slice(maybe_unbox_numpy_scalar(start), maybe_unbox_numpy_scalar(end))
 
     def get_locs(self, seq) -> npt.NDArray[np.intp]:
         """
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
@@ -90,8 +90,8 @@
     <class 'pandas.DataFrame'>
     RangeIndex: 5 entries, 0 to 4
     Columns: 3 entries, int_col to float_col
-    dtypes: float64(1), int64(1), object(1)
-    memory usage: 248.0+ bytes
+    dtypes: float64(1), int64(1), str(1)
+    memory usage: 278.0 bytes
 
     Pipe output of DataFrame.info to buffer instead of sys.stdout, get
     buffer content and writes to a text file:
@@ -120,23 +120,23 @@
     Data columns (total 3 columns):
      #   Column    Non-Null Count    Dtype
     ---  ------    --------------    -----
-     0   column_1  1000000 non-null  object
-     1   column_2  1000000 non-null  object
-     2   column_3  1000000 non-null  object
-    dtypes: object(3)
-    memory usage: 22.9+ MB
+     0   column_1  1000000 non-null  str
+     1   column_2  1000000 non-null  str
+     2   column_3  1000000 non-null  str
+    dtypes: str(3)
+    memory usage: 25.7 MB
 
     >>> df.info(memory_usage='deep')
     <class 'pandas.DataFrame'>
     RangeIndex: 1000000 entries, 0 to 999999
     Data columns (total 3 columns):
      #   Column    Non-Null Count    Dtype
     ---  ------    --------------    -----
-     0   column_1  1000000 non-null  object
-     1   column_2  1000000 non-null  object
-     2   column_3  1000000 non-null  object
-    dtypes: object(3)
-    memory usage: 165.9 MB"""
+     0   column_1  1000000 non-null  str
+     1   column_2  1000000 non-null  str
+     2   column_3  1000000 non-null  str
+    dtypes: str(3)
+    memory usage: 25.7 MB"""
 )
 
 
diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py
@@ -433,10 +433,13 @@ def test_np_matmul():
 
 
 @pytest.mark.parametrize("box", [pd.Index, pd.Series])
-def test_np_matmul_1D(box):
+def test_np_matmul_1D(box, using_python_scalars):
     result = np.matmul(box([1, 2]), box([2, 3]))
     assert result == 8
-    assert isinstance(result, np.int64)
+    if using_python_scalars:
+        assert isinstance(result, int)
+    else:
+        assert isinstance(result, np.int64)
 
 
 def test_array_ufuncs_for_many_arguments():