Commit e350cfd

format

1 parent 7b51d3a
3 files changed: +448 -2 lines changed

asv_bench/benchmarks/groupby.py

Lines changed: 84 additions & 0 deletions
@@ -1152,4 +1152,88 @@ def time_resample_multiindex(self):
         ).mean()
 
 
+class GroupByAggregateArrowDtypes:
+    param_names = ["dtype", "method"]
+    params = [
+        [
+            "bool[pyarrow]",
+            "int32[pyarrow]",
+            "float64[pyarrow]",
+            "decimal128(25, 3)[pyarrow]",
+            "string[pyarrow]",
+            "timestamp[s, tz=UTC][pyarrow]",
+            "duration[ms][pyarrow]",
+        ],
+        [
+            "any",
+            "all",
+            "sum",
+            "prod",
+            "min",
+            "max",
+            "mean",
+            "std",
+            "var",
+            "sem",
+            "count",
+            "median",
+        ],
+    ]
+
+    def setup(self, dtype, method):
+        import pyarrow as pa
+
+        # Parse dtype string
+        if dtype.startswith("decimal128"):
+            pa_type = pa.decimal128(25, 3)
+        elif dtype.startswith("timestamp"):
+            pa_type = pa.timestamp("s", "UTC")
+        elif dtype.startswith("duration"):
+            pa_type = pa.duration("ms")
+        elif dtype == "bool[pyarrow]":
+            pa_type = pa.bool_()
+        elif dtype == "int32[pyarrow]":
+            pa_type = pa.int32()
+        elif dtype == "float64[pyarrow]":
+            pa_type = pa.float64()
+        elif dtype == "string[pyarrow]":
+            pa_type = pa.string()
+        else:
+            raise ValueError(f"Unsupported dtype: {dtype}")
+
+        size = 100_000
+        ncols = 5
+        columns = list("abcde")
+
+        # Generate data based on type
+        if pa.types.is_floating(pa_type):
+            data = np.random.randn(size, ncols)
+        elif pa.types.is_integer(pa_type):
+            data = np.random.randint(0, 10_000, (size, ncols))
+        elif pa.types.is_decimal(pa_type):
+            from decimal import Decimal
+
+            data = np.random.randn(size, ncols).round(3)
+            data = [[Decimal(str(x)) for x in row] for row in data]
+        elif pa.types.is_boolean(pa_type):
+            data = np.random.choice([True, False], (size, ncols))
+        elif pa.types.is_timestamp(pa_type):
+            data = np.random.randint(0, 1_000_000, (size, ncols))
+        elif pa.types.is_duration(pa_type):
+            data = np.random.randint(0, 1_000_000, (size, ncols))
+        elif pa.types.is_string(pa_type):
+            data = np.random.choice(list(ascii_letters), (size, ncols))
+        else:
+            raise ValueError(f"Unsupported pyarrow type: {pa_type}")
+
+        df = DataFrame(data, columns=columns, dtype=dtype)
+        # Add some NAs
+        df.iloc[::10, ::2] = NA
+        df["key"] = np.random.randint(0, 100, size)
+        self.df = df
+
+    def time_frame_agg(self, dtype, method):
+        self.df.groupby("key").agg(method)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
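
For a quick sanity check outside the asv harness, the workload this benchmark measures can be reproduced directly. The following is a minimal standalone sketch, assuming a pandas build with PyArrow available; it times a single (dtype, method) pair from the parameter grid above (string[pyarrow] with "count"), using time.perf_counter rather than asv's timing machinery.

    from string import ascii_letters
    import time

    import numpy as np
    import pandas as pd

    size = 100_000
    df = pd.DataFrame(
        np.random.choice(list(ascii_letters), (size, 5)),
        columns=list("abcde"),
        dtype="string[pyarrow]",
    )
    df.iloc[::10, ::2] = pd.NA                   # sprinkle NAs, as setup() does
    df["key"] = np.random.randint(0, 100, size)  # ~100 groups, as in setup()

    start = time.perf_counter()
    df.groupby("key").agg("count")
    print(f"count over {size:_} rows: {time.perf_counter() - start:.4f}s")

Under asv itself the class can be selected with a benchmark filter, e.g. asv run --bench GroupByAggregateArrowDtypes; the exact invocation depends on the local asv configuration.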

pandas/core/arrays/arrow/array.py

Lines changed: 157 additions & 2 deletions
@@ -2601,6 +2601,143 @@ def _to_masked(self):
         arr = self.to_numpy(dtype=dtype.numpy_dtype, na_value=na_value)
         return dtype.construct_array_type()(arr, mask)
 
+    # Mapping from pandas groupby 'how' to PyArrow aggregation function names
+    _PYARROW_AGG_FUNCS: dict[str, str] = {
+        "sum": "sum",
+        "prod": "product",
+        "min": "min",
+        "max": "max",
+        "mean": "mean",
+        "std": "stddev",
+        "var": "variance",
+        "sem": "stddev",  # sem = stddev / sqrt(count), computed below
+        "count": "count",
+        "any": "any",
+        "all": "all",
+    }
+
+    # Default values for missing groups (identity elements for each operation)
+    _PYARROW_AGG_DEFAULTS: dict[str, int | bool] = {
+        "sum": 0,
+        "prod": 1,
+        "count": 0,
+        "any": False,
+        "all": True,
+    }
+
+    def _groupby_op_pyarrow(
+        self,
+        *,
+        how: str,
+        min_count: int,
+        ngroups: int,
+        ids: npt.NDArray[np.intp],
+        **kwargs,
+    ) -> Self | None:
+        """
+        Perform groupby aggregation using PyArrow's native Table.group_by.
+
+        Returns None if the operation is not supported by PyArrow,
+        in which case the caller should fall back to the Cython path.
+        """
+        pa_agg_func = self._PYARROW_AGG_FUNCS.get(how)
+        if pa_agg_func is None:
+            return None
+
+        pa_type = self._pa_array.type
+        # PyArrow doesn't support these aggregations for temporal types directly
+        if pa.types.is_temporal(pa_type) and how in ["std", "var", "sem"]:
+            return None
+
+        # PyArrow's any/all only work on boolean types
+        if how in ["any", "all"] and not pa.types.is_boolean(pa_type):
+            return None
+
+        # Filter out the NA group (ids == -1) to avoid unnecessary computation
+        mask = ids >= 0
+        if not mask.all():
+            ids = ids[mask]
+            values = pc.filter(self._pa_array, mask)
+        else:
+            values = self._pa_array
+
+        # Create a PyArrow table with the values and group IDs.
+        # Explicitly cast ids to int64 since np.intp is platform-dependent.
+        group_id_arr = pa.array(ids, type=pa.int64())
+        table = pa.table({"value": values, "group_id": group_id_arr})
+
+        # Build the aggregation list - always include count for null handling.
+        # For std/var/sem, pass VarianceOptions with ddof to match pandas behavior.
+        if how in ["std", "var", "sem"]:
+            ddof = kwargs.get("ddof", 1)
+            agg_with_opts = ("value", pa_agg_func, pc.VarianceOptions(ddof=ddof))
+            aggs = [agg_with_opts, ("value", "count")]
+        else:
+            aggs = [("value", pa_agg_func), ("value", "count")]
+
+        # Perform the groupby aggregation
+        result_table = table.group_by("group_id").aggregate(aggs)
+
+        # Extract results
+        result_group_ids = result_table.column("group_id")
+        result_values = result_table.column(f"value_{pa_agg_func}")
+        result_counts = result_table.column("value_count")
+
+        # For sem, compute stddev / sqrt(count) using PyArrow compute
+        if how == "sem":
+            sqrt_counts = pc.sqrt(result_counts)
+            result_values = pc.divide(result_values, sqrt_counts)
+
+        output_type = result_values.type
+        default_value = pa.scalar(self._PYARROW_AGG_DEFAULTS.get(how), type=output_type)
+
+        # Handle nulls from all-null groups for sum/prod with min_count=0
+        if result_values.null_count > 0 and how in ["sum", "prod"] and min_count == 0:
+            result_values = pc.if_else(
+                pc.is_null(result_values), default_value, result_values
+            )
+
+        # Handle min_count: groups with count < min_count should be null
+        if min_count > 0:
+            below_min_count = pc.less(result_counts, pa.scalar(min_count))
+            result_values = pc.if_else(below_min_count, None, result_values)
+
+        # PyArrow returns results in encounter order. We need to reorder to
+        # match the expected output (group 0, 1, 2, ..., ngroups-1) and fill
+        # missing groups with default values.
+        #
+        # We use a NumPy scatter (O(n)) instead of:
+        # - pc.scatter: doesn't handle missing groups, workaround is slower
+        # - join+sort: O(n log n), slower for high-cardinality groupbys
+        #
+        # Explicitly cast to int64 so the ids are usable as NumPy indices
+        result_group_ids_np = result_group_ids.to_numpy(zero_copy_only=False).astype(
+            np.int64, copy=False
+        )
+        result_values_np = result_values.to_numpy(zero_copy_only=False)
+
+        default_py = default_value.as_py()
+        if default_py is not None and min_count == 0:
+            # Operations with identity elements (sum=0, prod=1, count=0, any=False,
+            # all=True): fill missing groups with the default value
+            output_np = np.full(ngroups, default_py, dtype=result_values_np.dtype)
+            output_np[result_group_ids_np] = result_values_np
+            pa_result = pa.array(output_np, type=output_type)
+        else:
+            # Operations without identity elements (mean, std, var, min, max, sem):
+            # fill missing groups with null using a boolean mask
+            output_np = np.empty(ngroups, dtype=result_values_np.dtype)
+            null_mask = np.ones(ngroups, dtype=bool)  # True = null/missing
+            output_np[result_group_ids_np] = result_values_np
+            null_mask[result_group_ids_np] = False
+            # Restore nulls for groups that had null results (min_count or all-null)
+            if result_values.null_count > 0:
+                result_nulls = pc.is_null(result_values).to_numpy()
+                null_mask[result_group_ids_np[result_nulls]] = True
+            pa_result = pa.array(output_np, type=output_type, mask=null_mask)
+
+        return self._from_pyarrow_array(pa_result)
+
     def _groupby_op(
         self,
         *,
@@ -2635,9 +2772,27 @@ def _groupby_op(
             **kwargs,
         )
 
-        # maybe convert to a compatible dtype optimized for groupby
-        values: ExtensionArray
         pa_type = self._pa_array.type
+
+        # Try the PyArrow-native path for decimal and string types, where it's faster.
+        # For integer/float/boolean, the fallback path via _to_masked() is faster.
+        if (
+            pa.types.is_decimal(pa_type)
+            or pa.types.is_string(pa_type)
+            or pa.types.is_large_string(pa_type)
+        ):
+            result = self._groupby_op_pyarrow(
+                how=how,
+                min_count=min_count,
+                ngroups=ngroups,
+                ids=ids,
+                **kwargs,
+            )
+            if result is not None:
+                return result
+
+        # Fall back to converting to a masked/datetime array and using Cython
+        values: ExtensionArray
         if pa.types.is_timestamp(pa_type):
             values = self._to_datetimearray()
         elif pa.types.is_duration(pa_type):