@@ -3273,20 +3273,22 @@ def test_groupby_count_return_arrow_dtype(data_missing):
 class TestGroupbyAggPyArrowNative:
     """Tests for PyArrow-native groupby aggregations on decimal and string types."""
 
+    @pytest.mark.parametrize(
+        "agg_func",
+        ["sum", "prod", "min", "max", "mean", "std", "var", "sem", "count"],
+    )
+    def test_groupby_decimal_aggregations(self, agg_func):
+        """Test decimal types use PyArrow-native groupby path."""
+        values = [Decimal(str(i)) for i in range(5)]
+        ser = pd.Series(values, dtype=ArrowDtype(pa.decimal128(10, 2)))
+        result = ser.groupby([1, 1, 2, 2, 3]).agg(agg_func)
+        assert len(result) == 3
+        assert result.index.tolist() == [1, 2, 3]
+        assert isinstance(result.dtype, ArrowDtype)
+
     @pytest.mark.parametrize(
         "dtype,agg_func",
         [
-            # Decimal aggregations
-            (pa.decimal128(10, 2), "sum"),
-            (pa.decimal128(10, 2), "prod"),
-            (pa.decimal128(10, 2), "min"),
-            (pa.decimal128(10, 2), "max"),
-            (pa.decimal128(10, 2), "mean"),
-            (pa.decimal128(10, 2), "std"),
-            (pa.decimal128(10, 2), "var"),
-            (pa.decimal128(10, 2), "sem"),
-            (pa.decimal128(10, 2), "count"),
-            # String aggregations
             (pa.string(), "min"),
             (pa.string(), "max"),
             (pa.string(), "count"),
@@ -3295,132 +3297,76 @@ class TestGroupbyAggPyArrowNative:
             (pa.large_string(), "count"),
         ],
     )
-    def test_groupby_aggregations(self, dtype, agg_func):
-        # Test that decimal/string types use PyArrow-native groupby path
-        if pa.types.is_decimal(dtype):
-            values = [
-                Decimal("1.5"),
-                Decimal("2.5"),
-                Decimal("3.0"),
-                Decimal("4.0"),
-                Decimal("5.0"),
-            ]
-        else:
-            values = ["apple", "banana", "cherry", "date", "elderberry"]
-
-        df = pd.DataFrame(
-            {
-                "key": [1, 1, 2, 2, 3],
-                "value": pd.array(values, dtype=ArrowDtype(dtype)),
-            }
-        )
-        result = getattr(df.groupby("key")["value"], agg_func)()
+    def test_groupby_string_aggregations(self, dtype, agg_func):
+        """Test string types use PyArrow-native groupby path."""
+        ser = pd.Series(list("abcde"), dtype=ArrowDtype(dtype))
+        result = ser.groupby([1, 1, 2, 2, 3]).agg(agg_func)
         assert len(result) == 3
         assert result.index.tolist() == [1, 2, 3]
         assert isinstance(result.dtype, ArrowDtype)
 
     @pytest.mark.parametrize(
-        "dtype,agg_func",
3309+ "dtype,values,expected, agg_func" ,
         [
-            (pa.decimal128(10, 2), "sum"),
-            (pa.decimal128(10, 2), "min"),
-            (pa.string(), "min"),
-            (pa.string(), "max"),
+            (
+                pa.decimal128(10, 2),
+                [Decimal("1.0"), None, Decimal("3.0"), None],
+                [Decimal("1.0"), Decimal("3.0")],
+                "min",
+            ),
+            (pa.string(), ["a", None, "c", None], ["a", "c"], "min"),
+            (pa.string(), ["a", None, "c", None], ["a", "c"], "max"),
         ],
     )
-    def test_groupby_with_nulls(self, dtype, agg_func):
-        # Test groupby with null values
-        if pa.types.is_decimal(dtype):
-            values = [Decimal("1.0"), None, Decimal("3.0"), None]
-            expected = [Decimal("1.0"), Decimal("3.0")]
-        else:
-            values = ["a", None, "c", None]
-            expected = ["a", "c"]
-
-        df = pd.DataFrame(
-            {
-                "key": [1, 1, 2, 2],
-                "value": pd.array(values, dtype=ArrowDtype(dtype)),
-            }
-        )
-        result = getattr(df.groupby("key")["value"], agg_func)()
+    def test_groupby_with_nulls(self, dtype, values, expected, agg_func):
+        """Test groupby with null values."""
+        ser = pd.Series(values, dtype=ArrowDtype(dtype))
+        result = ser.groupby([1, 1, 2, 2]).agg(agg_func)
         assert len(result) == 2
         assert result.iloc[0] == expected[0]
         assert result.iloc[1] == expected[1]
 
     def test_groupby_sem_returns_float(self):
-        # Test that sem returns float dtype and handles edge cases
-        # 1. Normal case: sem should return double[pyarrow]
-        df = pd.DataFrame(
-            {
-                "key": [1, 1, 2, 2],
-                "value": pd.array(
-                    [Decimal("1.0"), Decimal("2.0"), Decimal("3.0"), Decimal("4.0")],
-                    dtype=ArrowDtype(pa.decimal128(10, 2)),
-                ),
-            }
-        )
-        result = df.groupby("key")["value"].sem()
+        """Test that sem returns float dtype."""
+        values = [Decimal(str(i)) for i in range(4)]
+        ser = pd.Series(values, dtype=ArrowDtype(pa.decimal128(10, 2)))
+        result = ser.groupby([1, 1, 2, 2]).sem()
         assert result.dtype == ArrowDtype(pa.float64())
 
-        # 2. Single value per group (count=1): should be NA (stddev undefined)
-        df_single = pd.DataFrame(
-            {
-                "key": [1, 2],
-                "value": pd.array(
-                    [Decimal("1.0"), Decimal("2.0")],
-                    dtype=ArrowDtype(pa.decimal128(10, 2)),
-                ),
-            }
-        )
-        result = df_single.groupby("key")["value"].sem()
+    def test_groupby_sem_single_value(self):
+        """Test that sem returns NA for single-value groups (stddev undefined)."""
+        values = [Decimal("1.0"), Decimal("2.0")]
+        ser = pd.Series(values, dtype=ArrowDtype(pa.decimal128(10, 2)))
+        result = ser.groupby([1, 2]).sem()
         assert pd.isna(result.iloc[0])
         assert pd.isna(result.iloc[1])
 
-        # 3. All nulls in a group (count=0): should be NA (no division-by-zero)
-        df_nulls = pd.DataFrame(
-            {
-                "key": [1, 1, 2, 2],
-                "value": pd.array(
-                    [Decimal("1.0"), Decimal("2.0"), None, None],
-                    dtype=ArrowDtype(pa.decimal128(10, 2)),
-                ),
-            }
+    def test_groupby_sem_all_nulls(self):
+        """Test that sem returns NA for all-null groups."""
+        ser = pd.Series(
+            [Decimal("1.0"), Decimal("2.0"), None, None],
+            dtype=ArrowDtype(pa.decimal128(10, 2)),
         )
-        result = df_nulls.groupby("key")["value"].sem()
+        result = ser.groupby([1, 1, 2, 2]).sem()
         assert not pd.isna(result.iloc[0])  # Group 1 has values
         assert pd.isna(result.iloc[1])  # Group 2 all nulls
 
     @pytest.mark.parametrize("agg_func", ["sum", "prod"])
     def test_groupby_min_count(self, agg_func):
-        df = pd.DataFrame(
-            {
-                "key": [1, 1, 2],
-                "value": pd.array(
-                    [Decimal("1.0"), Decimal("2.0"), Decimal("3.0")],
-                    dtype=ArrowDtype(pa.decimal128(10, 2)),
-                ),
-            }
-        )
-        # min_count=2: group 2 has only 1 value, should be null
-        result = getattr(df.groupby("key")["value"], agg_func)(min_count=2)
+        """Test min_count parameter."""
+        values = [Decimal(str(i)) for i in range(3)]
+        ser = pd.Series(values, dtype=ArrowDtype(pa.decimal128(10, 2)))
+        result = ser.groupby([1, 1, 2]).agg(agg_func, min_count=2)
         assert not pd.isna(result.iloc[0])  # Group 1 has 2 values
         assert pd.isna(result.iloc[1])  # Group 2 has 1 value < min_count
 
     def test_groupby_min_count_with_nulls(self):
-        # Test that min_count uses non-null count, not group size
-        df = pd.DataFrame(
-            {
-                "key": [1, 1, 2, 2, 2],
-                "value": pd.array(
-                    [Decimal("1.0"), None, Decimal("2.0"), Decimal("3.0"), None],
-                    dtype=ArrowDtype(pa.decimal128(10, 2)),
-                ),
-            }
+        """Test that min_count uses non-null count, not group size."""
+        ser = pd.Series(
+            [Decimal("1.0"), None, Decimal("2.0"), Decimal("3.0"), None],
+            dtype=ArrowDtype(pa.decimal128(10, 2)),
         )
-        # Group 1: 2 rows but only 1 non-null -> should be null with min_count=2
-        # Group 2: 3 rows but only 2 non-null -> should be 5.0 with min_count=2
-        result = df.groupby("key")["value"].sum(min_count=2)
+        result = ser.groupby([1, 1, 2, 2, 2]).sum(min_count=2)
         assert pd.isna(result.iloc[0])  # Only 1 non-null < min_count=2
         assert result.iloc[1] == Decimal("5.0")  # 2 non-null >= min_count=2
 
@@ -3432,49 +3378,32 @@ def test_groupby_min_count_with_nulls(self):
         ],
     )
     def test_groupby_missing_groups(self, agg_func, default_value):
-        df = pd.DataFrame(
-            {
-                "key": pd.Categorical([0, 0, 2, 2], categories=[0, 1, 2]),
-                "value": pd.array(
-                    [Decimal("1.0"), Decimal("2.0"), Decimal("3.0"), Decimal("4.0")],
-                    dtype=ArrowDtype(pa.decimal128(10, 2)),
-                ),
-            }
-        )
-        result = getattr(df.groupby("key", observed=False)["value"], agg_func)()
+        """Test that missing groups get identity values."""
+        values = [Decimal(str(i)) for i in range(4)]
+        ser = pd.Series(values, dtype=ArrowDtype(pa.decimal128(10, 2)))
+        keys = pd.Categorical([0, 0, 2, 2], categories=[0, 1, 2])
+        result = ser.groupby(keys, observed=False).agg(agg_func)
         assert len(result) == 3
-        # Group 1 is missing, should get default value
         assert result.iloc[1] == Decimal(str(default_value))
 
-    @pytest.mark.parametrize("dropna", [True, False])
-    def test_groupby_dropna(self, dropna):
-        # Test that NA group (ids == -1) is handled correctly
-        df = pd.DataFrame(
-            {
-                "key": [1, 1, None, 2, 2, None],
-                "value": pd.array(
-                    [
-                        Decimal("1.0"),
-                        Decimal("2.0"),
-                        Decimal("3.0"),
-                        Decimal("4.0"),
-                        Decimal("5.0"),
-                        Decimal("6.0"),
-                    ],
-                    dtype=ArrowDtype(pa.decimal128(10, 2)),
-                ),
-            }
-        )
-        result = df.groupby("key", dropna=dropna)["value"].sum()
-        if dropna:
-            assert len(result) == 2
-            assert result.iloc[0] == Decimal("3.0")  # 1 + 2
-            assert result.iloc[1] == Decimal("9.0")  # 4 + 5
-        else:
-            assert len(result) == 3
-            assert result.iloc[0] == Decimal("3.0")  # 1 + 2
-            assert result.iloc[1] == Decimal("9.0")  # 4 + 5
-            assert result.iloc[2] == Decimal("9.0")  # 3 + 6 (NA group)
+    def test_groupby_dropna_true(self):
+        """Test that NA keys are excluded when dropna=True."""
+        values = [Decimal(str(i)) for i in range(6)]
+        ser = pd.Series(values, dtype=ArrowDtype(pa.decimal128(10, 2)))
+        result = ser.groupby([1, 1, None, 2, 2, None], dropna=True).sum()
+        assert len(result) == 2
+        assert result.iloc[0] == Decimal("1.0")  # 0 + 1
+        assert result.iloc[1] == Decimal("7.0")  # 3 + 4
+
+    def test_groupby_dropna_false(self):
+        """Test that NA keys form a group when dropna=False."""
+        values = [Decimal(str(i)) for i in range(6)]
+        ser = pd.Series(values, dtype=ArrowDtype(pa.decimal128(10, 2)))
+        result = ser.groupby([1, 1, None, 2, 2, None], dropna=False).sum()
+        assert len(result) == 3
+        assert result.iloc[0] == Decimal("1.0")  # 0 + 1
+        assert result.iloc[1] == Decimal("7.0")  # 3 + 4
+        assert result.iloc[2] == Decimal("7.0")  # 2 + 5 (NA group)
 
 
 def test_fixed_size_list():