feat(bigframes): Support groupby.agg/transform with udf transpiler#17613
feat(bigframes): Support groupby.agg/transform with udf transpiler#17613TrevorBergeron wants to merge 1 commit into
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces subscripting support (via GetItemOp and DynamicGetItemOp) across Ibis, Polars, and SQLGlot compilers for arrays, structs, and strings. It also enables transpilation of custom Python functions (UDFs) within GroupBy aggregations and transformations. The review feedback highlights several critical runtime issues, including missing imports (bigframes and window_specs) that will cause NameErrors, and unhandled string-like inputs in the Polars implementation of GetItemOp.
| def is_transpiler_eligible(func: typing.Any) -> bool: | ||
| """Return True if func is eligible for Python transpilation.""" | ||
| from bigframes._config import options | ||
|
|
||
| return ( | ||
| options.experiments.enable_python_transpiler | ||
| and callable(func) | ||
| and not isinstance(func, bigframes.functions.Udf) | ||
| ) |
There was a problem hiding this comment.
The function is_transpiler_eligible references bigframes.functions.Udf [97], but bigframes is not imported in this module. This will result in a NameError at runtime. Please import bigframes or import Udf directly from bigframes.functions.
| def is_transpiler_eligible(func: typing.Any) -> bool: | |
| """Return True if func is eligible for Python transpilation.""" | |
| from bigframes._config import options | |
| return ( | |
| options.experiments.enable_python_transpiler | |
| and callable(func) | |
| and not isinstance(func, bigframes.functions.Udf) | |
| ) | |
| def is_transpiler_eligible(func: typing.Any) -> bool: | |
| """Return True if func is eligible for Python transpilation.""" | |
| from bigframes._config import options | |
| import bigframes.functions | |
| return ( | |
| options.experiments.enable_python_transpiler | |
| and callable(func) | |
| and not isinstance(func, bigframes.functions.Udf) | |
| ) |
| @polars_compiler.register_op(generic_ops.GetItemOp) | ||
| def getitem_op_impl( | ||
| compiler: polars_compiler.PolarsExpressionCompiler, | ||
| op: generic_ops.GetItemOp, # type: ignore | ||
| input: pl.Expr, | ||
| ) -> pl.Expr: | ||
| input_type = compiler._expr_types.get(id(input)) | ||
| if input_type is not None and dtypes.is_struct_like(input_type): | ||
| if isinstance(op.key, str): | ||
| return input.struct.field(op.key) | ||
| else: | ||
| raise NotImplementedError( | ||
| "Referencing a struct field by number not implemented in polars compiler." | ||
| ) | ||
| else: | ||
| return input.list.get(op.key) |
There was a problem hiding this comment.
In getitem_op_impl, if the input type is string-like, calling input.list.get(op.key) will fail at runtime in Polars because strings are not list-like. We should handle string-like inputs by slicing them, similar to the logic in dynamic_getitem_op_impl.
| @polars_compiler.register_op(generic_ops.GetItemOp) | |
| def getitem_op_impl( | |
| compiler: polars_compiler.PolarsExpressionCompiler, | |
| op: generic_ops.GetItemOp, # type: ignore | |
| input: pl.Expr, | |
| ) -> pl.Expr: | |
| input_type = compiler._expr_types.get(id(input)) | |
| if input_type is not None and dtypes.is_struct_like(input_type): | |
| if isinstance(op.key, str): | |
| return input.struct.field(op.key) | |
| else: | |
| raise NotImplementedError( | |
| "Referencing a struct field by number not implemented in polars compiler." | |
| ) | |
| else: | |
| return input.list.get(op.key) | |
| @polars_compiler.register_op(generic_ops.GetItemOp) | |
| def getitem_op_impl( | |
| compiler: polars_compiler.PolarsExpressionCompiler, | |
| op: generic_ops.GetItemOp, # type: ignore | |
| input: pl.Expr, | |
| ) -> pl.Expr: | |
| input_type = compiler._expr_types.get(id(input)) | |
| if input_type is not None and dtypes.is_struct_like(input_type): | |
| if isinstance(op.key, str): | |
| return input.struct.field(op.key) | |
| else: | |
| raise NotImplementedError( | |
| "Referencing a struct field by number not implemented in polars compiler." | |
| ) | |
| elif input_type is not None and dtypes.is_string_like(input_type): | |
| return input.str.slice(op.key, 1) | |
| else: | |
| return input.list.get(op.key) |
| def transform(self, func, *args, **kwargs) -> df.DataFrame: | ||
| import bigframes.core.block_transforms as block_transforms | ||
|
|
||
| if block_transforms.is_transpiler_eligible(func): | ||
| window_spec = window_specs.unbound(grouping_keys=tuple(self._by_col_ids)) |
There was a problem hiding this comment.
The module bigframes.core.window_spec is referenced as window_specs [590], but it is not imported in dataframe_group_by.py. This will cause a NameError at runtime. Please import window_spec at the top of the file or locally inside the method.
| def transform(self, func, *args, **kwargs) -> df.DataFrame: | |
| import bigframes.core.block_transforms as block_transforms | |
| if block_transforms.is_transpiler_eligible(func): | |
| window_spec = window_specs.unbound(grouping_keys=tuple(self._by_col_ids)) | |
| def transform(self, func, *args, **kwargs) -> df.DataFrame: | |
| import bigframes.core.block_transforms as block_transforms | |
| import bigframes.core.window_spec as window_specs | |
| if block_transforms.is_transpiler_eligible(func): | |
| window_spec = window_specs.unbound(grouping_keys=tuple(self._by_col_ids)) |
| def transform(self, func, *args, **kwargs) -> series.Series: | ||
| import bigframes.core.block_transforms as block_transforms | ||
|
|
||
| if block_transforms.is_transpiler_eligible(func): | ||
| window_spec = window_specs.unbound(grouping_keys=tuple(self._by_col_ids)) |
There was a problem hiding this comment.
The module bigframes.core.window_spec is referenced as window_specs [307], but it is not imported in series_group_by.py. This will cause a NameError at runtime. Please import window_spec at the top of the file or locally inside the method.
def transform(self, func, *args, **kwargs) -> series.Series:
import bigframes.core.block_transforms as block_transforms
import bigframes.core.window_spec as window_specs
if block_transforms.is_transpiler_eligible(func):
window_spec = window_specs.unbound(grouping_keys=tuple(self._by_col_ids))
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
Fixes #<issue_number_goes_here> 🦕