Skip to content

Commit 5b7ad64

Browse files
timsaucer and claude authored
feat: accept native Python literals on date/time functions (#1563)
Widen date/time scalar function signatures to accept native Python ``str``/``int`` literals alongside ``Expr``:

- ``date_bin``: ``stride``, ``source``, ``origin`` accept ``Expr | str``.
- ``make_date``, ``make_time``: components accept ``Expr | int``.
- ``to_date``, ``to_time``, ``to_timestamp``, ``to_timestamp_{millis,micros,nanos,seconds}``, ``to_unixtime``: ``*formatters`` accept ``Expr | str``.

Add ``coerce_to_expr_list`` public helper in ``datafusion.expr`` mirroring ``coerce_to_expr`` / ``ensure_expr_list`` for variadic call sites.

``date_bin`` uses ``Expr.string_literal`` directly because its planner coerces ``Utf8`` (not ``Utf8View``) literals to ``Interval``/``Timestamp``.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ac43697 commit 5b7ad64

4 files changed

Lines changed: 211 additions & 26 deletions

File tree

python/datafusion/expr.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -252,6 +252,7 @@
252252
"WindowFrame",
253253
"WindowFrameBound",
254254
"coerce_to_expr",
255+
"coerce_to_expr_list",
255256
"coerce_to_expr_or_none",
256257
"ensure_expr",
257258
"ensure_expr_list",
@@ -348,6 +349,18 @@ def coerce_to_expr_or_none(value: Any | None) -> Expr | None:
348349
return coerce_to_expr(value)
349350

350351

352+
def coerce_to_expr_list(values: Iterable[Any]) -> list[Expr]:
    """Build a list of ``Expr`` from a mix of expressions and Python literals.

    Each item of ``values`` is passed through :func:`coerce_to_expr`, in
    iteration order, which makes this the list-valued counterpart of that
    helper for variadic call sites.

    Args:
        values: Iterable whose items are ``Expr`` instances or Python
            literals to wrap.

    Returns:
        A new list containing one ``Expr`` per input item.
    """
    return list(map(coerce_to_expr, values))
362+
363+
351364
def _to_raw_expr(value: Expr | str) -> expr_internal.Expr:
352365
"""Convert a Python expression or column name to its raw variant.
353366

python/datafusion/functions/__init__.py

Lines changed: 159 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@
4545
import pyarrow as pa
4646

4747
if TYPE_CHECKING:
48-
from collections.abc import Callable
48+
from collections.abc import Callable, Iterable
4949

5050
from datafusion._internal import functions as f
5151
from datafusion.common import NullTreatment
@@ -55,6 +55,7 @@
5555
SortExpr,
5656
SortKey,
5757
coerce_to_expr,
58+
coerce_to_expr_list,
5859
coerce_to_expr_or_none,
5960
expr_list_to_raw_expr_list,
6061
sort_list_to_raw_sort_list,
@@ -2391,11 +2392,11 @@ def date_format(arg: Expr, formatter: Expr | str) -> Expr:
23912392
return to_char(arg, formatter)
23922393

23932394

2394-
def _unwrap_exprs(args: tuple[Expr, ...]) -> list:
2395+
def _unwrap_exprs(args: Iterable[Expr]) -> list:
23952396
return [arg.expr for arg in args]
23962397

23972398

2398-
def to_date(arg: Expr, *formatters: Expr) -> Expr:
2399+
def to_date(arg: Expr, *formatters: Expr | str) -> Expr:
23992400
"""Converts a value to a date (YYYY-MM-DD).
24002401
24012402
Supports strings, numeric and timestamp types as input.
@@ -2414,8 +2415,16 @@ def to_date(arg: Expr, *formatters: Expr) -> Expr:
24142415
... dfn.functions.to_date(dfn.col("a")).alias("dt"))
24152416
>>> str(result.collect_column("dt")[0].as_py())
24162417
'2021-07-20'
2418+
2419+
Pass a format string as a bare ``str``:
2420+
2421+
>>> df = ctx.from_pydict({"a": ["20-07-2021"]})
2422+
>>> result = df.select(
2423+
... dfn.functions.to_date(dfn.col("a"), "%d-%m-%Y").alias("dt"))
2424+
>>> str(result.collect_column("dt")[0].as_py())
2425+
'2021-07-20'
24172426
"""
2418-
return Expr(f.to_date(arg.expr, *_unwrap_exprs(formatters)))
2427+
return Expr(f.to_date(arg.expr, *_unwrap_exprs(coerce_to_expr_list(formatters))))
24192428

24202429

24212430
def to_local_time(*args: Expr) -> Expr:
@@ -2426,7 +2435,7 @@ def to_local_time(*args: Expr) -> Expr:
24262435
return Expr(f.to_local_time(*_unwrap_exprs(args)))
24272436

24282437

2429-
def to_time(arg: Expr, *formatters: Expr) -> Expr:
2438+
def to_time(arg: Expr, *formatters: Expr | str) -> Expr:
24302439
"""Converts a value to a time. Supports strings and timestamps as input.
24312440
24322441
If ``formatters`` is not provided strings are parsed as HH:MM:SS, HH:MM or
@@ -2443,11 +2452,19 @@ def to_time(arg: Expr, *formatters: Expr) -> Expr:
24432452
... dfn.functions.to_time(dfn.col("a")).alias("t"))
24442453
>>> str(result.collect_column("t")[0].as_py())
24452454
'14:30:00'
2455+
2456+
Pass a format string as a bare ``str``:
2457+
2458+
>>> df = ctx.from_pydict({"a": ["14h30m00s"]})
2459+
>>> result = df.select(
2460+
... dfn.functions.to_time(dfn.col("a"), "%Hh%Mm%Ss").alias("t"))
2461+
>>> str(result.collect_column("t")[0].as_py())
2462+
'14:30:00'
24462463
"""
2447-
return Expr(f.to_time(arg.expr, *_unwrap_exprs(formatters)))
2464+
return Expr(f.to_time(arg.expr, *_unwrap_exprs(coerce_to_expr_list(formatters))))
24482465

24492466

2450-
def to_timestamp(arg: Expr, *formatters: Expr) -> Expr:
2467+
def to_timestamp(arg: Expr, *formatters: Expr | str) -> Expr:
24512468
"""Converts a string and optional formats to a ``Timestamp`` in nanoseconds.
24522469
24532470
For usage of ``formatters`` see the rust chrono package ``strftime`` package.
@@ -2464,11 +2481,24 @@ def to_timestamp(arg: Expr, *formatters: Expr) -> Expr:
24642481
... )
24652482
>>> str(result.collect_column("ts")[0].as_py())
24662483
'2021-01-01 00:00:00'
2484+
2485+
Pass a format string as a bare ``str``:
2486+
2487+
>>> df = ctx.from_pydict({"a": ["01/01/2021 00:00:00"]})
2488+
>>> result = df.select(
2489+
... dfn.functions.to_timestamp(
2490+
... dfn.col("a"), "%d/%m/%Y %H:%M:%S"
2491+
... ).alias("ts")
2492+
... )
2493+
>>> str(result.collect_column("ts")[0].as_py())
2494+
'2021-01-01 00:00:00'
24672495
"""
2468-
return Expr(f.to_timestamp(arg.expr, *_unwrap_exprs(formatters)))
2496+
return Expr(
2497+
f.to_timestamp(arg.expr, *_unwrap_exprs(coerce_to_expr_list(formatters)))
2498+
)
24692499

24702500

2471-
def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr:
2501+
def to_timestamp_millis(arg: Expr, *formatters: Expr | str) -> Expr:
24722502
"""Converts a string and optional formats to a ``Timestamp`` in milliseconds.
24732503
24742504
See :py:func:`to_timestamp` for a description on how to use formatters.
@@ -2483,11 +2513,24 @@ def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr:
24832513
... )
24842514
>>> str(result.collect_column("ts")[0].as_py())
24852515
'2021-01-01 00:00:00'
2516+
2517+
Pass a format string as a bare ``str``:
2518+
2519+
>>> df = ctx.from_pydict({"a": ["01/01/2021 00:00:00"]})
2520+
>>> result = df.select(
2521+
... dfn.functions.to_timestamp_millis(
2522+
... dfn.col("a"), "%d/%m/%Y %H:%M:%S"
2523+
... ).alias("ts")
2524+
... )
2525+
>>> str(result.collect_column("ts")[0].as_py())
2526+
'2021-01-01 00:00:00'
24862527
"""
2487-
return Expr(f.to_timestamp_millis(arg.expr, *_unwrap_exprs(formatters)))
2528+
return Expr(
2529+
f.to_timestamp_millis(arg.expr, *_unwrap_exprs(coerce_to_expr_list(formatters)))
2530+
)
24882531

24892532

2490-
def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr:
2533+
def to_timestamp_micros(arg: Expr, *formatters: Expr | str) -> Expr:
24912534
"""Converts a string and optional formats to a ``Timestamp`` in microseconds.
24922535
24932536
See :py:func:`to_timestamp` for a description on how to use formatters.
@@ -2502,11 +2545,24 @@ def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr:
25022545
... )
25032546
>>> str(result.collect_column("ts")[0].as_py())
25042547
'2021-01-01 00:00:00'
2548+
2549+
Pass a format string as a bare ``str``:
2550+
2551+
>>> df = ctx.from_pydict({"a": ["01/01/2021 00:00:00"]})
2552+
>>> result = df.select(
2553+
... dfn.functions.to_timestamp_micros(
2554+
... dfn.col("a"), "%d/%m/%Y %H:%M:%S"
2555+
... ).alias("ts")
2556+
... )
2557+
>>> str(result.collect_column("ts")[0].as_py())
2558+
'2021-01-01 00:00:00'
25052559
"""
2506-
return Expr(f.to_timestamp_micros(arg.expr, *_unwrap_exprs(formatters)))
2560+
return Expr(
2561+
f.to_timestamp_micros(arg.expr, *_unwrap_exprs(coerce_to_expr_list(formatters)))
2562+
)
25072563

25082564

2509-
def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr:
2565+
def to_timestamp_nanos(arg: Expr, *formatters: Expr | str) -> Expr:
25102566
"""Converts a string and optional formats to a ``Timestamp`` in nanoseconds.
25112567
25122568
See :py:func:`to_timestamp` for a description on how to use formatters.
@@ -2521,11 +2577,24 @@ def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr:
25212577
... )
25222578
>>> str(result.collect_column("ts")[0].as_py())
25232579
'2021-01-01 00:00:00'
2580+
2581+
Pass a format string as a bare ``str``:
2582+
2583+
>>> df = ctx.from_pydict({"a": ["01/01/2021 00:00:00"]})
2584+
>>> result = df.select(
2585+
... dfn.functions.to_timestamp_nanos(
2586+
... dfn.col("a"), "%d/%m/%Y %H:%M:%S"
2587+
... ).alias("ts")
2588+
... )
2589+
>>> str(result.collect_column("ts")[0].as_py())
2590+
'2021-01-01 00:00:00'
25242591
"""
2525-
return Expr(f.to_timestamp_nanos(arg.expr, *_unwrap_exprs(formatters)))
2592+
return Expr(
2593+
f.to_timestamp_nanos(arg.expr, *_unwrap_exprs(coerce_to_expr_list(formatters)))
2594+
)
25262595

25272596

2528-
def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr:
2597+
def to_timestamp_seconds(arg: Expr, *formatters: Expr | str) -> Expr:
25292598
"""Converts a string and optional formats to a ``Timestamp`` in seconds.
25302599
25312600
See :py:func:`to_timestamp` for a description on how to use formatters.
@@ -2540,11 +2609,26 @@ def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr:
25402609
... )
25412610
>>> str(result.collect_column("ts")[0].as_py())
25422611
'2021-01-01 00:00:00'
2612+
2613+
Pass a format string as a bare ``str``:
2614+
2615+
>>> df = ctx.from_pydict({"a": ["01/01/2021 00:00:00"]})
2616+
>>> result = df.select(
2617+
... dfn.functions.to_timestamp_seconds(
2618+
... dfn.col("a"), "%d/%m/%Y %H:%M:%S"
2619+
... ).alias("ts")
2620+
... )
2621+
>>> str(result.collect_column("ts")[0].as_py())
2622+
'2021-01-01 00:00:00'
25432623
"""
2544-
return Expr(f.to_timestamp_seconds(arg.expr, *_unwrap_exprs(formatters)))
2624+
return Expr(
2625+
f.to_timestamp_seconds(
2626+
arg.expr, *_unwrap_exprs(coerce_to_expr_list(formatters))
2627+
)
2628+
)
25452629

25462630

2547-
def to_unixtime(string: Expr, *format_arguments: Expr) -> Expr:
2631+
def to_unixtime(string: Expr, *format_arguments: Expr | str) -> Expr:
25482632
"""Converts a string and optional formats to a Unixtime.
25492633
25502634
Examples:
@@ -2553,8 +2637,23 @@ def to_unixtime(string: Expr, *format_arguments: Expr) -> Expr:
25532637
>>> result = df.select(dfn.functions.to_unixtime(dfn.col("a")).alias("u"))
25542638
>>> result.collect_column("u")[0].as_py()
25552639
0
2640+
2641+
Pass a format string as a bare ``str``:
2642+
2643+
>>> df = ctx.from_pydict({"a": ["01/01/1970 00:00:00"]})
2644+
>>> result = df.select(
2645+
... dfn.functions.to_unixtime(
2646+
... dfn.col("a"), "%d/%m/%Y %H:%M:%S"
2647+
... ).alias("u")
2648+
... )
2649+
>>> result.collect_column("u")[0].as_py()
2650+
0
25562651
"""
2557-
return Expr(f.to_unixtime(string.expr, *_unwrap_exprs(format_arguments)))
2652+
return Expr(
2653+
f.to_unixtime(
2654+
string.expr, *_unwrap_exprs(coerce_to_expr_list(format_arguments))
2655+
)
2656+
)
25582657

25592658

25602659
def current_date() -> Expr:
@@ -2676,28 +2775,43 @@ def datetrunc(part: Expr | str, date: Expr) -> Expr:
26762775
return _date_trunc(part, date, "datetrunc")
26772776

26782777

2679-
def date_bin(stride: Expr, source: Expr, origin: Expr) -> Expr:
2778+
def date_bin(stride: Expr | str, source: Expr | str, origin: Expr | str) -> Expr:
26802779
"""Coerces an arbitrary timestamp to the start of the nearest specified interval.
26812780
26822781
Examples:
26832782
>>> ctx = dfn.SessionContext()
26842783
>>> df = ctx.from_pydict({"timestamp": ['2021-07-15 12:34:56', '2021-01-01']})
26852784
>>> result = df.select(
26862785
... dfn.functions.date_bin(
2687-
... dfn.string_literal("15 minutes"),
2786+
... "15 minutes",
26882787
... dfn.col("timestamp"),
2689-
... dfn.string_literal("2001-01-01 00:00:00")
2788+
... "2001-01-01 00:00:00",
26902789
... ).alias("b")
26912790
... )
26922791
>>> str(result.collect_column("b")[0].as_py())
26932792
'2021-07-15 12:30:00'
26942793
>>> str(result.collect_column("b")[1].as_py())
26952794
'2021-01-01 00:00:00'
2795+
2796+
``source`` may also be a bare literal:
2797+
2798+
>>> result = df.select(
2799+
... dfn.functions.date_bin(
2800+
... "15 minutes", "2021-07-15 12:34:56", "2001-01-01 00:00:00"
2801+
... ).alias("b")
2802+
... )
2803+
>>> str(result.collect_column("b")[0].as_py())
2804+
'2021-07-15 12:30:00'
26962805
"""
2806+
# date_bin's planner coerces Utf8 (not Utf8View) literals to Interval/Timestamp,
2807+
# so wrap bare strs via string_literal to force Utf8.
2808+
stride = Expr.string_literal(stride) if isinstance(stride, str) else stride
2809+
source = Expr.string_literal(source) if isinstance(source, str) else source
2810+
origin = Expr.string_literal(origin) if isinstance(origin, str) else origin
26972811
return Expr(f.date_bin(stride.expr, source.expr, origin.expr))
26982812

26992813

2700-
def make_date(year: Expr, month: Expr, day: Expr) -> Expr:
2814+
def make_date(year: Expr | int, month: Expr | int, day: Expr | int) -> Expr:
27012815
"""Make a date from year, month and day component parts.
27022816
27032817
Examples:
@@ -2709,11 +2823,22 @@ def make_date(year: Expr, month: Expr, day: Expr) -> Expr:
27092823
... dfn.col("d")).alias("dt"))
27102824
>>> result.collect_column("dt")[0].as_py()
27112825
datetime.date(2024, 1, 15)
2826+
2827+
Pass bare ints for any component:
2828+
2829+
>>> df = ctx.from_pydict({"y": [2024]})
2830+
>>> result = df.select(
2831+
... dfn.functions.make_date(dfn.col("y"), 1, 15).alias("dt"))
2832+
>>> result.collect_column("dt")[0].as_py()
2833+
datetime.date(2024, 1, 15)
27122834
"""
2835+
year = coerce_to_expr(year)
2836+
month = coerce_to_expr(month)
2837+
day = coerce_to_expr(day)
27132838
return Expr(f.make_date(year.expr, month.expr, day.expr))
27142839

27152840

2716-
def make_time(hour: Expr, minute: Expr, second: Expr) -> Expr:
2841+
def make_time(hour: Expr | int, minute: Expr | int, second: Expr | int) -> Expr:
27172842
"""Make a time from hour, minute and second component parts.
27182843
27192844
Examples:
@@ -2724,7 +2849,18 @@ def make_time(hour: Expr, minute: Expr, second: Expr) -> Expr:
27242849
... dfn.col("s")).alias("t"))
27252850
>>> result.collect_column("t")[0].as_py()
27262851
datetime.time(12, 30)
2852+
2853+
Pass bare ints for any component:
2854+
2855+
>>> df = ctx.from_pydict({"h": [12]})
2856+
>>> result = df.select(
2857+
... dfn.functions.make_time(dfn.col("h"), 30, 0).alias("t"))
2858+
>>> result.collect_column("t")[0].as_py()
2859+
datetime.time(12, 30)
27272860
"""
2861+
hour = coerce_to_expr(hour)
2862+
minute = coerce_to_expr(minute)
2863+
second = coerce_to_expr(second)
27282864
return Expr(f.make_time(hour.expr, minute.expr, second.expr))
27292865

27302866

0 commit comments

Comments
 (0)