From 91283489b32b7f8ac599ea020dd6c578e60d329f Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 19 May 2026 06:11:29 +0000 Subject: [PATCH 1/4] [PYTHON][TESTS] Fix PandasUDFReturnTypeTests for pandas 3 and Ubuntu 24.04 tzdata - Switch the tz-aware fixture from the legacy alias `US/Eastern` to its canonical IANA name `America/New_York`. On Ubuntu 24.04 the system `tzdata` package no longer ships the legacy `US/*` aliases (those moved to `tzdata-legacy`), so under pandas >= 3.0 (which resolves tz via stdlib zoneinfo instead of bundled pytz), the previous fixture raised `ZoneInfoNotFoundError` in CI. - Remap the loaded golden DataFrame in memory when running under pandas >= 3.0 so the pandas-2-generated golden columns still line up: `datetime64[ns]` -> `[us]` and `Categorical` categories `object` -> `str`. Only the column keys are remapped; the on-disk golden file is unchanged. Generated-by: Claude Code --- .../coercion/test_pandas_udf_return_type.py | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py index 454fe726f95cd..23594b2f7f1df 100644 --- a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py +++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py @@ -104,7 +104,7 @@ def test_data(self): np.arange(1, 3).astype("complex128"), [np.array([1, 2, 3], dtype=np.int32), np.array([1, 2, 3], dtype=np.int32)], pd.date_range("19700101", periods=2).values, - pd.date_range("19700101", periods=2, tz="US/Eastern").values, + pd.date_range("19700101", periods=2, tz="America/New_York").values, [pd.Timedelta("1 day"), pd.Timedelta("2 days")], pd.Categorical(["A", "B"]), pd.DataFrame({"_1": [1, 2]}), @@ -160,6 +160,35 @@ def _compare_or_generate_golden(self, golden_file, test_name): golden = None if not generating: golden = self.load_golden_csv(golden_csv) + # The golden file was generated under pandas 2, where the default + # dtypes differ from pandas >= 3.0 (datetime64[ns] vs [us]; object + # vs str for categorical categories). Remap the affected columns + # so the same file works under both versions. + if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"): + rename = {} + for value in self.test_data: + if isinstance(value, np.ndarray) and value.dtype.kind == "M": + new_key = self.repr_value(value) + old_key = self.repr_value(value.astype("datetime64[ns]")) + elif ( + isinstance(value, pd.Categorical) + and value.categories.dtype != object + ): + new_key = self.repr_value(value) + old_key = self.repr_value( + pd.Categorical( + value.tolist(), + categories=pd.Index( + value.categories.tolist(), dtype=object + ), + ) + ) + else: + continue + if old_key != new_key: + rename[old_key] = new_key + if rename: + golden = golden.rename(columns=rename) def work(arg): spark_type, value = arg From 0238742b431f851ee97368e2cf62923dd69dc91a Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 19 May 2026 08:19:53 +0000 Subject: [PATCH 2/4] Patch ns->us cast values and one X->Decimal cell under pandas 3 Extend the pandas-3 in-memory adapter so the value comparisons also line up: - Scale 13+ digit integers in cells of datetime64 / Timedelta-list columns by 1/1000. Pandas 3 returns microseconds where pandas 2 returned nanoseconds for the same cast, e.g. bigint <- pd.date_range(...).values flips from 86_400_000_000_000 to 86_400_000_000. - Override the single decimal(10,0) x ['12','34']@list cell, which flipped from "X" (pandas 2 errored) to [Decimal('12'), Decimal('34')] (pandas 3 succeeds). Test now passes under both pandas 2.3.3 (spark-dev-313) and pandas 3.0.2 (spark-dev-313-p3) locally. Generated-by: Claude Code --- .../coercion/test_pandas_udf_return_type.py | 72 ++++++++++++------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py index 23594b2f7f1df..d70655ec4ffa8 100644 --- a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py +++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py @@ -19,6 +19,7 @@ from decimal import Decimal import itertools import os +import re import unittest from pyspark.sql.functions import pandas_udf @@ -151,6 +152,48 @@ def test_pandas_return_type_coercion_vanilla(self): def _run_pandas_udf_return_type_coercion(self, golden_file, test_name): self._compare_or_generate_golden(golden_file, test_name) + def _patch_golden_for_pandas3(self, golden): + # Rename columns whose key differs between pandas 2 and pandas 3: + # datetime64 ndarrays default to [us] instead of [ns], and Categorical + # categories default to str instead of object. + rename = {} + for value in self.test_data: + if isinstance(value, np.ndarray) and value.dtype.kind == "M": + new_key = self.repr_value(value) + old_key = self.repr_value(value.astype("datetime64[ns]")) + elif isinstance(value, pd.Categorical) and value.categories.dtype != object: + new_key = self.repr_value(value) + old_key = self.repr_value( + pd.Categorical( + value.tolist(), + categories=pd.Index(value.categories.tolist(), dtype=object), + ) + ) + else: + continue + if old_key != new_key: + rename[old_key] = new_key + if rename: + golden.rename(columns=rename, inplace=True) + + # Scale ns->us in datetime64 / Timedelta columns: any 13+ digit integer + # in those cells is a pandas-2 nanosecond value; the pandas-3 cast + # returns microseconds (1000x smaller). + def _ns_to_us(s): + return re.sub(r"\d{13,}", lambda m: str(int(m.group()) // 1000), s) + + scale_cols = [ + c for c in golden.columns if "@ndarray[datetime64[" in c or c.startswith("[Timedelta(") + ] + for col in scale_cols: + golden[col] = golden[col].map(_ns_to_us) + + # Pandas 3 succeeds at coercing string list -> Decimal where pandas 2 + # errored, so the corresponding cell flips from "X" to the new repr. + decimal_col = "['12', '34']@list" + if "decimal(10,0)" in golden.index and decimal_col in golden.columns: + golden.loc["decimal(10,0)", decimal_col] = "[Decimal('12'), Decimal('34')]" + def _compare_or_generate_golden(self, golden_file, test_name): generating = self.is_generating_golden() @@ -162,33 +205,10 @@ def _compare_or_generate_golden(self, golden_file, test_name): golden = self.load_golden_csv(golden_csv) # The golden file was generated under pandas 2, where the default # dtypes differ from pandas >= 3.0 (datetime64[ns] vs [us]; object - # vs str for categorical categories). Remap the affected columns - # so the same file works under both versions. + # vs str for categorical categories). Patch the loaded golden in + # memory so the same file works under both versions. if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"): - rename = {} - for value in self.test_data: - if isinstance(value, np.ndarray) and value.dtype.kind == "M": - new_key = self.repr_value(value) - old_key = self.repr_value(value.astype("datetime64[ns]")) - elif ( - isinstance(value, pd.Categorical) - and value.categories.dtype != object - ): - new_key = self.repr_value(value) - old_key = self.repr_value( - pd.Categorical( - value.tolist(), - categories=pd.Index( - value.categories.tolist(), dtype=object - ), - ) - ) - else: - continue - if old_key != new_key: - rename[old_key] = new_key - if rename: - golden = golden.rename(columns=rename) + self._patch_golden_for_pandas3(golden) def work(arg): spark_type, value = arg From 29424ba60e925bca4e06ef82ab869c21de7b199c Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 19 May 2026 10:06:34 +0000 Subject: [PATCH 3/4] Inline pandas-3 golden patch into _compare_or_generate_golden No behavior change. Folds _patch_golden_for_pandas3 directly into the loader block where it is used, since it is only called once. Also replaces the local re.sub helper with Series.str.replace(regex=True) to drop the `import re`. Generated-by: Claude Code --- .../coercion/test_pandas_udf_return_type.py | 93 +++++++++---------- 1 file changed, 45 insertions(+), 48 deletions(-) diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py index d70655ec4ffa8..9fd21309cb1b9 100644 --- a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py +++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py @@ -19,7 +19,6 @@ from decimal import Decimal import itertools import os -import re import unittest from pyspark.sql.functions import pandas_udf @@ -152,48 +151,6 @@ def test_pandas_return_type_coercion_vanilla(self): def _run_pandas_udf_return_type_coercion(self, golden_file, test_name): self._compare_or_generate_golden(golden_file, test_name) - def _patch_golden_for_pandas3(self, golden): - # Rename columns whose key differs between pandas 2 and pandas 3: - # datetime64 ndarrays default to [us] instead of [ns], and Categorical - # categories default to str instead of object. - rename = {} - for value in self.test_data: - if isinstance(value, np.ndarray) and value.dtype.kind == "M": - new_key = self.repr_value(value) - old_key = self.repr_value(value.astype("datetime64[ns]")) - elif isinstance(value, pd.Categorical) and value.categories.dtype != object: - new_key = self.repr_value(value) - old_key = self.repr_value( - pd.Categorical( - value.tolist(), - categories=pd.Index(value.categories.tolist(), dtype=object), - ) - ) - else: - continue - if old_key != new_key: - rename[old_key] = new_key - if rename: - golden.rename(columns=rename, inplace=True) - - # Scale ns->us in datetime64 / Timedelta columns: any 13+ digit integer - # in those cells is a pandas-2 nanosecond value; the pandas-3 cast - # returns microseconds (1000x smaller). - def _ns_to_us(s): - return re.sub(r"\d{13,}", lambda m: str(int(m.group()) // 1000), s) - - scale_cols = [ - c for c in golden.columns if "@ndarray[datetime64[" in c or c.startswith("[Timedelta(") - ] - for col in scale_cols: - golden[col] = golden[col].map(_ns_to_us) - - # Pandas 3 succeeds at coercing string list -> Decimal where pandas 2 - # errored, so the corresponding cell flips from "X" to the new repr. - decimal_col = "['12', '34']@list" - if "decimal(10,0)" in golden.index and decimal_col in golden.columns: - golden.loc["decimal(10,0)", decimal_col] = "[Decimal('12'), Decimal('34')]" - def _compare_or_generate_golden(self, golden_file, test_name): generating = self.is_generating_golden() @@ -203,12 +160,52 @@ def _compare_or_generate_golden(self, golden_file, test_name): golden = None if not generating: golden = self.load_golden_csv(golden_csv) - # The golden file was generated under pandas 2, where the default - # dtypes differ from pandas >= 3.0 (datetime64[ns] vs [us]; object - # vs str for categorical categories). Patch the loaded golden in - # memory so the same file works under both versions. + # The golden file was generated under pandas 2; patch the loaded + # copy in memory so the same file works under pandas >= 3.0. if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"): - self._patch_golden_for_pandas3(golden) + # Rename columns whose key differs between the two versions: + # datetime64 ndarrays default to [us] instead of [ns], and + # Categorical categories default to str instead of object. + rename = {} + for value in self.test_data: + if isinstance(value, np.ndarray) and value.dtype.kind == "M": + new_key = self.repr_value(value) + old_key = self.repr_value(value.astype("datetime64[ns]")) + elif isinstance(value, pd.Categorical) and value.categories.dtype != object: + new_key = self.repr_value(value) + old_key = self.repr_value( + pd.Categorical( + value.tolist(), + categories=pd.Index(value.categories.tolist(), dtype=object), + ) + ) + else: + continue + if old_key != new_key: + rename[old_key] = new_key + if rename: + golden.rename(columns=rename, inplace=True) + + # Scale ns->us in datetime64 / Timedelta columns: any 13+ digit + # integer in those cells is a pandas-2 nanosecond value; the + # pandas-3 cast returns microseconds (1000x smaller). + scale_cols = [ + c + for c in golden.columns + if "@ndarray[datetime64[" in c or c.startswith("[Timedelta(") + ] + for col in scale_cols: + golden[col] = golden[col].str.replace( + r"\d{13,}", + lambda m: str(int(m.group()) // 1000), + regex=True, + ) + + # Pandas 3 succeeds at coercing string list -> Decimal where + # pandas 2 errored, so the corresponding cell flips from "X". + decimal_col = "['12', '34']@list" + if "decimal(10,0)" in golden.index and decimal_col in golden.columns: + golden.loc["decimal(10,0)", decimal_col] = "[Decimal('12'), Decimal('34')]" def work(arg): spark_type, value = arg From 7230f4855a1d940ee6709bf4a4140fd824ec830c Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 19 May 2026 11:41:04 +0000 Subject: [PATCH 4/4] Locate pandas-3 patch targets via repr_* instead of column substring No behavior change. Use self.repr_value(value) and self.repr_type(...) to derive both rename and scale targets directly from self.test_data and the affected Spark type, instead of grep-matching the golden column names. Single loop over test_data builds both rename and scale_cols. Generated-by: Claude Code --- .../coercion/test_pandas_udf_return_type.py | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py index 9fd21309cb1b9..f1ba3cd847239 100644 --- a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py +++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py @@ -161,39 +161,35 @@ def _compare_or_generate_golden(self, golden_file, test_name): if not generating: golden = self.load_golden_csv(golden_csv) # The golden file was generated under pandas 2; patch the loaded - # copy in memory so the same file works under pandas >= 3.0. + # copy in memory so the same file works under pandas >= 3.0, where + # the defaults differ: datetime64 ndarrays use [us] instead of [ns], + # Categorical categories use str instead of object, and the same + # casts return microseconds instead of nanoseconds. if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"): - # Rename columns whose key differs between the two versions: - # datetime64 ndarrays default to [us] instead of [ns], and - # Categorical categories default to str instead of object. rename = {} + scale_cols = [] for value in self.test_data: + new_key = self.repr_value(value) if isinstance(value, np.ndarray) and value.dtype.kind == "M": - new_key = self.repr_value(value) old_key = self.repr_value(value.astype("datetime64[ns]")) + if old_key != new_key: + rename[old_key] = new_key + scale_cols.append(new_key) elif isinstance(value, pd.Categorical) and value.categories.dtype != object: - new_key = self.repr_value(value) old_key = self.repr_value( pd.Categorical( value.tolist(), categories=pd.Index(value.categories.tolist(), dtype=object), ) ) - else: - continue - if old_key != new_key: - rename[old_key] = new_key + if old_key != new_key: + rename[old_key] = new_key + elif isinstance(value, list) and value and isinstance(value[0], pd.Timedelta): + scale_cols.append(new_key) + if rename: golden.rename(columns=rename, inplace=True) - # Scale ns->us in datetime64 / Timedelta columns: any 13+ digit - # integer in those cells is a pandas-2 nanosecond value; the - # pandas-3 cast returns microseconds (1000x smaller). - scale_cols = [ - c - for c in golden.columns - if "@ndarray[datetime64[" in c or c.startswith("[Timedelta(") - ] for col in scale_cols: golden[col] = golden[col].str.replace( r"\d{13,}", @@ -203,9 +199,10 @@ def _compare_or_generate_golden(self, golden_file, test_name): # Pandas 3 succeeds at coercing string list -> Decimal where # pandas 2 errored, so the corresponding cell flips from "X". - decimal_col = "['12', '34']@list" - if "decimal(10,0)" in golden.index and decimal_col in golden.columns: - golden.loc["decimal(10,0)", decimal_col] = "[Decimal('12'), Decimal('34')]" + decimal_idx = self.repr_type(DecimalType(10, 0)) + decimal_col = self.repr_value(["12", "34"]) + if decimal_idx in golden.index and decimal_col in golden.columns: + golden.loc[decimal_idx, decimal_col] = "[Decimal('12'), Decimal('34')]" def work(arg): spark_type, value = arg