From 91283489b32b7f8ac599ea020dd6c578e60d329f Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng <ruifengz@apache.org>
Date: Tue, 19 May 2026 06:11:29 +0000
Subject: [PATCH 1/4] [PYTHON][TESTS] Fix PandasUDFReturnTypeTests for pandas 3
 and Ubuntu 24.04 tzdata

- Switch the tz-aware fixture from the legacy alias `US/Eastern` to its
  canonical IANA name `America/New_York`. On Ubuntu 24.04 the system
  `tzdata` package no longer ships the legacy `US/*` aliases (those
  moved to `tzdata-legacy`), so under pandas >= 3.0 (which resolves tz
  via stdlib zoneinfo instead of bundled pytz), the previous fixture
  raised `ZoneInfoNotFoundError` in CI.
- Remap the loaded golden DataFrame in memory when running under pandas
  >= 3.0 so the pandas-2-generated golden columns still line up:
  `datetime64[ns]` -> `[us]` and `Categorical` categories `object` ->
  `str`. Only the column keys are remapped; the on-disk golden file is
  unchanged.

Generated-by: Claude Code
---
 .../coercion/test_pandas_udf_return_type.py   | 31 ++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
index 454fe726f95cd..23594b2f7f1df 100644
--- a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
+++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
@@ -104,7 +104,7 @@ def test_data(self):
             np.arange(1, 3).astype("complex128"),
             [np.array([1, 2, 3], dtype=np.int32), np.array([1, 2, 3], dtype=np.int32)],
             pd.date_range("19700101", periods=2).values,
-            pd.date_range("19700101", periods=2, tz="US/Eastern").values,
+            pd.date_range("19700101", periods=2, tz="America/New_York").values,
             [pd.Timedelta("1 day"), pd.Timedelta("2 days")],
             pd.Categorical(["A", "B"]),
             pd.DataFrame({"_1": [1, 2]}),
@@ -160,6 +160,35 @@ def _compare_or_generate_golden(self, golden_file, test_name):
         golden = None
         if not generating:
             golden = self.load_golden_csv(golden_csv)
+            # The golden file was generated under pandas 2, where the default
+            # dtypes differ from pandas >= 3.0 (datetime64[ns] vs [us]; object
+            # vs str for categorical categories). Remap the affected columns
+            # so the same file works under both versions.
+            if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
+                rename = {}
+                for value in self.test_data:
+                    if isinstance(value, np.ndarray) and value.dtype.kind == "M":
+                        new_key = self.repr_value(value)
+                        old_key = self.repr_value(value.astype("datetime64[ns]"))
+                    elif (
+                        isinstance(value, pd.Categorical)
+                        and value.categories.dtype != object
+                    ):
+                        new_key = self.repr_value(value)
+                        old_key = self.repr_value(
+                            pd.Categorical(
+                                value.tolist(),
+                                categories=pd.Index(
+                                    value.categories.tolist(), dtype=object
+                                ),
+                            )
+                        )
+                    else:
+                        continue
+                    if old_key != new_key:
+                        rename[old_key] = new_key
+                if rename:
+                    golden = golden.rename(columns=rename)
 
         def work(arg):
             spark_type, value = arg

From 0238742b431f851ee97368e2cf62923dd69dc91a Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng <ruifengz@apache.org>
Date: Tue, 19 May 2026 08:19:53 +0000
Subject: [PATCH 2/4] Patch ns->us cast values and one X->Decimal cell under
 pandas 3

Extend the pandas-3 in-memory adapter so the value comparisons also
line up:

- Integer-divide every 13+ digit integer in the cells of datetime64 /
  Timedelta-list columns by 1000. Pandas 3 returns microseconds where
  pandas 2 returned nanoseconds for the same cast, e.g. the bigint
  result for pd.date_range(...).values flips from 86_400_000_000_000
  to 86_400_000_000.
- Override the single cell at row `decimal(10,0)`, column
  `['12', '34']@list`, which flipped from "X" (pandas 2 errored) to
  `[Decimal('12'), Decimal('34')]` (pandas 3 succeeds).

Test now passes under both pandas 2.3.3 (spark-dev-313) and pandas
3.0.2 (spark-dev-313-p3) locally.

Generated-by: Claude Code
---
 .../coercion/test_pandas_udf_return_type.py   | 72 ++++++++++++-------
 1 file changed, 46 insertions(+), 26 deletions(-)

diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
index 23594b2f7f1df..d70655ec4ffa8 100644
--- a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
+++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
@@ -19,6 +19,7 @@
 from decimal import Decimal
 import itertools
 import os
+import re
 import unittest
 
 from pyspark.sql.functions import pandas_udf
@@ -151,6 +152,48 @@ def test_pandas_return_type_coercion_vanilla(self):
     def _run_pandas_udf_return_type_coercion(self, golden_file, test_name):
         self._compare_or_generate_golden(golden_file, test_name)
 
+    def _patch_golden_for_pandas3(self, golden):
+        # Rename columns whose key differs between pandas 2 and pandas 3:
+        # datetime64 ndarrays default to [us] instead of [ns], and Categorical
+        # categories default to str instead of object.
+        rename = {}
+        for value in self.test_data:
+            if isinstance(value, np.ndarray) and value.dtype.kind == "M":
+                new_key = self.repr_value(value)
+                old_key = self.repr_value(value.astype("datetime64[ns]"))
+            elif isinstance(value, pd.Categorical) and value.categories.dtype != object:
+                new_key = self.repr_value(value)
+                old_key = self.repr_value(
+                    pd.Categorical(
+                        value.tolist(),
+                        categories=pd.Index(value.categories.tolist(), dtype=object),
+                    )
+                )
+            else:
+                continue
+            if old_key != new_key:
+                rename[old_key] = new_key
+        if rename:
+            golden.rename(columns=rename, inplace=True)
+
+        # Scale ns->us in datetime64 / Timedelta columns: any 13+ digit integer
+        # in those cells is a pandas-2 nanosecond value; the pandas-3 cast
+        # returns microseconds (1000x smaller).
+        def _ns_to_us(s):
+            return re.sub(r"\d{13,}", lambda m: str(int(m.group()) // 1000), s)
+
+        scale_cols = [
+            c for c in golden.columns if "@ndarray[datetime64[" in c or c.startswith("[Timedelta(")
+        ]
+        for col in scale_cols:
+            golden[col] = golden[col].map(_ns_to_us)
+
+        # Pandas 3 succeeds at coercing string list -> Decimal where pandas 2
+        # errored, so the corresponding cell flips from "X" to the new repr.
+        decimal_col = "['12', '34']@list"
+        if "decimal(10,0)" in golden.index and decimal_col in golden.columns:
+            golden.loc["decimal(10,0)", decimal_col] = "[Decimal('12'), Decimal('34')]"
+
     def _compare_or_generate_golden(self, golden_file, test_name):
         generating = self.is_generating_golden()
 
@@ -162,33 +205,10 @@ def _compare_or_generate_golden(self, golden_file, test_name):
             golden = self.load_golden_csv(golden_csv)
             # The golden file was generated under pandas 2, where the default
             # dtypes differ from pandas >= 3.0 (datetime64[ns] vs [us]; object
-            # vs str for categorical categories). Remap the affected columns
-            # so the same file works under both versions.
+            # vs str for categorical categories). Patch the loaded golden in
+            # memory so the same file works under both versions.
             if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
-                rename = {}
-                for value in self.test_data:
-                    if isinstance(value, np.ndarray) and value.dtype.kind == "M":
-                        new_key = self.repr_value(value)
-                        old_key = self.repr_value(value.astype("datetime64[ns]"))
-                    elif (
-                        isinstance(value, pd.Categorical)
-                        and value.categories.dtype != object
-                    ):
-                        new_key = self.repr_value(value)
-                        old_key = self.repr_value(
-                            pd.Categorical(
-                                value.tolist(),
-                                categories=pd.Index(
-                                    value.categories.tolist(), dtype=object
-                                ),
-                            )
-                        )
-                    else:
-                        continue
-                    if old_key != new_key:
-                        rename[old_key] = new_key
-                if rename:
-                    golden = golden.rename(columns=rename)
+                self._patch_golden_for_pandas3(golden)
 
         def work(arg):
             spark_type, value = arg

From 29424ba60e925bca4e06ef82ab869c21de7b199c Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng <ruifengz@apache.org>
Date: Tue, 19 May 2026 10:06:34 +0000
Subject: [PATCH 3/4] Inline pandas-3 golden patch into
 _compare_or_generate_golden

No behavior change. Folds _patch_golden_for_pandas3 directly into the
loader block where it is used, since it is only called once. Also
replaces the local re.sub helper with Series.str.replace(regex=True)
to drop the `import re`.

Generated-by: Claude Code
---
 .../coercion/test_pandas_udf_return_type.py   | 93 +++++++++----------
 1 file changed, 45 insertions(+), 48 deletions(-)

diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
index d70655ec4ffa8..9fd21309cb1b9 100644
--- a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
+++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
@@ -19,7 +19,6 @@
 from decimal import Decimal
 import itertools
 import os
-import re
 import unittest
 
 from pyspark.sql.functions import pandas_udf
@@ -152,48 +151,6 @@ def test_pandas_return_type_coercion_vanilla(self):
     def _run_pandas_udf_return_type_coercion(self, golden_file, test_name):
         self._compare_or_generate_golden(golden_file, test_name)
 
-    def _patch_golden_for_pandas3(self, golden):
-        # Rename columns whose key differs between pandas 2 and pandas 3:
-        # datetime64 ndarrays default to [us] instead of [ns], and Categorical
-        # categories default to str instead of object.
-        rename = {}
-        for value in self.test_data:
-            if isinstance(value, np.ndarray) and value.dtype.kind == "M":
-                new_key = self.repr_value(value)
-                old_key = self.repr_value(value.astype("datetime64[ns]"))
-            elif isinstance(value, pd.Categorical) and value.categories.dtype != object:
-                new_key = self.repr_value(value)
-                old_key = self.repr_value(
-                    pd.Categorical(
-                        value.tolist(),
-                        categories=pd.Index(value.categories.tolist(), dtype=object),
-                    )
-                )
-            else:
-                continue
-            if old_key != new_key:
-                rename[old_key] = new_key
-        if rename:
-            golden.rename(columns=rename, inplace=True)
-
-        # Scale ns->us in datetime64 / Timedelta columns: any 13+ digit integer
-        # in those cells is a pandas-2 nanosecond value; the pandas-3 cast
-        # returns microseconds (1000x smaller).
-        def _ns_to_us(s):
-            return re.sub(r"\d{13,}", lambda m: str(int(m.group()) // 1000), s)
-
-        scale_cols = [
-            c for c in golden.columns if "@ndarray[datetime64[" in c or c.startswith("[Timedelta(")
-        ]
-        for col in scale_cols:
-            golden[col] = golden[col].map(_ns_to_us)
-
-        # Pandas 3 succeeds at coercing string list -> Decimal where pandas 2
-        # errored, so the corresponding cell flips from "X" to the new repr.
-        decimal_col = "['12', '34']@list"
-        if "decimal(10,0)" in golden.index and decimal_col in golden.columns:
-            golden.loc["decimal(10,0)", decimal_col] = "[Decimal('12'), Decimal('34')]"
-
     def _compare_or_generate_golden(self, golden_file, test_name):
         generating = self.is_generating_golden()
 
@@ -203,12 +160,52 @@ def _compare_or_generate_golden(self, golden_file, test_name):
         golden = None
         if not generating:
             golden = self.load_golden_csv(golden_csv)
-            # The golden file was generated under pandas 2, where the default
-            # dtypes differ from pandas >= 3.0 (datetime64[ns] vs [us]; object
-            # vs str for categorical categories). Patch the loaded golden in
-            # memory so the same file works under both versions.
+            # The golden file was generated under pandas 2; patch the loaded
+            # copy in memory so the same file works under pandas >= 3.0.
             if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
-                self._patch_golden_for_pandas3(golden)
+                # Rename columns whose key differs between the two versions:
+                # datetime64 ndarrays default to [us] instead of [ns], and
+                # Categorical categories default to str instead of object.
+                rename = {}
+                for value in self.test_data:
+                    if isinstance(value, np.ndarray) and value.dtype.kind == "M":
+                        new_key = self.repr_value(value)
+                        old_key = self.repr_value(value.astype("datetime64[ns]"))
+                    elif isinstance(value, pd.Categorical) and value.categories.dtype != object:
+                        new_key = self.repr_value(value)
+                        old_key = self.repr_value(
+                            pd.Categorical(
+                                value.tolist(),
+                                categories=pd.Index(value.categories.tolist(), dtype=object),
+                            )
+                        )
+                    else:
+                        continue
+                    if old_key != new_key:
+                        rename[old_key] = new_key
+                if rename:
+                    golden.rename(columns=rename, inplace=True)
+
+                # Scale ns->us in datetime64 / Timedelta columns: any 13+ digit
+                # integer in those cells is a pandas-2 nanosecond value; the
+                # pandas-3 cast returns microseconds (1000x smaller).
+                scale_cols = [
+                    c
+                    for c in golden.columns
+                    if "@ndarray[datetime64[" in c or c.startswith("[Timedelta(")
+                ]
+                for col in scale_cols:
+                    golden[col] = golden[col].str.replace(
+                        r"\d{13,}",
+                        lambda m: str(int(m.group()) // 1000),
+                        regex=True,
+                    )
+
+                # Pandas 3 succeeds at coercing string list -> Decimal where
+                # pandas 2 errored, so the corresponding cell flips from "X".
+                decimal_col = "['12', '34']@list"
+                if "decimal(10,0)" in golden.index and decimal_col in golden.columns:
+                    golden.loc["decimal(10,0)", decimal_col] = "[Decimal('12'), Decimal('34')]"
 
         def work(arg):
             spark_type, value = arg

From 7230f4855a1d940ee6709bf4a4140fd824ec830c Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng <ruifengz@apache.org>
Date: Tue, 19 May 2026 11:41:04 +0000
Subject: [PATCH 4/4] Locate pandas-3 patch targets via repr_* instead of
 column substring

No behavior change. Use self.repr_value(value) and self.repr_type(...)
to derive both rename and scale targets directly from self.test_data
and the affected Spark type, instead of grep-matching the golden
column names. Single loop over test_data builds both rename and
scale_cols.

Generated-by: Claude Code
---
 .../coercion/test_pandas_udf_return_type.py   | 39 +++++++++----------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
index 9fd21309cb1b9..f1ba3cd847239 100644
--- a/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
+++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_return_type.py
@@ -161,39 +161,35 @@ def _compare_or_generate_golden(self, golden_file, test_name):
         if not generating:
             golden = self.load_golden_csv(golden_csv)
             # The golden file was generated under pandas 2; patch the loaded
-            # copy in memory so the same file works under pandas >= 3.0.
+            # copy in memory so the same file works under pandas >= 3.0, where
+            # the defaults differ: datetime64 ndarrays use [us] instead of [ns],
+            # Categorical categories use str instead of object, and the same
+            # casts return microseconds instead of nanoseconds.
             if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
-                # Rename columns whose key differs between the two versions:
-                # datetime64 ndarrays default to [us] instead of [ns], and
-                # Categorical categories default to str instead of object.
                 rename = {}
+                scale_cols = []
                 for value in self.test_data:
+                    new_key = self.repr_value(value)
                     if isinstance(value, np.ndarray) and value.dtype.kind == "M":
-                        new_key = self.repr_value(value)
                         old_key = self.repr_value(value.astype("datetime64[ns]"))
+                        if old_key != new_key:
+                            rename[old_key] = new_key
+                        scale_cols.append(new_key)
                     elif isinstance(value, pd.Categorical) and value.categories.dtype != object:
-                        new_key = self.repr_value(value)
                         old_key = self.repr_value(
                             pd.Categorical(
                                 value.tolist(),
                                 categories=pd.Index(value.categories.tolist(), dtype=object),
                             )
                         )
-                    else:
-                        continue
-                    if old_key != new_key:
-                        rename[old_key] = new_key
+                        if old_key != new_key:
+                            rename[old_key] = new_key
+                    elif isinstance(value, list) and value and isinstance(value[0], pd.Timedelta):
+                        scale_cols.append(new_key)
+
                 if rename:
                     golden.rename(columns=rename, inplace=True)
 
-                # Scale ns->us in datetime64 / Timedelta columns: any 13+ digit
-                # integer in those cells is a pandas-2 nanosecond value; the
-                # pandas-3 cast returns microseconds (1000x smaller).
-                scale_cols = [
-                    c
-                    for c in golden.columns
-                    if "@ndarray[datetime64[" in c or c.startswith("[Timedelta(")
-                ]
                 for col in scale_cols:
                     golden[col] = golden[col].str.replace(
                         r"\d{13,}",
@@ -203,9 +199,10 @@ def _compare_or_generate_golden(self, golden_file, test_name):
 
                 # Pandas 3 succeeds at coercing string list -> Decimal where
                 # pandas 2 errored, so the corresponding cell flips from "X".
-                decimal_col = "['12', '34']@list"
-                if "decimal(10,0)" in golden.index and decimal_col in golden.columns:
-                    golden.loc["decimal(10,0)", decimal_col] = "[Decimal('12'), Decimal('34')]"
+                decimal_idx = self.repr_type(DecimalType(10, 0))
+                decimal_col = self.repr_value(["12", "34"])
+                if decimal_idx in golden.index and decimal_col in golden.columns:
+                    golden.loc[decimal_idx, decimal_col] = "[Decimal('12'), Decimal('34')]"
 
         def work(arg):
             spark_type, value = arg