From 8eb40394bee274233f3c45741e6dd1abef645a1a Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 20 May 2026 09:11:50 +0800
Subject: [PATCH] [SPARK-56934][INFRA] Make build_infra_images_cache workflow error tolerant

### What changes were proposed in this pull request?

Make the `build_infra_images_cache.yml` workflow tolerant of individual
image build failures:

- Add `continue-on-error: true` to each of the 12 `Build and push` steps
  so a failure in one does not abort the remaining builds. In particular,
  a failure of the base `./dev/infra/` image build should no longer
  prevent the other image builds from running.
- Add a final "Fail if any image build failed" step that runs with
  `if: always()`, prints each build step's `outcome`, and exits non-zero
  if any was `failure`.

### Why are the changes needed?

Today, a single image build failure aborts the workflow immediately,
leaving the remaining cache layers stale until someone re-triggers the
job. This is especially impactful when the first step (the `./dev/infra/`
base image) fails, because every subsequent image build is then skipped
on that run. With this change every image still gets a chance to build
and refresh its cache on each run, while the overall workflow still fails
if any image build did not succeed.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

YAML parses cleanly (`python3 -c "import yaml; yaml.safe_load(...)"`).
Verified all 12 build steps received `continue-on-error: true` and that
the final aggregator step references every build step's `outcome`.

### Was this patch authored or co-authored using generative AI tooling?

Generated-by: Claude Code (Claude Opus 4.7)

Closes #55972 from zhengruifeng/build-infra-images-continue-on-error.

Authored-by: Ruifeng Zheng
Signed-off-by: Ruifeng Zheng
---
 .../workflows/build_infra_images_cache.yml    | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml
index c3067f28306c3..f59f11dce8eeb 100644
--- a/.github/workflows/build_infra_images_cache.yml
+++ b/.github/workflows/build_infra_images_cache.yml
@@ -63,6 +63,7 @@ jobs:
           password: ${{ secrets.GITHUB_TOKEN }}
       - name: Build and push
         id: docker_build
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/infra/
@@ -75,6 +76,7 @@ jobs:
       - name: Build and push (Documentation)
         if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
         id: docker_build_docs
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/docs/
@@ -88,6 +90,7 @@ jobs:
       - name: Build and push (Linter)
         if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
         id: docker_build_lint
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/lint/
@@ -101,6 +104,7 @@ jobs:
       - name: Build and push (SparkR)
         if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
         id: docker_build_sparkr
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/sparkr/
@@ -114,6 +118,7 @@ jobs:
       - name: Build and push (PySpark with old dependencies)
         if: hashFiles('dev/spark-test-image/python-minimum/Dockerfile') != ''
         id: docker_build_pyspark_python_minimum
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/python-minimum/
@@ -127,6 +132,7 @@ jobs:
       - name: Build and push (PySpark with Python 3.11)
         if: hashFiles('dev/spark-test-image/python-311/Dockerfile') != ''
         id: docker_build_pyspark_python_311
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/python-311/
@@ -140,6 +146,7 @@ jobs:
       - name: Build and push (PySpark Classic Only with Python 3.12)
         if: hashFiles('dev/spark-test-image/python-312-classic-only/Dockerfile') != ''
         id: docker_build_pyspark_python_312_classic_only
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/python-312-classic-only/
@@ -153,6 +160,7 @@ jobs:
       - name: Build and push (PySpark with Python 3.12)
         if: hashFiles('dev/spark-test-image/python-312/Dockerfile') != ''
         id: docker_build_pyspark_python_312
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/python-312/
@@ -166,6 +174,7 @@ jobs:
       - name: Build and push (PySpark with Python 3.12 Pandas 3)
         if: hashFiles('dev/spark-test-image/python-312-pandas-3/Dockerfile') != ''
         id: docker_build_pyspark_python_312_pandas_3
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/python-312-pandas-3/
@@ -179,6 +188,7 @@ jobs:
       - name: Build and push (PySpark with Python 3.13)
         if: hashFiles('dev/spark-test-image/python-313/Dockerfile') != ''
         id: docker_build_pyspark_python_313
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/python-313/
@@ -192,6 +202,7 @@ jobs:
       - name: Build and push (PySpark with Python 3.14)
         if: hashFiles('dev/spark-test-image/python-314/Dockerfile') != ''
         id: docker_build_pyspark_python_314
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/python-314/
@@ -205,6 +216,7 @@ jobs:
       - name: Build and push (PySpark with Python 3.14 no GIL)
         if: hashFiles('dev/spark-test-image/python-314-nogil/Dockerfile') != ''
         id: docker_build_pyspark_python_314_nogil
+        continue-on-error: true
         uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8
         with:
           context: ./dev/spark-test-image/python-314-nogil/
@@ -215,3 +227,35 @@ jobs:
       - name: Image digest (PySpark with Python 3.14 no GIL)
         if: hashFiles('dev/spark-test-image/python-314-nogil/Dockerfile') != ''
         run: echo ${{ steps.docker_build_pyspark_python_314_nogil.outputs.digest }}
+      # Every build step above sets `continue-on-error: true`, so the job keeps
+      # going after a failed build. This step reads each build step's `outcome`
+      # and fails the job if any of them is `failure`.
+      - name: Fail if any image build failed
+        if: always()
+        run: |
+          exit_code=0
+          # Print "<image>: <outcome>" and remember whether any build failed.
+          report() {
+            local image="$1" result="$2"
+            echo "$image: $result"
+            case "$result" in
+              failure) exit_code=1 ;;
+            esac
+          }
+          report "base" "${{ steps.docker_build.outcome }}"
+          report "docs" "${{ steps.docker_build_docs.outcome }}"
+          report "lint" "${{ steps.docker_build_lint.outcome }}"
+          report "sparkr" "${{ steps.docker_build_sparkr.outcome }}"
+          report "pyspark-python-minimum" "${{ steps.docker_build_pyspark_python_minimum.outcome }}"
+          report "pyspark-python-311" "${{ steps.docker_build_pyspark_python_311.outcome }}"
+          report "pyspark-python-312-classic-only" "${{ steps.docker_build_pyspark_python_312_classic_only.outcome }}"
+          report "pyspark-python-312" "${{ steps.docker_build_pyspark_python_312.outcome }}"
+          report "pyspark-python-312-pandas-3" "${{ steps.docker_build_pyspark_python_312_pandas_3.outcome }}"
+          report "pyspark-python-313" "${{ steps.docker_build_pyspark_python_313.outcome }}"
+          report "pyspark-python-314" "${{ steps.docker_build_pyspark_python_314.outcome }}"
+          report "pyspark-python-314-nogil" "${{ steps.docker_build_pyspark_python_314_nogil.outcome }}"
+          if [ "$exit_code" -eq 0 ]; then
+            exit 0
+          fi
+          echo "::error::One or more image builds failed; see entries above marked 'failure'."
+          exit "$exit_code"