runpod · deanq · Mar 23, 2026 · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026
diff --git a/.github/workflows/CI-e2e.yml b/.github/workflows/CI-e2e.yml
@@ -1,93 +1,38 @@
-# Performs a full test of the package within production environment.
-
-name: CI | End-to-End Runpod Python Tests
-
+name: CI-e2e
 on:
   push:
-    branches:
-      - main
-
+    branches: [main]
   pull_request:
-    branches:
-      - main
-
+    branches: [main]
   workflow_dispatch:
 
 jobs:
-  e2e-build:
-    name: Build and push mock-worker Docker image
+  e2e:
     if: github.repository == 'runpod/runpod-python'
     runs-on: ubuntu-latest
-    outputs:
-      docker_tag: ${{ steps.output_docker_tag.outputs.docker_tag }}
-
+    timeout-minutes: 15
     steps:
-      - name: Checkout Repo
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 2
-
-      - name: Clone and patch mock-worker
-        run: |
-          git clone https://github.com/runpod-workers/mock-worker
-          GIT_SHA=${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          echo "git+https://github.com/runpod/runpod-python.git@$GIT_SHA" > mock-worker/builder/requirements.txt
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+      - uses: actions/checkout@v4
 
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+      - uses: astral-sh/setup-uv@v6
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
+      - uses: actions/setup-python@v5
         with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          python-version: "3.12"
 
-      - name: Define Docker Tag
-        id: docker_tag
+      - name: Install dependencies
         run: |
-          DOCKER_TAG=${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          echo "DOCKER_TAG=$(echo $DOCKER_TAG | cut -c 1-7)" >> $GITHUB_ENV
-
-      - name: Set Docker Tag as Output
-        id: output_docker_tag
-        run: echo "docker_tag=${{ env.DOCKER_TAG }}" >> $GITHUB_OUTPUT
-
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v6
-        with:
-          context: ./mock-worker
-          file: ./mock-worker/Dockerfile
-          push: true
-          tags: ${{ vars.DOCKERHUB_REPO }}/${{ vars.DOCKERHUB_IMG }}:${{ env.DOCKER_TAG }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-
-  test:
-    name: Run End-to-End Tests
-    runs-on: ubuntu-latest
-    needs: [e2e-build]
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Run Tests
-        id: run-tests
-        uses: runpod/[email protected]
-        with:
-          image-tag: ${{ vars.DOCKERHUB_REPO }}/${{ vars.DOCKERHUB_IMG }}:${{ needs.e2e-build.outputs.docker_tag }}
-          runpod-api-key: ${{ secrets.RUNPOD_API_KEY }}
-          request-timeout: 1200
-
-      - name: Verify Tests
-        env:
-          TOTAL_TESTS: ${{ steps.run-tests.outputs.total-tests }}
-          SUCCESSFUL_TESTS: ${{ steps.run-tests.outputs.succeeded }}
+          # Create the virtualenv at .venv and activate it for the rest of this step
+          uv venv
+          source .venv/bin/activate
+          # Prefer the "test" extra; if that install fails, fall back to a plain editable install
+          uv pip install -e ".[test]" --quiet || uv pip install -e .
+          # Test tooling: pytest with the asyncio/timeout/rerun plugins, plus runpod-flash and httpx
+          uv pip install runpod-flash pytest pytest-asyncio pytest-timeout pytest-rerunfailures httpx
+          # Reinstall the checkout itself, without dependencies, after the tooling install above.
+          # NOTE(review): presumably so a runpod pulled in by another package cannot replace the local editable copy — confirm
+          uv pip install -e . --reinstall --no-deps
+          # Print the imported runpod's version and file path as a sanity check
+          python -c "import runpod; print(f'runpod: {runpod.__version__} from {runpod.__file__}')"
+
+      - name: Run e2e tests
         run: |
-          echo "Total tests: $TOTAL_TESTS"
-          echo "Successful tests: $SUCCESSFUL_TESTS"
-          if [ "$TOTAL_TESTS" != "$SUCCESSFUL_TESTS" ]; then
-              exit 1
-          fi
+          source .venv/bin/activate
+          # -o "addopts=" blanks the addopts ini option for this run; -p no:xdist disables the xdist plugin.
+          # NOTE(review): --timeout=600 combined with --reruns 1 presumably lets one test run ~20 minutes including its rerun, longer than this job's timeout-minutes: 15 — confirm the limits are intended.
+          pytest tests/e2e/ -v -p no:xdist --timeout=600 --reruns 1 --reruns-delay 5 --log-cli-level=INFO -o "addopts="
+        env:
+          RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
+          RUNPOD_SDK_GIT_REF: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
diff --git a/.github/workflows/cleanup-endpoints.yml b/.github/workflows/cleanup-endpoints.yml
@@ -0,0 +1,110 @@
+name: Cleanup stale endpoints
+# Manual-only maintenance workflow: lists endpoints on the account behind
+# RUNPOD_API_KEY and, when dry_run is "false", deletes the selected ones.
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "List endpoints without deleting (true/false)"
+        required: true
+        default: "true"
+        type: choice
+        options:
+          - "true"
+          - "false"
+      name_filter:
+        description: "Only delete endpoints whose name contains this string (empty = all)"
+        required: false
+        default: ""
+        type: string
+
+# The job only calls the RunPod API and never uses the GITHUB_TOKEN,
+# so grant the token no permissions at all.
+permissions: {}
+
+jobs:
+  cleanup:
+    if: github.repository == 'runpod/runpod-python'
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - name: Cleanup endpoints
+        env:
+          RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
+          DRY_RUN: ${{ inputs.dry_run }}
+          NAME_FILTER: ${{ inputs.name_filter }}
+        run: |
+          python3 - <<'SCRIPT'
+          import json
+          import os
+          import urllib.request
+
+          # GraphQL endpoint that every request in this script is POSTed to.
+          API_URL = "https://api.runpod.io/graphql"
+          # Required: an unset variable raises KeyError before any request is made.
+          API_KEY = os.environ["RUNPOD_API_KEY"]
+          # Fail safe for a destructive job: only an explicit "false" enables
+          # deletion; anything else (unset, empty, "True", typos) stays read-only.
+          DRY_RUN = os.environ.get("DRY_RUN", "true").strip().lower() != "false"
+          # Substring matched against endpoint names; empty means "no filter".
+          NAME_FILTER = os.environ.get("NAME_FILTER", "").strip()
+
+          def graphql(query, variables=None):
+              """POST a GraphQL document to API_URL and return the decoded JSON reply.
+
+              Args:
+                  query: GraphQL query or mutation text.
+                  variables: Optional dict of variables; sent as {} when omitted.
+
+              Returns:
+                  The parsed JSON response body.
+
+              Raises:
+                  urllib.error.URLError: On connection failure or a non-2xx status
+                      (HTTPError is a URLError subclass).
+                  TimeoutError: If the server stalls past the request timeout.
+                  json.JSONDecodeError: If the response body is not valid JSON.
+              """
+              payload = json.dumps({"query": query, "variables": variables or {}}).encode()
+              req = urllib.request.Request(
+                  API_URL,
+                  data=payload,
+                  headers={
+                      "Content-Type": "application/json",
+                      # Send the key as a header rather than in the query string so
+                      # it cannot leak through URL logging or error messages.
+                      "Authorization": f"Bearer {API_KEY}",
+                  },
+              )
+              # Bound each call so one stalled request cannot consume the job timeout.
+              with urllib.request.urlopen(req, timeout=30) as resp:
+                  return json.loads(resp.read())
+
+          # List all endpoints
+          result = graphql("""
+              query {
+                  myself {
+                      endpoints {
+                          id
+                          name
+                          workersMin
+                          workersMax
+                          createdAt
+                      }
+                  }
+              }
+          """)
+
+          # A rejected query is reported through an "errors" array and may carry
+          # "data": null. Abort with a failing exit code instead of misreporting
+          # the failed lookup as "No endpoints found.".
+          if result.get("errors"):
+              print(f"Failed to list endpoints: {result['errors']}")
+              raise SystemExit(1)
+
+          # Any level of the reply may be null, so fall back to empty containers
+          # rather than calling .get() on None.
+          endpoints = ((result.get("data") or {}).get("myself") or {}).get("endpoints") or []
+          if not endpoints:
+              print("No endpoints found.")
+              raise SystemExit(0)
+
+          # Filter if requested
+          if NAME_FILTER:
+              # `or ""` also covers a name that is present but null.
+              targets = [ep for ep in endpoints if NAME_FILTER in (ep.get("name") or "")]
+              print(f"Filter '{NAME_FILTER}' matched {len(targets)}/{len(endpoints)} endpoints")
+          else:
+              targets = endpoints
+              print(f"Found {len(targets)} total endpoints (no filter applied)")
+
+          action = "DRY RUN — Listing" if DRY_RUN else "Deleting"
+          print(f"\n{action} {len(targets)} endpoint(s):\n")
+          # Sorted by createdAt ascending; a null timestamp sorts as "" instead of
+          # raising TypeError when compared with a string.
+          for ep in sorted(targets, key=lambda e: e.get("createdAt") or ""):
+              # Resolve null fields first: None cannot take the "<40" format spec.
+              name = ep.get("name") or "(unnamed)"
+              created = ep.get("createdAt") or "unknown"
+              print(f"  {ep['id']}  {name:<40}  "
+                    f"workers={ep.get('workersMin', '?')}-{ep.get('workersMax', '?')}  "
+                    f"created={created}")
+
+          if DRY_RUN:
+              print("\nDry run complete. Re-run with dry_run=false to delete.")
+              raise SystemExit(0)
+
+          # Delete each endpoint; one failure never stops the remaining deletions.
+          deleted = 0
+          failed = 0
+          for ep in targets:
+              ep_id = ep["id"]
+              ep_name = ep.get("name") or "(unnamed)"
+              try:
+                  resp = graphql(
+                      "mutation deleteEndpoint($id: String!) { deleteEndpoint(id: $id) }",
+                      {"id": ep_id},
+                  )
+                  if resp.get("errors"):
+                      print(f"  FAILED  {ep_id}  {ep_name}: {resp['errors']}")
+                      failed += 1
+                  else:
+                      print(f"  DELETED {ep_id}  {ep_name}")
+                      deleted += 1
+              except Exception as exc:
+                  # Transport-level problems (HTTP error, timeout, bad JSON) count
+                  # as a failure for this endpoint only.
+                  print(f"  ERROR   {ep_id}  {ep_name}: {exc}")
+                  failed += 1
+
+          print(f"\nDone: {deleted} deleted, {failed} failed, {len(endpoints) - len(targets)} skipped (filtered)")
+          # Report partial failures as a failed workflow run instead of a silent success.
+          if failed:
+              raise SystemExit(1)
+          SCRIPT
diff --git a/pytest.ini b/pytest.ini
@@ -1,5 +1,9 @@
 [pytest]
 addopts = --durations=10 --cov-config=.coveragerc --timeout=120 --timeout_method=thread --cov=runpod --cov-report=xml --cov-report=term-missing --cov-fail-under=90 -W error -p no:cacheprovider -p no:unraisableexception
 python_files = tests.py test_*.py *_test.py
-norecursedirs = venv *.egg-info .git build
+norecursedirs = venv *.egg-info .git build tests/e2e
 asyncio_mode = auto
+markers =
+    qb: Queue-based tests (local execution, fast)
+    lb: Load-balanced tests (remote provisioning, slow)
+    cold_start: Cold start benchmark (starts own server)
diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
@@ -0,0 +1,78 @@
+"""E2E test fixtures: provision real endpoints, configure SDK, clean up."""
+
+import logging
+import os
+import subprocess
+from pathlib import Path
+
+import pytest
+import runpod
+
+from tests.e2e.e2e_provisioner import load_test_cases, provision_endpoints
+
+log = logging.getLogger(__name__)
+REQUEST_TIMEOUT = 300  # seconds per job request
+
+# Repo root: tests/e2e/conftest.py -> ../../
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+@pytest.fixture(scope="session", autouse=True)
+def verify_local_runpod():
+    """Abort the session unless the imported ``runpod`` lives inside this repo.
+
+    Logs the imported package's version and file path, then fails the run
+    when the resolved path is not located under ``_REPO_ROOT``.
+    """
+    log.info("runpod version=%s path=%s", runpod.__version__, runpod.__file__)
+    installed_at = Path(runpod.__file__).resolve()
+    if installed_at.is_relative_to(_REPO_ROOT):
+        return
+
+    # Some other copy of the package was imported; stop before any test runs.
+    pytest.fail(
+        f"Expected runpod installed from {_REPO_ROOT} but got {installed_at}. "
+        "Run: pip install -e . --force-reinstall --no-deps"
+    )
+
+
+@pytest.fixture(scope="session")
+def require_api_key():
+    """Skip when the ``RUNPOD_API_KEY`` environment variable is unset or empty.
+
+    Only the key's length is logged, never its value.
+    """
+    api_key = os.environ.get("RUNPOD_API_KEY")
+    if api_key:
+        log.info("RUNPOD_API_KEY is set (length=%d)", len(api_key))
+        return
+    pytest.skip("RUNPOD_API_KEY not set")
+
+
+@pytest.fixture(scope="session")
+def test_cases():
+    """Return the cases produced by ``load_test_cases()``, logging their ids."""
+    loaded = load_test_cases()
+    case_ids = [case.get("id") for case in loaded]
+    log.info("Loaded %d test cases: %s", len(loaded), case_ids)
+    return loaded
+
+
+@pytest.fixture(scope="session")
+def endpoints(require_api_key, test_cases):
+    """Yield the endpoints built by ``provision_endpoints`` and undeploy them afterwards.
+
+    The yielded value is whatever mapping ``provision_endpoints(test_cases)``
+    returns. According to the original author's note, endpoints deploy lazily
+    on the first .run()/.runsync() call (not verifiable from this file).
+
+    Teardown runs ``flash undeploy <name> --force`` once per provisioned
+    endpoint. It is best-effort: a non-zero exit code is logged as a warning
+    and any exception is logged with its traceback, so one failure never
+    blocks the remaining undeploys.
+    """
+    provisioned = provision_endpoints(test_cases)
+    for endpoint in provisioned.values():
+        log.info(
+            "Endpoint ready: name=%s image=%s template.dockerArgs=%s",
+            endpoint.name,
+            endpoint.image,
+            endpoint.template.dockerArgs if endpoint.template else "N/A",
+        )
+    yield provisioned
+
+    # Tear down by name so that only this run's endpoints are touched; other
+    # endpoints on the same API key (parallel CI runs, developer endpoints)
+    # are left alone.
+    names = [endpoint.name for endpoint in provisioned.values()]
+    log.info("Cleaning up %d provisioned endpoints: %s", len(names), names)
+    for name in names:
+        command = ["flash", "undeploy", name, "--force"]
+        try:
+            completed = subprocess.run(command, capture_output=True, text=True, timeout=60)
+            if completed.returncode != 0:
+                log.warning("flash undeploy %s failed (rc=%d): %s", name, completed.returncode, completed.stderr)
+            else:
+                log.info("Undeployed %s", name)
+        except Exception:
+            log.exception("Failed to undeploy %s", name)