---
# Example Execution Plan: Fix CrashLoopBackOff
# This demonstrates a real-world fix for OOMKilled pods
plan:
  id: example-crashloop-001
  title: "Fix CrashLoopBackOff in api-service (OOMKilled)"
  created: "2026-01-26T13:00:00Z"
  risk: MEDIUM
  estimated_time: "5min"

  # Diagnosis that motivated this plan.
  context:
    issue: "api-service pods in CrashLoopBackOff state"
    root_cause: "Memory limit too low (256Mi), pods getting OOMKilled under load"
    affected_resources:
      - "production/deployment/api-service"
      - "3 pods currently crashing"

  # How to back out if the fix makes things worse.
  rollback:
    method: "Rollback deployment to previous revision"
    commands:
      - "kubectl rollout undo deployment/api-service -n production"
      - "kubectl rollout status deployment/api-service -n production"
    estimated_time: "2min"

  steps:
    # Raise the container memory limit so pods stop getting OOMKilled.
    - id: step-1
      action: kubectl_patch
      description: "Increase memory limit from 256Mi to 512Mi"
      command: |
        kubectl patch deployment api-service -n production -p '{
          "spec": {
            "template": {
              "spec": {
                "containers": [{
                  "name": "api",
                  "resources": {
                    "limits": {"memory": "512Mi"},
                    "requests": {"memory": "256Mi"}
                  }
                }]
              }
            }
          }
        }'
      resource: "production/deployment/api-service"
      risk: MEDIUM
      reversible: true
      expected_outcome: "deployment.apps/api-service patched"

    # Block until the patched Deployment has fully rolled out.
    - id: step-2
      action: wait_for_rollout
      description: "Wait for all new pods to be running and ready"
      command: "kubectl rollout status deployment/api-service -n production --timeout=5m"
      resource: "production/deployment/api-service"
      timeout: "5m"
      success_criteria: "deployment 'api-service' successfully rolled out"

    # Post-rollout verification: pod phase, OOM events, restart counts.
    - id: step-3
      action: verify
      description: "Verify no pods in CrashLoopBackOff"
      command: "kubectl get pods -n production -l app=api-service -o jsonpath='{.items[*].status.phase}'"
      expected_outcome: "Running Running Running"

    - id: step-4
      action: verify
      description: "Check for OOMKilled events in past 5 minutes"
      command: "kubectl get events -n production --field-selector involvedObject.name=api-service --sort-by='.lastTimestamp' | grep OOMKilled | tail -5"
      expected_outcome: "No OOMKilled events after patch"

    - id: step-5
      action: verify
      description: "Monitor pod restarts for 30 seconds"
      command: "sleep 30 && kubectl get pods -n production -l app=api-service -o jsonpath='{.items[*].status.containerStatuses[0].restartCount}'"
      expected_outcome: "No new restarts (count unchanged)"

# Human sign-off gate; the plan does not execute until approved.
approval:
  required: true
  status: pending
  approver: null
  approved_at: null
  notes: "Awaiting operator approval"

# Filled in by the executor at run time.
execution:
  started_at: null
  completed_at: null
  duration: null
  status: pending
  steps_completed: 0
  current_step: null
  error: null

# Checks run before and after the plan; `passed` is populated at run time.
validation:
  pre_flight:
    - check: "Deployment api-service exists in production namespace"
      command: "kubectl get deployment api-service -n production"
      passed: null
    - check: "Current memory limit is 256Mi"
      command: "kubectl get deployment api-service -n production -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}'"
      passed: null
    - check: "Pods are actually in CrashLoopBackOff"
      command: "kubectl get pods -n production -l app=api-service | grep CrashLoopBackOff"
      passed: null
  post_execution:
    - check: "All pods running"
      command: "kubectl get pods -n production -l app=api-service -o jsonpath='{.items[*].status.phase}'"
      expected: "Running Running Running"
      passed: null
    - check: "Memory limit is now 512Mi"
      command: "kubectl get deployment api-service -n production -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}'"
      expected: "512Mi"
      passed: null
    - check: "No restarts in last 5 minutes"
      command: "kubectl get pods -n production -l app=api-service --sort-by='.status.startTime' | head -1"
      passed: null