Skip to content

Commit a1c19d5

Browse files
authored
Merge pull request #399 from justinblalock87/cleanup-metrics
Add metrics for cleanup controller
2 parents 67daf73 + c2b32d4 commit a1c19d5

File tree

19 files changed

+1754
-4
lines changed

19 files changed

+1754
-4
lines changed

cmd/node-cleanup/main.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,23 @@ package main
1818

1919
import (
2020
"context"
21+
"log"
22+
"net/http"
2123
"time"
2224

25+
"github.com/prometheus/client_golang/prometheus"
26+
"github.com/prometheus/client_golang/prometheus/promhttp"
2327
flag "github.com/spf13/pflag"
2428
"k8s.io/client-go/informers"
2529
"k8s.io/client-go/kubernetes"
2630
"k8s.io/client-go/rest"
2731
"k8s.io/client-go/tools/clientcmd"
2832

33+
"k8s.io/component-base/metrics/legacyregistry"
34+
_ "k8s.io/component-base/metrics/prometheus/clientgo" // for client metric registration
35+
2936
"k8s.io/klog/v2"
37+
metrics "sigs.k8s.io/sig-storage-local-static-provisioner/pkg/metrics/node-cleanup"
3038
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/node-cleanup/controller"
3139
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/node-cleanup/deleter"
3240
)
@@ -40,6 +48,8 @@ var (
4048
workerThreads = flag.Uint("worker-threads", 10, "Number of controller worker threads.")
4149
pvcDeletionDelay = flag.Duration("pvc-deletion-delay", 60*time.Second, "Duration, in seconds, to wait after Node deletion for PVC cleanup.")
4250
stalePVDiscoveryInterval = flag.Duration("stale-pv-discovery-interval", 10*time.Second, "Duration, in seconds, the PV Deleter should wait between tries to clean up stale PVs.")
51+
listenAddress = flag.String("listen-address", ":8080", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`).")
52+
metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed.")
4353
)
4454

4555
func main() {
@@ -77,6 +87,28 @@ func main() {
7787

7888
factory.Start(ctx.Done())
7989

90+
// Prepare http endpoint for metrics
91+
if *listenAddress != "" {
92+
reg := prometheus.NewRegistry()
93+
reg.MustRegister([]prometheus.Collector{
94+
metrics.PersistentVolumeDeleteTotal,
95+
metrics.PersistentVolumeDeleteFailedTotal,
96+
metrics.PersistentVolumeClaimDeleteTotal,
97+
metrics.PersistentVolumeClaimDeleteFailedTotal,
98+
}...)
99+
gatherers := prometheus.Gatherers{
100+
reg,
101+
legacyregistry.DefaultGatherer,
102+
}
103+
104+
http.Handle(*metricsPath, promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{}))
105+
106+
go func() {
107+
klog.Infof("Starting metrics server at %s\n", *listenAddress)
108+
log.Fatal(http.ListenAndServe(*listenAddress, nil))
109+
}()
110+
}
111+
80112
// Start Deleter
81113
go deleter.Run(ctx, *stalePVDiscoveryInterval)
82114

deployment/kubernetes/example/node-cleanup-controller/deployment.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,7 @@ spec:
1919
args:
2020
- "--storageclass-names=nvme-ssd-block"
2121
- "--pvc-deletion-delay=60s"
22-
- "--stale-pv-discovery-interval=10s"
22+
- "--stale-pv-discovery-interval=10s"
23+
ports:
24+
- name: metrics
25+
containerPort: 8080

docs/node-cleanup-controller.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ Please see the example [deployment](../deployment/kubernetes/example/node-cleanu
2626
* `--kube-api-endpoint`: Master URL to build a client config from. Either this or kubeconfig needs to be set if the provisioner is being run out of cluster.
2727
* `--resync`: Duration, in minutes, of the resync interval of the controller. Defaults to 10 minutes.
2828
* `--worker-threads`: Number of controller worker threads. Defaults to 10.
29+
* `--listen-address`: The TCP network address where the Prometheus metrics endpoint will listen. Defaults to `:8080`. Set to an empty string to disable the metrics endpoint.
30+
* `--metrics-path`: The HTTP path where Prometheus metrics will be exposed. Defaults to `/metrics`.
2931

3032
## Design
3133

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
Copyright 2023 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"github.com/prometheus/client_golang/prometheus"
21+
)
22+
23+
const (
24+
// LocalVolumeNodeCleanupSubsystem is prometheus subsystem name.
25+
LocalVolumeNodeCleanupSubsystem = "local_volume_node_cleanup"
26+
)
27+
28+
var (
29+
// PersistentVolumeDeleteTotal is used to collect accumulated count of persistent volume delete attempts.
30+
// This metric may report a higher count than the true number of persistent volumes deleted if
31+
// the node-cleanup deleter has a short sync period, because the delete call can be repeated for the same volume.
32+
PersistentVolumeDeleteTotal = prometheus.NewCounterVec(
33+
prometheus.CounterOpts{
34+
Subsystem: LocalVolumeNodeCleanupSubsystem,
35+
Name: "persistentvolume_delete_total",
36+
Help: "Total number of successful persistent volume delete *attempts*. Broken down by persistent volume phase.",
37+
},
38+
[]string{"phase"},
39+
)
40+
// PersistentVolumeDeleteFailedTotal is used to collect the accumulated count of failed persistent volume delete attempts.
41+
PersistentVolumeDeleteFailedTotal = prometheus.NewCounterVec(
42+
prometheus.CounterOpts{
43+
Subsystem: LocalVolumeNodeCleanupSubsystem,
44+
Name: "persistentvolume_delete_failed_total",
45+
Help: "Total number of persistent volume delete failed attempts. Broken down by persistent volume phase.",
46+
},
47+
[]string{"phase"},
48+
)
49+
// PersistentVolumeClaimDeleteTotal is used to collect accumulated count of persistent volume claims deleted.
50+
PersistentVolumeClaimDeleteTotal = prometheus.NewCounter(
51+
prometheus.CounterOpts{
52+
Subsystem: LocalVolumeNodeCleanupSubsystem,
53+
Name: "persistentvolumeclaim_delete_total",
54+
Help: "Total number of persistent volume claims deleted.",
55+
},
56+
)
57+
// PersistentVolumeClaimDeleteFailedTotal is used to collect the accumulated count of failed persistent volume claim delete attempts.
58+
PersistentVolumeClaimDeleteFailedTotal = prometheus.NewCounter(
59+
prometheus.CounterOpts{
60+
Subsystem: LocalVolumeNodeCleanupSubsystem,
61+
Name: "persistentvolumeclaim_delete_failed_total",
62+
Help: "Total number of persistent volume claim delete failed attempts.",
63+
},
64+
)
65+
)

pkg/node-cleanup/controller/controller.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838
"k8s.io/klog/v2"
3939

4040
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/common"
41+
cleanupmetrics "sigs.k8s.io/sig-storage-local-static-provisioner/pkg/metrics/node-cleanup"
4142
)
4243

4344
// CleanupController handles the deletion of PVCs that reference deleted Nodes.
@@ -235,10 +236,12 @@ func (c *CleanupController) syncHandler(ctx context.Context, pvName string) erro
235236

236237
err = c.deletePVC(ctx, pvc)
237238
if err != nil {
239+
cleanupmetrics.PersistentVolumeClaimDeleteFailedTotal.Inc()
238240
klog.Errorf("failed to delete pvc %q in namespace &q: %w", pvClaimRef.Name, pvClaimRef.Namespace, err)
239241
return err
240242
}
241243

244+
cleanupmetrics.PersistentVolumeClaimDeleteTotal.Inc()
242245
klog.Infof("Deleted PVC %q that pointed to Node %q", pvClaimRef.Name, nodeName)
243246
return nil
244247
}

pkg/node-cleanup/deleter/deleter.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"k8s.io/klog/v2"
3131

3232
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/common"
33+
cleanupmetrics "sigs.k8s.io/sig-storage-local-static-provisioner/pkg/metrics/node-cleanup"
3334
)
3435

3536
// Deleter handles cleanup of local PVs with an affinity to a deleted Node.
@@ -91,15 +92,22 @@ func (d *Deleter) DeletePVs(ctx context.Context) {
9192
continue
9293
}
9394

95+
phase := pv.Status.Phase
96+
reclaimPolicy := pv.Spec.PersistentVolumeReclaimPolicy
9497
// PV is a stale object since it references a deleted Node.
9598
// Therefore it can safely be deleted in the two following cases.
96-
isReleasedWithDeleteReclaim := pv.Status.Phase == v1.VolumeReleased && pv.Spec.PersistentVolumeReclaimPolicy == v1.PersistentVolumeReclaimDelete
97-
isAvailable := pv.Status.Phase == v1.VolumeAvailable
99+
isReleasedWithDeleteReclaim := phase == v1.VolumeReleased && reclaimPolicy == v1.PersistentVolumeReclaimDelete
100+
isAvailable := phase == v1.VolumeAvailable
98101
if isReleasedWithDeleteReclaim || isAvailable {
99-
klog.Infof("Deleting PV that has NodeAffinity to deleted Node, pv: %s", pv.Name)
102+
klog.Infof("Attempting to delete PV that has NodeAffinity to deleted Node, pv: %s", pv.Name)
100103
if err = d.deletePV(ctx, pv.Name); err != nil {
104+
cleanupmetrics.PersistentVolumeDeleteFailedTotal.WithLabelValues(string(phase)).Inc()
101105
klog.Errorf("Error deleting PV: %s", pv.Name)
106+
continue
102107
}
108+
// TODO: Cache successful deletion to avoid multiple delete calls
109+
// when there is a short sync period
110+
cleanupmetrics.PersistentVolumeDeleteTotal.WithLabelValues(string(phase)).Inc()
103111
}
104112
}
105113
}

vendor/k8s.io/client-go/tools/leaderelection/OWNERS

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/k8s.io/client-go/tools/leaderelection/healthzadaptor.go

Lines changed: 69 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)