Skip to content

Commit 40504d1

Browse files
Add metrics for cleanup controller
1 parent 67daf73 commit 40504d1

File tree

6 files changed

+107
-4
lines changed

6 files changed

+107
-4
lines changed

cmd/node-cleanup/main.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,20 @@ package main
1818

1919
import (
2020
"context"
21+
"log"
22+
"net/http"
2123
"time"
2224

25+
"github.com/prometheus/client_golang/prometheus"
26+
"github.com/prometheus/client_golang/prometheus/promhttp"
2327
flag "github.com/spf13/pflag"
2428
"k8s.io/client-go/informers"
2529
"k8s.io/client-go/kubernetes"
2630
"k8s.io/client-go/rest"
2731
"k8s.io/client-go/tools/clientcmd"
2832

2933
"k8s.io/klog/v2"
34+
metrics "sigs.k8s.io/sig-storage-local-static-provisioner/pkg/metrics/node-cleanup"
3035
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/node-cleanup/controller"
3136
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/node-cleanup/deleter"
3237
)
@@ -40,6 +45,8 @@ var (
4045
workerThreads = flag.Uint("worker-threads", 10, "Number of controller worker threads.")
4146
pvcDeletionDelay = flag.Duration("pvc-deletion-delay", 60*time.Second, "Duration, in seconds, to wait after Node deletion for PVC cleanup.")
4247
stalePVDiscoveryInterval = flag.Duration("stale-pv-discovery-interval", 10*time.Second, "Duration, in seconds, the PV Deleter should wait between tries to clean up stale PVs.")
48+
listenAddress = flag.String("listen-address", ":8080", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`).")
49+
metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed.")
4350
)
4451

4552
func main() {
@@ -77,6 +84,22 @@ func main() {
7784

7885
factory.Start(ctx.Done())
7986

87+
// Prepare http endpoint for metrics
88+
if *listenAddress != "" {
89+
prometheus.MustRegister([]prometheus.Collector{
90+
metrics.APIServerRequestsTotal,
91+
metrics.PersistentVolumeDeleteFailedTotal,
92+
metrics.PersistentVolumeClaimDeleteTotal,
93+
metrics.PersistentVolumeClaimDeleteFailedTotal,
94+
}...)
95+
http.Handle(*metricsPath, promhttp.Handler())
96+
97+
go func() {
98+
klog.Infof("Starting metrics server at %s\n", *listenAddress)
99+
log.Fatal(http.ListenAndServe(*listenAddress, nil))
100+
}()
101+
}
102+
80103
// Start Deleter
81104
go deleter.Run(ctx, *stalePVDiscoveryInterval)
82105

deployment/kubernetes/example/node-cleanup-controller/deployment.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,7 @@ spec:
1919
args:
2020
- "--storageclass-names=nvme-ssd-block"
2121
- "--pvc-deletion-delay=60s"
22-
- "--stale-pv-discovery-interval=10s"
22+
- "--stale-pv-discovery-interval=10s"
23+
ports:
24+
- name: metrics
25+
containerPort: 8080

docs/node-cleanup-controller.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ Please see the example [deployment](../deployment/kubernetes/example/node-cleanu
2626
* `--kube-api-endpoint`: Master URL to build a client config from. Either this or kubeconfig needs to be set if the provisioner is being run out of cluster.
2727
* `--resync`: Duration, in minutes, of the resync interval of the controller. Defaults to 10 minutes.
2828
* `--worker-threads`: Number of controller worker threads. Defaults to 10.
29+
* `--listen-address`: The TCP network address where the prometheus metrics endpoint will listen. Defaults to `:8080`.
30+
* `--metrics-path`: The HTTP path where prometheus metrics will be exposed. Defaults to `/metrics`.
2931

3032
## Design
3133

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
Copyright 2023 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"github.com/prometheus/client_golang/prometheus"
21+
)
22+
23+
const (
24+
// LocalVolumeNodeCleanupSubsystem is prometheus subsystem name.
25+
LocalVolumeNodeCleanupSubsystem = "local_volume_node_cleanup"
26+
)
27+
28+
var (
29+
// APIServerRequestsTotal is used to collect accumulated count of apiserver requests.
30+
APIServerRequestsTotal = prometheus.NewCounterVec(
31+
prometheus.CounterOpts{
32+
Subsystem: LocalVolumeNodeCleanupSubsystem,
33+
Name: "apiserver_requests_total",
34+
Help: "Total number of apiserver requests. Broken down by method.",
35+
},
36+
[]string{"method"},
37+
)
38+
// PersistentVolumeDeleteFailedTotal is used to collect accumulated count of persistent volume delete failed attempts.
39+
PersistentVolumeDeleteFailedTotal = prometheus.NewCounterVec(
40+
prometheus.CounterOpts{
41+
Subsystem: LocalVolumeNodeCleanupSubsystem,
42+
Name: "persistentvolume_delete_failed_total",
43+
Help: "Total number of persistent volume delete failed attempts. Broken down by persistent volume status and reclaim policy.",
44+
},
45+
[]string{"status", "reclaim"},
46+
)
47+
// PersistentVolumeClaimDeleteTotal is used to collect accumulated count of persistent volume claims deleted.
48+
PersistentVolumeClaimDeleteTotal = prometheus.NewCounter(
49+
prometheus.CounterOpts{
50+
Subsystem: LocalVolumeNodeCleanupSubsystem,
51+
Name: "persistentvolumeclaim_delete_total",
52+
Help: "Total number of persistent volume claims deleted.",
53+
},
54+
)
55+
// PersistentVolumeClaimDeleteFailedTotal is used to collect accumulated count of persistent volume claim delete failed attempts.
56+
PersistentVolumeClaimDeleteFailedTotal = prometheus.NewCounter(
57+
prometheus.CounterOpts{
58+
Subsystem: LocalVolumeNodeCleanupSubsystem,
59+
Name: "persistentvolumeclaim_delete_failed_total",
60+
Help: "Total number of persistent volume claim delete failed attempts.",
61+
},
62+
)
63+
)

pkg/node-cleanup/controller/controller.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ import (
3838
"k8s.io/klog/v2"
3939

4040
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/common"
41+
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/metrics"
42+
cleanupmetrics "sigs.k8s.io/sig-storage-local-static-provisioner/pkg/metrics/node-cleanup"
4143
)
4244

4345
// CleanupController handles the deletion of PVCs that reference deleted Nodes.
@@ -235,10 +237,12 @@ func (c *CleanupController) syncHandler(ctx context.Context, pvName string) erro
235237

236238
err = c.deletePVC(ctx, pvc)
237239
if err != nil {
240+
cleanupmetrics.PersistentVolumeClaimDeleteFailedTotal.Inc()
238241
// klog.Errorf formats like fmt.Sprintf, which does not support %w, so the error is printed with %v.
klog.Errorf("failed to delete pvc %q in namespace %q: %v", pvClaimRef.Name, pvClaimRef.Namespace, err)
239242
return err
240243
}
241244

245+
cleanupmetrics.PersistentVolumeClaimDeleteTotal.Inc()
242246
klog.Infof("Deleted PVC %q that pointed to Node %q", pvClaimRef.Name, nodeName)
243247
return nil
244248
}
@@ -296,6 +300,7 @@ func (c *CleanupController) deletePVC(ctx context.Context, pvc *v1.PersistentVol
296300
options := metav1.DeleteOptions{
297301
Preconditions: &metav1.Preconditions{UID: &pvc.UID},
298302
}
303+
cleanupmetrics.APIServerRequestsTotal.WithLabelValues(metrics.APIServerRequestDelete).Inc()
299304
err := c.client.CoreV1().PersistentVolumeClaims(pvc.Namespace).Delete(ctx, pvc.Name, options)
300305
if err != nil && errors.IsNotFound(err) {
301306
// The PVC could already be deleted by some other process

pkg/node-cleanup/deleter/deleter.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ import (
3030
"k8s.io/klog/v2"
3131

3232
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/common"
33+
"sigs.k8s.io/sig-storage-local-static-provisioner/pkg/metrics"
34+
cleanupmetrics "sigs.k8s.io/sig-storage-local-static-provisioner/pkg/metrics/node-cleanup"
3335
)
3436

3537
// Deleter handles cleanup of local PVs with an affinity to a deleted Node.
@@ -91,14 +93,18 @@ func (d *Deleter) DeletePVs(ctx context.Context) {
9193
continue
9294
}
9395

96+
phase := pv.Status.Phase
97+
reclaimPolicy := pv.Spec.PersistentVolumeReclaimPolicy
9498
// PV is a stale object since it references a deleted Node.
9599
// Therefore it can safely be deleted in the two following cases.
96-
isReleasedWithDeleteReclaim := pv.Status.Phase == v1.VolumeReleased && pv.Spec.PersistentVolumeReclaimPolicy == v1.PersistentVolumeReclaimDelete
97-
isAvailable := pv.Status.Phase == v1.VolumeAvailable
100+
isReleasedWithDeleteReclaim := phase == v1.VolumeReleased && reclaimPolicy == v1.PersistentVolumeReclaimDelete
101+
isAvailable := phase == v1.VolumeAvailable
98102
if isReleasedWithDeleteReclaim || isAvailable {
99-
klog.Infof("Deleting PV that has NodeAffinity to deleted Node, pv: %s", pv.Name)
103+
klog.Infof("Attempting to delete PV that has NodeAffinity to deleted Node, pv: %s", pv.Name)
100104
if err = d.deletePV(ctx, pv.Name); err != nil {
105+
cleanupmetrics.PersistentVolumeDeleteFailedTotal.WithLabelValues(string(phase), string(reclaimPolicy)).Inc()
101106
// Log the underlying error as well, so the cause of the failed delete is visible.
klog.Errorf("Error deleting PV %s: %v", pv.Name, err)
107+
continue
102108
}
103109
}
104110
}
@@ -127,6 +133,7 @@ func (d *Deleter) referencesNonExistentNode(localPV *v1.PersistentVolume) (bool,
127133
}
128134

129135
func (d *Deleter) deletePV(ctx context.Context, pvName string) error {
136+
cleanupmetrics.APIServerRequestsTotal.WithLabelValues(metrics.APIServerRequestDelete).Inc()
130137
err := d.client.CoreV1().PersistentVolumes().Delete(ctx, pvName, metav1.DeleteOptions{})
131138
if err != nil && errors.IsNotFound(err) {
132139
klog.Warningf("PV %q no longer exists", pvName)

0 commit comments

Comments
 (0)