Skip to content

Commit a981f53

Browse files
authored
fix(basic-metrics): Remove label from basic dns and filter out IPs from linux util plugins (#877)
# Description This PR addresses the cardinality issue in two basic plugins. By removing the label from basic dns and excluding localhost and node ip from linux util plugin tcp_connection_remote. It also adds a check in the e2e to validate metrics without checking specific labels. ## Related Issue https://github.com/azure-networking/retina-enterprise/issues/263 https://github.com/azure-networking/retina-enterprise/issues/272 ## Checklist - [x] I have read the [contributing documentation](https://retina.sh/docs/contributing). - [x] I signed and signed-off the commits (`git commit -S -s ...`). See [this documentation](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification) on signing commits. - [x] I have correctly attributed the author(s) of the code. - [x] I have tested the changes locally. - [x] I have followed the project's style guidelines. - [x] I have updated the documentation, if necessary. - [ ] I have added tests, if applicable. ## Screenshots (if applicable) or Testing Completed Please add any relevant screenshots or GIFs to showcase the changes made. ``` # TYPE networkobservability_dns_request_count counter networkobservability_dns_request_count 80 # HELP networkobservability_dns_response_count DNS responses by statistics # TYPE networkobservability_dns_response_count counter networkobservability_dns_response_count 50 # TYPE networkobservability_adv_dns_request_count counter networkobservability_adv_dns_request_count{ip="10.224.3.111",namespace="kube-system-test",podname="agnhost-adv-dns-port-forward-7215873926146132341-0",query="some.non.existent.domain.",query_type="A",workload_kind="StatefulSet",workload_name="agnhost-adv-dns-port-forward-7215873926146132341"} 30 # TYPE networkobservability_adv_dns_response_count counter networkobservability_adv_dns_response_count{ip="10.224.3.111",namespace="kube-system-test",num_response="0",podname="agnhost-adv-dns-port-forward-7215873926146132341-0",query="some.non.existent.domain.",query_type="A",response="",return_code="NXDOMAIN",workload_kind="StatefulSet",workload_name="agnhost-adv-dns-port-forward-7215873926146132341"} 19 ``` Before, nodeIP is 10.224.2.241 ``` # TYPE networkobservability_tcp_connection_remote gauge # TYPE networkobservability_tcp_connection_remote gauge networkobservability_tcp_connection_remote{address="0.0.0.0",port="0"} 11 **networkobservability_tcp_connection_remote{address="10.224.2.241",port="29613"} 0** networkobservability_tcp_connection_remote{address="10.224.2.255",port="9090"} 5 networkobservability_tcp_connection_remote{address="10.224.3.2",port="9090"} 1 networkobservability_tcp_connection_remote{address="10.224.3.206",port="8082"} 1 networkobservability_tcp_connection_remote{address="10.224.3.62",port="9090"} 4 networkobservability_tcp_connection_remote{address="104.208.181.172",port="443"} 4 **networkobservability_tcp_connection_remote{address="127.0.0.1",port="50678"} 0 networkobservability_tcp_connection_remote{address="127.0.0.1",port="50684"} 0 networkobservability_tcp_connection_remote{address="127.0.0.1",port="50694"} 0 networkobservability_tcp_connection_remote{address="127.0.0.1",port="55960"} 0** networkobservability_tcp_connection_remote{address="13.107.246.41",port="443"} 0 networkobservability_tcp_connection_remote{address="150.171.69.10",port="443"} 0 networkobservability_tcp_connection_remote{address="168.63.129.16",port="80"} 0 networkobservability_tcp_connection_remote{address="169.254.169.254",port="80"} 68 networkobservability_tcp_connection_remote{address="20.190.151.68",port="443"} 0 networkobservability_tcp_connection_remote{address="20.209.154.1",port="443"} 0 networkobservability_tcp_connection_remote{address="20.209.155.1",port="443"} 0 networkobservability_tcp_connection_remote{address="20.209.179.65",port="443"} 5 networkobservability_tcp_connection_remote{address="20.36.150.0",port="443"} 0 networkobservability_tcp_connection_remote{address="4.150.240.10",port="443"} 0 networkobservability_tcp_connection_remote{address="40.64.135.140",port="443"} 0 networkobservability_tcp_connection_remote{address="52.179.73.37",port="443"} 0 networkobservability_tcp_connection_remote{address="52.179.73.39",port="443"} 0 networkobservability_tcp_connection_remote{address="52.188.247.147",port="443"} 0 networkobservability_tcp_connection_remote{address="52.253.71.198",port="443"} 4 # HELP networkobservability_tcp_connection_stats TCP connections statistics ``` after. No nodeIP or localhost ``` # TYPE networkobservability_tcp_connection_remote gauge networkobservability_tcp_connection_remote{address="0.0.0.0",port="0"} 11 networkobservability_tcp_connection_remote{address="10.224.2.255",port="9090"} 5 networkobservability_tcp_connection_remote{address="10.224.3.2",port="9090"} 2 networkobservability_tcp_connection_remote{address="10.224.3.206",port="8082"} 2 networkobservability_tcp_connection_remote{address="10.224.3.62",port="9090"} 2 networkobservability_tcp_connection_remote{address="104.208.181.172",port="443"} 1 networkobservability_tcp_connection_remote{address="168.63.129.16",port="80"} 1 networkobservability_tcp_connection_remote{address="169.254.169.254",port="80"} 68 networkobservability_tcp_connection_remote{address="20.209.179.65",port="443"} 2 networkobservability_tcp_connection_remote{address="40.64.135.140",port="443"} 0 networkobservability_tcp_connection_remote{address="52.179.73.38",port="443"} 0 networkobservability_tcp_connection_remote{address="52.188.247.151",port="443"} 1 networkobservability_tcp_connection_remote{address="52.253.71.198",port="443"} 5 # HELP networkobservability_tcp_connection_stats TCP connections statistics ``` ## Additional Notes Add any additional notes or context about the pull request here. --- Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) file for more information on how to contribute to this project.
1 parent 59acc85 commit a981f53

File tree

8 files changed

+47
-31
lines changed

8 files changed

+47
-31
lines changed

pkg/metrics/metrics.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,13 +134,11 @@ func InitializeMetrics() {
134134
exporter.DefaultRegistry,
135135
utils.DNSRequestCounterName,
136136
dnsRequestCounterDescription,
137-
utils.DNSRequestLabels...,
138137
)
139138
DNSResponseCounter = exporter.CreatePrometheusCounterVecForMetric(
140139
exporter.DefaultRegistry,
141140
utils.DNSResponseCounterName,
142141
dnsResponseCounterDescription,
143-
utils.DNSResponseLabels...,
144142
)
145143

146144
// InfiniBand Metrics

pkg/plugin/dns/dns_linux.go

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ import (
88
"context"
99
"net"
1010
"os"
11-
"strconv"
12-
"strings"
1311

1412
v1 "github.com/cilium/cilium/pkg/hubble/api/v1"
1513
"github.com/inspektor-gadget/inspektor-gadget/pkg/gadgets/trace/dns/tracer"
@@ -96,18 +94,15 @@ func (d *dns) eventHandler(event *types.Event) {
9694
}
9795
d.l.Debug("Event received", zap.Any("event", event))
9896

99-
responses := strings.Join(event.Addresses, ",")
100-
101-
// Update basic metrics. If the event is a request, the we don't need num_response, response, or return_code
97+
// Update basic metrics
10298
if event.Qr == types.DNSPktTypeQuery {
10399
m = metrics.DNSRequestCounter
104-
m.WithLabelValues(event.QType, event.DNSName).Inc()
105100
} else if event.Qr == types.DNSPktTypeResponse {
106101
m = metrics.DNSResponseCounter
107-
m.WithLabelValues(event.Rcode, event.QType, event.DNSName, responses, strconv.Itoa(event.NumAnswers)).Inc()
108102
} else {
109103
return
110104
}
105+
m.WithLabelValues().Inc()
111106

112107
if !d.cfg.EnablePodLevel {
113108
return

pkg/plugin/dns/dns_linux_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ func TestRequestEventHandler(t *testing.T) {
141141

142142
// Basic metrics.
143143
mockCV := metrics.NewMockCounterVec(ctrl)
144-
mockCV.EXPECT().WithLabelValues(event.QType, event.DNSName).Return(c).Times(1)
144+
mockCV.EXPECT().WithLabelValues().Return(c).Times(1)
145145
before := value(c)
146146
metrics.DNSRequestCounter = mockCV
147147

@@ -191,7 +191,7 @@ func TestResponseEventHandler(t *testing.T) {
191191
// Basic metrics.
192192
c := prometheus.NewCounter(prometheus.CounterOpts{})
193193
mockCV := metrics.NewMockCounterVec(ctrl)
194-
mockCV.EXPECT().WithLabelValues(event.Rcode, event.QType, event.DNSName, "1.1.1.1,2.2.2.2", "2").Return(c).Times(1)
194+
mockCV.EXPECT().WithLabelValues().Return(c).Times(1)
195195
before := value(c)
196196
metrics.DNSResponseCounter = mockCV
197197

pkg/plugin/linuxutil/netstat_stats_linux.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,21 @@ const (
2222
pathNetSnmp = "/proc/net/snmp"
2323
)
2424

25+
var (
26+
nodeIP = os.Getenv("NODE_IP")
27+
ignoreList = []string{
28+
"0.0.0.0",
29+
"127.0.0.",
30+
}
31+
)
32+
33+
func init() {
34+
// Add node IP to the ignore list
35+
if nodeIP != "" {
36+
ignoreList = append(ignoreList, nodeIP)
37+
}
38+
}
39+
2540
type NetstatReader struct {
2641
l *log.ZapLogger
2742
connStats *ConnectionStats
@@ -196,6 +211,9 @@ func (nr *NetstatReader) readSockStats() error {
196211
}
197212
addr := addrPort.Addr().String()
198213
port := strconv.Itoa(int(addrPort.Port()))
214+
if !validateRemoteAddr(addr) {
215+
continue
216+
}
199217
// Check if the remote address is in the new sockStats map
200218
if _, ok := sockStats.socketByRemoteAddr[remoteAddr]; !ok {
201219
nr.l.Debug("Removing remote address from metrics", zap.String("remoteAddr", remoteAddr))
@@ -275,9 +293,11 @@ func validateRemoteAddr(addr string) bool {
275293
return false
276294
}
277295

278-
// ignore localhost addresses.
279-
if strings.Contains(addr, "127.0.0") {
280-
return false
296+
// check if the address is in the ignore list
297+
for _, addressToIgnore := range ignoreList {
298+
if strings.HasPrefix(addr, addressToIgnore) {
299+
return false
300+
}
281301
}
282302

283303
return true

pkg/plugin/linuxutil/netstat_stats_linux_test.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package linuxutil
55
import (
66
"fmt"
77
"net"
8+
"os"
89
"testing"
910

1011
"github.com/cakturk/go-netstat/netstat"
@@ -339,6 +340,9 @@ func TestReadSockStats(t *testing.T) {
339340

340341
// Test IP that belongs to a closed connection being removed from the metrics
341342
func TestReadSockStatsRemoveClosedConnection(t *testing.T) {
343+
// Set os env variable to enable debug logs
344+
os.Setenv("NODE_IP", "10.224.0.6")
345+
342346
_, err := log.SetupZapLogger(log.GetDefaultLogOpts())
343347
require.NoError(t, err)
344348
opts := &NetstatOpts{
@@ -363,7 +367,7 @@ func TestReadSockStatsRemoveClosedConnection(t *testing.T) {
363367
// Initial value
364368
nr.opts.PrevTCPSockStats = &SocketStats{
365369
socketByRemoteAddr: map[string]int{
366-
"127.0.0.1:80": 1,
370+
"192.168.1.100:80": 1,
367371
},
368372
}
369373

@@ -372,7 +376,11 @@ func TestReadSockStatsRemoveClosedConnection(t *testing.T) {
372376
ns.EXPECT().UDPSocks(gomock.Any()).Return([]netstat.SockTabEntry{}, nil).Times(1)
373377

374378
// We are expecting the gauge to be called once for this value as it is removed
375-
MockGaugeVec.EXPECT().WithLabelValues("127.0.0.1", "80").Return(testmetric).Times(1)
379+
MockGaugeVec.EXPECT().WithLabelValues("192.168.1.100", "80").Return(testmetric).Times(1)
380+
381+
// We are not expecting the gauge to be called for localhost or node IP
382+
MockGaugeVec.EXPECT().WithLabelValues("127.0.0.1", "80").Return(testmetric).Times(0)
383+
MockGaugeVec.EXPECT().WithLabelValues("10.224.0.6", "80").Return(testmetric).Times(0)
376384

377385
err = nr.readSockStats()
378386
require.NoError(t, err)

test/e2e/framework/prometheus/prometheus.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,12 @@ func verifyValidMetricPresent(metricName string, data map[string]*promclient.Met
8383
for _, label := range metric.GetLabel() {
8484
metricLabels[label.GetName()] = label.GetValue()
8585
}
86+
87+
// if valid metric is empty, then we just need to make sure the metric and value is present
88+
if len(validMetric) == 0 && len(metricLabels) > 0 {
89+
return nil
90+
}
91+
8692
if reflect.DeepEqual(metricLabels, validMetric) {
8793
return nil
8894
}

test/e2e/scenarios/dns/validate-basic-dns-metric.go

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ type validateBasicDNSRequestMetrics struct {
2424
func (v *validateBasicDNSRequestMetrics) Run() error {
2525
metricsEndpoint := fmt.Sprintf("http://localhost:%d/metrics", common.RetinaPort)
2626

27-
validBasicDNSRequestMetricLabels := map[string]string{
28-
"query": v.Query,
29-
"query_type": v.QueryType,
30-
}
27+
validBasicDNSRequestMetricLabels := map[string]string{}
3128

3229
err := prom.CheckMetric(metricsEndpoint, dnsBasicRequestCountMetricName, validBasicDNSRequestMetricLabels)
3330
if err != nil {
@@ -61,13 +58,7 @@ func (v *validateBasicDNSResponseMetrics) Run() error {
6158
v.Response = ""
6259
}
6360

64-
validBasicDNSResponseMetricLabels := map[string]string{
65-
"num_response": v.NumResponse,
66-
"query": v.Query,
67-
"query_type": v.QueryType,
68-
"return_code": v.ReturnCode,
69-
"response": v.Response,
70-
}
61+
validBasicDNSResponseMetricLabels := map[string]string{}
7162

7263
err := prom.CheckMetric(metricsEndpoint, dnsBasicResponseCountMetricName, validBasicDNSResponseMetricLabels)
7364
if err != nil {

test/e2e/scenarios/tcp/validate-tcp-connection-remote.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ type ValidateRetinaTCPConnectionRemoteMetric struct {
2121
func (v *ValidateRetinaTCPConnectionRemoteMetric) Run() error {
2222
promAddress := fmt.Sprintf("http://localhost:%s/metrics", v.PortForwardedRetinaPort)
2323

24-
validMetrics := []map[string]string{
25-
{address: "0.0.0.0", port: "0"},
26-
}
24+
validMetrics := []map[string]string{}
2725

2826
for _, metric := range validMetrics {
2927
err := prom.CheckMetric(promAddress, tcpConnectionRemoteMetricName, metric)
@@ -32,7 +30,7 @@ func (v *ValidateRetinaTCPConnectionRemoteMetric) Run() error {
3230
}
3331
}
3432

35-
log.Printf("found metrics matching %+v\n", validMetrics)
33+
log.Printf("found metrics matching %+v\n", tcpConnectionRemoteMetricName)
3634
return nil
3735
}
3836

0 commit comments

Comments
 (0)