Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
title="Network tcp segments",
targets=[
g.Target(
expr="sum(rate(node_netstat_Tcp_InSegs[1m])) by (instance)",
legendFormat="InSegs {{instance}}",
),
g.Target(
expr="sum(rate(node_netstat_Tcp_OutSegs[1m])) by (instance)",
legendFormat="OutSegs {{instance}}",
),
g.Target(
expr="sum(rate(node_netstat_Tcp_RetransSegs[1m])) by (instance)",
legendFormat="RetransSegs {{instance}}",
),
],
yAxes=g.single_y_axis(format=g.SHORT_FORMAT, logBase=10),
),
]
# The final dashboard must be named 'dashboard' so that grafanalib will find it.
dashboard = d.Dashboard(
title="Master dashboard",
refresh="",
rows=[
d.Row(title="API call latency", panels=API_CALL_LATENCY_PANELS),
d.Row(title="API call latency aggregated with quantile", panels=QUANTILE_API_CALL_LATENCY_PANELS, collapse=True),
d.Row(title="Overall cluster health", panels=HEALTH_PANELS, collapse=True),
d.Row(title="etcd", panels=ETCD_PANELS, collapse=True),
d.Row(title="kube-apiserver", panels=APISERVER_PANELS, collapse=True),
d.Row(
title="kube-controller-manager",
panels=[
d.one_line(
"""
histogram_quantile(
0.50,
sum(
rate(
etcd_request_duration_seconds_bucket{
operation=~"${etcd_operation:regex}",
type=~".*(${etcd_type:pipe})"
}[1m]
)
) by (le, operation, type, instance)
)
"""
),
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph("etcd instance id", "sum(etcd_server_id) by (instance, server_id)"),
d.simple_graph(
"etcd network latency (99th percentile)",
"histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (le, instance, To))",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"etcd compaction keys",
"delta(etcd_debugging_mvcc_db_compaction_keys_total[1m])",
),
d.simple_graph(
"etcd compaction pause sum duration",
"delta(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_sum[1m])",
yAxes=g.single_y_axis(format=g.MILLISECONDS_FORMAT),
),
],
yAxes=g.single_y_axis(format=g.MILLISECONDS_FORMAT),
),
d.simple_graph(
"etcd objects",
"sum(etcd_object_counts) by (resource, instance)",
legend="{{instance}}: {{resource}}",
),
d.simple_graph(
"etcd db size",
[
"etcd_mvcc_db_total_size_in_bytes",
"etcd_mvcc_db_total_size_in_use_in_bytes",
"etcd_server_quota_backend_bytes",
],
yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
]
# Panel list bound to the name APISERVER_PANELS. Each entry is built with
# d.simple_graph(title, query(s), ...) or d.Graph(...).
# NOTE(review): only the first panels filter on endpoint="apiserver"; the later
# ones query etcd_* metrics, so this list may have been spliced together from
# two different panel lists -- confirm against the upstream file.
APISERVER_PANELS = [
# Go runtime metrics, restricted to job="master", endpoint="apiserver".
d.simple_graph(
"goroutines",
'go_goroutines{job="master", endpoint="apiserver"}',
legend="{{instance}}",
),
d.simple_graph(
"gc rate",
'rate(go_gc_duration_seconds_count{job="master", endpoint="apiserver"}[1m])',
legend="{{instance}}",
),
# NOTE(review): no query expression is passed here, only the title. This looks
# truncated -- verify what d.simple_graph requires and restore the query.
d.simple_graph(
"alloc rate",
),
# quantile 1.0 of the fsync-duration histogram, grouped by (le, endpoint).
d.simple_graph(
"etcd wal fsync duration",
"histogram_quantile(1.0, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (le, endpoint))",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
# Built with d.Graph directly so that points/lines can be set explicitly
# (points enabled, lines disabled).
d.Graph(
title="etcd compaction max pause",
points=True,
lines=False,
targets=[
g.Target(
expr="histogram_quantile(1.0, sum(rate(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_bucket[1m])) by (le, instance))"
)
],
yAxes=g.single_y_axis(format=g.MILLISECONDS_FORMAT),
),
# Object counts summed per (resource, instance); the legend shows both labels.
d.simple_graph(
"etcd objects",
"sum(etcd_object_counts) by (resource, instance)",
legend="{{instance}}: {{resource}}",
),
# Three series on one graph, passed as a list of queries; the y axis uses the
# bytes format.
d.simple_graph(
"etcd db size",
[
"etcd_mvcc_db_total_size_in_bytes",
"etcd_mvcc_db_total_size_in_use_in_bytes",
"etcd_server_quota_backend_bytes",
],
yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
]
def api_call_latency(title, metric, verb, scope, threshold):
    """Build a graph comparing 99th-percentile API latency with a threshold.

    The returned ``d.Graph`` carries two targets:

    * the constant ``threshold`` (stringified), with the legend "threshold";
    * ``quantile_over_time(0.99, ...)`` over a 12h range of ``metric``,
      selected with ``quantile="0.99"`` and regex matchers built from
      ``verb`` and ``scope``.

    The y axis uses the seconds format.
    """
    query_template = (
        'quantile_over_time(0.99, %(metric)s{quantile="0.99", verb=~"%(verb)s", scope=~"%(scope)s"}[12h])'
    )
    substitutions = {"metric": metric, "verb": verb, "scope": scope}

    threshold_target = g.Target(expr=str(threshold), legendFormat="threshold")
    latency_target = g.Target(expr=query_template % substitutions)

    return d.Graph(
        title=title,
        targets=[threshold_target, latency_target],
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    )
),
d.simple_graph(
"Component restarts 2",
'sum(min_over_time(container_start_time_seconds{container!="",container!="POD"}[2m])) by (container)',
),
d.simple_graph(
"Active component", "sum(leader_election_master_status) by (name, instance)"
),
]
ETCD_PANELS = [
d.simple_graph("etcd leader", "etcd_server_is_leader", legend="{{instance}}"),
d.simple_graph(
"etcd bytes sent",
"rate(etcd_network_client_grpc_sent_bytes_total[1m])",
yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
legend="{{instance}}",
),
d.simple_graph(
"etcd operations rate",
d.one_line(
"""
sum(
rate(
etcd_request_duration_seconds_count{
operation=~"${etcd_operation:regex}",
type=~".*(${etcd_type:pipe})"
}[1m]
)
) by (operation, type)
"""
),
def api_call_latency(title, verb, scope, threshold):
    """Build a latency graph for one verb/scope pair plus a threshold line.

    The query text comes from ``expression``, a %-style template defined
    outside this function. It is filled in with ``verb`` and ``scope`` and
    then passed through ``d.one_line``. The constant ``threshold`` is plotted
    as a second series with the legend "threshold". The y axis uses the
    seconds format.
    """
    threshold_target = g.Target(expr=str(threshold), legendFormat="threshold")

    query = d.one_line(expression % {"verb": verb, "scope": scope})
    latency_target = g.Target(
        expr=query,
        # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed
        # legendFormat="{{verb}} {{scope}}/{{resource}}",
    )

    return d.Graph(
        title=title,
        targets=[threshold_target, latency_target],
        yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
    )
def show_quantiles(queryTemplate, quantiles=None, legend=""):
    """Return one ``g.Target`` per quantile, rendered from ``queryTemplate``.

    Args:
        queryTemplate: ``str.format`` template containing a ``{quantile}``
            placeholder; it receives the quantile formatted to two decimals.
        quantiles: numeric quantiles to render. When falsy (``None`` or
            empty), the ``QUANTILES`` name from the surrounding scope is used.
        legend: legend format applied to every target. When empty, each
            target's legend is its own two-decimal quantile string.

    Returns:
        A list of targets, in the same order as the quantiles.
    """
    chosen = quantiles if quantiles else QUANTILES
    rendered = ("{:.2f}".format(value) for value in chosen)
    return [
        g.Target(
            expr=queryTemplate.format(quantile=text),
            legendFormat=legend if legend else text,
        )
        for text in rendered
    ]
def api_call_latency(title, verb, scope, threshold):
    """Return a ``d.Graph`` of API call latency against a fixed threshold.

    Two series are plotted: ``threshold`` rendered as a constant expression
    with the legend "threshold", and the ``expression`` template (defined
    outside this function), %-formatted with ``verb`` and ``scope`` and
    passed through ``d.one_line``. The y axis uses the seconds format.
    """
    series = []
    series.append(g.Target(expr=str(threshold), legendFormat="threshold"))

    params = {"verb": verb, "scope": scope}
    series.append(
        g.Target(
            expr=d.one_line(expression % params),
            # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed
            # legendFormat="{{verb}} {{scope}}/{{resource}}",
        )
    )

    seconds_axis = g.single_y_axis(format=g.SECONDS_FORMAT)
    return d.Graph(title=title, targets=series, yAxes=seconds_axis)
title="DNS latency",
targets=d.show_quantiles(
'probes:dns_lookup_latency:histogram_quantile{{quantile="{quantile}"}}',
legend="{{quantile}}",
),
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
nullPointMode="null",
),
d.Graph(
title="probe: lookup rate",
targets=[
g.Target(
expr='sum(rate(probes_in_cluster_dns_lookup_count{namespace="probes", job="dns"}[1m]))',
legendFormat="lookup rate",
),
g.Target(
expr='sum(rate(probes_in_cluster_network_latency_error{namespace="probes", job="dns"}[1m]))',
legendFormat="error rate",
),
],
),
d.Graph(
title="probe: # running",
targets=[
d.Target(
expr='count(container_memory_usage_bytes{namespace="probes", container="dns"}) by (container, namespace)'
)
],
nullPointMode="null",
),
d.Graph(
title="probe: memory usage",