---

prometheus_options:
  global:
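  # "global" is left empty, so Prometheus falls back to its built-in defaults
  # (scrape_interval: 1m, evaluation_interval: 1m).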

  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - 192.168.0.252:9093

  rule_files:
    - alerting_rules/*.yml

  scrape_configs:
    - job_name: prometheus
      static_configs:
        - targets:
            - 192.168.0.252:9090
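
    # The "node" job below scrapes node_exporter (default port 9100) on each
    # host; its static env/hostname labels are what the Alertmanager route
    # groups alerts by (see group_by further down).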
    - job_name: node
      static_configs:
        - targets:
            # main-page
            - 192.168.0.10:9100
          labels:
            env: dev
            hostname: main-page
        - targets:
            # searxng
            - 192.168.0.15:9100
          labels:
            env: dev
            hostname: searxng
        - targets:
            # bastion
            - 192.168.0.254:9100
          labels:
            env: common
            hostname: bastion
        - targets:
            # load-balancer
            - 192.168.0.253:9100
          labels:
            env: common
            hostname: load-balancer
        - targets:
            # monitoring
            - 192.168.0.252:9100
          labels:
            env: common
            hostname: monitoring
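
    # The "nginx" job presumably scrapes nginx-prometheus-exporter, whose
    # default listen port is 9113.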
    - job_name: nginx
      static_configs:
        - targets:
            # load-balancer
            - 192.168.0.253:9113
          labels:
            env: common
            hostname: load-balancer
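
# Hint (path assumed): once prometheus_options is rendered to
# /etc/prometheus/prometheus.yml, the result can be validated with
#   promtool check config /etc/prometheus/prometheus.yml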

prometheus_alertmanager_options:
  global:
    smtp_smarthost: mail.cuqmbr.xyz:587
    smtp_require_tls: true
    smtp_from: '"Homelab Alertmanager" <no-reply@cuqmbr.xyz>'
    smtp_auth_username: no-reply
    smtp_auth_password: !vault |
      $ANSIBLE_VAULT;1.1;AES256
      31393866316539633838303936366464613935393933333338336531656239333361653664346637
      3665316532336339633432303036626339363239343065630a326361306233656632653134643966
      39663138303439323636666665653364396132333532383463626337653061356461643734336363
      6266353533656566330a346536333836356131343832616631666330653462613436313062643330
      61616664646439643839366630396137616533393664323965366630363566333632
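
  # Hint: an encrypted value like the one above can be generated with
  # ansible-vault and pasted in verbatim, e.g.
  #   ansible-vault encrypt_string --name 'smtp_auth_password' 'the-password'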

  templates:
    - /etc/prometheus/alertmanager_templates/*.tmpl

  route:
    group_by:
      - env
      - hostname

    group_wait: 30s
    group_interval: 5m
    repeat_interval: 1d
    receiver: default
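
  # For reference: group_wait is how long Alertmanager buffers the first
  # notification for a new group, group_interval is the wait before notifying
  # about new alerts added to an existing group, and repeat_interval is the
  # wait before re-sending a notification that has already been delivered.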

  receivers:
    - name: default
      email_configs:
        - to: notifications@cuqmbr.xyz
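
# Hint (path assumed): the rendered Alertmanager config can be checked with
#   amtool check-config /etc/prometheus/alertmanager.yml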

prometheus_alerting_rules:
  groups:
    - name: DefaultMetrics
      rules:
        - alert: HostOutOfMemory
          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        # You may want to increase the Alertmanager 'repeat_interval' for this
        # type of alert to daily or weekly.
        - alert: HostMemoryIsUnderutilized
          expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
          for: 0m
          labels:
            severity: info
          annotations:
            summary: Host memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Node memory usage has been < 20% for 1 week. Consider reducing the allocated memory. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        # Add ignored mountpoints to the node_exporter parameters, e.g.
        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
        # The same rule using "node_filesystem_free_bytes" will fire when the
        # disk fills up for non-root users.
        - alert: HostOutOfDiskSpace
          expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        - alert: HostOutOfInodes
          expr: (node_filesystem_files_free / node_filesystem_files < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
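
        # Note: the "and on (instance, device, mountpoint)
        # node_filesystem_readonly == 0" join in the two rules above restricts
        # them to filesystems that are mounted read-write.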

        - alert: HostHighCpuLoad
          expr: ((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > .90)
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "CPU load is > 90%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
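        # Note: utilization is computed as 1 minus the average idle rate
        # across the instance's cores; averaging the non-idle mode series
        # instead would understate load by the number of CPU modes.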

        # You may want to increase the Alertmanager 'repeat_interval' for this
        # type of alert to daily or weekly.
        - alert: HostCpuIsUnderutilized
          expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
          for: 1w
          labels:
            severity: info
          annotations:
            summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        - alert: HostCpuHighIowait
          expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "CPU iowait > 10%. The CPU spends a large share of its time waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        - alert: HostSwapIsFillingUp
          expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Swap is filling up (> 80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        # - alert: HostSystemdServiceCrashed
        #   expr: (node_systemd_unit_state{state="failed"} == 1)
        #   for: 0m
        #   labels:
        #     severity: warning
        #   annotations:
        #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
        #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        - alert: HostOomKillDetected
          expr: (increase(node_vmstat_oom_kill[1m]) > 0)
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        - alert: HostClockSkew
          expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

        - alert: HostClockNotSynchronising
          expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
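
# Hint (path assumed): the rendered rule files matched by "rule_files" above
# can be validated with
#   promtool check rules /etc/prometheus/alerting_rules/*.yml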

loki_options:
  auth_enabled: false

  server:
    http_listen_port: 3100
    grpc_listen_port: 9096
    log_level: info
    grpc_server_max_concurrent_streams: 1000

  common:
    instance_addr: 127.0.0.1
    path_prefix: /tmp/loki
    storage:
      filesystem:
        chunks_directory: /tmp/loki/chunks
        rules_directory: /tmp/loki/rules
    replication_factor: 1
    ring:
      kvstore:
        store: inmemory
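
  # Note: replication_factor 1 with an in-memory ring runs Loki as a single
  # node, and everything lives under /tmp/loki, which is typically cleared on
  # reboot; point path_prefix at persistent storage (e.g. /var/lib/loki) if
  # log history should survive restarts.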

  query_range:
    results_cache:
      cache:
        embedded_cache:
          enabled: true
          max_size_mb: 100

  limits_config:
    metric_aggregation_enabled: true

  schema_config:
    configs:
      - from: 2020-10-24
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: index_
          period: 24h

  pattern_ingester:
    enabled: true
    metric_aggregation:
      loki_address: localhost:3100

  ruler:
    alertmanager_url: http://localhost:9093

  frontend:
    encoding: protobuf

  analytics:
    reporting_enabled: false

fluentbit_settings:
  service:
    flush: 1
    daemon: false
    log_level: info
    http_server: false
  pipeline:
    inputs:
      - name: systemd
        tag: systemd
    outputs:
      - name: loki
        host: 192.168.0.252
        labels: env=common,hostname=monitoring,service_name=systemd
        match: systemd
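
# Note: the systemd input tails the local journald log, and the loki output
# ships it to 192.168.0.252 on out_loki's default port 3100, matching the
# http_listen_port in loki_options above.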