diff --git a/ansible/inventories/common/group_vars/monitoring.yml b/ansible/inventories/common/group_vars/monitoring.yml
index 5cbfa0c..f0d3729 100644
--- a/ansible/inventories/common/group_vars/monitoring.yml
+++ b/ansible/inventories/common/group_vars/monitoring.yml
@@ -28,77 +28,177 @@ users:
         openpgp:0xAD2BFD7F"
     opendoas_settings: "permit nopass ansible"
 
-prometheus_options:
-  global:
+prometheus_settings:
+  config:
+    global:
+    alerting:
+      alertmanagers:
+        - static_configs:
+            - targets:
+                - 192.168.0.252:9093
+    scrape_configs:
+      - job_name: prometheus
+        static_configs:
+          - targets:
+              - 192.168.0.252:9090
+      - job_name: node
+        static_configs:
+          - targets:
+              # main-page
+              - 192.168.0.10:9100
+            labels:
+              env: dev
+              hostname: main-page
+          - targets:
+              # searxng
+              - 192.168.0.15:9100
+            labels:
+              env: dev
+              hostname: searxng
+          - targets:
+              # forgejo
+              - 192.168.0.20:9100
+            labels:
+              env: dev
+              hostname: forgejo
+          - targets:
+              # forgejo-runner
+              - 192.168.0.21:9100
+            labels:
+              env: dev
+              hostname: forgejo-runner
+          - targets:
+              # bastion
+              - 192.168.0.254:9100
+            labels:
+              env: common
+              hostname: bastion
+          - targets:
+              # load-balancer
+              - 192.168.0.253:9100
+            labels:
+              env: common
+              hostname: load-balancer
+          - targets:
+              # monitoring
+              - 192.168.0.252:9100
+            labels:
+              env: common
+              hostname: monitoring
+      - job_name: nginx
+        static_configs:
+          - targets:
+              # load-balancer
+              - 192.168.0.253:9113
+            labels:
+              env: common
+              hostname: load-balancer
-
-  alerting:
-    alertmanagers:
-      - static_configs:
-          - targets:
-              - 192.168.0.252:9093
-
-  rule_files:
-    - alerting_rules/*.yml
-
-  scrape_configs:
-    - job_name: prometheus
-      static_configs:
-        - targets:
-            - 192.168.0.252:9090
-
-    - job_name: node
-      static_configs:
-        - targets:
-            # main-page
-            - 192.168.0.10:9100
-          labels:
-            env: dev
-            hostname: main-page
-        - targets:
-            # searxng
-            - 192.168.0.15:9100
-          labels:
-            env: dev
-            hostname: searxng
-        - targets:
-            # forgejo
-            - 192.168.0.20:9100
-          labels:
-            env: dev
-            hostname: forgejo
-        - targets:
-            # forgejo
-            - 192.168.0.21:9100
-          labels:
-            env: dev
-            hostname: forgejo-runner
-        - targets:
-            # bastion
-            - 192.168.0.254:9100
-          labels:
-            env: common
-            hostname: bastion
-        - targets:
-            # load-balancer
-            - 192.168.0.253:9100
-          labels:
-            env: common
-            hostname: load-balancer
-        - targets:
-            # monitoring
-            - 192.168.0.252:9100
-          labels:
-            env: common
-            hostname: monitoring
-
-    - job_name: nginx
-      static_configs:
-        - targets:
-            # load-balancer
-            - 192.168.0.253:9113
-          labels:
-            env: common
-            hostname: monitoring
+  alerting_rules:
+    groups:
+      - name: DefaultMetrics
+        rules:
+          - alert: HostOutOfMemory
+            expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+          - alert: HostMemoryIsUnderutilized
+            expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
+            for: 0m
+            labels:
+              severity: info
+            annotations:
+              summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # Please add ignored mountpoints in node_exporter parameters like
+          # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+          # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+          - alert: HostOutOfDiskSpace
+            expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostOutOfInodes
+            expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostHighCpuLoad
+            expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU load is > 90%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+          - alert: HostCpuIsUnderutilized
+            expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
+            for: 1w
+            labels:
+              severity: info
+            annotations:
+              summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostCpuHighIowait
+            expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostSwapIsFillingUp
+            expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # - alert: HostSystemdServiceCrashed
+          #   expr: (node_systemd_unit_state{state="failed"} == 1)
+          #   for: 0m
+          #   labels:
+          #     severity: warning
+          #   annotations:
+          #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
+          #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostOomKillDetected
+            expr: (increase(node_vmstat_oom_kill[1m]) > 0)
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostClockSkew
+            expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostClockNotSynchronising
+            expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
@@ -135,114 +235,6 @@ prometheus_alertmanager_options:
       - to: notifications@cuqmbr.xyz
-
-prometheus_alerting_rules:
-  groups:
-    - name: DefaultMetrics
-      rules:
-        - alert: HostOutOfMemory
-          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
-        - alert: HostMemoryIsUnderutilized
-          expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
-          for: 0m
-          labels:
-            severity: info
-          annotations:
-            summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # Please add ignored mountpoints in node_exporter parameters like
-        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
-        # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
-        - alert: HostOutOfDiskSpace
-          expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostOutOfInodes
-          expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostHighCpuLoad
-          expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
-          for: 10m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
-        - alert: HostCpuIsUnderutilized
-          expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
-          for: 1w
-          labels:
-            severity: info
-          annotations:
-            summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostCpuHighIowait
-          expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
-          for: 0m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostSwapIsFillingUp
-          expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # - alert: HostSystemdServiceCrashed
-        #   expr: (node_systemd_unit_state{state="failed"} == 1)
-        #   for: 0m
-        #   labels:
-        #     severity: warning
-        #   annotations:
-        #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
-        #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostOomKillDetected
-          expr: (increase(node_vmstat_oom_kill[1m]) > 0)
-          for: 0m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostClockSkew
-          expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
-          for: 10m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostClockNotSynchronising
-          expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-
 
 loki_settings:
   config:
     auth_enabled: false
diff --git a/ansible/roles/prometheus_server/defaults/main.yml b/ansible/roles/prometheus_server/defaults/main.yml
index 1cb1af5..2b77852 100644
--- a/ansible/roles/prometheus_server/defaults/main.yml
+++ b/ansible/roles/prometheus_server/defaults/main.yml
@@ -1,59 +1,53 @@
 ---
-prometheus_options:
-  global:
-    # Set the scrape interval to every 15 seconds. Default is every 1 minute.
-    scrape_interval: 15s
-    # Evaluate rules every 15 seconds. The default is every 1 minute.
-    evaluation_interval: 15s
-    # scrape_timeout is set to the global default (10s).
+prometheus_settings:
 
-    # Attach these labels to any time series or alerts when communicating with
-    # external systems (federation, remote storage, Alertmanager).
-    external_labels:
-      monitor: 'example'
-
-  # Alertmanager configuration
-  alerting:
-    alertmanagers:
-      - static_configs:
-          - targets: ['localhost:9093']
-
-  # Load rules and evaluate them according to the global 'evaluation_interval'.
-  rule_files:
-    - alerting_rules/*.yml
-
-  # A scrape configuration containing exactly one endpoint to scrape:
-  # Here it's Prometheus itself.
-  scrape_configs:
-    # The job name is added as a label `job=<job_name>`.
-    - job_name: 'prometheus'
-
-      # Override the global default and scrape targets from this job.
-      scrape_interval: 5s
-      scrape_timeout: 5s
-
-      # metrics_path defaults to '/metrics'
-      # scheme defaults to 'http'.
-
-      static_configs:
-        - targets: ['localhost:9090']
-
-    - job_name: node
-      # If prometheus-node-exporter is installed, grab stats about the local
-      # machine by default.
-      static_configs:
-        - targets: ['localhost:9100']
-
-prometheus_alerting_rules:
-  groups:
-    - name: ExampleRedisGroup
-      rules:
-        - alert: ExampleRedisDown
-          expr: redis_up{} == 0
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Redis instance down"
-            description: "Whatever"
+prometheus_default_settings:
+  config:
+    global:
+      # Set the scrape interval to every 15 seconds. Default is every 1 minute.
+      scrape_interval: 15s
+      # Evaluate rules every 15 seconds. The default is every 1 minute.
+      evaluation_interval: 15s
+      # scrape_timeout is set to the global default (10s).
+      # Attach these labels to any time series or alerts when communicating with
+      # external systems (federation, remote storage, Alertmanager).
+      # external_labels:
+      #   monitor: 'example'
+    # Alertmanager configuration
+    # alerting:
+    #   alertmanagers:
+    #     - static_configs:
+    #         - targets: ['localhost:9093']
+    # Load rules and evaluate them according to the global 'evaluation_interval'.
+    rule_files:
+      - alerting_rules/*.yml
+    # A scrape configuration containing exactly one endpoint to scrape:
+    # Here it's Prometheus itself.
+    scrape_configs:
+      # The job name is added as a label `job=<job_name>`.
+      - job_name: 'prometheus'
+        # Override the global default and scrape targets from this job.
+        scrape_interval: 5s
+        scrape_timeout: 5s
+        # metrics_path defaults to '/metrics'
+        # scheme defaults to 'http'.
+        static_configs:
+          - targets: ['localhost:9090']
+      - job_name: node
+        # If prometheus-node-exporter is installed, grab stats about the local
+        # machine by default.
+        static_configs:
+          - targets: ['localhost:9100']
+  alerting_rules:
+    # groups:
+    #   - name: ExampleRedisGroup
+    #     rules:
+    #       - alert: ExampleRedisDown
+    #         expr: redis_up{} == 0
+    #         for: 2m
+    #         labels:
+    #           severity: critical
+    #         annotations:
+    #           summary: "Redis instance down"
+    #           description: "Whatever"
diff --git a/ansible/roles/prometheus_server/tasks/main.yml b/ansible/roles/prometheus_server/tasks/main.yml
index e4052c1..56709fe 100644
--- a/ansible/roles/prometheus_server/tasks/main.yml
+++ b/ansible/roles/prometheus_server/tasks/main.yml
@@ -1,5 +1,11 @@
 ---
+- name: Combine default and user settings, decrypt vault.
+  ansible.builtin.set_fact:
+    prometheus_settings: "{{ prometheus_default_settings |
+      ansible.builtin.combine(prometheus_settings, recursive=true) }}"
+  no_log: true
+
 - name: Install prometheus apt package.
   ansible.builtin.apt:
     name: prometheus
diff --git a/ansible/roles/prometheus_server/templates/alerting_rules.yml.j2 b/ansible/roles/prometheus_server/templates/alerting_rules.yml.j2
index e9dce36..14bd116 100644
--- a/ansible/roles/prometheus_server/templates/alerting_rules.yml.j2
+++ b/ansible/roles/prometheus_server/templates/alerting_rules.yml.j2
@@ -1,4 +1,7 @@
 ---
 # Managed with Ansible
 
-{{ prometheus_alerting_rules | ansible.builtin.to_nice_yaml(indent=2) }}
+{{
+  prometheus_settings.alerting_rules |
+  ansible.builtin.to_nice_yaml(indent=2)
+}}
diff --git a/ansible/roles/prometheus_server/templates/prometheus.yml.j2 b/ansible/roles/prometheus_server/templates/prometheus.yml.j2
index 491e7e2..d482f9e 100644
--- a/ansible/roles/prometheus_server/templates/prometheus.yml.j2
+++ b/ansible/roles/prometheus_server/templates/prometheus.yml.j2
@@ -1,4 +1,7 @@
 ---
 # Managed with Ansible
 
-{{ prometheus_options | ansible.builtin.to_nice_yaml(indent=2) }}
+{{
+  prometheus_settings.config |
+  ansible.builtin.to_nice_yaml(indent=2, width=80)
+}}
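
A note on the merge the new set_fact relies on: ansible.builtin.combine with recursive=true deep-merges mappings, but lists are replaced wholesale (list_merge defaults to 'replace'), so the scrape_configs defined in group_vars take the place of the role's default scrape_configs rather than being appended to them, while defaults such as rule_files still come from prometheus_default_settings. The {{'{{'}} / {{'}}'}} pairs in the alert annotations render to literal {{ and }} so that Prometheus, not Ansible, expands $labels and $value. Below is a minimal, self-contained sketch of the merge behaviour; the playbook name, hosts, and target values are illustrative only and are not part of this patch.

# merge_demo.yml -- illustrative sketch, not part of the repository
- hosts: localhost
  gather_facts: false
  vars:
    prometheus_default_settings:
      config:
        global:
          scrape_interval: 15s
        rule_files:
          - alerting_rules/*.yml
        scrape_configs:
          - job_name: prometheus
            static_configs:
              - targets: ['localhost:9090']
    prometheus_settings:
      config:
        scrape_configs:
          - job_name: node
            static_configs:
              - targets: ['192.168.0.10:9100']
  tasks:
    # Same filter call as in roles/prometheus_server/tasks/main.yml.
    - ansible.builtin.set_fact:
        prometheus_settings: "{{ prometheus_default_settings |
          ansible.builtin.combine(prometheus_settings, recursive=true) }}"
    # Prints global and rule_files from the defaults, but only the 'node'
    # scrape job: the list from prometheus_settings replaced the default list.
    - ansible.builtin.debug:
        var: prometheus_settings.config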