change prometheus_server role variable structure

cuqmbr 2025-07-03 19:59:22 +03:00
parent 0363813807
commit 1b30ca221e
Signed by: cuqmbr
GPG Key ID: 1F62396D020F375C
5 changed files with 234 additions and 236 deletions

View File

@@ -28,77 +28,177 @@ users:
       openpgp:0xAD2BFD7F"
 opendoas_settings: "permit nopass ansible"
-prometheus_options:
-  global:
-
-  alerting:
-    alertmanagers:
-      - static_configs:
-          - targets:
-              - 192.168.0.252:9093
-
-  rule_files:
-    - alerting_rules/*.yml
-
-  scrape_configs:
-    - job_name: prometheus
-      static_configs:
-        - targets:
-            - 192.168.0.252:9090
-
-    - job_name: node
-      static_configs:
-        - targets:
-            # main-page
-            - 192.168.0.10:9100
-          labels:
-            env: dev
-            hostname: main-page
-        - targets:
-            # searxng
-            - 192.168.0.15:9100
-          labels:
-            env: dev
-            hostname: searxng
-        - targets:
-            # forgejo
-            - 192.168.0.20:9100
-          labels:
-            env: dev
-            hostname: forgejo
-        - targets:
-            # forgejo
-            - 192.168.0.21:9100
-          labels:
-            env: dev
-            hostname: forgejo-runner
-        - targets:
-            # bastion
-            - 192.168.0.254:9100
-          labels:
-            env: common
-            hostname: bastion
-        - targets:
-            # load-balancer
-            - 192.168.0.253:9100
-          labels:
-            env: common
-            hostname: load-balancer
-        - targets:
-            # monitoring
-            - 192.168.0.252:9100
-          labels:
-            env: common
-            hostname: monitoring
-
-    - job_name: nginx
-      static_configs:
-        - targets:
-            # load-balancer
-            - 192.168.0.253:9113
-          labels:
-            env: common
-            hostname: monitoring
+prometheus_settings:
+  config:
+    global:
+    alerting:
+      alertmanagers:
+        - static_configs:
+            - targets:
+                - 192.168.0.252:9093
+    scrape_configs:
+      - job_name: prometheus
+        static_configs:
+          - targets:
+              - 192.168.0.252:9090
+      - job_name: node
+        static_configs:
+          - targets:
+              # main-page
+              - 192.168.0.10:9100
+            labels:
+              env: dev
+              hostname: main-page
+          - targets:
+              # searxng
+              - 192.168.0.15:9100
+            labels:
+              env: dev
+              hostname: searxng
+          - targets:
+              # forgejo
+              - 192.168.0.20:9100
+            labels:
+              env: dev
+              hostname: forgejo
+          - targets:
+              # forgejo
+              - 192.168.0.21:9100
+            labels:
+              env: dev
+              hostname: forgejo-runner
+          - targets:
+              # bastion
+              - 192.168.0.254:9100
+            labels:
+              env: common
+              hostname: bastion
+          - targets:
+              # load-balancer
+              - 192.168.0.253:9100
+            labels:
+              env: common
+              hostname: load-balancer
+          - targets:
+              # monitoring
+              - 192.168.0.252:9100
+            labels:
+              env: common
+              hostname: monitoring
+      - job_name: nginx
+        static_configs:
+          - targets:
+              # load-balancer
+              - 192.168.0.253:9113
+            labels:
+              env: common
+              hostname: monitoring
+  alerting_rules:
+    groups:
+      - name: DefaultMetrics
+        rules:
+          - alert: HostOutOfMemory
+            expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+          - alert: HostMemoryIsUnderutilized
+            expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
+            for: 0m
+            labels:
+              severity: info
+            annotations:
+              summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # Please add ignored mountpoints in node_exporter parameters like
+          # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+          # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+          - alert: HostOutOfDiskSpace
+            expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostOutOfInodes
+            expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostHighCpuLoad
+            expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+          - alert: HostCpuIsUnderutilized
+            expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
+            for: 1w
+            labels:
+              severity: info
+            annotations:
+              summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostCpuHighIowait
+            expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostSwapIsFillingUp
+            expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # - alert: HostSystemdServiceCrashed
+          #   expr: (node_systemd_unit_state{state="failed"} == 1)
+          #   for: 0m
+          #   labels:
+          #     severity: warning
+          #   annotations:
+          #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
+          #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostOomKillDetected
+            expr: (increase(node_vmstat_oom_kill[1m]) > 0)
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostClockSkew
+            expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostClockNotSynchronising
+            expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
@@ -135,114 +235,6 @@ prometheus_alertmanager_options:
       - to: notifications@cuqmbr.xyz
-
-prometheus_alerting_rules:
-  groups:
-    - name: DefaultMetrics
-      rules:
-        - alert: HostOutOfMemory
-          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
-        - alert: HostMemoryIsUnderutilized
-          expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
-          for: 0m
-          labels:
-            severity: info
-          annotations:
-            summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # Please add ignored mountpoints in node_exporter parameters like
-        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
-        # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
-        - alert: HostOutOfDiskSpace
-          expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostOutOfInodes
-          expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostHighCpuLoad
-          expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
-          for: 10m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
-        - alert: HostCpuIsUnderutilized
-          expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
-          for: 1w
-          labels:
-            severity: info
-          annotations:
-            summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostCpuHighIowait
-          expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
-          for: 0m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostSwapIsFillingUp
-          expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # - alert: HostSystemdServiceCrashed
-        #   expr: (node_systemd_unit_state{state="failed"} == 1)
-        #   for: 0m
-        #   labels:
-        #     severity: warning
-        #   annotations:
-        #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
-        #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostOomKillDetected
-          expr: (increase(node_vmstat_oom_kill[1m]) > 0)
-          for: 0m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostClockSkew
-          expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
-          for: 10m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostClockNotSynchronising
-          expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
 loki_settings:
   config:
     auth_enabled: false
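
A note on the {{'{{'}} ... {{'}}'}} sequences in the alert annotations above: these vars are rendered through Ansible's Jinja2 templating, so the Prometheus template delimiters have to be escaped to reach the generated rule file as literal braces. A minimal sketch of the mechanism (the demo_summary variable is purely illustrative):

# {{'{{'}} and {{'}}'}} are Jinja2 expressions that evaluate to the literal
# strings "{{" and "}}", so Ansible does not try to resolve $labels itself.
demo_summary: "Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})"

# After templating, the generated rule contains plain Prometheus syntax:
#   summary: Host out of memory (instance {{ $labels.instance }})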

View File

@@ -1,59 +1,53 @@
 ---
-prometheus_options:
-  global:
-    # Set the scrape interval to every 15 seconds. Default is every 1 minute.
-    scrape_interval: 15s
-    # Evaluate rules every 15 seconds. The default is every 1 minute.
-    evaluation_interval: 15s
-    # scrape_timeout is set to the global default (10s).
-    # Attach these labels to any time series or alerts when communicating with
-    # external systems (federation, remote storage, Alertmanager).
-    external_labels:
-      monitor: 'example'
-
-  # Alertmanager configuration
-  alerting:
-    alertmanagers:
-      - static_configs:
-          - targets: ['localhost:9093']
-
-  # Load rules and evaluate them according to the global 'evaluation_interval'.
-  rule_files:
-    - alerting_rules/*.yml
-
-  # A scrape configuration containing exactly one endpoint to scrape:
-  # Here it's Prometheus itself.
-  scrape_configs:
-    # The job name is added as a label `job=<job_name>`.
-    - job_name: 'prometheus'
-
-      # Override the global default and scrape targets from this job.
-      scrape_interval: 5s
-      scrape_timeout: 5s
-
-      # metrics_path defaults to '/metrics'
-      # scheme defaults to 'http'.
-
-      static_configs:
-        - targets: ['localhost:9090']
-
-    - job_name: node
-      # If prometheus-node-exporter is installed, grab stats about the local
-      # machine by default.
-      static_configs:
-        - targets: ['localhost:9100']
-
-prometheus_alerting_rules:
-  groups:
-    - name: ExampleRedisGroup
-      rules:
-        - alert: ExampleRedisDown
-          expr: redis_up{} == 0
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Redis instance down"
-            description: "Whatever"
+prometheus_settings:
+
+prometheus_default_settings:
+  config:
+    global:
+      # Set the scrape interval to every 15 seconds. Default is every 1 minute.
+      scrape_interval: 15s
+      # Evaluate rules every 15 seconds. The default is every 1 minute.
+      evaluation_interval: 15s
+      # scrape_timeout is set to the global default (10s).
+      # Attach these labels to any time series or alerts when communicating with
+      # external systems (federation, remote storage, Alertmanager).
+      # external_labels:
+      #   monitor: 'example'
+    # Alertmanager configuration
+    # alerting:
+    #   alertmanagers:
+    #     - static_configs:
+    #         - targets: ['localhost:9093']
+    # Load rules and evaluate them according to the global 'evaluation_interval'
+    rule_files:
+      - alerting_rules/*.yml
+    # A scrape configuration containing exactly one endpoint to scrape:
+    # Here it's Prometheus itself.
+    scrape_configs:
+      # The job name is added as a label `job=<job_name>`.
+      - job_name: 'prometheus'
+        # Override the global default and scrape targets from this job.
+        scrape_interval: 5s
+        scrape_timeout: 5s
+        # metrics_path defaults to '/metrics'
+        # scheme defaults to 'http'.
+        static_configs:
+          - targets: ['localhost:9090']
+      - job_name: node
+        # If prometheus-node-exporter is installed, grab stats about the local
+        # machine by default.
+        static_configs:
+          - targets: ['localhost:9100']
+  alerting_rules:
+    # groups:
+    #   - name: ExampleRedisGroup
+    #     rules:
+    #       - alert: ExampleRedisDown
+    #         expr: redis_up{} == 0
+    #         for: 2m
+    #         labels:
+    #           severity: critical
+    #         annotations:
+    #           summary: "Redis instance down"
+    #           description: "Whatever"

View File

@@ -1,5 +1,11 @@
 ---
+- name: Combine default and user settings, decrypt vault.
+  ansible.builtin.set_fact:
+    prometheus_settings: "{{ prometheus_default_settings |
+      ansible.builtin.combine(prometheus_settings, recursive=true) }}"
+  no_log: true
+
 - name: Install prometheus apt package.
   ansible.builtin.apt:
     name: prometheus
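
For context, the new set_fact task deep-merges the role defaults with the host- or group-level prometheus_settings: with combine(..., recursive=true) the user-supplied keys take precedence over prometheus_default_settings, while lists such as scrape_configs are replaced rather than appended (combine's default list_merge=replace). A throwaway playbook sketching the merge semantics, with illustrative values only:

- hosts: localhost
  gather_facts: false
  vars:
    prometheus_default_settings:
      config:
        global:
          scrape_interval: 15s
        rule_files:
          - alerting_rules/*.yml
    prometheus_settings:
      config:
        global:
          scrape_interval: 30s
  tasks:
    - name: Show the merged result (override wins, untouched defaults are kept).
      ansible.builtin.debug:
        msg: "{{ prometheus_default_settings |
          ansible.builtin.combine(prometheus_settings, recursive=true) }}"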

View File

@@ -1,4 +1,7 @@
 ---
 # Managed with Ansible
-{{ prometheus_alerting_rules | ansible.builtin.to_nice_yaml(indent=2) }}
+{{
+  prometheus_settings.alerting_rules |
+  ansible.builtin.to_nice_yaml(indent=2)
+}}

View File

@@ -1,4 +1,7 @@
 ---
 # Managed with Ansible
-{{ prometheus_options | ansible.builtin.to_nice_yaml(indent=2) }}
+{{
+  prometheus_settings.config |
+  ansible.builtin.to_nice_yaml(indent=2, width=80)
+}}
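
For a sense of the end result: to_nice_yaml dumps the merged prometheus_settings.config dictionary back out as plain YAML, so the generated Prometheus configuration ends up looking roughly like the following. This is a heavily abridged sketch based on the host vars above combined with the role defaults; key order comes from to_nice_yaml, which sorts keys alphabetically, and the exact indentation may differ:

# Managed with Ansible
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.0.252:9093
rule_files:
  - alerting_rules/*.yml
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - 192.168.0.252:9090
  # ...remaining jobs (node, nginx) from the host vars above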