
change prometheus_server role variable structure

cuqmbr 2025-07-03 19:59:22 +03:00
parent 0363813807
commit 1b30ca221e
Signed by: cuqmbr
GPG Key ID: 1F62396D020F375C
5 changed files with 234 additions and 236 deletions
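
In short: the two separate top-level variables, prometheus_options (the server configuration) and prometheus_alerting_rules (the alert definitions), are folded into a single prometheus_settings dictionary. A minimal sketch of the new shape, with values abbreviated:

    prometheus_settings:
      config:                  # rendered into prometheus.yml
        global: {}
        alerting: {}
        rule_files:
          - alerting_rules/*.yml
        scrape_configs: []
      alerting_rules:          # rendered into the alerting rules file
        groups: []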


@@ -28,77 +28,177 @@ users:
openpgp:0xAD2BFD7F"
opendoas_settings: "permit nopass ansible"
prometheus_options:
global:
prometheus_settings:
config:
global:
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.0.252:9093
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- 192.168.0.252:9090
- job_name: node
static_configs:
- targets:
# main-page
- 192.168.0.10:9100
labels:
env: dev
hostname: main-page
- targets:
# searxng
- 192.168.0.15:9100
labels:
env: dev
hostname: searxng
- targets:
# forgejo
- 192.168.0.20:9100
labels:
env: dev
hostname: forgejo
- targets:
# forgejo
- 192.168.0.21:9100
labels:
env: dev
hostname: forgejo-runner
- targets:
# bastion
- 192.168.0.254:9100
labels:
env: common
hostname: bastion
- targets:
# load-balancer
- 192.168.0.253:9100
labels:
env: common
hostname: load-balancer
- targets:
# monitoring
- 192.168.0.252:9100
labels:
env: common
hostname: monitoring
- job_name: nginx
static_configs:
- targets:
# load-balancer
- 192.168.0.253:9113
labels:
env: common
hostname: load-balancer
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.0.252:9093
rule_files:
- alerting_rules/*.yml
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- 192.168.0.252:9090
- job_name: node
static_configs:
- targets:
# main-page
- 192.168.0.10:9100
labels:
env: dev
hostname: main-page
- targets:
# searxng
- 192.168.0.15:9100
labels:
env: dev
hostname: searxng
- targets:
# forgejo
- 192.168.0.20:9100
labels:
env: dev
hostname: forgejo
- targets:
# forgejo
- 192.168.0.21:9100
labels:
env: dev
hostname: forgejo-runner
- targets:
# bastion
- 192.168.0.254:9100
labels:
env: common
hostname: bastion
- targets:
# load-balancer
- 192.168.0.253:9100
labels:
env: common
hostname: load-balancer
- targets:
# monitoring
- 192.168.0.252:9100
labels:
env: common
hostname: monitoring
- job_name: nginx
static_configs:
- targets:
# load-balancer
- 192.168.0.253:9113
labels:
env: common
hostname: load-balancer
alerting_rules:
groups:
- name: DefaultMetrics
rules:
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostOutOfInodes
expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
severity: critical
annotations:
summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostHighCpuLoad
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized
expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostCpuHighIowait
expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# - alert: HostSystemdServiceCrashed
# expr: (node_systemd_unit_state{state="failed"} == 1)
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
# description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0)
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
@@ -135,114 +235,6 @@ prometheus_alertmanager_options:
- to: notifications@cuqmbr.xyz
prometheus_alerting_rules:
groups:
- name: DefaultMetrics
rules:
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostOutOfInodes
expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
severity: critical
annotations:
summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostHighCpuLoad
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized
expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostCpuHighIowait
expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# - alert: HostSystemdServiceCrashed
# expr: (node_systemd_unit_state{state="failed"} == 1)
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
# description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0)
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
loki_settings:
config:
auth_enabled: false


@@ -1,59 +1,53 @@
---
prometheus_options:
global:
# Set the scrape interval to every 15 seconds. Default is every 1 minute.
scrape_interval: 15s
# Evaluate rules every 15 seconds. The default is every 1 minute.
evaluation_interval: 15s
# scrape_timeout is set to the global default (10s).
prometheus_settings:
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'example'
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
# Load rules and evaluate them according to the global 'evaluation_interval'.
rule_files:
- alerting_rules/*.yml
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>`.
- job_name: 'prometheus'
# Override the global default and scrape targets from this job.
scrape_interval: 5s
scrape_timeout: 5s
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
- job_name: node
# If prometheus-node-exporter is installed, grab stats about the local
# machine by default.
static_configs:
- targets: ['localhost:9100']
prometheus_alerting_rules:
groups:
- name: ExampleRedisGroup
rules:
- alert: ExampleRedisDown
expr: redis_up{} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Redis instance down"
description: "Whatever"
prometheus_default_settings:
config:
global:
# Set the scrape interval to every 15 seconds. Default is every 1 minute.
scrape_interval: 15s
# Evaluate rules every 15 seconds. The default is every 1 minute.
evaluation_interval: 15s
# scrape_timeout is set to the global default (10s).
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
# external_labels:
# monitor: 'example'
# Alertmanager configuration
# alerting:
# alertmanagers:
# - static_configs:
# - targets: ['localhost:9093']
# Load rules and evaluate them according to the global 'evaluation_interval'
rule_files:
- alerting_rules/*.yml
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>`.
- job_name: 'prometheus'
# Override the global default and scrape targets from this job.
scrape_interval: 5s
scrape_timeout: 5s
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
- job_name: node
# If prometheus-node-exporter is installed, grab stats about the local
# machine by default.
static_configs:
- targets: ['localhost:9100']
alerting_rules:
# groups:
# - name: ExampleRedisGroup
# rules:
# - alert: ExampleRedisDown
# expr: redis_up{} == 0
# for: 2m
# labels:
# severity: critical
# annotations:
# summary: "Redis instance down"
# description: "Whatever"
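
With these defaults in place, a consumer of the role only declares the keys it wants to change; the recursive combine in the tasks file (next diff) fills in everything else. A hypothetical group_vars override, assuming an Alertmanager reachable at 192.0.2.10:9093:

    prometheus_settings:
      config:
        alerting:
          alertmanagers:
            - static_configs:
                - targets:
                    - 192.0.2.10:9093

Scrape intervals, rule_files and the default prometheus/node jobs all fall through from prometheus_default_settings. Note that lists such as scrape_configs are replaced wholesale rather than merged, so overriding one means restating the full list.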


@@ -1,5 +1,11 @@
---
- name: Combine default and user settings, decrypt vault.
ansible.builtin.set_fact:
prometheus_settings: "{{ prometheus_default_settings |
ansible.builtin.combine(prometheus_settings, recursive=true) }}"
no_log: true
- name: Install prometheus apt package.
ansible.builtin.apt:
name: prometheus
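
ansible.builtin.combine with recursive=true merges nested dictionaries key by key: values from prometheus_settings (the right-hand argument) win, and keys absent there keep their defaults. A sketch with made-up values:

    # defaults:
    #   prometheus_default_settings: {config: {global: {scrape_interval: 15s, evaluation_interval: 15s}}}
    # user group_vars:
    #   prometheus_settings: {config: {global: {scrape_interval: 30s}}}
    # combined fact:
    #   prometheus_settings: {config: {global: {scrape_interval: 30s, evaluation_interval: 15s}}}

The no_log: true keeps any vault-encrypted values inside the settings out of the play output.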


@@ -1,4 +1,7 @@
---
# Managed with Ansible
{{ prometheus_alerting_rules | ansible.builtin.to_nice_yaml(indent=2) }}
{{
prometheus_settings.alerting_rules |
ansible.builtin.to_nice_yaml(indent=2)
}}
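
A malformed rules file only surfaces when Prometheus loads it, so it can be worth validating the rendered output at deploy time. A sketch using the template module's validate parameter together with promtool; the file names here are hypothetical, not taken from the role:

    - name: Template alerting rules, validating before install.
      ansible.builtin.template:
        src: alerting_rules.yml.j2                        # hypothetical
        dest: /etc/prometheus/alerting_rules/ansible.yml  # hypothetical
        validate: promtool check rules %s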


@@ -1,4 +1,7 @@
---
# Managed with Ansible
{{ prometheus_options | ansible.builtin.to_nice_yaml(indent=2) }}
{{
prometheus_settings.config |
ansible.builtin.to_nice_yaml(indent=2, width=80)
}}
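
For reference, to_nice_yaml(indent=2, width=80) serializes the dictionary back into plain YAML, wrapping long lines at 80 columns. Given a made-up value such as

    prometheus_settings:
      config:
        global:
          scrape_interval: 15s

the rendered prometheus.yml would begin

    ---
    # Managed with Ansible
    global:
      scrape_interval: 15s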