
change prometheus_server role variable structure

cuqmbr 2025-07-03 19:59:22 +03:00
parent 0363813807
commit 1b30ca221e
Signed by: cuqmbr
GPG Key ID: 1F62396D020F375C
5 changed files with 234 additions and 236 deletions
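
In short: the two separate top-level variables, prometheus_options (the server configuration) and prometheus_alerting_rules (the alert definitions), are folded into a single prometheus_settings dictionary. A minimal sketch of the new shape, with values abbreviated:

    prometheus_settings:
      config:                  # rendered into prometheus.yml
        global: {}
        alerting: {}
        rule_files:
          - alerting_rules/*.yml
        scrape_configs: []
      alerting_rules:          # rendered into the alerting rules file
        groups: []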


@@ -28,77 +28,177 @@ users:
openpgp:0xAD2BFD7F"
opendoas_settings: "permit nopass ansible"
prometheus_options:
global:
prometheus_settings:
config:
global:
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.0.252:9093
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- 192.168.0.252:9090
- job_name: node
static_configs:
- targets:
# main-page
- 192.168.0.10:9100
labels:
env: dev
hostname: main-page
- targets:
# searxng
- 192.168.0.15:9100
labels:
env: dev
hostname: searxng
- targets:
# forgejo
- 192.168.0.20:9100
labels:
env: dev
hostname: forgejo
- targets:
# forgejo
- 192.168.0.21:9100
labels:
env: dev
hostname: forgejo-runner
- targets:
# bastion
- 192.168.0.254:9100
labels:
env: common
hostname: bastion
- targets:
# load-balancer
- 192.168.0.253:9100
labels:
env: common
hostname: load-balancer
- targets:
# monitoring
- 192.168.0.252:9100
labels:
env: common
hostname: monitoring
- job_name: nginx
static_configs:
- targets:
# load-balancer
- 192.168.0.253:9113
labels:
env: common
hostname: load-balancer
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.0.252:9093
rule_files:
- alerting_rules/*.yml
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- 192.168.0.252:9090
- job_name: node
static_configs:
- targets:
# main-page
- 192.168.0.10:9100
labels:
env: dev
hostname: main-page
- targets:
# searxng
- 192.168.0.15:9100
labels:
env: dev
hostname: searxng
- targets:
# forgejo
- 192.168.0.20:9100
labels:
env: dev
hostname: forgejo
- targets:
# forgejo
- 192.168.0.21:9100
labels:
env: dev
hostname: forgejo-runner
- targets:
# bastion
- 192.168.0.254:9100
labels:
env: common
hostname: bastion
- targets:
# load-balancer
- 192.168.0.253:9100
labels:
env: common
hostname: load-balancer
- targets:
# monitoring
- 192.168.0.252:9100
labels:
env: common
hostname: monitoring
- job_name: nginx
static_configs:
- targets:
# load-balancer
- 192.168.0.253:9113
labels:
env: common
hostname: load-balancer
alerting_rules:
groups:
- name: DefaultMetrics
rules:
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostOutOfInodes
expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
severity: critical
annotations:
summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostHighCpuLoad
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized
expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostCpuHighIowait
expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# - alert: HostSystemdServiceCrashed
# expr: (node_systemd_unit_state{state="failed"} == 1)
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
# description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0)
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
@@ -135,114 +235,6 @@ prometheus_alertmanager_options:
- to: notifications@cuqmbr.xyz
prometheus_alerting_rules:
groups:
- name: DefaultMetrics
rules:
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostOutOfInodes
expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
severity: critical
annotations:
summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostHighCpuLoad
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized
expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostCpuHighIowait
expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# - alert: HostSystemdServiceCrashed
# expr: (node_systemd_unit_state{state="failed"} == 1)
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
# description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0)
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
- alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
loki_settings:
config:
auth_enabled: false


@@ -1,59 +1,53 @@
---
prometheus_options:
global:
# Set the scrape interval to every 15 seconds. Default is every 1 minute.
scrape_interval: 15s
# Evaluate rules every 15 seconds. The default is every 1 minute.
evaluation_interval: 15s
# scrape_timeout is set to the global default (10s).
prometheus_settings:
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'example'
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
# Load rules and evaluate them according to the global 'evaluation_interval'.
rule_files:
- alerting_rules/*.yml
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>`.
- job_name: 'prometheus'
# Override the global default and scrape targets from this job.
scrape_interval: 5s
scrape_timeout: 5s
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
- job_name: node
# If prometheus-node-exporter is installed, grab stats about the local
# machine by default.
static_configs:
- targets: ['localhost:9100']
prometheus_alerting_rules:
groups:
- name: ExampleRedisGroup
rules:
- alert: ExampleRedisDown
expr: redis_up{} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Redis instance down"
description: "Whatever"
prometheus_default_settings:
config:
global:
# Set the scrape interval to every 15 seconds. Default is every 1 minute.
scrape_interval: 15s
# Evaluate rules every 15 seconds. The default is every 1 minute.
evaluation_interval: 15s
# scrape_timeout is set to the global default (10s).
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
# external_labels:
# monitor: 'example'
# Alertmanager configuration
# alerting:
# alertmanagers:
# - static_configs:
# - targets: ['localhost:9093']
# Load rules and evaluate them according to the global 'evaluation_interval'
rule_files:
- alerting_rules/*.yml
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>`.
- job_name: 'prometheus'
# Override the global default and scrape targets from this job.
scrape_interval: 5s
scrape_timeout: 5s
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
- job_name: node
# If prometheus-node-exporter is installed, grab stats about the local
# machine by default.
static_configs:
- targets: ['localhost:9100']
alerting_rules:
# groups:
# - name: ExampleRedisGroup
# rules:
# - alert: ExampleRedisDown
# expr: redis_up{} == 0
# for: 2m
# labels:
# severity: critical
# annotations:
# summary: "Redis instance down"
# description: "Whatever"
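
With these defaults in place, a consumer of the role only declares the keys it wants to change; the recursive combine in the tasks file (next diff) fills in everything else. A hypothetical group_vars override, assuming an Alertmanager reachable at 192.0.2.10:9093:

    prometheus_settings:
      config:
        alerting:
          alertmanagers:
            - static_configs:
                - targets:
                    - 192.0.2.10:9093

Scrape intervals, rule_files and the default prometheus/node jobs all fall through from prometheus_default_settings. Note that lists such as scrape_configs are replaced wholesale rather than merged, so overriding one means restating the full list.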


@@ -1,5 +1,11 @@
---
- name: Combine default and user settings, decrypt vault.
ansible.builtin.set_fact:
prometheus_settings: "{{ prometheus_default_settings |
ansible.builtin.combine(prometheus_settings, recursive=true) }}"
no_log: true
- name: Install prometheus apt package.
ansible.builtin.apt:
name: prometheus
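
ansible.builtin.combine with recursive=true merges nested dictionaries key by key: values from prometheus_settings (the right-hand argument) win, and keys absent there keep their defaults. A sketch with made-up values:

    # defaults:
    #   prometheus_default_settings: {config: {global: {scrape_interval: 15s, evaluation_interval: 15s}}}
    # user group_vars:
    #   prometheus_settings: {config: {global: {scrape_interval: 30s}}}
    # combined fact:
    #   prometheus_settings: {config: {global: {scrape_interval: 30s, evaluation_interval: 15s}}}

The no_log: true keeps any vault-encrypted values inside the settings out of the play output.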


@@ -1,4 +1,7 @@
---
# Managed with Ansible
{{ prometheus_alerting_rules | ansible.builtin.to_nice_yaml(indent=2) }}
{{
prometheus_settings.alerting_rules |
ansible.builtin.to_nice_yaml(indent=2)
}}
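
A malformed rules file only surfaces when Prometheus loads it, so it can be worth validating the rendered output at deploy time. A sketch using the template module's validate parameter together with promtool; the file names here are hypothetical, not taken from the role:

    - name: Template alerting rules, validating before install.
      ansible.builtin.template:
        src: alerting_rules.yml.j2                        # hypothetical
        dest: /etc/prometheus/alerting_rules/ansible.yml  # hypothetical
        validate: promtool check rules %s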


@@ -1,4 +1,7 @@
---
# Managed with Ansible
{{ prometheus_options | ansible.builtin.to_nice_yaml(indent=2) }}
{{
prometheus_settings.config |
ansible.builtin.to_nice_yaml(indent=2, width=80)
}}
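
For reference, to_nice_yaml(indent=2, width=80) serializes the dictionary back into plain YAML, wrapping long lines at 80 columns. Given a made-up value such as

    prometheus_settings:
      config:
        global:
          scrape_interval: 15s

the rendered prometheus.yml would begin

    ---
    # Managed with Ansible
    global:
      scrape_interval: 15s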