change prometheus_server role variable structure

cuqmbr 2025-07-03 19:59:22 +03:00
parent 0363813807
commit 1b30ca221e
Signed by: cuqmbr
GPG Key ID: 1F62396D020F375C
5 changed files with 234 additions and 236 deletions

View File

@@ -28,77 +28,177 @@ users:
       openpgp:0xAD2BFD7F"
 opendoas_settings: "permit nopass ansible"
-prometheus_options:
-  global:
-
-  alerting:
-    alertmanagers:
-      - static_configs:
-          - targets:
-              - 192.168.0.252:9093
-
-  rule_files:
-    - alerting_rules/*.yml
-
-  scrape_configs:
-    - job_name: prometheus
-      static_configs:
-        - targets:
-            - 192.168.0.252:9090
-
-    - job_name: node
-      static_configs:
-        - targets:
-            # main-page
-            - 192.168.0.10:9100
-          labels:
-            env: dev
-            hostname: main-page
-        - targets:
-            # searxng
-            - 192.168.0.15:9100
-          labels:
-            env: dev
-            hostname: searxng
-        - targets:
-            # forgejo
-            - 192.168.0.20:9100
-          labels:
-            env: dev
-            hostname: forgejo
-        - targets:
-            # forgejo
-            - 192.168.0.21:9100
-          labels:
-            env: dev
-            hostname: forgejo-runner
-        - targets:
-            # bastion
-            - 192.168.0.254:9100
-          labels:
-            env: common
-            hostname: bastion
-        - targets:
-            # load-balancer
-            - 192.168.0.253:9100
-          labels:
-            env: common
-            hostname: load-balancer
-        - targets:
-            # monitoring
-            - 192.168.0.252:9100
-          labels:
-            env: common
-            hostname: monitoring
-
-    - job_name: nginx
-      static_configs:
-        - targets:
-            # load-balancer
-            - 192.168.0.253:9113
-          labels:
-            env: common
-            hostname: monitoring
+prometheus_settings:
+  config:
+    global:
+    alerting:
+      alertmanagers:
+        - static_configs:
+            - targets:
+                - 192.168.0.252:9093
+    scrape_configs:
+      - job_name: prometheus
+        static_configs:
+          - targets:
+              - 192.168.0.252:9090
+      - job_name: node
+        static_configs:
+          - targets:
+              # main-page
+              - 192.168.0.10:9100
+            labels:
+              env: dev
+              hostname: main-page
+          - targets:
+              # searxng
+              - 192.168.0.15:9100
+            labels:
+              env: dev
+              hostname: searxng
+          - targets:
+              # forgejo
+              - 192.168.0.20:9100
+            labels:
+              env: dev
+              hostname: forgejo
+          - targets:
+              # forgejo
+              - 192.168.0.21:9100
+            labels:
+              env: dev
+              hostname: forgejo-runner
+          - targets:
+              # bastion
+              - 192.168.0.254:9100
+            labels:
+              env: common
+              hostname: bastion
+          - targets:
+              # load-balancer
+              - 192.168.0.253:9100
+            labels:
+              env: common
+              hostname: load-balancer
+          - targets:
+              # monitoring
+              - 192.168.0.252:9100
+            labels:
+              env: common
+              hostname: monitoring
+      - job_name: nginx
+        static_configs:
+          - targets:
+              # load-balancer
+              - 192.168.0.253:9113
+            labels:
+              env: common
+              hostname: monitoring
+  alerting_rules:
+    groups:
+      - name: DefaultMetrics
+        rules:
+          - alert: HostOutOfMemory
+            expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+          - alert: HostMemoryIsUnderutilized
+            expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
+            for: 0m
+            labels:
+              severity: info
+            annotations:
+              summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # Please add ignored mountpoints in node_exporter parameters like
+          # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+          # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+          - alert: HostOutOfDiskSpace
+            expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostOutOfInodes
+            expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostHighCpuLoad
+            expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+          - alert: HostCpuIsUnderutilized
+            expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
+            for: 1w
+            labels:
+              severity: info
+            annotations:
+              summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostCpuHighIowait
+            expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostSwapIsFillingUp
+            expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          # - alert: HostSystemdServiceCrashed
+          #   expr: (node_systemd_unit_state{state="failed"} == 1)
+          #   for: 0m
+          #   labels:
+          #     severity: warning
+          #   annotations:
+          #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
+          #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostOomKillDetected
+            expr: (increase(node_vmstat_oom_kill[1m]) > 0)
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostClockSkew
+            expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
+          - alert: HostClockNotSynchronising
+            expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
+              description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
@@ -135,114 +235,6 @@ prometheus_alertmanager_options:
       - to: notifications@cuqmbr.xyz
-
-prometheus_alerting_rules:
-  groups:
-    - name: DefaultMetrics
-      rules:
-        - alert: HostOutOfMemory
-          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
-        - alert: HostMemoryIsUnderutilized
-          expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
-          for: 0m
-          labels:
-            severity: info
-          annotations:
-            summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # Please add ignored mountpoints in node_exporter parameters like
-        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
-        # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
-        - alert: HostOutOfDiskSpace
-          expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostOutOfInodes
-          expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostHighCpuLoad
-          expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
-          for: 10m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU load is > 80%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
-        - alert: HostCpuIsUnderutilized
-          expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
-          for: 1w
-          labels:
-            severity: info
-          annotations:
-            summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostCpuHighIowait
-          expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
-          for: 0m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostSwapIsFillingUp
-          expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        # - alert: HostSystemdServiceCrashed
-        #   expr: (node_systemd_unit_state{state="failed"} == 1)
-        #   for: 0m
-        #   labels:
-        #     severity: warning
-        #   annotations:
-        #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
-        #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostOomKillDetected
-          expr: (increase(node_vmstat_oom_kill[1m]) > 0)
-          for: 0m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostClockSkew
-          expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
-          for: 10m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
-        - alert: HostClockNotSynchronising
-          expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
-          for: 2m
-          labels:
-            severity: warning
-          annotations:
-            summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
-            description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
 loki_settings:
   config:
     auth_enabled: false
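
A note on the {{'{{'}} ... {{'}}'}} sequences in the alert annotations above: these vars are rendered through Ansible's Jinja2 templating, so the Prometheus template delimiters have to be escaped to reach the generated rule file as literal braces. A minimal sketch of the mechanism (the demo_summary variable is purely illustrative):

# {{'{{'}} and {{'}}'}} are Jinja2 expressions that evaluate to the literal
# strings "{{" and "}}", so Ansible does not try to resolve $labels itself.
demo_summary: "Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})"

# After templating, the generated rule contains plain Prometheus syntax:
#   summary: Host out of memory (instance {{ $labels.instance }})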

View File

@@ -1,59 +1,53 @@
 ---
-prometheus_options:
-  global:
-    # Set the scrape interval to every 15 seconds. Default is every 1 minute.
-    scrape_interval: 15s
-    # Evaluate rules every 15 seconds. The default is every 1 minute.
-    evaluation_interval: 15s
-    # scrape_timeout is set to the global default (10s).
-    # Attach these labels to any time series or alerts when communicating with
-    # external systems (federation, remote storage, Alertmanager).
-    external_labels:
-      monitor: 'example'
-
-  # Alertmanager configuration
-  alerting:
-    alertmanagers:
-      - static_configs:
-          - targets: ['localhost:9093']
-
-  # Load rules and evaluate them according to the global 'evaluation_interval'.
-  rule_files:
-    - alerting_rules/*.yml
-
-  # A scrape configuration containing exactly one endpoint to scrape:
-  # Here it's Prometheus itself.
-  scrape_configs:
-    # The job name is added as a label `job=<job_name>`.
-    - job_name: 'prometheus'
-
-      # Override the global default and scrape targets from this job.
-      scrape_interval: 5s
-      scrape_timeout: 5s
-
-      # metrics_path defaults to '/metrics'
-      # scheme defaults to 'http'.
-
-      static_configs:
-        - targets: ['localhost:9090']
-
-    - job_name: node
-      # If prometheus-node-exporter is installed, grab stats about the local
-      # machine by default.
-      static_configs:
-        - targets: ['localhost:9100']
-
-prometheus_alerting_rules:
-  groups:
-    - name: ExampleRedisGroup
-      rules:
-        - alert: ExampleRedisDown
-          expr: redis_up{} == 0
-          for: 2m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Redis instance down"
-            description: "Whatever"
+prometheus_settings:
+
+prometheus_default_settings:
+  config:
+    global:
+      # Set the scrape interval to every 15 seconds. Default is every 1 minute.
+      scrape_interval: 15s
+      # Evaluate rules every 15 seconds. The default is every 1 minute.
+      evaluation_interval: 15s
+      # scrape_timeout is set to the global default (10s).
+      # Attach these labels to any time series or alerts when communicating with
+      # external systems (federation, remote storage, Alertmanager).
+      # external_labels:
+      #   monitor: 'example'
+    # Alertmanager configuration
+    # alerting:
+    #   alertmanagers:
+    #     - static_configs:
+    #         - targets: ['localhost:9093']
+    # Load rules and evaluate them according to the global 'evaluation_interval'
+    rule_files:
+      - alerting_rules/*.yml
+    # A scrape configuration containing exactly one endpoint to scrape:
+    # Here it's Prometheus itself.
+    scrape_configs:
+      # The job name is added as a label `job=<job_name>`.
+      - job_name: 'prometheus'
+        # Override the global default and scrape targets from this job.
+        scrape_interval: 5s
+        scrape_timeout: 5s
+        # metrics_path defaults to '/metrics'
+        # scheme defaults to 'http'.
+        static_configs:
+          - targets: ['localhost:9090']
+      - job_name: node
+        # If prometheus-node-exporter is installed, grab stats about the local
+        # machine by default.
+        static_configs:
+          - targets: ['localhost:9100']
+  alerting_rules:
+    # groups:
+    #   - name: ExampleRedisGroup
+    #     rules:
+    #       - alert: ExampleRedisDown
+    #         expr: redis_up{} == 0
+    #         for: 2m
+    #         labels:
+    #           severity: critical
+    #         annotations:
+    #           summary: "Redis instance down"
+    #           description: "Whatever"

View File

@@ -1,5 +1,11 @@
 ---
+- name: Combine default and user settings, decrypt vault.
+  ansible.builtin.set_fact:
+    prometheus_settings: "{{ prometheus_default_settings |
+      ansible.builtin.combine(prometheus_settings, recursive=true) }}"
+  no_log: true
+
 - name: Install prometheus apt package.
   ansible.builtin.apt:
     name: prometheus
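
For context, the new set_fact task deep-merges the role defaults with the host- or group-level prometheus_settings: with combine(..., recursive=true) the user-supplied keys take precedence over prometheus_default_settings, while lists such as scrape_configs are replaced rather than appended (combine's default list_merge=replace). A throwaway playbook sketching the merge semantics, with illustrative values only:

- hosts: localhost
  gather_facts: false
  vars:
    prometheus_default_settings:
      config:
        global:
          scrape_interval: 15s
        rule_files:
          - alerting_rules/*.yml
    prometheus_settings:
      config:
        global:
          scrape_interval: 30s
  tasks:
    - name: Show the merged result (override wins, untouched defaults are kept).
      ansible.builtin.debug:
        msg: "{{ prometheus_default_settings |
          ansible.builtin.combine(prometheus_settings, recursive=true) }}"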

View File

@@ -1,4 +1,7 @@
 ---
 # Managed with Ansible
-{{ prometheus_alerting_rules | ansible.builtin.to_nice_yaml(indent=2) }}
+{{
+  prometheus_settings.alerting_rules |
+  ansible.builtin.to_nice_yaml(indent=2)
+}}

View File

@@ -1,4 +1,7 @@
 ---
 # Managed with Ansible
-{{ prometheus_options | ansible.builtin.to_nice_yaml(indent=2) }}
+{{
+  prometheus_settings.config |
+  ansible.builtin.to_nice_yaml(indent=2, width=80)
+}}
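
For a sense of the end result: to_nice_yaml dumps the merged prometheus_settings.config dictionary back out as plain YAML, so the generated Prometheus configuration ends up looking roughly like the following. This is a heavily abridged sketch based on the host vars above combined with the role defaults; key order comes from to_nice_yaml, which sorts keys alphabetically, and the exact indentation may differ:

# Managed with Ansible
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.0.252:9093
rule_files:
  - alerting_rules/*.yml
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - 192.168.0.252:9090
  # ...remaining jobs (node, nginx) from the host vars above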