---
users:
  - name: admin
    password_hash: !vault |
      $ANSIBLE_VAULT;1.1;AES256
      30623138653735643561343061356531373430393662383764633038383238383837626636393432
      3138653539356430306266663864343563616332656131310a343632323363653665646363366437
      66643430626437333461656231303339656435346261336238313036306431396333643965666631
      3665393163623266320a373838313538626438623330393533353931336331623464613664633430
      32303734396634376431383936643431313561303864343930393363623130663236666636353637
      63613237383666656263316661333031643032323266636464313839653065316138343035346161
      64313037336666353136383462333832373031623637636630326330313832333265386632343139
      30306638356434376635346637346134653064613236326333656566383137353166393063333563
      32623638343263313463313062303465626439356461613235656661623364656138
    ssh_public_keys:
      - "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDKNzJdo6/c7uXrg0lqVwyXOhcNxO/BnylyJeqoBe4rAO5fhjwWLsvMAeCEmYa/3i8ITSvurFEou7BELo25vM58dNfGQHig52LrA/GU/jwDAhHyTXP3AvqqgIFa0ysMaHasYny6oqXi+eb2w/KimtgOhe5/oUdNBe/KgqZ+hP3qlTchxBl5MEzZIKgXTXQeYJpYYrnFb0l/R8qSkFBJv2xzxVJxEamN71SG7OIsi9m14D6hd2pNDHDDqHgKBVbN5irxDuJAzHN5upzfziXiYCOusud23tX6/nNv8t03CbB7FW0OxaCGhAjbavTFAf164L9GM7j76BGsLwWSh2HhG9G9lKs2bEI3IQudllMc6p9N6j2FhMOCKK6YYekdAOVc3ozTFc73VLkXtN8pnTC8OCSavthSt5jOUd0qTsQGH91lWlEkVe0bWi+s9nggfeWFM7HMVmqsR1jYlOXoi5s7xYwKLUdeUjRk3/rkzIFoOxquE5sVVuNDRNCaqcpPVY4k0gE= openpgp:0x8880F3E0"
      - "ssh-ed25519 \
         AAAAC3NzaC1lZDI1NTE5AAAAIJRnXU2My2iMXl1yCIEoASZYAUW0q1qn3P5tSUI0B0+4 \
         openpgp:0xAD2BFD7F"
    opendoas_settings: "permit persist admin as root"
  - name: ansible
    password_hash: ""
    ssh_public_keys:
      - "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDKNzJdo6/c7uXrg0lqVwyXOhcNxO/BnylyJeqoBe4rAO5fhjwWLsvMAeCEmYa/3i8ITSvurFEou7BELo25vM58dNfGQHig52LrA/GU/jwDAhHyTXP3AvqqgIFa0ysMaHasYny6oqXi+eb2w/KimtgOhe5/oUdNBe/KgqZ+hP3qlTchxBl5MEzZIKgXTXQeYJpYYrnFb0l/R8qSkFBJv2xzxVJxEamN71SG7OIsi9m14D6hd2pNDHDDqHgKBVbN5irxDuJAzHN5upzfziXiYCOusud23tX6/nNv8t03CbB7FW0OxaCGhAjbavTFAf164L9GM7j76BGsLwWSh2HhG9G9lKs2bEI3IQudllMc6p9N6j2FhMOCKK6YYekdAOVc3ozTFc73VLkXtN8pnTC8OCSavthSt5jOUd0qTsQGH91lWlEkVe0bWi+s9nggfeWFM7HMVmqsR1jYlOXoi5s7xYwKLUdeUjRk3/rkzIFoOxquE5sVVuNDRNCaqcpPVY4k0gE= openpgp:0x8880F3E0"
      - "ssh-ed25519 \
         AAAAC3NzaC1lZDI1NTE5AAAAIJRnXU2My2iMXl1yCIEoASZYAUW0q1qn3P5tSUI0B0+4 \
         openpgp:0xAD2BFD7F"
    opendoas_settings: "permit nopass ansible"

prometheus_options:
  global:
  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - 192.168.0.252:9093
  rule_files:
    - alerting_rules/*.yml
  scrape_configs:
    - job_name: prometheus
      static_configs:
        - targets:
            - 192.168.0.252:9090
    - job_name: node
      static_configs:
        - targets: # main-page
            - 192.168.0.10:9100
          labels:
            env: dev
            hostname: main-page
        - targets: # searxng
            - 192.168.0.15:9100
          labels:
            env: dev
            hostname: searxng
        - targets: # forgejo
            - 192.168.0.20:9100
          labels:
            env: dev
            hostname: forgejo
        - targets: # forgejo-runner
            - 192.168.0.21:9100
          labels:
            env: dev
            hostname: forgejo-runner
        - targets: # bastion
            - 192.168.0.254:9100
          labels:
            env: common
            hostname: bastion
        - targets: # load-balancer
            - 192.168.0.253:9100
          labels:
            env: common
            hostname: load-balancer
        - targets: # monitoring
            - 192.168.0.252:9100
          labels:
            env: common
            hostname: monitoring
    - job_name: nginx
      static_configs:
        - targets: # load-balancer
            - 192.168.0.253:9113
          labels:
            env: common
            hostname: load-balancer

prometheus_alertmanager_options:
  global:
    smtp_smarthost: mail.cuqmbr.xyz:587
    smtp_require_tls: true
    smtp_from: '"Homelab Alertmanager" <no-reply@cuqmbr.xyz>'
    smtp_auth_username: no-reply
    smtp_auth_password: !vault |
      $ANSIBLE_VAULT;1.1;AES256
      31393866316539633838303936366464613935393933333338336531656239333361653664346637
      3665316532336339633432303036626339363239343065630a326361306233656632653134643966
      39663138303439323636666665653364396132333532383463626337653061356461643734336363
      6266353533656566330a346536333836356131343832616631666330653462613436313062643330
      61616664646439643839366630396137616533393664323965366630363566333632
  templates:
    - /etc/prometheus/alertmanager_templates/*.tmpl
  route:
    group_by:
      - env
      - hostname
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 1d
    receiver: default
  receivers:
    - name: default
      email_configs:
        - to: notifications@cuqmbr.xyz

prometheus_alerting_rules:
  groups:
    - name: DefaultMetrics
      rules:
        - alert: HostOutOfMemory
          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
        - alert: HostMemoryIsUnderutilized
          expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
          for: 0m
          labels:
            severity: info
          annotations:
            summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        # Please add ignored mountpoints in node_exporter parameters like
        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
        # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
        - alert: HostOutOfDiskSpace
          expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostOutOfInodes
          expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostHighCpuLoad
          expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "CPU load is > 90%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
        - alert: HostCpuIsUnderutilized
          expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
          for: 1w
          labels:
            severity: info
          annotations:
            summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostCpuHighIowait
          expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostSwapIsFillingUp
          expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        # - alert: HostSystemdServiceCrashed
        #   expr: (node_systemd_unit_state{state="failed"} == 1)
        #   for: 0m
        #   labels:
        #     severity: warning
        #   annotations:
        #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
        #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostOomKillDetected
          expr: (increase(node_vmstat_oom_kill[1m]) > 0)
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostClockSkew
          expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostClockNotSynchronising
          expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"

loki_settings:
  config:
    auth_enabled: false
    server:
      http_listen_port: 3100
      log_level: info
    pattern_ingester:
      enabled: true

fluentbit_settings:
  service:
    flush: 1
    daemon: false
    log_level: info
    http_server: false
  pipeline:
    inputs:
      - name: systemd
        tag: systemd
    outputs:
      - name: loki
        host: 192.168.0.252
        labels: env=common,hostname=monitoring,service_name=systemd
        match: systemd
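
# The !vault blocks above hold secrets encrypted with Ansible Vault. As an
# illustrative sketch (not necessarily how these particular values were
# produced), a SHA-512 crypt hash for users[].password_hash can be generated
# with mkpasswd and then wrapped into an inline vault block with
# `ansible-vault encrypt_string`; the placeholder value below is hypothetical:
#
#   mkpasswd --method=sha-512
#   ansible-vault encrypt_string '<secret-value>' --name 'password_hash'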