1
0
homelab/ansible/inventories/dev/group_vars/monitoring.yml
2025-06-23 18:26:15 +03:00

279 lines
9.7 KiB
YAML

---
# Prometheus server configuration; keys mirror the upstream prometheus.yml
# schema and are rendered into the config file by the monitoring role.
prometheus_options:
  # Empty global section: Prometheus falls back to its built-in defaults
  # (scrape_interval, evaluation_interval, ...).
  global:
  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              # Alertmanager runs on the monitoring host itself.
              - "192.168.0.252:9093"
  rule_files:
    - "alerting_rules/*.yml"
  scrape_configs:
    # Self-scrape of the Prometheus server.
    - job_name: prometheus
      static_configs:
        - targets:
            - "192.168.0.252:9090"
    # node_exporter on every managed host, labelled by env + hostname.
    - job_name: node
      static_configs:
        - targets:
            # main-page
            - "192.168.0.10:9100"
          labels:
            env: dev
            hostname: main-page
        - targets:
            # searxng
            - "192.168.0.15:9100"
          labels:
            env: dev
            hostname: searxng
        - targets:
            # bastion
            - "192.168.0.254:9100"
          labels:
            env: common
            hostname: bastion
        - targets:
            # load-balancer
            - "192.168.0.253:9100"
          labels:
            env: common
            hostname: load-balancer
        - targets:
            # monitoring
            - "192.168.0.252:9100"
          labels:
            env: common
            hostname: monitoring
    # nginx exporter on the load balancer.
    - job_name: nginx
      static_configs:
        - targets:
            # load-balancer
            - "192.168.0.253:9113"
          labels:
            env: common
            # Fix: was mislabeled "monitoring"; this exporter runs on the
            # load-balancer host (192.168.0.253), matching the target above.
            hostname: load-balancer
# Alertmanager configuration (rendered to alertmanager.yml by the role).
prometheus_alertmanager_options:
  global:
    # Outgoing SMTP relay used for e-mail notifications (STARTTLS on 587).
    smtp_smarthost: "mail.cuqmbr.xyz:587"
    smtp_require_tls: true
    smtp_from: '"Homelab Alertmanager" <no-reply@cuqmbr.xyz>'
    smtp_auth_username: no-reply
    # SMTP password is Ansible-Vault encrypted; decrypted at render time.
    smtp_auth_password: !vault |
      $ANSIBLE_VAULT;1.1;AES256
      31393866316539633838303936366464613935393933333338336531656239333361653664346637
      3665316532336339633432303036626339363239343065630a326361306233656632653134643966
      39663138303439323636666665653364396132333532383463626337653061356461643734336363
      6266353533656566330a346536333836356131343832616631666330653462613436313062643330
      61616664646439643839366630396137616533393664323965366630363566333632
  templates:
    - "/etc/prometheus/alertmanager_templates/*.tmpl"
  route:
    # Alerts sharing env + hostname are batched into one notification.
    group_by:
      - env
      - hostname
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 1d
    receiver: default
  receivers:
    - name: default
      email_configs:
        - to: notifications@cuqmbr.xyz
# Prometheus alerting rules. The {{'{{'}} ... {{'}}'}} constructs escape
# Jinja2 so the rendered rule file contains literal Prometheus {{ ... }}
# template expressions.
prometheus_alerting_rules:
  groups:
    - name: DefaultMetrics
      rules:
        - alert: HostOutOfMemory
          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host out of memory (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Node memory is filling up (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
        - alert: HostMemoryIsUnderutilized
          expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
          for: 0m
          labels:
            severity: info
          annotations:
            summary: Host Memory is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{'{{'}} $labels.instance {{'}}'}})\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        # Please add ignored mountpoints in node_exporter parameters like
        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
        # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
        - alert: HostOutOfDiskSpace
          expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Host out of disk space (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Disk is almost full (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostOutOfInodes
          expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Host out of inodes (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostHighCpuLoad
          expr: (avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .90
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Host high CPU load (instance {{'{{'}} $labels.instance {{'}}'}})
            # Fix: description previously said "> 80%" while expr fires at > .90.
            description: "CPU load is > 90%\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
        - alert: HostCpuIsUnderutilized
          expr: (min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
          for: 1w
          labels:
            severity: info
          annotations:
            summary: Host CPU is underutilized (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostCpuHighIowait
          expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host CPU high iowait (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostSwapIsFillingUp
          expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host swap is filling up (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Swap is filling up (>80%)\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        # - alert: HostSystemdServiceCrashed
        #   expr: (node_systemd_unit_state{state="failed"} == 1)
        #   for: 0m
        #   labels:
        #     severity: warning
        #   annotations:
        #     summary: Host systemd service crashed (instance {{'{{'}} $labels.instance {{'}}'}})
        #     description: "systemd service crashed\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostOomKillDetected
          expr: (increase(node_vmstat_oom_kill[1m]) > 0)
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host OOM kill detected (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "OOM kill detected\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostClockSkew
          expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Host clock skew (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
        - alert: HostClockNotSynchronising
          expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host clock not synchronising (instance {{'{{'}} $labels.instance {{'}}'}})
            description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{'{{'}} $value {{'}}'}}\n LABELS = {{'{{'}} $labels {{'}}'}}"
# Loki (log aggregation) configuration: single-binary deployment with
# filesystem storage and no auth — suitable for a homelab/dev environment.
loki_options:
  auth_enabled: false
  server:
    http_listen_port: 3100
    grpc_listen_port: 9096
    log_level: info
    grpc_server_max_concurrent_streams: 1000
  common:
    instance_addr: 127.0.0.1
    # NOTE(review): /tmp is typically volatile — chunks and rules are lost
    # on reboot. Consider a persistent path if retention matters.
    path_prefix: /tmp/loki
    storage:
      filesystem:
        chunks_directory: /tmp/loki/chunks
        rules_directory: /tmp/loki/rules
    replication_factor: 1
    ring:
      kvstore:
        store: inmemory
  query_range:
    results_cache:
      cache:
        embedded_cache:
          enabled: true
          max_size_mb: 100
  limits_config:
    metric_aggregation_enabled: true
  schema_config:
    configs:
      # Fix: quoted to prevent YAML implicit typing — unquoted 2020-10-24
      # is parsed as a date object, not the string Loki expects.
      - from: "2020-10-24"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: index_
          period: 24h
  pattern_ingester:
    enabled: true
    metric_aggregation:
      loki_address: "localhost:3100"
  ruler:
    alertmanager_url: http://localhost:9093
  frontend:
    encoding: protobuf
  analytics:
    # Opt out of anonymous usage reporting.
    reporting_enabled: false
# Fluent Bit log shipper: tails the local systemd journal and forwards it
# to the Loki instance on the monitoring host (192.168.0.252).
fluentbit_settings:
  service:
    # Flush buffered records every second; run in the foreground with no
    # built-in HTTP monitoring endpoint.
    flush: 1
    daemon: false
    log_level: info
    http_server: false
  pipeline:
    inputs:
      - name: systemd
        tag: systemd
    outputs:
      - name: loki
        host: 192.168.0.252
        labels: env=common,hostname=monitoring,service_name=systemd
        match: systemd