Initial commit

0f748dd0 · nd · 0f748dd0 · 0f748dd0 · 0f748dd0 · 0f748dd0
Verified Commit 0f748dd0 authored Jun 2, 2020 by nd
--- a/defaults/main.yml
+++ b/defaults/main.yml
+# Default settings for the Alertmanager role.
+#   args:   command-line flags rendered into the ARGS variable of
+#           /etc/default/prometheus-alertmanager (key = flag name without
+#           the leading "--", value = flag value).
+#   config: dumped as YAML into /etc/prometheus/alertmanager.yml.
+prometheus_alertmanager:
+  args:
+    # Bind the web interface/API to the IPv6 loopback address only.
+    'web.listen-address': '[::1]:9093'
+  config:
+    global:
+      # The smarthost and SMTP sender used for mail notifications.
+      # Alertmanager rejects a configuration that has an email receiver but
+      # no smarthost, so a default is provided here.
+      # NOTE(review): assumes a local MTA on port 25 - override per host.
+      # NOTE(review): upstream defaults smtp_require_tls to true; confirm the
+      # local MTA offers STARTTLS or override that setting as well.
+      smtp_smarthost: 'localhost:25'
+      smtp_from: 'alertmanager@{{ inventory_hostname }}'
+    templates:
+    - '/etc/prometheus/alertmanager_templates/*.tmpl'
+    route:
+      # The labels by which incoming alerts are grouped together. For example,
+      # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+      # be batched into a single group.
+      group_by: ['alertname', 'cluster', 'service']
+      # When a new group of alerts is created by an incoming alert, wait at
+      # least 'group_wait' before sending the initial notification.
+      # This ensures that multiple alerts for the same group that start
+      # firing shortly after one another are batched together in the first
+      # notification.
+      group_wait: 30s
+      # After the first notification was sent, wait 'group_interval' before
+      # sending a batch of new alerts that started firing for that group.
+      group_interval: 5m
+      # If a notification has been sent successfully, wait 'repeat_interval'
+      # before sending it again.
+      repeat_interval: 3h
+      # A default receiver
+      receiver: mail-default
+      # All the above attributes are inherited by all child routes and can
+      # be overwritten on each.
+      # The child route trees.
+      routes: []
+    # Inhibition rules allow muting a set of alerts while another alert is
+    # firing.
+    # We use this to mute any warning-level notifications if the same alert is
+    # already critical.
+    inhibit_rules:
+    - source_match:
+        severity: 'critical'
+      target_match:
+        severity: 'warning'
+      # Apply inhibition only if all of these labels are equal on both alerts.
+      equal: ['alertname', 'cluster', 'service']
+    receivers:
+    # Receiver without any integration: routing to it sends nothing.
+    - name: 'blackhole'
+    - name: 'mail-default'
+      email_configs:
+      - to: 'root@localhost'
+        # send_resolved is a per-integration option; it is not valid on the
+        # receiver itself.
+        send_resolved: true
--- a/handlers/main.yml
+++ b/handlers/main.yml
+# Handler: restarts the Alertmanager service. Triggered through
+# "notify: restart alertmanager" by the tasks that write its configuration.
+- name: restart alertmanager
+  service:
+    state: restarted
+    name: prometheus-alertmanager
--- a/tasks/main.yml
+++ b/tasks/main.yml
+# Make sure the Alertmanager package is installed via apt.
+- name: install alertmanager
+  apt:
+    name: prometheus-alertmanager
+    state: present
+# Render the daemon's command-line flags (prometheus_alertmanager.args) into
+# the service defaults file and restart the service when the file changes.
+- name: write alertmanager service config
+  notify: restart alertmanager
+  template:
+    src: prometheus-alertmanager.j2
+    dest: /etc/default/prometheus-alertmanager
+# Dump prometheus_alertmanager.config as YAML into Alertmanager's main
+# configuration file and restart the service when the file changes.
+- name: write alertmanager config
+  notify: restart alertmanager
+  copy:
+    owner: root
+    group: root
+    # Quoted so the mode is passed as the octal string "0644" rather than
+    # relying on how the YAML parser interprets a leading-zero number.
+    mode: "0644"
+    dest: /etc/prometheus/alertmanager.yml
+    content: "{{ prometheus_alertmanager.config|to_nice_yaml(indent=2) }}"
--- a/templates/prometheus-alertmanager.j2
+++ b/templates/prometheus-alertmanager.j2
+# Set the command-line arguments to pass to the server.
+ARGS="{% for flag, value in prometheus_alertmanager.args.items() %} --{{ flag }}{% if value and value != {} %}='{{ value }}'{% endif %} {% endfor %}"
+# The alert manager supports the following options:
+#  --config.file="/etc/prometheus/alertmanager.yml"
+#       Alertmanager configuration file name.
+#  --storage.path="/var/lib/prometheus/alertmanager/"
+#       Base path for data storage.
+#  --data.retention=120h
+#       How long to keep data for.
+#  --alerts.gc-interval=30m
+#       Interval between alert GC.
+#  --log.level=info
+#       Only log messages with the given severity or above.
+#  --web.external-url=WEB.EXTERNAL-URL
+#       The URL under which Alertmanager is externally reachable (for example,
+#       if Alertmanager is served via a reverse proxy). Used for generating
+#       relative and absolute links back to Alertmanager itself. If the URL has
+#       a path portion, it will be used to prefix all HTTP endpoints served by
+#       Alertmanager. If omitted, relevant URL components will be derived
+#       automatically.
+#  --web.route-prefix=WEB.ROUTE-PREFIX
+#       Prefix for the internal routes of web endpoints. Defaults to path of
+#       --web.external-url.
+#  --web.listen-address=":9093"
+#       Address to listen on for the web interface and API.
+#  --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
+#       Path to static UI directory.
+#  --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
+#       Path to default notification template.
+#  --cluster.listen-address="0.0.0.0:9094"
+#       Listen address for cluster.
+#  --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
+#       Explicit address to advertise in cluster.
+#  --cluster.peer=CLUSTER.PEER ...
+#       Initial peers (may be repeated).
+#  --cluster.peer-timeout=15s
+#       Time to wait between peers to send notifications.
+#  --cluster.gossip-interval=200ms
+#       Interval between sending gossip messages. By lowering this value (more
+#       frequent) gossip messages are propagated across the cluster more
+#       quickly at the expense of increased bandwidth.
+#  --cluster.pushpull-interval=1m0s
+#       Interval for gossip state syncs. Setting this interval lower (more
+#       frequent) will increase convergence speeds across larger clusters at
+#       the expense of increased bandwidth usage.
+#  --cluster.tcp-timeout=10s  Timeout for establishing a stream connection
+#       with a remote node for a full state sync, and for stream read and write
+#       operations.
+#  --cluster.probe-timeout=500ms
+#       Timeout to wait for an ack from a probed node before assuming it is
+#       unhealthy. This should be set to 99-percentile of RTT (round-trip time)
+#       on your network.
+#  --cluster.probe-interval=1s
+#       Interval between random node probes. Setting this lower (more frequent)
+#       will cause the cluster to detect failed nodes more quickly at the
+#       expense of increased bandwidth usage.
+#  --cluster.settle-timeout=1m0s
+#       Maximum time to wait for cluster connections to settle before
+#       evaluating notifications.
+#  --cluster.reconnect-interval=10s
+#       Interval between attempting to reconnect to lost peers.
+#  --cluster.reconnect-timeout=6h0m0s
+#       Length of time to attempt to reconnect to a lost peer.