get prom working and updated
authorIan Kelling <ian@iankelling.org>
Sun, 15 Nov 2020 22:42:21 +0000 (17:42 -0500)
committerIan Kelling <ian@iankelling.org>
Sun, 15 Nov 2020 22:42:21 +0000 (17:42 -0500)
a/ansible.cfg
a/site.yml

index 83781c3c0e95c04b208753febcb65fd66dd15312..d367ba267afe49da38579f09f37f37bc6a203e11 100644 (file)
@@ -1,11 +1,41 @@
 [defaults]
-forks          = 200
+# 2.7.4-1ppa~xenial would use 194M of memory resident in htop
+# and run out of 4g of memory. just ran ansible 2.9.9-1ppa~bionic+9.0trisquel1
+# and with 23 fork limit, it topped out at 1 gig used of memory.
+# we have about 60 hosts, so 100 should allow them all to run in
+# parallel without running into memory problems.
+# oldused=0; while true; do used=$(free -w -t | tail -n1 | awk '{print $3}'); if ((used > oldused )); then oldused=$used; echo $(date) $used | tee used; fi; sleep 1; done
+forks = 100
+
+# Ansible doesn't have Trisquel's python path in its os info.
+# Silence a warning
+# https://docs.ansible.com/ansible/2.9/reference_appendices/interpreter_discovery.html
+interpreter_python = auto_silent
+# strategy = free # DO NOT ENABLE.
+#
+# As of 2019-08-07, include_tasks is very broken with the free strategy.
+# tasks will not be run for some hosts, or "when" rules ignored and run
+# for the wrong host, or some hosts a task will be run twice. Even if we
+# switch to import_tasks, I wouldn't trust using this until that bug is
+# found and fixed. repro: tested with 2.7.4, (no bug reports or fixes
+# found), Running just the common role, then searching for which hosts
+# an install.yml included role got run using
+#
+# f() { awk '/xfsprogs/ { x = 1; next }; /^TASK/ { x = 0 }; x && /\[/ { print }' $1 | sort | uniq -c | pee cat wc; }
+# f LOGFILE
 host_key_checking = False
 display_skipped_hosts = False
 retry_files_enabled = False
 # readable output
 stdout_callback = yaml
-callback_whitelist = timer, profile_tasks
+# Our logs are already pretty big. You can temporarily uncomment to enable
+# profiling info in the logs.
+#callback_whitelist = timer, profile_tasks, profile_roles
+
+# Ansible suggests using the file module instead of chmod, but then it
+# follows symlinks without an option to turn it off, which is completely
+# braindead and screwed up my system.
+command_warnings=False
 
 [ssh_connection]
 pipelining = True
index 63b6cd1d5841eea70b9431c1beae1a8db3bd5ee0..e1a2777ecbd8c03cfe650bb140f0967b7833970d 100644 (file)
@@ -1,45 +1,58 @@
 ---
-- hosts: tp.b8.nz
+- hosts: kd.b8.nz
   roles:
     - role: prom
+      tags: a
+      prometheus_targets:
+        node:
+          - targets:
+              - "{{ ansible_fqdn }}:9100"
       prometheus_scrape_configs:
         - job_name: "prometheus"
           metrics_path: "{{ prometheus_metrics_path }}"
           static_configs:
             - targets:
-                - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
+                - "{{ ansible_fqdn }}:9090"
         - job_name: "node"
-          basic_auth:
-            username: prom
-            password_file: /etc/prometheus-pass
-          scheme: "https"
+          basic_auth:
+            username: prom
+            password_file: /etc/prometheus-pass
+          #scheme: "https"
           file_sd_configs:
             - files:
                 - "{{ prometheus_config_dir }}/file_sd/node.yml"
+      # added because of warning in log
+      prometheus_alertmanager_config:
+        - static_configs:
+            - targets:
+                - "{{ ansible_fqdn }}:9093"
 
-      prometheus_targets:
-        node:
-          - targets: "{{ groups.all|map('regex_replace','$',':9101')|list }}"
 
     - role: node-exporter
-      node_exporter_web_listen_address: "127.0.0.1:9100"
+      node_exporter_web_listen_address: "127.0.1.1:9100"
+
     - role: alertmanager
       alertmanager_smtp:
-        smarthost: 'localhost:25'
-        from: "alertmanager@{{ ansible_fqdn | default(ansible_host) | default('localhost') }}"
+        smarthost: 'mail.iankelling.org:587'
+        from: "alerts@iankelling.org"
         require_tls: false
+        hello: 'defaultnn.b8.nz'
       alertmanager_route:
         receiver: defaultreceiver
       alertmanager_receivers:
         - name: defaultreceiver
           email_configs:
             - to: alerts@iankelling.org
-              html: "{% raw -%}{{ template \"email.default.html\" . }}{% endraw -%}"
-              text: "{% raw -%}{{ template \"email.default.text\" . }}{% endraw -%}"
+              send_resolved: true
+              # the html was a bit ugly and just a huge waste of text,
+              # https://github.com/prometheus/alertmanager/issues/2232
+              # led me to find a convenient text option to use
+              html:
+              text: '{% raw -%}{{ template "opsgenie.default.description" . }}{% endraw -%}'
+      alertmanager_web_listen_address: '127.0.1.1:9093'
 
-      alertmanager_listen_address: '127.0.0.1:9093'
     - role: grafana
-      grafana_address: "127.0.0.1"
+      grafana_address: "127.0.1.1"
       # iank: playbook will halt if no password is set. this is only
       # available to localhost, so i dont really care, but might as well
       # generate a pass isntead of putting in pw123 etc.
@@ -57,7 +70,7 @@
       # https://prometheus.io/docs/visualization/grafana/
       grafana_dashboards:
         - dashboard_id: 1860
-          revision_id: 13
+          revision_id: 21
           datasource: prometheus
         - dashboard_id: 405
           revision_id: 8