From 6d02b1f1a3e7f25b9b99d30e967056e6b056eb81 Mon Sep 17 00:00:00 2001 From: Ian Kelling Date: Sun, 15 Nov 2020 17:42:21 -0500 Subject: [PATCH] get prom working and updated --- a/ansible.cfg | 34 ++++++++++++++++++++++++++++++++-- a/site.yml | 47 ++++++++++++++++++++++++++++++----------------- 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/a/ansible.cfg b/a/ansible.cfg index 83781c3..d367ba2 100644 --- a/a/ansible.cfg +++ b/a/ansible.cfg @@ -1,11 +1,41 @@ [defaults] -forks = 200 +# 2.7.4-1ppa~xenial would use 194M of memory resident in htop +# and run out of 4g of memory. just ran ansible 2.9.9-1ppa~bionic+9.0trisquel1 +# and with 23 fork limit, it topped out at 1 gig used of memory. +# we have about 60 hosts, so 100 should allow them all to run in +# parallel without having memory problems. +# oldused=0; while true; do used=$(free -w -t | tail -n1 | awk '{print $3}'); if ((used > oldused )); then oldused=$used; echo $(date) $used | tee used; fi; sleep 1; done +forks = 100 + +# Ansible doesn't have Trisquel's python path in its os info. +# Silence a warning +# https://docs.ansible.com/ansible/2.9/reference_appendices/interpreter_discovery.html +interpreter_python = auto_silent +# strategy = free # DO NOT ENABLE. +# +# As of 2019-08-07, include_tasks is very broken with the free strategy. +# tasks will not be run for some hosts, or "when" rules ignored and run +# for the wrong host, or some hosts a task will be run twice. Even if we +# switch to import_tasks, I wouldn't trust using this until that bug is +# found and fixed. 
repro: tested with 2.7.4, (no bug reports or fixes +# found), Running just the common role, then searching for which hosts +# an install.yml included role got run using +# +# f() { awk '/xfsprogs/ { x = 1; next }; /^TASK/ { x = 0 }; x && /\[/ { print }' $1 | sort | uniq -c | pee cat wc; } +# f LOGFILE host_key_checking = False display_skipped_hosts = False retry_files_enabled = False # readable output stdout_callback = yaml -callback_whitelist = timer, profile_tasks +# Our logs are already pretty big. You can temporarily uncomment to enable +# profiling info in the logs. +#callback_whitelist = timer, profile_tasks, profile_roles + +# Ansible suggests using the file module instead of chmod, but then it +# follows symlinks without an option to turn it off, which is completely +# braindead and screwed up my system. +command_warnings=False [ssh_connection] pipelining = True diff --git a/a/site.yml b/a/site.yml index 63b6cd1..e1a2777 100644 --- a/a/site.yml +++ b/a/site.yml @@ -1,45 +1,58 @@ --- -- hosts: tp.b8.nz +- hosts: kd.b8.nz roles: - role: prom + tags: a + prometheus_targets: + node: + - targets: + - "{{ ansible_fqdn }}:9100" prometheus_scrape_configs: - job_name: "prometheus" metrics_path: "{{ prometheus_metrics_path }}" static_configs: - targets: - - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090" + - "{{ ansible_fqdn }}:9090" - job_name: "node" - basic_auth: - username: prom - password_file: /etc/prometheus-pass - scheme: "https" + # basic_auth: + # username: prom + # password_file: /etc/prometheus-pass + #scheme: "https" file_sd_configs: - files: - "{{ prometheus_config_dir }}/file_sd/node.yml" + # added because of warning in log + prometheus_alertmanager_config: + - static_configs: + - targets: + - "{{ ansible_fqdn }}:9093" - prometheus_targets: - node: - - targets: "{{ groups.all|map('regex_replace','$',':9101')|list }}" - role: node-exporter - node_exporter_web_listen_address: "127.0.0.1:9100" + node_exporter_web_listen_address: 
"127.0.1.1:9100" + - role: alertmanager alertmanager_smtp: - smarthost: 'localhost:25' - from: "alertmanager@{{ ansible_fqdn | default(ansible_host) | default('localhost') }}" + smarthost: 'mail.iankelling.org:587' + from: "alerts@iankelling.org" require_tls: false + hello: 'defaultnn.b8.nz' alertmanager_route: receiver: defaultreceiver alertmanager_receivers: - name: defaultreceiver email_configs: - to: alerts@iankelling.org - html: "{% raw -%}{{ template \"email.default.html\" . }}{% endraw -%}" - text: "{% raw -%}{{ template \"email.default.text\" . }}{% endraw -%}" + send_resolved: true + # the html was a bit ugly and just a huge waste of text, + # https://github.com/prometheus/alertmanager/issues/2232 + # lead me to find a convenient text option to use + html: + text: '{% raw -%}{{ template "opsgenie.default.description" . }}{% endraw -%}' + alertmanager_web_listen_address: '127.0.1.1:9093' - alertmanager_listen_address: '127.0.0.1:9093' - role: grafana - grafana_address: "127.0.0.1" + grafana_address: "127.0.1.1" # iank: playbook will halt if no password is set. this is only # available to localhost, so i dont really care, but might as well # generate a pass isntead of putting in pw123 etc. @@ -57,7 +70,7 @@ # https://prometheus.io/docs/visualization/grafana/ grafana_dashboards: - dashboard_id: 1860 - revision_id: 13 + revision_id: 21 datasource: prometheus - dashboard_id: 405 revision_id: 8 -- 2.30.2