prometheus

node-exporter

# Foreground test run; --rm removes the container when it exits.
# The host's /proc and /sys are bind-mounted under /host, so --path.procfs and
# --path.sysfs must each point at their own mount: /host/proc and /host/sys.
# (Pointing --path.sysfs at /host/proc makes the sysfs collectors read procfs.)
# The mount-point regex is single-quoted so the shell never touches the "$".
docker run --rm --name node-exporter \
  --net="host" \
  -v "/proc:/host/proc" -v "/sys:/host/sys" -v "/:/rootfs" \
  prom/node-exporter:v1.0.0-rc.0 \
  --path.procfs /host/proc \
  --path.sysfs /host/sys \
  --collector.filesystem.ignored-mount-points '^/(sys|proc|dev|host|etc)($|/)'

# Long-running variant: detached and restarted automatically by Docker.
# The host's /proc and /sys are bind-mounted under /host, so --path.procfs and
# --path.sysfs must each point at their own mount: /host/proc and /host/sys.
# (Pointing --path.sysfs at /host/proc makes the sysfs collectors read procfs.)
# The mount-point regex is single-quoted so the shell never touches the "$".
docker run -d --name node-exporter \
  --restart always \
  --net="host" \
  -v "/proc:/host/proc" -v "/sys:/host/sys" -v "/:/rootfs" \
  prom/node-exporter:v1.0.0-rc.0 \
  --path.procfs /host/proc \
  --path.sysfs /host/sys \
  --collector.filesystem.ignored-mount-points '^/(sys|proc|dev|host|etc)($|/)'

blackbox-exporter

# Run the blackbox exporter in the background (auto-restarted by Docker);
# its HTTP endpoint is published on host port 9115.
docker run --detach --restart always --name blackbox-exporter \
  --publish 9115:9115 \
  prom/blackbox-exporter:v0.18.0

prometheus

# Host directory for alerting rule files; it is bind-mounted into the
# container below. "$(pwd)" is quoted everywhere so a working directory whose
# name contains whitespace is not split into several arguments.
mkdir -p "$(pwd)/prometheus/alert.rules.d"

# NOTE(review): ./prometheus/prometheus.yml presumably has to exist on the host
# before this command runs -- it is not created in these notes; confirm.
#
# Interactive throw-away alternative to the first two lines of the command:
# docker run -it --rm \
#
# Web UI/API is published on host port 19090; data is kept for 10 days and
# --web.enable-lifecycle turns on the HTTP reload/quit endpoints.
docker run -d --name prometheus \
  --restart always \
  -p 19090:9090 \
  -v "$(pwd)/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml" \
  -v "$(pwd)/prometheus/alert.rules.d:/etc/prometheus/alert.rules.d" \
  prom/prometheus:v2.22.1 \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.retention.time=10d \
  --web.enable-lifecycle

alertmanager

# Create the host-side directory tree that the alertmanager containers mount.
# "$(pwd)" is quoted so a working directory containing whitespace still works.
mkdir -p "$(pwd)/alertmanager/template"

# Write the Alertmanager configuration: every alert is grouped by alertname
# and sent to the 'web.hook' receiver; a 'critical' alert inhibits the
# matching 'warning' alert with the same alertname/dev/instance.
# The heredoc delimiter is quoted so the YAML is written verbatim, with no
# shell expansion applied to its contents.
cat >"$(pwd)/alertmanager/alertmanager.yml" <<'EOL'
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://192.168.122.37:19000/hooks/redeploy-webhook'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
EOL


# First Alertmanager cluster member (no --cluster.peer; the other members
# join through its published cluster port).
# Web UI/API: host port 19193. Cluster gossip: host port 19194, published for
# both TCP and UDP because Alertmanager clustering needs both protocols.
# "$(pwd)" is quoted so a working directory containing whitespace still works.
docker run -d --name alertmanager-1 --restart always \
  -v "$(pwd)/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml" \
  -v "$(pwd)/alertmanager/template:/etc/alertmanager/template" \
  -p 19193:9193 \
  -p 19194:9194 \
  -p 19194:9194/udp \
  prom/alertmanager:v0.20.0 \
  --config.file=/etc/alertmanager/alertmanager.yml \
  --storage.path=/alertmanager \
  --web.listen-address=":9193" \
  --cluster.listen-address="0.0.0.0:9194"


# Second Alertmanager cluster member; joins the cluster through the cluster
# port published by alertmanager-1 (192.168.122.37:19194).
# Web UI/API: host port 19293. Cluster gossip: host port 19294, published for
# both TCP and UDP because Alertmanager clustering needs both protocols.
# "$(pwd)" is quoted so a working directory containing whitespace still works.
docker run -d --name alertmanager-2 --restart always \
  -v "$(pwd)/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml" \
  -v "$(pwd)/alertmanager/template:/etc/alertmanager/template" \
  -p 19293:9193 \
  -p 19294:9194 \
  -p 19294:9194/udp \
  prom/alertmanager:v0.20.0 \
  --config.file=/etc/alertmanager/alertmanager.yml \
  --storage.path=/alertmanager \
  --web.listen-address=":9193" \
  --cluster.listen-address="0.0.0.0:9194" \
  --cluster.peer="192.168.122.37:19194"

# Third Alertmanager cluster member; joins the cluster through the cluster
# port published by alertmanager-1 (192.168.122.37:19194).
# Web UI/API: host port 19393. Cluster gossip: host port 19394, published for
# both TCP and UDP because Alertmanager clustering needs both protocols.
# "$(pwd)" is quoted so a working directory containing whitespace still works.
docker run -d --name alertmanager-3 --restart always \
  -v "$(pwd)/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml" \
  -v "$(pwd)/alertmanager/template:/etc/alertmanager/template" \
  -p 19393:9193 \
  -p 19394:9194 \
  -p 19394:9194/udp \
  prom/alertmanager:v0.20.0 \
  --config.file=/etc/alertmanager/alertmanager.yml \
  --storage.path=/alertmanager \
  --web.listen-address=":9193" \
  --cluster.listen-address="0.0.0.0:9194" \
  --cluster.peer="192.168.122.37:19194"



# Write four sample DiskRunningFull alerts to ./data.json for manual testing.
# The last two share alertname/dev/instance and differ only in severity
# (critical vs. warning).
# "$(pwd)" is quoted so a working directory containing whitespace still works,
# and the heredoc delimiter is quoted so the JSON is written verbatim.
cat >"$(pwd)/data.json" <<'EOL'
[
  {
    "labels": {
       "alertname": "DiskRunningFull",
       "dev": "sda1",
       "instance": "example1"
     },
     "annotations": {
        "info": "The disk sda1 is running full",
        "summary": "please check the instance example1"
      }
  },
  {
    "labels": {
       "alertname": "DiskRunningFull",
       "dev": "sdb2",
       "instance": "example2"
     },
     "annotations": {
        "info": "The disk sdb2 is running full",
        "summary": "please check the instance example2"
      }
  },
  {
    "labels": {
       "alertname": "DiskRunningFull",
       "dev": "sda1",
       "instance": "example3",
       "severity": "critical"
     }
  },
  {
    "labels": {
       "alertname": "DiskRunningFull",
       "dev": "sda1",
       "instance": "example3",
       "severity": "warning"
     }
  }
]
EOL
# Send the sample alerts in ./data.json to the Alertmanager listening on
# host port 19193 (v1 alerts endpoint).
curl --request POST --data @data.json http://192.168.122.37:19193/api/v1/alerts

webhook

# DingTalk notification bridge, published on host port 18060, with its web
# UI, lifecycle endpoints and debug logging enabled.
# "$(pwd)" is quoted so a working directory containing whitespace still works.
# NOTE(review): ./prometheus-webhook-dingtalk/config.yml presumably has to
# exist on the host before this runs -- it is not created in these notes.
docker run --name ding -d \
  --restart always \
  -p18060:8060 \
  -v "$(pwd)/prometheus-webhook-dingtalk/config.yml:/etc/prometheus-webhook-dingtalk/config.yml" \
  timonwong/prometheus-webhook-dingtalk:v1.4.0 \
  --config.file=/etc/prometheus-webhook-dingtalk/config.yml \
  --web.enable-ui \
  --web.enable-lifecycle \
  --log.level=debug



# Prepare the hook definition BEFORE starting the container. If the container
# is started first, ./webhook does not exist yet: Docker then creates the
# bind-mount source itself, and webhook starts without a hooks.json to load.
# "$(pwd)" is quoted so a working directory containing whitespace still works.
mkdir -p "$(pwd)/webhook"

# Single hook "redeploy-webhook": runs /etc/webhook/redeploy.sh from /tmp and
# answers the caller with a fixed message. Quoted delimiter = verbatim JSON.
# NOTE(review): redeploy.sh is not created in these notes; presumably it must
# be placed (executable) in ./webhook -- confirm.
cat >"$(pwd)/webhook/hooks.json" <<'EOL'
[
  {
    "id": "redeploy-webhook",
    "execute-command": "/etc/webhook/redeploy.sh",
    "command-working-directory": "/tmp",
    "response-message": "I got the payload!"
  }
]
EOL

# Start webhook on host port 19000 with ./webhook mounted at /etc/webhook;
# -hotreload makes it pick up later edits to hooks.json.
docker run -d -p 19000:9000 --name=webhook \
  -v "$(pwd)/webhook:/etc/webhook" \
  almir/webhook:2.6.11 \
  -verbose -hooks=/etc/webhook/hooks.json -hotreload
# https://github.com/adnanh/webhook
# https://github.com/adnanh/webhook/blob/master/docs/Hook-Examples.md

grafana

# The default username/password is admin/admin.
docker run --detach --restart always --name grafana \
  --publish 23000:3000 \
  grafana/grafana:6.6.2

# https://grafana.com/grafana/dashboards/8919
# https://grafana.com/grafana/dashboards/179

cadvisor

# cadvisor

# Start cAdvisor detached and publish it on host port 8080. All host paths
# (/, /var/run, /sys, /var/lib/docker, /dev/disk) are mounted read-only.
docker run -d --name cadvisor \
  -p 8080:8080 \
  -v /:/rootfs:ro \
  -v /var/run:/var/run:ro \
  -v /sys:/sys:ro \
  -v /var/lib/docker/:/var/lib/docker:ro \
  -v /dev/disk/:/dev/disk:ro \
  gcr.azk8s.cn/google_containers/cadvisor:v0.36.0

troubleshooting


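Prometheus refuses to load its configuration when the `labels` key of a target group is misspelt as `lables` (here on line 51 of prometheus.yml). Correct the key name and restart/reload Prometheus. The error, followed by the full log line:

yaml: unmarshal errors:\n  line 51: field lables not found in type struct { Targets []string \"yaml:\\\"targets\\\"\"; Labels model.LabelSet \"yaml:\\\"labels\\\"\" }"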

level=error ts=2020-11-09T13:48:14.875Z caller=main.go:290 msg="Error loading config (--config.file=/etc/prometheus/prometheus.yml)" err="parsing YAML file /etc/prometheus/prometheus.yml: yaml: unmarshal errors:\n  line 51: field lables not found in type struct { Targets []string \"yaml:\\\"targets\\\"\"; Labels model.LabelSet \"yaml:\\\"labels\\\"\" }"




# Ask the blackbox exporter (port 9115) to probe 223.5.5.5 with its icmp
# module; the sample output below reports probe_success 1.
curl 'http://192.168.122.37:9115/probe?module=icmp&target=223.5.5.5'

# HELP probe_dns_lookup_time_seconds Returns the time taken for probe dns lookup in seconds
# TYPE probe_dns_lookup_time_seconds gauge
probe_dns_lookup_time_seconds 7.522e-06
# HELP probe_duration_seconds Returns how long the probe took to complete in seconds
# TYPE probe_duration_seconds gauge
probe_duration_seconds 0.005357035
# HELP probe_icmp_duration_seconds Duration of icmp request by phase
# TYPE probe_icmp_duration_seconds gauge
probe_icmp_duration_seconds{phase="resolve"} 7.522e-06
probe_icmp_duration_seconds{phase="rtt"} 0.005181756
probe_icmp_duration_seconds{phase="setup"} 5.5147e-05
# HELP probe_icmp_reply_hop_limit Replied packet hop limit (TTL for ipv4)
# TYPE probe_icmp_reply_hop_limit gauge
probe_icmp_reply_hop_limit 115
# HELP probe_ip_addr_hash Specifies the hash of IP address. It's useful to detect if the IP address changes.
# TYPE probe_ip_addr_hash gauge
probe_ip_addr_hash 2.744926669e+09
# HELP probe_ip_protocol Specifies whether probe ip protocol is IP4 or IP6
# TYPE probe_ip_protocol gauge
probe_ip_protocol 4
# HELP probe_success Displays whether or not the probe was a success
# TYPE probe_success gauge
probe_success 1


# Ask the blackbox exporter (port 9115) to run a tcp_connect probe against
# 223.5.5.5:53; the sample output below reports probe_success 1.
curl 'http://192.168.122.37:9115/probe?module=tcp_connect&target=223.5.5.5:53'

# HELP probe_dns_lookup_time_seconds Returns the time taken for probe dns lookup in seconds
# TYPE probe_dns_lookup_time_seconds gauge
probe_dns_lookup_time_seconds 7.603e-06
# HELP probe_duration_seconds Returns how long the probe took to complete in seconds
# TYPE probe_duration_seconds gauge
probe_duration_seconds 0.005699097
# HELP probe_failed_due_to_regex Indicates if probe failed due to regex
# TYPE probe_failed_due_to_regex gauge
probe_failed_due_to_regex 0
# HELP probe_ip_addr_hash Specifies the hash of IP address. It's useful to detect if the IP address changes.
# TYPE probe_ip_addr_hash gauge
probe_ip_addr_hash 2.744926669e+09
# HELP probe_ip_protocol Specifies whether probe ip protocol is IP4 or IP6
# TYPE probe_ip_protocol gauge
probe_ip_protocol 4
# HELP probe_success Displays whether or not the probe was a success
# TYPE probe_success gauge
probe_success 1




# Same tcp_connect probe against 223.5.5.5:5555. This is the failure case: in
# the sample output below the probe runs for about 31.7 s and reports
# probe_success 0.
curl 'http://192.168.122.37:9115/probe?module=tcp_connect&target=223.5.5.5:5555'

# HELP probe_dns_lookup_time_seconds Returns the time taken for probe dns lookup in seconds
# TYPE probe_dns_lookup_time_seconds gauge
probe_dns_lookup_time_seconds 8.187e-06
# HELP probe_duration_seconds Returns how long the probe took to complete in seconds
# TYPE probe_duration_seconds gauge
probe_duration_seconds 31.702927304
# HELP probe_failed_due_to_regex Indicates if probe failed due to regex
# TYPE probe_failed_due_to_regex gauge
probe_failed_due_to_regex 0
# HELP probe_ip_addr_hash Specifies the hash of IP address. It's useful to detect if the IP address changes.
# TYPE probe_ip_addr_hash gauge
probe_ip_addr_hash 2.744926669e+09
# HELP probe_ip_protocol Specifies whether probe ip protocol is IP4 or IP6
# TYPE probe_ip_protocol gauge
probe_ip_protocol 4
# HELP probe_success Displays whether or not the probe was a success
# TYPE probe_success gauge
probe_success 0

prometheus

文章目录

node-exporter

blackbox-exporter

prometheus

alertmanager

webhook

grafana

cadvisor

troubleshooting

ref