二进制部署prometheus+alertmanager+grafana监控平台

安装prometheus

本文档使用LTS长期支持版：2.45.4

mkdir -p /usr/local/prometheus  # 新建存放prometheus组件目录（-p：目录已存在时不报错）
# 安装
tar zxvf prometheus-2.45.4.linux-amd64.tar.gz -C /usr/local/prometheus
mv /usr/local/prometheus/prometheus-2.45.4.linux-amd64 /usr/local/prometheus/prometheus

注册系统服务

vim /etc/systemd/system/prometheus.service

[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network-online.target

[Service]
Type=simple
#User=prometheus
#Group=prometheus
Restart=on-failure
ExecStart=/usr/local/prometheus/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus/prometheus.yml \
  --storage.tsdb.path=/usr/local/prometheus/prometheus/data \
  --storage.tsdb.retention.time=60d \
  --web.enable-lifecycle

[Install]
WantedBy=multi-user.target

启动并设置开机自启

systemctl enable prometheus --now

检查状态

systemctl status prometheus

验证

访问http://ip:9090，访问到即可。

安装alertmanager

tar zxvf alertmanager-0.27.0.linux-amd64.tar.gz -C /usr/local/prometheus
mv /usr/local/prometheus/alertmanager-0.27.0.linux-amd64 /usr/local/prometheus/alertmanager

注册系统服务

vim /etc/systemd/system/alertmanager.service

[Unit]
Description=Alert Manager
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
#User=prometheus
#Group=prometheus
ExecStart=/usr/local/prometheus/alertmanager/alertmanager \
  --config.file=/usr/local/prometheus/alertmanager/alertmanager.yml \
  --storage.path=/usr/local/prometheus/alertmanager/data

Restart=always

[Install]
WantedBy=multi-user.target

修改配置文件（注意：下面的receiver会把告警推送到本机8060端口的钉钉webhook转发服务，例如prometheus-webhook-dingtalk，该服务需另行部署，否则告警无法送达）

vim /usr/local/prometheus/alertmanager/alertmanager.yml

route:
  group_by: ['dingtalk']
  group_wait: 1s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'dingtalk.webhook1'
  routes:
  - receiver: "dingtalk.webhook1"
    match_re:
      alertname: ".*"
receivers:
  - name: 'dingtalk.webhook1'
    webhook_configs:
      - url: 'http://localhost:8060/dingtalk/webhook1/send'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

启动并设置开机自启

systemctl enable alertmanager --now

检查状态

systemctl status alertmanager

验证

访问http://ip:9093，访问到即可。

安装node_exporter

tar zxvf node_exporter-1.7.0.linux-amd64.tar.gz -C /usr/local/prometheus
mv /usr/local/prometheus/node_exporter-1.7.0.linux-amd64 /usr/local/prometheus/node_exporter

注册系统服务

vim /etc/systemd/system/node_exporter.service

[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target
[Service]
#User=prometheus
#Group=prometheus
ExecStart=/usr/local/prometheus/node_exporter/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target

启动并设置开机自启

systemctl enable node_exporter --now

检查状态

systemctl status node_exporter

验证

访问http://ip:9100/metrics，监控数据存在即可。

安装grafana

下载地址：https://grafana.com/get/ （Grafana get started | Cloud, Self-managed, Enterprise）

GRAFANA_VERSION=10.4.2  # 示例版本号，请按实际下载的版本修改
tar zxvf "grafana-${GRAFANA_VERSION}.linux-amd64.tar.gz" -C /usr/local/prometheus
# 解压出的目录名以实际为准（较新版本为 grafana-v版本号，旧版本可能为 grafana-版本号）
mv "/usr/local/prometheus/grafana-v${GRAFANA_VERSION}" /usr/local/prometheus/grafana

注册系统服务

vim /etc/systemd/system/grafana-server.service

[Unit]
Description=Grafana server
Documentation=http://docs.grafana.org
[Service]
Type=simple
#User=prometheus
#Group=prometheus
Restart=on-failure
ExecStart=/usr/local/prometheus/grafana/bin/grafana-server \
  --config=/usr/local/prometheus/grafana/conf/defaults.ini \
  --homepath=/usr/local/prometheus/grafana
[Install]
WantedBy=multi-user.target

启动并设置开机自启

systemctl enable grafana-server --now

检查状态

systemctl status grafana-server

浏览器访问http://ip:3000,账号admin，密码admin，首次根据提示修改密码

添加监控

创建告警规则

mkdir -p /usr/local/prometheus/prometheus/rules

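# 规则文件必须放在rules目录下且扩展名为.yml，才能被prometheus.yml中的 rule_files: "rules/*.yml" 加载
vim /usr/local/prometheus/prometheus/rules/node.yml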

groups:
- name: 服务器资源监控
  rules:
  - alert: 内存使用率过高
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 80
    for: 3m 
    labels:
      severity: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 内存使用率过高, 请尽快处理！"
      description: "{{ $labels.instance }}内存使用率超过80%,当前使用率{{ $value }}%."
          
  - alert: 服务器宕机
    expr: up == 0
    for: 1s
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 服务器宕机, 请尽快处理!"
      description: "{{$labels.instance}} 服务器node_exporter服务被关闭,当前状态{{ $value }}. "
 
  - alert: CPU高负荷
    expr: 100 - (avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} CPU使用率过高,请尽快处理！"
      description: "{{$labels.instance}} CPU使用大于90%,当前使用率{{ $value }}%. "
      
  - alert: 磁盘IO性能
    expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance,job)* 100 > 90
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入磁盘IO使用率过高,请尽快处理！"
      description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."
 
 
  - alert: 网络流入
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入网络带宽过高，请尽快处理！"
      description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."
 
  - alert: 网络流出
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理！"
      description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用量{{$value}}."
  
  - alert: TCP连接数
    expr: node_netstat_Tcp_CurrEstab > 10000
    for: 2m
    labels:
      severity: 严重告警
    annotations:
      summary: " TCP_ESTABLISHED过高！"
      description: "{{$labels.instance}} TCP_ESTABLISHED连接数大于10000,当前连接数{{ $value }}."
 
  - alert: 磁盘容量
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 90
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 磁盘分区使用率过高，请尽快处理！"
      description: "{{$labels.instance}} 磁盘分区使用大于90%，当前使用率{{ $value }}%."

修改配置文件

vim /usr/local/prometheus/prometheus/prometheus.yml

global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
           - localhost:9093 # alertmanager地址，先写上

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  # 告警规则
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]  # 监控本主机

  - job_name: 'alertmanager'
    scrape_interval: 10s
    static_configs:
    - targets: ['localhost:9093'] # 监控alertmanager 主机

  - job_name: 'node-exporter'
    scrape_interval: 10s
    static_configs:
    - targets: ['localhost:9100'] # 监控节点，节点安装node_exporter
      labels:
        instance: Prometheus服务器

验证数据

重启prometheus

systemctl restart prometheus

grafana集成prometheus

浏览器添加数据源

添加Prometheus地址，我这里是同一台安装，使用localhost

最后保存即可，不需要其他配置。

添加dashboard

这里使用ID为1860的节点监控仪表盘（Node Exporter Full），更多仪表盘可访问官方获取：https://grafana.com/grafana/dashboards/ （Dashboards | Grafana Labs）