Prometheus使用netdata为数据源，并配置报警-摩杜云开发者社区

1、安装netdata

# 使用官方脚本一键部署（需服务器能上外网）
wget -O /tmp/netdata-kickstart.sh https://my-netdata.io/kickstart.sh && sh /tmp/netdata-kickstart.sh

2、安装Prometheus

# 下载安装包
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
	
# 上传服务器、解压、启动
tar zxvf prometheus-2.45.0.linux-amd64.tar.gz
mv prometheus-2.45.0.linux-amd64 /usr/local/prometheus
cd /usr/local/prometheus

# Write the systemd unit file for Prometheus.
# systemd only recognises '#' as a comment at the start of a line; a trailing
# comment on the ExecStart line would be handed to prometheus as an extra
# argument and prevent the service from starting, so the explanation lives on
# its own line inside the unit.
cat > /usr/lib/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
After=network.target
Documentation=https://prometheus.io/
[Service]
Type=simple
User=root
Group=root
# --config.file, --storage.tsdb.path and --web.listen-address select the
# Prometheus config file, the data directory and the listen port.
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/usr/local/prometheus/data --web.listen-address=:9090
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

# Reload unit definitions, enable start-at-boot and start the service now.
systemctl daemon-reload && systemctl enable prometheus --now

3、安装并配置alertmanager

# 下载安装包
wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz

# 解压、移动
tar zxvf alertmanager-0.25.0.linux-amd64.tar.gz 
mv alertmanager-0.25.0.linux-amd64 /usr/local/alertmanager
cd /usr/local/alertmanager/

# 修改配置，配置邮件告警
cat > /usr/local/alertmanager/alertmanager.yml << EOF
global:
  resolve_timeout: 1m
  smtp_smarthost: 'xxx'			#填写邮箱smtp地址加端口
  smtp_from: 'xxx@163.com'	#填写邮箱账号
  smtp_auth_username: 'xxx'	#填写smtp认证账号
  smtp_auth_password: 'xxx'	#填写smtp认证授权码
  smtp_require_tls: false
route:  #用于配置告警分发策略
  group_by: [alertname] 		#采用哪个标签来作为分组依据
  group_wait: 10s       		#组告警等待时间。也就是告警产生后等待10s，如果有同组告警一起发出
  group_interval: 10s    		#上下两组发送告警的间隔时间
  repeat_interval: 10m    	#重复发送告警的时间，减少相同邮件的发送频率，默认是1h
  receiver: default-receiver  #定义谁来收告警
receivers:
- name: 'default-receiver'
  email_configs:
  - to: 'xxx@qq.com'	#填写接收邮件的邮箱
    send_resolved: true
EOF

# Write the systemd unit file for Alertmanager.
cat > /usr/lib/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=Alertmanager
After=network.target
Documentation=https://prometheus.io/docs/alerting/latest/alertmanager/

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

# Reload unit definitions, enable start-at-boot, then (re)start the service so
# the configuration written above is the one in effect.
systemctl daemon-reload && systemctl enable alertmanager && systemctl restart alertmanager

4、配置Prometheus使用netdata作为数据源

# 修改Prometheus配置（prometheus.yml），在 scrape_configs 下添加如下配置
  - job_name: 'netdata_test'
    metrics_path: '/api/v1/allmetrics'
    params:
      # format: prometheus | prometheus_all_hosts
      # You can use `prometheus_all_hosts` if you want Prometheus to set the `instance` to your hostname instead of IP 
      format: [ prometheus ]
      #
      # sources: as-collected | raw | average | sum | volume
      # default is: average
      #source: [as-collected]
      #
      # server name for this prometheus - the default is the client IP
      # for Netdata to uniquely identify it
      #server: ['prometheus1']
    honor_labels: true
    static_configs:
      - targets: [ '127.0.0.1:19999' ]	#netdata地址端口

5、配置报警

# 报警规则配置文件可自行设置，这里以开机时间大于24小时为例
# 编辑报警规则配置文件：rules.yml
root@pve:/usr/local/prometheus# vim rules.yml 
groups:
  - name: pve-server
    rules:
    # Example rule: fires once the host has been up for more than 24 hours.
    - alert: pve-server开机时间大于24个小时
      # The metric name indicates seconds; / 60 converts to minutes and
      # 1440 minutes equal 24 hours.
      expr: netdata_system_uptime_seconds_average / 60 > 1440
      for: 2s
      labels:
        severity: warning
      annotations:
        description: "开机时间大于 24 个小时"
        
# 编辑Prometheus配置文件，添加或修改如下两项内容
root@pve:/usr/local/prometheus# cat prometheus.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - 127.0.0.1:9093	#alertmanager地址端口

rule_files:
  - "rules.yml"		#指定上面编辑的rules文件
 
# 重启Prometheus
systemctl restart prometheus

完整的Prometheus配置文件

root@pve:/usr/local/prometheus# cat prometheus.yml 
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - 127.0.0.1:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]
  - job_name: 'netdata_test'
    metrics_path: '/api/v1/allmetrics'
    params:
      # format: prometheus | prometheus_all_hosts
      # You can use `prometheus_all_hosts` if you want Prometheus to set the `instance` to your hostname instead of IP 
      format: [ prometheus ]
      #
      # sources: as-collected | raw | average | sum | volume
      # default is: average
      #source: [as-collected]
      #
      # server name for this prometheus - the default is the client IP
      # for Netdata to uniquely identify it
      #server: ['prometheus1']
    honor_labels: true
    static_configs:
      - targets: [ '127.0.0.1:19999' ]

6、测试

页面访问：http://ip:9090

Prometheus使用netdata为数据源，并配置报警_监控

查看数据源：在页面顶部菜单中依次点击 Status → Targets

Prometheus使用netdata为数据源，并配置报警_监控_02

查看报警状态：

Prometheus使用netdata为数据源，并配置报警_linux_03

# 以上配置报警为服务器开机时间大于24小时即触发报警，可进行测试

7、脚本一键部署

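注意：使用该脚本需先将 prometheus-2.45.0.linux-amd64.tar.gz 和 alertmanager-0.25.0.linux-amd64.tar.gz 包放置在脚本所在文件夹下，并在该文件夹下准备 config 目录（放入上文的 rules.yml 以及修改后的 prometheus.yml，脚本会把 config 目录下的所有文件复制到 /usr/local/prometheus/）。脚本以当前目录（pwd）查找这些文件，因此需先进入该文件夹再执行脚本。执行前还需把脚本中 alertmanager 邮件配置里的 xxx 占位符替换为真实的邮箱信息。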

#!/usr/bin/env bash
# Directory the script was launched from.
INSTALL_DIR=$(pwd)

# ANSI escape sequences for coloured terminal output (expanded later by an
# escape-interpreting print).
# Regular weight
CDEF='\033[0m'      # reset to the default colour
CCIN='\033[0;36m'   # info (cyan)
CGSC='\033[0;32m'   # success (green)
CRER='\033[0;31m'   # error (red)
CWAR='\033[0;33m'   # warning (yellow)
# Bold weight
b_CDEF='\033[1;37m' # bold default
b_CCIN='\033[1;36m' # bold info
b_CGSC='\033[1;32m' # bold success
b_CRER='\033[1;31m' # bold error
b_CWAR='\033[1;33m' # bold warning

#######################################
# Print a message, coloured according to an optional leading type flag.
# The flag is consumed with `shift` rather than stripped by pattern
# substitution, so long flags (--success, ...) work and a "-s"/"-e"/"-w"/"-i"
# occurring inside the message text is left intact. Flagged messages no
# longer carry a stray leading space.
# Globals:   b_CGSC, b_CRER, b_CWAR, b_CCIN, CDEF (read)
# Arguments: $1   - optional flag: -s|--success, -e|--error,
#                   -w|--warning, -i|--info
#            rest - message words, joined with single spaces
# Outputs:   the message on stdout with backslash escapes interpreted
#######################################
prompt() {
  local color
  case "${1-}" in
    -s | --success) color=${b_CGSC-} ;; # success message
    -e | --error) color=${b_CRER-} ;;   # error message
    -w | --warning) color=${b_CWAR-} ;; # warning message
    -i | --info) color=${b_CCIN-} ;;    # info message
    *)
      # No recognised flag: print all arguments uncoloured.
      printf '%b\n' "$*"
      return
      ;;
  esac
  shift
  printf '%b\n' "${color}$*${CDEF-}"
}

##======= Install netdata (the server needs Internet access) =======
prompt "==========开始安装netdata=========="
# netdata counts as installed when a netdata process is listening on 19999.
if ss -tunlp | grep 19999 | grep -q netdata; then
  prompt -i "netdata已成功安装并启动"
else
  prompt -i "netdata未安装，开始安装netdata"
  # Download the kickstart installer and run it only if the download worked.
  wget -O /tmp/netdata-kickstart.sh https://my-netdata.io/kickstart.sh \
    && sh /tmp/netdata-kickstart.sh
  # Give the freshly started service a moment to bind its port before
  # probing, as the other install sections do.
  sleep 2
  # Verify the installation; abort so later steps do not configure scraping
  # against a netdata that is not there.
  if ss -tunlp | grep 19999 | grep -q netdata; then
    prompt -s "netdata已成功安装并启动"
  else
    prompt -e "netdata安装失败，请检查"
    exit 1
  fi
fi

##======= Install Prometheus =======
prompt "==========开始安装Prometheus=========="
# Prometheus counts as installed when a prometheus process listens on 9090.
if ss -tunlp | grep 9090 | grep -q prometheus; then
  prompt -i "prometheus已成功安装并启动，跳过安装进程"
else
  prompt -i "prometheus未安装，开始安装脚本"
  # Unpack only when the target directory is absent: moving the release onto
  # an existing /usr/local/prometheus would nest it inside that directory and
  # leave the ExecStart path below pointing at nothing.
  if [[ ! -d /usr/local/prometheus ]]; then
    if ! { tar zxvf prometheus-2.45.0.linux-amd64.tar.gz \
      && mv ./prometheus-2.45.0.linux-amd64 /usr/local/prometheus; }; then
      # Missing tarball, failed extraction or failed move.
      prompt -e "prometheus安装失败，请检查"
      exit 1
    fi
  fi

  # Write a systemd unit so the service can be managed with systemctl.
cat > /usr/lib/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
After=network.target
Documentation=https://prometheus.io/
[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/usr/local/prometheus/data --web.listen-address=:9090
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

  # Reload unit definitions, enable start-at-boot and start the service.
  systemctl daemon-reload && systemctl enable prometheus --now
  sleep 2
  # Verify that Prometheus is listening; abort otherwise, matching the
  # alertmanager section.
  if ss -tunlp | grep 9090 | grep -q prometheus; then
    prompt -s "prometheus已成功安装并启动"
  else
    prompt -e "prometheus安装失败，请检查"
    exit 1
  fi
fi

##======= Install and deploy alertmanager =======
prompt "==========开始安装alertmanager=========="
# alertmanager counts as installed when its process listens on 9093.
if ss -tunlp | grep 9093 | grep -q alertmanager; then
  prompt -i "alertmanager已成功安装并启动，跳过安装进程"
else
  prompt -i "alertmanager未安装，开始安装"
  # Unpack only when the target directory is absent: moving the release onto
  # an existing /usr/local/alertmanager would nest it inside that directory.
  if [[ ! -d /usr/local/alertmanager ]]; then
    if ! { tar zxvf alertmanager-0.25.0.linux-amd64.tar.gz \
      && mv alertmanager-0.25.0.linux-amd64 /usr/local/alertmanager; }; then
      # Missing tarball, failed extraction or failed move.
      prompt -e "alertmanager安装失败，请检查"
      exit 1
    fi
  fi

  # Write the e-mail alerting configuration. The 'xxx' values are
  # placeholders that have to be replaced with real SMTP settings.
cat > /usr/local/alertmanager/alertmanager.yml << 'EOF'
global:
  resolve_timeout: 1m
  smtp_smarthost: 'xxx'			#填写邮箱smtp地址加端口
  smtp_from: 'xxx@163.com'	#填写邮箱账号
  smtp_auth_username: 'xxx'	#填写smtp认证账号
  smtp_auth_password: 'xxx'	#填写smtp认证密码
  smtp_require_tls: false
route:  #用于配置告警分发策略
  group_by: [alertname] 		#采用哪个标签来作为分组依据
  group_wait: 10s       		#组告警等待时间。也就是告警产生后等待10s，如果有同组告警一起发出
  group_interval: 10s    		#上下两组发送告警的间隔时间
  repeat_interval: 10m    	#重复发送告警的时间，减少相同邮件的发送频率，默认是1h
  receiver: default-receiver  #定义谁来收告警
receivers:
- name: 'default-receiver'
  email_configs:
  - to: 'xxx@qq.com'	#填写接收邮件的邮箱
    send_resolved: true
EOF

  # Write a systemd unit so the service can be managed with systemctl.
cat > /usr/lib/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=Alertmanager
After=network.target
Documentation=https://prometheus.io/docs/alerting/latest/alertmanager/

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

  # Reload unit definitions, enable start-at-boot and start the service.
  systemctl daemon-reload && systemctl enable alertmanager --now
  sleep 2
  # Verify that alertmanager is listening; abort otherwise.
  if ss -tunlp | grep 9093 | grep -q alertmanager; then
    prompt -s "alertmanager已成功安装并启动"
    sleep 1
  else
    prompt -e "alertmanager安装失败，请检查"
    exit 1
  fi
fi

##======= Configure Prometheus =======
prompt "==========开始配置Prometheus=========="
# An existing rules.yml is taken as the sign that configuration was already
# deployed, in which case nothing is touched.
if [[ -f /usr/local/prometheus/rules.yml ]]; then
  prompt -i "rules文件已配置，跳过修改配置文件"
else
  prompt -i "rules文件不存在，开始配置"
  # Copy everything from the config directory next to the launch directory.
  # INSTALL_DIR is quoted so paths containing spaces work; an absent or empty
  # config directory leaves the glob unmatched, cp fails, and the script
  # stops instead of restarting Prometheus without the new configuration.
  if ! cp -rf "${INSTALL_DIR}"/config/* /usr/local/prometheus/; then
    prompt -e "配置文件复制失败，请检查 ${INSTALL_DIR}/config 目录"
    exit 1
  fi

  # Restart Prometheus so the copied configuration takes effect.
  systemctl restart prometheus
fi