1、安装netdata
# Install Netdata with the official kickstart script (the server needs outbound internet access).
# The script is first saved to /tmp/netdata-kickstart.sh and only run with sh if the download succeeded.
wget -O /tmp/netdata-kickstart.sh https://my-netdata.io/kickstart.sh && sh /tmp/netdata-kickstart.sh
2、安装Prometheus
# Download the release tarball.
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
# Unpack it on the server and move it into place.
tar zxvf prometheus-2.45.0.linux-amd64.tar.gz
mv prometheus-2.45.0.linux-amd64 /usr/local/prometheus
cd /usr/local/prometheus
# Write the systemd unit file.
# The ExecStart flags choose the Prometheus config file (--config.file), the
# data directory (--storage.tsdb.path) and the listen address (--web.listen-address).
# This note is kept outside the unit file on purpose: systemd only treats "#" as
# a comment at the start of a line, so a trailing "# ..." after ExecStart would
# be handed to prometheus as extra arguments and the service would not start.
# The heredoc delimiter is quoted so the unit text is written literally.
cat > /usr/lib/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
After=network.target
Documentation=https://prometheus.io/
[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/usr/local/prometheus/data --web.listen-address=:9090
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# Reload systemd, enable the service at boot and start it now.
systemctl daemon-reload && systemctl enable prometheus --now
3、安装并配置alertmanager
# Download the release tarball.
wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
# Unpack it and move it into place.
tar zxvf alertmanager-0.25.0.linux-amd64.tar.gz
mv alertmanager-0.25.0.linux-amd64 /usr/local/alertmanager
cd /usr/local/alertmanager/
# Write the Alertmanager configuration with e-mail notifications.
# The nesting below is significant: the smtp_* keys belong to `global`, the
# grouping keys to `route`, and `email_configs` to the receiver entry.
# Replace every 'xxx' placeholder before starting the service.
cat > /usr/local/alertmanager/alertmanager.yml << 'EOF'
global:
  resolve_timeout: 1m
  smtp_smarthost: 'xxx' #填写邮箱smtp地址加端口
  smtp_from: 'xxx@163.com' #填写邮箱账号
  smtp_auth_username: 'xxx' #填写smtp认证账号
  smtp_auth_password: 'xxx' #填写smtp认证授权码
  smtp_require_tls: false
route: #用于配置告警分发策略
  group_by: [alertname] #采用哪个标签来作为分组依据
  group_wait: 10s #组告警等待时间。也就是告警产生后等待10s,如果有同组告警一起发出
  group_interval: 10s #上下两组发送告警的间隔时间
  repeat_interval: 10m #重复发送告警的时间,减少相同邮件的发送频率,默认是4h
  receiver: default-receiver #定义谁来收告警
receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'xxx@qq.com' #填写接收邮件的邮箱
        send_resolved: true
EOF
# Write the systemd unit file (quoted delimiter: the text is written literally).
cat > /usr/lib/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=Alertmanager
After=network.target
Documentation=https://prometheus.io/docs/alerting/latest/alertmanager/
[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# Reload systemd, enable the service at boot, then (re)start it so the
# configuration written above is the one in use.
systemctl daemon-reload && systemctl enable alertmanager && systemctl restart alertmanager
4、配置Prometheus使用netdata作为数据源
# 修改Prometheus配置,在 scrape_configs 下添加如下配置(注意保持YAML缩进)
  - job_name: 'netdata_test'
    metrics_path: '/api/v1/allmetrics'
    params:
      # format: prometheus | prometheus_all_hosts
      # You can use `prometheus_all_hosts` if you want Prometheus to set the `instance` to your hostname instead of IP
      format: [ prometheus ]
      #
      # sources: as-collected | raw | average | sum | volume
      # default is: average
      #source: [as-collected]
      #
      # server name for this prometheus - the default is the client IP
      # for Netdata to uniquely identify it
      #server: ['prometheus1']
    honor_labels: true
    static_configs:
      - targets: [ '127.0.0.1:19999' ] #netdata地址端口
5、配置报警
# 报警规则配置文件可自行设置,这里以开机时间大于24小时为例
# 编辑报警规则配置文件:rules.yml
root@pve:/usr/local/prometheus# vim rules.yml
groups:
  - name: pve-server
    rules:
      - alert: pve-server开机时间大于24个小时
        expr: netdata_system_uptime_seconds_average / 60 > 1440
        for: 2s
        labels:
          severity: warning
        annotations:
          description: "开机时间大于 24 个小时"
# 编辑Prometheus配置文件,添加或修改如下两项内容
root@pve:/usr/local/prometheus# cat prometheus.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093 #alertmanager地址端口
rule_files:
  - "rules.yml" #指定上面编辑的rules文件
# Restart Prometheus so the edited prometheus.yml and rules.yml are loaded.
systemctl restart prometheus
- 完整的Prometheus配置文件
root@pve:/usr/local/prometheus# cat prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: 'netdata_test'
    metrics_path: '/api/v1/allmetrics'
    params:
      # format: prometheus | prometheus_all_hosts
      # You can use `prometheus_all_hosts` if you want Prometheus to set the `instance` to your hostname instead of IP
      format: [ prometheus ]
      #
      # sources: as-collected | raw | average | sum | volume
      # default is: average
      #source: [as-collected]
      #
      # server name for this prometheus - the default is the client IP
      # for Netdata to uniquely identify it
      #server: ['prometheus1']
    honor_labels: true
    static_configs:
      - targets: [ '127.0.0.1:19999' ]
6、测试
- 页面访问:http://ip:9090
- 查看数据源:页面顶部菜单 Status → Targets
- 查看报警状态:页面顶部菜单 Alerts
# 以上配置报警为服务器开机时间大于24小时即触发报警,可进行测试
7、脚本一键部署
注意:使用该脚本需先将 prometheus-2.45.0.linux-amd64.tar.gz 和 alertmanager-0.25.0.linux-amd64.tar.gz 包,以及存放Prometheus配置文件(如 rules.yml)的 config 目录放置在脚本文件夹下,并在该文件夹内执行脚本(脚本以当前工作目录查找安装包和 config 目录)
#!/usr/bin/env bash
# Directory the script was started from; the config directory is looked up here.
INSTALL_DIR=$(pwd)

# ANSI colour escape sequences, stored as literal backslash text and
# interpreted later by `echo -e`.
# Regular weight:
CDEF='\033[0m'    # reset to default
CCIN='\033[0;36m' # info (cyan)
CGSC='\033[0;32m' # success (green)
CRER='\033[0;31m' # error (red)
CWAR='\033[0;33m' # warning (yellow)
# Bold weight:
b_CDEF='\033[1;37m' # bold default
b_CCIN='\033[1;36m' # bold info
b_CGSC='\033[1;32m' # bold success
b_CRER='\033[1;31m' # bold error
b_CWAR='\033[1;33m' # bold warning
#######################################
# Print a message, coloured according to an optional leading severity flag.
# Globals:   b_CGSC, b_CRER, b_CWAR, b_CCIN, CDEF (read)
# Arguments: $1   - optional flag: -s|--success, -e|--error,
#                   -w|--warning, -i|--info
#            rest - message words, joined with spaces
# Outputs:   the message on stdout via `echo -e`; without a recognised flag
#            all arguments are printed uncoloured.
#######################################
prompt() {
  local color
  case "${1-}" in
    -s | --success) color=${b_CGSC} ;;
    -e | --error) color=${b_CRER} ;;
    -w | --warning) color=${b_CWAR} ;;
    -i | --info) color=${b_CCIN} ;;
    *)
      echo -e "$@"
      return
      ;;
  esac
  # Drop the flag with `shift` rather than a pattern substitution over all
  # arguments: the substitution also ate "-s"/"-e"/"-w"/"-i" inside the message
  # text (e.g. "pve-server"), left the long flags half-printed ("-uccess"),
  # and kept a stray leading space.
  shift
  echo -e "${color}$*${CDEF}"
}
##======= Install Netdata (the server needs outbound internet access) =======
# Succeeds when `ss` reports a listener line mentioning both 19999 and netdata.
netdata_is_listening() {
  ss -tunlp | grep 19999 | grep netdata >/dev/null
}

prompt "==========开始安装netdata=========="
if ! netdata_is_listening; then
  prompt -i "netdata未安装,开始安装netdata"
  # Fetch the official kickstart script and run it only if the download worked.
  wget -O /tmp/netdata-kickstart.sh https://my-netdata.io/kickstart.sh \
    && sh /tmp/netdata-kickstart.sh
  # Probe the port again to report whether the installer brought Netdata up.
  if netdata_is_listening; then
    prompt -s "netdata已成功安装并启动"
  else
    prompt -e "netdata安装失败,请检查"
  fi
else
  prompt -i "netdata已成功安装并启动"
fi
##======= Install Prometheus =======
prompt "==========开始安装Prometheus=========="
# Prometheus counts as installed when `ss` shows a listener line mentioning
# both 9090 and prometheus.
if ss -tunlp | grep 9090 | grep -q prometheus; then
  prompt -i "prometheus已成功安装并启动,跳过安装进程"
else
  prompt -i "prometheus未安装,开始安装脚本"
  if [[ -d /usr/local/prometheus ]]; then
    # An earlier install that is merely stopped is still on disk. `mv` onto an
    # existing directory would nest the new tree inside it, so reuse the files.
    prompt -w "/usr/local/prometheus 已存在,跳过解压并沿用现有文件"
  else
    # Without the unpacked tree there is nothing to start, so stop on failure.
    tar zxvf prometheus-2.45.0.linux-amd64.tar.gz || {
      prompt -e "prometheus安装包解压失败,请检查"
      exit 1
    }
    mv ./prometheus-2.45.0.linux-amd64 /usr/local/prometheus || {
      prompt -e "移动prometheus到/usr/local/prometheus失败,请检查"
      exit 1
    }
  fi
  # Write a systemd unit so the service can be managed with systemctl.
  # The delimiter is quoted so the unit text is written literally.
  cat > /usr/lib/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
After=network.target
Documentation=https://prometheus.io/
[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/usr/local/prometheus/data --web.listen-address=:9090
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
  # Reload systemd, enable the service at boot and start it now.
  systemctl daemon-reload && systemctl enable prometheus --now
  # Give the process a moment to bind its port before probing.
  sleep 2
  if ss -tunlp | grep 9090 | grep -q prometheus; then
    prompt -s "prometheus已成功安装并启动"
  else
    prompt -e "prometheus安装失败,请检查"
  fi
fi
##======= Install and deploy Alertmanager =======
prompt "==========开始安装alertmanager=========="
# Alertmanager counts as installed when `ss` shows a listener line mentioning
# both 9093 and alertmanager.
if ss -tunlp | grep 9093 | grep -q alertmanager; then
  prompt -i "alertmanager已成功安装并启动,跳过安装进程"
else
  prompt -i "alertmanager未安装,开始安装"
  if [[ -d /usr/local/alertmanager ]]; then
    # An earlier install that is merely stopped is still on disk. `mv` onto an
    # existing directory would nest the new tree inside it, and rewriting
    # alertmanager.yml would replace real SMTP settings with placeholders.
    prompt -w "/usr/local/alertmanager 已存在,跳过解压并保留现有配置"
  else
    # Without the unpacked tree there is nothing to start, so stop on failure.
    tar zxvf alertmanager-0.25.0.linux-amd64.tar.gz || {
      prompt -e "alertmanager安装包解压失败,请检查"
      exit 1
    }
    sleep 1
    mv alertmanager-0.25.0.linux-amd64 /usr/local/alertmanager || {
      prompt -e "移动alertmanager到/usr/local/alertmanager失败,请检查"
      exit 1
    }
    # Write the e-mail alerting template. The nesting is significant: smtp_*
    # keys belong to `global`, grouping keys to `route`, `email_configs` to
    # the receiver. Every 'xxx' placeholder must be filled in by the operator.
    cat > /usr/local/alertmanager/alertmanager.yml << 'EOF'
global:
  resolve_timeout: 1m
  smtp_smarthost: 'xxx' #填写邮箱smtp地址加端口
  smtp_from: 'xxx@163.com' #填写邮箱账号
  smtp_auth_username: 'xxx' #填写smtp认证账号
  smtp_auth_password: 'xxx' #填写smtp认证密码
  smtp_require_tls: false
route: #用于配置告警分发策略
  group_by: [alertname] #采用哪个标签来作为分组依据
  group_wait: 10s #组告警等待时间。也就是告警产生后等待10s,如果有同组告警一起发出
  group_interval: 10s #上下两组发送告警的间隔时间
  repeat_interval: 10m #重复发送告警的时间,减少相同邮件的发送频率,默认是4h
  receiver: default-receiver #定义谁来收告警
receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'xxx@qq.com' #填写接收邮件的邮箱
        send_resolved: true
EOF
  fi
  # Write the systemd unit file (quoted delimiter: text is written literally).
  cat > /usr/lib/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=Alertmanager
After=network.target
Documentation=https://prometheus.io/docs/alerting/latest/alertmanager/
[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
  # Reload systemd, enable the service at boot and start it now.
  systemctl daemon-reload && systemctl enable alertmanager --now
  # Give the process a moment to bind its port before probing.
  sleep 2
  if ss -tunlp | grep 9093 | grep -q alertmanager; then
    prompt -s "alertmanager已成功安装并启动"
    sleep 1
  else
    prompt -e "alertmanager安装失败,请检查"
    exit 1
  fi
fi
##======= Configure Prometheus =======
prompt "==========开始配置Prometheus=========="
# An existing rules.yml is taken as the sign that configuration was already done.
if [[ -f /usr/local/prometheus/rules.yml ]]; then
  prompt -i "rules文件已配置,跳过修改配置文件"
else
  prompt -i "rules文件不存在,开始配置"
  # The configuration files ship in a "config" directory next to where the
  # script was started (INSTALL_DIR). Refuse to continue without it.
  if [[ ! -d "${INSTALL_DIR}/config" ]]; then
    prompt -e "未找到配置目录 ${INSTALL_DIR}/config,请检查"
    exit 1
  fi
  # The variable is quoted so a path containing spaces survives; only the
  # trailing glob is left unquoted. Restart only when the copy succeeded,
  # otherwise Prometheus would be bounced with an unchanged or partial config.
  if cp -rf -- "${INSTALL_DIR}"/config/* /usr/local/prometheus/; then
    systemctl restart prometheus
  else
    prompt -e "复制配置文件失败,请检查"
    exit 1
  fi
fi