在编写巡检脚本之前,首先需要明确巡检的内容和目标。巡检内容主要包括:系统基本信息、CPU、内存、磁盘分区、Swap 以及网络状态。
以下是一个用于收集上述部分信息的 bash 脚本:
#!/bin/bash
# System inspection script: appends a snapshot of system information to a log
# file. This preamble sets up the globals shared by the section functions.

# Log file path. Defaults to the system log directory (normally needs root);
# may be overridden by exporting LOG_FILE before running the script.
LOG_FILE="${LOG_FILE:-/var/log/system_inspection.log}"
# Timestamp taken once at script start and reused for log messages.
CURRENT_TIME=$(date +"%Y-%m-%d %H:%M:%S")
# Record the start of this run.
echo "[$CURRENT_TIME] System Inspection Script Started" >> "$LOG_FILE"
# Timestamped name; kept for compatibility, not referenced in the visible
# part of the script.
LogFileName=polling.$(date +%F-%T)
# Banner filler used around section titles: a run of 30 '=' characters,
# built by padding an empty string to width 30 and swapping the blanks.
printf -v EchoFormat '%*s' 30 ''
EchoFormat=${EchoFormat// /=}
# System overview
#######################################
# Append general system information (OS, kernel, host name, locale, time,
# load, uptime, logged-in users, SELinux mode, IP, CPU model, memory, swap)
# to the log.
# Globals:   LOG_FILE (read), EchoFormat (read), LANG (read)
# Arguments: none
# Outputs:   appends one section to $LOG_FILE
#######################################
SystemInfo(){
  local os_release load_1min uptime_days selinux_mode today now

  # /etc/redhat-release exists only on Red Hat family systems; fall back to
  # the PRETTY_NAME entry of /etc/os-release elsewhere.
  if [[ -r /etc/redhat-release ]]; then
    os_release=$(< /etc/redhat-release)
  elif [[ -r /etc/os-release ]]; then
    os_release=$(awk -F= '$1 == "PRETTY_NAME" { gsub(/"/, "", $2); print $2; exit }' /etc/os-release)
  fi
  os_release=${os_release:-unknown}

  # The column layout of `uptime` changes with how long the machine has been
  # up ("5 min", "3:45", "10 days, 3:45"), so read the kernel files directly.
  if [[ -r /proc/loadavg ]]; then
    load_1min=$(awk '{ print $1 }' /proc/loadavg)
  fi
  if [[ -r /proc/uptime ]]; then
    uptime_days=$(awk '{ print int($1 / 86400) }' /proc/uptime)
  fi

  # The SELinux config file is absent on systems without SELinux.
  if [[ -r /etc/selinux/config ]]; then
    selinux_mode=$(awk -F= '/^SELINUX=/ { print $2; exit }' /etc/selinux/config)
  fi

  read -r today now < <(date '+%F %T')

  {
    printf '%s 系统信息 %s\n' "$EchoFormat" "$EchoFormat"
    printf '系统类型: %-10s\n' "$(uname -a | awk '{ print $NF }')"
    printf '系统版本: %-10s\n' "$os_release"
    printf '内核信息: %-10s\n' "$(uname -r)"
    printf '主机名: %-10s\n' "$(uname -n)"
    printf '编码格式: %-10s\n' "${LANG:-}"
    printf '系统当前时间: %-10s %-10s\n' "$today" "$now"
    # Load average is a run-queue length, not a percentage: no "%" suffix.
    printf '系统运行负载: %-4s\n' "${load_1min:-N/A}"
    printf '系统运行天数: %-10s\n' "${uptime_days:-N/A}"
    printf '在线用户人数: %-3s\n' "$(w | tail -n +3 | wc -l)"
    printf 'SELinux: %-10s\n' "${selinux_mode:-N/A}"
    # `uptime -p` reports how long the system has been running.
    printf '系统已运行时间: %s\n' "$(uptime -p)"
    printf 'IP地址: %s\n' "$(hostname -I | cut -d' ' -f1)"
    # Anchor on the start of the line so "BIOS Model name:" is not picked up.
    printf 'Cpu处理器: %s\n' "$(LC_ALL=C lscpu | sed -n 's/^ *Model name: *//p' | head -n 1)"
    printf '内存空间: %s\n' "$(free -h | awk '/^Mem:/ { print $3 "/" $2 }')"
    printf '交换空间: %s\n' "$(free -h | awk '/^Swap:/ { print $3 "/" $2 }')"
  } >> "$LOG_FILE"
}
# CPU information
#######################################
# Append CPU topology, load averages, utilisation and the ten most
# CPU-hungry processes to the log.
# Globals:   LOG_FILE (read), EchoFormat (read)
# Arguments: none
# Outputs:   appends one section to $LOG_FILE
#######################################
CpuInfo(){
  local cpu_idle cpu_used load_1 load_5 load_15

  # Find the idle figure by its "id" suffix instead of a fixed column and
  # character range; LC_ALL=C keeps the decimal separator a dot.
  cpu_idle=$(LC_ALL=C top -b -n1 2>/dev/null | awk '
    /Cpu\(s\)/ && match($0, /[0-9]+(\.[0-9]+)?[ %]*id/) {
      printf "%.1f", substr($0, RSTART, RLENGTH) + 0
      exit
    }')
  if [[ -n "$cpu_idle" ]]; then
    cpu_used=$(awk -v idle="$cpu_idle" 'BEGIN { printf "%.1f", 100 - idle }')
  else
    cpu_idle="N/A"
    cpu_used="N/A"
  fi

  # /proc/loadavg holds the 1-, 5- and 15-minute load averages.
  if [[ -r /proc/loadavg ]]; then
    read -r load_1 load_5 load_15 _ < /proc/loadavg
  fi

  {
    printf '%s CPU信息 %s\n' "$EchoFormat" "$EchoFormat"
    printf '逻辑CPU核数: %-3s\n' "$(grep -c '^processor' /proc/cpuinfo)"
    printf '物理CPU核数: %-3s\n' "$(grep 'physical id' /proc/cpuinfo | sort -u | wc -l)"
    printf 'CPU架构: %-3s\n' "$(uname -m)"
    printf 'CPU设置型号: %-3s\n' "$(grep 'model name' /proc/cpuinfo | awk -F: '{ print $2 }' | sort -u | cut -c 2-50)"
    printf 'CPU 1分钟负载: %15s\n' "$load_1"
    printf 'CPU 5分钟负载: %15s\n' "$load_5"
    printf 'CPU15分钟负载: %15s\n' "$load_15"
    printf '使用CPU占比: %s %%\n' "$cpu_used"
    printf '空闲CPU占比: %s %%\n' "$cpu_idle"
    printf '占用CPU Top10信息:\n\n'
    # 11 lines = the ps header plus ten processes.
    ps -eo user,pid,pcpu,pmem,args --sort=-pcpu | head -n 11
  } >> "$LOG_FILE"
}
# Memory information
#######################################
# Append total/used/free memory, the used-memory percentage and the ten
# processes using the most memory to the log.
# Globals:   LOG_FILE (read), EchoFormat (read)
# Arguments: none
# Outputs:   appends one section to $LOG_FILE
#######################################
MemoryInfo(){
  local mem_total mem_used mem_free mem_pct

  # One `free -h` snapshot so the three figures belong to the same instant;
  # the row is selected by its "Mem:" label rather than by line number.
  read -r mem_total mem_used mem_free \
    < <(LC_ALL=C free -h | awk '/^Mem:/ { print $2, $3, $4 }')

  # Percentage = used / total (columns 3 and 2 of the "Mem:" row).
  mem_pct=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 { printf "%.2f", $3 / $2 * 100 }')

  {
    printf '%s 内存信息 %s\n' "$EchoFormat" "$EchoFormat"
    printf '总共内存: %-1s\n' "$mem_total"
    printf '使用内存: %-1s\n' "$mem_used"
    printf '剩余内存: %-1s\n' "$mem_free"
    printf '内存使用占比: %s %%\n' "${mem_pct:-N/A}"
    printf '占用内存排名前10的soft:\n\n'
    # 11 lines = the ps header plus ten processes.
    ps -eo user,pid,pcpu,pmem,args --sort=-pmem | head -n 11
  } >> "$LOG_FILE"
}
# Disk usage per mounted file system
#######################################
# Append the `df -h` table (in df's own order, not sorted) to the log,
# followed by a blank separator line.
# Globals:   LOG_FILE (read), EchoFormat (read)
# Arguments: none
# Outputs:   appends one section to $LOG_FILE; nothing on stdout
#######################################
Disk_Info() {
  {
    printf '%s 各分区使用率 %s\n' "$EchoFormat" "$EchoFormat"
    df -h
    # The separator belongs in the log like the rest of the report; written
    # to stdout it would make cron mail an empty line after every run.
    echo
  } >> "$LOG_FILE"
}
# Swap information
#######################################
# Append total, used and free swap space to the log.
# Globals:   LOG_FILE (read), EchoFormat (read)
# Arguments: none
# Outputs:   appends one section to $LOG_FILE
#######################################
SwapInfo(){
  local swap_total swap_used swap_free

  # Select the row by its "Swap:" label: older procps prints an extra
  # "-/+ buffers/cache" row, which moves Swap from line 3 to line 4.
  read -r swap_total swap_used swap_free \
    < <(LC_ALL=C free -h | awk '/^Swap:/ { print $2, $3, $4 }')

  {
    printf '%s Swap信息 %s\n' "$EchoFormat" "$EchoFormat"
    printf 'Swap总大小: %-1s\n' "$swap_total"
    printf '已用Swap: %-1s\n' "$swap_used"
    printf '可用Swap: %-1s\n' "$swap_free"
  } >> "$LOG_FILE"
}
# Network information
#######################################
# Append the non-loopback IPv4 addresses, default gateway(s), DNS servers
# and an Internet reachability check to the log.
# Globals:   LOG_FILE (read), EchoFormat (read)
# Arguments: none
# Outputs:   appends one section to $LOG_FILE
#######################################
NetworkInfo(){
  local addrs gateways dns_servers reachable

  if command -v ip >/dev/null 2>&1; then
    # `ip -o` prints one record per line; field 4 is ADDRESS/PREFIX.
    addrs=$(ip -4 -o addr show \
      | awk '{ sub(/\/.*/, "", $4); if ($4 !~ /^127\./) print $4 }')
    # The gateway is the word after "via" on a default route.
    gateways=$(ip route show default \
      | awk '/^default/ { for (i = 1; i < NF; i++) if ($i == "via") print $(i + 1) }')
  else
    # net-tools fallback; strips the "addr:" prefix of the legacy format.
    addrs=$(ifconfig -a \
      | awk '/inet / && !/inet6/ { sub(/^addr:/, "", $2); if ($2 !~ /^127\./) print $2 }')
    gateways=$(route -n | awk '$1 == "0.0.0.0" && $4 ~ /G/ { print $2 }')
  fi

  # Match on the first field so commented-out entries are ignored.
  dns_servers=$(awk '$1 == "nameserver" { print $2 }' /etc/resolv.conf)

  if ping -c2 -w2 www.baidu.com &>/dev/null; then
    reachable=是
  else
    reachable=否
  fi

  {
    printf '%s 网络信息 %s\n' "$EchoFormat" "$EchoFormat"
    # Multiple values are joined on one line, whatever their number.
    printf 'IP地址: %s\n' "${addrs//$'\n'/ }"
    printf '网关: %s\n' "${gateways//$'\n'/ }"
    printf 'DNS: %s\n' "${dns_servers//$'\n'/ }"
    printf '网络是否连通: %s\n' "$reachable"
  } >> "$LOG_FILE"
}
# Run every report section in order; each one appends to the log.
for section in SystemInfo CpuInfo MemoryInfo Disk_Info SwapInfo NetworkInfo; do
  "$section"
done
在脚本中添加异常检测和报警机制,可以在检测到异常情况时通过邮件、短信或其他方式通知系统管理员。使用mail命令发送邮件报警:
# ---- Threshold checks and e-mail alerting ----
# Relies on LOG_FILE and CURRENT_TIME set at the top of the script.

# Recipient of alert e-mails.
ALERT_EMAIL="admin@example.com"

#######################################
# Send an alert e-mail through the `mail` command.
# Globals:   ALERT_EMAIL (read), LOG_FILE (read), CURRENT_TIME (read)
# Arguments: $1 - optional one-line description of what triggered the alert
# Returns:   exit status of `mail`, or 1 when `mail` is not installed
#######################################
send_alert() {
  local subject="System Inspection Alert"
  local body="An issue has been detected on the system. Please check the log file for details."
  if [[ -n "${1:-}" ]]; then
    body+=$'\n'"$1"
  fi
  # Leave a trace in the log rather than failing silently.
  if ! command -v mail >/dev/null 2>&1; then
    echo "[$CURRENT_TIME] Error: 'mail' command not found, alert not sent" >> "$LOG_FILE"
    return 1
  fi
  printf '%s\n' "$body" | mail -s "$subject" "$ALERT_EMAIL"
}

#######################################
# Numeric "greater than" that accepts decimals.
# Arguments: $1 - measured value (may be empty), $2 - threshold
# Returns:   0 when $1 > $2; 1 otherwise, including when $1 is empty
#######################################
usage_exceeds() {
  awk -v value="$1" -v limit="$2" \
    'BEGIN { exit !(value != "" && value + 0 > limit + 0) }'
}

# CPU usage in percent: 100 minus the idle figure reported by top.
CPU_THRESHOLD=80
CPU_USAGE=$(LC_ALL=C top -b -n1 2>/dev/null | awk '
  /Cpu\(s\)/ && match($0, /[0-9]+(\.[0-9]+)?[ %]*id/) {
    printf "%.1f", 100 - (substr($0, RSTART, RLENGTH) + 0)
    exit
  }')
if usage_exceeds "$CPU_USAGE" "$CPU_THRESHOLD"; then
  echo "[$CURRENT_TIME] Warning: CPU Usage Exceeds Threshold" >> "$LOG_FILE"
  send_alert "CPU usage ${CPU_USAGE}% exceeds ${CPU_THRESHOLD}%"
fi

# Memory usage in percent: used / total from the "Mem:" row of free.
MEM_THRESHOLD=80
MEM_USAGE=$(LC_ALL=C free 2>/dev/null \
  | awk '/^Mem:/ && $2 > 0 { printf "%.1f", $3 / $2 * 100 }')
if usage_exceeds "$MEM_USAGE" "$MEM_THRESHOLD"; then
  echo "[$CURRENT_TIME] Warning: Memory Usage Exceeds Threshold" >> "$LOG_FILE"
  send_alert "Memory usage ${MEM_USAGE}% exceeds ${MEM_THRESHOLD}%"
fi

# Disk usage of the root file system, e.g. "45%". Only "/" is checked here;
# NOTE(review): extend to other mount points if they need monitoring.
DISK_THRESHOLD=80
DISK_USAGE=$(df -P / 2>/dev/null | awk 'NR == 2 { print $5 }')
DISK_USAGE_PERCENT=${DISK_USAGE%\%}
if [[ "$DISK_USAGE_PERCENT" =~ ^[0-9]+$ ]] \
  && (( DISK_USAGE_PERCENT >= DISK_THRESHOLD )); then
  echo "[$CURRENT_TIME] Warning: Disk Usage Exceeds Threshold" >> "$LOG_FILE"
  send_alert "Disk usage of / is ${DISK_USAGE_PERCENT}% (threshold ${DISK_THRESHOLD}%)"
fi
将脚本保存为 /usr/local/bin/check_system.sh,并确保其具有可执行权限:
chmod +x /usr/local/bin/check_system.sh
使用 cron 定时任务来定期执行该脚本。例如,设置每天凌晨2点执行一次,先运行以下命令打开 crontab 编辑器:
crontab -e
在crontab文件中添加以下行:
0 2 * * * /usr/local/bin/check_system.sh
这样,系统就会每天凌晨2点自动执行巡检脚本,并将结果记录到日志文件中。如果检测到异常情况,脚本还会发送报警邮件给系统管理员。
相关标签: