Linux运维实用脚本集详解
更新时间:2026年01月26日 10:07:49 作者:ICT董老师
本文分享了作者在实际生产环境中积累的实用脚本,涵盖监控、备份、诊断、自动化等多个场景。建议将脚本保存在 /opt/scripts/ 目录,设置好权限,并添加日志记录和错误处理。文中提供的脚本示例包括系统监控、备份、自动化部署等。
以下是在实际生产环境中积累的实用脚本,涵盖监控、备份、诊断、自动化等场景。
脚本使用建议
所有脚本保存到 /opt/scripts/ 目录并设置权限
添加日志记录和错误处理
通过 crontab -e 设置定时任务
一、系统监控与告警类
1.1 综合系统监控脚本
#!/bin/bash
# File: /opt/scripts/system_monitor.sh
# Purpose: one-shot health check covering CPU, memory, disk usage, key
#          processes and established TCP connections. Every finding is
#          printed to stdout and appended to LOG_FILE.
# Schedule: intended to be run from cron, e.g. every 5 minutes.
# Note: the CPU check samples /proc/stat twice, one second apart, so each
#       run takes at least one second.

LOG_FILE="/var/log/system_monitor.log"
THRESHOLD_CPU=80  # CPU usage threshold, percent
THRESHOLD_MEM=85  # memory usage threshold, percent
THRESHOLD_DISK=90 # disk usage threshold, percent

# Print a message and append it to the log file.
report() {
  echo "$1" | tee -a "$LOG_FILE"
}

# Emit "<idle> <total>" jiffies taken from the aggregate "cpu" line, which is
# the first line of /proc/stat. iowait is counted as idle time. Fields that
# the running kernel does not report are treated as 0.
cpu_counters() {
  local tag user nice system idle iowait irq softirq steal rest
  read -r tag user nice system idle iowait irq softirq steal rest \
    < /proc/stat || return 1
  local idle_all=$((idle + ${iowait:-0}))
  local busy=$((user + nice + system + ${irq:-0} + ${softirq:-0} + ${steal:-0}))
  echo "$idle_all $((idle_all + busy))"
}

# Timestamp header for this run
report "====== $(date '+%Y-%m-%d %H:%M:%S') ======"

# 1. CPU: busy share of the jiffies elapsed between two samples.
#    Integer arithmetic only, so no dependency on bc or on top's output format.
CPU_USAGE=""
if read -r idle_start total_start < <(cpu_counters) && sleep 1 &&
  read -r idle_end total_end < <(cpu_counters); then
  total_delta=$((total_end - total_start))
  if ((total_delta > 0)); then
    CPU_USAGE=$((100 * (total_delta - (idle_end - idle_start)) / total_delta))
  fi
fi
if [[ -z "$CPU_USAGE" ]]; then
  report "⚠️ 警告:无法获取CPU使用率"
elif ((CPU_USAGE > THRESHOLD_CPU)); then
  report "⚠️ 警告:CPU使用率过高 - ${CPU_USAGE}%"
  # An alert action can be added here, e.g. sending a mail:
  # /opt/scripts/send_alert.sh "CPU报警" "CPU使用率: ${CPU_USAGE}%"
fi

# 2. Memory: a single `free` call so that total and used come from the same
#    sample; the numeric guard also prevents a division by zero.
read -r MEM_TOTAL MEM_USED < <(free -m | awk '/^Mem:/ {print $2, $3}')
if [[ "$MEM_TOTAL" =~ ^[0-9]+$ && "$MEM_USED" =~ ^[0-9]+$ ]] &&
  ((MEM_TOTAL > 0)); then
  MEM_PERCENT=$((MEM_USED * 100 / MEM_TOTAL))
  if ((MEM_PERCENT > THRESHOLD_MEM)); then
    report "⚠️ 警告:内存使用率过高 - ${MEM_PERCENT}%"
    # Header line plus the 10 processes using the most memory
    ps aux --sort=-%mem | head -11 | tee -a "$LOG_FILE"
  fi
else
  report "⚠️ 警告:无法获取内存使用率"
fi

# 3. Disk: `df -P` keeps every filesystem on one line even when the device
#    name is long; the last read variable receives the rest of the line, so
#    mount points containing spaces stay intact.
while read -r filesystem _ _ _ used_pct mount_point; do
  [[ "$filesystem" == /dev/* ]] || continue
  used_pct=${used_pct%\%}
  [[ "$used_pct" =~ ^[0-9]+$ ]] || continue
  if ((used_pct > THRESHOLD_DISK)); then
    report "⚠️ 警告:磁盘 $mount_point 使用率过高 - ${used_pct}%"
  fi
done < <(df -P)

# 4. Key processes. `pgrep -x` compares the exact process name, so the list
#    must hold daemon names (mysqld, redis-server), not product names.
#    Adjust it to the local installation (for example mariadbd).
PROCESS_LIST=("nginx" "mysqld" "redis-server" "sshd")
for proc in "${PROCESS_LIST[@]}"; do
  if ! pgrep -x "$proc" > /dev/null; then
    report "❌ 关键进程 $proc 未运行!"
  fi
done

# 5. Established TCP connections (useful on web servers).
#    netstat is used when present; ss is the fallback on systems without it.
CONN_COUNT=""
if command -v netstat &> /dev/null; then
  CONN_COUNT=$(netstat -ant | grep -c ESTABLISHED)
elif command -v ss &> /dev/null; then
  # Skip the header line that ss prints before the socket rows
  CONN_COUNT=$(ss -tan state established | tail -n +2 | wc -l)
fi
if [[ -n "$CONN_COUNT" ]]; then
  report "当前ESTABLISHED连接数: $CONN_COUNT"
fi

report "监控完成"
1.2 实时进程资源监控
#!/bin/bash
# File: /opt/scripts/process_watch.sh
# Purpose: periodically print the resource usage of matching processes,
#          like a minimal `top` restricted to one target.
# Usage: ./process_watch.sh <process name or PID>
#   - An all-digit argument is treated as a PID and matched exactly.
#   - Any other argument is used as a regular expression that is matched
#     against each process's full command line.
# Runs until interrupted with Ctrl+C.

PROCESS_NAME=$1
INTERVAL=2 # seconds between samples

if [ -z "$PROCESS_NAME" ]; then
  echo "用法: $0 <进程名或PID>"
  exit 1
fi

# Print one formatted row per matching process.
# The target is handed to awk through the environment, so it is never
# spliced into program text. This script ($$) and its direct children are
# skipped, because their command lines contain the target as an argument.
list_matches() {
  local stamp
  stamp=$(date '+%H:%M:%S')
  ps -eo pid=,ppid=,pcpu=,pmem=,vsz=,rss=,args= |
    WATCH_TARGET="$PROCESS_NAME" awk -v self="$$" -v stamp="$stamp" '
      BEGIN {
        target = ENVIRON["WATCH_TARGET"]
        by_pid = (target ~ /^[0-9]+$/)
      }
      $1 == self || $2 == self { next }
      {
        # Drop the six numeric columns to isolate the command line
        cmd = $0
        for (i = 1; i <= 6; i++) sub(/^[ \t]*[^ \t]+/, "", cmd)
        sub(/^[ \t]+/, "", cmd)
        hit = by_pid ? ($1 == target) : (cmd ~ target)
        if (hit)
          printf "%s %6s %5s %6s %10s %10s %s\n", stamp, $1, $3, $4, $5, $6, $7
      }'
}

# Print the number of entries in /proc/<pid>/fd, or "N/A" when that
# directory cannot be read (process gone, or owned by another user).
count_open_fds() {
  local pid=$1
  local -a entries
  if [[ ! -r "/proc/$pid/fd" ]]; then
    echo "N/A"
    return
  fi
  shopt -s nullglob
  entries=(/proc/"$pid"/fd/*)
  shopt -u nullglob
  echo "${#entries[@]}"
}

echo "监控进程: $PROCESS_NAME,按Ctrl+C退出"
echo "时间戳 PID CPU% MEM% 虚拟内存 物理内存 进程名"
echo "----------------------------------------------------------------"

while true; do
  mapfile -t rows < <(list_matches)
  if ((${#rows[@]} > 0)); then
    printf '%s\n' "${rows[@]}"
    # The open-file count is reported for the first matching process only;
    # its PID is the second column of the first row.
    read -r _ PID _ <<< "${rows[0]}"
    echo " 打开文件数: $(count_open_fds "$PID")"
  fi
  sleep "$INTERVAL"
done
二、备份与同步类
2.1 智能增量备份脚本
#!/bin/bash
# File: /opt/scripts/smart_backup.sh
# Purpose: file backup of SOURCE_DIR into BACKUP_DIR.
#   - Sunday, or when no full archive exists yet: full backup.
#   - Any other day: archive of the files modified within the last 24 hours.
#   - Archives older than RETENTION_DAYS are pruned after a successful run.
# Usage: smart_backup.sh [--force-full]
#   --force-full  take a full backup regardless of the weekday
# Exit status: 0 on success or when there was nothing to back up, 1 on failure.
# Configuration: edit SOURCE_DIR and BACKUP_DIR below.

SOURCE_DIR="/data/www"   # directory to back up
BACKUP_DIR="/backup/www" # where the archives are stored
RETENTION_DAYS=7         # days to keep archives
DATE=$(date '+%Y%m%d_%H%M%S')
BACKUP_LOG="/var/log/backup_${DATE}.log"

BACKUP_TYPE=""
BACKUP_PATH=""
CHANGED_LIST="" # temp file holding the incremental file list

FORCE_FULL=0
if [[ "${1:-}" == "--force-full" ]]; then
  FORCE_FULL=1
fi

# Remove the temporary file list on every exit path.
cleanup() {
  if [[ -n "$CHANGED_LIST" ]]; then
    rm -f -- "$CHANGED_LIST"
  fi
}
trap cleanup EXIT

# Print a timestamped message and append it to the run's log file.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$BACKUP_LOG"
}

# Abort the script when BACKUP_DIR has less than $1 KB available.
check_disk_space() {
  local required=$1
  local available
  available=$(df -Pk "$BACKUP_DIR" | awk 'NR==2 {print $4}')
  if [[ ! "$available" =~ ^[0-9]+$ ]]; then
    log "❌ 无法获取磁盘可用空间: $BACKUP_DIR"
    exit 1
  fi
  if ((available < required)); then
    log "❌ 磁盘空间不足!可用: ${available}KB, 需要: ${required}KB"
    exit 1
  fi
}

# Run tar with the given arguments, sending its stderr to the log.
# GNU tar exits with 1 when a file changed while it was being archived; the
# archive is still usable, so only a status above 1 is treated as a failure.
run_tar() {
  local rc
  tar "$@" 2>> "$BACKUP_LOG"
  rc=$?
  if ((rc > 1)); then
    return "$rc"
  fi
  if ((rc == 1)); then
    log "⚠️ 备份期间有文件发生变化"
  fi
  return 0
}

# Create a full archive of SOURCE_DIR. Returns non-zero on failure.
full_backup() {
  local est_size
  BACKUP_TYPE="full"
  BACKUP_PATH="$BACKUP_DIR/full/backup_${DATE}.tar.gz"
  # Uncompressed size in KB plus a 10% margin
  est_size=$(du -sk "$SOURCE_DIR" 2> /dev/null | awk '{print $1}')
  check_disk_space $((${est_size:-0} * 110 / 100))
  log "开始全量备份..."
  run_tar -czf "$BACKUP_PATH" "$SOURCE_DIR"
}

# Archive the files modified within the last 24 hours.
# Returns 0 when an archive was written, 2 when nothing changed, 1 on failure.
incremental_backup() {
  local file_count
  BACKUP_TYPE="incremental"
  BACKUP_PATH="$BACKUP_DIR/incremental/inc_${DATE}.tar.gz"
  CHANGED_LIST=$(mktemp) || return 1
  # NUL-delimited so that any file name survives the round trip to tar
  find "$SOURCE_DIR" -type f -mtime -1 -print0 > "$CHANGED_LIST"
  if [[ ! -s "$CHANGED_LIST" ]]; then
    log "没有文件变化,跳过备份"
    return 2
  fi
  run_tar -czf "$BACKUP_PATH" --null -T "$CHANGED_LIST" || return 1
  file_count=$(tr -cd '\0' < "$CHANGED_LIST" | wc -c)
  log "增量备份完成,文件数: ${file_count}"
}

if ! mkdir -p "$BACKUP_DIR"/{full,incremental}; then
  log "❌ 无法创建备份目录: $BACKUP_DIR"
  exit 1
fi

# Pick the backup type. The fallback to a full backup is done in-process:
# re-invoking the script would take the same branch again.
if ((FORCE_FULL)) || [[ "$(date '+%u')" -eq 7 ]]; then
  full_backup
  status=$?
elif ! compgen -G "$BACKUP_DIR/full/*.tar.gz" > /dev/null; then
  log "未找到全量备份,执行全量备份..."
  full_backup
  status=$?
else
  incremental_backup
  status=$?
fi

if ((status == 2)); then
  : # nothing changed; not a failure
elif ((status == 0)) && [[ -s "$BACKUP_PATH" ]] &&
  gzip -t "$BACKUP_PATH" 2>> "$BACKUP_LOG"; then
  BACKUP_SIZE=$(du -h "$BACKUP_PATH" | awk '{print $1}')
  log "✅ 备份成功: $BACKUP_TYPE备份, 大小: $BACKUP_SIZE, 路径: $BACKUP_PATH"
  # Optional success notification:
  # echo "备份成功: $(hostname) - $BACKUP_TYPE" | mail -s "备份成功通知" admin@example.com

  # Old full archives are pruned only once a new full archive has been
  # verified, so a failed full run never removes the last good one.
  if [[ "$BACKUP_TYPE" == "full" ]]; then
    find "$BACKUP_DIR/full" -type f -mtime +"$RETENTION_DAYS" -delete
  fi
  find "$BACKUP_DIR/incremental" -type f -mtime +"$RETENTION_DAYS" -delete
else
  # Do not leave a truncated or corrupt archive behind
  if [[ -n "$BACKUP_PATH" ]]; then
    rm -f -- "$BACKUP_PATH"
  fi
  log "❌ 备份失败!"
  exit 1
fi

# Remove this script's own log files after 30 days
find /var/log/ -maxdepth 1 -type f -name "backup_*.log" -mtime +30 -delete
2.2 MySQL数据库备份
#!/bin/bash
# File: /opt/scripts/mysql_backup.sh
# Purpose: full logical backup of every MySQL database, followed by a copy
#          of the server's binary logs when binary logging is enabled.
# Configuration: set the connection values below, or export MYSQL_USER,
#   MYSQL_PASS and MYSQL_HOST before running; environment values win.
# Exit status: 0 on success, 1 when the dump fails.
# Note: copying the binary logs requires that this script can read the
#   server's log directory, i.e. that it runs on the database host.

MYSQL_USER="${MYSQL_USER:-backup}"
MYSQL_PASS="${MYSQL_PASS:-your_password}"
MYSQL_HOST="${MYSQL_HOST:-localhost}"
BACKUP_DIR="/backup/mysql"
RETENTION_DAYS=30
DATE=$(date '+%Y%m%d')
TIME=$(date '+%H%M')
DUMP_FILE="$BACKUP_DIR/full/all_dbs_${DATE}_${TIME}.sql.gz"

# Print a timestamped message.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Create the backup directories
if ! mkdir -p "$BACKUP_DIR"/{full,binlog}; then
  log "❌ 无法创建备份目录: $BACKUP_DIR"
  exit 1
fi

# Credentials go into a private option file instead of the command line,
# where they would be visible to other users in the process list.
# mktemp creates the file readable by its owner only.
CLIENT_CNF=$(mktemp) || exit 1
trap 'rm -f -- "$CLIENT_CNF"' EXIT
# Escape backslashes and double quotes for the quoted option-file value
escaped_pass=${MYSQL_PASS//\\/\\\\}
escaped_pass=${escaped_pass//\"/\\\"}
printf '[client]\nuser=%s\npassword="%s"\nhost=%s\n' \
  "$MYSQL_USER" "$escaped_pass" "$MYSQL_HOST" > "$CLIENT_CNF"
# --defaults-extra-file has to be the first option given to the client
MYSQL_OPTS=(--defaults-extra-file="$CLIENT_CNF")

# 1. Full backup of all databases
log "开始MySQL全量备份..."
mysqldump "${MYSQL_OPTS[@]}" \
  --all-databases \
  --single-transaction \
  --routines \
  --triggers \
  --events \
  --flush-logs \
  --master-data=2 |
  gzip > "$DUMP_FILE"
# Capture both stages at once: PIPESTATUS is reset by the next command
pipe_status=("${PIPESTATUS[@]}")

if ((pipe_status[0] != 0 || pipe_status[1] != 0)); then
  # Do not leave a truncated dump that looks like a valid backup
  rm -f -- "$DUMP_FILE"
  log "❌ 备份失败!"
  exit 1
fi
log "✅ 全量备份成功"

# 2. Binary logs (only when binary logging is enabled)
binlog_name="mysql-bin"
if mysql "${MYSQL_OPTS[@]}" -e "SHOW BINARY LOGS" &> /dev/null; then
  log "开始备份二进制日志..."
  binlog_base=$(mysql "${MYSQL_OPTS[@]}" -sN \
    -e "SHOW VARIABLES LIKE 'log_bin_basename'" | awk '{print $2}')
  if [[ -n "$binlog_base" ]]; then
    binlog_name=${binlog_base##*/}
    # Copy first and purge afterwards, so that no log is removed from the
    # server before it has been archived.
    if ! cp -- "$binlog_base".* "$BACKUP_DIR/binlog/" 2> /dev/null; then
      log "⚠️ 二进制日志复制失败: ${binlog_base}.*"
    fi
  fi
  mysql "${MYSQL_OPTS[@]}" \
    -e "PURGE BINARY LOGS BEFORE DATE_SUB(NOW(), INTERVAL 7 DAY)"
fi

# 3. Remove old backups. The binlog pattern follows the server's configured
#    base name instead of assuming "mysql-bin".
find "$BACKUP_DIR/full" -name "*.sql.gz" -mtime +"$RETENTION_DAYS" -delete
find "$BACKUP_DIR/binlog" -name "${binlog_name}.*" -mtime +7 -delete
三、系统维护与诊断类
3.1 自动化安全检查脚本
#!/bin/bash
# File: /opt/scripts/security_audit.sh
# Purpose: security baseline check. Each section appends its findings to a
#          dated report file under /var/log; section titles are also echoed
#          to the terminal.

REPORT_FILE="/var/log/security_audit_$(date '+%Y%m%d').txt"

# Start a fresh report with the banner, host name and audit time.
{
  printf '%s\n' "========== 系统安全审计报告 =========="
  printf '%s\n' "主机名: $(hostname)"
  printf '%s\n' "审计时间: $(date)"
  printf '%s\n' "====================================="
} > "$REPORT_FILE"

# Write a section title to the terminal and to the report.
#   $1 - item number
#   $2 - item title
check_item() {
  printf '\n[检查项 %s] %s\n' "$1" "$2" | tee -a "$REPORT_FILE"
}

# Item 1: accounts whose password field in /etc/shadow is empty
check_item "1" "检查空密码账户"
awk -F: '($2 == "") {print $1}' /etc/shadow >> "$REPORT_FILE"

# Item 2: regular files with the SUID bit set (first 20 hits only)
check_item "2" "检查SUID权限文件"
find / -perm -4000 -type f 2> /dev/null | head -20 >> "$REPORT_FILE"

# Items 3 and 4: the ten most recent successful and failed logins
check_item "3" "最近成功登录"
last -n 10 >> "$REPORT_FILE"
check_item "4" "最近失败登录"
lastb -n 10 2> /dev/null >> "$REPORT_FILE"

# Item 5: the sshd settings that are explicitly set in sshd_config
check_item "5" "SSH配置检查"
grep -E "^PermitRootLogin|^PasswordAuthentication|^Protocol" \
  /etc/ssh/sshd_config 2> /dev/null >> "$REPORT_FILE"

# Item 6: listening sockets, using ss when available and netstat otherwise
check_item "6" "监听端口检查"
if command -v ss > /dev/null 2>&1; then
  socket_tool="ss"
else
  socket_tool="netstat"
fi
"$socket_tool" -tulnp | grep LISTEN >> "$REPORT_FILE"

# Item 7: legacy remote-access services that are enabled at boot
check_item "7" "危险服务检查"
for service in telnet vsftpd rsh rexec rlogin; do
  if systemctl is-enabled "$service" 2> /dev/null | grep -q "enabled"; then
    echo "$service 服务已启用" >> "$REPORT_FILE"
  fi
done

printf '\n%s\n' "========== 审计完成 ==========" >> "$REPORT_FILE"
echo "报告已保存至: $REPORT_FILE"
3.2 日志分析脚本(查找异常)
#!/bin/bash
# File: /opt/scripts/log_analyzer.sh
# Purpose: print a quick anomaly overview of one log file: error lines,
#          HTTP status codes, top client addresses, suspicious requests,
#          request volume over time and slow-request average.
# Usage: ./log_analyzer.sh <log file path>
# Exit status: 1 when the argument is missing or is not a regular file.

LOG_FILE=$1

if [ -z "$LOG_FILE" ]; then
  echo "用法: $0 <日志文件路径>"
  exit 1
fi

# A relative path starting with "-" would be parsed as an option by the
# tools below; anchoring it to the current directory avoids that.
case "$LOG_FILE" in
  -*) LOG_FILE="./$LOG_FILE" ;;
esac

if [ ! -f "$LOG_FILE" ]; then
  echo "文件不存在: $LOG_FILE"
  exit 1
fi

echo "分析日志文件: $LOG_FILE"
echo "文件大小: $(du -h "$LOG_FILE" | awk '{print $1}')"
echo "最后修改: $(stat -c %y "$LOG_FILE")"
echo "----------------------------------------"

# 1. Error-level lines, grouped by their first three fields
echo -e "\n1. 错误级别日志统计:"
grep -i -E "(error|fatal|failed|exception|segmentation fault)" "$LOG_FILE" |
  awk '{print $1, $2, $3}' |
  sort | uniq -c | sort -rn | head -20

# 2. HTTP status codes; only attempted when the file name suggests a web
#    log. Field 9 is printed as the status code.
if echo "$LOG_FILE" | grep -q -E "(access|nginx|apache)"; then
  echo -e "\n2. HTTP状态码分布:"
  awk '{print $9}' "$LOG_FILE" | sort | uniq -c | sort -rn
fi

# 3. Most frequent values of the first field (the client address in web logs)
echo -e "\n3. 高频访问IP TOP 10:"
awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -10

# 4. Requests that look like path traversal, SQL injection or code execution
echo -e "\n4. 可疑请求模式:"
grep -E "(\.\./|select.*from|union.*select|eval\(|base64_decode|shell_exec)" \
  "$LOG_FILE" | head -10

# 5. Request volume over time. Splitting on spaces and colons and printing
#    fields 2 and 3 yields "day:hour" for syslog-style lines that begin
#    with "Mon DD HH:MM:SS".
echo -e "\n5. 请求量时间分布:"
awk -F'[ :]' '{print $2":"$3}' "$LOG_FILE" | sort | uniq -c | tail -24

# 6. Average of the response times above one second, read from "rt=<seconds>"
echo -e "\n6. 慢请求统计:"
grep -o 'rt=[0-9]*\.[0-9]*' "$LOG_FILE" 2> /dev/null |
  awk -F= '$2 > 1 { count++; sum += $2 }
    END { if (count > 0) print "平均响应时间:" sum/count "秒" }'
四、自动化部署与维护
4.1 批量服务器操作脚本
#!/bin/bash
# File: /opt/scripts/batch_operation.sh
# Purpose: run one command over SSH on every host in SERVER_LIST, prefixing
#          each output line with the host name and logging everything.
# Prerequisite: key-based (password-less) SSH login to every host.
# Usage: ./batch_operation.sh '<command>'

SERVER_LIST=("server1" "server2" "server3" "192.168.1.100")
COMMAND="$1"
LOG_FILE="/var/log/batch_operation_$(date '+%Y%m%d').log"

# Print a line to the terminal and append it to the day's log file.
say() {
  echo "$1" | tee -a "$LOG_FILE"
}

if [[ -z "$COMMAND" ]]; then
  echo "用法: $0 '<要执行的命令>'"
  echo "示例: $0 'df -h'"
  echo "示例: $0 'systemctl restart nginx'"
  exit 1
fi

say "批量执行命令: $COMMAND"
say "开始时间: $(date)"
say "======================================"

for server in "${SERVER_LIST[@]}"; do
  printf '\n处理服务器: %s\n' "$server" | tee -a "$LOG_FILE"

  # Skip hosts that do not answer a single ping within two seconds
  if ! ping -c 1 -W 2 "$server" &> /dev/null; then
    say "❌ 服务器不可达"
    continue
  fi

  # Run the command remotely and tag every output line with the host name
  ssh -o ConnectTimeout=5 -o BatchMode=yes "$server" "$COMMAND" 2>&1 |
    while IFS= read -r line; do
      say "[$server] $line"
    done
  # First pipeline stage is ssh; read its status before anything resets it
  ssh_status=${PIPESTATUS[0]}

  if [[ "$ssh_status" -eq 0 ]]; then
    say "✅ 执行成功"
  else
    say "❌ 执行失败"
  fi
done

printf '\n%s\n' "======================================" | tee -a "$LOG_FILE"
say "完成时间: $(date)"
say "详细日志: $LOG_FILE"
4.2 自动化证书监控和续期检查
#!/bin/bash
# File: /opt/scripts/cert_check.sh
# Purpose: report how many days remain before the TLS certificate served by
#          each configured endpoint expires.
# Requires: openssl and GNU date (`date -d`).

# Endpoints to check, as "host:port"; the port defaults to 443 when omitted.
DOMAINS=(
  "example.com:443"
  "api.example.com:443"
  "blog.example.com:443"
)
DAYS_WARNING=30    # warn when fewer than this many days remain
CONNECT_TIMEOUT=10 # seconds allowed for fetching one certificate

# Print the notAfter date of the certificate served by host $1 on port $2.
# Returns non-zero when no certificate could be retrieved.
fetch_not_after() {
  local host=$1
  local port=$2
  local end_line
  local -a client=(openssl s_client -servername "$host" -connect "${host}:${port}")
  # Bound the connection time when `timeout` is available, so that one
  # unresponsive endpoint cannot stall the whole run.
  if command -v timeout > /dev/null 2>&1; then
    client=(timeout "$CONNECT_TIMEOUT" "${client[@]}")
  fi
  end_line=$("${client[@]}" < /dev/null 2> /dev/null |
    openssl x509 -noout -enddate 2> /dev/null)
  [[ "$end_line" == notAfter=* ]] || return 1
  printf '%s\n' "${end_line#notAfter=}"
}

echo "SSL证书过期检查 - $(date)"
echo "================================"

for domain_info in "${DOMAINS[@]}"; do
  domain=${domain_info%%:*}
  if [[ "$domain_info" == *:* ]]; then
    port=${domain_info##*:}
  else
    port=443
  fi

  if ! not_after=$(fetch_not_after "$domain" "$port"); then
    echo "❌ $domain - 无法获取证书信息"
    continue
  fi

  # An unparseable date must not be mistaken for "expires today"
  if ! expire_timestamp=$(date -d "$not_after" '+%s' 2> /dev/null); then
    echo "❌ $domain - 无法解析证书过期时间: $not_after"
    continue
  fi
  expire_date=$(date -d "@$expire_timestamp" '+%Y-%m-%d')
  current_timestamp=$(date '+%s')

  # Expiry is decided on the timestamps themselves: integer division
  # truncates toward zero, so a certificate that expired less than a day
  # ago would otherwise be counted as having 0 days left.
  if ((expire_timestamp <= current_timestamp)); then
    days_expired=$(((current_timestamp - expire_timestamp) / 86400))
    echo "❌ $domain - 证书已过期 $days_expired 天! ($expire_date)"
    continue
  fi

  days_left=$(((expire_timestamp - current_timestamp) / 86400))
  if ((days_left < DAYS_WARNING)); then
    echo "⚠️ $domain - 证书将在 $days_left 天后过期 ($expire_date)"
  else
    echo "✅ $domain - 证书有效,剩余 $days_left 天 ($expire_date)"
  fi
done
五、性能分析与优化
5.1 系统性能快照
#!/bin/bash
# File: /opt/scripts/performance_snapshot.sh
# Purpose: collect a one-time performance snapshot (system, CPU, memory,
#          disk, network, process count) into a timestamped text file.
# Note: the disk I/O section runs `iostat -dx 1 2`, which samples for one
#       second.

SNAPSHOT_DIR="/var/log/performance_snapshots"
if ! mkdir -p "$SNAPSHOT_DIR"; then
  echo "无法创建目录: $SNAPSHOT_DIR" >&2
  exit 1
fi
SNAPSHOT_FILE="$SNAPSHOT_DIR/snapshot_$(date '+%Y%m%d_%H%M%S').txt"

# Everything inside the group is written to the snapshot through a single
# redirection, so no command's output can miss the file.
{
  echo "性能快照 - $(date)"
  echo "================================"

  # 1. Basic system information
  echo -e "\n1. 系统基本信息:"
  echo "主机名: $(hostname)"
  echo "内核版本: $(uname -r)"
  echo "运行时间: $(uptime -p)"

  # 2. CPU. The pattern is anchored so that only the per-CPU "processor"
  #    entries are counted, not other lines that contain the word.
  echo -e "\n2. CPU信息:"
  echo "CPU型号: $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)"
  echo "CPU核心数: $(grep -c '^processor' /proc/cpuinfo)"
  echo "当前负载: $(uptime | awk -F'load average:' '{print $2}')"
  echo "CPU使用率:"
  top -bn1 | grep "Cpu(s)"

  # 3. Memory, plus the header line and the 10 largest consumers
  echo -e "\n3. 内存信息:"
  free -h
  echo -e "\n内存占用前10进程:"
  ps aux --sort=-%mem | head -11

  # 4. Disk usage and I/O statistics
  echo -e "\n4. 磁盘信息:"
  df -h
  echo -e "\n磁盘IO统计:"
  iostat -dx 1 2 2> /dev/null || echo "iostat未安装"

  # 5. Network summary: ss when available, netstat otherwise
  echo -e "\n5. 网络连接:"
  if command -v ss &> /dev/null; then
    ss -s
  else
    netstat -s | head -20
  fi

  # 6. Number of processes
  echo -e "\n6. 进程数统计:"
  ps aux --no-headers | wc -l
} > "$SNAPSHOT_FILE"

echo "快照已保存至: $SNAPSHOT_FILE"
六、定时任务配置示例
# Add the following entries with `crontab -e`.
# Each entry must be on its own line, and comments must be on separate lines.

# Check system status every 5 minutes
*/5 * * * * /opt/scripts/system_monitor.sh > /dev/null 2>&1

# Run the file backup every day at 02:00
0 2 * * * /opt/scripts/smart_backup.sh

# Run the MySQL full backup every Sunday at 03:00
0 3 * * 0 /opt/scripts/mysql_backup.sh

# Delete *.log files under /var/log older than 7 days, every day at 04:00.
# NOTE(review): this also removes /var/log/backup_*.log after 7 days, while
# smart_backup.sh keeps those files for 30 days - confirm which is intended.
0 4 * * * find /var/log -name "*.log" -mtime +7 -delete

# Check certificates every hour and mail the report
0 * * * * /opt/scripts/cert_check.sh | mail -s "证书检查报告" admin@example.com

# Run the security audit at 00:00 on the 1st of every month
0 0 1 * * /opt/scripts/security_audit.sh
使用建议
1. 脚本部署步骤
# 1. Create the script directory
sudo mkdir -p /opt/scripts

# 2. Copy the scripts and make them executable
#    ("./" keeps a file name that starts with "-" from being read as an option)
sudo cp ./*.sh /opt/scripts/
sudo chmod +x /opt/scripts/*.sh

# 3. Create the log directory
sudo mkdir -p /var/log/scripts
2. 调试脚本
# Syntax check only; the script is not executed
bash -n script.sh

# Trace every command as it runs (debug mode)
bash -x script.sh

# Run the script and keep a copy of stdout and stderr in a log file
./script.sh 2>&1 | tee /var/log/script_exec.log
3. 安全注意事项
- 脚本中避免硬编码密码,使用配置文件或环境变量
- 关键脚本限制为仅属主可读、可执行:chmod 500 sensitive_script.sh(chmod 400 会去掉执行权限,脚本将无法直接运行,只能通过 bash sensitive_script.sh 调用)
- 定期审计脚本内容
- 重要操作前添加确认提示
这些脚本经过生产环境验证,可以根据实际需求进行调整。建议先在小范围测试环境验证后再部署到生产环境。
总结
以上为个人经验,希望能给大家一个参考,也希望大家多多支持脚本之家。


最新评论