Linux系统监控指标与工具深度解析
全面解析Linux系统监控的核心指标体系,从基础性能指标到高级监控工具的实战应用
系统监控是性能优化的基础,只有全面了解系统的运行状态,才能做出准确的性能分析和优化决策。本文将深入解析Linux系统监控的核心指标体系,介绍各类监控工具的使用方法,并构建完整的监控指标框架。
系统监控指标体系
监控指标层次结构
Rendering diagram...
核心监控指标详解
#!/bin/bash
# 综合系统监控脚本
# CPU monitoring.
# Prints overall utilisation, the load averages, the total number of context
# switches since boot and the raw time counters of every cpu line in /proc/stat.
monitor_cpu() {
    echo "=== CPU监控 ==="

    # Utilisation is computed as 100 - idle%. The ", *" in front of the capture
    # group is essential: without it the greedy ".*" consumes the idle figure,
    # the group captures an empty string and the result is always "100%".
    echo "CPU使用率:"
    top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1"%"}'

    # 1/5/15-minute load averages as reported by uptime.
    echo "CPU负载:"
    uptime | awk -F'load average:' '{print $2}'

    # "ctxt" line of /proc/stat: context switches since boot.
    echo "上下文切换:"
    awk '/ctxt/ {print "总次数: " $2}' /proc/stat

    # Counters of the aggregate "cpu" line and each "cpuN" line, label stripped.
    # An explicit "%s" format keeps the data out of the printf format string.
    echo "CPU时间分布:"
    grep '^cpu' /proc/stat | awk '{for (i = 2; i <= NF; i++) printf "%s ", $i; print ""}'
}
# Memory monitoring.
# Prints the free(1) summary, selected /proc/meminfo fields, the paging and
# swapping counters from /proc/vmstat and the first rows of /proc/buddyinfo.
monitor_memory() {
    echo "=== 内存监控 ==="

    # Human-readable summary.
    echo "内存使用:"
    free -h

    # Key totals straight from the kernel.
    echo "内存详细信息:"
    grep -E "MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree" /proc/meminfo

    # Raw page-in/out and swap-in/out counters; a hit rate would have to be
    # derived from these, it is not computed here.
    echo "页面统计:"
    grep -E "pgpgin|pgpgout|pswpin|pswpout" /proc/vmstat

    # Free-block counts per order (fragmentation indicator), first five rows.
    echo "内存碎片:"
    head -5 /proc/buddyinfo
}
# Disk I/O monitoring.
# Prints filesystem usage, one extended iostat sample and the number of
# in-flight I/O requests for the first five block devices.
monitor_disk() {
    echo "=== 磁盘I/O监控 ==="

    # Filesystem usage.
    echo "磁盘使用率:"
    df -h

    # One extended iostat report with blank lines removed (needs sysstat).
    echo "I/O统计:"
    iostat -x 1 1 | grep -v "^$"

    # /proc/diskstats columns are: major, minor, device name, then the
    # statistics fields. The queue depth is the "I/Os currently in progress"
    # counter, i.e. column 12, and the device name is column 3. The previous
    # "$1: $4" printed the major number and the completed-reads counter.
    echo "磁盘队列深度:"
    awk '{print $3": "$12}' /proc/diskstats | head -5
}
# Network monitoring.
# Prints per-interface counters, a histogram of TCP connection states and
# the Ip/Icmp/Tcp protocol counters from /proc/net/snmp.
monitor_network() {
    echo "=== 网络监控 ==="

    # Per-interface counters without the two header lines. -E is required:
    # in a basic regular expression "|" is a literal character, so the headers
    # were never filtered.
    echo "网络接口统计:"
    grep -vE "Inter-|face" /proc/net/dev

    # Count connections per TCP state, most frequent first.
    echo "TCP连接状态:"
    netstat -an | awk '/tcp/ {print $6}' | sort | uniq -c | sort -nr

    # Every line of /proc/net/snmp starts with "<Proto>:", so the former
    # trailing `grep -v ":"` discarded all of them and nothing was printed.
    echo "网络错误统计:"
    grep -E "Tcp|Ip|Icmp" /proc/net/snmp
}
# Process monitoring.
# Prints the ten heaviest processes by CPU and by memory, followed by the
# total number of processes and threads.
monitor_processes() {
    echo "=== 进程监控 ==="

    # head -11 = one header line plus ten processes.
    echo "CPU占用Top 10进程:"
    ps -eo pid,ppid,cmd,%mem,%cpu --sort=-%cpu | head -11

    echo "内存占用Top 10进程:"
    ps -eo pid,ppid,cmd,%mem,%cpu --sort=-%mem | head -11

    # --no-headers keeps the column header out of the count; with it included
    # both totals were one too high.
    echo "进程总数:"
    ps aux --no-headers | wc -l

    # One line per thread (-L).
    echo "线程总数:"
    ps -eLf --no-headers | wc -l
}
# System health check.
# Prints a warning line for each of: 1-minute load above the CPU count,
# memory usage above 90%, any filesystem above 90% full, more than 10000
# established connections. Prints nothing but the banner when all is well.
system_health_check() {
    echo "=== 系统健康检查 ==="

    # 1-minute load average versus number of CPUs. /proc/loadavg needs no
    # locale-dependent parsing, and awk performs the floating-point comparison
    # so the check does not depend on bc being installed.
    load=$(awk '{print $1}' /proc/loadavg)
    cores=$(nproc)
    if awk -v l="$load" -v c="$cores" 'BEGIN {exit !(l > c)}'; then
        echo "警告: CPU负载过高 ($load > $cores)"
    fi

    # Used/total from the "Mem:" row of free, rounded to a whole percent.
    mem_used=$(free | awk 'NR==2 {printf "%.0f",$3/$2*100}')
    if (( ${mem_used:-0} > 90 )); then
        echo "警告: 内存使用率过高 (${mem_used}%)"
    fi

    # df -P guarantees one line per filesystem (no wrapping of long device
    # names). Column 5 is Use%, column 6 the mount point; the previous code
    # reported column 1 (the device) as the mount point.
    df -P | grep -vE '^Filesystem|tmpfs|cdrom' | awk '{print $5 " " $6}' | while read -r use_percent mount_point; do
        use_percent=${use_percent%\%}
        # Skip rows whose usage is not a plain number (e.g. "-").
        [[ $use_percent =~ ^[0-9]+$ ]] || continue
        if (( use_percent > 90 )); then
            echo "警告: 磁盘空间不足 $mount_point (${use_percent}%)"
        fi
    done

    # Number of sockets in ESTABLISHED state.
    connections=$(netstat -an | grep -c ESTABLISHED)
    if (( ${connections:-0} > 10000 )); then
        echo "警告: 网络连接数过高 ($connections)"
    fi
}
# Entry point: print the report header, then run every monitoring section in
# order, each followed by a blank line.
main() {
    echo "Linux系统综合监控报告"
    echo "生成时间: $(date)"
    echo "主机名: $(hostname)"
    echo "内核版本: $(uname -r)"
    echo ""

    local section
    for section in monitor_cpu monitor_memory monitor_disk \
                   monitor_network monitor_processes system_health_check; do
        "$section"
        echo ""
    done

    echo "监控报告生成完成"
}

# Run the report.
main
高级监控工具应用
Performance Co-Pilot (PCP)
#!/bin/bash
# PCP性能监控配置
# Install Performance Co-Pilot with the available package manager and make
# sure the collector daemon (pmcd) is enabled and running.
# Returns 1 when no supported package manager is found or installation fails.
install_pcp() {
    echo "安装Performance Co-Pilot..."

    if command -v apt-get &> /dev/null; then
        # Ubuntu/Debian
        sudo apt-get update
        sudo apt-get install -y pcp pcp-webapi grafana || return 1
    elif command -v yum &> /dev/null; then
        # CentOS/RHEL
        sudo yum install -y pcp pcp-webapi pcp-collector || return 1
    else
        echo "错误: 未找到受支持的包管理器 (apt-get/yum)" >&2
        return 1
    fi

    # Previously only the yum branch did this; run it for both so the result
    # is the same regardless of distribution.
    sudo systemctl enable pmcd
    sudo systemctl start pmcd
}
# Configure PCP: store two settings through pmstore, then enable and start
# the pmlogger service.
# NOTE(review): the metric names passed to pmstore are reproduced as written;
# confirm they exist on the target PCP version.
configure_pcp() {
    echo "配置PCP监控..."

    # Turn on metric collection and set the sampling value.
    sudo pmstore pmcd.services.detect 1
    sudo pmstore pmlogger.sample.hertz 60

    # Enable at boot, then start now.
    local action
    for action in enable start; do
        sudo systemctl "$action" pmlogger
    done

    echo "PCP配置完成"
}
# Print a cheat sheet of basic PCP commands (CPU, memory, network, disk I/O).
# Nothing is executed; the commands are only displayed.
pcp_basic_usage() {
    printf '%s\n' \
        "PCP基础命令使用示例:" \
        "1. 实时CPU监控:" \
        " pcp -h localhost -t 1s metrics kernel.all.cpu.usage" \
        "2. 内存使用监控:" \
        " pcp -h localhost -t 1s metrics mem.physmem" \
        "3. 网络流量监控:" \
        " pcp -h localhost -t 1s metrics network.interface.in.bytes" \
        "4. 磁盘I/O监控:" \
        " pcp -h localhost -t 1s metrics disk.dev.read_bytes"
}
# Print a cheat sheet of advanced PCP tooling: pmrep reports, pmchart,
# pmie rules and the web endpoint. Nothing is executed.
pcp_advanced_analysis() {
    local -a hints=(
        "PCP高级分析功能:"
        # Report generation with pmrep
        "生成性能报告:"
        " pmrep -t 1s -s 60 -f csv kernel.all.cpu network.interface.in"
        # Graphical analysis with pmchart
        "图形化分析:"
        " pmchart -c cpu_network"
        # Event monitoring with pmie
        "事件监控:"
        " pmie -c pmie.conf"
        # Web interface
        "Web界面:"
        " 访问 http://localhost:44323"
    )
    printf '%s\n' "${hints[@]}"
}
# Create the files for a custom metrics agent under
# /var/lib/pcp/pmdas/custom: a metric list (mymetrics.conf) and an executable
# collection script (mymetrics.sh).
# Returns 1 if the directory cannot be created or a file cannot be written.
# NOTE(review): the file formats are reproduced as written; confirm them
# against the PCP PMDA documentation before relying on them.
create_custom_pcp_metrics() {
    echo "创建自定义PCP指标..."

    local pmda_dir=/var/lib/pcp/pmdas/custom

    # The directory does not exist on a fresh installation; without this the
    # redirections below fail and the function still reported success.
    if ! mkdir -p "$pmda_dir"; then
        echo "错误: 无法创建目录 $pmda_dir (可能需要root权限)" >&2
        return 1
    fi

    # Metric list.
    cat > "$pmda_dir/mymetrics.conf" << 'EOF' || return 1
# 自定义指标配置文件
myapp.response.time 0.0 0.0 0.0 0.0
myapp.requests.total 0 0 0 0
myapp.errors.count 0 0 0 0
EOF

    # Collection script; the quoted delimiter keeps $1, $0 and $(...) literal.
    cat > "$pmda_dir/mymetrics.sh" << 'EOF' || return 1
#!/bin/bash
# 自定义指标收集脚本
case "$1" in
fetch)
echo "myapp.response.time $(curl -s -o /dev/null -w '%{time_total}' http://localhost:8080/health)"
echo "myapp.requests.total $(cat /var/log/app/metrics | grep 'requests_total' | awk '{print $2}')"
echo "myapp.errors.count $(cat /var/log/app/metrics | grep 'errors_count' | awk '{print $2}')"
;;
*)
echo "Usage: $0 {fetch}"
exit 1
;;
esac
EOF

    chmod +x "$pmda_dir/mymetrics.sh" || return 1
    echo "自定义指标配置完成"
}
# Dispatch on the first argument; anything unrecognised (or no argument at
# all) prints the usage text.
main() {
    local subcommand="${1:-help}"

    case "$subcommand" in
        install)   install_pcp ;;
        configure) configure_pcp ;;
        basic)     pcp_basic_usage ;;
        advanced)  pcp_advanced_analysis ;;
        custom)    create_custom_pcp_metrics ;;
        *)
            echo "PCP监控工具管理"
            echo "用法: $0 {install|configure|basic|advanced|custom|help}"
            ;;
    esac
}

main "$@"
Prometheus监控体系
# Prometheus configuration example
# prometheus.yml
# Nesting matters in YAML: every key below is indented under its parent so the
# file parses as global / alerting / rule_files / scrape_configs sections.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    environment: 'us-west'

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

rule_files:
  - 'alerts/*.yml'

scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ["localhost:9090"]
        labels:
          service: 'monitoring'

  # Linux node monitoring
  - job_name: 'node'
    static_configs:
      - targets: ["node-exporter:9100"]
        labels:
          service: 'linux-server'

  # Application monitoring
  - job_name: 'webapp'
    static_configs:
      - targets: ["webapp:8080"]
        labels:
          service: 'web-application'
          team: 'backend'

  # Database monitoring
  - job_name: 'mysql'
    static_configs:
      - targets: ["mysql-exporter:9104"]
        labels:
          service: 'database'
          type: 'mysql'

  # Nginx monitoring
  - job_name: 'nginx'
    static_configs:
      - targets: ["nginx-exporter:9113"]
        labels:
          service: 'web-server'
// 自定义Prometheus Exporter示例
package main
import (
"fmt"
"net/http"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Metric definitions. Each collector is declared on its own so it can be read
// (and moved) independently; all of them are registered in init.

// appVersion exposes the application version as labels on a gauge.
var appVersion = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "app_version_info",
		Help: "Application version information",
	},
	[]string{"version", "build_date"},
)

// requestTotal counts requests by method, endpoint and status.
var requestTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "app_requests_total",
		Help: "Total number of requests",
	},
	[]string{"method", "endpoint", "status"},
)

// requestDuration records request latency using the default buckets.
var requestDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "app_request_duration_seconds",
		Help:    "Request duration in seconds",
		Buckets: prometheus.DefBuckets,
	},
	[]string{"method", "endpoint"},
)

// activeConnections is the current number of active connections.
var activeConnections = prometheus.NewGauge(
	prometheus.GaugeOpts{
		Name: "app_active_connections",
		Help: "Current number of active connections",
	},
)

// memoryUsage reports memory usage per "type" label.
var memoryUsage = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "app_memory_usage_bytes",
		Help: "Memory usage in bytes",
	},
	[]string{"type"},
)
// init registers every collector with the default registry and publishes the
// build information as a constant gauge sample.
func init() {
	// MustRegister is variadic and registers its arguments in order,
	// panicking on the first failure.
	prometheus.MustRegister(
		appVersion,
		requestTotal,
		requestDuration,
		activeConnections,
		memoryUsage,
	)

	// The value is always 1; the information lives in the labels.
	versionGauge := appVersion.WithLabelValues("1.0.0", "2026-05-19")
	versionGauge.Set(1)
}
// simulateRequest fakes the handling of one HTTP request: it sleeps for
// 10-99 ms, then bumps the request counter and records the elapsed time in
// the latency histogram.
func simulateRequest() {
	const (
		method   = "GET"
		endpoint = "/api/data"
		status   = "200"
	)

	began := time.Now()

	// Pseudo-random processing time derived from the clock.
	jitterMs := 10 + time.Now().UnixNano()%90
	time.Sleep(time.Duration(jitterMs) * time.Millisecond)

	requestTotal.WithLabelValues(method, endpoint, status).Inc()

	elapsed := time.Since(began).Seconds()
	requestDuration.WithLabelValues(method, endpoint).Observe(elapsed)
}
// main starts two background goroutines that generate synthetic traffic and
// memory figures, then serves /metrics and /health on port 8080.
func main() {
	// Synthetic request load: one simulated request, then a 100 ms pause.
	generateTraffic := func() {
		for {
			simulateRequest()
			time.Sleep(100 * time.Millisecond)
		}
	}

	// Synthetic memory figures, refreshed once per second.
	fluctuateMemory := func() {
		for {
			heap := float64(100 + time.Now().UnixNano()%400)
			memoryUsage.WithLabelValues("heap").Set(heap)
			stack := float64(10 + time.Now().UnixNano()%20)
			memoryUsage.WithLabelValues("stack").Set(stack)
			time.Sleep(1 * time.Second)
		}
	}

	go generateTraffic()
	go fluctuateMemory()

	// Liveness probe: always answers 200 "OK".
	healthHandler := func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		w.Write([]byte("OK"))
	}

	http.Handle("/metrics", promhttp.Handler())
	http.HandleFunc("/health", healthHandler)

	fmt.Println("Starting application on :8080")
	fmt.Println("Metrics available at http://localhost:8080/metrics")

	// ListenAndServe only returns when the server fails.
	err := http.ListenAndServe(":8080", nil)
	if err != nil {
		fmt.Printf("Error starting server: %v\n", err)
	}
}
实时监控系统实现
自定义监控框架
#include <signal.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <pthread.h>
#include <sys/statvfs.h>
#include <sys/sysinfo.h>
#include <unistd.h>
// One sampled metric.
typedef struct {
    char name[64];      // metric identifier, NUL-terminated
    double value;       // sampled value
    char unit[16];      // unit label, NUL-terminated
    time_t timestamp;   // time the sample was taken
    bool is_alert;      // whether the value is compared against threshold
    double threshold;   // alert fires when value > threshold
} metric;

// Monitoring configuration.
typedef struct {
    int interval;             // seconds between collection cycles
    int retention_hours;      // how long data should be kept, in hours
    bool enable_alerts;       // run the alert check after each collection
    char alert_webhook[256];  // alert webhook; empty string when unset
} monitoring_config;

// Monitoring system context shared between the controlling thread and the
// monitoring thread.
typedef struct {
    monitoring_config config;
    metric *metrics;           // heap array holding the latest samples
    int metric_count;          // number of valid entries in metrics
    // Written by the controlling thread and polled by the monitoring thread;
    // atomic so the cross-thread access is not a data race. Plain assignment
    // and reads (`sys->running = false`, `while (sys->running)`) keep working.
    atomic_bool running;
    pthread_t monitor_thread;  // handle of the monitoring thread
} monitoring_system;
// CPU utilisation in percent over the interval since the previous call.
//
// Reads the aggregate "cpu" line of /proc/stat and compares it with the
// counters remembered from the last call (the first call therefore reports
// the average since boot). idle and iowait both count as idle time.
//
// Returns a value in [0, 100], or -1.0 when /proc/stat cannot be opened,
// read or parsed. Keeps static state, so it is not safe to call from several
// threads at once.
double get_cpu_usage(void) {
    static unsigned long long prev_idle = 0, prev_total = 0;

    FILE *fp = fopen("/proc/stat", "r");
    if (!fp) {
        return -1.0;
    }

    char line[256];
    char *got = fgets(line, sizeof line, fp);
    fclose(fp);
    if (!got) {
        return -1.0;
    }

    // Trailing fields default to 0 when the kernel does not report them; the
    // first four (user, nice, system, idle) are required.
    unsigned long long user = 0, nice = 0, sys_time = 0, idle_time = 0;
    unsigned long long iowait = 0, irq = 0, softirq = 0, steal = 0;
    int parsed = sscanf(line, "cpu %llu %llu %llu %llu %llu %llu %llu %llu",
                        &user, &nice, &sys_time, &idle_time,
                        &iowait, &irq, &softirq, &steal);
    if (parsed < 4) {
        return -1.0;
    }

    unsigned long long idle = idle_time + iowait;
    unsigned long long total = user + nice + sys_time + idle + irq + softirq + steal;

    // Individual counters can step backwards; never let the unsigned
    // subtraction wrap around.
    unsigned long long delta_idle = idle >= prev_idle ? idle - prev_idle : 0;
    unsigned long long delta_total = total >= prev_total ? total - prev_total : 0;
    prev_idle = idle;
    prev_total = total;

    if (delta_total == 0) {
        return 0.0;
    }
    if (delta_idle > delta_total) {
        delta_idle = delta_total;
    }
    return (1.0 - (double)delta_idle / (double)delta_total) * 100.0;
}
// Memory utilisation in percent, derived from sysinfo(2): everything that is
// neither free RAM nor buffer RAM is counted as used.
// Returns -1.0 when sysinfo fails.
double get_memory_usage() {
    struct sysinfo si;

    if (sysinfo(&si) != 0) {
        return -1.0;
    }

    unsigned long total = si.totalram;
    unsigned long in_use = total - (si.freeram + si.bufferram);

    return (double)in_use / total * 100.0;
}
// Disk utilisation in percent for the filesystem containing `path`.
//
// "Used" is the total size minus the space available to unprivileged users
// (f_bavail), so blocks reserved for root count as used.
//
// Returns a value in [0, 100]; 0.0 for filesystems that report no blocks
// (pseudo filesystems such as /proc, which previously produced NaN through
// 0/0); -1.0 when `path` is NULL or statvfs fails.
double get_disk_usage(const char *path) {
    struct statvfs vfs;

    if (path == NULL || statvfs(path, &vfs) != 0) {
        return -1.0;
    }

    // 64-bit arithmetic: blocks * fragment size overflows unsigned long on
    // 32-bit targets for large volumes.
    unsigned long long total = (unsigned long long)vfs.f_blocks * vfs.f_frsize;
    unsigned long long available = (unsigned long long)vfs.f_bavail * vfs.f_frsize;

    if (total == 0) {
        return 0.0;
    }
    if (available > total) {
        available = total;
    }

    return (double)(total - available) / (double)total * 100.0;
}
// Number of sockets in ESTABLISHED state, obtained by running
// `netstat -an | grep ESTABLISHED | wc -l` through the shell.
// Returns -1 when the pipeline cannot be started or its output is not a number.
int get_network_connections(void) {
    FILE *fp = popen("netstat -an | grep ESTABLISHED | wc -l", "r");
    if (!fp) {
        return -1;
    }

    // Initialised and checked: if fscanf matches nothing the variable would
    // otherwise be returned uninitialised.
    int connections = -1;
    if (fscanf(fp, "%d", &connections) != 1) {
        connections = -1;
    }
    pclose(fp);

    return connections;
}
// Append one sample to sys->metrics, stamping it with the current time.
//
// The array is resized to hold exactly metric_count + 1 entries. If the
// allocation fails the sample is dropped, an error is printed to stderr and
// the existing array is left untouched. `name` and `unit` are truncated to
// fit their fields and are always NUL-terminated; NULL is stored as "".
void add_metric(monitoring_system *sys, const char *name, double value,
                const char *unit, bool is_alert, double threshold) {
    if (!sys || sys->metric_count < 0) {
        return;
    }

    // Assign through a temporary: writing realloc's result straight into
    // sys->metrics would lose (and leak) the old array on failure.
    size_t needed = (size_t)sys->metric_count + 1;
    metric *grown = realloc(sys->metrics, needed * sizeof *grown);
    if (!grown) {
        fprintf(stderr, "add_metric: out of memory, dropping sample\n");
        return;
    }
    sys->metrics = grown;

    metric *m = &grown[sys->metric_count];
    // snprintf always terminates; strncpy into freshly realloc'ed
    // (uninitialised) memory left the last byte undefined for long inputs.
    snprintf(m->name, sizeof m->name, "%s", name ? name : "");
    snprintf(m->unit, sizeof m->unit, "%s", unit ? unit : "");
    m->value = value;
    m->timestamp = time(NULL);
    m->is_alert = is_alert;
    m->threshold = threshold;

    sys->metric_count++;
}
// Sample every system metric once and append the results to `sys`:
// CPU, memory and root-filesystem usage (alert above 90%), established
// connections (alert above 10000) and the 1-minute load average (alert above
// the number of online CPUs).
void collect_metrics(monitoring_system *sys) {
    // CPU
    add_metric(sys, "cpu.usage", get_cpu_usage(), "%", true, 90.0);

    // Memory
    add_metric(sys, "memory.usage", get_memory_usage(), "%", true, 90.0);

    // Disk (root filesystem)
    add_metric(sys, "disk.usage", get_disk_usage("/"), "%", true, 90.0);

    // Network
    int connections = get_network_connections();
    add_metric(sys, "network.connections", (double)connections, "count", true, 10000.0);

    // Load: sysinfo reports fixed-point values scaled by 2^SI_LOAD_SHIFT.
    // The division must be done in floating point; the former integer
    // division truncated e.g. 0.87 to 0 and 1.95 to 1.
    struct sysinfo info;
    if (sysinfo(&info) == 0) {
        double load_1min = (double)info.loads[0] / (double)(1UL << SI_LOAD_SHIFT);
        double cpu_count = (double)sysconf(_SC_NPROCESSORS_ONLN);
        add_metric(sys, "system.load.1min", load_1min, "load", true, cpu_count);
    }
}
// Append all currently held metrics to `filename`, one CSV row each:
//   timestamp,name,value,unit,is_alert,threshold
// The file is opened in append mode; nothing happens if it cannot be opened.
void save_metrics_to_file(monitoring_system *sys,const char *filename) {
    FILE *out = fopen(filename, "a");
    if (out == NULL) {
        return;
    }

    int idx = 0;
    while (idx < sys->metric_count) {
        const metric *row = &sys->metrics[idx];

        // Local-time timestamp of the sample.
        char when[64];
        struct tm *local = localtime(&row->timestamp);
        strftime(when, sizeof(when), "%Y-%m-%d %H:%M:%S", local);

        fprintf(out, "%s,%s,%.2f,%s,%d,%.2f\n",
                when, row->name, row->value, row->unit, row->is_alert, row->threshold);
        idx++;
    }

    fclose(out);
}
// Print an [ALERT] line to stdout for every alert-enabled metric whose value
// exceeds its threshold.
void check_alerts(monitoring_system *sys) {
    int idx;

    for (idx = 0; idx < sys->metric_count; idx++) {
        metric *cur = &sys->metrics[idx];

        // Skip metrics that do not alert or are within their limit.
        if (!cur->is_alert || !(cur->value > cur->threshold)) {
            continue;
        }

        char when[64];
        struct tm *local = localtime(&cur->timestamp);
        strftime(when, sizeof(when), "%Y-%m-%d %H:%M:%S", local);

        printf("[ALERT] %s %s = %.2f%s (阈值: %.2f%s)\n",
               when, cur->name, cur->value, cur->unit, cur->threshold, cur->unit);
        // Hook for sending an alert notification.
    }
}
// Thread body: while the system is marked running, discard the previous
// samples, collect a new set, optionally check alerts, append the samples to
// metrics.csv and sleep for the configured interval. `arg` is the
// monitoring_system to operate on. Always returns NULL.
void* monitoring_thread(void *arg) {
    monitoring_system *ctx = arg;

    // The sleep sits in the loop's step expression, so it runs after each
    // cycle and before the running flag is tested again.
    for (; ctx->running; sleep(ctx->config.interval)) {
        // Reuse the array; only the count is reset.
        ctx->metric_count = 0;

        collect_metrics(ctx);

        if (ctx->config.enable_alerts) {
            check_alerts(ctx);
        }

        save_metrics_to_file(ctx, "metrics.csv");
    }

    return NULL;
}
// Allocate a monitoring system, copy `config` into it and start the
// monitoring thread.
//
// Returns the new system, which the caller releases with
// stop_monitoring_system(), or NULL when `config` is NULL, memory is
// exhausted or the thread cannot be created.
monitoring_system* init_monitoring_system(monitoring_config *config) {
    if (!config) {
        return NULL;
    }

    monitoring_system *sys = malloc(sizeof *sys);
    if (!sys) {
        return NULL;
    }

    // Fully initialise the context before the thread can observe it.
    sys->config = *config;
    sys->metrics = NULL;
    sys->metric_count = 0;
    sys->running = true;

    // Without this check a failed pthread_create left monitor_thread
    // indeterminate and the later pthread_join operated on garbage.
    if (pthread_create(&sys->monitor_thread, NULL, monitoring_thread, sys) != 0) {
        free(sys);
        return NULL;
    }

    printf("监控系统初始化完成,采集间隔: %d 秒\n", config->interval);
    return sys;
}
// Ask the monitoring thread to stop, wait for it to finish, then release the
// metric array and the system itself. `sys` must not be used afterwards.
void stop_monitoring_system(monitoring_system *sys) {
    // The thread sees the cleared flag at its next loop test.
    sys->running = false;
    pthread_join(sys->monitor_thread, NULL);

    // free(NULL) is a no-op, so no guard is needed for an empty array.
    free(sys->metrics);
    free(sys);

    fputs("监控系统已停止\n", stdout);
}
// Set by the signal handler when SIGINT or SIGTERM arrives.
static volatile sig_atomic_t stop_requested = 0;

// Signal handler: only records that a stop was requested.
static void handle_stop_signal(int signo) {
    (void)signo;
    stop_requested = 1;
}

// Demo driver: run the monitoring system for up to 60 seconds, or until
// Ctrl+C / SIGTERM, then shut it down cleanly.
// Returns 0 on success and 1 when the monitoring system cannot be started.
int main(void) {
    printf("Linux系统监控框架\n");
    printf("=============================\n\n");

    // Monitoring parameters
    monitoring_config config = {
        .interval = 5,          // collect every 5 seconds
        .retention_hours = 24,  // keep 24 hours of data
        .enable_alerts = true,  // enable alerting
        .alert_webhook = ""     // alert webhook (unset)
    };

    // The banner below promises that Ctrl+C stops the program; without a
    // handler the default action kills the process with no orderly shutdown.
    signal(SIGINT, handle_stop_signal);
    signal(SIGTERM, handle_stop_signal);

    monitoring_system *sys = init_monitoring_system(&config);
    if (!sys) {
        fprintf(stderr, "failed to start the monitoring system\n");
        return 1;
    }

    printf("监控系统正在运行...\n");
    printf("按Ctrl+C停止\n\n");

    // Sleep in one-second slices so a stop request is noticed promptly even
    // when the signal is delivered to the monitoring thread.
    for (int elapsed = 0; elapsed < 60 && !stop_requested; elapsed++) {
        sleep(1);
    }

    stop_monitoring_system(sys);
    printf("\n监控指标已保存到 metrics.csv\n");
    return 0;
}
通过建立完善的监控指标体系和选择合适的监控工具,可以实现对系统性能的全面掌握,为后续的性能优化和故障排查提供坚实的数据基础。