"""Raspberry Pi nginx access-log analysis.

Sample console output from a previous run:

    日志解析完成,总访问量:641805 条

    ===== 统计结果 =====
    总访问量:641805
    独立IP数:14
    最活跃IP:192.168.1.29(访问492948次)
    访问高峰时段:16时(59997次)
    图表已保存至:C:/Users/czliu/Downloads/nginx_analysis.png
"""

import re
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime

# ===================== Configuration (path to your access log) =====================
LOG_PATH = r"C:\Users\czliu\Downloads\access.log"
# ==================================================================

# Helper: strip invalid characters from a string.
def clean_string(s):
    """Return *s* with all non-printable characters removed.

    Non-string inputs are passed through unchanged, so the helper is
    safe to apply blindly to mixed-type values.
    """
    if not isinstance(s, str):
        return s
    # Drop control characters and binary garbage, keep printable text.
    return ''.join(filter(str.isprintable, s))

# 1. Nginx access-log regex (common/combined format prefix)
# Matches: ADDR IDENT USER [TIME] "REQUEST" STATUS SIZE ...
# Generalized from the original dotted-IPv4-only pattern:
#  - (\S+) for the client address also matches IPv6 addresses and hostnames
#  - \S+ for the ident/user fields also matches authenticated users, not only "-"
# Capture groups are unchanged: (addr, time, request, status, size).
log_pattern = re.compile(
    r'(\S+)\s+\S+\s+\S+\s+\[(.*?)\]\s+"(.*?)"\s+(\d+)\s+(\d+|-)'
)

# 2. Parse the log file line by line into a list of dict records.
parsed_data = []
with open(LOG_PATH, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Strip non-printable/binary garbage before matching; the captured
        # groups are substrings of this cleaned line, so re-cleaning each
        # field afterwards (as the original did) is redundant.
        clean_line = clean_string(line).strip()
        match = log_pattern.match(clean_line)
        if not match:
            continue
        ip, time_str, request, status, size = match.groups()

        # Parse the timestamp; log format: 03/Jan/2025:12:34:56 +0800.
        # Catch only ValueError — a bare except would also swallow
        # KeyboardInterrupt/SystemExit.
        dt = None
        hour = -1  # -1 is the "timestamp unparseable" sentinel used downstream
        try:
            dt = datetime.strptime(time_str, '%d/%b/%Y:%H:%M:%S %z')
            hour = dt.hour  # bucket by hour of day
        except ValueError:
            pass

        # Split 'METHOD URL PROTOCOL' into its parts, tolerating short requests.
        req_parts = request.split()
        method = req_parts[0] if req_parts else 'UNKNOWN'
        url = req_parts[1] if len(req_parts) > 1 else '/'

        parsed_data.append({
            'ip': ip,
            'time': dt,
            'hour': hour,
            'method': method,
            'url': url,
            'status': int(status),
            'size': int(size) if size != '-' else 0,  # '-' means no body sent
        })

# Load the parsed records into a DataFrame for analysis.
df = pd.DataFrame(parsed_data)
print(f"日志解析完成,总访问量:{len(df)} 条")

# Defensive re-sanitization of the text columns (parsing already cleaned them).
if len(df) > 0:
    for column in ('ip', 'method', 'url'):
        df[column] = df[column].apply(clean_string)

# 3. Compute the core statistics.
# Bug fix: an empty DataFrame (no line matched the regex) has no columns at
# all, so df['ip'] / df['hour'] / df['status'] / df['method'] would raise
# KeyError. Guard on len(df) and fall back to explicitly empty results that
# support the same downstream operations (.empty, .most_common-style lists).
if len(df) > 0:
    # 3.1 Top-10 client IPs by request count
    ip_top10 = Counter(df['ip']).most_common(10)
    # 3.2 Requests per hour (rows with unparseable timestamps carry hour == -1)
    hour_counts = df[df['hour'] != -1]['hour'].value_counts().sort_index()
    # 3.3 HTTP status code distribution
    status_counts = df['status'].value_counts()
    # 3.4 Request method distribution
    method_counts = df['method'].value_counts()
else:
    ip_top10 = []
    hour_counts = pd.Series(dtype='int64')
    status_counts = pd.Series(dtype='int64')
    method_counts = pd.Series(dtype='int64')

# 4. Render a combined 2x2 chart dashboard (only when there is data to plot).
if len(df) > 0:
    plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font so Chinese labels are not garbled
    plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly with CJK fonts
    plt.rcParams['text.usetex'] = False  # disable LaTeX text rendering
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 10))
    fig.suptitle('Nginx访问日志分析报告', fontsize=18, fontweight='bold')

    # Subplot 1: top-10 client IPs by request count
    if ip_top10:
        # Sanitize IP labels and cap their display length
        clean_ips = []
        for ip, _ in ip_top10:
            clean_ip = clean_string(ip)
            # Truncate long values so the axis labels stay readable
            if len(clean_ip) > 15:
                clean_ip = clean_ip[:15] + '...'
            clean_ips.append(clean_ip)
        ax1.barh(clean_ips, [cnt for _, cnt in ip_top10], color='#1f77b4')
        ax1.set_title('TOP10 访问IP', fontsize=14)
        ax1.set_xlabel('访问次数')
    else:
        ax1.set_title('TOP10 访问IP', fontsize=14)
        ax1.text(0.5, 0.5, '无数据', ha='center', va='center')

    # Subplot 2: hourly traffic trend
    if not hour_counts.empty:
        ax2.plot(hour_counts.index, hour_counts.values, marker='o', color='#ff7f0e', linewidth=2)
        ax2.set_title('24小时访问趋势', fontsize=14)
        ax2.set_xlabel('小时')
        ax2.set_ylabel('访问量')
        ax2.grid(alpha=0.3)
    else:
        ax2.set_title('24小时访问趋势', fontsize=14)
        ax2.text(0.5, 0.5, '无数据', ha='center', va='center')

    # Subplot 3: HTTP status code distribution (pie chart)
    if not status_counts.empty:
        colors = ['#2ca02c', '#d62728', '#9467bd', '#8c564b']
        # Pie labels must be plain, clean strings
        labels = [str(label) for label in status_counts.index]
        ax3.pie(status_counts.values, labels=labels, autopct='%1.1f%%', colors=colors[:len(status_counts)])
        ax3.set_title('HTTP状态码分布', fontsize=14)
    else:
        ax3.set_title('HTTP状态码分布', fontsize=14)
        ax3.text(0.5, 0.5, '无数据', ha='center', va='center')

    # Subplot 4: request method distribution
    if not method_counts.empty:
        # Sanitize method names and drop junk entries from malformed requests
        clean_methods = []
        clean_values = []
        for method, cnt in method_counts.items():
            clean_method = clean_string(method)
            # Keep standard HTTP methods, or anything frequent enough to matter
            if clean_method in ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS'] or cnt > 10:
                clean_methods.append(clean_method)
                clean_values.append(cnt)

        if clean_methods:
            ax4.bar(clean_methods, clean_values, color='#e377c2')
            ax4.set_title('请求方法分布', fontsize=14)
            ax4.set_ylabel('次数')
        else:
            ax4.set_title('请求方法分布', fontsize=14)
            ax4.text(0.5, 0.5, '无数据', ha='center', va='center')
    else:
        ax4.set_title('请求方法分布', fontsize=14)
        ax4.text(0.5, 0.5, '无数据', ha='center', va='center')

    plt.tight_layout()
    plt.savefig(r'C:\Users\czliu\Downloads\nginx_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()  # close the figure to avoid issues in non-interactive environments

# 5. Print the key statistics to the console.
print("\n===== 统计结果 =====")
print(f"总访问量:{len(df)}")
print(f"独立IP数:{df['ip'].nunique()}")

if ip_top10:
    top_ip, top_cnt = ip_top10[0]
    print(f"最活跃IP:{top_ip}(访问{top_cnt}次)")
else:
    print("最活跃IP:无数据")

if hour_counts.empty:
    print("访问高峰时段:无数据")
else:
    print(f"访问高峰时段:{hour_counts.idxmax()}时({hour_counts.max()}次)")

if len(df) > 0:
    print("图表已保存至:C:/Users/czliu/Downloads/nginx_analysis.png")
else:
    print("图表:无数据,未生成")