Skip to content

异常值检测

异常值(离群点)是指与其他观测值显著不同的数据点,需要特别处理。

统计方法检测

python
import pandas as pd
import numpy as np

# Sample scores: most values lie between 78 and 95; 30 and 200 are the
# planted outliers.
df = pd.DataFrame({
    '分数': [85, 90, 78, 92, 88, 95, 30, 87, 91, 200]
})

# Summary statistics give a first look at suspicious min/max values.
print(df.describe())

# Absolute Z-score: distance of each value from the mean, measured in
# (population) standard deviations.
from scipy import stats
z_scores = np.abs(stats.zscore(df['分数']))
print(f"Z分数: {z_scores}")

# With the population standard deviation that stats.zscore uses by default,
# |z| can never exceed sqrt(n - 1) in a sample of n values -- exactly 3.0 for
# these 10 rows -- so the textbook cut-off of 3 cannot flag anything here.
# Use a lower cut-off for small samples.
# NOTE: only 200 (|z| ~ 2.68) is flagged; 30 (|z| ~ 1.60) stays below the
# cut-off because the extreme value inflates the standard deviation.
Z_THRESHOLD = 2
outliers = df[z_scores > Z_THRESHOLD]
print(f"异常值: {outliers}")

箱线图法

python
# Box-plot (Tukey) rule: the fences sit 1.5 * IQR outside the quartiles.
scores = df['分数']
Q1, Q3 = scores.quantile(0.25), scores.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"下界: {lower_bound}, 上界: {upper_bound}")

# Build the out-of-fence mask once; it drives both the filtered view and
# the flag column.
is_outlier = (scores < lower_bound) | (scores > upper_bound)

# Rows falling outside the fences.
outliers = df[is_outlier]
print(f"异常值:\n{outliers}")

# Keep every row, but record the verdict in a boolean column.
df['是否异常'] = is_outlier
print(df)

百分位数法

python
# Percentile rule: anything below the 1st or above the 99th percentile of
# the scores is treated as an outlier.
lower_percentile, upper_percentile = (
    df['分数'].quantile(q) for q in (0.01, 0.99)
)

print(f"1%分位数: {lower_percentile}")
print(f"99%分位数: {upper_percentile}")

# Overwrite the flag column with the percentile-based verdict.
below_cutoff = df['分数'] < lower_percentile
above_cutoff = df['分数'] > upper_percentile
df['是否异常'] = below_cutoff | above_cutoff
print(df)

处理异常值

删除异常值

python
# Keep only the rows whose score lies inside the fences (both ends inclusive).
within_fences = df['分数'].between(lower_bound, upper_bound)
df_clean = df[within_fences]
print(df_clean)

替换异常值

python
# Option 1 -- winsorize: pull out-of-fence scores back to the nearest fence.
raw_scores = df['分数']
df['分数处理'] = raw_scores.clip(lower=lower_bound, upper=upper_bound)
print(df)

# Option 2 -- substitute the median for every score that is not inside the
# fences (this overwrites the column produced by option 1).
inside_fences = raw_scores.between(lower_bound, upper_bound)
df['分数处理'] = raw_scores.mask(~inside_fences, raw_scores.median())
print(df)

对数变换

python
# log(1 + x) compresses large values, which dampens the pull of outliers.
log_scores = np.log1p(df['分数'])
df['分数对数'] = log_scores
print(df)

分组检测异常值

python
df = pd.DataFrame({
    '班级': ['A', 'A', 'A', 'B', 'B', 'B'],
    '分数': [85, 90, 30, 78, 200, 88]
})


def detect_outliers(group):
    """Return a copy of ``group`` with a boolean '是否异常' column added.

    A row is flagged when its '分数' lies outside the 1.5 * IQR fences
    computed from the rows of ``group`` itself. The input frame is left
    untouched, so the helper is safe to call on any sub-frame.
    """
    scores = group['分数']
    q1 = scores.quantile(0.25)
    q3 = scores.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    flagged = group.copy()
    flagged['是否异常'] = (scores < lower) | (scores > upper)
    return flagged


# Iterate over the groups explicitly instead of groupby.apply: what apply
# returns (group keys in the index, whether the grouping column is kept)
# differs between pandas releases, whereas iteration always yields complete
# sub-frames. sort_index() restores the original row order.
# NOTE: with only three scores per class the fences are very wide, so none
# of the sample rows ends up flagged.
df = pd.concat(
    [detect_outliers(group) for _, group in df.groupby('班级')]
).sort_index()
print(df)

多变量异常检测

python
df = pd.DataFrame({
    '身高': [170, 165, 180, 175, 168, 300],
    '体重': [65, 55, 80, 70, 60, 200]
})

# Detect multivariate outliers with the Mahalanobis distance.
from scipy.spatial.distance import mahalanobis


def mahalanobis_outliers(df):
    """Return a copy of ``df`` with Mahalanobis distance and outlier flag.

    Every column of ``df`` is treated as a feature. Each row's distance to
    the column means is measured with the inverse of the sample covariance
    matrix and stored in '马氏距离'. Rows whose distance exceeds the 95th
    percentile of all distances are marked True in '是否异常'.

    The input frame is not modified.
    """
    center = df.mean()
    # The pseudo-inverse matches the ordinary inverse for a well-conditioned
    # covariance matrix and still works when the features are perfectly
    # collinear (singular covariance), where np.linalg.inv would raise.
    inv_cov = np.linalg.pinv(df.cov())

    distances = [
        mahalanobis(row, center, inv_cov) for row in df.to_numpy()
    ]

    result = df.copy()
    result['马氏距离'] = distances
    # The cut-off is relative to this data set, not an absolute threshold.
    result['是否异常'] = result['马氏距离'] > np.percentile(distances, 95)
    return result


df = mahalanobis_outliers(df)
print(df)

异常值检测是数据质量控制的重要环节,选择合适的检测方法可以有效识别和处理异常数据。