Python:从数据趋势中查找异常值

问题描述 投票:0回答:2

请注意,这篇文章不会与 SO 上的以下任何相关文章重复:

我在实验中得到了数据:


    import matplotlib.pyplot as plt
    
    # Shared x grid: 22..50 in steps of 2 (15 sample positions).
    x = list(range(22, 51, 2))

    # One list of readings per compound, aligned index-for-index with x.
    y_NaOH = [
        94.2, 146.2, 222.2, 276.2, 336.2,
        372.2, 428.2, 542.2, 576.2, 684.2,
        766.2, 848.2, 904.2, 1042.2, 1136.2,
    ]
    y_NaHCO3 = [
        232.0, 308.0, 322.0, 374.0, 436.0,
        494.0, 592.0, 660.0, 704.0, 824.0,
        900.0, 958.0, 1048.0, 1138.0, 1232.0,
    ]
    y_BaOH2 = [
        493.1, 533.1, 549.1, 607.1, 665.1,
        731.1, 797.1, 867.1, 971.1, 1007.1,
        1091.1, 1221.1, 1311.1, 1371.1, 1497.1,
    ]

    # Draw every series as its own line on the same axes, then display.
    for series in (y_NaOH, y_NaHCO3, y_BaOH2):
        plt.plot(x, series)
    plt.show()

enter image description here

但是,我在标记异常值时遇到了困难,这是我尝试过的:


    import matplotlib.pyplot as plt
    import statistics
    
    # Same data as the plotting snippet: x is the shared grid and each y_* list
    # holds one series with 15 readings, aligned index-for-index with x.
    x = [22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
    y_NaOH = [94.2, 146.2, 222.2, 276.2, 336.2, 372.2, 428.2, 542.2, 576.2, 684.2, 766.2, 848.2, 904.2, 1042.2, 1136.2]
    y_NaHCO3 = [232.0, 308.0, 322.0, 374.0, 436.0, 494.0, 592.0, 660.0, 704.0, 824.0, 900.0, 958.0, 1048.0, 1138.0, 1232.0]
    y_BaOH2 = [493.1, 533.1, 549.1, 607.1, 665.1, 731.1, 797.1, 867.1, 971.1, 1007.1, 1091.1, 1221.1, 1311.1, 1371.1, 1497.1, ]
    
    # Plotting is disabled here; this snippet only exercises the filter below.
    # plt.plot(x, y_NaOH)
    # plt.plot(x, y_NaHCO3)
    # plt.plot(x, y_BaOH2)
    # plt.show()
    
    
    def detect_outlier(data_1, threshold=1):
        """Return the values of *data_1* that lie within *threshold* standard deviations of the mean.

        This is a global z-score filter: each value is compared with the mean
        of the whole series, not with a fitted trend. On a steadily rising
        series it therefore drops the values at both ends rather than points
        that deviate from the curve.

        Args:
            data_1: Iterable of numbers.
            threshold: Largest accepted ``abs((value - mean) / stdev)``.
                Defaults to 1, the value the function previously hard-coded.

        Returns:
            A new list with the accepted values in their original order.
            Series with fewer than two values, or with zero spread, are
            returned unchanged because no value can be singled out.
        """
        values = list(data_1)
        # statistics.stdev needs at least two points; with fewer there is no
        # spread to measure, so nothing can be rejected.
        if len(values) < 2:
            return values

        mean_1 = statistics.mean(values)
        std_1 = statistics.stdev(values)
        # Zero spread means every value equals the mean; dividing by it would
        # raise ZeroDivisionError.
        if std_1 == 0:
            return values

        return [y for y in values if abs((y - mean_1) / std_1) <= threshold]
    
    
    if __name__ == "__main__":
        # Run the filter on the NaHCO3 series and show what survives; only the
        # mid-range readings remain, the values at both ends are dropped.
        print(detect_outlier(y_NaHCO3))
        # Output: [374.0, 436.0, 494.0, 592.0, 660.0, 704.0, 824.0, 900.0, 958.0]

问题在于,这种方法总是过滤掉数据两端的值,而我实际想要剔除的是不符合曲线走势的点。


另外,我可以手动观察曲线的形状并标记异常值,但这确实花费了很多时间。我将非常感谢您的帮助。


预期输出

我想用折线绘制数据,并将异常值标记为散点,例如:


    from matplotlib import pyplot as plt
    
    # x is the shared grid; each y_* list is one series aligned with x.
    x = [22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
    y_NaOH = [94.2, 146.2, 222.2, 276.2, 336.2, 372.2, 428.2, 542.2, 576.2, 684.2, 766.2, 848.2, 904.2, 1042.2, 1136.2]
    y_NaHCO3 = [232.0, 308.0, 322.0, 374.0, 436.0, 494.0, 592.0, 660.0, 704.0, 824.0, 900.0, 958.0, 1048.0, 1138.0, 1232.0]
    y_BaOH2 = [493.1, 533.1, 549.1, 607.1, 665.1, 731.1, 797.1, 867.1, 971.1, 1007.1, 1091.1, 1221.1, 1311.1, 1371.1, 1497.1, ]
    
    # Hand-picked outlier values for each series (one per series here); each
    # value is an element of the matching y_* list above.
    o_NaOH = [542.2]
    o_NaHCO3 = [308.0]
    o_BaOH2 = [493.1]
    
    
    def sketch_rejected(xv, yv, y_out):
        """Plot a series as a line and draw its rejected points as markers.

        Points of ``yv`` whose value appears in ``y_out`` are left out of the
        line and drawn with ``plt.scatter`` instead; every other point is part
        of the line drawn with ``plt.plot``.

        Args:
            xv: x coordinates, indexed in step with ``yv``.
            yv: y values of the series.
            y_out: y values to treat as outliers (matched by equality).
        """
        kept_x, kept_y = [], []
        out_x, out_y = [], []
        for ii, dd in enumerate(yv):
            if dd in y_out:
                out_x.append(xv[ii])
                out_y.append(dd)
            else:
                kept_x.append(xv[ii])
                kept_y.append(dd)

        plt.plot(kept_x, kept_y)
        # Scatter the values actually found in yv rather than y_out itself:
        # y_out may be ordered differently, contain values absent from yv, or
        # match several points, and then it would not pair up with out_x.
        plt.scatter(out_x, out_y)
    
    
    # Draw each series together with its hand-picked outliers, then display.
    for series, rejected in (
        (y_NaOH, o_NaOH),
        (y_NaHCO3, o_NaHCO3),
        (y_BaOH2, o_BaOH2),
    ):
        sketch_rejected(x, series, rejected)

    plt.show()

enter image description here

异常值是曲线上的尖峰部分,其中的点不适合梯度。

有没有什么模块可以让我先对数据进行回归,再计算异常值,而不必手动绘制每张图表来识别异常值?

在现实生活中,我有大量的测试结果,但我不知道每个结果的通用方程。

感谢您的帮助。

python matplotlib regression data-science outliers
2个回答
2
投票

有很多用于数据科学的 GitHub 存储库,您所要做的就是完成您的 git 安装

例如,使用 outliers.py:


    from outliers.variance import graph
    
    # x is the shared grid; each y_* list is one series aligned with x.
    x = [22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
    y_NaOH = [94.2, 146.2, 222.2, 276.2, 336.2, 372.2, 428.2, 542.2, 576.2, 684.2, 766.2, 848.2, 904.2, 1042.2, 1136.2]
    y_NaHCO3 = [232.0, 308.0, 322.0, 374.0, 436.0, 494.0, 592.0, 660.0, 704.0, 824.0, 900.0, 958.0, 1048.0, 1138.0, 1232.0]
    y_BaOH2 = [493.1, 533.1, 549.1, 607.1, 665.1, 731.1, 797.1, 867.1, 971.1, 1007.1, 1091.1, 1221.1, 1311.1, 1371.1, 1497.1, ]
    
    # Pass the shared x grid and all three series to the third-party helper.
    graph(
        xs=x,
        ys=[y_NaOH, y_NaHCO3, y_BaOH2],
        title='title',
        # NOTE(review): this builds len(x) == 15 legend labels although ys
        # holds 3 series; confirm whether graph() expects one label per series.
        legends=[f'legend {i + 1}' for i in range(len(x))],
        xlabel='xlabel',
        ylabel='ylabel',
    )
    

enter image description here


0
投票

这个想法:

  • 拟合趋势线:计算线性回归线(趋势)

  • 计算残差:对于每个数据点,残差是
    该点处实际值 (y) 与趋势值之间的绝对差。

  • 设置阈值:计算残差的标准差,并将阈值设为该标准差的 3 倍 (3σ),用于界定显著偏差。不过,您也可以根据自己的需要使用不同的阈值。

  • 识别异常值:残差超过阈值的数据点被标记为异常值。

     # Use the sample index as the x axis; y is the series under test.
     # NOTE(review): this treats the samples as evenly spaced - confirm, or
     # substitute the real x values.
     x = list(range(len(y)))

     # Fit a degree-1 polynomial (straight line) and evaluate it at every x.
     regression_model = np.poly1d(np.polyfit(x, y, 1))
     trend = regression_model(x)
     print(f'trend: {trend}')

     # Residual = observed value minus the trend value at the same index.
     y_np = np.array(y)
     residuals = y_np - trend
     trend_std_residuals = np.std(residuals)

     # Threshold for unusual deviation (e.g., 3σ); lower the factor to flag
     # smaller deviations.
     threshold = 3 * trend_std_residuals
     print(f'threshold: {threshold}')

     # One flag per point: True where the absolute residual exceeds the
     # threshold. No separate None handling is needed, because np.polyfit and
     # the residual subtraction above already fail on a None entry.
     outliers = (np.abs(residuals) > threshold).tolist()
     print(f'outliers: {outliers}')
    

在 Python 中使用线性回归定义趋势异常值

最新问题
© www.soinside.com 2019 - 2025. All rights reserved.