Python 数据过滤以消除密度图周围的异常值

问题描述 投票:0回答:1

参考下图,我想删除黑色椭圆所标记的密集区域之外的所有异常值。我可以使用简单的水平过滤器,例如 -4 < data < 4,但这样仍会残留一些异常值。我正在寻找一种既能准确保留密集区域内的样本、又能剔除异常值的方法。

enter image description here

样本数据:

# Sample x values from the question: a flat array of 100 numbers.
# NOTE: this is NumPy repr output, so `array` must be in scope
# (e.g. `from numpy import array`) for the statement to run.
x = array([1243. , 1261. ,  973. ,  842. ,  592. ,  499. , 1088. ,  739.5,
        567.5,  536.5,  854. ,  763. ,  671. ,  574. ,  498.5,  510.5,
        541.5,  544. ,  565.5,  482. ,  416. ,  412.5,  440. ,  540. ,
        652. ,  735. ,  878. , 1030. , 1022. , 1105. , 1034. , 1064. ,
       1089. , 1115. , 1145. , 1146. , 1111. , 1117. , 1140. , 1168. ,
        845. , 1173. ,  898. , 1091. ,  591. ,  570.5,  506. ,  592.5,
        682.5,  619.5,  663. ,  593. ,  470. ,  810. ,  694.5,  900. ,
        965. ,  954. ,  771. ,  608.5,  631. ,  593. ,  652. ,  428. ,
        486. ,  445. ,  395.5,  387.5,  383. ,  390. ,  408. ,  420. ,
        470. ,  543.5,  686. ,  550. ,  588. ,  556.5,  475.5,  606. ,
        617. ,  674. ,  571. ,  810. ,  913. ,  868. ,  621.5,  417. ,
        388. ,  428. ,  501. ,  586.5,  668. ,  739. ,  914. ,  829. ,
        966. ,  995. , 1008. ,  961. ])

# Sample y values from the question: a 2-D array with 100 rows and 6 columns.
# The answer code pairs every value in row i with x[i] (it repeats each x
# value once per column of y before flattening y row by row).
# NOTE: this is NumPy repr output, so `array` must be in scope
# (e.g. `from numpy import array`) for the statement to run.
y = array([[-10.6,   0.4,   0.1,  -0.1,  -0.5,   0. ],
       [-12.5,   1.5,   1.4,   0.9,   0.7,   0.7],
       [  4.5,   0.3,   0.2,   0. ,   0.6,   0.2],
       [  4.6,  -0.7,  -0.8,  -0.9,  -0.7,  -0.8],
       [  1.8,  -1.3,  -1.6,  -1.8,  -1.4,  -1.5],
       [ 10.4,  -1.4,  -1.5,  -1.1,  -1.2,  -1.1],
       [  1. ,  -0.6,  -0.5,  -0.3,  -0.2,  -0.2],
       [  0. ,   0.2,  -0.1,   0.1,  -0.1,  -0.1],
       [ -1.7,  -1.1,  -1. ,  -0.9,  -0.8,  -0.7],
       [  1.6,  -1. ,  -1.3,  -0.7,  -1. ,  -0.8],
       [  0.5,   0. ,   0. ,   0.3,   0.1,   0.3],
       [ -0.1,  -0.3,  -0.5,  -0.2,  -0.1,  -0.1],
       [  0.8,  -0.4,  -0.3,  -0.4,  -0.5,  -0.5],
       [ -1.3,  -0.8,  -1. ,  -1. ,  -1.3,  -1.1],
       [ -0.1,  -1.9,  -2.2,  -1.6,  -1.7,  -1.5],
       [ -0.9,  -1.3,  -1.5,  -1.9,  -1.7,  -2.1],
       [ -0.5,  -0.8,  -0.9,  -1.3,  -1.4,  -1.3],
       [ -0.2,  -0.6,  -0.5,  -0.8,  -1.6,  -0.9],
       [ -0.8,  -1.2,  -1. ,  -0.6,  -0.8,  -0.9],
       [ -1.2,  -0.6,  -1. ,  -0.4,  -1.3,  -0.4],
       [ -1.1,  -1. ,  -1.1,  -1.2,  -1. ,  -1.3],
       [ -0.8,  -0.9,  -1. ,  -1. ,  -2.7,  -1. ],
       [ -1.2,  -1.4,  -1.4,  -1.1,  -1.6,  -1.1],
       [ -0.4,  -0.6,  -0.7,  -0.5,   3.5,  -0.6],
       [  0.4,   0.1,   0. ,   0.1,   7.3,   0.1],
       [  0.2,  -0.1,   0. ,   0.5,   3.2,   0.6],
       [  0.3,   0.4,   0.2,   0.1, -16.7,   0.1],
       [  1.3,   1.1,   1.1,   1.4,  -2.1,   1.3],
       [  1.2,   1.4,   1.3,   1.3,  -1.7,   1.4],
       [  1.6,   1.2,   1.3,   1.5,   1.6,   1.6],
       [  0.8,   1.3,   1.3,   1.1,   1.1,   1.2],
       [  0.4,   1. ,   1.1,   0.6,   0.8,   0.7],
       [  1. ,   1.1,   1.3,   0.9,   1. ,   1.1],
       [  0. ,   0.3,   0.3,  -0.2,  -0.4,  -0.2],
       [  0.4,   0.6,   0.7,   0.1,  -0.1,   0.2],
       [  1.6,   1. ,   0.9,   0.6,   0.8,   0.6],
       [  0.3,   0.6,   0.6,   0.3,   0.4,   0.5],
       [  0.2,  -0.6,   0. ,   0.2,   0.1,   0.2],
       [ -0.3,   0.6,   0.2,  -0.1,  -0.2,  -0.2],
       [  0.4,   0.5,   0.6,   0.2,   0.2,   0.3],
       [ -0.1,   0.1,   0.1,  -0.2,   0. ,  -0.2],
       [ -0.3,  -0.6,  -0.5,  -0.3,  -0.4,  -0.2],
       [  0.2,   0.1,   0.3,   0.1,   0.1,   0. ],
       [ -0.3,  -0.5,  -0.5,  -0.7,  -0.7,  -0.6],
       [ -1.1,  -0.8,  -0.9,  -0.8,  -1. ,  -0.9],
       [ -2.9,  -1.9,  -2.2,  -2.3,  -2.3,  -2.4],
       [ -3. ,  -2.4,  -2.5,  -2.2,  -1.9,  -2.3],
       [ -0.4,  -1.5,  -1.4,  -0.8,  -0.6,  -0.9],
       [  0.4,   0.1,   0. ,   0.4,   0. ,   0.4],
       [ -0.1,  -0.8,  -0.7,   0. ,  -0.1,  -0.1],
       [ -0.3,  -0.6,  -0.3,  -0.2,  -0.2,  -0.2],
       [  0.4,   0.4,   0.2,  -0.1,  -0.1,  -0.1],
       [ -1.9,  -1.6,  -1.8,  -1.7,  -1.8,  -1.8],
       [ -0.5,  -0.8,  -0.8,  -0.6,  -0.1,  -0.6],
       [  0.8,   0.4,   0.5,   0.8,   0.7,   0.7],
       [  1.1,   1. ,   1. ,   0.7,   0.9,   0.8],
       [  0.7,   0.8,   0.9,   0.7,   0.6,   0.7],
       [  1. ,   1.1,   1. ,   0.8,   0.8,   0.8],
       [  0.2,   0.5,   0.4,   0.3,   0.1,   0.3],
       [ -0.3,  -1.2,  -1. ,  -0.7,  -0.5,  -0.8],
       [ -0.4,  -0.5,  -0.4,  -0.2,  -0.4,  -0.2],
       [  0. ,  -0.5,  -0.2,   0.3,   0.1,   0.2],
       [  0.2,   0. ,   0.1,   0.1,  -0.1,   0. ],
       [ -1.1,  -0.6,  -0.8,  -0.7,  -0.6,  -0.7],
       [ -0.8,  -0.9,  -0.9,  -0.6,  -0.7,  -0.6],
       [ -0.7,  -0.4,  -0.6,  -0.5,  -0.6,  -0.4],
       [ -1.6,  -1.2,  -1.4,  -1.1,  -1.2,  -1.3],
       [ -0.5,  -1.6,  -1.5,  -0.7,  -0.7,  -0.7],
       [ -1. ,  -1.2,  -1.3,  -0.6,  -0.9,  -0.8],
       [ -0.7,  -0.4,  -0.4,  -0.5,  -0.7,  -0.5],
       [ -0.1,  -0.2,  -0.3,   0. ,  -0.2,  -0.1],
       [ -0.5,  -0.4,  -0.4,  -0.3,  -0.3,  -0.2],
       [ -0.5,  -0.3,  -0.5,  -0.3,  -0.4,  -0.4],
       [  0.2,   0. ,   0. ,   0.1,   0. ,   0.1],
       [  0.9,   0.7,   0.8,   0.5,   0.6,   0.6],
       [  0.5,   0.6,   0.5,   0.6,   0.5,   0.5],
       [ -0.1,   0.2,   0.2,   0.4,   0.4,   0.4],
       [  0. ,   0.2,   0.1,   0.2,   0.2,   0.2],
       [ -0.4,  -0.2,  -0.4,  -0.2,  -0.3,  -0.2],
       [ -0.1,  -0.1,  -0.1,  -0.3,  -0.2,  -0.2],
       [  0.1,   0.4,   0.3,   0.1,   0.1,   0.1],
       [  0. ,   0. ,  -0.1,   0.2,   0.2,   0.3],
       [  0.7,   0.8,   0.9,   0.6,   0.6,   0.5],
       [  0.4,   0.2,   0.4,  -0.1,   0. ,   0.1],
       [  1.7,   1.4,   1.4,   1.2,   1.3,   1.2],
       [  0.9,   1. ,   1. ,   0.8,   1. ,   0.8],
       [  0.3,   0.5,   0.6,   0.4,   0.3,   0.3],
       [ -1.4,  -1. ,  -1.2,  -0.9,  -0.7,  -0.8],
       [ -1. ,  -1. ,  -1. ,  -1. ,  -1.2,  -1.1],
       [ -0.6,  -0.7,  -0.8,  -0.9,  -0.9,  -0.8],
       [ -0.5,  -0.8,  -0.7,  -0.3,  -0.4,  -0.4],
       [  0. ,  -0.2,  -0.1,  -0.3,  -0.5,  -0.3],
       [ -0.3,   0.2,   0. ,   0.1,   0. ,   0. ],
       [  0.8,   0.3,   0.4,   0.4,   0.5,   0.5],
       [  1.2,   1. ,   1.2,   0.8,   0.8,   0.6],
       [  1.7,   1.3,   1.4,   1.8,   1.8,   1.7],
       [  1.2,   1.1,   1.2,   1.1,   1.3,   1.3],
       [  1.5,   1.6,   1.6,   1.4,   1.7,   1.4],
       [  1.7,   1.8,   2. ,   1.5,   1.8,   1.5],
       [  0.6,   0.8,   1. ,   0.8,   1.3,   1. ]])
python dataframe scikit-learn cluster-analysis outliers
1个回答
0
投票

下面的代码使用 `IsolationForest` 来区分异常值(outlier)和正常值(inlier)。需要调整的主要参数是 `contamination=`。

它适用于样本数据,但为了推广到更大的数据集,我们需要一个更具代表性的样本。

enter image description here

import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import IsolationForest

# x and y are the sample arrays from the question (OP); y must be 2-D.

# Build one (x, y) point per reading. y.ravel() walks y row by row, so each
# x value is repeated once per column of y to keep the two aligned.
x_flat = np.repeat(x, y.shape[1])
y_flat = y.ravel()
xy = np.column_stack([x_flat, y_flat])

# Outlier modelling
# `contamination` is the main knob to tune: it is the share of points the
# model will flag as outliers.
# NOTE: the model receives half of contamination_fraction, i.e. 2.5% of the
# points are flagged, not 5%.
contamination_fraction = 5 / 100  # 5%
model = IsolationForest(
    contamination=contamination_fraction * 0.5,
    random_state=0,  # fixed seed so the same points are flagged on every run
)
pred = model.fit_predict(xy)  # +1 for inliers, -1 for outliers

# Actually drop the outliers (the plot below only colours them).
inlier_mask = pred == 1
xy_inliers = xy[inlier_mask]  # (x, y) points that were kept
# Same layout as y, with the flagged readings replaced by NaN.
y_filtered = np.where(inlier_mask.reshape(y.shape), y, np.nan)

# Plot every point coloured by its label, with a small white dot on top to
# mark the raw data.
cmap = 'PiYG'
plt.scatter(
    xy[:, 0], xy[:, 1], c=pred, edgecolor='none', cmap=cmap,
    # Pin the colour range to the two possible labels so the colours match
    # the legend even if only one label is present.
    vmin=-1, vmax=1,
)
plt.scatter(xy[:, 0], xy[:, 1], marker='.', s=5, color='white')
plt.gcf().set_size_inches(7, 4)
plt.xlabel('x')
plt.ylabel('y')
plt.gca().spines[:].set_visible(False)

# Legend: empty scatters act as proxy handles. The colormap is resampled to
# two colours so the entries use the same two colours as the -1 / +1 labels.
legend_cmap = plt.get_cmap(cmap, 2)
plt.scatter([], [], color=legend_cmap(0.), label='outlier')
plt.scatter([], [], color=legend_cmap(1.), label='inlier')
plt.scatter([], [], edgecolor='gray', color='white', label='data')
plt.legend()
© www.soinside.com 2019 - 2024. All rights reserved.