参考下图,我想删除黑色椭圆所标记的密集区域之外的所有异常值。我可以使用简单的水平阈值过滤,例如 -4 < data < 4,但这样仍然会残留一些异常值。我正在寻找一种能够精确保留密集区域内的样本、同时剔除异常值的方法。
样本数据:
# Sample x-coordinates from the question: 100 values in a 1-D array.
# NOTE(review): `array` is presumably numpy's `array` (this looks like a pasted
# repr); it needs `from numpy import array` in scope to run - confirm.
x = array([1243. , 1261. , 973. , 842. , 592. , 499. , 1088. , 739.5,
567.5, 536.5, 854. , 763. , 671. , 574. , 498.5, 510.5,
541.5, 544. , 565.5, 482. , 416. , 412.5, 440. , 540. ,
652. , 735. , 878. , 1030. , 1022. , 1105. , 1034. , 1064. ,
1089. , 1115. , 1145. , 1146. , 1111. , 1117. , 1140. , 1168. ,
845. , 1173. , 898. , 1091. , 591. , 570.5, 506. , 592.5,
682.5, 619.5, 663. , 593. , 470. , 810. , 694.5, 900. ,
965. , 954. , 771. , 608.5, 631. , 593. , 652. , 428. ,
486. , 445. , 395.5, 387.5, 383. , 390. , 408. , 420. ,
470. , 543.5, 686. , 550. , 588. , 556.5, 475.5, 606. ,
617. , 674. , 571. , 810. , 913. , 868. , 621.5, 417. ,
388. , 428. , 501. , 586.5, 668. , 739. , 914. , 829. ,
966. , 995. , 1008. , 961. ])
# Sample y-values from the question: a 2-D array of 100 rows x 6 columns.
# The answer script below pairs row i with x[i] (via np.repeat(x, y.shape[1])),
# so each row holds the six y readings plotted at one x position.
# NOTE(review): `array` is presumably numpy's `array`; confirm it is imported.
y = array([[-10.6, 0.4, 0.1, -0.1, -0.5, 0. ],
[-12.5, 1.5, 1.4, 0.9, 0.7, 0.7],
[ 4.5, 0.3, 0.2, 0. , 0.6, 0.2],
[ 4.6, -0.7, -0.8, -0.9, -0.7, -0.8],
[ 1.8, -1.3, -1.6, -1.8, -1.4, -1.5],
[ 10.4, -1.4, -1.5, -1.1, -1.2, -1.1],
[ 1. , -0.6, -0.5, -0.3, -0.2, -0.2],
[ 0. , 0.2, -0.1, 0.1, -0.1, -0.1],
[ -1.7, -1.1, -1. , -0.9, -0.8, -0.7],
[ 1.6, -1. , -1.3, -0.7, -1. , -0.8],
[ 0.5, 0. , 0. , 0.3, 0.1, 0.3],
[ -0.1, -0.3, -0.5, -0.2, -0.1, -0.1],
[ 0.8, -0.4, -0.3, -0.4, -0.5, -0.5],
[ -1.3, -0.8, -1. , -1. , -1.3, -1.1],
[ -0.1, -1.9, -2.2, -1.6, -1.7, -1.5],
[ -0.9, -1.3, -1.5, -1.9, -1.7, -2.1],
[ -0.5, -0.8, -0.9, -1.3, -1.4, -1.3],
[ -0.2, -0.6, -0.5, -0.8, -1.6, -0.9],
[ -0.8, -1.2, -1. , -0.6, -0.8, -0.9],
[ -1.2, -0.6, -1. , -0.4, -1.3, -0.4],
[ -1.1, -1. , -1.1, -1.2, -1. , -1.3],
[ -0.8, -0.9, -1. , -1. , -2.7, -1. ],
[ -1.2, -1.4, -1.4, -1.1, -1.6, -1.1],
[ -0.4, -0.6, -0.7, -0.5, 3.5, -0.6],
[ 0.4, 0.1, 0. , 0.1, 7.3, 0.1],
[ 0.2, -0.1, 0. , 0.5, 3.2, 0.6],
[ 0.3, 0.4, 0.2, 0.1, -16.7, 0.1],
[ 1.3, 1.1, 1.1, 1.4, -2.1, 1.3],
[ 1.2, 1.4, 1.3, 1.3, -1.7, 1.4],
[ 1.6, 1.2, 1.3, 1.5, 1.6, 1.6],
[ 0.8, 1.3, 1.3, 1.1, 1.1, 1.2],
[ 0.4, 1. , 1.1, 0.6, 0.8, 0.7],
[ 1. , 1.1, 1.3, 0.9, 1. , 1.1],
[ 0. , 0.3, 0.3, -0.2, -0.4, -0.2],
[ 0.4, 0.6, 0.7, 0.1, -0.1, 0.2],
[ 1.6, 1. , 0.9, 0.6, 0.8, 0.6],
[ 0.3, 0.6, 0.6, 0.3, 0.4, 0.5],
[ 0.2, -0.6, 0. , 0.2, 0.1, 0.2],
[ -0.3, 0.6, 0.2, -0.1, -0.2, -0.2],
[ 0.4, 0.5, 0.6, 0.2, 0.2, 0.3],
[ -0.1, 0.1, 0.1, -0.2, 0. , -0.2],
[ -0.3, -0.6, -0.5, -0.3, -0.4, -0.2],
[ 0.2, 0.1, 0.3, 0.1, 0.1, 0. ],
[ -0.3, -0.5, -0.5, -0.7, -0.7, -0.6],
[ -1.1, -0.8, -0.9, -0.8, -1. , -0.9],
[ -2.9, -1.9, -2.2, -2.3, -2.3, -2.4],
[ -3. , -2.4, -2.5, -2.2, -1.9, -2.3],
[ -0.4, -1.5, -1.4, -0.8, -0.6, -0.9],
[ 0.4, 0.1, 0. , 0.4, 0. , 0.4],
[ -0.1, -0.8, -0.7, 0. , -0.1, -0.1],
[ -0.3, -0.6, -0.3, -0.2, -0.2, -0.2],
[ 0.4, 0.4, 0.2, -0.1, -0.1, -0.1],
[ -1.9, -1.6, -1.8, -1.7, -1.8, -1.8],
[ -0.5, -0.8, -0.8, -0.6, -0.1, -0.6],
[ 0.8, 0.4, 0.5, 0.8, 0.7, 0.7],
[ 1.1, 1. , 1. , 0.7, 0.9, 0.8],
[ 0.7, 0.8, 0.9, 0.7, 0.6, 0.7],
[ 1. , 1.1, 1. , 0.8, 0.8, 0.8],
[ 0.2, 0.5, 0.4, 0.3, 0.1, 0.3],
[ -0.3, -1.2, -1. , -0.7, -0.5, -0.8],
[ -0.4, -0.5, -0.4, -0.2, -0.4, -0.2],
[ 0. , -0.5, -0.2, 0.3, 0.1, 0.2],
[ 0.2, 0. , 0.1, 0.1, -0.1, 0. ],
[ -1.1, -0.6, -0.8, -0.7, -0.6, -0.7],
[ -0.8, -0.9, -0.9, -0.6, -0.7, -0.6],
[ -0.7, -0.4, -0.6, -0.5, -0.6, -0.4],
[ -1.6, -1.2, -1.4, -1.1, -1.2, -1.3],
[ -0.5, -1.6, -1.5, -0.7, -0.7, -0.7],
[ -1. , -1.2, -1.3, -0.6, -0.9, -0.8],
[ -0.7, -0.4, -0.4, -0.5, -0.7, -0.5],
[ -0.1, -0.2, -0.3, 0. , -0.2, -0.1],
[ -0.5, -0.4, -0.4, -0.3, -0.3, -0.2],
[ -0.5, -0.3, -0.5, -0.3, -0.4, -0.4],
[ 0.2, 0. , 0. , 0.1, 0. , 0.1],
[ 0.9, 0.7, 0.8, 0.5, 0.6, 0.6],
[ 0.5, 0.6, 0.5, 0.6, 0.5, 0.5],
[ -0.1, 0.2, 0.2, 0.4, 0.4, 0.4],
[ 0. , 0.2, 0.1, 0.2, 0.2, 0.2],
[ -0.4, -0.2, -0.4, -0.2, -0.3, -0.2],
[ -0.1, -0.1, -0.1, -0.3, -0.2, -0.2],
[ 0.1, 0.4, 0.3, 0.1, 0.1, 0.1],
[ 0. , 0. , -0.1, 0.2, 0.2, 0.3],
[ 0.7, 0.8, 0.9, 0.6, 0.6, 0.5],
[ 0.4, 0.2, 0.4, -0.1, 0. , 0.1],
[ 1.7, 1.4, 1.4, 1.2, 1.3, 1.2],
[ 0.9, 1. , 1. , 0.8, 1. , 0.8],
[ 0.3, 0.5, 0.6, 0.4, 0.3, 0.3],
[ -1.4, -1. , -1.2, -0.9, -0.7, -0.8],
[ -1. , -1. , -1. , -1. , -1.2, -1.1],
[ -0.6, -0.7, -0.8, -0.9, -0.9, -0.8],
[ -0.5, -0.8, -0.7, -0.3, -0.4, -0.4],
[ 0. , -0.2, -0.1, -0.3, -0.5, -0.3],
[ -0.3, 0.2, 0. , 0.1, 0. , 0. ],
[ 0.8, 0.3, 0.4, 0.4, 0.5, 0.5],
[ 1.2, 1. , 1.2, 0.8, 0.8, 0.6],
[ 1.7, 1.3, 1.4, 1.8, 1.8, 1.7],
[ 1.2, 1.1, 1.2, 1.1, 1.3, 1.3],
[ 1.5, 1.6, 1.6, 1.4, 1.7, 1.4],
[ 1.7, 1.8, 2. , 1.5, 1.8, 1.5],
[ 0.6, 0.8, 1. , 0.8, 1.3, 1. ]])
可以使用 `IsolationForest` 来区分异常值(outlier)和正常值(inlier)。需要调整的主要参数是 `contamination=`,即数据中预期的异常值比例。
它在这份样本数据上效果不错,但要推广到更大的数据集,还需要更具代表性的样本。
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import IsolationForest

# x, y come from the question's sample data.
# Each row of y holds several readings that share one x value, so repeat every
# x once per column of y and flatten y to get one (x, y) sample per reading.
# Assumes len(x) == y.shape[0]; column_stack raises otherwise.
x_flat = np.repeat(x, y.shape[1])
y_flat = y.ravel()
xy = np.column_stack([x_flat, y_flat])

# Outlier modelling
# `contamination` is the expected proportion of outliers and is the main knob
# to tune. Pass the documented fraction through unchanged so the value named
# here is the value the model actually uses.
contamination_fraction = 5 / 100  # 5%
# A fixed random_state makes the forest (and hence the flagged points)
# reproducible between runs.
model = IsolationForest(contamination=contamination_fraction, random_state=0)
pred = model.fit_predict(xy)  # +1 for inliers, -1 for outliers

# The filtered data: keep only the samples the model labels as inliers.
inlier_mask = pred == 1
xy_inliers = xy[inlier_mask]

# Visualise the classification: large markers are coloured by the prediction,
# and a small white dot on top marks each underlying data point.
cmap = 'PiYG'
plt.scatter(xy[:, 0], xy[:, 1], c=pred, edgecolor='none', cmap=cmap)
plt.scatter(xy[:, 0], xy[:, 1], marker='.', s=5, color='white')
plt.gcf().set_size_inches(7, 4)
plt.xlabel('x')
plt.ylabel('y')
plt.gca().spines[:].set_visible(False)

# Legend: empty scatters act as proxy handles. The 2-level colormap's end
# colours match what the scatter uses for pred == -1 and pred == +1.
plt.scatter([], [], color=plt.get_cmap(cmap, 2)(0.), label='outlier')
plt.scatter([], [], color=plt.get_cmap(cmap, 2)(1.), label='inlier')
plt.scatter([], [], edgecolor='gray', color='white', label='data')
plt.legend()