我想在 Python 中实现 Kusto 的 series_outliers() 函数，目前使用的代码如下：
import pandas as pd
import numpy as np
from scipy.stats import norm
# Sample observations used to exercise the outlier scoring.
data = dict(
    series=[
        67.95675, 58.63898, 33.59188, 4906.018, 5.372538,
        702.1194, 0.037261, 11161.05, 1.403496, 100.116,
    ],
)
# Single-column frame; the column is named after the mapping's only key.
df = pd.DataFrame(data=data)
# Function to calculate the outlier score based on custom percentiles
def custom_percentile_outliers(series, p_low=10, p_high=90):
    """Score each value of ``series`` against custom percentile fences.

    Values inside ``[percentile_low, percentile_high]`` score 0. A value
    outside is scored by its signed distance from the nearer fence, divided
    by the fence-to-fence range and multiplied by a normalization factor.

    Args:
        series: pandas Series of numeric observations.
        p_low: Lower percentile, on the 0-100 scale.
        p_high: Upper percentile, on the 0-100 scale.

    Returns:
        A Series of scores with the same index as ``series``.
    """
    # Empirical fences taken from the data itself.
    percentile_low = np.percentile(series, p_low)
    percentile_high = np.percentile(series, p_high)
    spread = percentile_high - percentile_low

    # Standard-normal quantiles that correspond to the two percentiles.
    z_low = norm.ppf(p_low / 100)
    z_high = norm.ppf(p_high / 100)

    # NOTE(review): with the default 10/90 percentiles the denominator
    # 2 * z_high - 2.704 is about -0.141, so this factor is about -27.29.
    # A negative factor flips the sign of every score (values above the upper
    # fence come out negative, values below the lower fence positive). The
    # formula is kept exactly as posted so that the results quoted in the
    # question remain reproducible.
    normalization_factor = (2 * z_high - z_low) / (2 * z_high - 2.704)

    def score(x):
        if x > percentile_high:
            return (x - percentile_high) / spread * normalization_factor
        if x < percentile_low:
            return (x - percentile_low) / spread * normalization_factor
        return 0

    return series.apply(score)
# Score the 'series' column against its 10th/90th percentile fences and keep
# the scores next to the raw values.
df['outliers'] = custom_percentile_outliers(df['series'], 10, 90)
# Print the raw values together with their scores.
print(df)
运行后，该序列得到如下结果：
         series   outliers
0     67.956750   0.000000
1     58.638980   0.000000
2     33.591880   0.000000
3   4906.018000   0.000000
4      5.372538   0.000000
5    702.119400   0.000000
6      0.037261   0.006067
7  11161.050000 -27.776847
8      1.403496   0.000000
9    100.116000   0.000000
我参考了 GitHub 上的 issue https://github.com/microsoft/Kusto-Query-Language/issues/136 ，并借助 Stack Overflow 问题“Kusto series_outliers() 如何计算异常分数？”中给出的解决方案，尝试实现该算法并手动验算。
我对归一化（normalization）系数的计算可能有误。如果有人能帮忙指出问题所在，将不胜感激。
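原代码中的归一化系数 (2 * z_high - z_low) / (2 * z_high - 2.704) 在 p_low=10、p_high=90 时约为 -27.29（为负数），因此得分的符号被颠倒，数值也被放大。您可以改用下面的代码来计算 series_outliers 风格的异常分数，它以 (z_high - z_low) / (p_high - p_low) 作为缩放系数：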
import pandas as r
from scipy.stats import norm as r_nm
import numpy as rn
# Sample observations used to exercise the scoring function.
rith_test = dict(
    r_sr=[
        67.95675, 58.63898, 33.59188, 4906.018, 5.372538,
        702.1194, 0.037261, 11161.05, 1.403496, 100.116,
    ],
)
# Single-column frame; the column is named after the mapping's only key.
rd = r.DataFrame(data=rith_test)
def test(r_sr, p_l=10, p_h=90):
    """Score each value of ``r_sr`` by how far it lies outside a percentile band.

    The band runs from the ``p_l``-th to the ``p_h``-th percentile of the
    data. Values inside the band (edges included) score 0.0. A value outside
    scores its signed distance from the nearer edge multiplied by
    ``(z_high - z_low) / (band width)``, where ``z_low`` and ``z_high`` are the
    standard-normal quantiles of ``p_l`` and ``p_h``. Scores are therefore
    positive above the band and negative below it.

    Args:
        r_sr: pandas Series of numeric observations.
        p_l: Lower percentile, on the 0-100 scale.
        p_h: Upper percentile, on the 0-100 scale.

    Returns:
        A float Series of scores with the same index as ``r_sr``. If the band
        has zero width, values outside it score +/-inf.

    Raises:
        ValueError: If the percentiles do not satisfy ``0 < p_l < p_h < 100``.
    """
    # At 0 or 100 the normal quantile is infinite, and equal or reversed
    # percentiles leave no band to scale by, so reject those up front.
    if not 0 < p_l < p_h < 100:
        raise ValueError(
            f"percentiles must satisfy 0 < p_l < p_h < 100, got {p_l} and {p_h}"
        )
    # Nothing to score; also avoids asking numpy for a percentile of no data.
    if len(r_sr) == 0:
        return r_sr.astype(float)

    rpl = rn.percentile(r_sr, p_l)
    rph = rn.percentile(r_sr, p_h)
    z_low = r_nm.ppf(p_l / 100)
    z_high = r_nm.ppf(p_h / 100)

    # A zero-width band makes the scale infinite; the resulting 0 * inf entries
    # only occur for in-band values, which are overwritten with 0.0 below.
    with rn.errstate(divide="ignore", invalid="ignore"):
        rnf = (z_high - z_low) / (rph - rpl)
        above = (r_sr - rph) * rnf
        below = (r_sr - rpl) * rnf

    # Keep the upper-side score only where the value exceeds the band, then
    # substitute the lower-side score where the value falls short of it.
    scores = above.where(r_sr > rph, 0.0)
    return scores.mask(r_sr < rpl, below)


# The name starts with "test", so pytest would try to collect this function as
# a test case whenever it is imported into a test module; opt out explicitly.
test.__test__ = False
# Score the 'r_sr' column against its 10th/90th percentile band and store the
# scores in a new 'outliers' column.
rd['outliers'] = test(rd['r_sr'], 10, 90)
# Print the raw values together with their scores.
print(rd)