我有以下 pandas 数据框,它表示 10 个人(`ids`)7 天的消费(`day_0` 是今天,`day_-1` 是昨天,依此类推):
import pandas as pd
import numpy as np
# Build a 10 x 7 table of random consumption values: one row per person,
# one column per day, with 'day_0' being today and 'day_-6' six days ago.
# randint's upper bound is exclusive, so values fall in 8..14.
day_labels = [f'day_{-offset}' for offset in range(7)]
df = pd.DataFrame(
    np.random.randint(8, 15, size=(10, 7)),
    columns=day_labels,
)
df.index.name = 'id'
# reset_index() turns the 'id' index into an ordinary column for display.
print(df.reset_index())
id day_0 day_-1 day_-2 day_-3 day_-4 day_-5 day_-6
0 0 10 10 14 8 14 14 14
1 1 10 13 11 11 8 10 10
2 2 10 12 9 12 9 10 10
3 3 12 12 9 11 9 12 13
4 4 12 13 8 12 8 11 9
5 5 13 9 8 13 9 12 10
6 6 8 9 8 14 8 13 14
7 7 13 10 14 12 8 9 11
8 8 8 8 10 12 11 14 14
9 9 14 13 13 9 11 14 13
我想找到每日权重(总共 7 个权重:`w_0, w_-1, w_-2, w_-3, w_-4, w_-5, w_-6`),它们需要满足以下先决条件:

1. `w_0 > w_-1 > w_-2 > ... > w_-6`
2. `w_0 + w_-1 + w_-2 + ... + w_-6 = 7`
3. `ids` 的加权平均值低于阈值(例如 11)

我可以通过使用指数衰减函数并随后进行归一化来实现先决条件 1 和 2:
import numpy as np
n = 7           # length of the time series (number of days)
_lambda = 0.5   # decay rate of the exponential

# Unnormalised exponential decay: exp(0), exp(-lambda), exp(-2*lambda), ...
decay = np.exp(np.arange(n) * -_lambda)
# Rescale so that the weights add up to the number of days, n.
weights = decay * (n / decay.sum())
但我不知道如何应用先决条件 3。
有什么想法吗?这可能吗?我怎样才能在Python中做到这一点?
这只是一个大胆的猜测。这里没有使用指数衰减,因为它对满足您的要求似乎帮助不大。可以定义一个带析取约束的整数线性规划(ILP):在 m 个 ID 中,至少有 n 个(本例中为 10 个中的 7 个)ID 的加权平均值不高于阈值:
import io
import numpy as np
import pandas as pd
import scipy.sparse
from scipy.optimize import milp, Bounds, LinearConstraint
with io.StringIO(
'''id, 0, -1, -2, -3, -4, -5, -6
0, 10, 10, 14, 8, 14, 14, 14
1, 10, 13, 11, 11, 8, 10, 10
2, 10, 12, 9, 12, 9, 10, 10
3, 12, 12, 9, 11, 9, 12, 13
4, 12, 13, 8, 12, 8, 11, 9
5, 13, 9, 8, 13, 9, 12, 10
6, 8, 9, 8, 14, 8, 13, 14
7, 13, 10, 14, 12, 8, 9, 11
8, 8, 8, 10, 12, 11, 14, 14
9, 14, 13, 13, 9, 11, 14, 13
''') as f:
df = pd.read_csv(f, skipinitialspace=True, index_col=0)
df.columns = pd.Index(name='day', data=df.columns.astype(int))
m, n = df.shape # number of IDs, days
'''
LP variables:
n weights
m weighted mean threshold binary predicates
'''
# The weight sum must be equal to n
sum_constraint = LinearConstraint(
A=np.concatenate((
np.ones(n), np.zeros(m),
)),
lb=n, ub=n,
)
# The weights must be strictly decreasing by this amount
min_decrease = 1e-2 # chosen fully arbitrarily
antimonotonic_constraint = LinearConstraint(
A=scipy.sparse.diags_array(
(
np.ones(shape=n - 1),
np.full(shape=n - 1, fill_value=-1),
),
offsets=(0, 1), shape=(n - 1, n + m), format='csc',
),
lb=min_decrease,
)
'''
For each binary threshold predicate:
pred = 1 iff weights.df_values/n <= threshold
pred <= 2 - (weights.values)/n/threshold
weights.values/threshold + pred*n <= 2*n
'''
threshold = 11
mean_constraint = LinearConstraint(
A=scipy.sparse.hstack(
(
df.values/threshold,
scipy.sparse.diags_array(
np.full(shape=m, fill_value=n),
),
),
format='csc',
),
ub=2*n,
)
# At least n out of m IDs must be at the threshold or lower
disjunction_constraint = LinearConstraint(
A=np.concatenate((np.zeros(n), np.ones(m))),
lb=n,
)
result = milp(
c=np.zeros(n + m), # no optimisation objective
integrality=np.concatenate((
np.zeros(shape=n, dtype=np.uint8), # weights are continuous
np.ones(shape=m, dtype=np.uint8), # predicates are binary
)),
bounds=Bounds(
lb=np.concatenate((
np.full(shape=n, fill_value=1e-2), # minimum weight, arbitrary
np.zeros(shape=m), # binary predicate
)),
ub=np.concatenate((
np.full(shape=n, fill_value=np.inf),
np.ones(shape=m), # binary predicate
)),
),
constraints=(
sum_constraint,
antimonotonic_constraint,
mean_constraint,
disjunction_constraint,
),
)
if not result.success:
raise ValueError(result.message)
weights, threshold_preds = np.split(result.x, (n,))
means = df @ (weights/n)
print('weights =')
print(weights)
print('threshold predicates =')
print(threshold_preds)
print('means =')
print(means)
weights =
[2.89083333 1.16833333 1.15833333 0.88125 0.87125 0.02
0.01 ]
threshold predicates =
[1. 1. 1. 0. 1. 1. 1. 0. 1. 0.]
means =
id
0 10.925119
1 10.543155
2 10.295655
3 11.005714
4 11.000000
5 11.000000
6 8.945119
7 11.902262
8 9.233631
9 12.663333
dtype: float64