我有一个输入表,我称之为claim_data。看起来像这样:
Record_ID,Event_ID,Policy No,Claim Name,Client_number's_PO,Claim No,Life_Assured,Client_number's_LA,Received Date,Incur Date,Product_code,Claim_type,Benefit/component_code,,
12,40,80063166,Tr????ng An Diå÷??m,10003205,1,1,10053675,29/11/2019,10/11/2019,Rider03,RC,Rider03_04,,
1,126,80063166,Tr????ng An Diå÷??m,10003205,2,2,10003205,15/05/2020,4/05/2020,Rider03,RC,Rider03_04,,
19,188,80063166,Tr????ng An Diå÷??m,10003205,5,2,10003205,3/07/2020,6/06/2020,Rider03,RC,Rider03_04,,
11,189,80063166,Tr????ng An Diå÷??m,10003205,4,1,10053675,3/07/2020,7/06/2020,Rider03,RC,Rider03_04,,
16,181,80069631,Tr????ng An Diå÷??m,10003205,2,1,10073684,2/07/2020,7/06/2020,Rider03,RC,Rider03_04,,
3,610,80063166,Tr????ng An Diå÷??m,10003205,6,2,10003205,29/10/2020,7/10/2020,Rider03,RC,Rider03_04,,
14,611,80063166,Tr????ng An Diå÷??m,10003205,7,1,10053675,29/10/2020,18/10/2020,Rider03,RC,Rider03_04,,
2,1014,80063166,Tr????ng An Diå÷??m,10003205,8,2,10003205,20/12/2020,26/11/2020,Rider03,RC,Rider03_04,,
17,1015,80069631,Tr????ng An Diå÷??m,10003205,4,1,10073684,20/12/2020,4/12/2020,Rider03,RC,Rider03_04,,
13,3687,80063166,Tr????ng An Diå÷??m,10003205,9,1,10053675,21/09/2021,7/09/2021,Rider03,RC,Rider03_04,,
18,3689,80069631,Tr????ng An Diå÷??m,10003205,5,1,10073684,21/09/2021,7/09/2021,Rider03,RC,Rider03_04,,
20,3690,80069631,Tr????ng An Diå÷??m,10003205,6,1,10073684,21/09/2021,16/09/2021,Rider03,RC,Rider03_04,,
6,5635,80063166,Tr????ng An Diå÷??m,10003205,13,2,10003205,25/12/2021,8/11/2021,Rider03,RC,Rider03_04,,
8,5637,80093241,Tr????ng An Diå÷??m,10003205,1,1,10003213,25/12/2021,8/11/2021,Rider03,RC,Rider03_04,,
43,5634,80063166,Tr????ng An Diå÷??m,10003205,12,3,10003213,25/12/2021,8/11/2021,Rider03,RC,Rider03_04,,
9,7544,80093241,Tr????ng An Diå÷??m,10003205,2,1,10003213,19/04/2022,8/04/2022,Rider03,RC,Rider03_04,,
41,7544,80063166,Tr????ng An Diå÷??m,10003205,15,3,10003213,19/04/2022,8/04/2022,Rider03,RC,Rider03_04,,
7,13004,80063166,Tr????ng An Diå÷??m,10003205,18,2,10003205,21/02/2023,5/01/2023,Rider03,RC,Rider03_04,,
42,13005,80063166,Tr????ng An Diå÷??m,10003205,19,3,10003213,21/02/2023,5/01/2023,Rider03,RC,Rider03_04,,
0,13097,80063166,Tr????ng An Diå÷??m,10003205,21,2,10003205,24/02/2023,6/02/2023,Rider03,RC,Rider03_04,,
10,12881,80093241,Tr????ng An Diå÷??m,10003205,3,1,10003213,14/02/2023,6/02/2023,Rider03,RC,Rider03_04,,
15,12880,80063166,Tr????ng An Diå÷??m,10003205,16,1,10053675,14/02/2023,6/02/2023,Rider03,RC,Rider03_04,,
44,13096,80063166,Tr????ng An Diå÷??m,10003205,20,3,10003213,24/02/2023,6/02/2023,Rider03,RC,Rider03_04,,
4,26741,80063166,Tr????ng An Diå÷??m,10003205,27,2,10003205,6/12/2023,22/10/2023,Rider03,RC,Rider03_04,,
5,24564,80063166,Tr????ng An Diå÷??m,10003205,26,2,10003205,1/11/2023,28/10/2023,Rider03,RC,Rider03_04,,
这就是我的函数的样子。
def flag_date_within_m_months(group, col_label, col_date):
group[col_date] = pd.to_datetime(group[col_date])
group = group.sort_values(by=col_date) # Sort by date
group[col_label] = False
# Create a rolling window to count claims
for i in range(len(group)):
current_date = group[col_date].iloc[i]
window_start = current_date - pd.DateOffset(months=m)
# Count the number of claims within the window
count_within_window = group[(group[col_date] <= current_date) & (group[col_date] >= window_start)].shape[0]
if count_within_window > n:
# Flag all claims in the window
group.loc[(group[col_date] <= current_date) & (group[col_date] >= window_start), col_label] = True
print(f"Flagging claims between {window_start} and {current_date}") # Debug output
return group
但是,当我按照以下方式使用它时,我没有得到我期望的结果。以下代码块的目的是标记客户 ('Client_number's_PO') 在 [m] 个月内(发生的/事件日期)经手动检查,有许多行应标记为 True,但当我运行此命令时,所有行都是 False。:
# Control parameters
n = 5
m = 6
# Create an index so that every row with the same 'Client_number's_PO' and 'Benefit/component_code' has the same index number.
claim_data["1. Index"] = claim_data_m.groupby(["Client_number's_PO", "Benefit/component_code"]).ngroup()
# Flag the rows based on the condition
claim_data["1"] = False
claim_data.groupby("1. Index").apply(flag_date_within_m_months, col_label="1", col_date="Incur Date")
请帮我解决这个问题。
我尝试了上面描述的方法,我期望在 m 个月内出现超过 n 次且具有相同“Client_number's_PO”和“Benefit/component_code”的行在“1”列中被标记为 True。例如,对于 Client_number's_PO 10003205 和 Benefit/component_code Rider03_04 的行,我希望结果如下所示(如果基于“发生日期”在 6 个月的窗口内有超过 5 项索赔,则为 TRUE):
Record_ID,Event_ID,Policy No,Claim Name,Client_number's_PO,Claim No,Life_Assured,Client_number's_LA,Received Date,Incur Date,Product_code,Claim_type,Benefit/component_code,1,
12,40,80063166,Tr????ng An Diå÷??m,10003205,1,1,10053675,29/11/2019,10/11/2019,Rider03,RC,Rider03_04,FALSE,
1,126,80063166,Tr????ng An Diå÷??m,10003205,2,2,10003205,15/05/2020,4/05/2020,Rider03,RC,Rider03_04,TRUE,
19,188,80063166,Tr????ng An Diå÷??m,10003205,5,2,10003205,3/07/2020,6/06/2020,Rider03,RC,Rider03_04,TRUE,
11,189,80063166,Tr????ng An Diå÷??m,10003205,4,1,10053675,3/07/2020,7/06/2020,Rider03,RC,Rider03_04,TRUE,
16,181,80069631,Tr????ng An Diå÷??m,10003205,2,1,10073684,2/07/2020,7/06/2020,Rider03,RC,Rider03_04,TRUE,
3,610,80063166,Tr????ng An Diå÷??m,10003205,6,2,10003205,29/10/2020,7/10/2020,Rider03,RC,Rider03_04,TRUE,
14,611,80063166,Tr????ng An Diå÷??m,10003205,7,1,10053675,29/10/2020,18/10/2020,Rider03,RC,Rider03_04,TRUE,
2,1014,80063166,Tr????ng An Diå÷??m,10003205,8,2,10003205,20/12/2020,26/11/2020,Rider03,RC,Rider03_04,TRUE,
17,1015,80069631,Tr????ng An Diå÷??m,10003205,4,1,10073684,20/12/2020,4/12/2020,Rider03,RC,Rider03_04,TRUE,
13,3687,80063166,Tr????ng An Diå÷??m,10003205,9,1,10053675,21/09/2021,7/09/2021,Rider03,RC,Rider03_04,TRUE,
18,3689,80069631,Tr????ng An Diå÷??m,10003205,5,1,10073684,21/09/2021,7/09/2021,Rider03,RC,Rider03_04,TRUE,
20,3690,80069631,Tr????ng An Diå÷??m,10003205,6,1,10073684,21/09/2021,16/09/2021,Rider03,RC,Rider03_04,TRUE,
6,5635,80063166,Tr????ng An Diå÷??m,10003205,13,2,10003205,25/12/2021,8/11/2021,Rider03,RC,Rider03_04,TRUE,
8,5637,80093241,Tr????ng An Diå÷??m,10003205,1,1,10003213,25/12/2021,8/11/2021,Rider03,RC,Rider03_04,TRUE,
43,5634,80063166,Tr????ng An Diå÷??m,10003205,12,3,10003213,25/12/2021,8/11/2021,Rider03,RC,Rider03_04,TRUE,
9,7544,80093241,Tr????ng An Diå÷??m,10003205,2,1,10003213,19/04/2022,8/04/2022,Rider03,RC,Rider03_04,FALSE,
41,7544,80063166,Tr????ng An Diå÷??m,10003205,15,3,10003213,19/04/2022,8/04/2022,Rider03,RC,Rider03_04,FALSE,
7,13004,80063166,Tr????ng An Diå÷??m,10003205,18,2,10003205,21/02/2023,5/01/2023,Rider03,RC,Rider03_04,TRUE,
42,13005,80063166,Tr????ng An Diå÷??m,10003205,19,3,10003213,21/02/2023,5/01/2023,Rider03,RC,Rider03_04,TRUE,
0,13097,80063166,Tr????ng An Diå÷??m,10003205,21,2,10003205,24/02/2023,6/02/2023,Rider03,RC,Rider03_04,TRUE,
10,12881,80093241,Tr????ng An Diå÷??m,10003205,3,1,10003213,14/02/2023,6/02/2023,Rider03,RC,Rider03_04,TRUE,
15,12880,80063166,Tr????ng An Diå÷??m,10003205,16,1,10053675,14/02/2023,6/02/2023,Rider03,RC,Rider03_04,TRUE,
44,13096,80063166,Tr????ng An Diå÷??m,10003205,20,3,10003213,24/02/2023,6/02/2023,Rider03,RC,Rider03_04,TRUE,
4,26741,80063166,Tr????ng An Diå÷??m,10003205,27,2,10003205,6/12/2023,22/10/2023,Rider03,RC,Rider03_04,FALSE,
5,24564,80063166,Tr????ng An Diå÷??m,10003205,26,2,10003205,1/11/2023,28/10/2023,Rider03,RC,Rider03_04,FALSE,
import pandas as pd
def flag_date_within_m_months(group, col_label, col_date, n_claims=5, months_window=6):
"""
Flag rows within a group where there are more than n claims within m months.
Parameters:
-----------
group : pandas.DataFrame
Group of claims to analyze
col_label : str
Name of the column to store the flag
col_date : str
Name of the column containing dates
n_claims : int, default=5
Number of claims threshold
months_window : int, default=6
Number of months to look back
Returns:
--------
pandas.DataFrame
Group with added flag column
"""
# Convert dates and sort
group = group.copy() # Prevent SettingWithCopyWarning
group[col_date] = pd.to_datetime(group[col_date], format='%d/%m/%Y')
group = group.sort_values(by=col_date)
# Initialize flag column
group[col_label] = False
# For each date, look back m months and count claims
for i in range(len(group)):
current_date = group[col_date].iloc[i]
window_start = current_date - pd.DateOffset(months=months_window)
# Find all dates in the window
mask = (group[col_date] >= window_start) & (group[col_date] <= current_date)
claims_in_window = sum(mask)
# If more than n claims in window, flag all claims in that window
if claims_in_window > n_claims:
group.loc[mask, col_label] = True
return group
# Usage example:
def process_claims_data(df, n_claims=5, months_window=6):
"""
Process the entire claims dataset to flag high-frequency claims.
Parameters:
-----------
df : pandas.DataFrame
Claims data
n_claims : int, default=5
Number of claims threshold
months_window : int, default=6
Number of months to look back
Returns:
--------
pandas.DataFrame
Processed dataframe with flags
"""
# Create group index
df = df.copy()
df["group_index"] = df.groupby(["Client_number's_PO", "Benefit/component_code"]).ngroup()
# Apply flagging function to each group
result = df.groupby("group_index").apply(
flag_date_within_m_months,
col_label="flag",
col_date="Incur Date",
n_claims=n_claims,
months_window=months_window
).reset_index(drop=True)
return result