我正在尝试实现一个解决方案:找出在某个警报发生后的给定时间窗口内出现的后续警报。我已经根据 DataFrame 创建了一个 IntervalIndex,并使用 np.vectorize 将每个时间戳与该 IntervalIndex 逐一比较,但对于大型数据集(350000 行)来说速度很慢。有没有办法让它更快?我不知道如何真正地将这个操作向量化。
def find_top_consequential_alarms(
    ui: "frontPage.Application", data: pd.DataFrame, fromAr: bool
):
    """Return the ten alarms that most often follow the selected alarm.

    An alarm is counted as consequential when its ``TimestampUTC`` lies in
    the half-open window ``(t, t + window]`` opened by some occurrence ``t``
    of the selected alarm.

    Args:
        ui: Application window. The alarm name and the window label are read
            from ``ar_alarm_combo`` / ``ar_timewindow_combo`` when ``fromAr``
            is true, otherwise from ``select_conc_alarm`` /
            ``select_conc_time_window``.
        data: Alarm log with a string ``PtName`` column and a datetime
            ``TimestampUTC`` column.
        fromAr: Selects which pair of combo boxes supplies the inputs.

    Returns:
        A ``(counts, title)`` tuple: ``counts`` is the ``value_counts`` Series
        of the (at most) ten most frequent consequential alarm names and
        ``title`` is the matching chart title.

    Raises:
        KeyError: If the selected window label is not a known label.
    """
    # Get selected alarm and time window based on the source
    if fromAr:
        selected_alarm = ui.ar_alarm_combo.currentText()
        selected_time = ui.ar_timewindow_combo.currentText()
    else:
        selected_alarm = ui.select_conc_alarm.currentText()
        selected_time = ui.select_conc_time_window.currentText()

    # Convert selected time to timedelta
    time_map = {
        "5 Minutes": timedelta(minutes=5),
        "10 Minutes": timedelta(minutes=10),
        "15 Minutes": timedelta(minutes=15),
        "30 Minutes": timedelta(minutes=30),
        "1 Hour": timedelta(hours=1),
        "5 Hours": timedelta(hours=5),
        "24 Hours": timedelta(hours=24),
    }
    time_window = time_map[selected_time]

    # Build the mask once and use it for both sides of the split, so a
    # whitespace-padded occurrence of the selected alarm is never treated as
    # one of the "other" alarms.
    is_selected = data["PtName"].str.strip() == selected_alarm
    window_starts = pd.DatetimeIndex(
        data.loc[is_selected, "TimestampUTC"].dropna()
    ).sort_values()
    other_alarms = data[~is_selected]
    other_times = pd.DatetimeIndex(other_alarms["TimestampUTC"])

    # For each timestamp, locate the latest window start strictly before it.
    # If that window has already closed, every earlier one has closed too, so
    # a single binary search per row replaces the scan over all intervals.
    prior = window_starts.searchsorted(other_times, side="left") - 1
    has_prior = prior >= 0
    mask = np.zeros(len(other_times), dtype=bool)
    elapsed = other_times[has_prior] - window_starts[prior[has_prior]]
    mask[has_prior] = elapsed <= time_window
    consequential_alarms = other_alarms[mask]

    # Count the occurrences of each alarm within the time windows and get the top 10
    consequential_alarm_counts = consequential_alarms["PtName"].value_counts().head(10)
    title = f"Top 10 Consequential Alarms for {selected_alarm}\n(Time Window: {time_window})"
    return consequential_alarm_counts, title
完整的函数如上。
# Filter for other alarms that fall within any of the intervals
# (excerpt of the function above; `data`, `selected_alarm` and `intervals`
# are defined there).
other_alarms = data[data["PtName"] != selected_alarm].copy()
# np.vectorize invokes the lambda once per timestamp, and every call tests
# that timestamp against all intervals - this is the slow step.
in_interval = np.vectorize(lambda x: intervals.contains(x).any(), otypes=[bool])
mask = in_interval(other_alarms["TimestampUTC"])
# Keep only the rows whose timestamp fell inside at least one interval.
consequential_alarms = other_alarms[mask]
这是需要改进的部分。
任何帮助将不胜感激。
np.vectorize 本质上是逐元素执行的,对于大型数据集来说速度很慢。请改用 pd.merge_asof,它以高效的向量化方式完成基于时间的数据对齐。
import pandas as pd
import numpy as np
from datetime import timedelta
import time
# Build a reproducible synthetic alarm log for the benchmark below.
# The seed and the order of the random calls determine the generated data,
# so keep these statements in this order.
np.random.seed(42)
num_rows = 350000
# One candidate timestamp per minute between 2024-11-01 and 2024-11-10.
timestamps = pd.date_range(start="2024-11-01", end="2024-11-10", freq="1min")
# Draw num_rows timestamps from the candidates (with replacement by default).
random_timestamps = np.random.choice(timestamps, num_rows)
# Names "Alarm1" .. "Alarm99": randint's upper bound is exclusive.
alarm_names = [f"Alarm{np.random.randint(1, 100)}" for _ in range(num_rows)]
data = pd.DataFrame({"PtName": alarm_names, "TimestampUTC": random_timestamps})
def find_top_consequential_alarms(ui, data, fromAr, *, include_simultaneous=True):
    """Return the ten alarms that most often follow the selected alarm.

    Every other alarm is matched with ``pd.merge_asof`` to the latest
    occurrence of the selected alarm that is not later than it; the match is
    kept when the two are at most the selected time window apart.

    Args:
        ui: Application window. The alarm name and the window label are read
            from ``ar_alarm_combo`` / ``ar_timewindow_combo`` when ``fromAr``
            is true, otherwise from ``select_conc_alarm`` /
            ``select_conc_time_window``.
        data: Alarm log with a string ``PtName`` column and a datetime
            ``TimestampUTC`` column.
        fromAr: Selects which pair of combo boxes supplies the inputs.
        include_simultaneous: When true (the default), an alarm with exactly
            the same timestamp as an occurrence of the selected alarm is
            counted. Pass ``False`` to count only strictly later alarms,
            i.e. the window ``(t, t + window]``.

    Returns:
        A ``(counts, title, execution_time)`` tuple: the ``value_counts``
        Series of the (at most) ten most frequent consequential alarm names,
        the chart title, and the seconds spent inside ``pd.merge_asof``.

    Raises:
        KeyError: If the selected window label is not a known label.
    """
    if fromAr:
        selected_alarm = ui.ar_alarm_combo.currentText()
        selected_time = ui.ar_timewindow_combo.currentText()
    else:
        selected_alarm = ui.select_conc_alarm.currentText()
        selected_time = ui.select_conc_time_window.currentText()

    time_map = {
        "5 Minutes": timedelta(minutes=5),
        "10 Minutes": timedelta(minutes=10),
        "15 Minutes": timedelta(minutes=15),
        "30 Minutes": timedelta(minutes=30),
        "1 Hour": timedelta(hours=1),
        "5 Hours": timedelta(hours=5),
        "24 Hours": timedelta(hours=24),
    }
    time_window = time_map[selected_time]

    is_selected = data["PtName"].str.strip() == selected_alarm
    # Only the window start is needed on the right-hand side. PtName is kept
    # so the merge still suffixes the left-hand column as "PtName_x".
    window_starts = (
        data.loc[is_selected, ["PtName", "TimestampUTC"]]
        .rename(columns={"TimestampUTC": "window_start"})
        .sort_values(by="window_start")
    )
    # merge_asof requires both frames to be sorted on their join keys.
    other_alarms = data[~is_selected].sort_values(by="TimestampUTC")

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments.
    started = time.perf_counter()
    matched_alarms = pd.merge_asof(
        other_alarms,
        window_starts,
        left_on="TimestampUTC",
        right_on="window_start",
        direction="backward",
        tolerance=time_window,
        allow_exact_matches=include_simultaneous,
    )
    execution_time = time.perf_counter() - started

    # Rows without a window start within tolerance come back as NaT.
    consequential_alarms = matched_alarms.dropna(subset=["window_start"])
    consequential_alarm_counts = consequential_alarms["PtName_x"].value_counts().head(10)
    title = f"Top 10 Consequential Alarms for {selected_alarm}\n(Time Window: {time_window})"
    return consequential_alarm_counts, title, execution_time
# Run the analysis on the synthetic data and report the result.
# NOTE(review): `ui` is not defined anywhere in this snippet; it has to be
# supplied by the host application before this runs.
result_counts, result_title, execution_time = find_top_consequential_alarms(
    ui, data, fromAr=False
)
for report_line in (
    result_counts,
    result_title,
    f"Execution Time: {execution_time} seconds",
):
    print(report_line)
输出如下:
PtName_x
Alarm19 2954
Alarm36 2951
Alarm41 2951
Alarm31 2939
Alarm15 2933
Alarm75 2932
Alarm83 2932
Alarm40 2932
Alarm88 2927
Alarm54 2927
Name: count, dtype: int64
Top 10 Consequential Alarms for Alarm1
(Time Window: 0:05:00)
Execution Time: 0.022200584411621094 seconds
如您所见,在 350 000 行的数据帧上,执行时间相当短。