我正在使用 PostGIS 来计算两类员工之间的互动时长。我对每个区域 id 执行相同的操作集。这是我的代码:
CREATE OR REPLACE FUNCTION public.interactions_looped_ucmo_aic(areaid_array character varying[], d_date date, start_time__ timestamp without time zone, end_time__ timestamp without time zone)
RETURNS TABLE(area_id uuid, "createdAt_" timestamp without time zone, "updatedAt_" timestamp without time zone, "ID_" character varying, parent_imei character varying, child_imei character varying, parent_ping_time timestamp without time zone, child_ping_time timestamp without time zone, maxima timestamp without time zone, minima timestamp without time zone, "interactionID" character varying, interaction_type character varying)
LANGUAGE plpgsql AS $function$
DECLARE i integer := 1;
BEGIN
FOR i IN 1..array_length(areaid_array, 1) LOOP
RETURN QUERY with pings as (
select * from -- ================= SELECTING Parent INFO
(select p.imei,
st_transform(p."geoJson", 32643) as geom,
"generatedAt" as t,
p."ID" as u_ping_id
from campaign_pings p
where "generatedAt" + interval '5 hours' between start_time__ and end_time__
and p.imei in (/*multiple left joins*/)
) u
left join ( -- ================= SELECTING child INFO
SELECT p.imei,
p."ID" as a_ping_id,
st_transform(p."geoJson", 32643) as geom,
"generatedAt" as t
FROM campaign_pings p
where "generatedAt"+ interval '5 hours' between start_time__ and end_time__
and p.imei in (/*multiple left joins*/)
) a on ST_DWithin(u."geom", a."geom", 30)
and a.t BETWEEN u.t - INTERVAL '30 seconds'
and u.t + INTERVAL '30 seconds'
where
a.imei is not null
)/*more code to calculate length of continuous interactions in minutes*/;
END LOOP;
END $function$;
所以本质上,我对每个区域 id 执行相同的一组操作。有没有办法利用这里的缓存来减少执行时间?我的意思是强制使用缓存。
P.S - 这是完整的功能:
CREATE OR REPLACE FUNCTION public.interactions_looped_ucmo_aic(areaid_array character varying[], d_date date, start_time__ timestamp without time zone, end_time__ timestamp without time zone)
RETURNS TABLE(area_id uuid, "createdAt_" timestamp without time zone, "updatedAt_" timestamp without time zone, "ID_" character varying, parent_imei character varying, child_imei character varying, parent_ping_time timestamp without time zone, child_ping_time timestamp without time zone, maxima timestamp without time zone, minima timestamp without time zone, "interactionID" character varying, interaction_type character varying)
LANGUAGE plpgsql
AS $function$
DECLARE
i integer := 1;
BEGIN
FOR i IN 1..array_length(areaid_array, 1) LOOP
RETURN QUERY
--SELECT areaid_array[i], d_date;
with pings as ( -- ================= SELECTING JOINED data with join conditions -- ================= -- ================= -- ================= -- ================= -- =================
select
u."imei" as u_imei,
a."imei" as a_imei,
u.t as u_ping_time,
a.t as a_ping_time,
u_ping_id,
a_ping_id
from
(-- ================= SELECTING Parent INFO [imei, geom, generatedAt, pingID] -- ================= -- ================= -- ================= -- ================= -- =================
select
p.imei,
st_transform(p."geoJson", 32643) as geom,
"generatedAt" as t,
p."ID" as u_ping_id
from
campaign_pings p
where --"createdAt"::date = d_date--d_date
"generatedAt" + interval '5 hours' between start_time__ and end_time__
and
p.imei in (
select
assets."IMEI"
from
campaign_staff staff
left join campaign_staff_info csi on csi."campaignStaffID"=staff."ID" and csi."date" = (select max(date) from campaign_staff_info csi2 where csi2."campaignStaffID" = csi."campaignStaffID" and csi2.date <= d_date)
left join campaign_staff_assets csa on csa."campaignStaffID" = staff."ID" and csa."date" = (select max(date) from campaign_staff_assets cs2 where cs2."campaignStaffID"= csa."campaignStaffID" and cs2.date<=d_date)
left join campaign_area_assets caa on caa."assetID" = csa."assetID"
and caa."campaignScopeAreaID" = csa."campaignScopeID"
left join assets assets on assets."ID" = caa."assetID"
left join staff_type staff_type ON staff_type."ID" = staff."staffTypeID"
left join campaign_asset_status cas on cas."campaignStaffID" = staff."ID" and cas."date" = (select max(date) from campaign_asset_status cas2 where cas2."campaignStaffID"= cas."campaignStaffID" and cas2.date<=d_date)
left join campaign_scope cs on cs."areaID" = caa."campaignScopeAreaID"
where
caa."campaignScopeAreaID" = areaid_array[i]::uuid --'0156276d-29e3-4da8-bc76-39d6ad3bf1f6'
and assets."IMEI" is not null
and staff_type."ID" = '38eb4157-c0e8-4334-ad79-9801610c4ab8' -- UCMO ID
and (cas.status is null or cas.status <> 'DeployedWoPhone')
and ((csi.number is null) or (csi.number <> '0'))
and d_date between cs."campaignStartDate" and cs."catchupEndDate"
)
) u
left join ( -- ================= SELECTING child INFO [imei, geom, generatedAt, pingID] -- ================= -- ================= -- ================= -- ================= -- =================
SELECT
p.imei,
p."ID" as a_ping_id,
st_transform(p."geoJson", 32643) as geom,
"generatedAt" as t
FROM
campaign_pings p
where --"createdAt"::date = d_date --d_date
"generatedAt"+ interval '5 hours' between start_time__ and end_time__
and
p.imei in (
select
assets."IMEI"
from
campaign_staff staff
left join campaign_staff_assets csa on csa."campaignStaffID" = staff."ID" and csa."date" = (select max(date) from campaign_staff_assets cs2 where cs2."campaignStaffID"= csa."campaignStaffID" and cs2.date<=d_date)
left join campaign_area_assets caa on caa."assetID" = csa."assetID"
and caa."campaignScopeAreaID" = csa."campaignScopeID"
left join assets assets on assets."ID" = caa."assetID"
left join staff_type staff_type ON staff_type."ID" = staff."staffTypeID"
left join campaign_asset_status cas on cas."campaignStaffID" = staff."ID" and cas."date" = (select max(date) from campaign_asset_status cas2 where cas2."campaignStaffID"= cas."campaignStaffID" and cas2.date<=d_date)
left join campaign_scope cs on cs."areaID" = caa."campaignScopeAreaID"
where
caa."campaignScopeAreaID" = areaid_array[i]::uuid
and assets."IMEI" is not null
and staff_type."ID" = '93313048-4e75-4e91-9bcd-44897a6df7c7' -- AIC ID
and (cas.status is null or cas.status <> 'DeployedWoPhone')
and d_date between cs."campaignStartDate" and cs."catchupEndDate"
)
) a on ST_DWithin(u."geom", a."geom", 30)
and a.t BETWEEN u.t - INTERVAL '30 seconds'
and u.t + INTERVAL '30 seconds'
where
a.imei is not null
)
,
ranked_interactions AS (
SELECT
u_imei,
a_imei,
a_ping_time +interval '5 hours' as a_ping_time,
u_ping_time+interval '5 hours' as u_ping_time,
(LEAD(u_ping_time) OVER (PARTITION BY u_imei, a_imei ORDER BY u_ping_time)) + interval '5 hours' AS next_parent_time,-- OG
ROW_NUMBER() OVER (PARTITION BY u_imei, a_imei ORDER BY u_ping_time) AS interaction_rank
FROM
pings
)
,
dummy as (
select *,next_parent_time-u_ping_time as time_diff from ranked_interactions
)
,
periods as (
select *, case when time_diff < interval '3 minute' then 'normal'
when time_diff >= interval '3 minute' then 'abnormal'
end as periods
from dummy )
,
ranked_data AS (
SELECT
*,
LEAD(a_ping_time) OVER (PARTITION BY u_imei, a_imei ORDER BY a_ping_time) AS start_time,
max(a_ping_time) over (PARTITION BY u_imei, a_imei) as max_time
FROM periods
),
f_data AS (
SELECT
*
FROM ranked_data
WHERE periods = 'abnormal'
union
select * FROM ranked_data
where interaction_rank=1
)
,
format as (
select *,lead(a_ping_time) over (PARTITION BY u_imei, a_imei) as duration_calc
from f_data
)
,
results as (
select *,
duration_calc-start_time as duration from format
where interaction_rank <> 1 and duration_calc is not null
union
select *,
-- duration_calc - a_ping_time as duration from format
duration_calc - start_time as duration from format---changedcode
where interaction_rank = 1
union
select *, max_time - start_time as duration from format
where duration_calc is null
order by u_imei,a_imei,interaction_rank
)
,
results_filtered as (
select * from results where duration>= interval '3 minute'
)
,
--parent_imei, child_imei, parent_ping_time, child_ping_time, maxima, minima, "interactionID", "forUcmo"
adding as (
select
--null as created_at,null as updated_at,null as id,
--results_filtered.*,
*,
case when interaction_rank = 1 and periods='normal'
then start_time - interval '1 minute'
else start_time
end as start_time_final from results_filtered)
select areaid_array[i]::uuid as area_id_,
NOW()::TIMESTAMP WITHOUT TIME zone as createdat,NOW()::TIMESTAMP WITHOUT TIME ZONE as updatedat,
CONCAT(adding.start_time_final, '-', adding.a_imei, '-', adding.u_imei,'-',NOW())::varchar as id,
--adding.created_at,
--adding.updated_at,
--adding.id,
adding.u_imei::varchar as parent_imei_ ,
adding.a_imei::varchar as child_imei_,
adding.u_ping_time as parent_ping_time_,
adding.a_ping_time as child_ping_time_,
case when adding.interaction_rank=1 and adding.periods='normal'
then adding.start_time_final+duration+interval '1 minute'
else adding.start_time_final+duration
end as maxima_,
adding.start_time_final as minima_,
--adding.u_imei+adding.a_imei+duration as "interactionID_",
CONCAT(adding.u_imei, '-', adding.a_imei, '-', duration)::varchar as "interactionID_",
'UA'::varchar as interaction_type
--adding.duration,adding.start_time_final
from adding
;
END LOOP;
END
$function$
;
这个例子是不可重现的,所以我只是将一堆评论编译成对如何加快速度这一基本问题的某种答案,加上一些随机提示,没有特定的顺序:
int
迭代器,然后为数组添加下标。您可以直接迭代数组
CREATE OR REPLACE FUNCTION public.interactions_looped_ucmo_aic(areaid_array uuid[]
--...
DECLARE current_areaid uuid;
BEGIN FOREACH current_areaid IN ARRAY areaid_array LOOP RETURN QUERY
--...
WHERE caa."campaignScopeAreaID" = current_areaid
null
时),否则请使用 concat_ws()
。它允许您指定一次分隔符,然后仅列出要由其分隔的字段。
CONCAT_WS('-',adding.start_time_final, adding.a_imei, adding.u_imei,NOW())::varchar as id,
campaign_staff
的大小驱动了 imei
列表子查询,然后您将其过滤掉。您可能希望确保它由最小的表驱动,并且尽早过滤掉内容。在某种程度上,Postgres 会尝试重新安排您的连接和条件,尝试自动执行此操作,但有这么多东西可能不会有效。campaign_staff_info
、campaign_staff_assets
和 campaign_asset_status
,然后使用同一个表中的子查询对其进行过滤。您也可以直接加入子查询并在其中进行过滤。您还可以尝试将这些 max(date)
与 csi."campaignStaffID"
一起预取,然后加入到该预取的静态临时表中 - 如果保留未完全过滤的集合中的附加列并不多花费一定的费用,您可以通过在循环之前执行一次而不是在每个循环中重复两次来获得一些性能。areaid
的普通 SQL 函数,然后让另一个常规 SQL 函数将其包装以允许接受数组,只需 unnest()
并为每个元素调用主函数。 PostgreSQL 可能会更好地分配工作,而不一定将其全部放入队列中,其中每个区域都必须等待前面的区域完成。它甚至可能会分配一些并行工作人员并同时评估每个 areaid
的所有查询。