我正在尝试在 BigQuery 中创建一个函数,用来封装我们的一些业务逻辑/计算。更重要的是,它需要引用一张由我们的工作人员定期更新的表。该表包含日历日期和其他一些特定值,本例中用到的是“Holiday_Flag”。
此函数的用途是:我们的许多报表程序(其中包含许多 CTE 等)会反复传入两个日期列,函数则返回作业从开始到结束的总延迟(以小时为单位)。
我尝试了很多不同的方法,甚至尝试使用 BigQuery 原生的 ARRAY_AGG 和 UNNEST,但没有成功。
当我从函数中完全删除对另一个表的引用,并硬编码一些虚拟日期时,它就起作用了。
我也尝试借助 GPT 等 AI 工具,但都遇到了相同的错误。
这是我尝试过的最新代码,我对 BigQuery 还很陌生,最近才发现 UDF 限制。我确实明白,让一个函数对一百万行表执行计算并引用源表一百万次是效率不高的。
我的时间不多了,需要一些指导。当我把一些虚拟日期和标志硬编码到 CTE 中时,函数甚至可以正常工作,但改用查找表就失败了。
这是代码。
staff_time 使用:cal_date 只是 yyyy-mm-dd,holiday_flag 只是“Y”或“N”
-- Total delay between ip_start_date and ip_end_date, in working hours.
--
-- A date counts as a working day when it is Monday-Friday and is not flagged
-- holiday_flag = 'Y' in `dataset.staff_time`. The arithmetic below assumes a
-- 9-hour working day running 08:00-17:00 (see the 17, 8 and 9 constants).
--
-- Parameters:
--   ip_start_date  TIMESTAMP  start of the job
--   ip_end_date    TIMESTAMP  end of the job
-- Returns:
--   NUMERIC  start-day hours + end-day hours + 9 * working days in between,
--            or, when both timestamps fall on the same date, the elapsed
--            hours on that date (0 if that date is not a working day).
--
-- NOTE(review): the accompanying write-up reports that the function fails as
-- soon as it reads the lookup table. This revision fixes the arithmetic only;
-- it still reads the table through subqueries -- confirm against BigQuery's
-- SQL UDF limitations before relying on it.
CREATE OR REPLACE FUNCTION `my.gcp.function`(ip_start_date TIMESTAMP, ip_end_date TIMESTAMP) RETURNS NUMERIC AS (
(
WITH holiday_array AS (
  -- Whole calendar table packed into a single array of (cal_date, holiday_flag).
  SELECT
    ARRAY_AGG(STRUCT(DATE(cal_date) AS cal_date, holiday_flag)) AS holidays
  FROM
    `dataset.staff_time`
),
holiday_dates AS (
  -- Dates flagged as holidays. NULL dates are dropped so that the NOT IN
  -- checks below can never evaluate to NULL.
  SELECT
    h.cal_date AS holiday_date
  FROM
    holiday_array
  CROSS JOIN
    UNNEST(holiday_array.holidays) AS h
  WHERE
    h.holiday_flag = 'Y'
    AND h.cal_date IS NOT NULL
),
working_days AS (
  -- Working days from the start date up to the day before the end date,
  -- minus one. SUM (not COUNT) is deliberate: with no qualifying day the
  -- result is NULL, which the final SELECT turns into 0 rather than -1.
  SELECT
    CASE
      WHEN DATE(ip_start_date) <> DATE(ip_end_date) THEN SUM(1) - 1
      ELSE NULL
    END AS working_day
  FROM
    UNNEST(
      GENERATE_DATE_ARRAY(
        SAFE_CAST(ip_start_date AS DATE),
        DATE_SUB(SAFE_CAST(ip_end_date AS DATE), INTERVAL 1 DAY),
        INTERVAL 1 DAY
      )
    ) AS cal_date
  WHERE
    cal_date NOT IN (SELECT holiday_date FROM holiday_dates)
    AND EXTRACT(DAYOFWEEK FROM cal_date) NOT IN (1, 7)
),
start_date AS (
  -- Start timestamp when it falls on a working day, otherwise NULL.
  -- (NULL replaces the former CURRENT_TIMESTAMP() sentinel, which wrongly
  -- matched any real timestamp dated today.)
  SELECT
    CASE
      WHEN DATE(ip_start_date) NOT IN (SELECT holiday_date FROM holiday_dates)
        AND EXTRACT(DAYOFWEEK FROM DATE(ip_start_date)) NOT IN (1, 7) THEN ip_start_date
      ELSE NULL
    END AS ip_start_date_1
),
end_date AS (
  -- End timestamp when it falls on a working day, otherwise NULL.
  SELECT
    CASE
      WHEN DATE(ip_end_date) NOT IN (SELECT holiday_date FROM holiday_dates)
        AND EXTRACT(DAYOFWEEK FROM DATE(ip_end_date)) NOT IN (1, 7) THEN ip_end_date
      ELSE NULL
    END AS ip_end_date_1
),
start_working_hours AS (
  -- Hours from the start time until 17:00 on the start day (multi-day spans only).
  SELECT
    CASE
      -- Was DATE(ip_end_date) <> DATE(ip_end_date), which can never be true.
      WHEN DATE(ip_start_date) <> DATE(ip_end_date) THEN
        CASE
          WHEN ip_start_date_1 IS NOT NULL THEN 17 - ROUND(
            EXTRACT(HOUR FROM ip_start_date_1) + (EXTRACT(MINUTE FROM ip_start_date_1) / 60),
            2
          )
          ELSE 0
        END
    END AS start_working_hour
  FROM
    start_date
),
end_working_hours AS (
  -- Hours from 08:00 until the end time on the end day (multi-day spans only).
  SELECT
    CASE
      WHEN DATE(ip_start_date) <> DATE(ip_end_date) THEN
        CASE
          WHEN ip_end_date_1 IS NOT NULL THEN ROUND(
            EXTRACT(HOUR FROM ip_end_date_1) + EXTRACT(MINUTE FROM ip_end_date_1) / 60 - 8,
            2
          )
          ELSE 0
        END
    END AS end_working_hour
  FROM
    end_date
),
when_delay_on_same_day1 AS (
  -- Elapsed hours when both timestamps fall on the same working day.
  SELECT
    CASE
      WHEN DATE(ip_start_date) = DATE(ip_end_date)
        AND DATE(ip_start_date) NOT IN (SELECT holiday_date FROM holiday_dates)
        AND EXTRACT(DAYOFWEEK FROM DATE(ip_start_date)) NOT IN (1, 7) THEN ROUND(
          -- 3600 seconds per hour (was 10000).
          ABS(TIMESTAMP_DIFF(ip_end_date, ip_start_date, SECOND)) / 3600,
          3
        )
      ELSE NULL
    END AS when_delay_on_same_day
)
-- Every CTE below yields exactly one row, so the cross joins yield one row.
SELECT
  CAST(
    IFNULL(start_working_hour, 0)
    + IFNULL(end_working_hour, 0)
    + IFNULL(working_day, 0) * 9
    + IFNULL(when_delay_on_same_day, 0) AS NUMERIC
  ) AS final_working_hour
FROM
  start_working_hours
CROSS JOIN
  end_working_hours
CROSS JOIN
  working_days
CROSS JOIN
  when_delay_on_same_day1
)
);
我也尝试过在 working_days CTE 上使用连接,但仍然没有成功。
working_days AS (
  -- Join-based variant: working days from the start date up to the day before
  -- the end date, minus one. A day is kept when it has no matching holiday
  -- row (anti-join) and is not a Saturday or Sunday.
  -- SUM (not COUNT) is deliberate: with no qualifying day the result is NULL.
  SELECT
    CASE
      WHEN DATE(ip_start_date) <> DATE(ip_end_date) THEN SUM(1) - 1
      ELSE NULL
    END AS working_day,
    ip_start_date,
    ip_end_date
  FROM
    UNNEST(
      GENERATE_DATE_ARRAY(
        SAFE_CAST(ip_start_date AS DATE),
        DATE_SUB(SAFE_CAST(ip_end_date AS DATE), INTERVAL 1 DAY),
        INTERVAL 1 DAY
      )
    ) AS span_date
  LEFT JOIN (
    -- holiday_array exposes a single ARRAY column (holidays); it has to be
    -- unnested, and restricted to flagged rows, before it can be joined by date.
    SELECT
      h.cal_date
    FROM
      holiday_array
    CROSS JOIN
      UNNEST(holiday_array.holidays) AS h
    WHERE
      h.holiday_flag = 'Y'
  ) AS ot
    ON span_date = ot.cal_date
  WHERE
    ot.cal_date IS NULL
    AND EXTRACT(DAYOFWEEK FROM span_date) NOT IN (1, 7)
),
非常感谢您的帮助
在此 SQL 示例中,只把假期日期与该表连接。即使对于 100 万行,这也可以在 3 分钟内完成。如果开始日期和结束日期颠倒了,请在一个额外的 CTE 中修复此问题。
CTE holiday_tbl 是一张包含所有不在周末的假期日期的表。
CTE tbl1 统计落在时间范围内的假期数。
CTE tbl2 将落在周末的开始和结束日期移至下周一或上周五下午。
tbl3 计算所选时间范围内的 working_days(周一至周五,不考虑节假日)。
start_minutes 和 ending_minutes 是给定时间范围第一天和最后一天的工作分钟数。
这里做了一些修正,使得只累加正的时间,且每天最多 9 小时。
每天的工作时间是 9 个小时。我们需要从工作日数中减去假期数,还要再减去一天,因为开始日和结束日是单独计算的:
9*(working_days-1-holidays)
。如果第一天是假期,那么它会作为假期从计算中被减去一天。这是不对的,因为第一天本来就没有计入工作日。为了修正这一点,我们加回 9 个小时。如果第一天不是假期,我们就加上该天的工作时间(start_minutes/60)。最后一天也做同样的处理。
关于 working_days 的说明:任务是找出时间范围内周末的个数。为此,我们用 date_trunc(date(start_),ISOWEEK) 找到开始日期所在周的周一,
它与结束日期之间相差的周数就是周末的个数。由于开始和结束日期已被调整到工作日,所以需要从天数差中减去周末个数的两倍,才能得到该时间范围内的工作日数。
-- Working hours between two timestamps, counting 08:00-17:00 (9 hours) on
-- Monday-Friday and skipping weekday holidays listed in Test.staff_time.
--
-- Parameters:
--   start   TIMESTAMP  beginning of the range
--   ending  TIMESTAMP  end of the range
-- Returns a STRUCT whose first field, working_hours, is the result; the other
-- fields (holidays, start_on_holiday, ending_on_holiday, start_, ending_,
-- working_days, start_minutes, ending_minutes) expose the intermediate values.
CREATE TEMP FUNCTION working_hours(start TIMESTAMP, ending TIMESTAMP) AS ((
  WITH
  -- Holidays that fall on a weekday; weekend days are excluded separately.
  holiday_tbl AS (
    SELECT
      cal_date
    FROM Test.staff_time
    WHERE holiday_flag = 'Y'
      AND EXTRACT(DAYOFWEEK FROM cal_date) NOT IN (1, 7)
  ),
  -- Holiday count inside the range, and whether either end is itself a holiday.
  tbl1 AS (
    SELECT
      COUNT(cal_date) AS holidays,
      IFNULL(MAX(cal_date = DATE(start)), FALSE) AS start_on_holiday,
      IFNULL(MAX(cal_date = DATE(ending)), FALSE) AS ending_on_holiday
    FROM holiday_tbl
    WHERE DATE(cal_date) BETWEEN DATE(start) AND DATE(ending)
  ),
  -- Move weekend endpoints onto the nearest working day inside the range.
  tbl2 AS (
    SELECT
      holidays,
      start_on_holiday,
      ending_on_holiday,
      CASE EXTRACT(DAYOFWEEK FROM start)
        -- A weekend start moves to Monday 00:00 so that the whole Monday is
        -- counted. (The Sunday branch used to add 17 hours, which dropped the
        -- Monday and disagreed with the Saturday branch.)
        WHEN 1 THEN TIMESTAMP(DATE(start) + 1)  -- Sunday -> Monday
        WHEN 7 THEN TIMESTAMP(DATE(start) + 2)  -- Saturday -> Monday
        ELSE start
      END AS start_,
      CASE EXTRACT(DAYOFWEEK FROM ending)
        -- A weekend end moves back to Friday 17:00 so that the whole Friday is counted.
        WHEN 1 THEN TIMESTAMP_ADD(TIMESTAMP(DATE(ending) - 2), INTERVAL 17 HOUR)  -- Sunday -> Friday
        WHEN 7 THEN TIMESTAMP_ADD(TIMESTAMP(DATE(ending) - 1), INTERVAL 17 HOUR)  -- Saturday -> Friday
        ELSE ending
      END AS ending_
    FROM tbl1
  ),
  tbl3 AS (
    SELECT
      holidays,
      start_on_holiday,
      ending_on_holiday,
      start_,
      ending_,
      -- Calendar days between the endpoints minus two days per weekend crossed.
      -- Weekends are counted as WEEK boundaries between the Monday of the
      -- start week and the end date.
      DATE_DIFF(DATE(ending_), DATE(start_), DAY)
        - 2 * DATE_DIFF(DATE(ending_), DATE_TRUNC(DATE(start_), ISOWEEK), WEEK) AS working_days,
      -- Minutes from the start time to 17:00, clamped to [0, 540].
      LEAST(GREATEST(TIME_DIFF(TIME(17, 0, 0), TIME(start_), MINUTE), 0), 9 * 60) AS start_minutes,
      -- Minutes from 08:00 to the end time, clamped to [0, 540].
      LEAST(GREATEST(TIME_DIFF(TIME(ending_), TIME(8, 0, 0), MINUTE), 0), 9 * 60) AS ending_minutes
    FROM tbl2
  )
  SELECT
    STRUCT(
      -- Full days strictly between the endpoints, plus the partial first and
      -- last day. An endpoint that is a holiday is already subtracted via
      -- `holidays`, so 9 hours are added back instead of its partial hours.
      9 * (working_days - 1 - holidays)
        + IF(start_on_holiday, 9, start_minutes / 60)
        + IF(ending_on_holiday, 9, ending_minutes / 60) AS working_hours,
      holidays,
      start_on_holiday,
      ending_on_holiday,
      start_,
      ending_,
      working_days,
      start_minutes,
      ending_minutes
    )
  FROM tbl3
));
-- Demo driver: runs working_hours over a handful of hand-picked ranges.
WITH test AS (
  SELECT
    TIMESTAMP("2023-01-05 12:55:00 UTC") AS start,
    TIMESTAMP("2023-01-07 14:55:00 UTC") AS ending
  UNION ALL
  SELECT
    TIMESTAMP("2023-01-05 12:55:00 UTC") AS start,
    TIMESTAMP("2023-02-07 14:55:00 UTC")
  UNION ALL
  SELECT
    TIMESTAMP("2023-01-01 12:55:00 UTC") AS start,
    TIMESTAMP("2023-02-07 14:55:00 UTC")
  UNION ALL
  SELECT
    TIMESTAMP("2023-01-04 14:00:00 UTC"),
    TIMESTAMP("2023-01-04 16:00:00 UTC")
),
-- 1,000,000 random ranges around the current time. Defined here but not read
-- by the final SELECT; point the FROM clause at test2 to use it.
test2 AS (
  SELECT
    id,
    TIMESTAMP_SUB(
      CURRENT_TIMESTAMP(),
      INTERVAL CAST((RAND() * 100 - 5) * 3600 * 24 AS INT64) SECOND
    ) AS start,
    TIMESTAMP_SUB(
      CURRENT_TIMESTAMP(),
      INTERVAL CAST((RAND() * 100 - 10) * 3600 * 24 AS INT64) SECOND
    ) AS ending
  FROM UNNEST(GENERATE_ARRAY(1, 1000 * 1000)) AS id
)
SELECT
  *,
  working_hours(start, ending)
FROM test