GCP - 查询错误:相关子查询 - UDF 中的 BigQuery 限制

问题描述 投票:0回答:1

我正在尝试在 BigQuery 中创建一个函数,用来封装我们的一些业务逻辑/计算。更重要的是,该函数需要引用一张由我们的工作人员定期更新的表。该表包含全年的日期以及其他一些特定值,本例中用到的是“Holiday_Flag”。

此函数的目的是:我们的许多报表程序(其中包含大量 CTE 等)都会传入两个日期列,由该函数返回从作业开始到结束的总延迟(以工作小时为单位)。

我尝试了很多不同的方法,甚至尝试使用 BigQuery 原生的 ARRAY_AGG 和 UNNEST,但都没有成功。

当我从函数中完全删除对另一个表的引用,并硬编码一些虚拟日期时,它就起作用了。

我也尝试过借助 GPT 等 AI 工具,但都遇到了相同的错误。

这是我尝试过的最新代码,我对 BigQuery 还很陌生,最近才发现 UDF 限制。我确实明白,让一个函数对一百万行表执行计算并引用源表一百万次是效率不高的。

我的时间不多了,需要一些指导。当我把一些虚拟日期和标志硬编码到 CTE 中时,这个函数甚至可以正常工作,但一旦改用查找表就失败了。

这是代码。

staff_time 使用:cal_date 只是 yyyy-mm-dd,holiday_flag 只是“Y”或“N”

        -- Returns the delay between ip_start_date and ip_end_date in working hours (NUMERIC).
        --
        -- Rules visible in the code below:
        --   * Saturdays/Sundays (DAYOFWEEK 7 and 1) and dates with holiday_flag = 'Y' in
        --     `dataset.staff_time` are not working days.
        --   * Each full working day between the two dates counts as 9 hours.
        --   * The start day contributes the time remaining until 17:00, the end day the time
        --     elapsed since 08:00; a start/end on a non-working day contributes 0.
        --   * When both timestamps fall on the same working day, the elapsed time between them
        --     is used instead.
        --
        -- Fixes relative to the previous version:
        --   1. start_working_hours compared DATE(ip_end_date) with itself, so the start-day
        --      hours were always NULL; it now compares the start date with the end date.
        --   2. The same-day delay divided seconds by 10000; seconds -> hours is / 3600.
        --   3. CURRENT_TIMESTAMP() was used as the "non-working day" marker, which also zeroed
        --      out genuine start/end dates that happen to be today; NULL is used instead.
        --
        -- NOTE(review): this function still reads `dataset.staff_time` through subqueries inside
        -- a SQL UDF, which is the construct the reported BigQuery error is about - confirm the
        -- function can be created in the target project.
        CREATE OR REPLACE FUNCTION `my.gcp.function`(ip_start_date TIMESTAMP, ip_end_date TIMESTAMP) RETURNS NUMERIC AS (
(
WITH temp_date AS (
    -- Orders the two inputs so that ip_date_01 <= ip_date_02.
    -- Only validated_dates reads from this CTE.
    SELECT
        CASE
            WHEN ip_start_date > ip_end_date THEN DATE(ip_end_date)
            ELSE DATE(ip_start_date)
        END AS ip_date_01,
        CASE
            WHEN ip_start_date > ip_end_date THEN DATE(ip_start_date)
            ELSE DATE(ip_end_date)
        END AS ip_date_02
),
validated_dates AS (
    -- Labels the inputs as valid/invalid. Not referenced by the final SELECT.
    SELECT
        ip_start_date,
        ip_end_date,
        CASE
            WHEN SAFE_CAST(ip_start_date AS DATE) IS NULL THEN 'Invalid start date format'
            WHEN SAFE_CAST(ip_end_date AS DATE) IS NULL THEN 'Invalid end date format'
            ELSE 'Valid dates'
        END AS date_validation
    FROM
        temp_date
),
holiday_array AS (
    -- Collapses the calendar table into a single row holding one ARRAY of
    -- (cal_date, holiday_flag) structs.
    SELECT
        ARRAY_AGG(STRUCT(DATE(cal_date) AS cal_date, holiday_flag)) AS holidays
    FROM
        `dataset.staff_time`
),
working_days AS (
    -- Counts working days from the start date up to the day before the end date.
    SELECT
        CASE
            WHEN DATE(ip_start_date) <> DATE(ip_end_date) THEN SUM(
                CASE
                    WHEN cal_date NOT IN (
                        SELECT
                            cal_date
                        FROM
                            UNNEST(
                                (
                                    SELECT
                                        holidays
                                    FROM
                                        holiday_array
                                )
                            )
                        WHERE
                            holiday_flag = 'Y'
                    )
                    AND EXTRACT(DAYOFWEEK FROM cal_date) NOT IN (1, 7) THEN 1
                    ELSE 0
                END
            ) - 1 --as we are now working in arrays, this is zero based counting
            ELSE NULL
        END AS working_day,
        ip_start_date,
        ip_end_date
    FROM
        UNNEST(
            GENERATE_DATE_ARRAY(
                SAFE_CAST(ip_start_date AS DATE),
                DATE_SUB(SAFE_CAST(ip_end_date AS DATE), INTERVAL 1 DAY),
                INTERVAL 1 DAY
            )
        ) AS cal_date
    WHERE
        cal_date NOT IN (
            SELECT
                cal_date
            FROM
                UNNEST(
                    (
                        SELECT
                            holidays
                        FROM
                            holiday_array
                    )
                )
            WHERE
                holiday_flag = 'Y'
        )
        AND EXTRACT(DAYOFWEEK FROM cal_date) NOT IN (1, 7)
),
start_date AS (
    -- ip_start_date when it falls on a working day, otherwise NULL.
    SELECT
        CASE
            WHEN DATE(ip_start_date) NOT IN (
                SELECT
                    cal_date
                FROM
                    UNNEST(
                        (
                            SELECT
                                holidays
                            FROM
                                holiday_array
                        )
                    )
                WHERE
                    holiday_flag = 'Y'
            )
            AND EXTRACT(DAYOFWEEK FROM DATE(ip_start_date)) NOT IN (1, 7) THEN ip_start_date
            ELSE NULL
        END AS ip_start_date_1
),
end_date AS (
    -- ip_end_date when it falls on a working day, otherwise NULL.
    SELECT
        CASE
            WHEN DATE(ip_end_date) NOT IN (
                SELECT
                    cal_date
                FROM
                    UNNEST(
                        (
                            SELECT
                                holidays
                            FROM
                                holiday_array
                        )
                    )
                WHERE
                    holiday_flag = 'Y'
            )
            AND EXTRACT(DAYOFWEEK FROM DATE(ip_end_date)) NOT IN (1, 7) THEN ip_end_date
            ELSE NULL
        END AS ip_end_date_1
),
start_working_hours AS (
    -- Hours left on the start day until 17:00; 0 when the start day is not a working day.
    -- Only applies when start and end are on different dates.
    SELECT
        CASE
            WHEN DATE(ip_start_date) <> DATE(ip_end_date) THEN CASE
                WHEN ip_start_date_1 IS NOT NULL THEN 17 - ROUND(
                    EXTRACT(HOUR FROM ip_start_date_1)
                    + (EXTRACT(MINUTE FROM ip_start_date_1) / 60),
                    2
                )
                ELSE 0
            END
        END AS start_working_hour
    FROM
        start_date
),
end_working_hours AS (
    -- Hours elapsed on the end day since 08:00; 0 when the end day is not a working day.
    -- Only applies when start and end are on different dates.
    SELECT
        CASE
            WHEN DATE(ip_start_date) <> DATE(ip_end_date) THEN CASE
                WHEN ip_end_date_1 IS NOT NULL THEN ROUND(
                    EXTRACT(HOUR FROM ip_end_date_1)
                    + EXTRACT(MINUTE FROM ip_end_date_1) / 60 - 8,
                    2
                )
                ELSE 0
            END
        END AS end_working_hour
    FROM
        end_date
),
when_delay_on_same_day1 AS (
    -- Elapsed hours when both timestamps fall on the same working day; NULL otherwise.
    SELECT
        CASE
            WHEN DATE(ip_start_date) = DATE(ip_end_date) THEN CASE
                WHEN DATE(ip_start_date) NOT IN (
                    SELECT
                        cal_date
                    FROM
                        UNNEST(
                            (
                                SELECT
                                    holidays
                                FROM
                                    holiday_array
                            )
                        )
                    WHERE
                        holiday_flag = 'Y'
                )
                AND EXTRACT(DAYOFWEEK FROM DATE(ip_start_date)) NOT IN (1, 7) THEN ROUND(
                    -- 3600 seconds per hour.
                    ABS(TIMESTAMP_DIFF(ip_end_date, ip_start_date, SECOND)) / 3600,
                    3
                )
                ELSE NULL
            END
            ELSE NULL
        END AS when_delay_on_same_day
)
-- Every CTE joined here yields exactly one row, so the cross joins produce one row.
SELECT
    CAST(
        IFNULL(start_working_hour, 0) + IFNULL(end_working_hour, 0) + IFNULL(working_day, 0) * 9 + IFNULL(when_delay_on_same_day, 0) AS NUMERIC
    ) AS final_working_hour
FROM
    start_working_hours
    CROSS JOIN end_working_hours
    CROSS JOIN working_days
    CROSS JOIN when_delay_on_same_day1
    CROSS JOIN start_date
    CROSS JOIN end_date
)
);

我也尝试过在 working_days CTE 中改用连接(JOIN),但仍然没有成功。

-- Join-based variant of the working_days CTE: counts the working days from the start date
-- up to the day before the end date.
--
-- holiday_array is a single row whose only column is the ARRAY `holidays`, so it cannot be
-- joined on a cal_date column directly. The array is unnested first and restricted to
-- holiday_flag = 'Y'; without that filter every calendar row would be treated as a holiday.
working_days AS (
    SELECT
        CASE
            WHEN DATE(ip_start_date) <> DATE(ip_end_date) THEN SUM(
                CASE
                    WHEN ot.holiday_date IS NULL
                    AND EXTRACT(DAYOFWEEK FROM cal_date) NOT IN (1, 7) THEN 1
                    ELSE 0
                END
            ) - 1 -- Adjusting for zero-based counting
            ELSE NULL
        END AS working_day,
        ip_start_date,
        ip_end_date
    FROM
        UNNEST(GENERATE_DATE_ARRAY(SAFE_CAST(ip_start_date AS DATE), DATE_SUB(SAFE_CAST(ip_end_date AS DATE), INTERVAL 1 DAY), INTERVAL 1 DAY)) AS cal_date
    LEFT JOIN (
        SELECT
            h.cal_date AS holiday_date
        FROM
            holiday_array
        CROSS JOIN UNNEST(holiday_array.holidays) AS h
        WHERE
            h.holiday_flag = 'Y'
    ) AS ot
    ON cal_date = ot.holiday_date -- Both are DATE values
    -- Anti-join: keep only days with no matching holiday, and drop weekends.
    WHERE ot.holiday_date IS NULL
    AND EXTRACT(DAYOFWEEK FROM cal_date) NOT IN (1, 7)
),

非常感谢您的帮助

google-bigquery
1个回答
0
投票

在此 SQL 示例中,只有假期日期会与该表进行连接。即使是 100 万行,也可以在 3 分钟内完成。如果开始日期和结束日期的顺序颠倒,请在一个额外的 CTE 中修正。

CTE `holiday_tbl` 是一个只包含非周末假期日期的表。CTE `tbl1` 统计时间范围内的假期天数。CTE `tbl2` 将落在周末的开始日期移到下一个周一,将落在周末的结束日期移到上一个周五下午。`tbl3` 计算所选时间范围内的 `working_days`(周一至周五,不考虑节假日)。`start_minutes` 和 `ending_minutes` 分别是给定时间范围的第一天和最后一天的工作分钟数,并做了限制,使其只取正值且最多为 9 小时。

每天的工作时间为 9 个小时。我们需要从工作日数中减去假期天数,并且再减去一天,因为开始日和结束日是单独计算的:`9*(working_days-1-holidays)`。如果第一天是假期,上式就会多减去一天——这是错误的,因为第一天本来就没有计入工作日数;为了修正这一点,我们加回 9 个小时。如果第一天不是假期,则加上该天的工作时间(`start_minutes/60`)。最后一天也做同样的处理。

关于 `working_days` 的说明:关键是统计时间范围内的周末个数。为此,我们用 `date_trunc(date(start_),ISOWEEK)` 找到开始日期所在周的周一,它与结束日期之间相差的周数就是周末的个数。由于周末不算工作日,需要从两个日期相差的天数中减去周末个数的两倍,才能得到该时间范围内的工作日数。

-- Returns a STRUCT whose first field, working_hours, is the number of working hours between
-- `start` and `ending`; the remaining fields expose the intermediate values for inspection.
--
-- Rules visible in the code below:
--   * A working day counts as 9 hours, measured between 08:00 and 17:00.
--   * Weekends are skipped by moving a weekend `start` forward to Monday 00:00 and a weekend
--     `ending` back to Friday 17:00.
--   * Rows of Test.staff_time with holiday_flag = 'Y' that fall on weekdays are subtracted.
--
-- Fix: a Sunday `start` used to be moved to Monday 17:00 while a Saturday `start` was moved to
-- Monday 00:00, so the same Monday counted as 0 hours in one case and 9 in the other. Both now
-- move to Monday 00:00, which start_minutes caps at a full 9-hour day.
--
-- NOTE(review): a range that lies entirely inside one weekend ends up with ending_ before
-- start_; that case is not handled here.
CREATE TEMP FUNCTION working_hours(start TIMESTAMP, ending TIMESTAMP) AS ((
  WITH
  -- Holiday dates that do not fall on a weekend.
  holiday_tbl AS (
    SELECT cal_date
    FROM Test.staff_time
    WHERE holiday_flag = 'Y'
      AND EXTRACT(DAYOFWEEK FROM cal_date) NOT IN (1, 7)
  ),
  -- Number of holidays inside the range, and whether either boundary day is itself a holiday.
  tbl1 AS (
    SELECT
      COUNT(cal_date) AS holidays,
      -- LOGICAL_OR over zero rows is NULL, hence the IFNULL.
      IFNULL(LOGICAL_OR(cal_date = DATE(start)), FALSE) AS start_on_holiday,
      IFNULL(LOGICAL_OR(cal_date = DATE(ending)), FALSE) AS ending_on_holiday
    FROM holiday_tbl
    WHERE DATE(cal_date) BETWEEN DATE(start) AND DATE(ending)
  ),
  -- Boundaries moved off the weekend.
  tbl2 AS (
    SELECT
      holidays,
      start_on_holiday,
      ending_on_holiday,
      CASE EXTRACT(DAYOFWEEK FROM start)
        WHEN 1 THEN TIMESTAMP(DATE(start) + 1) -- Sunday is off, plus one day is Monday
        WHEN 7 THEN TIMESTAMP(DATE(start) + 2) -- Saturday is off, plus two days is Monday
        ELSE start
      END AS start_,
      CASE EXTRACT(DAYOFWEEK FROM ending)
        WHEN 1 THEN TIMESTAMP_ADD(TIMESTAMP(DATE(ending) - 2), INTERVAL 17 HOUR) -- Sunday is off, minus two days is Friday
        WHEN 7 THEN TIMESTAMP_ADD(TIMESTAMP(DATE(ending) - 1), INTERVAL 17 HOUR) -- Saturday is off, minus one day is Friday
        ELSE ending
      END AS ending_
    FROM tbl1
  ),
  tbl3 AS (
    SELECT
      holidays,
      start_on_holiday,
      ending_on_holiday,
      start_,
      ending_,
      -- Calendar days between the boundaries minus two days for every weekend in between;
      -- weekends are counted as week boundaries from the Monday of the start week.
      DATE_DIFF(DATE(ending_), DATE(start_), DAY)
        - 2 * DATE_DIFF(DATE(ending_), DATE_TRUNC(DATE(start_), ISOWEEK), WEEK) AS working_days,
      -- Minutes from start_ until 17:00, clamped to 0..540 (9 hours).
      LEAST(GREATEST(TIME_DIFF(TIME(17, 0, 0), TIME(start_), MINUTE), 0), 9 * 60) AS start_minutes,
      -- Minutes from 08:00 until ending_, clamped to 0..540 (9 hours).
      LEAST(GREATEST(TIME_DIFF(TIME(ending_), TIME(8, 0, 0), MINUTE), 0), 9 * 60) AS ending_minutes
    FROM tbl2
  )
  SELECT
    STRUCT(
      -- Full days strictly between the boundary days, less holidays. A boundary day that is a
      -- holiday was already subtracted via `holidays`, so 9 hours are added back for it
      -- instead of its partial-day minutes.
      9 * (working_days - 1 - holidays)
        + IF(start_on_holiday, 9, start_minutes / 60)
        + IF(ending_on_holiday, 9, ending_minutes / 60) AS working_hours,
      holidays,
      start_on_holiday,
      ending_on_holiday,
      start_,
      ending_,
      working_days,
      start_minutes,
      ending_minutes
    )
  FROM tbl3
));

-- Driver query: runs working_hours over four hand-picked ranges.
WITH test AS (
  -- Thursday to Saturday, two multi-week ranges, and a two-hour same-day range.
  SELECT
    TIMESTAMP("2023-01-05 12:55:00 UTC") AS start,
    TIMESTAMP("2023-01-07 14:55:00 UTC") AS ending
  UNION ALL
  SELECT
    TIMESTAMP("2023-01-05 12:55:00 UTC") AS start,
    TIMESTAMP("2023-02-07 14:55:00 UTC")
  UNION ALL
  SELECT
    TIMESTAMP("2023-01-01 12:55:00 UTC") AS start,
    TIMESTAMP("2023-02-07 14:55:00 UTC")
  UNION ALL
  SELECT
    TIMESTAMP("2023-01-04 14:00:00 UTC"),
    TIMESTAMP("2023-01-04 16:00:00 UTC")
),
-- One million random start/ending pairs relative to the current time.
-- Not referenced by the final SELECT below.
test2 AS (
  SELECT
    id,
    TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL CAST((RAND() * 100 - 5) * 3600 * 24 AS INT64) SECOND) AS start,
    TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL CAST((RAND() * 100 - 10) * 3600 * 24 AS INT64) SECOND) AS ending
  FROM UNNEST(GENERATE_ARRAY(1, 1000 * 1000)) AS id
)
SELECT
  *,
  working_hours(start, ending)
FROM test
© www.soinside.com 2019 - 2024. All rights reserved.