我使用的是 DuckDB,它大部分应该遵循 postgresql 的语法。
t_cons
(~200k)和t_flows
(~1.2M)。t_cons
,有许多 t_flows
与 asset, year, rep_period
t_cons
的所有这些,存储满足操作 > 0 的所有 t_flows.variable_index
。我设法通过两个查询获得结果,一个用于 INNER JOIN 计算操作和过滤,第二个用于聚合,但它太慢了。
我试图将它们加入到单个查询中并删除内部联接以防止扫描,但我对 SQL 不太熟悉。非常感谢任何帮助。
sql 的链接和数据示例:
https://gist.github.com/abelsiqueira/d8124d6b32db8e20297be4440405e093
CREATE
OR REPLACE TABLE t_flows AS
SELECT
*
FROM
't_flows.csv';
CREATE
OR REPLACE TABLE t_cons AS
SELECT
*
FROM
't_cons.csv';
CREATE
OR REPLACE TABLE t_incoming_nonzero AS
SELECT
t_flows.from_asset,
t_flows.to_asset,
t_flows.year,
t_flows.rep_period,
t_flows.variable_index,
t_cons.time_block_start,
greatest(
0,
1 + least(t_flows.time_block_end, t_cons.time_block_end) - greatest(t_flows.time_block_start, t_cons.time_block_start)
) AS duration
FROM
t_flows
INNER JOIN t_cons ON t_flows.to_asset = t_cons.asset
AND t_flows.year = t_cons.year
AND t_flows.rep_period = t_cons.rep_period
WHERE
duration > 0;
CREATE
OR REPLACE TABLE t_result AS
SELECT
t_cons.asset,
t_cons.year,
t_cons.rep_period,
t_cons.time_block_start,
COALESCE(indices, []) AS indices,
COALESCE(durations, []) AS durations,
FROM
t_cons
LEFT JOIN (
SELECT
to_asset AS asset,
YEAR,
rep_period,
time_block_start,
array_agg(variable_index) AS indices,
array_agg(duration) AS durations,
FROM
t_incoming_nonzero
GROUP BY
to_asset,
YEAR,
rep_period,
time_block_start
) AS t_tmp ON t_cons.asset = t_tmp.asset
AND t_cons.year = t_tmp.year
AND t_cons.rep_period = t_tmp.rep_period
AND t_cons.time_block_start = t_tmp.time_block_start;
SELECT
*
FROM
t_result;
“EXPLAIN ANALYZE”的输出(DuckDB 不支持解释的其他设置):
┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││ Query Profiling Information ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
EXPLAIN ( ANALYZE ) CREATE OR REPLACE TABLE t_incoming_nonzero AS SELECT t_flows.from_asset, t_flows.to_asset, t_flows.year, t_flows.rep_period, t_flows.variable_index, t_cons.time_block_start, greatest( 0, 1 + least(t_flows.time_block_end, t_cons.time_block_end) - greatest(t_flows.time_block_start, t_cons.time_block_start) ) AS duration FROM t_flows INNER JOIN t_cons ON t_flows.to_asset = t_cons.asset AND t_flows.year = t_cons.year AND t_flows.rep_period = t_cons.rep_period WHERE duration > 0;
┌────────────────────────────────────────────────┐
│┌──────────────────────────────────────────────┐│
││ Total Time: 20.11s ││
│└──────────────────────────────────────────────┘│
└────────────────────────────────────────────────┘
┌───────────────────────────┐
│ QUERY │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ EXPLAIN_ANALYZE │
│ ──────────────────── │
│ 0 Rows │
│ (0.00s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ CREATE_TABLE_AS │
│ ──────────────────── │
│ 1 Rows │
│ (0.13s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ PROJECTION │
│ ──────────────────── │
│ from_asset │
│ to_asset │
│ year │
│ rep_period │
│ variable_index │
│ time_block_start │
│ duration │
│ │
│ 1372400 Rows │
│ (0.02s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ PROJECTION │
│ ──────────────────── │
│ #0 │
│ #1 │
│ #2 │
│ #3 │
│ #4 │
│ #5 │
│ #6 │
│ #10 │
│ #11 │
│ │
│ 1372400 Rows │
│ (0.00s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ FILTER │
│ ──────────────────── │
│ (greatest(0, ((1 + least │
│ (time_block_end, │
│ time_block_end)) - │
│ greatest(time_block_start│
│ , time_block_start))) > 0)│
│ │
│ 1372400 Rows │
│ (78.74s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ HASH_JOIN │
│ ──────────────────── │
│ Join Type: INNER │
│ │
│ Conditions: │
│ rep_period = rep_period ├──────────────┐
│ year = year │ │
│ to_asset = asset │ │
│ │ │
│ 5184051200 Rows │ │
│ (49.01s) │ │
└─────────────┬─────────────┘ │
┌─────────────┴─────────────┐┌─────────────┴─────────────┐
│ TABLE_SCAN ││ TABLE_SCAN │
│ ──────────────────── ││ ──────────────────── │
│ t_flows ││ t_cons │
│ ││ │
│ Projections: ││ Projections: │
│ to_asset ││ asset │
│ year ││ year │
│ rep_period ││ rep_period │
│ time_block_end ││ time_block_end │
│ time_block_start ││ time_block_start │
│ from_asset ││ │
│ variable_index ││ │
│ ││ │
│ 1335900 Rows ││ 220460 Rows │
│ (0.04s) ││ (0.00s) │
└───────────────────────────┘└───────────────────────────┘
┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││ Query Profiling Information ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
EXPLAIN ( ANALYZE ) CREATE OR REPLACE TABLE t_result AS SELECT t_cons.asset, t_cons.year, t_cons.rep_period, t_cons.time_block_start, COALESCE(indices, []) AS indices, COALESCE(durations, []) AS durations, FROM t_cons LEFT JOIN ( SELECT to_asset AS asset, YEAR, rep_period, time_block_start, array_agg(variable_index) AS indices, array_agg(duration) AS durations, FROM t_incoming_nonzero GROUP BY to_asset, YEAR, rep_period, time_block_start ) AS t_tmp ON t_cons.asset = t_tmp.asset AND t_cons.year = t_tmp.year AND t_cons.rep_period = t_tmp.rep_period AND t_cons.time_block_start = t_tmp.time_block_start;
┌────────────────────────────────────────────────┐
│┌──────────────────────────────────────────────┐│
││ Total Time: 0.0560s ││
│└──────────────────────────────────────────────┘│
└────────────────────────────────────────────────┘
┌───────────────────────────┐
│ QUERY │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ EXPLAIN_ANALYZE │
│ ──────────────────── │
│ 0 Rows │
│ (0.00s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ CREATE_TABLE_AS │
│ ──────────────────── │
│ 1 Rows │
│ (0.01s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ PROJECTION │
│ ──────────────────── │
│ asset │
│ year │
│ rep_period │
│ time_block_start │
│ indices │
│ durations │
│ │
│ 220460 Rows │
│ (0.00s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ HASH_JOIN │
│ ──────────────────── │
│ Join Type: LEFT │
│ │
│ Conditions: │
│ asset = asset │
│ year = year ├──────────────┐
│ rep_period = rep_period │ │
│ time_block_start = │ │
│ time_block_start │ │
│ │ │
│ 220460 Rows │ │
│ (0.04s) │ │
└─────────────┬─────────────┘ │
┌─────────────┴─────────────┐┌─────────────┴─────────────┐
│ TABLE_SCAN ││ PROJECTION │
│ ──────────────────── ││ ──────────────────── │
│ t_cons ││__internal_decompress_strin│
│ ││ g(#0) │
│ Projections: ││__internal_decompress_integ│
│ asset ││ ral_bigint(#1, 2030) │
│ year ││__internal_decompress_integ│
│ rep_period ││ ral_bigint(#2, 1) │
│ time_block_start ││__internal_decompress_integ│
│ ││ ral_bigint(#3, 1) │
│ ││ #4 │
│ ││ #5 │
│ ││ │
│ 220460 Rows ││ 220460 Rows │
│ (0.00s) ││ (0.00s) │
└───────────────────────────┘└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ HASH_GROUP_BY │
│ ──────────────────── │
│ Groups: │
│ #0 │
│ #1 │
│ #2 │
│ #3 │
│ │
│ Aggregates: │
│ array_agg(#4) │
│ array_agg(#5) │
│ │
│ 220460 Rows │
│ (0.15s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ PROJECTION │
│ ──────────────────── │
│ to_asset │
│ year │
│ rep_period │
│ time_block_start │
│ variable_index │
│ duration │
│ │
│ 1372400 Rows │
│ (0.00s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ PROJECTION │
│ ──────────────────── │
│__internal_compress_string_│
│ hugeint(#0) │
│__internal_compress_integra│
│ l_utinyint(#1, 2030) │
│__internal_compress_integra│
│ l_utinyint(#2, 1) │
│__internal_compress_integra│
│ l_usmallint(#3, 1) │
│ #4 │
│ #5 │
│ │
│ 1372400 Rows │
│ (0.01s) │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│ TABLE_SCAN │
│ ──────────────────── │
│ t_incoming_nonzero │
│ │
│ Projections: │
│ to_asset │
│ year │
│ rep_period │
│ time_block_start │
│ variable_index │
│ duration │
│ │
│ 1372400 Rows │
│ (0.01s) │
└───────────────────────────┘
输入和预期输出示例:
CREATE
OR REPLACE TABLE t_flows (
from_asset text,
to_asset text,
YEAR int,
rep_period int,
time_block_start int,
time_block_end int,
variable_index int
);
--
INSERT INTO
t_flows
VALUES
('producer1', 'hub', 2030, 1, 1, 3, 1),
('producer1', 'hub', 2030, 1, 4, 6, 2),
('producer1', 'hub', 2030, 1, 7, 9, 3),
('producer1', 'hub', 2030, 1, 10, 12, 4),
('hub', 'demand', 2030, 1, 1, 4, 5),
('hub', 'demand', 2030, 1, 5, 8, 6),
('hub', 'demand', 2030, 1, 9, 12, 7);
--
CREATE
OR REPLACE TABLE t_cons (
asset text,
YEAR int,
rep_period int,
time_block_start int,
time_block_end int
);
--
INSERT INTO
t_cons
VALUES
('hub', 2030, 1, 1, 5),
('hub', 2030, 1, 6, 6),
('hub', 2030, 1, 7, 8),
('hub', 2030, 1, 9, 12);
┌─────────┬───────┬────────────┬──────────────────┬─────────┬───────────┐
│ asset │ YEAR │ rep_period │ time_block_start │ indices │ durations │
│ varchar │ int32 │ int32 │ int32 │ int32[] │ int32[] │
├─────────┼───────┼────────────┼──────────────────┼─────────┼───────────┤
│ hub │ 2030 │ 1 │ 1 │ [1, 2] │ [3, 2] │
│ hub │ 2030 │ 1 │ 6 │ [2] │ [1] │
│ hub │ 2030 │ 1 │ 7 │ [3] │ [2] │
│ hub │ 2030 │ 1 │ 9 │ [3, 4] │ [1, 3] │
└─────────┴───────┴────────────┴──────────────────┴─────────┴───────────┘
我建议考虑更紧凑的查询。
不创建临时表。
如果您需要没有数据的资产和期间,请尝试将 INNER JOIN 替换为 LEFT JOIN。
SELECT
asset,
year,
rep_period,
time_block_start,
array_agg(variable_index) AS indices,
array_agg(duration) AS durations
FROM(
SELECT
t_cons.asset,
t_flows.from_asset,
t_flows.to_asset,
t_flows.year,
t_flows.rep_period,
t_flows.variable_index,
t_cons.time_block_start,
greatest(
0,
1 + least(t_flows.time_block_end, t_cons.time_block_end) - greatest(t_flows.time_block_start, t_cons.time_block_start)
) AS duration
FROM t_flows
INNER JOIN t_cons ON t_flows.to_asset = t_cons.asset
AND t_flows.year = t_cons.year
AND t_flows.rep_period = t_cons.rep_period
)t
WHERE duration > 0
GROUP BY asset, YEAR, rep_period, time_block_start