我有一张表,其中的行包含各种信息,并按 GroupingColumn 分组,如下例所示:
--------------------------------------------------------------------
| GroupingColumn | Column1 | Column2 | Column3 | Column4 | Column5 |
|----------------|---------|---------|---------|---------|---------|
| g | a | b | c | d | e |
| g | j | k | l | m | n |
| g | a | b | c | d | NULL |
| g | a | NULL | NULL | NULL | NULL |
| g | NULL | NULL | c | d | e |
| g | x | y | NULL | NULL | NULL |
| g | x | z | w | NULL | NULL |
| g | J | NULL | NULL | NULL | NULL |
| q | a | b | NULL | NULL | NULL |
| q | x | z | v | NULL | NULL |
| q | x | z | NULL | NULL | NULL |
| q | NULL | NULL | e | o | p |
--------------------------------------------------------------------
我想从上表中只选出最具描述性的行:也就是针对每个 GroupingColumn 值分别进行过滤,
排除那些信息较少、已被更完整的行所涵盖的行。期望得到以下结果:
--------------------------------------------------------------------
| GroupingColumn | Column1 | Column2 | Column3 | Column4 | Column5 |
|----------------|---------|---------|---------|---------|---------|
| g | a | b | c | d | e |
| g | j | k | l | m | n |
| g | x | y | NULL | NULL | NULL |
| g | x | z | w | NULL | NULL |
| q | a | b | NULL | NULL | NULL |
| q | x | z | v | NULL | NULL |
| q | NULL | NULL | e | o | p |
--------------------------------------------------------------------
将排除以下行:
g a b c d NULL
g a NULL NULL NULL NULL
g NULL NULL c d e
g J NULL NULL NULL NULL
q x z NULL NULL NULL
因为存在包含相同值且更具描述性的行。
值得注意的是,各行所含的信息量可能不同:即使 Column1 没有值,其他列也可能有值(例如上面的 NULL NULL c d e)。
到目前为止,我已经尝试(并成功)只对两列执行此操作,如下面的查询:
-- Two-column version of the "keep only the most descriptive rows" filter.
-- Within each GroupingColumn value, a row is reported as (Column1, Column2)
-- when it carries a Column2 value, and as (Column1, NULL) only when no row of
-- the same group pairs that Column1 with a non-NULL Column2.
WITH DetailedRows (GroupingColumn, Column1) AS
(
    -- Column1 values that appear at least once together with a Column2 value.
    -- NULL Column1 values are filtered out: a single NULL in this list would
    -- prevent the NOT IN test below from ever being TRUE, which would silently
    -- drop every undetailed row of the affected group.
    SELECT GroupingColumn, Column1
    FROM TheTable
    WHERE Column2 IS NOT NULL
      AND Column1 IS NOT NULL
),
FilteredRows (GroupingColumn, Column1, Column2) AS
(
    -- Detailed rows are kept together with their Column2 value.
    SELECT t.GroupingColumn, t.Column1, t.Column2
    FROM TheTable AS t
    WHERE t.Column1 IN (SELECT d.Column1
                        FROM DetailedRows AS d
                        WHERE d.GroupingColumn = t.GroupingColumn)
      AND t.Column2 IS NOT NULL
    -- UNION rather than UNION ALL: it also collapses duplicate rows.
    UNION
    -- Rows whose Column1 never occurs with a Column2 value inside the group.
    SELECT t.GroupingColumn, t.Column1, NULL
    FROM TheTable AS t
    WHERE t.Column1 NOT IN (SELECT d.Column1
                            FROM DetailedRows AS d
                            WHERE d.GroupingColumn = t.GroupingColumn)
)
SELECT GroupingColumn, Column1, Column2
FROM FilteredRows
ORDER BY GroupingColumn, Column1, Column2;
但我觉得这可以用更高效的方式完成,尤其是在需要同时考虑全部 5 列的时候。
非常欢迎任何能实现这一目标的思路,先行致谢!
好问题 - 我不得不考虑一下这个问题。
如果您对数据进行排序,例如: Column1 desc, Column2 desc, ...
然后超集将与其相应的子集相邻(反之亦然)。例如,如果我们对您的样本数据进行排序
--------------------------------------------------------------------
| GroupingColumn | Column1 | Column2 | Column3 | Column4 | Column5 |
|----------------|---------|---------|---------|---------|---------|
| g | x | z | w | NULL | NULL |
| g | x | y | NULL | NULL | NULL |
| g | j | k | l | m | n |
| g | j | NULL | NULL | NULL | NULL | <--
| g | a | b | c | d | e |
| g | a | b | c | d | NULL | <--
| q | x | z | v | NULL | NULL |
| q | a | b | NULL | NULL | NULL |
--------------------------------------------------------------------
您可以看到要排除的两行(作为更具描述性的行的子集)位于这些超集行的正下方。
因此,这个问题可以这样解决:
- 将排序后的结果与自身做左连接(按 rownum - 1 关联),并在连接条件中处理 nulls;
- 用 where 子句排除所有成功连接的行,因为连接成功就表示该行是真正的子集。
说了这么多——用 CTE 构建行号,并在自联接中使用它。查询如下:
-- Keep, for every GroupingColumn value, only the rows that are not covered by
-- a more descriptive row. Row t is covered by row super when every non-NULL
-- column of t holds the same value in super.
with sorted as
(
    -- Number the rows of each group so that a covering row always receives a
    -- smaller rnum than the rows it covers, and so that exact duplicates get
    -- distinct numbers (only the first copy survives).
    -- NOTE(review): this relies on NULLs sorting after non-NULL values under
    -- DESC, as SQL Server does; confirm before running on another engine.
    select GroupingColumn, Column1, Column2, Column3, Column4, Column5,
           row_number() over (partition by GroupingColumn
                              order by Column1 desc, Column2 desc, Column3 desc,
                                       Column4 desc, Column5 desc) as rnum
    from TheTable
)
select t.GroupingColumn, t.Column1, t.Column2, t.Column3, t.Column4, t.Column5
from sorted as t
where not exists
(
    select 1
    from sorted as super
    where super.GroupingColumn = t.GroupingColumn
      and super.rnum < t.rnum
      -- Explicit NULL tests instead of a coalesce(..., '') sentinel, so that a
      -- blank value in t is never mistaken for "no value" in super.
      and (t.Column1 is null or t.Column1 = super.Column1)
      and (t.Column2 is null or t.Column2 = super.Column2)
      and (t.Column3 is null or t.Column3 = super.Column3)
      and (t.Column4 is null or t.Column4 = super.Column4)
      and (t.Column5 is null or t.Column5 = super.Column5)
);
用于测试的示例数据
-- Sample fixture: one grouping column plus five nullable detail columns.
create table TheTable
(
    GroupingColumn char,
    Column1        char,
    Column2        char,
    Column3        char,
    Column4        char,
    Column5        char
);

-- Eight sample rows: six in group 'g', two in group 'q'.
insert into TheTable (GroupingColumn, Column1, Column2, Column3, Column4, Column5)
values
    ('g', 'a', 'b',  'c',  'd',  'e'),
    ('g', 'j', 'k',  'l',  'm',  'n'),
    ('g', 'a', 'b',  'c',  'd',  null),
    ('g', 'x', 'y',  null, null, null),
    ('g', 'x', 'z',  'w',  null, null),
    ('g', 'j', null, null, null, null),
    ('q', 'a', 'b',  null, null, null),
    ('q', 'x', 'z',  'v',  null, null);
有点罗嗦,但我认为它有效
-- Table-variable fixture: pk is an identity surrogate key, id is the grouping
-- value and col1..col5 are the nullable detail columns.
declare @t table
(
    pk   int identity primary key,
    id   char(1),
    col1 char(1),
    col2 char(1),
    col3 char(1),
    col4 char(1),
    col5 char(1)
);

insert into @t (id, col1, col2, col3, col4, col5)
values
    ('g', 'a', 'b',  'c',  'd',  'e'),
    ('g', 'j', 'k',  'l',  'm',  'n'),
    ('g', 'a', 'b',  'c',  'd',  null),
    ('g', 'x', 'y',  null, null, null),
    ('g', 'x', 'z',  'w',  null, null),
    ('g', 'J', null, null, null, null),
    ('q', 'a', 'b',  null, null, null),
    ('q', 'x', 'z',  'v',  null, null);
-- Return the most descriptive rows of @t for each id.
-- A row is dropped when another row of the same id repeats all of its non-NULL
-- values and either holds more values, or is an identical copy with a lower pk.
-- Unlike a per-count UNION ALL, this single test
--   * does not assume the non-NULL values form a left-hand prefix, so a row
--     such as (NULL, NULL, c, d, e) is matched against (a, b, c, d, e);
--   * keeps exactly one copy of duplicated rows, where two identical
--     five-value rows would otherwise eliminate each other.
with cte as
(
    -- cnt = number of non-NULL detail columns in the row.
    select t.pk, t.id, t.col1, t.col2, t.col3, t.col4, t.col5,
           case when t.col1 is not null then 1 else 0 end
         + case when t.col2 is not null then 1 else 0 end
         + case when t.col3 is not null then 1 else 0 end
         + case when t.col4 is not null then 1 else 0 end
         + case when t.col5 is not null then 1 else 0 end as cnt
    from @t as t
)
select cur.pk, cur.id, cur.col1, cur.col2, cur.col3, cur.col4, cur.col5, cur.cnt
from cte as cur
where not exists
(
    select 1
    from cte as other
    where other.id = cur.id
      and other.pk <> cur.pk
      -- every value present in cur must be repeated by other
      and (cur.col1 is null or cur.col1 = other.col1)
      and (cur.col2 is null or cur.col2 = other.col2)
      and (cur.col3 is null or cur.col3 = other.col3)
      and (cur.col4 is null or cur.col4 = other.col4)
      and (cur.col5 is null or cur.col5 = other.col5)
      -- other is strictly richer, or an identical row that wins the tie
      and (other.cnt > cur.cnt
           or (other.cnt = cur.cnt and other.pk < cur.pk))
)
order by cur.pk;