在 SQL 中逻辑回填行

问题描述 投票:0回答:1

我在 SQL 中有这个表:

-- One row per person per observed year.
-- (name, year) identifies a row; the games count is the only nullable column.
CREATE TABLE myt (
    name                              VARCHAR(50) NOT NULL,
    year                              INTEGER     NOT NULL,
    gender                            CHAR(1)     NOT NULL,
    country_of_birth                  VARCHAR(50) NOT NULL,
    age                               INTEGER     NOT NULL,
    number_of_football_games_attended INTEGER,
    PRIMARY KEY (name, year),
    -- Only the two single-letter codes 'M' and 'F' are accepted.
    CHECK (gender IN ('M', 'F')),
    -- Age must be strictly positive.
    CHECK (age > 0),
    -- A NULL games count passes this check; negative counts are rejected.
    CHECK (number_of_football_games_attended >= 0)
);

-- Sample data: three people, each observed in three non-consecutive years.
-- The VALUES list ends with ';' only; a closing parenthesis after the last
-- row would be a syntax error because no parenthesis is open at that point.
INSERT INTO myt (
    name,
    year,
    gender,
    country_of_birth,
    age,
    number_of_football_games_attended
)
VALUES
    ('John', 2010, 'M', 'USA', 25, 3),
    ('John', 2015, 'M', 'USA', 30, 8),
    ('John', 2020, 'M', 'USA', 35, 12),
    ('Maria', 2012, 'F', 'Brazil', 22, 2),
    ('Maria', 2014, 'F', 'Brazil', 24, 5),
    ('Maria', 2019, 'F', 'Brazil', 29, 15),
    ('Sofia', 2018, 'F', 'Spain', 31, 7),
    ('Sofia', 2019, 'F', 'Spain', 32, 9),
    ('Sofia', 2021, 'F', 'Spain', 34, 13);

  name year gender country_of_birth age number_of_football_games_attended
  John 2010      M              USA  25                                 3
  John 2015      M              USA  30                                 8
  John 2020      M              USA  35                                12
 Maria 2012      F           Brazil  22                                 2
 Maria 2014      F           Brazil  24                                 5
 Maria 2019      F           Brazil  29                                15
 Sofia 2018      F            Spain  31                                 7
 Sofia 2019      F            Spain  32                                 9
 Sofia 2021      F            Spain  34                                13

我的目标是在每个人的最短年份和最长年份之间添加缺失的数据行,并用逻辑值回填列(例如,性别始终保持不变,出生国家始终保持不变,年龄增加 1,参加的足球比赛不适用,因为我们无法从逻辑上推断这一点)

name year gender country_of_birth age number_of_football_games_attended data_source
  John 2010      M            USA  25                               3    IN ORIGINAL DATA
  John 2011      M            USA  26                          MISSING    BACKFILLED
  John 2012      M            USA  27                          MISSING    BACKFILLED
  John 2013      M            USA  28                          MISSING    BACKFILLED
  John 2014      M            USA  29                          MISSING    BACKFILLED
  John 2015      M            USA  30                               8    IN ORIGINAL DATA
  John 2016      M            USA  31                          MISSING    BACKFILLED
  John 2017      M            USA  32                          MISSING    BACKFILLED
  John 2018      M            USA  33                          MISSING    BACKFILLED
 John 2019      M            USA  34                          MISSING    BACKFILLED
 John 2020      M            USA  35                              12    IN ORIGINAL DATA

 Maria 2012     F         Brazil  22                               2    IN ORIGINAL DATA
 Maria 2013     F         Brazil  23                          MISSING    BACKFILLED
 Maria 2014     F         Brazil  24                               5    IN ORIGINAL DATA
 Maria 2015     F         Brazil  25                          MISSING    BACKFILLED
 Maria 2016     F         Brazil  26                          MISSING    BACKFILLED
 Maria 2017     F         Brazil  27                          MISSING    BACKFILLED
 Maria 2018     F         Brazil  28                          MISSING    BACKFILLED
 Maria 2019     F         Brazil  29                              15    IN ORIGINAL DATA

 Sofia 2018     F          Spain  31                               7    IN ORIGINAL DATA
 Sofia 2019     F          Spain  32                               9    IN ORIGINAL DATA
 Sofia 2020     F          Spain  33                          MISSING    BACKFILLED
 Sofia 2021     F          Spain  34                              13    IN ORIGINAL DATA

我尝试使用间隙和孤岛方法为此编写 SQL 代码(我在 1=1 上使用联接而不是交叉联接,以便这可以在不支持交叉联接的服务器上工作):

-- Expand each person's history to one row per year between their first and
-- last recorded year. Gender and country are copied from the earliest row,
-- age is extrapolated by one per year, and the games count is shown as
-- 'MISSING' wherever no value is available.
WITH calendar AS (
    -- Hard-coded list of candidate years. It must cover every year between
    -- the smallest and largest year stored in myt, otherwise rows are
    -- silently left out of the result.
    SELECT 2010 AS year UNION ALL
    SELECT 2011 UNION ALL
    SELECT 2012 UNION ALL
    SELECT 2013 UNION ALL
    SELECT 2014 UNION ALL
    SELECT 2015 UNION ALL
    SELECT 2016 UNION ALL
    SELECT 2017 UNION ALL
    SELECT 2018 UNION ALL
    SELECT 2019 UNION ALL
    SELECT 2020 UNION ALL
    SELECT 2021
),
person_span AS (
    -- First and last observed year per person (plain aggregation only, so
    -- no ungrouped column is referenced next to GROUP BY).
    SELECT
        name,
        MIN(year) AS min_year,
        MAX(year) AS max_year
    FROM myt
    GROUP BY name
),
person_info AS (
    -- Attach the attributes of the earliest row. (name, year) is the primary
    -- key of myt, so exactly one row matches per person.
    SELECT
        s.name,
        f.gender,
        f.country_of_birth,
        s.min_year,
        s.max_year,
        f.age AS start_age
    FROM person_span s
    INNER JOIN myt f
        ON f.name = s.name
        AND f.year = s.min_year
),
filled_gaps AS (
    SELECT
        p.name,
        c.year,
        p.gender,
        p.country_of_birth,
        -- Use the recorded age when the year exists, otherwise extrapolate
        -- from the age in the person's first year.
        COALESCE(m.age, p.start_age + (c.year - p.min_year)) AS age,
        -- The explicit VARCHAR length keeps the cast valid on engines that
        -- require one. A NULL count (no source row, or a source row without
        -- a count) is rendered as 'MISSING'.
        COALESCE(
            CAST(m.number_of_football_games_attended AS VARCHAR(20)),
            'MISSING'
        ) AS number_of_football_games_attended,
        CASE
            WHEN m.name IS NOT NULL THEN 'IN ORIGINAL DATA'
            ELSE 'BACKFILLED'
        END AS data_source
    FROM person_info p
    -- Range join instead of CROSS JOIN: pairs each person only with the
    -- calendar years inside their own [min_year, max_year] span.
    INNER JOIN calendar c
        ON c.year BETWEEN p.min_year AND p.max_year
    LEFT JOIN myt m
        ON m.name = p.name
        AND m.year = c.year
)
SELECT
    name,
    year,
    gender,
    country_of_birth,
    age,
    number_of_football_games_attended,
    data_source
FROM filled_gaps
ORDER BY name, year;

代码似乎可以运行,但我得到的是 NA 而不是“MISSING”:

 name year gender country_of_birth age number_of_football_games_attended      data_source
  John 2010      M              USA  25                                 3 IN ORIGINAL DATA
  John 2011      M              USA  26                                NA       BACKFILLED
  John 2012      M              USA  27                                NA       BACKFILLED
  John 2013      M              USA  28                                NA       BACKFILLED
  John 2014      M              USA  29                                NA       BACKFILLED
  John 2015      M              USA  30                                 8 IN ORIGINAL DATA
  John 2016      M              USA  31                                NA       BACKFILLED
  John 2017      M              USA  32                                NA       BACKFILLED
  John 2018      M              USA  33                                NA       BACKFILLED
  John 2019      M              USA  34                                NA       BACKFILLED
  John 2020      M              USA  35                                12 IN ORIGINAL DATA
 Maria 2012      F           Brazil  22                                 2 IN ORIGINAL DATA
 Maria 2013      F           Brazil  23                                NA       BACKFILLED
 Maria 2014      F           Brazil  24                                 5 IN ORIGINAL DATA
 Maria 2015      F           Brazil  25                                NA       BACKFILLED
 Maria 2016      F           Brazil  26                                NA       BACKFILLED
 Maria 2017      F           Brazil  27                                NA       BACKFILLED
 Maria 2018      F           Brazil  28                                NA       BACKFILLED
 Maria 2019      F           Brazil  29                                15 IN ORIGINAL DATA
 Sofia 2018      F            Spain  31                                 7 IN ORIGINAL DATA
 Sofia 2019      F            Spain  32                                 9 IN ORIGINAL DATA
 Sofia 2020      F            Spain  33                                NA       BACKFILLED
 Sofia 2021      F            Spain  34                                13 IN ORIGINAL DATA

有办法解决这个问题吗?

sql db2
1个回答
0
投票

COALESCE 可能是导致问题的原因,请尝试改用 CASE 表达式(您也可以在其中将值设置为 null)。请把代码中的这一段:

-- Select-list expression from the question's query: casts the games count
-- to text and falls back to 'MISSING' when the cast value is NULL.
COALESCE(
    CAST(m.number_of_football_games_attended AS VARCHAR),
    'MISSING'
) as number_of_football_games_attended,

替换为:

-- Select-list expression: show the games count as text, or 'MISSING' when it
-- is NULL. The explicit VARCHAR length keeps the cast valid on engines that
-- require a length and makes both CASE branches the same string type.
CASE
    WHEN m.number_of_football_games_attended IS NOT NULL
        THEN CAST(m.number_of_football_games_attended AS VARCHAR(20))
    ELSE 'MISSING'
END AS number_of_football_games_attended,
© www.soinside.com 2019 - 2024. All rights reserved.