Snowflake:基于正则表达式将一列拆分为多行多列

问题描述 投票:0回答:1

我正在使用 Snowflake 查询,根据表中某一列的输入字符串将数据拆分为多个列,但没有得到预期的输出,需要一些帮助:

假设输入是:

Activity type [DP - mcr modifyand quac endo; bio fert]; PharmSon BT acticity code [AYx765]

输出应该是:

Activity type in column1.
DP - mcr modifyand quac endo; bio fert in column2

PharmSon BT acticity code in column1
AYx765 in column2

相反,我得到的输出为:

Activity type  in column1-- column2 is null
bio fert] in column1--column2 is null
PharmSon BT acticity code in column1--AYx765 in column2
-- Splits FILATT into 'label [value]' fragments and returns one row per fragment,
-- with the label in column2 and the bracketed value in column3.
-- NOTE(review): this is the query as posted; the comments below mark where the
-- unexpected output comes from.
WITH parsed_data AS (
    -- Split FILATT on semicolons into an array (flattening happens in the next CTE)
    SELECT
        INTRNL AS EQ_INTRNL,
        'DEV' AS ENVIRONMENT,
        FILATT AS raw_data, 
        -- NOTE(review): SPLIT cuts on EVERY ';', including one that sits inside
        -- the brackets. 'Activity type [DP - mcr modifyand quac endo; bio fert]'
        -- therefore becomes two fragments: 'Activity type [DP - mcr modifyand quac endo'
        -- and ' bio fert]'.
        SPLIT(FILATT, ';') AS data_array -- Split the data by ';'
    FROM table1
),

exploded_data AS (
    -- Flatten the array so each item is in a separate row
    SELECT
        EQ_INTRNL,
        ENVIRONMENT,
        raw_data,
        TRIM(value) AS part, -- Each array element, with surrounding whitespace removed
        -- NOTE(review): the ORDER BY key is not derived from the fragment's position
        -- in the array, so the numbering is presumably arbitrary within each
        -- EQ_INTRNL. FLATTEN also exposes an INDEX column that could be used
        -- instead -- TODO confirm.
        ROW_NUMBER() OVER (PARTITION BY EQ_INTRNL ORDER BY CURRENT_TIMESTAMP()) AS column1 -- Assign a sequential number, reset for each EQ_INTRNL
    FROM parsed_data,
    LATERAL FLATTEN(input => data_array)
),

extracted_columns AS (
    SELECT
        EQ_INTRNL,
        ENVIRONMENT,
        column1, -- The sequential number for the row
        -- Extract the identifier part (before '[').
        -- A fragment with no '[' at all (e.g. 'bio fert]') matches in full.
        REGEXP_SUBSTR(part, '^[^\\[]+') AS column2,
        -- Extract the content inside brackets, excluding the brackets themselves
        -- ('e' returns the capture group). Needs both '[' and ']' in the same
        -- fragment; a fragment that lost one of them to the split yields NULL.
        REGEXP_SUBSTR(part, '\\[([^\\]]+)\\]', 1, 1, 'e') AS column3
    FROM exploded_data
)

SELECT
    EQ_INTRNL,
    column2,
    column3,
    ENVIRONMENT,
    column1
FROM extracted_columns 
-- Keeps rows with a non-empty bracket value. A NULL column3 does not satisfy
-- this comparison either, so those rows are filtered out as well.
WHERE column3 !=''
ORDER BY EQ_INTRNL, column1;
snowflake-cloud-data-platform
1个回答
0
投票

这应该可以用 UDTF(用户自定义表函数)来实现——可以用 JavaScript 或 Python 编写。

这个方案足够灵活,可以处理一行或多行数据。



-- Python table function (UDTF): takes one VARCHAR and returns rows of
-- (data_part, col1, col2). The rows are produced by the handler class
-- 'SplitData' defined in the $$-quoted Python body that follows.
create or replace function dev._neeru.split_data(data varchar)
returns table (data_part varchar, col1 varchar, col2 varchar)
language python
RUNTIME_VERSION = 3.9
handler='SplitData'
as $$
import re

class SplitData:
    """UDTF handler that splits a 'label [value]; label [value]' string into rows.

    Each output row is a tuple ``(data_part, col1, col2)``:
      * data_part -- the raw fragment, up to and including its closing ']'
                     and an optional trailing ';'
      * col1      -- the text before the first '[' in that fragment (untrimmed)
      * col2      -- the text between that '[' and the next ']'

    A ';' inside the brackets is kept as part of the value, because fragments
    are delimited by the closing ']' rather than by ';'.
    """

    # Compiled once per class instead of once per fragment.
    # A fragment is the shortest run of characters ending in ']' plus an optional ';'.
    _FRAGMENT = re.compile(r'.*?\];?')
    # Within a fragment: lazily capture the label, then the bracketed value.
    _LABEL_VALUE = re.compile(r'(.*?)\[(.*?)\]')

    def process(self, data):
        """Yield one (data_part, col1, col2) tuple per bracketed pair in ``data``.

        ``data`` may be None (SQL NULL) or empty; in both cases no rows are
        produced instead of raising. Fragments that end in ']' but contain no
        '[' (for example 'oops];') are skipped rather than raising IndexError.
        """
        if data is None:
            return
        for data_part in self._FRAGMENT.findall(data):
            pair = self._LABEL_VALUE.search(data_part)
            if pair is None:
                # Stray ']' with no opening bracket: nothing to extract.
                continue
            yield (data_part, pair.group(1), pair.group(2))
$$;


-- Demo: feed a few inline sample strings through the split_data UDTF and show
-- the trimmed label and value for every bracketed pair it returns.
WITH sample_inputs AS (
    -- Inputs with one, two and three pairs; two of them carry a ';' inside the brackets.
    SELECT 'Activity type [DP - mcr modifyand quac endo; bio fert]; PharmSon BT acticity code [AYx765]' AS raw_data
    UNION ALL
    SELECT 'Name [John]; Age [15]' AS raw_data
    UNION ALL
    SELECT 'Name [Jack]' AS raw_data
    UNION ALL
    SELECT 'Name [Jack]; Activity type [DP - mcr modifyand quac endo; bio fert];  Age [15]' AS raw_data
)
SELECT
    src.raw_data,
    TRIM(col1) AS col1,
    TRIM(col2) AS col2
FROM
    sample_inputs AS src,
    -- The table function is evaluated once per input row (correlated on raw_data).
    TABLE(dev._neeru.split_data(src.raw_data));

+------------------------------------------------------------------------------------------+-------------------------+--------------------------------------+
|RAW_DATA                                                                                  |COL1                     |COL2                                  |
+------------------------------------------------------------------------------------------+-------------------------+--------------------------------------+
|Activity type [DP - mcr modifyand quac endo; bio fert]; PharmSon BT acticity code [AYx765]|Activity type            |DP - mcr modifyand quac endo; bio fert|
|Activity type [DP - mcr modifyand quac endo; bio fert]; PharmSon BT acticity code [AYx765]|PharmSon BT acticity code|AYx765                                |
|Name [John]; Age [15]                                                                     |Name                     |John                                  |
|Name [John]; Age [15]                                                                     |Age                      |15                                    |
|Name [Jack]                                                                               |Name                     |Jack                                  |
|Name [Jack]; Activity type [DP - mcr modifyand quac endo; bio fert];  Age [15]            |Name                     |Jack                                  |
|Name [Jack]; Activity type [DP - mcr modifyand quac endo; bio fert];  Age [15]            |Activity type            |DP - mcr modifyand quac endo; bio fert|
|Name [Jack]; Activity type [DP - mcr modifyand quac endo; bio fert];  Age [15]            |Age                      |15                                    |
+------------------------------------------------------------------------------------------+-------------------------+--------------------------------------+



© www.soinside.com 2019 - 2024. All rights reserved.