# Raw input rows for the four tables, as given in the question.
# Every tuple has the shape: (emp_no, month, indc, year, key, value).
data_01 = [(787, 10, 1, 2024, '0202410IN D600000787', '7987979')]
数据_02 = [ (787, 10, 2, 2024, '00007870202410111439000000000', '7987979jk'), (787, 10, 2, 2024, '00007870202410011493500000000', '234098032') ]
数据_03 = [ (787, 10, 3, 2024, '0000787020241011143900000000001', 'kkhjbhjk'), (787, 10, 3, 2024, '0000787020241011143900000000002', 'kjk872'), (787, 10, 3, 2024, '0000787020241001149350000000001', 'kjk872') ]
数据_04 = [ (787, 10, 4, 2024, '000078702024101114390000000000101', 'sdfsdf8798'), (787, 10, 4, 2024, '000078702024101114390000000000201', 'fd4598sd'), (787, 10, 4, 2024, '000078702024100114935000000000101', '8932hjty'), (787, 10, 4, 2024, '000078702024100114935000000000102', '79793213jk'), (787, 10, 4, 2024, '000078702024100114935000000000103', '8479k') ]
# Column names shared by all four tables.
# Fix: the original used fullwidth quotation marks (“ ”), which are not valid
# Python string delimiters and raise a SyntaxError; replaced with ASCII quotes.
列 = ["emp_no", "月", "indc", "年", "键", "值"]
根据提供的数据和图像,您的目标是连接 PySpark 中的四个表以实现所需的结果。这是处理数据并获得预期输出的分步解决方案。
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create (or reuse) the Spark session for this job.
spark = SparkSession.builder.appName("JoinTables").getOrCreate()

# Raw rows for each of the four source tables; all share the same schema:
# (emp_no, month, indc, year, key, value).
data_01 = [(787, 10, 1, 2024, '0202410IN D600000787', '7987979')]
data_02 = [
    (787, 10, 2, 2024, '00007870202410111439000000000', '7987979jk'),
    (787, 10, 2, 2024, '00007870202410011493500000000', '234098032'),
]
data_03 = [
    (787, 10, 3, 2024, '0000787020241011143900000000001', 'kkhjbhjk'),
    (787, 10, 3, 2024, '0000787020241011143900000000002', 'kjk872'),
    (787, 10, 3, 2024, '0000787020241001149350000000001', 'kjk872'),
]
data_04 = [
    (787, 10, 4, 2024, '000078702024101114390000000000101', 'sdfsdf8798'),
    (787, 10, 4, 2024, '000078702024101114390000000000201', 'fd4598sd'),
    (787, 10, 4, 2024, '000078702024100114935000000000101', '8932hjty'),
    (787, 10, 4, 2024, '000078702024100114935000000000102', '79793213jk'),
    (787, 10, 4, 2024, '000078702024100114935000000000103', '8479k'),
]

# Column names shared by every table.
columns = ['emp_no', 'month', 'indc', 'year', 'key', 'Value']

# Build one DataFrame per table with the common schema.
df_01, df_02, df_03, df_04 = (
    spark.createDataFrame(rows, columns)
    for rows in (data_01, data_02, data_03, data_04)
)
我们将使用 `union` 函数将所有四个表合并到一个 DataFrame 中。
# Stack the four tables into a single DataFrame. All four share the same
# column list, so a positional union is safe here.
df_combined = df_01.union(df_02)
df_combined = df_combined.union(df_03)
df_combined = df_combined.union(df_04)

# Show every combined row without truncating the long key strings.
df_combined.show(truncate=False)
这将生成一个合并四个表中所有记录的 DataFrame。
现在，根据图像所示，我们需要确保记录按 `indc` 值的升序显示。
# Display the combined rows ordered by `indc` (sort is an alias of orderBy,
# and ascending is the default direction).
df_combined.sort("indc").show(truncate=False)
这将给出按 `indc` 列排序的最终结果；该列似乎是所提供数据中的关键区分因素。
如果需要进一步的转换(如过滤或重复数据删除),可以在观察组合结果后根据具体要求进行应用。
请告诉我这是否符合您的预期结果,或者是否需要进一步改进。