# Unpivot the four value columns into (Description, value) rows while keeping
# the id columns on every row.
# NOTE(review): assumes `ab` is a PySpark DataFrame (DataFrame.melt with these
# keyword names) -- `ab` is defined outside this snippet; confirm.
ab = ab.melt(
    ids=["id", "emp_id", "comp_id", "site_id", "emp_name"],
    values=["location", "doj", "status", "time"],
    variableColumnName="Description",
    valueColumnName="value",
)
下面是一种更复杂的方法：不使用 melt，而是先用 create_map 把需要转换的列打包成一个 map 列，再用 explode 将其展开为"键/值"行。
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, col
from pyspark.sql.types import StructType, StructField, StringType
from itertools import chain
# Wide-to-long reshape without melt/unstack: pack the attribute columns into a
# single map column, then explode that map into one (key, value) row per entry.
spark = SparkSession.builder.appName("NoUnstack").getOrCreate()

# Every column is a nullable string, so build the fields from the names alone.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "id",
            "emp_id",
            "comp_id",
            "site_id",
            "emp_name",
            "emp_address",
            "join_date",
            "status",
            "time",
        )
    ]
)

data = [
    ("aaa111", "emp_123", "123-1", "rrr1", "aaa sdf er", "mumbai", "22-4-2024", "active", "5:00"),
    ("aaa222", "emp_124", "123-2", "rrr2", "aaae sdf er", "jalna", "22-4-2025", "inactive", "6:00"),
    ("aaa333", "emp_125", "123-3", "rrr3", "aaa sdef er", "pune", "22-4-2026", "active", "7:00"),
]

df = spark.createDataFrame(data, schema)
df.show()

# Columns kept as-is on every output row vs. columns folded into the map.
id_cols = ["id", "emp_id", "comp_id", "site_id"]
map_cols = ["emp_name", "emp_address", "join_date", "status", "time"]

# create_map expects a flat key, value, key, value, ... argument list: the key
# is the column name as a literal, the value is the column itself.
new_value = list(
    chain.from_iterable((lit(name), col(name)) for name in map_cols)
)
print(*new_value)

df_with_map = (
    df.withColumn("attributes", create_map(*new_value))
    .drop(*map_cols)
)
df_with_map.show(truncate=False)

# Exploding a map column yields two columns per entry (key and value), hence
# the two aliases.
df_exploded = df_with_map.select(
    *id_cols,
    explode("attributes").alias("description", "values"),
)
df_exploded.show(truncate=False)
输出:
+------+-------+-------+-------+-----------+-----------+---------+--------+----+
| id| emp_id|comp_id|site_id| emp_name|emp_address|join_date| status|time|
+------+-------+-------+-------+-----------+-----------+---------+--------+----+
|aaa111|emp_123| 123-1| rrr1| aaa sdf er| mumbai|22-4-2024| active|5:00|
|aaa222|emp_124| 123-2| rrr2|aaae sdf er| jalna|22-4-2025|inactive|6:00|
|aaa333|emp_125| 123-3| rrr3|aaa sdef er| pune|22-4-2026| active|7:00|
+------+-------+-------+-------+-----------+-----------+---------+--------+----+
Column<'emp_name'> Column<'emp_name'> Column<'emp_address'> Column<'emp_address'> Column<'join_date'> Column<'join_date'> Column<'status'> Column<'status'> Column<'time'> Column<'time'>
+------+-------+-------+-------+---------------------------------------------------------------------------------------------------------+
|id |emp_id |comp_id|site_id|attributes |
+------+-------+-------+-------+---------------------------------------------------------------------------------------------------------+
|aaa111|emp_123|123-1 |rrr1 |{emp_name -> aaa sdf er, emp_address -> mumbai, join_date -> 22-4-2024, status -> active, time -> 5:00} |
|aaa222|emp_124|123-2 |rrr2 |{emp_name -> aaae sdf er, emp_address -> jalna, join_date -> 22-4-2025, status -> inactive, time -> 6:00}|
|aaa333|emp_125|123-3 |rrr3 |{emp_name -> aaa sdef er, emp_address -> pune, join_date -> 22-4-2026, status -> active, time -> 7:00} |
+------+-------+-------+-------+---------------------------------------------------------------------------------------------------------+
+------+-------+-------+-------+-----------+-----------+
|id |emp_id |comp_id|site_id|description|values |
+------+-------+-------+-------+-----------+-----------+
|aaa111|emp_123|123-1 |rrr1 |emp_name |aaa sdf er |
|aaa111|emp_123|123-1 |rrr1 |emp_address|mumbai |
|aaa111|emp_123|123-1 |rrr1 |join_date |22-4-2024 |
|aaa111|emp_123|123-1 |rrr1 |status |active |
|aaa111|emp_123|123-1 |rrr1 |time |5:00 |
|aaa222|emp_124|123-2 |rrr2 |emp_name |aaae sdf er|
|aaa222|emp_124|123-2 |rrr2 |emp_address|jalna |
|aaa222|emp_124|123-2 |rrr2 |join_date |22-4-2025 |
|aaa222|emp_124|123-2 |rrr2 |status |inactive |
|aaa222|emp_124|123-2 |rrr2 |time |6:00 |
|aaa333|emp_125|123-3 |rrr3 |emp_name |aaa sdef er|
|aaa333|emp_125|123-3 |rrr3 |emp_address|pune |
|aaa333|emp_125|123-3 |rrr3 |join_date |22-4-2026 |
|aaa333|emp_125|123-3 |rrr3 |status |active |
|aaa333|emp_125|123-3 |rrr3 |time |7:00 |
+------+-------+-------+-------+-----------+-----------+