我有一个数据框,其中包含多个包含列表项的列。
我最终想要对这些列执行 `explode`(但只有当每一行中各列列表的元素个数相同时,它才能正常工作):
# Explode list columns "a" and "b" together (requires equal list lengths within each row).
df.explode("a", "b")
为了让每一行中所有列的列表元素个数(number of elements)保持一致,我会向较短的列表中插入虚拟(占位)元素。
def generate_dummy(c1, c2, fill=""):
    """Build a per-row list of filler values sized by a length difference.

    Args:
        c1: Name of the column holding the larger (target) list length.
        c2: Name of the column holding the length of the list to be padded.
        fill: Value repeated to form the padding. Defaults to "" for
            backward compatibility; pass a value matching the inner dtype of
            the list being padded (e.g. -1 for integer lists).

    Returns:
        A polars expression yielding, for every row, a list containing
        ``max(c1 - c2, 0)`` copies of ``fill``.
    """
    # `[""] * <Expr>` yields an Expr, which pl.lit() rejects ("cannot create
    # expression literal for value of type Expr"). The repetition count has
    # to be applied per row by the engine, which is what repeat_by does.
    missing = pl.col(c1).cast(pl.Int32) - pl.col(c2).cast(pl.Int32)
    # Rows where c2 is already the longer one need no padding at all.
    return pl.lit(fill).repeat_by(missing.clip(lower_bound=0))


# Original dataframe
df = pl.DataFrame({"a": [[1, 2], [3], [4, 5], [1]], "b": [[4, 5, 7], [6], [4, 5], [3, 2]]})
# Collect the list lengths in each column.
df = df.with_columns(alens=pl.col("a").list.len(), blens=pl.col("b").list.len())
# Pad whichever list is shorter in each row. Every column gets its own
# expression so that "a" and "b" both keep their names; a single
# when/then/otherwise would produce only one output column.
df = df.with_columns(
    pl.col("a").list.concat(generate_dummy("blens", "alens", fill=-1)),
    pl.col("b").list.concat(generate_dummy("alens", "blens", fill=-1)),
)
但是在计算需要添加的虚拟元素个数(# of dummy elements)时,我陷入了困境。
我得到的错误如下:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[304], line 3
1 df.with_columns(
2 pl.col("a").list.concat(
----> 3 pl.lit(
4 [""] * (pl.col("alens").cast(pl.Int32) - pl.col("blens").cast(pl.Int32)), dtype=pl.List(pl.String)
5 )
6 )
7 )
File ~/data/opt/mambaforge/envs/orbital_py395/lib/python3.9/site-packages/polars/functions/lit.py:130, in lit(value, dtype, allow_object)
127 return lit(pl.Series("literal", [value], dtype=dtype))
129 if dtype:
--> 130 return wrap_expr(plr.lit(value, allow_object)).cast(dtype)
132 try:
133 # numpy literals like np.float32(0) have item/dtype
134 item = value.item()
TypeError: cannot create expression literal for value of type Expr: <Expr ['[(Series[literal]) * ([(col("a…'] at 0x15007DCF7370>
Hint: Pass `allow_object=True` to accept any value and create a literal of type Object.
我尝试传入关键字参数 `allow_object=True`,但最终出现了以下错误:
---------------------------------------------------------------------------
ComputeError Traceback (most recent call last)
Cell In[305], line 1
----> 1 df.with_columns(
2 pl.col("a").list.concat(
3 pl.lit(
4 [""] * (pl.col("alens").cast(pl.Int32) - pl.col("blens").cast(pl.Int32)), dtype=pl.List(pl.String), allow_object=True
5 )
6 )
7 )
File ~/data/opt/mambaforge/envs/orbital_py395/lib/python3.9/site-packages/polars/dataframe/frame.py:8310, in DataFrame.with_columns(self, *exprs, **named_exprs)
8164 def with_columns(
8165 self,
8166 *exprs: IntoExpr | Iterable[IntoExpr],
8167 **named_exprs: IntoExpr,
8168 ) -> DataFrame:
8169 """
8170 Add columns to this DataFrame.
8171
(...)
8308 └─────┴──────┴─────────────┘
8309 """
-> 8310 return self.lazy().with_columns(*exprs, **named_exprs).collect(_eager=True)
File ~/data/opt/mambaforge/envs/orbital_py395/lib/python3.9/site-packages/polars/lazyframe/frame.py:1816, in LazyFrame.collect(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, no_optimization, streaming, background, _eager, **_kwargs)
1813 # Only for testing purposes atm.
1814 callback = _kwargs.get("post_opt_callback")
-> 1816 return wrap_df(ldf.collect(callback))
ComputeError: cannot cast 'Object' type
您可以为每一行动态生成一个包含所需数量虚拟元素的列表,并将其拼接(concat)到原有的列表列之后;这样每一行中各列的列表长度一致,就可以进行 explode 了。
# Row-wise length of the longest list among the columns to be exploded.
target_length = pl.max_horizontal(
    pl.col("a").list.len(),
    pl.col("b").list.len(),
)
# Append (target_length - current length) copies of -1 to each list column.
df.with_columns(
    [
        pl.col(name).list.concat(
            pl.lit(-1).repeat_by(target_length - pl.col(name).list.len())
        )
        for name in ("a", "b")
    ]
)
shape: (4, 2)
┌────────────┬───────────┐
│ a ┆ b │
│ --- ┆ --- │
│ list[i64] ┆ list[i64] │
╞════════════╪═══════════╡
│ [1, 2, -1] ┆ [4, 5, 7] │
│ [3] ┆ [6] │
│ [4, 5] ┆ [4, 5] │
│ [1, -1] ┆ [3, 2] │
└────────────┴───────────┘