如何根据左侧 pl.LazyFrame 各列中的内容,使用每个 pl.LazyFrame 中的两列来连接两个 pl.LazyFrame?
import polars as pl
# Left-hand sample frame: in this data each row has a lookup key in col_2,
# in col_3, or in neither.
lf1 = pl.LazyFrame(
    {
        "col_1": ["a", "b", "c"],
        "col_2": ["d", None, None],
        "col_3": [None, "e", None],
    },
    # Every column is a string column.
    schema=dict.fromkeys(("col_1", "col_2", "col_3"), pl.Utf8),
)

# Right-hand sample frame: col_a / col_b are the candidate match keys and
# col_c is the value to bring over.
lf2 = pl.LazyFrame(
    {
        "col_a": ["d", "xyz"],
        "col_b": ["xyz", "e"],
        "col_c": ["relevant_info_1", "relevant_info_2"],
    },
    schema=dict.fromkeys(("col_a", "col_b", "col_c"), pl.Utf8),
)
所需连接(join)的伪代码:
lf1.join(lf2,
when(col("col_2").is_not_null()).then(left_on="col_2", right_on="col_a")
when(col("col_3").is_not_null()).then(left_on="col_3", right_on="col_b")
otherwise(do_nothing)
)
预期结果:
col_1 | col_2 | col_3 | col_c
a | d | None | relevant_info_1
b | None | e | relevant_info_2
c | None | None | None
这里有一个方法。首先我们分别进行两个连接:
# Left-join lf2 onto lf1 once per candidate key column and materialize
# each result.
join1 = lf1.join(lf2, how="left", left_on="col_2", right_on="col_a").collect()
join2 = lf1.join(lf2, how="left", left_on="col_3", right_on="col_b").collect()
然后可以将 join1 和 join2 连接起来,并通过合并(coalesce)join1 和 join2 中的 col_c 来得到所需的 col_c。
最后我只保留了您要求的那些列,不过去掉 select 语句可能会有所帮助。
# Pair the two join results by col_1, then take col_c from join1 and fall
# back to join2's value (referenced below as col_c_right) where it is null.
(
    join1.join(join2.select(["col_1", "col_c"]), on="col_1")
    .with_columns(
        pl.coalesce([pl.col("col_c"), pl.col("col_c_right")]).alias("col_c")
    )
    .select(["col_1", "col_2", "col_3", "col_c"])
)
import polars as pl
# Module-level sample frames shared by foo1() and foo2().

# Left-hand frame: in this data each row has a lookup key in col_2, in
# col_3, or in neither.
LF1 = pl.LazyFrame(
    {
        "col_1": ["a", "b", "c"],
        "col_2": ["d", None, None],
        "col_3": [None, "e", None],
    },
    # Every column is a string column.
    schema=dict.fromkeys(("col_1", "col_2", "col_3"), pl.Utf8),
)

# Right-hand frame: col_a / col_b are the candidate match keys and col_c
# is the value to bring over.
LF2 = pl.LazyFrame(
    {
        "col_a": ["d", "xyz"],
        "col_b": ["xyz", "e"],
        "col_c": ["relevant_info_1", "relevant_info_2"],
    },
    schema=dict.fromkeys(("col_a", "col_b", "col_c"), pl.Utf8),
)
def foo1() -> pl.LazyFrame:
    """Build the conditional join by concatenating three partial results.

    The returned query is the vertical concatenation of:

    1. rows of ``LF1`` joined to ``LF2`` on ``col_2`` == ``col_a``,
    2. rows of ``LF1`` joined to ``LF2`` on ``col_3`` == ``col_b``,
    3. rows of ``LF1`` where both ``col_2`` and ``col_3`` are null, with a
       null string ``col_c`` added.

    Returns:
        A lazy query with the columns ``col_1``, ``col_2``, ``col_3`` and
        ``col_c``. This function does not call ``.collect()`` itself.

    NOTE(review): neither join passes ``how``, so both use the polars
    default strategy (inner, per the polars docs). A row whose key is
    non-null but unmatched would then appear in none of the three parts,
    and a row matching on both keys would appear twice. Confirm this is
    acceptable for real data.
    """
    output_columns = ["col_1", "col_2", "col_3", "col_c"]

    # The right-hand key columns are never part of the output, so each join
    # is projected directly; adding a null placeholder for col_a / col_b
    # only to drop it again in the select was dead work.
    lf_joined_on_col_2 = LF1.join(
        other=LF2, left_on=["col_2"], right_on=["col_a"]
    ).select(output_columns)
    lf_joined_on_col_3 = LF1.join(
        other=LF2, left_on=["col_3"], right_on=["col_b"]
    ).select(output_columns)

    # Rows without any lookup key still belong in the result, with a null
    # col_c typed as a string so the schemas line up for the concat.
    lf_rows_with_null_on_col_2_and_col_3 = LF1.filter(
        pl.col("col_2").is_null() & pl.col("col_3").is_null()
    ).with_columns(pl.lit(None).cast(pl.Utf8).alias("col_c"))

    return pl.concat(
        [
            lf_joined_on_col_2,
            lf_joined_on_col_3,
            lf_rows_with_null_on_col_2_and_col_3,
        ]
    )
def foo2() -> pl.LazyFrame:
    """Build the conditional join from two left joins and a coalesce.

    ``LF1`` is left-joined to ``LF2`` twice: once on ``col_2`` == ``col_a``
    and once on ``col_3`` == ``col_b``. The two results are then joined
    back together on ``col_1``, and ``col_c`` is taken from the first join,
    falling back to the second join's value where the first is null.

    Returns:
        A lazy query with the columns ``col_1``, ``col_2``, ``col_3`` and
        ``col_c``. This function does not call ``.collect()`` itself.

    NOTE(review): the final join pairs rows by ``col_1``, so this presumably
    relies on ``col_1`` being unique and non-null in ``LF1`` — confirm for
    real data.
    """
    lf_joined_on_col_2 = LF1.join(
        other=LF2, left_on=["col_2"], right_on=["col_a"], how="left"
    )
    lf_joined_on_col_3 = LF1.join(
        other=LF2, left_on=["col_3"], right_on=["col_b"], how="left"
    )
    return (
        lf_joined_on_col_2.join(
            # Only the pairing key and the looked-up value are needed from
            # the second join.
            lf_joined_on_col_3.select(["col_1", "col_c"]),
            on=["col_1"],
        )
        # The second join's col_c is referenced here as col_c_right; the
        # col_2-based match takes priority because it is listed first.
        .with_columns(pl.coalesce(["col_c", "col_c_right"]).alias("col_c"))
        .select(["col_1", "col_2", "col_3", "col_c"])
    )
@robertdj 提供的解决方案 foo2() 的速度大约是 foo1() 的两倍(注意:这两个函数都只返回 LazyFrame,并没有调用 .collect(),因此下面的计时衡量的是构建查询计划的开销,而不是实际执行连接的开销):
> python -m timeit -n 10000 -s "import test1" "test1.foo1()"
10000 loops, best of 5: 116 usec per loop
> python -m timeit -n 10000 -s "import test1" "test1.foo2()"
10000 loops, best of 5: 65 usec per loop