将派生数据添加到 Polars multiIndex LazyFrame 的高效方法

Question

我正在使用 Polars，需要向一个多重索引的 LazyFrame 添加派生数据。为了修改数据，我先对 DataFrame 做不带任何聚合的透视（pivot），进行一些计算，然后再将其逆透视（melt）回原始的长格式。这个操作需要在不同的索引层级上进行。Polars 文档提到 pivot 操作在惰性（lazy）模式下不可用，因此每次透视/逆透视都必须在即时（eager）模式和惰性模式之间来回切换。有没有更高效的方法可以实现这一目标，而无需在即时模式和惰性模式之间切换？

这是一个例子：

import numpy as np
import polars as pl

def get_raw_data() -> pl.LazyFrame:
    """Build the example long-format LazyFrame with unevenly sized ids.

    Ids ``A`` and ``B`` get three dates each and ``C`` gets two; every
    (id, date) pair carries a ``height`` and a ``width`` row. Values come
    from a generator seeded with 111 and are rounded to two decimals.

    Returns:
        A LazyFrame with the columns ``id``, ``measure``, ``date`` and
        ``value``.
    """
    id_labels = np.array(['A', 'B', 'C'])
    measure_labels = np.array(['height', 'width'])
    dates_per_id: np.ndarray = np.array([3, 3, 2])
    n_measures = measure_labels.size

    # Each id label is repeated once per (measure, date) combination.
    id_column = np.repeat(id_labels, dates_per_id * n_measures)
    # Within an id, all rows of one measure come before the next measure.
    measure_column = np.concatenate(
        [np.repeat(measure_labels, count) for count in dates_per_id]
    )
    # Dates restart from 0 for every (id, measure) group.
    date_column = np.concatenate(
        [np.arange(count) for count in np.repeat(dates_per_id, n_measures)]
    )
    rng = np.random.default_rng(111)
    value_column = np.round(rng.random(n_measures * dates_per_id.sum()), 2)

    frame = pl.DataFrame({
        'id': id_column,
        'measure': measure_column,
        'date': date_column,
        'value': value_column,
    })
    return frame.lazy()

# Materialize the lazy example frame and print it; the expected output follows.
print(get_raw_data().collect())
# shape: (16, 4)
# ┌─────┬─────────┬──────┬───────┐
# │ id  ┆ measure ┆ date ┆ value │
# │ --- ┆ ---     ┆ ---  ┆ ---   │
# │ str ┆ str     ┆ i64  ┆ f64   │
# ╞═════╪═════════╪══════╪═══════╡
# │ A   ┆ height  ┆ 0    ┆ 0.15  │
# │ A   ┆ height  ┆ 1    ┆ 0.17  │
# │ A   ┆ height  ┆ 2    ┆ 0.51  │
# │ A   ┆ width   ┆ 0    ┆ 0.66  │
# │ A   ┆ width   ┆ 1    ┆ 0.77  │
# │ …   ┆ …       ┆ …    ┆ …     │
# │ B   ┆ width   ┆ 2    ┆ 0.72  │
# │ C   ┆ height  ┆ 0    ┆ 0.08  │
# │ C   ┆ height  ┆ 1    ┆ 0.42  │
# │ C   ┆ width   ┆ 0    ┆ 0.4   │
# │ C   ┆ width   ┆ 1    ┆ 0.94  │
# └─────┴─────────┴──────┴───────┘

def expr_add_categories() -> list[pl.Expr]:
    """Return the expressions that derive new measure columns.

    The expressions reference the ``height`` and ``width`` columns, so they
    must be applied to a frame that has one column per measure.

    Returns:
        A one-element list holding the ``height / width`` expression,
        aliased as ``ratio``.
    """
    return [(pl.col('height') / pl.col('width')).alias('ratio')]

def expr_add_ids() -> list[pl.Expr]:
    """Return the expressions that derive new id columns.

    The expressions reference the ``A``, ``B`` and ``C`` columns, so they
    must be applied to a frame that has one column per id.

    Returns:
        A list with two expressions: ``A / B`` aliased as ``AB`` and
        ``A / C`` aliased as ``AC``.
    """
    return [
        (pl.col('A') / pl.col('B')).alias('AB'),
        (pl.col('A') / pl.col('C')).alias('AC'),
    ]

def add_categories(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add derived measure categories to a long-format LazyFrame.

    The frame is pivoted so that every ``measure`` becomes its own column,
    the expressions from ``expr_add_categories()`` are evaluated, and the
    result is melted back into the ``id`` / ``measure`` / ``date`` /
    ``value`` layout.

    Args:
        df: Long-format frame with ``id``, ``measure``, ``date`` and
            ``value`` columns.

    Returns:
        A LazyFrame in the same layout that also contains the derived
        measures, with rows containing nulls dropped, sorted by ``id``,
        ``measure`` and ``date``.
    """
    return (
        df
        .collect()  # pivot is eager-only, so the plan is materialized here
        .pivot(index=['id', 'date'], columns='measure', values='value')
        .lazy()     # resume lazy evaluation for the remaining steps
        .with_columns(expr_add_categories())
        .melt(id_vars=['id', 'date'], variable_name='measure')
        .drop_nulls()
        .select(['id', 'measure', 'date', 'value'])
        .sort(['id', 'measure', 'date'])
        # After a multi-column sort only the leading key is globally ordered.
        # Flagging 'measure' and 'date' as sorted would be a false promise to
        # the query engine and can lead to incorrect results downstream.
        .set_sorted('id')
    )

def add_ids(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add derived ids to a long-format LazyFrame.

    The frame is pivoted so that every ``id`` becomes its own column, the
    expressions from ``expr_add_ids()`` are evaluated, and the result is
    melted back into the ``id`` / ``measure`` / ``date`` / ``value`` layout.

    Args:
        df: Long-format frame with ``id``, ``measure``, ``date`` and
            ``value`` columns.

    Returns:
        A LazyFrame in the same layout that also contains the derived ids,
        with rows containing nulls dropped, sorted by ``id``, ``measure``
        and ``date``.
    """
    return (
        df
        .collect()  # pivot is eager-only, so the plan is materialized here
        .pivot(index=['measure', 'date'], columns='id', values='value')
        .lazy()     # resume lazy evaluation for the remaining steps
        .with_columns(expr_add_ids())
        .melt(id_vars=['measure', 'date'], variable_name='id')
        .drop_nulls()
        .select(['id', 'measure', 'date', 'value'])
        .sort(['id', 'measure', 'date'])
        # After a multi-column sort only the leading key is globally ordered.
        # Flagging 'measure' and 'date' as sorted would be a false promise to
        # the query engine and can lead to incorrect results downstream.
        .set_sorted('id')
    )

def get_modified_data() -> pl.LazyFrame:
    """Return the example data extended with derived measures and ids.

    The raw frame is first passed through ``add_categories`` and the result
    is then passed through ``add_ids``.
    """
    raw = get_raw_data()
    with_categories = add_categories(raw)
    return add_ids(with_categories)

# Materialize the extended frame and print it; the expected output follows.
print(get_modified_data().collect())
# shape: (39, 4)
# ┌─────┬─────────┬──────┬──────────┐
# │ id  ┆ measure ┆ date ┆ value    │
# │ --- ┆ ---     ┆ ---  ┆ ---      │
# │ str ┆ str     ┆ i64  ┆ f64      │
# ╞═════╪═════════╪══════╪══════════╡
# │ A   ┆ height  ┆ 0    ┆ 0.15     │
# │ A   ┆ height  ┆ 1    ┆ 0.17     │
# │ A   ┆ height  ┆ 2    ┆ 0.51     │
# │ A   ┆ ratio   ┆ 0    ┆ 0.227273 │
# │ A   ┆ ratio   ┆ 1    ┆ 0.220779 │
# │ …   ┆ …       ┆ …    ┆ …        │
# │ C   ┆ height  ┆ 1    ┆ 0.42     │
# │ C   ┆ ratio   ┆ 0    ┆ 0.2      │
# │ C   ┆ ratio   ┆ 1    ┆ 0.446809 │
# │ C   ┆ width   ┆ 0    ┆ 0.4      │
# │ C   ┆ width   ┆ 1    ┆ 0.94     │
# └─────┴─────────┴──────┴──────────┘

# *************************************************************
# Python: 3.12.0
# Numpy: 1.26.4
# Polars: 0.20.31
# *************************************************************

Answer 1

来自

pl.DataFrame.pivot

的文档：

请注意，
pivot
仅在 eager 模式下可用。如果您事先知道该列的唯一值，可以使用
polars.LazyFrame.groupby()
在惰性（lazy）模式下获得与上面相同的结果：[...]

在您的具体示例中，您需要提前知道

measure

（即

"height"

和

"width"

）和

id

（即

"A"

、

"B"

、

"C"

）的唯一值才能分别重构

add_categories

和

add_ids

。

重构后可完全在惰性模式下运行的函数如下：

def add_categories(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add derived measure categories without leaving lazy mode.

    Instead of an eager pivot, the frame is grouped by ``id`` and ``date``
    and one column per known measure is built by filtering ``value``. The
    measure names are hard-coded because they must be known up front. The
    expressions from ``expr_add_categories()`` are then evaluated and the
    result is melted back into the long layout.

    Args:
        df: Long-format frame with ``id``, ``measure``, ``date`` and
            ``value`` columns.

    Returns:
        A LazyFrame in the same layout that also contains the derived
        measures, with rows containing nulls dropped, sorted by ``id``,
        ``measure`` and ``date``.
    """
    measurement_categories = ["height", "width"]
    return (
        df
        .group_by("id", "date", maintain_order=True)
        .agg(
            # One output column per measure: the first value in the group
            # whose 'measure' equals that category.
            pl.col("value").filter(pl.col("measure") == cat).first().alias(cat)
            for cat in measurement_categories
        )
        .with_columns(
            expr_add_categories()
        )
        .melt(id_vars=['id', 'date'], variable_name='measure')
        .drop_nulls()
        .select(['id', 'measure', 'date', 'value'])
        .sort(['id', 'measure', 'date'])
        # After a multi-column sort only the leading key is globally ordered.
        # Flagging 'measure' and 'date' as sorted would be a false promise to
        # the query engine and can lead to incorrect results downstream.
        .set_sorted('id')
    )

def add_ids(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add derived ids without leaving lazy mode.

    Instead of an eager pivot, the frame is grouped by ``measure`` and
    ``date`` and one column per known id is built by filtering ``value``.
    The id names are hard-coded because they must be known up front. The
    expressions from ``expr_add_ids()`` are then evaluated and the result is
    melted back into the long layout.

    Args:
        df: Long-format frame with ``id``, ``measure``, ``date`` and
            ``value`` columns.

    Returns:
        A LazyFrame in the same layout that also contains the derived ids,
        with rows containing nulls dropped, sorted by ``id``, ``measure``
        and ``date``.
    """
    ids = ["A", "B", "C"]
    return (
        df
        .group_by("measure", "date", maintain_order=True)
        .agg(
            # One output column per id: the first value in the group whose
            # 'id' equals that label. The loop variable is not called `id`
            # so the builtin is not shadowed.
            pl.col("value").filter(pl.col("id") == id_label).first().alias(id_label)
            for id_label in ids
        )
        .with_columns(expr_add_ids())
        .melt(id_vars=['measure', 'date'], variable_name='id')
        .drop_nulls()
        .select(['id', 'measure', 'date', 'value'])
        .sort(['id', 'measure', 'date'])
        # After a multi-column sort only the leading key is globally ordered.
        # Flagging 'measure' and 'date' as sorted would be a false promise to
        # the query engine and can lead to incorrect results downstream.
        .set_sorted('id')
    )

将派生数据添加到 Polars multiIndex LazyFrame 的有效方法

问题描述投票：0回答：1

1个回答

最新问题

将派生数据添加到 Polars multiIndex LazyFrame 的有效方法

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1