如何使用 Polars 内置方法缩写短语?

问题描述 投票:0回答:1

我需要通过提取大写单词来缩写一系列单词或短语,然后根据各个单词长度所占的比例生成缩写。 这就是我想要实现的目标:

  • 从每个短语中提取大写单词。
  • 根据每个短语中大写单词的总长度计算比例长度。
  • 调整长度以确保缩写满足目标长度(例如 4 个字符)。

TODO:如果缩写导致重复,我需要:

  • 自动解决它们(例如,通过添加数字或修改字符)
  • 用警告标记它们。

目前,我正在使用一个 Python 函数并将其映射到 Polars Series 上。 有没有更高效的方法,可以用 Polars 内置方法来做到这一点?

这是我目前的方法:

import polars as pl


def _abbreviate_phrase(phrase: str, length: int) -> str:
    """Abbreviate a single phrase by a constant length.

    The function aims to abbreviate phrases into a constant length by focusing
    on capitalized words and adjusting them according to their proportional lengths.

    Example:
        phrase = 'Commercial & Professional'
        length = 4
        res = _abbreviate_phrase(phrase, length)
        print(res)
        # CoPr
    """
    # determine size of slices
    capitalized_words = [word for word in phrase.split(' ') if word[0].isupper()]
    word_lengths = [len(word) for word in capitalized_words]
    total_word_length = sum(word_lengths)

    if total_word_length == 0:
        return ''  # Return empty if no capitalized words

    proportional_lengths = [round(wl / total_word_length * length) for wl in word_lengths]
    total_proportional_length = sum(proportional_lengths)

    # Adjust slices if their total length doesn't match target length
    if total_proportional_length < length:
        for i in range(length - total_proportional_length):
            proportional_lengths[i] += 1
    elif total_proportional_length > length:
        for i in range(total_proportional_length - length):
            proportional_lengths[i] -= 1

    # Combine the abbreviated words and return the result
    abbreviated_phrase = ''.join([word[:plength] for word, plength in zip(capitalized_words, proportional_lengths)])
    return abbreviated_phrase


def abbreviate_phrases(phrases: pl.Series, length: int) -> pl.Series:
    """Abbreviate every phrase of a Series to a constant length.

    Each element is passed through ``_abbreviate_phrase`` with the given
    ``length``; the result is returned as a new ``pl.String`` Series.

    Example:
        phrases = pl.Series([
            'Sunshine',
            'Sunset',
            'Climate Change and Environmental Impact',
            'Health and Wellness',
            'Quantum Computing and Physics',
            'Global Warming and Renewable Resources'
        ])
        length = 4
        res = abbreviate_phrases(phrases, length)
        print(res)
        # Series: '' [str]
        # [
        #   "Suns"
        #   "Suns"
        #   "CEnI"
        #   "HeWe"
        #   "QCoP"
        #   "GWRR"
        # ]
    """

    def _shorten(phrase: str) -> str:
        # Bind the shared target length for the element-wise mapping.
        return _abbreviate_phrase(phrase, length)

    # TODO: warn about (or resolve) duplicated abbreviations; not implemented.
    return phrases.map_elements(_shorten, return_dtype=pl.String)
python string python-polars
1个回答
0
投票

给你

# Sample input: a one-column frame "p" holding the phrases to abbreviate,
# including one phrase without any capitalized word.
phrases = pl.Series(
    "p",
    [
        'Sunshine',
        'Sunset',
        'Climate Change and Environmental Impact',
        'Health and Wellness',
        'Quantum Computing and Physics',
        'Global Warming and Renewable Resources',
        "no capital letters",
    ],
).to_frame()

def abbreviate_phrase(
    df: pl.DataFrame,
    *,
    phrase_column: pl.Expr | str,
    length: int = 4,
) -> pl.DataFrame:
    """Abbreviate the phrases of a DataFrame column using Polars expressions.

    Capitalized words are extracted from every phrase, each word is given a
    prefix size proportional to its share of the phrase's capitalized
    characters, and the sizes of the leading words are nudged by one so that
    they add up to ``length``. The prefixes are then joined per phrase.

    Args:
        df: Frame containing the phrases.
        phrase_column: Column holding the phrases, as a name or expression.
        length: Target abbreviation length. Keyword-only; defaults to 4.

    Returns:
        A frame with the phrase column and an ``abbreviates`` column, one row
        per input row, in input order.

    Side effects:
        Prints a warning when two or more rows share the same abbreviation.
    """
    if isinstance(phrase_column, str):
        phrase_column = pl.col(phrase_column)

    # These expressions refer to the helper columns "index", "cap_words" and
    # "proportional_lengths", all of which are created by the pipeline below.
    word_lengths = pl.col("cap_words").str.len_chars()
    total_word_length = word_lengths.sum().over("index")
    proportional_lengths = (word_lengths / total_word_length * length).round()
    total_proportional_length = pl.col("proportional_lengths").sum().over("index")
    length_diff = length - total_proportional_length
    proportion_adjustment = pl.when(length_diff < 0).then(-1).otherwise(1)
    # Running position of each word inside its phrase.
    # NOTE(review): assumes cum_count is 1-based, as in current Polars.
    position_in_phrase = pl.col("index").cum_count().over("index")
    slice_length = (
        proportional_lengths + (
            pl.when(
                length_diff != 0,
                # Adjust the first |diff| words, not only the |diff|-th one.
                position_in_phrase <= length_diff.abs(),
            )
            .then(proportion_adjustment)
            .otherwise(0)
        )
    )

    output = (
        df
        .with_row_index()
        # Raw string so "\w" is not an invalid escape; "\w*" also keeps
        # single-letter capitalized words such as "A".
        .with_columns(cap_words=phrase_column.str.extract_all(r"[A-Z]\w*"))
        .explode("cap_words")
        .with_columns(proportional_lengths=proportional_lengths)
        .with_columns(sliced=pl.col("cap_words").str.slice(0, slice_length))
        # maintain_order keeps the groups in input row order.
        .group_by("index", maintain_order=True)
        .agg(
            phrase_column.first(),
            abbreviates=pl.col("sliced").str.join(""),
        )
        .drop("index")
    )

    if not output.filter(output.select("abbreviates").is_duplicated()).is_empty():
        print('WARNING: There are duplicated abbreviations.')

    return output


# Run the Polars implementation on the sample frame and show the result.
print(abbreviate_phrase(df=phrases, phrase_column="p"))
WARNING: There are duplicated abbreviations.
shape: (7, 2)
┌─────────────────────────────────┬─────────────┐
│ p                               ┆ abbreviates │
│ ---                             ┆ ---         │
│ str                             ┆ str         │
╞═════════════════════════════════╪═════════════╡
│ Sunshine                        ┆ Suns        │
│ Sunset                          ┆ Suns        │
│ Climate Change and Environment… ┆ CEnI        │
│ Health and Wellness             ┆ HeWe        │
│ Quantum Computing and Physics   ┆ QCoP        │
│ Global Warming and Renewable R… ┆ GWRR        │
│ no capital letters              ┆             │
└─────────────────────────────────┴─────────────┘
© www.soinside.com 2019 - 2024. All rights reserved.