我需要缩写一系列字符串或短语:先提取其中首字母大写的单词,然后按各单词长度所占的比例分配字符数来生成缩写。 这就是我想要实现的目标:
TODO:如果缩写导致重复,我需要:
目前,我使用一个 Python 函数,并将其映射到 Polars Series 上。 有没有更高效的方法,可以用 Polars 内置方法来实现这一点?
这是我目前的方法:
import polars as pl
def _abbreviate_phrase(phrase: str, length: int) -> str:
"""Abbreviate a single phrase by a constant length.
The function aims to abbreviate phrases into a constant length by focusing
on capitalized words and adjusting them according to their proportional lengths.
Example:
phrase = 'Commercial & Professional'
length = 4
res = _abbreviate_phrase(phrase, length)
print(res)
# CoPr
"""
# determine size of slices
capitalized_words = [word for word in phrase.split(' ') if word[0].isupper()]
word_lengths = [len(word) for word in capitalized_words]
total_word_length = sum(word_lengths)
if total_word_length == 0:
return '' # Return empty if no capitalized words
proportional_lengths = [round(wl / total_word_length * length) for wl in word_lengths]
total_proportional_length = sum(proportional_lengths)
# Adjust slices if their total length doesn't match target length
if total_proportional_length < length:
for i in range(length - total_proportional_length):
proportional_lengths[i] += 1
elif total_proportional_length > length:
for i in range(total_proportional_length - length):
proportional_lengths[i] -= 1
# Combine the abbreviated words and return the result
abbreviated_phrase = ''.join([word[:plength] for word, plength in zip(capitalized_words, proportional_lengths)])
return abbreviated_phrase
def abbreviate_phrases(phrases: pl.Series, length: int) -> pl.Series:
    """Abbreviate every phrase of a Series to the same target length.

    Each element is passed through ``_abbreviate_phrase`` with the given
    ``length`` and the results are returned as a string Series.

    Example:
        phrases = pl.Series([
            'Sunshine',
            'Sunset',
            'Climate Change and Environmental Impact',
            'Health and Wellness',
            'Quantum Computing and Physics',
            'Global Warming and Renewable Resources'
        ])
        length = 4
        res = abbreviate_phrases(phrases, length)
        print(res)
        # Series: '' [str]
        # [
        #     "Suns"
        #     "Suns"
        #     "CEnI"
        #     "HeWe"
        #     "QCoP"
        #     "GWRR"
        # ]
    """

    def _shorten(text: str) -> str:
        # Bind the shared target length so the mapper takes a single argument.
        return _abbreviate_phrase(text, length)

    # NOTE: duplicated abbreviations (e.g. 'Sunshine' and 'Sunset' both give
    # 'Suns') are neither detected nor reported here.
    return phrases.map_elements(_shorten, return_dtype=pl.String)
给你一个使用 Polars 内置表达式的实现:
# Sample input: one string column "p" holding the phrases to abbreviate.
phrases = pl.DataFrame({
"p": [
'Sunshine',
'Sunset',
'Climate Change and Environmental Impact',
'Health and Wellness',
'Quantum Computing and Physics',
'Global Warming and Renewable Resources',
# Phrase without any capitalized word.
"no capital letters",
]
})
def abbreviate_phrase(
    df: pl.DataFrame,
    *,
    phrase_column: pl.Expr | str,
    length: int = 4,
) -> pl.DataFrame:
    """Abbreviate the phrases of a DataFrame column using Polars expressions.

    Capitalized words are extracted from each phrase, every word receives a
    share of ``length`` proportional to its character count, and the word
    prefixes are joined back into one abbreviation per input row. A warning
    is printed when two rows end up with the same abbreviation.

    Args:
        df: Frame containing the phrases.
        phrase_column: Column name, or expression, selecting the phrases.
        length: Target number of characters per abbreviation.

    Returns:
        A frame with the phrase column and an ``abbreviates`` column, one row
        per input row, in the input row order.
    """
    if isinstance(phrase_column, str):
        phrase_column = pl.col(phrase_column)

    # "cap_words" and "index" are created in the pipeline below; after the
    # explode there is one row per extracted word, and "index" identifies
    # the phrase each word came from.
    word_lengths = pl.col("cap_words").str.len_chars()
    total_word_length = word_lengths.sum().over("index")
    proportional_lengths = (word_lengths / total_word_length * length).round()

    total_proportional_length = pl.col("proportional_lengths").sum().over("index")
    length_diff = length - total_proportional_length
    proportion_adjustment = pl.when(length_diff < 0).then(-1).otherwise(1)

    # 1-based position of the word inside its phrase.
    word_position = pl.col("index").cum_count().over("index")
    # Spread the rounding gap over the first |length_diff| words, one
    # character each. Using `>=` (not `==`) matters when the gap exceeds 1:
    # otherwise only a single word would be adjusted.
    slice_length = proportional_lengths + (
        pl.when(
            length_diff != 0,
            length_diff.abs() >= word_position,
        )
        .then(proportion_adjustment)
        .otherwise(0)
    )

    output = (
        df
        .with_row_index()
        # NOTE: the pattern needs a capital followed by at least one more
        # word character, so single-letter capitalized words are not
        # extracted.
        .with_columns(cap_words=phrase_column.str.extract_all(r"[A-Z]\w+"))
        .explode("cap_words")
        .with_columns(proportional_lengths=proportional_lengths)
        .with_columns(sliced=pl.col("cap_words").str.slice(0, slice_length))
        # maintain_order keeps the groups in input row order; without it the
        # order of the aggregated rows is not guaranteed.
        .group_by("index", maintain_order=True)
        .agg(
            phrase_column.first(),
            abbreviates=pl.col("sliced").str.join(""),
        )
        .drop("index")
    )

    if output["abbreviates"].is_duplicated().any():
        print('WARNING: There are duplicated abbreviations.')
    return output
# Run the Polars implementation on the sample frame and print the result.
print(abbreviate_phrase(phrases, phrase_column="p"))
WARNING: There are duplicated abbreviations.
shape: (7, 2)
┌─────────────────────────────────┬─────────────┐
│ p ┆ abbreviates │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════════════════════════╪═════════════╡
│ Sunshine ┆ Suns │
│ Sunset ┆ Suns │
│ Climate Change and Environment… ┆ CEnI │
│ Health and Wellness ┆ HeWe │
│ Quantum Computing and Physics ┆ QCoP │
│ Global Warming and Renewable R… ┆ GWRR │
│ no capital letters ┆ │
└─────────────────────────────────┴─────────────┘