我有包含 1000 个文本行的数据框。
df['text']
我还有 5 个单词,我想知道每个单词代表文本的程度(0 到 1 之间)
每个分数都会在
df["word1"]
,df["word2"]
等
我会很高兴获得建议如何做到这一点
编辑
代表 = 单词与文本之间的语义相似度(距离)。
例如- 假设第一行的文字是“我想吃东西” 我有两个词:食物和房子。
所以在
df["food"]
中的分数会比在df["house"]
中更高
可以使用 sentence_transformers 库中预先训练的句子转换器模型来实现:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
class SemanticSimilarityCalculator:
    """Score how well each of a fixed set of words represents a text.

    Uses a pre-trained sentence-transformers model: the words are encoded
    once, then each text is encoded and compared to every word with cosine
    similarity (scores roughly in [0, 1] for related word/text pairs).
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2') -> None:
        """Load the sentence-transformer model.

        Args:
            model_name: Name of a pre-trained sentence-transformers model.
        """
        self.model = SentenceTransformer(model_name)
        # Both attributes stay None until encode_words() runs. Initializing
        # self.words here fixes a bug: previously only word_embeddings was
        # set, so calling add_similarity_scores_to_df() before encode_words()
        # raised AttributeError instead of the intended ValueError.
        self.word_embeddings = None
        self.words = None

    def encode_words(self, words: list[str]) -> None:
        """Encode *words* once so each text needs only a single encode call.

        Args:
            words: The candidate words to score texts against.
        """
        self.word_embeddings = self.model.encode(words, convert_to_tensor=True)
        self.words = words

    def calculate_similarity(self, text: str) -> list[float]:
        """Return one cosine-similarity score per encoded word for *text*.

        Args:
            text: The text to compare against the encoded words.

        Returns:
            Scores in the same order as the words passed to encode_words().

        Raises:
            ValueError: If encode_words() has not been called yet.
        """
        if self.word_embeddings is None:
            raise ValueError("Words must be encoded before calculating similarity.")
        text_embedding = self.model.encode(text, convert_to_tensor=True)
        # cos_sim returns a (1, n_words) matrix; take the single row as a list.
        similarities = util.cos_sim(text_embedding, self.word_embeddings)[0].tolist()
        return similarities

    def add_similarity_scores_to_df(self, df: pd.DataFrame, text_column: str) -> pd.DataFrame:
        """Add one "word_<w>" similarity column per encoded word to *df*.

        Mutates *df* in place and also returns it for convenience.

        Args:
            df: DataFrame containing the texts.
            text_column: Name of the column holding the texts to score.

        Returns:
            The same DataFrame with one new float column per word.

        Raises:
            ValueError: If encode_words() has not been called yet.
        """
        if self.words is None:
            raise ValueError("Words must be encoded before adding scores to the DataFrame.")
        similarity_columns = ["word_" + word for word in self.words]
        df[similarity_columns] = df[text_column].apply(
            lambda text: pd.Series(self.calculate_similarity(text))
        )
        return df
def main():
    """Demo: score three example sentences against five candidate words."""
    sample_texts = ["I want to eat", "The house is big", "I need to sleep"]
    frame = pd.DataFrame({'text': sample_texts})
    candidate_words = ["food", "house", "sleep", "drink", "run"]

    scorer = SemanticSimilarityCalculator()
    scorer.encode_words(candidate_words)
    scored = scorer.add_similarity_scores_to_df(frame, text_column="text")
    print(scored)


if __name__ == "__main__":
    main()
输出:
text word_food word_house word_sleep word_drink word_run
0 I want to eat 0.592410 0.215032 0.254065 0.370329 0.259350
1 The house is big 0.243262 0.672110 0.170785 0.213780 0.119716
2 I need to sleep 0.253703 0.222462 0.725105 0.358372 0.303838