将 Pandas 对象数据帧转换为向量数据帧

问题描述 投票:0回答:1

我有一个 Pandas DataFrame(超过 1000 行),其中包含数字、对象、字符串和布尔值。我想将 DataFrame 的每个“单元格”转换为向量,并使用得到的向量进行后续处理。然后我计划比较各行向量之间的相似度。


Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10

12,65e1e35b7fe333,harry Joe,1,FALSE,swe,1,,,text1,0


Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10


Vect12,Vect 13,Vect 14,Vect4,Vect5,Vect6,Vect7,Vect 15,Vect 16,Vect 17,Vect11

Vect18,Vect 19,Vect 20,Vect4,Vect5,Vect6,Vect7,Vect 21,Vect 22,Vect 23,Vect11

在 Python 中使用 scikit-learn 是否有好的方法可以做到这一点?


import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# TF-IDF vectorizer created with its default settings.
vectorizer = TfidfVectorizer()

# Transform the data to feature vectors
# NOTE(review): the whole DataFrame `df` is passed as `raw_documents` here; the
# traceback below points at this call as the one that raises the TypeError
# while iterating `raw_documents`.
X = vectorizer.fit_transform(df)
# NOTE(review): `_X` is never assigned in this snippet (the line above binds
# `X`), so this line would presumably fail with a NameError once reached --
# confirm whether `X` was intended.
X = pd.DataFrame(_X.todense(), index=df.index, columns=vectorizer.vocabulary_)

# Labels
# NOTE(review): the sample data shown above only has 'Col 0'..'Col 10'; a
# 'label' column is not visible -- verify it exists in the real frame.
y = df['label']


TypeError                                 Traceback (most recent call last)
Cell In[11], line 14
     11 vectorizer = TfidfVectorizer()
     13 # Transform the text data to feature vectors
---> 14 X = vectorizer.fit_transform(df)
     16 X = pd.DataFrame(_X.todense(), index=df.index, columns=vectorizer.vocabulary_)
     17 X.head()

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:2079, in TfidfVectorizer.fit_transform(self, raw_documents, y)
   2072 self._check_params()
   2073 self._tfidf = TfidfTransformer(
   2074     norm=self.norm,
   2075     use_idf=self.use_idf,
   2076     smooth_idf=self.smooth_idf,
   2077     sublinear_tf=self.sublinear_tf,
   2078 )
-> 2079 X = super().fit_transform(raw_documents)
   2080 self._tfidf.fit(X)
   2081 # X is already a transformed view of raw_documents so
   2082 # we set copy to False

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1338, in CountVectorizer.fit_transform(self, raw_documents, y)
   1330             warnings.warn(
   1331                 "Upper case characters found in"
   1332                 " vocabulary while 'lowercase'"
   1333                 " is True. These entries will not"
   1334                 " be matched with any documents"
   1335             )
   1336             break
-> 1338 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
   1340 if self.binary:
   1341     X.data.fill(1)

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1207, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
   1205 values = _make_int_array()
   1206 indptr.append(0)
-> 1207 for doc in raw_documents:
   1208     feature_counter = {}
   1209     for feature in analyze(doc):

TypeError: 'Data' object is not iterable
python pandas dataframe vector scikit-learn


import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Small example frame with one column per kind of data handled below:
# numeric ('Col 0'), categorical id ('Col 1'), boolean ('Col 2'), text ('Col 3').
df = pd.DataFrame({
    'Col 0': [10, 100, 1000],
    'Col 1': ['65e1e35b7fe333', '65e1e35b7fe599', '65e1e35b7fe165'],
    'Col 2': [True, False, False],
    'Col 3': ['Protijayi', 'Roopa', 'Gina'],
})

  Col 0           Col 1  Col 2      Col 3
0     10  65e1e35b7fe333   True  Protijayi
1    100  65e1e35b7fe599  False      Roopa
2   1000  65e1e35b7fe165  False       Gina
# Define transformations for different types of data
# Route each column to a transformer suited to its data type; the outputs are
# concatenated column-wise into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Col 0']),  # For numerical columns
        ('cat', OneHotEncoder(), ['Col 1']),  # For categorical columns
        ('bool', 'passthrough', ['Col 2']),  # For Boolean columns (already 0 or 1)
        # 'Col 3' is a plain string (not a list) so the vectorizer receives a
        # 1-D sequence of documents rather than a 2-D frame.
        ('text', TfidfVectorizer(), 'Col 3'),  # For text data
    ],
    remainder='drop',  # Drop untransformed columns
)

# Fit every configured transformer on df and build the combined feature matrix
X = preprocessor.fit_transform(df)

# Wrap the result in a DataFrame for inspection; when the matrix exposes
# `toarray` it is converted through that method first
X_df = pd.DataFrame(X if not hasattr(X, 'toarray') else X.toarray())

         0    1    2    3    4    5    6    7
0 -0.805387  0.0  1.0  0.0  1.0  0.0  1.0  0.0
1 -0.604040  0.0  0.0  1.0  0.0  0.0  0.0  1.0
2  1.409428  1.0  0.0  0.0  0.0  1.0  0.0  0.0
© www.soinside.com 2019 - 2024. All rights reserved.