我刚开始学习机器学习,正在尝试一些简单的例子。我有一个 *.csv 文件,其中包含文本、数字和日期类型的列。出于学习目的,我的目标是使用 KMeans 等无监督算法对该文件的行进行聚类。
我不知道如何把所有转换器(transformers)组合起来,再与 KMeans 一起使用。也许有人可以给我一些不同的建议。
`
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
class DateTransformer(BaseEstimator, TransformerMixin):
    """Scale datetime columns to the [0, 1] range, one column at a time.

    The earliest date and the date span of every column are learned in
    ``fit`` and reused in ``transform``. Missing dates (NaT) are encoded
    as 0.0, and a column with no usable span (constant or entirely
    missing) is encoded as all zeros.
    """

    @staticmethod
    def _to_float(X):
        """Return X as a 2-D float64 array of nanosecond timestamps (NaT -> NaN)."""
        values = np.asarray(X, dtype="datetime64[ns]")
        if values.ndim == 1:
            # Treat a single series as one column.
            values = values.reshape(-1, 1)
        stamps = values.astype("int64").astype("float64")
        stamps[np.isnat(values)] = np.nan
        return stamps

    def fit(self, X, y=None):
        """Learn the per-column minimum and span of the dates in X.

        Args:
            X: Array-like or DataFrame of datetime values.
            y: Ignored; accepted so the transformer can sit in a pipeline.

        Returns:
            The fitted transformer (``self``).
        """
        stamps = self._to_float(X)
        missing = np.isnan(stamps)
        # Mask missing entries with +/-inf so they never win min/max;
        # ``initial`` keeps the reduction defined for zero-row input.
        lows = np.where(missing, np.inf, stamps).min(axis=0, initial=np.inf)
        highs = np.where(missing, -np.inf, stamps).max(axis=0, initial=-np.inf)
        span = highs - lows
        usable = np.isfinite(span) & (span > 0)
        self.min_ = np.where(np.isfinite(lows), lows, 0.0)
        # A span of 1.0 avoids division by zero for constant/empty columns.
        self.span_ = np.where(usable, span, 1.0)
        return self

    def transform(self, X):
        """Return the dates in X scaled with the statistics learned in ``fit``.

        Args:
            X: Array-like or DataFrame of datetime values with the same
                number of columns as the data passed to ``fit``.

        Returns:
            A float64 array of the same shape as X (2-D); NaT becomes 0.0.
        """
        stamps = self._to_float(X)
        scaled = (stamps - self.min_) / self.span_
        scaled[np.isnan(scaled)] = 0.0
        return scaled
def _dates_to_days(frame):
    """Convert datetime columns to float day counts since 1970-01-01 (NaT -> NaN)."""
    epoch = pd.Timestamp("1970-01-01")
    one_day = pd.Timedelta(days=1)
    days = pd.DataFrame(frame).apply(lambda col: (col - epoch) / one_day)
    return days.to_numpy(dtype=float)


def classify_excel_rows(file_path, n_clusters=3):
    """
    Cluster the rows of a CSV export using unsupervised learning (KMeans).

    The file is read with ';' as separator and its first four lines are
    skipped. Numeric, date and text columns are preprocessed separately
    and combined into one feature matrix with a ColumnTransformer.

    Args:
    - file_path: The path to the CSV file. It must contain the columns
      'Value', 'Date_1' and 'Date_2'.
    - n_clusters: The number of clusters to form.
    Returns:
    - A DataFrame with an additional column 'Cluster' indicating the cluster each row belongs to.
    Raises:
    - ValueError: If the file has no usable feature columns. scikit-learn
      may also raise it, e.g. for a text column that yields no tokens.
    """
    # Imported here because the file-level import block does not provide them.
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

    # Read the CSV file
    data = pd.read_csv(file_path, sep=';', skiprows=4)

    # Decimal commas -> dots; anything unparseable becomes NaN instead of raising.
    data['Value'] = pd.to_numeric(
        data['Value'].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )
    date_format = "%d.%m.%Y"
    for column in ('Date_1', 'Date_2'):
        data[column] = pd.to_datetime(data[column], format=date_format, errors='coerce')

    # Identify different data types
    text_columns = data.select_dtypes(include=['object']).columns.tolist()
    numeric_columns = data.select_dtypes(include=['number']).columns.tolist()
    date_columns = data.select_dtypes(include=['datetime']).columns.tolist()

    # Work on a copy so the returned frame keeps the values as read.
    features = data.copy()
    if text_columns:
        # TfidfVectorizer cannot tokenize NaN, so missing text becomes ''.
        features[text_columns] = features[text_columns].fillna('').astype(str)

    # Preprocess the data; KMeans rejects NaN, so every numeric path imputes.
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
    )
    date_transformer = make_pipeline(
        FunctionTransformer(_dates_to_days),
        SimpleImputer(strategy='median'),
        MinMaxScaler(),
    )

    transformers = []
    if numeric_columns:
        transformers.append(('numeric', numeric_transformer, numeric_columns))
    if date_columns:
        transformers.append(('date', date_transformer, date_columns))
    # One vectorizer per text column: TfidfVectorizer expects 1-D input,
    # which ColumnTransformer provides when the column is given as a scalar.
    transformers.extend(
        (f'text_{index}', TfidfVectorizer(), column)
        for index, column in enumerate(text_columns)
    )
    if not transformers:
        raise ValueError("No numeric, date or text columns available for clustering.")

    preprocessor = ColumnTransformer(transformers)
    X = preprocessor.fit_transform(features)

    # Perform K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)

    data['Cluster'] = labels
    return data
`
我已经尝试过上面的示例代码,但不知道如何用 KMeans 对这种混合类型的数据进行无监督学习(聚类)。我希望根据各列的值对 *.csv 文件的行进行聚类。
我确实找到了替代方案:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
def classify_excel_rows(file_path, n_clusters=3):
    """
    Cluster the rows of a CSV export using unsupervised learning (KMeans).

    The file is read with ';' as separator and its first four lines are
    skipped. Text columns are one-hot encoded, 'Date_1' is split into
    year/month/day, 'Date_2' is not used as a feature, and all features
    are standardized before clustering.

    Args:
    - file_path: The path to the CSV file. It must contain the columns
      'Value', 'Date_1' and 'Date_2'.
    - n_clusters: The number of clusters to form.
    Returns:
    - The parsed DataFrame with an additional column 'Cluster' indicating the cluster each row belongs to.
    """
    # Read the CSV file
    data = pd.read_csv(file_path, sep=';', skiprows=4)

    # Decimal commas -> dots; anything unparseable becomes NaN instead of raising.
    data['Value'] = pd.to_numeric(
        data['Value'].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )
    date_format = "%d.%m.%Y"
    for column in ('Date_1', 'Date_2'):
        data[column] = pd.to_datetime(data[column], format=date_format, errors='coerce')

    # Build the feature matrix separately so the labels can be attached
    # to the rows as read rather than to the encoded columns.
    # ffill alone leaves leading gaps, so fill backwards as well.
    features = data.ffill().bfill()
    # dtype=float keeps the dummy columns numeric on every pandas version.
    features = pd.get_dummies(features, drop_first=True, dtype=float)
    features['year'] = features['Date_1'].dt.year
    features['month'] = features['Date_1'].dt.month
    features['day'] = features['Date_1'].dt.day
    features = features.drop(['Date_1', 'Date_2'], axis=1)
    # A column with no values at all cannot be filled and would put NaN into KMeans.
    features = features.dropna(axis=1, how='all')

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(features)

    # Perform K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(scaled_data)

    # Add the cluster labels to the original DataFrame
    data['Cluster'] = kmeans.labels_
    return data