我想同时将
log()
和 MinMaxScaler()
应用于我的 DataFrame。
我希望输出是 pandas DataFrame() ,其中包含原始数据的索引和列。
我想用 fit_transform()
时拟合得到的参数来执行 inverse_transform()
,并得到一个新的数据框。因此,这一过程需要在 FunctionTransformer
内部构建。
我尝试过的:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
# Initialize MinMaxScaler with range (0, 1).
# This single instance is the default `scaler` argument of both
# log_and_scale and inv_log_and_scale, so the two functions share its state.
scaler_logMinMax = MinMaxScaler(feature_range=(0, 1))
# Log transformation function
def log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Log-transform ``X`` and then min-max scale the result.

    Args:
        X: Numeric 2-D data. ``shift`` is added to it before ``np.log``.
        scaler: Scaler that this call fits on the log-transformed data.
            Defaults to the module-level ``scaler_logMinMax``.
        shift: Small offset added to ``X`` before taking the logarithm.

    Returns:
        A new ``pd.DataFrame`` built from the scaler output. No ``index`` or
        ``columns`` are passed to the constructor, so the labels of ``X`` are
        not carried over.
    """
    X_log = np.log(X + shift)  # Apply log transformation with a small shift
    # fit_transform refits `scaler` every time this function is called.
    return pd.DataFrame(scaler.fit_transform(X_log))  # Scale the log-transformed data
# Inverse transformation: first unscale, then inverse log transform
def inv_log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Map scaled values back to the original scale.

    Args:
        X: Scaled 2-D data, as produced with the same ``scaler``.
        scaler: Already fitted scaler whose ``inverse_transform`` is applied.
            Defaults to the module-level ``scaler_logMinMax``.
        shift: Offset that was added before the logarithm; it is subtracted
            again after exponentiation.

    Returns:
        The result of ``np.exp(unscaled) - shift``; it is not wrapped in a
        DataFrame here.
    """
    X_unscaled = scaler.inverse_transform(X)  # Inverse scaling
    return np.exp(X_unscaled) - shift  # Inverse of log transformation
# Create FunctionTransformer for the log and scale transformation.
# With validate=True scikit-learn converts the input to a 2-D NumPy array
# before it reaches `func`, so the functions do not see the pandas labels.
log_and_scale_transformer = FunctionTransformer(func=log_and_scale, inverse_func=inv_log_and_scale, validate=True)
# Sample frame: columns 1..3 (axis name "id"), rows keyed by "Date" strings.
df_subset = pd.DataFrame(
    {
        1: [135.2342984, 83.17136704, 23.41329775, 3.574450787],
        2: [59.31328422, 18.15285711, 11.1736562, 4.788951527],
        3: [45.0087282, 4.094515245, 106.536704, 527.0962651],
    },
    index=pd.Index(["201001", "201002", "201003", "201004"], name="Date"),
)
df_subset.columns = pd.Index([1, 2, 3], name="id")
# Columns that will be passed through the transformer.
cols_to_apply_scaler = [1, 2]
df_subset
id 1 2 3
Date
201001 135.234298 59.313284 45.008728
201002 83.171367 18.152857 4.094515
201003 23.413298 11.173656 106.536704
201004 3.574451 4.788952 527.096265
# Transforming: fit the transformer on the selected columns and assign the
# result back into df_subset.
# NOTE(review): the frame being assigned is built without the original index
# or column labels, which presumably explains the NaN output shown below
# (assignment aligns on labels) — confirm.
df_subset[cols_to_apply_scaler] = pd.DataFrame(log_and_scale_transformer.fit_transform(df_subset[cols_to_apply_scaler]))
df_subset
id 1 2 3
Date
201001 NaN NaN 45.008728
201002 NaN NaN 4.094515
201003 NaN NaN 106.536704
201004 NaN NaN 527.096265
# The way that I expect to apply the inverse transformer.
# df_subset[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(df_subset[cols_to_apply_scaler])
问题:
pd.DataFrame(log_and_scale_transformer.fit_transform(df_subset[cols_to_apply_scaler]))
可以工作,但其结果无法赋值回原始 DataFrame,因为列的名称发生了变化。该怎么解决?另外,scaler_logMinMax
在 fit_transform()
中得到的值是如何传递给 inverse_transform
的?创建数据框后我也尝试了
log_and_scale_transformer = log_and_scale_transformer.set_output(transform="pandas")
,但没有成功。
我需要在应用该函数之前过滤列。 我还想坚持使用
FunctionTransformer
,因为我还在使用其他结构相同的转换器。例如:
# Define the inverse transformation function with a shift
def inv_y(X, shift=0.5):
    """Return the shifted reciprocal ``1 / (X + shift)``.

    Args:
        X: Scalar or array-like supporting ``+`` and division.
        shift: Offset added to ``X`` before taking the reciprocal.

    Returns:
        ``1 / (X + shift)``, with the same shape as ``X``.
    """
    return 1 / (X + shift)
# Define the inverse inverse transformation to revert to original values
def inv_inv_y(X, shift=0.5):
    """Return ``(1 - X * shift) / X``, the inverse of ``1 / (x + shift)``.

    Args:
        X: Scalar or array-like of shifted reciprocals.
        shift: The offset that was used in the forward transformation.

    Returns:
        ``(1 - X * shift) / X``, which is algebraically ``1 / X - shift``.
    """
    return (1 - X * shift) / X
# Create the FunctionTransformer pairing inv_y with inv_inv_y.
# validate=False passes the input to the functions without array conversion;
# check_inverse=True asks scikit-learn to check during fit that applying func
# and then inverse_func gives back the original input.
inverse_transformer = FunctionTransformer(func=inv_y, inverse_func=inv_inv_y, validate=False, check_inverse=True)
总之,我无法同时应用函数和缩放器。
使用一个不同的简单示例,它可以工作:
# DataFrame Example: three integer columns A, B, C on the index 0, 1, 2.
cols = ["A", "B", "C"]
cols_to_apply_scaler = cols[:-1]  # every column except the last one
X = pd.DataFrame(
    np.array([[0, 1, 2], [2, 3, 4], [5, 7, 9]]),
    columns=cols,
    index=[0, 1, 2],
)
X
A B C
0 0 1 2
1 2 3 4
2 5 7 9
# Transforming: fit the transformer on the selected columns of X and write
# the scaled values back into those columns.
X[cols_to_apply_scaler] = pd.DataFrame(log_and_scale_transformer.fit_transform(X[cols_to_apply_scaler]))
A B C
0 0.000000 0.000000 2
1 0.958971 0.564575 4
2 1.000000 1.000000 9
/home/guilherme/anaconda3/envs/time_series/lib/python3.11/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but FunctionTransformer was fitted with feature names
warnings.warn(
# Inverse: map the scaled columns back to the original scale using the
# transformer fitted in the previous step, then display the frame.
X[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(X[cols_to_apply_scaler])
X
A B C
0 6.203855e-25 1.0 2
1 2.000000e+00 3.0 4
2 5.000000e+00 7.0 9
但我不明白这个警告。我可以修复它吗?
要保留索引和列,请让转换函数返回带有原始索引和列的 DataFrame。
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
import pandas as pd, numpy as np
# Shared scaler: it is the default `scaler` argument of both log_and_scale and
# inv_log_and_scale, so the state fitted by the former is used by the latter.
scaler_logMinMax = MinMaxScaler(feature_range=(0, 1))
# Log transformation function
def log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Log-transform ``X``, min-max scale it, and keep the labels of ``X``.

    Args:
        X: Numeric 2-D data, normally a ``pd.DataFrame``. A NumPy array is
            also accepted; the result then gets default labels.
        scaler: Scaler that this call fits on the log-transformed data.
            Defaults to the module-level ``scaler_logMinMax``.
        shift: Small offset added to ``X`` before taking the logarithm.

    Returns:
        A ``pd.DataFrame`` of scaled values carrying the index and columns of
        ``X`` when ``X`` has them.
    """
    X_log = np.log(X + shift)
    # NOTE: fit_transform refits `scaler` on every call, so the scaler always
    # reflects the most recently transformed data.
    scaled = scaler.fit_transform(X_log)
    # Reuse the labels of X when present; None makes pandas use its defaults.
    return pd.DataFrame(
        scaled,
        index=getattr(X, "index", None),
        columns=getattr(X, "columns", None),
    )
def inv_log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Undo the scaling and the log transform, keeping the labels of ``X``.

    Args:
        X: Scaled 2-D data, normally a ``pd.DataFrame``. A NumPy array is
            also accepted; the result then gets default labels.
        scaler: Already fitted scaler whose ``inverse_transform`` is applied.
            Defaults to the module-level ``scaler_logMinMax``.
        shift: Offset that was added before the logarithm; it is subtracted
            again after exponentiation.

    Returns:
        A ``pd.DataFrame`` of values on the original scale carrying the index
        and columns of ``X`` when ``X`` has them.
    """
    unscaled = scaler.inverse_transform(X)
    # Reuse the labels of X when present; None makes pandas use its defaults.
    return pd.DataFrame(
        np.exp(unscaled) - shift,
        index=getattr(X, "index", None),
        columns=getattr(X, "columns", None),
    )
# Wrap the forward/inverse pair so it can be used like any other transformer.
log_and_scale_transformer = FunctionTransformer(
func=log_and_scale,
inverse_func=inv_log_and_scale,
validate=False # Skip array conversion so the functions receive the pandas object
)
# Sample frame: columns 1..3 (axis name "id"), rows keyed by "Date" strings.
df_subset = pd.DataFrame(
    {
        1: [135.2342984, 83.17136704, 23.41329775, 3.574450787],
        2: [59.31328422, 18.15285711, 11.1736562, 4.788951527],
        3: [45.0087282, 4.094515245, 106.536704, 527.0962651],
    },
    index=pd.Index(["201001", "201002", "201003", "201004"], name="Date"),
)
df_subset.columns = pd.Index([1, 2, 3], name="id")
# Columns that will be passed through the transformer.
cols_to_apply_scaler = [1, 2]
# Forward pass: fit on the selected columns and overwrite them with the
# scaled values, then show the frame under its heading.
df_subset[cols_to_apply_scaler] = log_and_scale_transformer.fit_transform(
    df_subset[cols_to_apply_scaler]
)
print("Transformed DataFrame:", df_subset, sep="\n")

# Backward pass: restore the same columns to their original scale.
df_subset[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(
    df_subset[cols_to_apply_scaler]
)
print("\nInverse Transformed DataFrame:", df_subset, sep="\n")
这会打印:
id 1 2 3
Date
201001 1.000000 1.000000 45.008728
201002 0.894048 0.789684 4.094515
201003 0.574869 0.649927 106.536704
201004 0.000000 0.000000 527.096265
和
id 1 2 3
Date
201001 135.234298 59.313284 45.008728
201002 83.171367 18.152857 4.094515
201003 23.413298 11.173656 106.536704
201004 3.574451 4.788952 527.096265