I am trying to build a recommendation system that suggests doctors based on a user's symptoms and location, using a hybrid of collaborative filtering and sentence-similarity scoring. The system follows these steps:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
# Load the dataset
file_path = "Egyptian Doctors.csv"
df = pd.read_csv(file_path)
# Data Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='median')
df['avg_rate'] = imputer.fit_transform(df[['avg_rate']])
df['Wait_time_Minutes'] = imputer.fit_transform(df[['Wait_time_Minutes']])
df['doctor_visitors'] = imputer.fit_transform(df[['doctor_visitors']])
# Encode categorical features for non-BERT parts
label_encoder = LabelEncoder()
df['specialization_encoded'] = label_encoder.fit_transform(df['specialization'])
df['location_encoded'] = label_encoder.fit_transform(df['location'])
# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['avg_rate', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']
df[numerical_features] = scaler.fit_transform(df[numerical_features])
# Generate BERT embeddings for text features
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def get_bert_embeddings(text_list):
    text_list = [str(text) for text in text_list]  # Convert all elements to strings
    inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # Use [CLS] token embeddings
# Combine specialization and location for BERT embeddings
df['text_features'] = df['specialization'] + " " + df['location']
df['symptoms'] = df['specialization'] # Assuming symptoms are similar to specialization for this example
# Generate BERT embeddings
embeddings = get_bert_embeddings(df['text_features'].tolist())
embeddings_df = pd.DataFrame(embeddings, index=df.index)
# Split the data into train and test sets
# Keep 'avg_rate' for collaborative filtering
X = pd.concat([df.drop(columns=['text_features', 'specialization', 'location']), embeddings_df], axis=1)
X.columns = X.columns.astype(str)
y = df['avg_rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]
# Impute missing values in X_train_ml and X_test_ml
imputer = SimpleImputer(strategy='median') # Or any other strategy you prefer
X_train_ml = imputer.fit_transform(X_train_ml)
X_test_ml = imputer.transform(X_test_ml)
# Train the RandomForestRegressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train_ml, y_train)
# Collaborative Filtering
user_item_matrix = X_train.pivot_table(index='specialization_encoded', columns='location_encoded', values='avg_rate')
user_item_matrix.fillna(0, inplace=True)
def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
    # Find the nearest doctors based on user symptoms and location
    user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]
    cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
    df['similarity'] = cosine_similarities
    # Get top 10 similar doctors
    similar_doctors = df.nlargest(10, 'similarity')
    recommendations = []
    for _, row in similar_doctors.iterrows():
        doctor_id = row.name
        doctor_specialization = row['specialization_encoded']
        doctor_location = row['location_encoded']
        doctor_rating = row['avg_rate']
        # Collaborative Filtering Score
        cf_score = matrix.loc[doctor_specialization, doctor_location]
        # RandomForest Score
        rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + list(embeddings_df.columns)].values.reshape(1, -1)
        rf_score = model_rf.predict(rf_features)[0]
        # Combine scores
        final_score = (cf_score + rf_score) / 2
        recommendations.append((doctor_id, final_score))
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return recommendations
# User input for recommendations
user_symptoms = "Skin Rash"
user_location = "El-Mansoura"
recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)
print("Top 5 Recommendations:")
for rec in recommendations[:5]:
    print(f"Doctor ID: {rec[0]}, Score: {rec[1]}")
KeyError Traceback (most recent call last)
<ipython-input-7-6399ed6786be> in <cell line: 113>()
111 user_symptoms = "Skin Rash"
112 user_location = "El-Mansoura"
--> 113 recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)
114
115 print("Top 5 Recommendations:")
8 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in _raise_if_missing(self, key, indexer, axis_name)
5939
5940 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 5941 raise KeyError(f"{not_found} not in index")
5942
5943 @overload
KeyError: '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ...
The error arises in the hybrid_recommendation function: the code tries to predict on features different from those the model was trained on. The mismatch comes from the embedding data never being correctly joined to the training data. X.columns was cast to str while embeddings_df.columns are integers, so the isin() selection in the step below matches no embedding columns and X_train_ml (and therefore the RandomForest) never included them; meanwhile df itself never contained the embedding columns at all, so selecting them from a row of df raises the KeyError:
# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]
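You can verify this with a quick check (a minimal sketch using the variables defined above). If you would rather keep the embeddings as model features, one option is to give the embedding columns string names before building X and select them by those names; the emb_ prefix below is just an illustrative choice:

# Sanity check: the string-cast feature names never match the integer embedding names
print(X_train.columns.isin(embeddings_df.columns).sum())  # prints 0 -> no embedding columns were selected

# Alternative fix (assumption: rerun the pipeline from the concat step after this rename):
# name the embedding columns explicitly, then select them by these names
# when building X_train_ml / X_test_ml and inside hybrid_recommendation
embeddings_df.columns = [f"emb_{i}" for i in range(embeddings_df.shape[1])]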
Here is a modification of the hybrid_recommendation function so that it only uses the features that are actually present in the training data:
def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
    # Find the nearest doctors based on user symptoms and location
    user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]
    cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
    df['similarity'] = cosine_similarities
    # Get top 10 similar doctors
    similar_doctors = df.nlargest(10, 'similarity')
    recommendations = []
    for _, row in similar_doctors.iterrows():
        doctor_id = row.name
        doctor_specialization = row['specialization_encoded']
        doctor_location = row['location_encoded']
        # Collaborative Filtering Score
        # (guard against specialization/location pairs that never appeared in X_train)
        if doctor_specialization in matrix.index and doctor_location in matrix.columns:
            cf_score = matrix.loc[doctor_specialization, doctor_location]
        else:
            cf_score = 0.0
        # RandomForest Score
        # This is where the error was: the embedding columns exist only in embeddings_df,
        # never in df, and the isin() selection also excluded them from X_train_ml,
        # so the model was trained without them. Predict with the same six features.
        rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']].values.reshape(1, -1)
        rf_features = imputer.transform(rf_features)
        rf_score = model_rf.predict(rf_features)[0]
        # Combine scores
        final_score = (cf_score + rf_score) / 2
        recommendations.append((doctor_id, final_score))
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return recommendations
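With this change, the original call should run end to end (same usage as above; the scores it prints depend on your dataset):

recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)
print("Top 5 Recommendations:")
for rec in recommendations[:5]:
    print(f"Doctor ID: {rec[0]}, Score: {rec[1]}")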
I hope you find this helpful.