Hybrid collaborative filtering and sentence-similarity system for recommending doctors from user-entered symptoms and location

Problem description

I am trying to build a recommendation system that combines collaborative filtering with sentence-similarity matching to recommend doctors based on a user's symptoms and location. It follows these steps:

  1. Handle missing values.
  2. Encode categorical features.
  3. Normalize numerical features.
  4. Generate sentence embeddings: use BERT to embed the text features (symptoms and doctor specialization).
  5. Collaborative filtering model: build a user-item matrix from doctor specialization and location.
  6. Sentence-similarity recommender: compute cosine similarity between the BERT embeddings of the user's symptoms and each doctor's specialization.
  7. Hybrid model: combine the collaborative filtering score and the sentence-similarity score into a personalized recommendation.
  8. Evaluate recommendation quality with metrics such as precision, recall, F1 score, and accuracy.

Here is the code I wrote:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
file_path = "Egyptian Doctors.csv"
df = pd.read_csv(file_path)

# Data Preprocessing
# Handle missing values
imputer = SimpleImputer(strategy='median')
df['avg_rate'] = imputer.fit_transform(df[['avg_rate']])
df['Wait_time_Minutes'] = imputer.fit_transform(df[['Wait_time_Minutes']])
df['doctor_visitors'] = imputer.fit_transform(df[['doctor_visitors']])

# Encode categorical features for non-BERT parts
label_encoder = LabelEncoder()
df['specialization_encoded'] = label_encoder.fit_transform(df['specialization'])
df['location_encoded'] = label_encoder.fit_transform(df['location'])

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['avg_rate', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Generate BERT embeddings for text features
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text_list):
    text_list = [str(text) for text in text_list]  # Convert all elements to strings
    inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # Use [CLS] token embeddings

# Combine specialization and location for BERT embeddings
df['text_features'] = df['specialization'] + " " + df['location']
df['symptoms'] = df['specialization']  # Assuming symptoms are similar to specialization for this example

# Generate BERT embeddings
embeddings = get_bert_embeddings(df['text_features'].tolist())
embeddings_df = pd.DataFrame(embeddings, index=df.index)

# Split the data into train and test sets
# Keep 'avg_rate' for collaborative filtering
X = pd.concat([df.drop(columns=['text_features', 'specialization', 'location']), embeddings_df], axis=1)
X.columns = X.columns.astype(str)
y = df['avg_rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]

# Impute missing values in X_train_ml and X_test_ml
imputer = SimpleImputer(strategy='median') # Or any other strategy you prefer
X_train_ml = imputer.fit_transform(X_train_ml)
X_test_ml = imputer.transform(X_test_ml)

# Train the RandomForestRegressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train_ml, y_train)

# Collaborative Filtering
user_item_matrix = X_train.pivot_table(index='specialization_encoded', columns='location_encoded', values='avg_rate')
user_item_matrix.fillna(0, inplace=True)

def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
    # Find the nearest doctors based on user symptoms and location
    user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]
    
    cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
    df['similarity'] = cosine_similarities

    # Get top 10 similar doctors
    similar_doctors = df.nlargest(10, 'similarity')

    recommendations = []

    for _, row in similar_doctors.iterrows():
        doctor_id = row.name
        doctor_specialization = row['specialization_encoded']
        doctor_location = row['location_encoded']
        doctor_rating = row['avg_rate']

        # Collaborative Filtering Score
        cf_score = matrix.loc[doctor_specialization, doctor_location]

        # RandomForest Score
        rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + list(embeddings_df.columns)].values.reshape(1, -1)
        rf_score = model_rf.predict(rf_features)[0]

        # Combine scores
        final_score = (cf_score + rf_score) / 2

        recommendations.append((doctor_id, final_score))

    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

    return recommendations

# User input for recommendations
user_symptoms = "Skin Rash"
user_location = "El-Mansoura"
recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)

print("Top 5 Recommendations:")
for rec in recommendations[:5]:
    print(f"Doctor ID: {rec[0]}, Score: {rec[1]}")

Here is the error:

KeyError                                  Traceback (most recent call last)
<ipython-input-7-6399ed6786be> in <cell line: 113>()
    111 user_symptoms = "Skin Rash"
    112 user_location = "El-Mansoura"
--> 113 recommendations = hybrid_recommendation(user_symptoms, user_location, user_item_matrix, df, model_rf, embeddings_df)
    114 
    115 print("Top 5 Recommendations:")

8 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in _raise_if_missing(self, key, indexer, axis_name)
   5939 
   5940             not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 5941             raise KeyError(f"{not_found} not in index")
   5942 
   5943     @overload

KeyError: '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316,...
Tags: random-forest, bert-language-model, recommendation-engine, collaborative-filtering, sentence-similarity
1 Answer

The error is in the hybrid_recommendation function: the code tries to predict on data different from what the model was trained on. The mismatch comes from the embedding columns never being joined correctly with the training data in this step (a short demonstration follows the quoted code):

# Select numerical and encoded categorical features for ML model, including embeddings
X_train_ml = X_train[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_train.columns[X_train.columns.isin(embeddings_df.columns)].tolist()]
X_test_ml = X_test[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + X_test.columns[X_test.columns.isin(embeddings_df.columns)].tolist()]
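
Concretely, X.columns was cast to str while embeddings_df keeps pandas' default integer column labels, so the isin() check above matches nothing and the model is trained on the six named features only. hybrid_recommendation then indexes df with list(embeddings_df.columns), i.e. the integers 0..767, which df never contained, which is exactly the KeyError '[0, 1, 2, ...] not in index'. A minimal sketch of the dtype mismatch (assuming the 768-dimensional bert-base embeddings used in the question):

import numpy as np
import pandas as pd

# embeddings_df gets integer column labels 0..767 by default
embeddings_df = pd.DataFrame(np.zeros((3, 768)))

X = pd.concat([pd.DataFrame({'avg_rate': [1.0, 2.0, 3.0]}), embeddings_df], axis=1)
X.columns = X.columns.astype(str)  # every label becomes a string: '0', '1', ...

# '0' != 0, so no embedding columns survive the selection
print(X.columns.isin(embeddings_df.columns).sum())  # prints 0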

Here is a modification of the hybrid_recommendation function so that it only uses the features the model was actually trained on:

def hybrid_recommendation(user_symptoms, user_location, matrix, df, model_rf, embeddings_df):
    # Find the nearest doctors based on user symptoms and location
    user_embedding = get_bert_embeddings([user_symptoms + " " + user_location])[0]

    cosine_similarities = cosine_similarity([user_embedding], embeddings_df.values)[0]
    df['similarity'] = cosine_similarities

    # Get top 10 similar doctors
    similar_doctors = df.nlargest(10, 'similarity')

    recommendations = []

    for _, row in similar_doctors.iterrows():
        doctor_id = row.name
        doctor_specialization = row['specialization_encoded']
        doctor_location = row['location_encoded']
        doctor_rating = row['avg_rate']

        # Collaborative Filtering Score
        cf_score = matrix.loc[doctor_specialization, doctor_location]

        # RandomForest Score
        # rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes'] + list(embeddings_df.columns)].values.reshape(1, -1)

        # This is where the error was: the features selected above differ from
        # the features the model was trained on, so drop the embedding columns.

        rf_features = row[['specialization_encoded', 'location_encoded', 'fees.1', 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']].values.reshape(1, -1)
        rf_features = imputer.transform(rf_features)
        rf_score = model_rf.predict(rf_features)[0]

        # Combine scores
        final_score = (cf_score + rf_score) / 2

        recommendations.append((doctor_id, final_score))

    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

    return recommendations
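
Note that with this fix the BERT embeddings no longer feed the RandomForest at all. If you want them to contribute, an alternative is to give the embedding columns string names up front, so they survive the astype(str) cast and stay consistent between training and prediction. A sketch, not tested against the question's data, reusing names from the code above (the emb_ prefix is just an illustrative choice):

# Name the embedding columns explicitly so the labels stay strings everywhere
embeddings_df = pd.DataFrame(
    embeddings,
    index=df.index,
    columns=[f"emb_{i}" for i in range(embeddings.shape[1])],
)
df = pd.concat([df, embeddings_df], axis=1)

# One shared column list, used for both fitting and predicting
feature_cols = (['specialization_encoded', 'location_encoded', 'fees.1',
                 'doctors_views', 'doctor_visitors', 'Wait_time_Minutes']
                + list(embeddings_df.columns))

X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols], df['avg_rate'], test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy='median')
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(imputer.fit_transform(X_train), y_train)

# Inside hybrid_recommendation, select exactly the same columns:
# rf_features = imputer.transform(row[feature_cols].values.reshape(1, -1))
# rf_score = model_rf.predict(rf_features)[0]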

I hope you find this helpful.
