Reproducing the problem
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import Window, DataFrame
from pyspark.sql.functions import col
from sklearn.metrics import r2_score
# from recommenders.utils.spark_utils import start_or_get_spark
# from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
import numpy as np
import os
COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
COL_PREDICTION = "Rating"
HEADER = {
"col_user": COL_USER,
"col_item": COL_ITEM,
"col_rating": COL_RATING,
"col_prediction": COL_PREDICTION,
}
class SparkRatingEvaluation:
"""Spark Rating Evaluator"""
def __init__(
self,
rating_true,
rating_pred,
col_user = COL_USER,
col_item = COL_ITEM,
col_rating = COL_RATING,
col_prediction = COL_PREDICTION,
):
"""Initializer.
This is the Spark version of the rating metrics evaluator.
The methods of this class calculate rating metrics such as root mean squared error, mean absolute error,
R squared, and explained variance.
Args:
rating_true (pyspark.sql.DataFrame): True labels.
rating_pred (pyspark.sql.DataFrame): Predicted labels.
col_user (str): column name for user.
col_item (str): column name for item.
col_rating (str): column name for rating.
col_prediction (str): column name for prediction.
"""
self.rating_true = rating_true
self.rating_pred = rating_pred
self.col_user = col_user
self.col_item = col_item
self.col_rating = col_rating
self.col_prediction = col_prediction
# Check if inputs are Spark DataFrames.
if not isinstance(self.rating_true, DataFrame):
raise TypeError(
"rating_true should be but is not a Spark DataFrame"
) # pragma : No Cover
if not isinstance(self.rating_pred, DataFrame):
raise TypeError(
"rating_pred should be but is not a Spark DataFrame"
) # pragma : No Cover
# Check if columns exist.
true_columns = self.rating_true.columns
pred_columns = self.rating_pred.columns
if rating_true.count() == 0:
raise ValueError("Empty input dataframe")
if rating_pred.count() == 0:
raise ValueError("Empty input dataframe")
if self.col_user not in true_columns:
raise ValueError("Schema of rating_true not valid. Missing User Col")
if self.col_item not in true_columns:
raise ValueError("Schema of rating_true not valid. Missing Item Col")
if self.col_rating not in true_columns:
raise ValueError("Schema of rating_true not valid. Missing Rating Col")
if self.col_user not in pred_columns:
raise ValueError(
"Schema of rating_pred not valid. Missing User Col"
) # pragma : No Cover
if self.col_item not in pred_columns:
raise ValueError(
"Schema of rating_pred not valid. Missing Item Col"
) # pragma : No Cover
if self.col_prediction not in pred_columns:
raise ValueError("Schema of rating_pred not valid. Missing Prediction Col")
self.rating_true = self.rating_true.select(
col(self.col_user),
col(self.col_item),
col(self.col_rating).cast("double").alias("label"),
)
self.rating_pred = self.rating_pred.select(
col(self.col_user),
col(self.col_item),
col(self.col_prediction).cast("double").alias("prediction"),
)
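# Inner-join true and predicted ratings on (user, item) so each remaining row
# holds one (label, prediction) pair; unmatched pairs are dropped.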
self.y_pred_true = (
self.rating_true.join(
self.rating_pred, [self.col_user, self.col_item], "inner"
)
.drop(self.col_user)
.drop(self.col_item)
)
self.metrics = RegressionMetrics(
self.y_pred_true.rdd.map(lambda x: (x.prediction, x.label))
)
def rmse(self):
"""Calculate Root Mean Squared Error.
Returns:
float: Root mean squared error.
"""
return self.metrics.rootMeanSquaredError
def mae(self):
"""Calculate Mean Absolute Error.
Returns:
float: Mean Absolute Error.
"""
return self.metrics.meanAbsoluteError
def rsquared(self):
"""Calculate R squared.
Returns:
float: R squared.
"""
return self.metrics.r2
def exp_var(self):
"""Calculate explained variance.
Note:
Spark MLLib's implementation is buggy (can lead to values > 1), hence we use var().
Returns:
float: Explained variance (min=0, max=1).
"""
var1 = self.y_pred_true.selectExpr("variance(label-prediction)").collect()[0][0]
var2 = self.y_pred_true.selectExpr("variance(label)").collect()[0][0]
if var1 is None or var2 is None:
return -np.inf
else:
# numpy divide is more tolerant to var2 being zero
return 1 - np.divide(var1, var2)
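# Aside (not part of the original snippet): exp_var() above computes
# 1 - Var(label - prediction) / Var(label), i.e. the usual explained-variance
# formula (the same one sklearn's explained_variance_score uses). A quick
# NumPy sanity check of that formula on the toy ratings used later in this question:
_y_true = np.array([9.0, 10.0, 11.0, 12.0])
_y_pred = np.array([13.0, 14.0, 15.0, 16.0])
_exp_var = 1 - np.divide(np.var(_y_true - _y_pred), np.var(_y_true))  # 1.0 here: the residuals are constant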
def start_or_get_spark(
app_name="Sample",
url="local[*]",
memory="10g",
config=None,
packages=None,
jars=None,
repositories=None,
):
"""Start Spark if not started
Args:
app_name (str): set name of the application
url (str): URL for spark master
memory (str): size of memory for spark driver. This will be ignored if spark.driver.memory is set in config.
config (dict): dictionary of configuration options
packages (list): list of packages to install
jars (list): list of jar files to add
repositories (list): list of maven repositories
Returns:
object: Spark session.
"""
submit_args = ""
if packages is not None:
submit_args = "--packages {} ".format(",".join(packages))
if jars is not None:
submit_args += "--jars {} ".format(",".join(jars))
if repositories is not None:
submit_args += "--repositories {}".format(",".join(repositories))
if submit_args:
os.environ["PYSPARK_SUBMIT_ARGS"] = "{} pyspark-shell".format(submit_args)
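# PYSPARK_SUBMIT_ARGS is read when the JVM is launched, so it must be set
# before the SparkSession below is created.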
spark_opts = [
'SparkSession.builder.appName("{}")'.format(app_name),
'master("{}")'.format(url),
]
if config is not None:
for key, raw_value in config.items():
value = (
'"{}"'.format(raw_value) if isinstance(raw_value, str) else raw_value
)
spark_opts.append('config("{key}", {value})'.format(key=key, value=value))
if config is None or "spark.driver.memory" not in config:
spark_opts.append('config("spark.driver.memory", "{}")'.format(memory))
# Set larger stack size
spark_opts.append('config("spark.executor.extraJavaOptions", "-Xss4m")')
spark_opts.append('config("spark.driver.extraJavaOptions", "-Xss4m")')
spark_opts.append("getOrCreate()")
return eval(".".join(spark_opts))
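For reference, start_or_get_spark assembles the SparkSession.builder call as a string and evaluates it. A hypothetical call that passes extra configuration might look like this (the config key and values are illustrative only, not something the question requires):
spark = start_or_get_spark(
    app_name="EvaluationTesting",
    url="local[*]",
    memory="4g",  # ignored if "spark.driver.memory" is supplied via config
    config={"spark.sql.shuffle.partitions": "8"},  # illustrative config entry
)
print(spark.version)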
I have a ground-truth DataFrame and a prediction DataFrame, which I then use to compute the r2 value:
x_true = pd.DataFrame({COL_USER: [1, 2, 3, 4], COL_ITEM: [5, 6, 7, 8], COL_RATING: [9, 10, 11, 12]})
x_pred = pd.DataFrame({COL_USER: [1, 2, 3, 4], COL_ITEM: [5, 6, 7, 8], COL_RATING: [13, 14, 15, 16]})
spark = start_or_get_spark("EvaluationTesting", "local")
X_true = spark.createDataFrame(x_true)
X_pred = spark.createDataFrame(x_pred)
sre = SparkRatingEvaluation(X_true, X_pred, **HEADER)
r2 = sre.rsquared()
print(r2)
plt.annotate("r-squared = {:.3f}".format(r2_score(X_true, x_pred)), (0, 1))
plt.show()
I want to visualize the result in a scatter plot. I tried annotating it, but it doesn't work. It shows: InvalidParameterError: The 'y_true' parameter of r2_score must be an array-like. Got DataFrame[UserId: bigint, MovieId: bigint, Rating: bigint] instead.
I ran into a similar problem. What you can do is convert the Spark DataFrames back to pandas DataFrames, create a scatter plot of true versus predicted ratings, and then annotate it.
Convert:
x_true = X_true.toPandas()  # direct function to convert
x_pred = X_pred.toPandas()
Plot:
plt.scatter(x_true[COL_RATING], x_pred[COL_RATING], label='Data Points')
plt.xlabel('True Ratings')
plt.ylabel('Predicted Ratings')
plt.title('True vs Predicted Ratings')
Then place the annotation however you like; I prefer to do it like this:
plt.annotate(f"R-squared = {r2:.3f}", xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, horizontalalignment='left', verticalalignment='top')
In your code you passed X_true and x_pred: the first, X_true, is a Spark DataFrame, while x_pred is a pandas DataFrame. I assume you meant to use the two Spark DataFrames (the answer is the same if both are pandas DataFrames). The other error is that you passed entire DataFrames to r2_score, which only accepts array-like inputs (lists/arrays).
Select only the rating column:
X_true_Rating = [row[COL_RATING] for row in X_true.select(COL_RATING).collect()]
X_pred_Rating = [row[COL_RATING] for row in X_pred.select(COL_RATING).collect()]
It helps to create the plot first:
plt.scatter(X_true_Rating, X_pred_Rating)
Your annotation is at (0, 1), but your x-axis starts at 9 and your y-axis at 13, so the annotation falls outside the visible area. Anchor it to the data range instead:
plt.annotate("r-squared = {:.3f}".format(r2_score(X_true_Rating, X_pred_Rating)), (min(X_true_Rating), max(X_pred_Rating)))
plt.show()
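Putting the pieces together, here is a minimal self-contained sketch of the plotting step. It assumes the Spark DataFrames X_true and X_pred and the evaluator sre from the question are already defined, and anchors the annotation in axes-fraction coordinates so it stays visible regardless of the data range:
# Minimal sketch; assumes X_true, X_pred and sre from the question above.
X_true_Rating = [row[COL_RATING] for row in X_true.select(COL_RATING).collect()]
X_pred_Rating = [row[COL_RATING] for row in X_pred.select(COL_RATING).collect()]
plt.scatter(X_true_Rating, X_pred_Rating, label='Data Points')
plt.xlabel('True Ratings')
plt.ylabel('Predicted Ratings')
plt.title('True vs Predicted Ratings')
plt.annotate(f"R-squared = {sre.rsquared():.3f}", xy=(0.05, 0.95),
             xycoords='axes fraction', horizontalalignment='left', verticalalignment='top')
plt.show()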