How to plot an R-squared value from a pyspark DataFrame?

Problem description

To reproduce the problem:

import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import Window, DataFrame
from pyspark.sql.functions import col
from sklearn.metrics import r2_score
# from recommenders.utils.spark_utils import start_or_get_spark
# from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
import numpy as np
import os

COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
COL_PREDICTION = "Rating"

HEADER = {
    "col_user": COL_USER,
    "col_item": COL_ITEM,
    "col_rating": COL_RATING,
    "col_prediction": COL_PREDICTION,
}

class SparkRatingEvaluation:
    """Spark Rating Evaluator"""

    def __init__(
        self,
        rating_true,
        rating_pred,
        col_user = COL_USER,
        col_item = COL_ITEM,
        col_rating = COL_RATING,
        col_prediction = COL_PREDICTION,
    ):
        """Initializer.

        This is the Spark version of rating metrics evaluator.
        The methods of this class calculate rating metrics such as root mean squared error, mean absolute error,
        R squared, and explained variance.

        Args:
            rating_true (pyspark.sql.DataFrame): True labels.
            rating_pred (pyspark.sql.DataFrame): Predicted labels.
            col_user (str): column name for user.
            col_item (str): column name for item.
            col_rating (str): column name for rating.
            col_prediction (str): column name for prediction.
        """
        self.rating_true = rating_true
        self.rating_pred = rating_pred
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        self.col_prediction = col_prediction

        # Check if inputs are Spark DataFrames.
        if not isinstance(self.rating_true, DataFrame):
            raise TypeError(
                "rating_true should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        if not isinstance(self.rating_pred, DataFrame):
            raise TypeError(
                "rating_pred should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        # Check if columns exist.
        true_columns = self.rating_true.columns
        pred_columns = self.rating_pred.columns

        if rating_true.count() == 0:
            raise ValueError("Empty input dataframe")
        if rating_pred.count() == 0:
            raise ValueError("Empty input dataframe")

        if self.col_user not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing User Col")
        if self.col_item not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Item Col")
        if self.col_rating not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Rating Col")

        if self.col_user not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing User Col"
            )  # pragma : No Cover
        if self.col_item not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing Item Col"
            )  # pragma : No Cover
        if self.col_prediction not in pred_columns:
            raise ValueError("Schema of rating_pred not valid. Missing Prediction Col")

        self.rating_true = self.rating_true.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_rating).cast("double").alias("label"),
        )
        self.rating_pred = self.rating_pred.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_prediction).cast("double").alias("prediction"),
        )

        self.y_pred_true = (
            self.rating_true.join(
                self.rating_pred, [self.col_user, self.col_item], "inner"
            )
            .drop(self.col_user)
            .drop(self.col_item)
        )

        self.metrics = RegressionMetrics(
            self.y_pred_true.rdd.map(lambda x: (x.prediction, x.label))
        )

    def rmse(self):
        """Calculate Root Mean Squared Error.

        Returns:
            float: Root mean squared error.
        """
        return self.metrics.rootMeanSquaredError

    def mae(self):
        """Calculate Mean Absolute Error.

        Returns:
            float: Mean Absolute Error.
        """
        return self.metrics.meanAbsoluteError

    def rsquared(self):
        """Calculate R squared.

        Returns:
            float: R squared.
        """
        return self.metrics.r2

    def exp_var(self):
        """Calculate explained variance.

        Note:
           Spark MLLib's implementation is buggy (can lead to values > 1), hence we use var().

        Returns:
            float: Explained variance (min=0, max=1).
        """
        var1 = self.y_pred_true.selectExpr("variance(label-prediction)").collect()[0][0]
        var2 = self.y_pred_true.selectExpr("variance(label)").collect()[0][0]

        if var1 is None or var2 is None:
            return -np.inf
        else:
            # numpy divide is more tolerant to var2 being zero
            return 1 - np.divide(var1, var2)
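
# Quick sanity check of the exp_var() formula above (an illustrative sketch,
# not part of the original class). With these toy values the residuals are
# constant, so variance(label - prediction) is 0 and explained variance is 1.
# ddof=1 matches Spark SQL's variance(), which is the sample variance.
label = np.array([9.0, 10.0, 11.0, 12.0])
prediction = np.array([13.0, 14.0, 15.0, 16.0])
print(1 - np.divide(np.var(label - prediction, ddof=1), np.var(label, ddof=1)))  # 1.0
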

def start_or_get_spark(
    app_name="Sample",
    url="local[*]",
    memory="10g",
    config=None,
    packages=None,
    jars=None,
    repositories=None,
):
    """Start Spark if not started

    Args:
        app_name (str): set name of the application
        url (str): URL for spark master
        memory (str): size of memory for spark driver. This will be ignored if spark.driver.memory is set in config.
        config (dict): dictionary of configuration options
        packages (list): list of packages to install
        jars (list): list of jar files to add
        repositories (list): list of maven repositories

    Returns:
        object: Spark session.
    """

    submit_args = ""
    if packages is not None:
        submit_args = "--packages {} ".format(",".join(packages))
    if jars is not None:
        submit_args += "--jars {} ".format(",".join(jars))
    if repositories is not None:
        submit_args += "--repositories {}".format(",".join(repositories))
    if submit_args:
        os.environ["PYSPARK_SUBMIT_ARGS"] = "{} pyspark-shell".format(submit_args)

    spark_opts = [
        'SparkSession.builder.appName("{}")'.format(app_name),
        'master("{}")'.format(url),
    ]

    if config is not None:
        for key, raw_value in config.items():
            value = (
                '"{}"'.format(raw_value) if isinstance(raw_value, str) else raw_value
            )
            spark_opts.append('config("{key}", {value})'.format(key=key, value=value))

    if config is None or "spark.driver.memory" not in config:
        spark_opts.append('config("spark.driver.memory", "{}")'.format(memory))

    # Set larger stack size
    spark_opts.append('config("spark.executor.extraJavaOptions", "-Xss4m")')
    spark_opts.append('config("spark.driver.extraJavaOptions", "-Xss4m")')

    spark_opts.append("getOrCreate()")
    return eval(".".join(spark_opts))

I have a true-ratings dataframe and a predicted-ratings dataframe, which are then used to compute the r2 value:

x_true = pd.DataFrame({COL_USER: [1, 2, 3, 4], COL_ITEM: [5, 6, 7, 8], COL_RATING: [9, 10, 11, 12]})
x_pred = pd.DataFrame({COL_USER: [1, 2, 3, 4], COL_ITEM: [5, 6, 7, 8], COL_RATING: [13, 14, 15, 16]})
spark = start_or_get_spark("EvaluationTesting", "local")
X_true = spark.createDataFrame(x_true)
X_pred = spark.createDataFrame(x_pred)
sre = SparkRatingEvaluation(X_true, X_pred, **HEADER)
r2 = sre.rsquared()
print(r2)

plt.annotate("r-squared = {:.3f}".format(r2_score(X_true, x_pred)), (0, 1))
plt.show()

I want to visualize the result in a scatter plot. I tried an annotation, but it does not work. It raises: InvalidParameterError: The 'y_true' parameter of r2_score must be an array-like. Got DataFrame[UserId: bigint, MovieId: bigint, Rating: bigint] instead.

Tags: pyspark, scatter-plot
2 Answers

Answer 1 (0 votes)

I ran into a similar problem. What you can do is convert the Spark dataframes back to pandas dataframes, create a scatter plot of the true ratings against the predicted ratings, and then annotate it.

Conversion:

x_true = X_true.toPandas()  # toPandas() converts a Spark DataFrame to pandas directly
x_pred = X_pred.toPandas()

Plot:

plt.scatter(x_true[COL_RATING], x_pred[COL_RATING], label='Data Points')
plt.xlabel('True Ratings')
plt.ylabel('Predicted Ratings')
plt.title('True vs Predicted Ratings')

Then annotate it however you like; I prefer to do it this way:

plt.annotate(f"R-squared = {r2:.3f}", xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12,horizontalalignment='left', verticalalignment='top')

Answer 2 (0 votes)

In your code you passed X_true and x_pred. The first, X_true, is a Spark dataframe, while x_pred is a pandas dataframe; I think you meant to use the two Spark dataframes (the answer would be the same if both were pandas dataframes). The other error is that you passed whole dataframes where r2_score only accepts lists/arrays as input.

Select only the rating column:

# .collect() brings the rows to the driver; keep just the rating values
X_true_Rating = [row[COL_RATING] for row in X_true.select(COL_RATING).collect()]
X_pred_Rating = [row[COL_RATING] for row in X_pred.select(COL_RATING).collect()]

It helps if you create the plot first:

plt.scatter(X_true_Rating, X_pred_Rating)

Then, your annotation is at (0, 1), but your x-axis starts at 9 and your y-axis at 13, so the annotation ends up outside the visible area. Place it within the data range instead:

plt.annotate("r-squared = {:.3f}".format(r2_score(X_true_Rating, X_pred_Rating)), (min(X_true_Rating), max(X_pred_Rating)))
plt.show()
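
If you would rather not compute data coordinates, an alternative (the same idea as the first answer's annotation) is to pin the text to the axes instead of the data:

# Axes-fraction coordinates keep the text visible regardless of the data
# range: (0, 0) is the bottom-left and (1, 1) the top-right of the axes.
plt.annotate(
    "r-squared = {:.3f}".format(r2_score(X_true_Rating, X_pred_Rating)),
    xy=(0.05, 0.95),
    xycoords="axes fraction",
    verticalalignment="top",
)
plt.show()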