How do I write evaluation-metrics code to measure the performance of a logistic regression (LR) model in PySpark?
Dataframe:
data = spark.createDataFrame([
    (0, 18.0, "male", 5.0, 35000),
    (1, 20.0, "female", 3.0, 45000),
    (2, 22.0, "male", 8.0, 58000),
    (3, 25.0, "female", 2.0, 62000),
], ["id", "age", "gender", "experience", "salary"])
accuracy = evaluator_accuracy.evaluate(predictions)
precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)
f1 = evaluator_f1.evaluate(predictions)
It throws an error.
The error happens because evaluator_accuracy, evaluator_precision, evaluator_recall, and evaluator_f1 are never defined, so Python raises a NameError. Each metric needs a MulticlassClassificationEvaluator configured with the matching metricName. Try it like this; this is working code:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
spark = SparkSession.builder.appName("lr").getOrCreate()
data = spark.createDataFrame([
    (0, 18.0, "male", 5.0, 35000),
    (1, 20.0, "female", 3.0, 45000),
    (2, 22.0, "male", 8.0, 58000),
    (3, 25.0, "female", 2.0, 62000),
    (4, 28.0, "female", 4.0, 70000)
], ["id", "age", "gender", "experience", "salary"])
# Index the categorical column "gender"
ind = StringIndexer(inputCol="gender", outputCol="genderIndex")
data = ind.fit(data).transform(data)
# Prepare the features using VectorAssembler
assem = VectorAssembler(
    inputCols=["age", "experience", "salary", "genderIndex"],
    outputCol="features"
)
data = assem.transform(data)
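# Derive a binary label: salary > 50000 -> 1.0, else 0.0
# (note: salary is also a feature, so this is only a toy example)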
data = data.withColumn("label", (data.salary > 50000).cast("double"))
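# Split 80/20 (with only 5 rows the test set may be very small or empty)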
train, test = data.randomSplit([0.8, 0.2], seed=1234)
# Train a logistic regression model and score the test set
model = LogisticRegression(featuresCol="features", labelCol="label")
lr_model = model.fit(train)
pred = lr_model.transform(test)
# Evaluate with MulticlassClassificationEvaluator, one metric per evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(pred)
precision_eval = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision"
)
precision = precision_eval.evaluate(pred)
recall_eval = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall"
)
recall = recall_eval.evaluate(pred)
f1_eval = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
f1 = f1_eval.evaluate(pred)
print("accuracy:", accuracy)
print("precision:", precision)
print("recall:", recall)
print("f1:", f1)
spark.stop()
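As a side note, four separate evaluator objects aren't strictly necessary: Evaluator.evaluate accepts a param-override dict, so one evaluator can be reused for every metric. A minimal sketch of that pattern, reusing the pred dataframe from above (run it before spark.stop()):

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator reused for all four metrics via a param override
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction"
)
for metric in ["accuracy", "weightedPrecision", "weightedRecall", "f1"]:
    # evaluate(dataset, {param: value}) overrides metricName for this call only
    score = evaluator.evaluate(pred, {evaluator.metricName: metric})
    print(metric, "=", score)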
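Because the label here is binary, a ranking metric such as area under the ROC curve can also be useful, and that one comes from BinaryClassificationEvaluator rather than MulticlassClassificationEvaluator. A minimal sketch, again using pred and assuming the default rawPrediction column that LogisticRegression.transform produces (run before spark.stop()):

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# AUC-ROC for the binary label; reads the rawPrediction column
# produced by LogisticRegression.transform()
auc_eval = BinaryClassificationEvaluator(
    labelCol="label", rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"
)
auc = auc_eval.evaluate(pred)
print("areaUnderROC =", auc)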