我正在做一个情感分析项目,使用的是从 StockTwits 提取的 JSON 格式数据。每条推文都被赋予一个情感分数,该分数是 0 到 1 之间的浮点数。我想使用 PySpark MLlib 训练一个随机森林模型。
以下是我的代码:
我先将 Spark DataFrame 转换为 RDD,然后尝试应用 RandomForest 模型:
import nltk
import time
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
import csv
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize,blankline_tokenize
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import re
import string
from collections import Counter
import json
import re as regex
import xgboost as xgb
from sklearn import model_selection, preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import numpy as np
import findspark
findspark.init()
#findspark.init("C:\opt\spark\spark-2.3.0-bin-hadoop2.7")
import pyspark.sql.types as typ
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session on the "local" master under the
# application name "Spark ML".
sparkSession = (
    SparkSession.builder
    .master("local")
    .appName("Spark ML")
    .getOrCreate()
)

# Load the JSON file with pandas first; the Spark DataFrame is built from
# this pandas frame further down in the script.
df = pd.read_json("Microblog_Trialdata.json")
def list_sp(row):
    """Collapse the list stored under ``row["spans"]`` into one string.

    Args:
        row: A mapping-like record with a ``"spans"`` entry, e.g. the
            pandas Series handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The same ``row`` object, with ``"spans"`` replaced by its items
        joined by single spaces.
    """
    spans = row["spans"]
    # A value that is already a string is left untouched: joining a str
    # would insert a space between every character.
    if not isinstance(spans, str):
        row["spans"] = " ".join(spans)
    return row
# Flatten every row's "spans" list into a single string, then hand the
# pandas frame over to Spark.
df = df.apply(list_sp, axis=1)
train_data = sparkSession.createDataFrame(df)
df.head()

import pyspark.mllib.regression
from pyspark.mllib.regression import LabeledPoint

train_data.show()

# Keep only the sentiment score and the "spans" text, renaming the score
# column to "label".
spans = train_data.select("sentiment score", "spans").toDF("label", "spans")
import pyspark.ml.feature as ft
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Split the text on whitespace and on the characters $ , . " so that this
# punctuation never ends up inside a token.
tokenizer = ft.RegexTokenizer(
    inputCol='spans',
    outputCol='tokens',
    # Raw string: identical pattern text, without depending on Python
    # passing the unrecognised escape "\s" through unchanged.
    pattern=r'\s+|[$,."]',
)

# UDF that returns the number of entries in a token array.
countTokens = udf(lambda words: len(words), IntegerType())

# Tokenize the (label, spans) DataFrame built above.
tok = tokenizer.transform(spans)
# Drop stop words from each token list.
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import NGram
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer

remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
filtered_df = remover.transform(tok)

# Build n-grams from the filtered tokens.
# NOTE(review): n=1 produces single-token grams although the original note
# spoke of a 2-gram model -- confirm which one is intended.
ngram = NGram(n=1, inputCol="filtered", outputCol="n-gram")
gram_df = ngram.transform(filtered_df)

# Fit a CountVectorizerModel on the corpus and turn the n-grams into the
# "features" column.
cv = CountVectorizer(
    inputCol="n-gram",
    outputCol="features",
    vocabSize=20,
    minDF=2.0,
)
model = cv.fit(gram_df)
result = model.transform(gram_df)
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

# Assemble the complete feature pipeline and run it over the labelled text.
pipeline = Pipeline(stages=[tokenizer, remover, ngram, cv])
pipelineFit = pipeline.fit(spans)
final_df = pipelineFit.transform(spans)

# The RDD-based RandomForest API trains on an RDD of LabeledPoint holding
# pyspark.mllib vectors, whereas the pipeline emits pyspark.ml vectors in a
# DataFrame, so every row is converted explicitly. Despite its name,
# train_df is an RDD.
train_df = final_df.select("label", "features").rdd.map(
    lambda row: LabeledPoint(
        row["label"], MLLibVectors.fromML(row["features"])
    )
)

# maxDepth has to be an integer; 4 is the library default.
model = RandomForest.trainRegressor(
    train_df,
    categoricalFeaturesInfo={},
    numTrees=10,
    maxDepth=4,
    maxBins=32,
    seed=42,
)
如果你的问题是如何将 DataFrame 转换为 RDD,那么需要像下面这样使用 `.rdd` 属性:
# `yourdataframe` is a placeholder for an existing Spark DataFrame; its
# `.rdd` attribute exposes the rows as an RDD.
newRDD = yourdataframe.rdd