I trained a simple neural network model for binary classification, which should be able to tell real news from fake news.
import torch
from torch import nn

# Create the model class
class FakeNewsDetectionModelV0(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layer_1 = nn.Linear(in_features=input_size, out_features=8)
        self.layer_2 = nn.Linear(in_features=8, out_features=1)  # takes the 8 features from the previous layer and outputs a single logit

    # define a forward() for the forward pass
    def forward(self, x, mask):
        # Apply the mask to ignore certain (padded) values
        if mask is not None:
            x = x * mask
        x = self.layer_1(x)
        x = self.layer_2(x)
        return x
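(For reference, a minimal sketch of how the class is used; the tiny input size and the dummy tensors are placeholders rather than my real data, where the input size is the vectorizer vocabulary of roughly 120000.)

model_0 = FakeNewsDetectionModelV0(input_size=16)   # 16 is a stand-in for the real vocabulary size
dummy_x = torch.rand(1, 16)                          # one "document" with 16 bag-of-words counts
dummy_mask = torch.ones_like(dummy_x)                # an all-ones mask keeps every feature
with torch.inference_mode():
    logit = model_0(dummy_x, dummy_mask)             # raw logit, shape [1, 1]
print(torch.sigmoid(logit))                          # probability of the positive class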
I use CountVectorizer to turn the text into arrays and then into tensors:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(df['text'])
X = vectorizer.transform(df['text']).toarray()  # the vectorizer is already fitted, so transform is enough
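The conversion to tensors mentioned above is then just the following ('label' is an assumed name, use whatever your target column is called):

X_tensor = torch.tensor(X, dtype=torch.float32)                   # shape: [n_documents, vocabulary_size]
y_tensor = torch.tensor(df['label'].values, dtype=torch.float32)  # 'label' is a placeholder column name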
The problem is that, because the dataset has more than 9000 entries, the input size the model was trained with is very large (around 120000 features). So when I try to predict on a single sentence, whose vector is much smaller, I have to pad it heavily to make it fit the model's input, and that badly hurts the model's accuracy.
from io import StringIO
from torch.nn.functional import pad
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

try:
    #nltk.download('stopwords')
    nltk.download('punkt')
except Exception:
    print("error in downloading nltk data")
def normalise_text(text):
    text = text.lower()                                   # lowercase
    text = re.sub(r"\#", "", text)                        # remove hashtag symbols
    text = re.sub(r"http\S+", "URL", text)                # replace URL addresses
    text = re.sub(r"@", "", text)                         # remove @ symbols
    text = re.sub(r"[^A-Za-z0-9()!?\'\`\"]", " ", text)   # keep only selected characters
    text = re.sub(r"\s{2,}", " ", text)                   # collapse repeated whitespace
    text = re.sub(r'[^\w\s]', '', text)                   # strip remaining punctuation
    return text
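A quick check of what the cleaning does, assuming the corrected re.sub version above:

print(normalise_text("Check out https://example.com #FakeNews @user!!!"))
# -> check out URL fakenews user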
def fake_news_detection(df, model, model_input_size):
    predictions = []
    max_words = 10000
    max_length = model_input_size
    model.eval()

    for prediction_data in df['text'][:4000]:
        prediction_data = normalise_text(prediction_data)
        #print([prediction_data])

        # Use CountVectorizer to transform text data to an array
        vectorizer = CountVectorizer(min_df=0, lowercase=False)
        prediction_data_array = vectorizer.fit_transform([prediction_data]).toarray()

        #tokenizer = Tokenizer(num_words=max_words)
        #tokenizer.fit_on_texts([prediction_data])
        #sequences = tokenizer.texts_to_sequences([prediction_data])
        #prediction_data_array = pad_sequences(sequences, maxlen=max_length, value=-1.0)
        #print(prediction_data_array.shape)

        # Check the shape of the transformed data
        current_input_size = prediction_data_array.shape[1]
        prediction_data_tensor = torch.tensor(prediction_data_array, dtype=torch.float32)

        # If the shape doesn't match, pad it up to the model's input size
        mask_tensor = None
        if current_input_size != model_input_size:
            print(current_input_size)
            padding = model_input_size - current_input_size
            prediction_data_tensor = pad(prediction_data_tensor, (0, padding), 'constant', value=0)
            mask_tensor = torch.ones_like(prediction_data_tensor)
            mask_tensor[:, -padding:] = 0  # Set values in the padded region to 0
            #print(torch.unique(mask_tensor, return_counts=True))

        # Apply the mask to ignore certain values
        #prediction_data_tensor = prediction_data_tensor * mask_tensor

        with torch.inference_mode():
            prediction = torch.round(torch.sigmoid(model(prediction_data_tensor, mask_tensor))).squeeze()
        predictions.append(round(prediction.item()))
        print(f"our data tensor shape is {prediction_data_tensor.shape}")

    predictions_tensor = torch.FloatTensor(predictions)
    return predictions_tensor
Does anyone know a workaround that would let me feed my data into the model without dropping its accuracy score?
Tried: padding the vector when predicting on much smaller inputs.
Expected: accurate predictions, similar to what I got during training/evaluation.
Got: inaccurate predictions with very low accuracy (around 43%).
The problem is that you are creating a different featurizer at every step of your loop. That means you are feeding the model a completely different feature vector on every iteration.
You need to fit the CountVectorizer once, on the training dataset, and then use that same fitted CountVectorizer to transform all of your data.
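A rough sketch of that flow; train_df, test_df and model_0 are placeholders for your own objects:

from sklearn.feature_extraction.text import CountVectorizer
import torch

# Fit ONCE on the training texts: this freezes the vocabulary, so the
# feature order and the input size stay the same for every later call.
vectorizer = CountVectorizer(min_df=0, lowercase=False)
X_train = vectorizer.fit_transform(train_df['text']).toarray()
model_input_size = X_train.shape[1]   # the size the model should be built with

def predict(texts, model):
    # Reuse the SAME fitted vectorizer: transform only, never fit again.
    features = vectorizer.transform(texts).toarray()   # shape: [n_texts, model_input_size]
    features_tensor = torch.tensor(features, dtype=torch.float32)
    model.eval()
    with torch.inference_mode():
        logits = model(features_tensor, None)          # shapes already match, so no padding and no mask
        return torch.round(torch.sigmoid(logits)).squeeze()

# e.g. preds = predict(test_df['text'].tolist(), model_0)

Any word at prediction time that was not in the training vocabulary is simply ignored by transform, so the output always has exactly model_input_size columns and no padding or masking is needed.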