I have implemented an FCNN in Python. My dataset consists of about 5000 football matches with the following features: shots, corners, red cards, fouls, big chances created, passes, crosses, and saves.
My outcome is binary: home win, away win, or draw, depending on the pattern I select.
I use "binary_crossentropy" as the loss function and a "sigmoid" activation in the output layer (1 neuron). I have also implemented dropout and early stopping.
My training and validation accuracy are both around 0.75 (which doesn't sound bad at all), but neither loss drops below 0.5, which as far as I know is not great.
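(For reference: if one class occurs about 75% of the time, a constant prediction of 0.75 would score a binary cross-entropy of −(0.75 ln 0.75 + 0.25 ln 0.25) ≈ 0.56, so a loss of 0.5 is only slightly better than that naive baseline.)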
Are there any optimizations I could apply to reduce the loss?
[EDIT]: I added 2 more features that seem to be game changers: home and away shot conversion rate (goals/shots). The loss is now around 0.1 (or lower). The only thing is that, plotting feature importance with a random forest, these two seem far more relevant than all the others. That doesn't sound right to me.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import LearningRateScheduler
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from tensorflow.keras.utils import to_categorical
if len(sys.argv) == 4:
    file_name = sys.argv[1]
    pattern = sys.argv[2]
    network_path = sys.argv[3]
else:
    print("[ERROR] Incorrect parameters")
    sys.exit(1)
# Step 1: Load the dataset
df = pd.read_csv(file_name)
print("Available columns:", df.columns)
# Remove corrupted lines
df = df[df['MatchId'] != ""]
# Aggregate shots in just 2 columns, for home and away
df['HomeTeamShots'] = df['HomeTeamShotsOnTarget'] + df['HomeTeamShotsOffTarget'] + df['HomeTeamBlockedShots']
df['AwayTeamShots'] = df['AwayTeamShotsOnTarget'] + df['AwayTeamShotsOffTarget'] + df['AwayTeamBlockedShots']
# Shot conversion rate [newly added]
# df['HomeTeamConversionRate'] = 0 # Default value for conversion rate
# df.loc[df['HomeTeamShots'] > 0, 'HomeTeamConversionRate'] = df['HomeTeamFTGoal'] / df['HomeTeamShots']
# df['AwayTeamConversionRate'] = 0 # Default value for conversion rate
# df.loc[df['AwayTeamShots'] > 0, 'AwayTeamConversionRate'] = df['AwayTeamFTGoal'] / df['AwayTeamShots']
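# A division-safe, vectorized equivalent (sketch, using the same columns as
# above; kept commented out to match the block it mirrors):
# df['HomeTeamConversionRate'] = np.where(df['HomeTeamShots'] > 0,
#                                         df['HomeTeamFTGoal'] / df['HomeTeamShots'], 0)
# df['AwayTeamConversionRate'] = np.where(df['AwayTeamShots'] > 0,
#                                         df['AwayTeamFTGoal'] / df['AwayTeamShots'], 0)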
# Home wins
if pattern == "HW":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] > row['AwayTeamFTGoal'] else 1,
        axis=1
    )
# Home wins or draws
elif pattern == "HWD":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] >= row['AwayTeamFTGoal'] else 1,
        axis=1
    )
# Away wins
elif pattern == "AW":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] < row['AwayTeamFTGoal'] else 1,
        axis=1
    )
# Over 2.5 goals
elif pattern == "OV25":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] + row['AwayTeamFTGoal'] > 2 else 1,
        axis=1
    )
# Both teams to score (BTTS)
elif pattern == "BTTS":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] > 0 and row['AwayTeamFTGoal'] > 0 else 1,
        axis=1
    )
# Over 3.5 goals
elif pattern == "OV35":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] + row['AwayTeamFTGoal'] > 3 else 1,
        axis=1
    )
# Draw
elif pattern == "DRAW":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] == row['AwayTeamFTGoal'] else 1,
        axis=1
    )
# Home over 0.5 goals
elif pattern == "HOV05":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] > 0 else 1,
        axis=1
    )
# Away over 0.5 goals
elif pattern == "AOV05":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['AwayTeamFTGoal'] > 0 else 1,
        axis=1
    )
# Home over 1.5 goals
elif pattern == "HOV15":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['HomeTeamFTGoal'] > 1 else 1,
        axis=1
    )
# Away over 1.5 goals
elif pattern == "AOV15":
    df['Outcome'] = df.apply(
        lambda row: 0 if row['AwayTeamFTGoal'] > 1 else 1,
        axis=1
    )
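# Note (sketch): each of the patterns above can also be computed without
# df.apply, which is much faster on large frames. For example, for "HW":
# df['Outcome'] = (df['HomeTeamFTGoal'] <= df['AwayTeamFTGoal']).astype(int)
# (0 = home win, 1 = otherwise, matching the convention used above.)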
# Drop not relevant columns
df = df.drop(columns=['HomeTeamID', 'AwayTeamID', 'MatchId', 'HomeTeamFTGoal', 'AwayTeamFTGoal', 'HomeTeamName', 'AwayTeamName', 'MatchDate', 'HomeTeamxG', 'AwayTeamxG',
'HomeTeamBlockedShots', 'AwayTeamBlockedShots',
'HomeTeamShotsInsideBox', 'AwayTeamShotsInsideBox', 'HomeTeamShotsOutsideBox','AwayTeamShotsOutsideBox', 'HomeTeamShotsOnTarget', 'AwayTeamShotsOnTarget',
'HomeTeamShotsOffTarget', 'AwayTeamShotsOffTarget' ])
# Split features and target
X = df.drop(columns=['Outcome']).values # Features
y = df['Outcome'].values # Target
# Binary target, so no one-hot encoding is needed
# y_one_hot = to_categorical(y)
y_one_hot = y
# Split training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)
# and scale
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
############### Random forest for feature evaluation ###############
# Store column names before scaling (the scaled arrays are plain NumPy arrays)
feature_names = df.drop(columns=['Outcome']).columns
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
# Get feature importances
importances_rf = rf_model.feature_importances_
# Print or plot the feature importances
print("Feature Importances from Random Forest:")
for feature, importance in zip(feature_names, importances_rf):
    print(f"{feature}: {importance}")
# Optionally, plot the importances
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances_rf)
plt.xlabel('Feature Importance')
plt.title('Feature Importance from Random Forest')
plt.show()
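# Optional cross-check (sketch): permutation importance on held-out data is
# usually less biased than impurity-based importances, which can help verify
# whether the two conversion-rate features truly dominate.
from sklearn.inspection import permutation_importance
perm = permutation_importance(rf_model, X_test_scaled, y_test, n_repeats=10, random_state=42)
for feature, mean_imp in zip(feature_names, perm.importances_mean):
    print(f"{feature} (permutation): {mean_imp:.4f}")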
############### Random forest for feature evaluation ###############
############### Models ###############
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    # Dropout(0.6),  # Heavier dropout, tried earlier
    Dropout(0.2),
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    # Dropout(0.5),
    Dropout(0.2),
    Dense(32, activation='relu'),
    # Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])
model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])
# Implement early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=3, min_lr=1e-6, verbose=1)
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)
# Plot accuracy and loss
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss', color='blue')
plt.plot(history.history['val_loss'], label='Validation Loss', color='orange')
plt.title('Training and Validation Loss Over Epochs', fontsize=16)
plt.xlabel('Epochs', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.show()
predictions = model.predict(X_test_scaled)
plt.hist(predictions, bins=50)
plt.title("Distribution of Predicted Probabilities")
plt.show()
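# Sketch: report the held-out log loss explicitly, since the question is about
# the loss rather than accuracy. predictions has shape (n, 1), hence ravel().
from sklearn.metrics import log_loss
print(f"Test log loss: {log_loss(y_test, predictions.ravel()):.4f}")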
# Plot training and validation accuracy
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Save the model to a file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = os.path.join(network_path, f"{pattern}_Model_{timestamp}.h5")
print(f"Network path: {filename}")
print(network_path)
model.save(filename)
I think there are two things you could try:
Change your hyperparameters and see how this affects your model's performance.
But I think it is more important to engineer new features. For example, how often each team won its previous ten matches (see the sketch below), how it is ranked (e.g., its standing in the league), and other statistics you can derive from the existing data. Be creative. You could also try adding external data, such as the weather, or how far the away team has travelled from home (fewer supporters).
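As a minimal sketch of that first idea (assuming the raw CSV still has MatchDate and the team-name columns, which your script only drops later), a rolling win rate per team could look like this:
import pandas as pd

df = df.sort_values('MatchDate')

# One row per team per match, with a binary "won" flag
home = pd.DataFrame({'Team': df['HomeTeamName'], 'MatchDate': df['MatchDate'],
                     'Won': (df['HomeTeamFTGoal'] > df['AwayTeamFTGoal']).astype(int)})
away = pd.DataFrame({'Team': df['AwayTeamName'], 'MatchDate': df['MatchDate'],
                     'Won': (df['AwayTeamFTGoal'] > df['HomeTeamFTGoal']).astype(int)})
long_df = pd.concat([home, away]).sort_values('MatchDate')

# Win rate over each team's previous 10 matches; shift(1) keeps the current
# match out of its own feature, so no result information leaks in
long_df['Form10'] = (long_df.groupby('Team')['Won']
                     .transform(lambda s: s.shift(1).rolling(10, min_periods=1).mean()))
Form10 can then be merged back onto the match rows (once for the home team, once for the away team) before the ID and name columns are dropped.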