I'm working on a machine learning project, training a convolutional neural network (CNN) on sEMG signals with TensorFlow (version 2.10.1). Although my model occupies almost all of the GPU memory (10.8/12 GB), I've noticed that GPU utilization stays extremely low, fluctuating between 0% and 5%. This is severely hurting training speed and overall efficiency. My GPU is an RTX 3060 with 12 GB.
# Configure GPU
import tensorflow as tf
tf.config.list_physical_devices('GPU')

import numpy as np
import pandas as pd
from tensorflow.keras import regularizers, layers, Model
from tensorflow.keras.callbacks import Callback
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score, recall_score, confusion_matrix
import pickle
# 1. Data preparation
def segment_data(data, window_size, stride):
    segments = []
    num_samples = len(data)
    num_segments = (num_samples - window_size) // stride + 1
    for i in range(0, num_segments * stride, stride):
        segment = data[i:i + window_size]
        segments.append(segment)
    return np.array(segments)
def load_data(file_path_data, file_path_labels, window_size, stride):
    # Load data
    datos = pd.read_csv(file_path_data, header=None)
    objetivos = pd.read_csv(file_path_labels, header=None)
    data = datos.values
    labels = objetivos.values.flatten()
    # Segment the data
    segments = segment_data(data, window_size, stride)
    segments = np.transpose(segments, (0, 2, 1))
    segment_labels = labels
    num_samples = len(segment_labels)
    # Shuffle segments and labels with the same permutation
    permutation = np.random.permutation(num_samples)
    shuffled_segments = segments[permutation]
    shuffled_labels = segment_labels[permutation]
    return shuffled_segments, shuffled_labels
def create_tf_dataset(X, y, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.shuffle(buffer_size=1024)  # Shuffle the data
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)  # Prefetch to overlap data preparation with training
    return dataset
# Load the data
window_size = 400
stride = 400
X, y = load_data('../CSVs/DATOS_200ms_80V_S1.csv', '../CSVs/OBJETIVOS_200MS_80V_S1.csv', window_size, stride)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Model parameters
input_shape = (12, window_size, 1)
num_classes = 18
learning_rate = 0.0001
from tensorflow.keras import layers, models

# Sequential block with parallel convolutions and a residual connection (ResNet style)
def sequential_block(input_tensor, filters):
    # First standard convolution (main branch)
    x1 = layers.Conv2D(filters, (5, 5), padding='same', dilation_rate=2, activation='gelu')(input_tensor)
    # Second standard convolution (parallel branch)
    x2 = layers.Conv2D(filters, (5, 5), padding='same', dilation_rate=2, activation='gelu')(input_tensor)
    # Third standard convolution (parallel branch)
    x3 = layers.Conv2D(filters, (5, 5), padding='same', dilation_rate=2, activation='gelu')(input_tensor)
    # Concatenate the outputs of the three parallel branches
    x = layers.Concatenate()([x1, x2, x3])
    # Apply batch normalization (max pooling currently disabled)
    #x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = layers.BatchNormalization()(x)
    # Residual connection (ResNet style); 1x1 convolution matches the channel dimension
    residual = layers.Conv2D(filters * 3, (1, 1), padding='same')(input_tensor)
    x = layers.Add()([x, residual])
    return x
# Define the CNN model
def create_model(input_shape, num_classes):
    inputs = layers.Input(shape=input_shape)
    # First sequential block with 16 filters per branch
    x = sequential_block(inputs, 16)
    # Second sequential block with 16 filters per branch
    x = sequential_block(x, 16)
    # Third sequential block with 16 filters per branch
    x = sequential_block(x, 16)
    # Final block with point-wise (1x1) convolutions and GELU activation
    x = layers.Conv2D(64, (1, 1), activation='gelu')(x)
    x = layers.Conv2D(64, (1, 1), activation='gelu')(x)
    # Apply batch normalization
    x = layers.BatchNormalization()(x)
    # Global max pooling
    x = layers.GlobalMaxPooling2D()(x)
    # Dense layer with 128 units
    x = layers.Dense(128, activation='gelu')(x)
    # Output layer
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    # Build the model
    model = models.Model(inputs, outputs)
    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Model parameters
input_shape = (12, 400, 1)
num_classes = 18
# Create the model
model = create_model(input_shape, num_classes)
# Show the model summary
model.summary()
# 3. Cross-validation function
from tensorflow.keras.models import clone_model  # To clone the model

def cross_validate_model(X_train, y_train, X_test, y_test, num_folds=5, file_path='Resultados/Resultados_CNN15_S1.pkl'):
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    validation_accs = []
    validation_f1s = []
    validation_sensitivities = []
    history_list = []
    best_val_acc = 0
    best_model_weights = None  # The best model weights are stored here
    fold = 1
    for train_index, val_index in kf.split(X_train):
        print(f"--- Starting fold {fold} ---")
        # Split the data for the current fold
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        # Create datasets
        train_dataset_fold = create_tf_dataset(X_train_fold, y_train_fold, batch_size=16)
        validation_dataset_fold = create_tf_dataset(X_val_fold, y_val_fold, batch_size=16)
        # Create and compile the model
        with tf.device('/gpu:0'):
            model = create_model(input_shape=input_shape, num_classes=num_classes)
        # Train the model
        history = model.fit(train_dataset_fold, validation_data=validation_dataset_fold, epochs=30, use_multiprocessing=True, workers=8)
        #history = model.fit(train_dataset_fold, validation_data=validation_dataset_fold, epochs=30)
        # Save the training history for each fold
        history_list.append(history.history)
        # Evaluate on the validation set
        val_loss, val_acc = model.evaluate(validation_dataset_fold)
        y_val_pred = np.argmax(model.predict(X_val_fold), axis=1)
        val_f1 = f1_score(y_val_fold, y_val_pred, average='macro')
        val_sensitivity = recall_score(y_val_fold, y_val_pred, average='macro')
        validation_accs.append(val_acc)
        validation_f1s.append(val_f1)
        validation_sensitivities.append(val_sensitivity)
        # Keep the best model based on the validation metric (val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_weights = model.get_weights()  # Save the best model weights
        fold += 1
    # Compute validation statistics
    val_acc_mean, val_acc_std = np.mean(validation_accs), np.std(validation_accs)
    val_f1_mean, val_f1_std = np.mean(validation_f1s), np.std(validation_f1s)
    val_sensitivity_mean, val_sensitivity_std = np.mean(validation_sensitivities), np.std(validation_sensitivities)
    # Evaluate the test set using the best model
    print("--- Evaluating on test set with the best model ---")
    final_model = create_model(input_shape=input_shape, num_classes=num_classes)
    final_model.set_weights(best_model_weights)  # Load the weights of the best model
    test_dataset = create_tf_dataset(X_test, y_test, batch_size=16)
    test_loss, test_acc = final_model.evaluate(test_dataset)
    y_test_pred = np.argmax(final_model.predict(X_test), axis=1)
    test_f1 = f1_score(y_test, y_test_pred, average='macro')
    test_sensitivity = recall_score(y_test, y_test_pred, average='macro')
    # Build the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_test_pred, normalize='true')
    # Save results
    resultados = {
        'validation_accuracy_mean': val_acc_mean,
        'validation_accuracy_std': val_acc_std,
        'validation_f1_mean': val_f1_mean,
        'validation_f1_std': val_f1_std,
        'validation_sensitivity_mean': val_sensitivity_mean,
        'validation_sensitivity_std': val_sensitivity_std,
        'test_accuracy': test_acc,
        'test_f1': test_f1,
        'test_sensitivity': test_sensitivity,
        'conf_matrix': conf_matrix,
        'history': history_list
    }
    with open(file_path, 'wb') as f:
        pickle.dump(resultados, f)
    print("Results saved to", file_path)

# Run training with cross-validation
cross_validate_model(X_train, y_train, X_test, y_test, num_folds=5)
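One diagnostic I have not run yet is profiling a few training batches with the built-in TensorBoard profiler, which would show whether steps are input-bound or GPU-bound. A minimal sketch (the log directory name is arbitrary):

import tensorflow as tf

# Profile batches 10-20 of the first epoch; the "Profile" tab in TensorBoard
# (tensorboard --logdir logs_profile) then shows where each step spends its time.
tb_callback = tf.keras.callbacks.TensorBoard(log_dir='logs_profile', profile_batch=(10, 20))
# Passed to fit, e.g.:
# history = model.fit(train_dataset_fold, validation_data=validation_dataset_fold,
#                     epochs=30, callbacks=[tb_callback])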
I've tried several strategies to improve GPU utilization:
Increasing the batch size: I raised the batch size to 256, expecting larger batches to use the GPU more efficiently and shorten each training epoch.
Using tf.data.Dataset: I implemented the tf.data.Dataset API with prefetching to optimize data loading and processing, expecting it to keep the GPU continuously fed with data during training.
Multiprocessing: I set use_multiprocessing=True and raised the number of workers to 8 during training to see whether parallel data loading would relieve the bottleneck, expecting this to use the CPU more effectively and prepare data faster (see the sketch after this list).
Monitoring GPU usage: I monitored the GPU continuously with nvidia-smi to confirm that memory was being allocated correctly and to look for usage patterns.
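For reference, the Keras documentation notes that use_multiprocessing and workers only apply to generator or keras.utils.Sequence inputs, so with a tf.data.Dataset they have no effect; parallelism is expressed in the pipeline itself. A sketch of what that could look like (create_tf_dataset_parallel and preprocess are illustrative names, not part of my current code):

# Hypothetical variant: if per-sample preprocessing were needed, it would be
# parallelized inside the tf.data pipeline rather than via fit()'s workers argument.
def preprocess(x, y):
    # Placeholder preprocessing step, for illustration only
    return tf.cast(x, tf.float32), y

def create_tf_dataset_parallel(X, y, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.shuffle(buffer_size=1024)
    dataset = dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)  # parallel CPU preprocessing
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset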
Despite these efforts, GPU utilization remains very low (0-5%) while memory usage sits near 100%. I expected these changes to push utilization higher and speed up training.
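Worth noting: TensorFlow reserves most of the GPU memory up front by default, so near-100% memory usage on its own does not mean the GPU is busy. A small check I can add at the very top of the script (before any other TensorFlow calls) to confirm ops are really placed on the GPU and to allocate memory on demand instead:

import tensorflow as tf

# Print the device each op is placed on ("GPU:0" lines confirm the GPU is actually used)
tf.debugging.set_log_device_placement(True)

# Allocate GPU memory as needed instead of reserving almost all 12 GB up front
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)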
You're definitely not using the GPU; that code won't run on the GPU automatically. You need to mark the functions you want to run on the GPU with the @jit decorator from the Numba library. On the GPU, the code you posted shouldn't take more than 5 minutes to run (I'm exaggerating a bit). You'll know you're using the GPU because Task Manager will show 100% utilization.