Low GPU utilization (0-5%) during model training, while memory usage is high

Problem description · Votes: 0 · Answers: 1

I am working on a machine learning project, training a convolutional neural network (CNN) on sEMG signals with TensorFlow (version 2.10.1). Although my model uses almost all of the GPU memory (10.8/12 GB), I have noticed that GPU utilization remains extremely low, fluctuating between 0 and 5%. This severely impacts the training speed and overall efficiency of my model. My GPU is an RTX 3060 with 12 GB.
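For context: by default TensorFlow reserves nearly all GPU memory up front, so the near-full memory reading alone does not mean the GPU is doing work. Below is a small check, separate from the training script, that enables on-demand allocation and confirms the GPU is visible (just a sketch, not part of my original code):

import tensorflow as tf

# Allocate GPU memory on demand instead of reserving it all at startup
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

print("GPUs visible to TensorFlow:", tf.config.list_physical_devices('GPU'))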

# Configure GPU: check that TensorFlow detects it
import tensorflow as tf
tf.config.list_physical_devices('GPU')

import numpy as np
import pandas as pd
from tensorflow.keras import regularizers, layers, Model
from tensorflow.keras.callbacks import Callback
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score, recall_score, confusion_matrix
import pickle

# 1. Data preparation
def segment_data(data, window_size, stride):
    segments = []
    num_samples = len(data)
    num_segments = (num_samples - window_size) // stride + 1
    for i in range(0, num_segments * stride, stride):
        segment = data[i:i + window_size]
        segments.append(segment)
    return np.array(segments)

def load_data(file_path_data, file_path_labels, window_size, stride):
    # Load data
    datos = pd.read_csv(file_path_data, header=None)
    objetivos = pd.read_csv(file_path_labels, header=None)

    data = datos.values
    labels = objetivos.values.flatten()

    # Segment the data
    segments = segment_data(data, window_size, stride)
    segments = np.transpose(segments, (0, 2, 1))
    segment_labels = labels

    # Shuffle segments and labels with the same permutation
    num_samples = len(segment_labels)
    permutation = np.random.permutation(num_samples)
    shuffled_segments = segments[permutation]
    shuffled_labels = segment_labels[permutation]

    return shuffled_segments, shuffled_labels

def create_tf_dataset(X, y, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.shuffle(buffer_size=1024)  # Shuffle the data
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)  # Prefetch to overlap data preparation with training
    return dataset

# Load the data
window_size = 400
stride = 400
X, y = load_data('../CSVs/DATOS_200ms_80V_S1.csv', '../CSVs/OBJETIVOS_200MS_80V_S1.csv', window_size, stride)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Model parameters
input_shape = (12, window_size, 1)
num_classes = 18
learning_rate = 0.0001


import tensorflow as tf
from tensorflow.keras import layers, models

# Define a block with parallel convolutions and a residual connection (ResNet style)
def sequential_block(input_tensor, filters):
    # First standard convolution (main branch)
    x1 = layers.Conv2D(filters, (5, 5), padding='same', dilation_rate=2, activation='gelu')(input_tensor)
    
    # Second standard convolution (parallel branch)
    x2 = layers.Conv2D(filters, (5, 5), padding='same', dilation_rate=2, activation='gelu')(input_tensor)
    
    # Third standard convolution (parallel branch)
    x3 = layers.Conv2D(filters, (5, 5), padding='same', dilation_rate=2, activation='gelu')(input_tensor)
    
    # Concatenate the outputs of the three parallel branches
    x = layers.Concatenate()([x1, x2, x3])
    
    # Batch normalization (max pooling is currently disabled)
    #x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = layers.BatchNormalization()(x)
    
    # Residual connection (ResNet style)
    residual = layers.Conv2D(filters * 3, (1, 1), padding='same')(input_tensor)  # 1x1 convolution to match dimensions
    x = layers.Add()([x, residual])
    
    return x

# Define the CNN model
def create_model(input_shape, num_classes):
    inputs = layers.Input(shape=input_shape)
    
    # First block with 16 filters
    x = sequential_block(inputs, 16)
    
    # Second block with 16 filters
    x = sequential_block(x, 16)

    # Third block with 16 filters
    x = sequential_block(x, 16)
    
    # Final block with point-wise (1x1) convolutions and GELU activation
    x = layers.Conv2D(64, (1, 1), activation='gelu')(x)
    x = layers.Conv2D(64, (1, 1), activation='gelu')(x)
    
    # Batch normalization
    x = layers.BatchNormalization()(x)
    
    # Global MaxPooling
    x = layers.GlobalMaxPooling2D()(x)
    
    # Dense layer with 128 units
    x = layers.Dense(128, activation='gelu')(x)
    
    # Output layer
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    
    # Build the model
    model = models.Model(inputs, outputs)
    
    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

# Model parameters
input_shape = (12, 400, 1)
num_classes = 18

# Create the model
model = create_model(input_shape, num_classes)

# Show the model summary
model.summary()

# 3. Cross-validation function
from tensorflow.keras.models import clone_model  # For cloning models (imported but not used below)

def cross_validate_model(X_train, y_train, X_test, y_test, num_folds=5, file_path='Resultados/Resultados_CNN15_S1.pkl'):
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    validation_accs = []
    validation_f1s = []
    validation_sensitivities = []
    history_list = []

    best_val_acc = 0
    best_model_weights = None  # The best model weights will be stored here

    fold = 1
    for train_index, val_index in kf.split(X_train):
        print(f"--- Starting fold {fold} ---")
        # Split the data for the current fold
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # Create datasets for this fold
        train_dataset_fold = create_tf_dataset(X_train_fold, y_train_fold, batch_size=16)
        validation_dataset_fold = create_tf_dataset(X_val_fold, y_val_fold, batch_size=16)

        # Create and compile the model
        with tf.device('/gpu:0'):
            model = create_model(input_shape=input_shape, num_classes=num_classes)

        # Train the model
        # Note: use_multiprocessing/workers only affect generator/Sequence inputs and are ignored for tf.data datasets
        history = model.fit(train_dataset_fold, validation_data=validation_dataset_fold, epochs=30, use_multiprocessing=True, workers=8)
        #history = model.fit(train_dataset_fold, validation_data=validation_dataset_fold, epochs=30)

        # Save the training history for each fold
        history_list.append(history.history)

        # Evaluate on the validation set
        val_loss, val_acc = model.evaluate(validation_dataset_fold)
        y_val_pred = np.argmax(model.predict(X_val_fold), axis=1)
        val_f1 = f1_score(y_val_fold, y_val_pred, average='macro')
        val_sensitivity = recall_score(y_val_fold, y_val_pred, average='macro')

        validation_accs.append(val_acc)
        validation_f1s.append(val_f1)
        validation_sensitivities.append(val_sensitivity)

        # Keep the best model based on the validation metric (e.g. val_acc)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_weights = model.get_weights()  # Save the best model weights

        fold += 1

    # Compute validation statistics
    val_acc_mean, val_acc_std = np.mean(validation_accs), np.std(validation_accs)
    val_f1_mean, val_f1_std = np.mean(validation_f1s), np.std(validation_f1s)
    val_sensitivity_mean, val_sensitivity_std = np.mean(validation_sensitivities), np.std(validation_sensitivities)

    # Evaluate the test set using the best model
    print("--- Evaluating on test set with the best model ---")
    final_model = create_model(input_shape=input_shape, num_classes=num_classes)
    final_model.set_weights(best_model_weights)  # Load the best model weights

    test_dataset = create_tf_dataset(X_test, y_test, batch_size=16)
    test_loss, test_acc = final_model.evaluate(test_dataset)
    y_test_pred = np.argmax(final_model.predict(X_test), axis=1)
    test_f1 = f1_score(y_test, y_test_pred, average='macro')
    test_sensitivity = recall_score(y_test, y_test_pred, average='macro')

    # Generate the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_test_pred, normalize='true')

    # Save results
    resultados = {
        'validation_accuracy_mean': val_acc_mean,
        'validation_accuracy_std': val_acc_std,
        'validation_f1_mean': val_f1_mean,
        'validation_f1_std': val_f1_std,
        'validation_sensitivity_mean': val_sensitivity_mean,
        'validation_sensitivity_std': val_sensitivity_std,
        'test_accuracy': test_acc,
        'test_f1': test_f1,
        'test_sensitivity': test_sensitivity,
        'conf_matrix': conf_matrix,
        'history': history_list
    }

    with open(file_path, 'wb') as f:
        pickle.dump(resultados, f)

    print("Resultados guardados en", file_path)

# Run training with cross-validation
cross_validate_model(X_train, y_train, X_test, y_test, num_folds=5)

I have tried several strategies to improve GPU utilization:

  • Increasing the batch size: I increased the batch size to 256, expecting that larger batches would use the GPU more effectively and reduce the training time per epoch.

  • Using tf.data.Dataset: I implemented the tf.data.Dataset API with prefetching to optimize data loading and processing, expecting this to keep the GPU continuously fed with data during training (a sketch of this pipeline with the larger batch size appears below, after this list).

  • Multiprocessing: I set use_multiprocessing=True and increased the number of workers to 8 during training to see whether parallel data loading would relieve the bottleneck. I expected this to make better use of the CPU and prepare data faster.

  • Monitoring GPU usage: I continuously monitored GPU usage with nvidia-smi to make sure memory was allocated correctly and to identify any usage patterns.

Despite these efforts, GPU utilization remains low (0-5%) while memory usage is close to 100%. I expected that, with these changes in place, the GPU would show much higher utilization and the model would train faster.
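For reference, the tf.data variant described in the bullets above looks roughly like this (batch size 256 is the value mentioned in the first bullet; the cache() call, the larger shuffle buffer, and the helper name create_tf_dataset_v2 are extra tweaks for illustration, not part of my original script):

def create_tf_dataset_v2(X, y, batch_size=256):
    # Cache the segments in RAM after the first pass and use larger batches
    # so the GPU is not starved waiting for input data.
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.cache()
    dataset = dataset.shuffle(buffer_size=len(y))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset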

tensorflow keras deep-learning gpu
1 Answer

0 votes

You are definitely not using the GPU; that code does not run on the GPU automatically. You need to mark the functions you want to run on the GPU with the @jit decorator from the Numba library. With the GPU, the code you posted should not take more than 5 minutes to run (I'm exaggerating a bit). You will know you are using the GPU because Task Manager will show 100% utilization.
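One way to confirm where TensorFlow actually places operations (a minimal check, independent of the code in the question):

import tensorflow as tf

# Log the device (CPU or GPU) every operation is placed on
tf.debugging.set_log_device_placement(True)

a = tf.random.normal((1000, 1000))
b = tf.matmul(a, a)  # the log should show /device:GPU:0 if TensorFlow is using the GPU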
