我是一名计算机科学专业的本科生,目前正在研究一个实现 ConvMixerSwinV2 模型的项目。我尝试在代码中实现该模型并在 CIFAR100 数据集上对其进行训练,但出现了很多错误。


RuntimeError                              Traceback (most recent call last)
<ipython-input-95-3b522f5841c4> in <cell line: 189>()
    200     optimizer.zero_grad()
--> 201     outputs = model(inputs)
    202     loss = criterion(outputs, labels)

5 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-95-3b522f5841c4> in forward(self, x)
    136     x = x.permute(0, 1, 3, 2, 4, 5)  # Trasponi gli assi per raggruppare i patch
    137     x = x.reshape(-1, self.swin_transformer.patch_size[0] * self.swin_transformer.patch_size[1], C)
--> 138     x = self.swin_transformer(x)
    139     x = x.mean(dim=[1, 2])
    140     x = self.classifier(x)

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-95-3b522f5841c4> in forward(self, x)
    104     x = x.reshape(-1, self.img_size[0] // self.patch_size[0], self.img_size[1] // self.patch_size[1], self.embed_dim)
    105     for layer in self.layers:
--> 106         x = layer(x)
    107     x = x.mean(dim=[1, 2])
    108     x = self.linear(x)

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1499                 or _global_backward_pre_hooks or _global_backward_hooks
   1500                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501             return forward_call(*args, **kwargs)
   1502         # Do not call functions when jit is used
   1503         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-95-3b522f5841c4> in forward(self, x)
     66     B, C, H, W = x.shape  # Get batch size, channels, height, and width
     67     x = x.permute(0, 2, 3, 1)  # Permute axes to get the correct format
---> 68     x = x.reshape(B, H // self.patch_size[0], W // self.patch_size[1], self.patch_size[0], self.patch_size[1], C)
     69     x = x.permute(0, 1, 3, 2, 4, 5)
     70     x = x.reshape(-1, self.patch_size[0] * self.patch_size[1], C)

RuntimeError: shape '[1600, 3, 16, 8, 8, 28]' is invalid for input of size 160563200



import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader

# Define the SwinTransformerAttention model
class SwinTransformerAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, qkv_bias, drop_path_rate):
        super(SwinTransformerAttention, self).__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.qkv_bias = qkv_bias
        self.drop_path_rate = drop_path_rate

        # Initialize the layers for Q, K, and V transformations
        self.q = nn.Linear(embed_dim, embed_dim * num_heads, bias=qkv_bias)
        self.k = nn.Linear(embed_dim, embed_dim * num_heads, bias=qkv_bias)
        self.v = nn.Linear(embed_dim, embed_dim * num_heads, bias=qkv_bias)
        # Initialize the softmax and dropout layers
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(drop_path_rate)

    def forward(self, x):
        # Apply linear transformations to input for Q, K, and V
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)

        B, H, W, C = q.shape  # Store the original shape
        # Reshape Q, K, and V for attention calculation
        q = q.reshape(B, H * W, self.num_heads, self.embed_dim // self.num_heads)
        k = k.reshape(B, H * W, self.num_heads, self.embed_dim // self.num_heads)
        v = v.reshape(B, H * W, self.num_heads, self.embed_dim // self.num_heads)

        # Calculate attention scores and apply softmax
        attn = self.softmax(q @ k.transpose(-1, -2))
        attn = self.dropout(attn)

        # Calculate the output using attention scores and V
        out = attn @ v
        out = out.reshape(B, H, W, -1)

        return out

# Define the SwinTransformerBlock model
class SwinTransformerBlock(nn.Module):
    def __init__(self, img_size, patch_size, embed_dim, num_heads, mlp_ratio, qkv_bias, drop_path_rate):
        super(SwinTransformerBlock, self).__init__()

        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.drop_path_rate = drop_path_rate

        # Initialize Layer Normalization for the first normalization step
        self.norm1 = nn.LayerNorm(embed_dim)
        # Initialize the SwinTransformerAttention module
        self.attn = SwinTransformerAttention(embed_dim, num_heads, qkv_bias, drop_path_rate)
        # Initialize Layer Normalization for the second normalization step
        self.norm2 = nn.LayerNorm(embed_dim)
        # Initialize the Multi-Layer Perceptron (MLP)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, int(embed_dim * mlp_ratio)),
            nn.Linear(int(embed_dim * mlp_ratio), embed_dim),

    def forward(self, x):
        B, C, H, W = x.shape  # Get batch size, channels, height, and width
        x = x.permute(0, 2, 3, 1)  # Permute axes for correct format
        x = x.reshape(B, H // self.patch_size[0], W // self.patch_size[1], self.patch_size[0], self.patch_size[1], C)  # Reshape into patches
        x = x.permute(0, 1, 3, 2, 4, 5)  # Permute axes to group patches
        x = x.reshape(-1, self.patch_size[0] * self.patch_size[1], C)  # Reshape for attention calculation
        x = self.attn(x)  # Apply the attention mechanism
        x = x.reshape(B, H // self.patch_size[0], W // self.patch_size[1], -1)  # Reshape back to patch grid
        x = x.permute(0, 3, 1, 2)  # Permute axes back to the original format
        x = self.norm1(x)  # Apply Layer Normalization
        residual = x  # Save the residual for later use
        x = self.mlp(x)  # Apply the MLP
        x = x + residual  # Add back the residual connection
        x = self.norm2(x)  # Apply Layer Normalization again
        return x

# Define the SwinTransformerV2 model
class SwinTransformerV2(nn.Module):
    def __init__(self, img_size, patch_size, embed_dim, depths, num_heads, mlp_ratio, qkv_bias, drop_path_rate):
        super(SwinTransformerV2, self).__init__()

        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.depths = depths
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.drop_path_rate = drop_path_rate

        # Initialize a list of SwinTransformerBlock layers
        self.layers = nn.ModuleList()
        for i in range(self.depths):
            self.layers.append(SwinTransformerBlock(img_size, patch_size, embed_dim, num_heads, mlp_ratio, qkv_bias, drop_path_rate))
        # Initialize the final linear layer for classification
        self.linear = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = x.reshape(-1, self.img_size[0] // self.patch_size[0], self.img_size[1] // self.patch_size[1], self.embed_dim)
        for layer in self.layers:
            x = layer(x)  # Apply each SwinTransformerBlock layer
        x = x.mean(dim=[1, 2])  # Global average pooling
        x = self.linear(x)  # Classification layer
        return x

# Define the ConvMixerSwinV2 model
class ConvMixerSwinV2(nn.Module):
    def __init__(self, patch_size, embed_dim, depths, num_heads, mlp_ratio, qkv_bias, drop_path_rate):
        super(ConvMixerSwinV2, self).__init__()

        # Initialize the first convolutional layer
        self.conv1 = nn.Conv2d(3, 128, kernel_size=3, stride=1, padding=1)
        # Initialize the SwinTransformerV2 model
        self.swin_transformer = SwinTransformerV2(img_size, patch_size, embed_dim, depths, num_heads, mlp_ratio, qkv_bias, drop_path_rate)
        # Initialize the final linear layer for classification
        self.classifier = nn.Linear(128, 100)  # Adjust the output dimension for your specific classification task

    def forward(self, x):
        x = self.conv1(x)  # Apply the initial convolution
        B, C, H, W = x.shape
        x = x.permute(0, 2, 3, 1)  # Transpose the axes
        x = x.reshape(B, H // self.swin_transformer.patch_size[0], self.swin_transformer.patch_size[0], W // self.swin_transformer.patch_size[1], self.swin_transformer.patch_size[1], C)
        x = x.permute(0, 1, 3, 2, 4, 5)  # Transpose axes to group patches
        x = x.reshape(-1, self.swin_transformer.patch_size[0] * self.swin_transformer.patch_size[1], C)  # Reshape for attention calculation
        x = self.swin_transformer(x)  # Apply the SwinTransformerV2 model
        x = x.mean(dim=[1, 2])  # Global average pooling
        x = self.classifier(x)  # Classification layer
        return x

# Check if a GPU is available and set the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Provide all the necessary values for creating SwinTransformerV2
img_size = (224, 224)  # Input image size (upscale to 224x224 for compatibility with SwinTransformerV2)
patch_size = (8, 8)    # Patch size
embed_dim = 128        # Embedding dimension
depths = 12            # Number of blocks
num_heads = 32         # Number of attention heads
mlp_ratio = 4          # Inner MLP dimension ratio
qkv_bias = True        # Bias for QKV operations
drop_path_rate = 0.2   # Drop path rate for dropout

# Hyperparameters
batch_size = 32
learning_rate = 0.001
num_epochs = 5
num_classes = 100  # Number of classes in the classification task

# Hyperparameters
batch_size = 25  # Mini-batch size
accumulation_steps = 5  # Number of mini-batches to accumulate before performing an update

# Image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Pre-calculated mean and standard deviation values

# Load the CIFAR-100 dataset for training
train_dataset = datasets.CIFAR100(root='./data', train=True, transform=transform, download=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Load the CIFAR-100 dataset for testing
test_dataset = datasets.CIFAR100(root='./data', train=False, transform=transform, download=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Create the model and move the weights to the GPU
model = ConvMixerSwinV2(patch_size, embed_dim, depths, num_heads, mlp_ratio, qkv_bias, drop_path_rate).to(device)

# Define the optimizer and the loss function
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Model training
for epoch in range(num_epochs):
    total_loss = 0.0
    mini_batch_count = 0  # Counter for mini-batch accumulation

    for inputs, labels in train_loader:
        inputs = inputs.to(device)  # Move data to the GPU if available
        labels = labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()  # Accumulate gradient for subsequent updates

        mini_batch_count += 1

        if mini_batch_count == accumulation_steps:
            # Perform weight update after accumulating gradients for the defined number of mini-batches
            optimizer.zero_grad()  # RESET GRADIENT AFTER UPDATE

            mini_batch_count = 0

        total_loss += loss.item() * inputs.size(0)

        # Perform a final update if accumulation is not complete for the last batch
        if mini_batch_count > 0:

    average_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}")

    # Valutazione del modello sulla fase di validazione
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
      for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f"Epoch [{epoch+1}/{num_epochs}], Validation Accuracy: {accuracy:.4f}")





修改了SwinTransformerBlock块中x的尺寸,以确保尺寸与补丁数量一致。 在 SwinTransformerBlock 块内添加了排列和调整大小到正确的尺寸,以正确处理注意力计算步骤。 添加注释来解释对代码所做的每次修改的目的。 更正了训练部分的实现,包括处理梯度累积。 然而,实现中似乎仍然存在一些错误,因为“形状对于输入尺寸无效”错误不断出现。

此代码实现的期望是在 CIFAR-100 数据集上成功创建和训练用于图像分类的 ConvMixerSwinV2 模型。 ConvMixerSwinV2 模型将 ConvMixer 架构与 Swin Transformer 主干相结合,旨在提高图像理解任务的性能。



模型定义:定义ConvMixerSwinV2模型,该模型由卷积层、Swin Transformer主干和线性分类头组成。








具有 (a) 维度顺序错误和 (b) 大小错误。



import torch

x = torch.zeros((1600, 28, 28, 128))
patch_size = (8, 8)

B, C, H, W = x.shape  # Get batch size, channels, height, and width
x = x.permute(0, 2, 3, 1)  # Permute axes for correct format
x = x.reshape(B, H // patch_size[0], W // patch_size[1], patch_size[0], patch_size[1], C)  # Reshape into patches

# >>> RuntimeError: shape '[1600, 3, 16, 8, 8, 28]' is invalid for input of size 160563200


  • 我假设对于 CIFAR 图像数据,高度和宽度是相同的值。在你的例子中,看起来你有28个通道,高度为28,宽度为128。难道不应该是128个通道,高度为28,宽度为28吗?
  • 即使您有 28 像素的匹配高度和宽度,带有
    的行也会失败,因为 28 不能整除而没有余数被补丁大小 8 整除。换句话说:尺寸为 28x28 的图像不能被细分为大小为 8x8 的块。

建议: 检查

来完成此操作。请注意,该错误可能发生在您的任何 Swin Transformer 块中,而在第一个块中则不必要(我没有深入研究您的代码)。

