import numpy as np
import torch
import torch.nn.functional as F
def layer_norm(x, weight, bias, eps=1e-6):
    # x shape: [bs, h, w, c]
    # Calculate mean and variance across the spatial dimensions (height, width)
    mean = np.mean(x, axis=(1, 2), keepdims=True)  # shape: (batch_size, 1, 1, channels)
    var = np.var(x, axis=(1, 2), keepdims=True, ddof=0)  # Use ddof=0 for biased variance
    # Normalize
    x_normalized = (x - mean) / np.sqrt(var + eps)
    # Applying weight and bias
    out = weight[None, None, None, :] * x_normalized + bias[None, None, None, :]
    return out
def test1(x):
    x = np.transpose(x, (0, 2, 3, 1))  # Transpose to [bs, h, w, c]
    weight = np.ones(channels)
    bias = np.zeros(channels)
    normalized_output = layer_norm(x, weight, bias)
    return normalized_output
def test2(x):
    global channels
    x = np.transpose(x, (0, 2, 3, 1))  # Transpose to [bs, h, w, c]
    x_tensor = torch.tensor(x, dtype=torch.float32)
    weight = torch.ones(channels)
    bias = torch.zeros(channels)
    # Use PyTorch's layer norm, normalizing over the last dimension (channels)
    normalized_output = F.layer_norm(x_tensor, normalized_shape=(channels,), weight=weight, bias=bias)
    return normalized_output.detach().numpy()
# Testing
batch, channels, height, width = 4, 3, 8, 8
# Generate random input
x = np.random.randint(-10, 10, (batch, channels, height, width))
# Calculate outputs from both implementations
layernorm1 = test1(x)
layernorm2 = test2(x)
# Check if outputs are close
are_close = np.allclose(layernorm1, layernorm2, atol=1e-4)
print("Outputs are close:", are_close) # Should output True if they are close enough
My expectation is are_close == True, meaning that layernorm1 and layernorm2 are nearly identical. Since layernorm1 and layernorm2 have large shapes, I only show part of each result:

layernorm1[0,0,0:3,0:4]
array([[ 0.35208505,  1.06448374, -0.52827179],
       [-1.6216472 , -1.7376534 , -1.07653225],
       [-1.12821414,  0.88935017,  1.84752351]])

layernorm2[0,0,0:3,0:4]
array([[ 0.07412489,  1.1859984 , -1.2601235 ],
       [-1.0690411 , -0.2672601 ,  1.336302  ],
       [-1.3920445 ,  0.4800153 ,  0.9120291 ]], dtype=float32)

I have tried computing the variance both with and without ddof=0:

var = np.var(x, axis=(1, 2), keepdims=True, ddof=0)  # Use ddof=0 for biased variance
var = np.var(x, axis=(1, 2), keepdims=True)

and the print statement always outputs False. How can I implement a custom layer norm that matches PyTorch's built-in layer_norm function? From a code perspective, what are the steps of layer norm? And what does layer norm do to feature maps in computer vision?
You are computing the mean/variance along the wrong dimensions.

PyTorch's layer norm computes the mean/variance along the dimensions specified by normalized_shape. From the documentation, the input is expected to have shape (*, normalized_shape[0], normalized_shape[1], ...).

Your code passes normalized_shape=(channels,) and permutes the input so that channels is the last dimension, so you should compute the mean/variance along that dimension. Instead, you compute them along dims (1, 2).
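To see this concretely, here is a quick sanity check (a sketch added here, not part of the original code): with normalized_shape=(channels,), F.layer_norm normalizes each (batch, h, w) position over its channel vector, so the mean along the last dimension is close to 0 and the biased variance is close to 1.

import torch
import torch.nn.functional as F

x_nhwc = torch.randn(2, 4, 4, 3)                 # hypothetical [bs, h, w, c] tensor
y = F.layer_norm(x_nhwc, normalized_shape=(3,))  # normalizes over the last dim only
print(y.mean(dim=-1).abs().max())                # close to 0 at every spatial position
print(y.var(dim=-1, unbiased=False).max())       # close to 1 (biased variance)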
Here is the correct implementation:
def layer_norm(x, weight, bias, eps=1e-5):
    # x shape: [bs, h, w, c]
    # Calculate mean and variance across the channel dimension (last axis)
    mean = np.mean(x, axis=-1, keepdims=True)  # shape: (batch_size, h, w, 1)
    var = np.var(x, axis=-1, keepdims=True, ddof=0)  # Use ddof=0 for biased variance
    # Normalize
    x_normalized = (x - mean) / np.sqrt(var + eps)
    # Apply weight and bias
    out = weight[None, None, None, :] * x_normalized + bias[None, None, None, :]
    return out
Note that I also set eps=1e-5, which is the PyTorch default.
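If you want to confirm that default yourself (a quick check added here, not from the original answer), the module version exposes it directly:

import torch

print(torch.nn.LayerNorm(3).eps)  # 1e-05; F.layer_norm uses the same default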
With this, running:
batch, channels, height, width = 4, 3, 8, 8
# Generate random input
x = np.random.randint(-10, 10, (batch, channels, height, width))
# Calculate outputs from both implementations
layernorm1 = test1(x)
layernorm2 = test2(x)
# Check if outputs are close
are_close = np.allclose(layernorm1, layernorm2, atol=1e-6)
results in are_close == True.
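As a side note (an addition, not part of the original answer), the module form torch.nn.LayerNorm(channels) with its default affine parameters is equivalent to the F.layer_norm call used in test2, so either can serve as the reference:

import numpy as np
import torch
import torch.nn.functional as F

batch, channels, height, width = 4, 3, 8, 8
x = np.random.randint(-10, 10, (batch, channels, height, width))
x_tensor = torch.tensor(np.transpose(x, (0, 2, 3, 1)), dtype=torch.float32)  # channels-last

ln = torch.nn.LayerNorm(channels)  # weight=1, bias=0, eps=1e-5 by default
module_out = ln(x_tensor).detach().numpy()
functional_out = F.layer_norm(x_tensor, normalized_shape=(channels,)).detach().numpy()
print(np.allclose(module_out, functional_out, atol=1e-6))  # True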