我正在使用 Pycuda 创建一个 python 程序,以与 GPU 并行实现 Sobel 算法。 我的代码是:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
def loadImage(image_path):
# Load the image
image = Image.open(image_path) # Convert to grayscale
image_array = np.asarray(image)
return image_array
def applySobel(image):
# Convert image to grayscale if needed
if len(image.shape) == 3:
image = np.mean(image, axis=2)
# Create a GPU device array from the image
image_gpu = cuda.to_device(image.astype(np.float32))
# Allocate GPU memory for the output
output_gpu = cuda.mem_alloc_like(image_gpu)
# Define the Sobel filter kernels
sobel_x_kernel = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=np.float32)
sobel_y_kernel = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=np.float32)
# Compile the CUDA kernel
mod = SourceModule("""
__global__ void sobelFilter(const float* input, float* output, int width, int height) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < height && col < width) {
float sum_x = 0.0;
float sum_y = 0.0;
for (int i = -1; i <= 1; ++i) {
for (int j = -1; j <= 1; ++j) {
int neighbor_row = row + i;
int neighbor_col = col + j;
if (neighbor_row >= 0 && neighbor_row < height && neighbor_col >= 0 && neighbor_col < width) {
float pixel = input[neighbor_row * width + neighbor_col];
sum_x += pixel * sobel_x_kernel[i + 1][j + 1];
sum_y += pixel * sobel_y_kernel[i + 1][j + 1];
}
}
}
output[row * width + col] = sqrt(sum_x * sum_x + sum_y * sum_y);
}
}
""")
# Get the compiled kernel function
sobel_filter_func = mod.get_function("sobelFilter")
# Set the block and grid dimensions
block_dim = (16, 16)
grid_dim = ((image.shape[1] - 1) // block_dim[0] + 1, (image.shape[0] - 1) // block_dim[1] + 1)
# Call the CUDA kernel
sobel_filter_func(image_gpu, output_gpu, np.int32(image.shape[1]), np.int32(image.shape[0]), block=(block_dim[0], block_dim[1], 1), grid=(grid_dim[0], grid_dim[1]))
# Copy the result back to the CPU
output = np.empty_like(image)
cuda.memcpy_dtoh(output, output_gpu)
return output
# Copy the processed image back to CPU
processed_image = np.empty_like(image)
cuda.memcpy_dtoh(processed_image, image_gpu)
return processed_image
def showImage(image):
# Display the image
plt.imshow(image, cmap="gray")
plt.axis("off")
plt.show()
#Test the code
image_path = "original_image.PNG"
#Load the image
image = loadImage(image_path)
#Apply Sobel filtering
processed_image = applySobel(image)
#Show the original and processed images
showImage(image)
showImage(processed_image)
尝试运行时,给出:
---------------------------------------------------------------------------
LogicError Traceback (most recent call last)
<ipython-input-17-404cd1292d54> in <cell line: 92>()
90
91 #Apply Sobel filtering
---> 92 processed_image = applySobel(image)
93
94 #Show the original and processed images
1 frames
/usr/local/lib/python3.10/dist-packages/pycuda/driver.py in to_device(bf_obj)
1077 else:
1078 bf = buffer(bf_obj)
-> 1079 result = mem_alloc(len(bf))
1080 memcpy_htod(result, bf)
1081 return result
LogicError: cuMemAlloc failed: an illegal memory access was encountered
我在其他算法中使用了相同的内存管理并且进展顺利,但这个不是。 请问有人可以帮我解决问题吗?
我希望有人提供解决我的问题的代码,或者至少提供一个可供审查的示例。
问题是传递给mem_alloc_like函数的对象必须支持nbytes属性。因此,您需要使用图像对象而不是image_gpu。将相关行更正为:
output_gpu = cuda.mem_alloc_like(image)
我还注意到,在 CUDA 内核中,您引用了字段 sobel_x_kernel 和 sobel_y_kernel,它们没有传递给内核。这是必须要做的。该过程与传输图像时类似。