As mentioned in this Code Review question, I am trying to modify the code to remove rain streaks from a video frame by frame. The ffmpeg-python package is used in this code.
import argparse
import os
import time
import cv2
import ffmpeg
import numpy as np
import torch
from skimage import img_as_ubyte
from torch.utils.data import DataLoader
from tqdm import tqdm
import utils
from data_RGB import get_test_data
from MFDNet import HPCNet as mfdnet
def process_video_frame_by_frame(input_file, output_file, model_restoration):
    """
    Decodes a video frame by frame, processes each frame,
    and re-encodes it to a new video.

    Args:
        input_file: Path to the input video file.
        output_file: Path to the output video file.
        model_restoration: Deraining model applied to each frame.
    """
    try:
        # Probe for video information
        probe = ffmpeg.probe(input_file)
        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
        width = int(video_stream['width'])
        height = int(video_stream['height'])

        # Input: decode to raw RGB frames on stdout
        process1 = (
            ffmpeg
            .input(input_file)
            .output('pipe:', format='rawvideo', pix_fmt='rgb24')
            .run_async(pipe_stdout=True)
        )

        # Output: re-encode raw RGB frames from stdin with libx264
        process2 = (
            ffmpeg
            .input('pipe:', format='rawvideo', pix_fmt='rgb24', s='{}x{}'.format(width, height))
            .output(output_file, vcodec='libx264', pix_fmt='yuv420p')
            .overwrite_output()
            .run_async(pipe_stdin=True)
        )

        # Process frames (deraining)
        while in_bytes := process1.stdout.read(width * height * 3):
            in_frame = torch.frombuffer(in_bytes, dtype=torch.uint8).float().reshape((1, 3, width, height))
            restored = model_restoration(torch.div(in_frame, 255).to(device='cuda'))
            restored = torch.clamp(restored[0], 0, 1)
            restored = restored.cpu().detach().numpy()
            restored *= 255
            out_frame = restored
            np.reshape(out_frame, (3, width, height))
            # Encode and write the frame
            process2.stdin.write(
                out_frame
                .astype(np.uint8)
                .tobytes()
            )

        # Close streams
        process1.stdout.close()
        process2.stdin.close()
        process1.wait()
        process2.wait()
    except ffmpeg.Error as e:
        print('stdout:', e.stdout.decode('utf8'))
        print('stderr:', e.stderr.decode('utf8'))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Image Deraining using MPRNet')
    parser.add_argument('--weights', default='./checkpoints/checkpoints_mfd.pth', type=str,
                        help='Path to weights')
    parser.add_argument('--gpus', default='0', type=str, help='CUDA_VISIBLE_DEVICES')
    args = parser.parse_args()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    model_restoration = mfdnet()
    utils.load_checkpoint(model_restoration, args.weights)
    print("===>Testing using weights: ", args.weights)
    model_restoration.eval().cuda()

    input_video = "Input_video.mp4"
    output_video = 'output_video.mp4'
    process_video_frame_by_frame(input_video, output_video, model_restoration)
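For reference, given the argparse flags above, the script is launched along these lines (the filename derain_video.py is hypothetical):

python derain_video.py --weights ./checkpoints/checkpoints_mfd.pth --gpus 0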
Let's focus on the while loop. The version in the snippet above executes correctly. As a next step, I tried to follow 301_Moved_Permanently's answer and use torch.save, so the body of the while loop became:
# Process frames (deraining)
while in_bytes := process1.stdout.read(width * height * 3):
    in_frame = torch.frombuffer(in_bytes, dtype=torch.uint8).float().reshape((1, 3, width, height))
    restored = model_restoration(torch.div(in_frame, 255).to(device='cuda'))
    restored = torch.clamp(restored[0], 0, 1)
    out_frame = torch.mul(restored.cpu().detach(), 255).reshape(3, width, height).byte()
    torch.save(out_frame, process2.stdin)
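As an aside on that substitution: torch.save writes a pickled zip archive rather than raw pixel bytes, so even without the memory error the bytes piped to FFmpeg would no longer be valid rawvideo input. A minimal sketch of the difference (illustrative, not from the original script):

import io
import torch

frame = torch.zeros(3, 4, 4, dtype=torch.uint8)  # tiny stand-in frame

# Raw bytes: exactly C*H*W bytes, which is what rawvideo/rgb24 expects
raw = frame.numpy().tobytes()
print(len(raw))  # 48

# torch.save: a serialized archive with headers and pickle metadata
buf = io.BytesIO()
torch.save(frame, buf)
print(len(buf.getvalue()))  # far more than 48 bytes, and not raw pixels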
An out-of-memory error occurred, with the following message:

torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 676.00 MiB. GPU 0 has a total capacity of 23.99 GiB of which 0 bytes is free. Of the allocated memory 84.09 GiB is allocated by PyTorch, and 1.21 GiB is reserved by PyTorch but unallocated.
To diagnose the error, I removed the last two lines of the loop body:
# Process frames (deraining)
while in_bytes := process1.stdout.read(width * height * 3):
    in_frame = torch.frombuffer(in_bytes, dtype=torch.uint8).float().reshape((1, 3, width, height))
    restored = model_restoration(torch.div(in_frame, 255).to(device='cuda'))
    restored = torch.clamp(restored[0], 0, 1)
The out-of-memory error still occurred, which seems strange to me. My understanding of the working version is that the line
restored = restored.cpu().detach().numpy()
transfers the restored data from GPU memory to main memory and converts it to a NumPy array. Why does removing this line cause an out-of-memory error?
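To check whether GPU memory actually grows across iterations, it helps to log the allocator counters once per frame. torch.cuda.memory_allocated() and torch.cuda.memory_reserved() are standard PyTorch calls; the loop below is a diagnostic sketch of the stripped-down loop, not part of the original script:

frame_idx = 0
while in_bytes := process1.stdout.read(width * height * 3):
    in_frame = torch.frombuffer(in_bytes, dtype=torch.uint8).float().reshape((1, 3, width, height))
    restored = model_restoration(torch.div(in_frame, 255).to(device='cuda'))
    restored = torch.clamp(restored[0], 0, 1)
    frame_idx += 1
    # If these numbers climb frame after frame, something is holding references
    print(f"frame {frame_idx}: "
          f"allocated={torch.cuda.memory_allocated() / 2**20:.1f} MiB, "
          f"reserved={torch.cuda.memory_reserved() / 2**20:.1f} MiB")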
The hardware and software specifications I am using are as follows:

CPU: 12th Gen Intel(R) Core(TM) i9-12900K 3.20 GHz
RAM: 128 GB (128 GB usable)
GPU: NVIDIA GeForce RTX 4090
OS: Windows 11 Pro 22H2, OS build 22621.4317
PyTorch version:
> python -c "import torch; print(torch.__version__)"
2.5.0+cu124
Without more information about the model implementation, the only thing in the snippet that could cause a memory problem is tensor accumulation across iterations. You have a very capable setup, so I don't think a lightweight model would struggle with frame-by-frame processing.
Also, try torch.no_grad(): you are running inference, so there is no need to keep the autograd graph for operations you will never backpropagate through.
while in_bytes := process1.stdout.read(width * height * 3):
    with torch.no_grad():
        in_frame = torch.frombuffer(in_bytes, dtype=torch.uint8).float().reshape((1, 3, width, height))
        in_frame_gpu = torch.div(in_frame, 255).to(device='cuda')
        restored = model_restoration(in_frame_gpu)
        restored = torch.clamp(restored[0], 0, 1)
        out_frame = (restored.cpu() * 255).byte().numpy()

    # Drop the GPU references first, then release the cached blocks they held
    del in_frame_gpu
    del restored
    torch.cuda.empty_cache()

    process2.stdin.write(out_frame.tobytes())
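If torch.no_grad() alone does not cure it, torch.inference_mode() is a slightly stronger alternative on recent PyTorch versions (it disables autograd recording plus the view and version-counter bookkeeping). A sketch of the same loop under it, assuming the surrounding pipeline is unchanged:

with torch.inference_mode():
    while in_bytes := process1.stdout.read(width * height * 3):
        # Same per-frame processing as before; no autograd state is recorded
        in_frame = torch.frombuffer(in_bytes, dtype=torch.uint8).float().reshape((1, 3, width, height))
        restored = model_restoration(in_frame.div(255).to(device='cuda'))
        out_frame = (torch.clamp(restored[0], 0, 1).cpu() * 255).byte().numpy()
        process2.stdin.write(out_frame.tobytes())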