我正在尝试创建一个 python 脚本,它接受两个输入、一个图像和一个音频文件。然后脚本将输出一个以图像为背景的 mp4 文件,以及音频的波形可视化表示。类似于您可能会看到的音频演示、一些音乐等 Youtube 视频类型。类似于此:https://youtu.be/5GlacvXQGPU
我大部分时间都在工作,但在我的输出视频中,视觉波形与音频不同步。波形似乎在实际发生前一秒显示声音(未准确测量,只是目测)。
我认为这是有问题的部分,稍微简化了。更具体地说,我在视觉更新的每一帧上从音频中读取 256 帧(Chunk 是 256)这一事实导致了这个问题。但我不知道如何解决它。我最初将 Chunk 设置为 1。我的想法是我将分析 1 帧音频,并生成 1 帧输出视频。但是当一次读取音频1帧时,波形输出是胡说八道。
如有任何意见,我们将不胜感激。
# animation loop
def update(frame, *fargs):
global freq, PSD
in_data = wf.readframes(chunk)
# convert audio stream to correct format
audio = np.frombuffer(in_data, dtype=np.int16)
# fft - generate frequency and power spectrum arrays
freq, PSD = sig.periodogram(audio, sampleRate, nfft=sampleRate/10)
# take PSD data and create amplitude values for the frequency plot
divs[0] = PSD[0]
waves[0].set_ydata([-divs[0],divs[0]])
for i in range(1, numDivs):
# square-root of the average 'volume' for each frequency range
divs[i] = (np.average(PSD[int(freqDivs[i][0]):int(freqDivs[i][1])])**0.5)/2
# instant growth, proportional decay
if divs[i] > amp[i]: amp[i] = divs[i]
elif divs[i] < amp[i]: amp[i] = amp[i]-(amp[i]-divs[i])/1.8
waves[i].set_ydata([-amp[i],amp[i]])
interval = fps/1000
frame_count = int((wf.getnframes()/wf.getframerate())*fps)
# Save visual animation
anim = animation.FuncAnimation(fig, update, frames=frame_count, interval=interval)
writer = animation.FFMpegWriter(fps=fps)
anim.save('animation.mp4', writer=writer, dpi=10)
完整代码:
import pyaudio
import numpy as np
from scipy import signal as sig
import matplotlib
from matplotlib import pyplot as plt
import matplotlib.cbook as cbook
import matplotlib.animation as animation
import wave
from moviepy.editor import VideoFileClip, AudioFileClip
import random
freq = []
PSD = []
Audio_File=r'C:\Users\<user>\Documents\Waveform generator\3.wav'
Background_Image_File=r'C:\Users\<user>\Documents\Waveform generator\pic.png'
wf = wave.open(Audio_File, 'rb')
image = plt.imread(cbook.get_sample_data(Background_Image_File))
color_map = 'RdPu'
numDivs = 150
fps = 60
matplotlib setup
plt.ioff()
matplotlib.rcParams['toolbar'] = 'None'
fig = plt.figure()
fig.patch.set_facecolor('black')
Set resolution
fig.set_size_inches(128, 72, True)
dpi = 100
generate plot limits and hide axes
#ax = fig.add_axes([0, 0, 1, 1], frameon=False) # Adding the background image slows things down a lot
#ax.set_xticks([]) # So it's commented while testing
#ax.set_yticks([])
#im = ax.imshow(image, origin='upper', aspect=1)
ax2 = fig.add_axes([0, 0, 1, 1], frameon=False)
ax2.set_xlim(0,1.1), ax2.set_xticks([])
ax2.set_ylim(-40, 40), ax2.set_yticks([])
audio sampling parameters
chunk = 256
sampleRate = wf.getframerate()
divs = np.zeros(numDivs)
linear colormap normalized to numDivs
cMap = matplotlib.pyplot.get_cmap(color_map)
cInd = matplotlib.colors.Normalize(vmin=0, vmax=numDivs)
create frequency plot using matplotlib lines
waves = []
for i in range(0, numDivs):
wave, = plt.plot([0.05 + i/numDivs, 0.05 + i/numDivs], [0,0], color='w', linewidth = 3)
waves.append(wave)
generate frequency spectrum
freqDivs = []
freqDivs.append([0])
freqDivs.append([1,2])
for i in range(2, numDivs):
prevLow = freqDivs[i-1][0]
prevHigh = freqDivs[i-1][1]
freqDivs.append([prevHigh+1,prevHigh+1+(prevHigh-prevLow)*1.0625])
use the full fft spectrum
if len(freqDivs) == numDivs:
freqDivs[numDivs-1] = [freqDivs[numDivs-1][0], len(freq)]
amp = np.zeros(numDivs)
animation loop
def update(frame, *fargs):
global freq, PSD
print(frame)
in_data = wf.readframes(chunk)
convert audio stream to correct format
audio = np.frombuffer(in_data, dtype=np.int16)
fft - generate frequency and power spectrum arrays
freq, PSD = sig.periodogram(audio, sampleRate, nfft=sampleRate/10)
take PSD data and create amplitude values for the frequency plot
divs[0] = PSD[0]
waves[0].set_ydata([-divs[0],divs[0]])
for i in range(1, numDivs):
square-root of the average 'volume' for each frequency range
divs[i] = (np.average(PSD[int(freqDivs[i][0]):int(freqDivs[i][1])])**0.5)/2
instant growth, proportional decay
if divs[i] > amp[i]: amp[i] = divs[i]
elif divs[i] < amp[i]: amp[i] = amp[i]-(amp[i]-divs[i])/1.8
some (crude) processing, add some smoothing to the curves, reduce flickering, etc.
if amp[i] > 10:
amp[i] = 10 + amp[i]/5
if i == 0:
surr_avg = (amp[i+1] + amp[len(amp)])/ 2
elif i == len(amp)-1:
surr_avg = (amp[i] + amp[0])/ 2
else:
surr_avg = (amp[i+1] + amp[i-1])/ 2
amp[i] = (amp[i] + surr_avg)/2
if amp[i] < 0.3:
amp[i] = 0.3
waves[i].set_ydata([-amp[i],amp[i]])
interval = fps/1000
frame_count = int((wf.getnframes()/wf.getframerate())*fps)
Save visual animation
anim = animation.FuncAnimation(fig, update, frames=frame_count, interval=interval)
writer = animation.FFMpegWriter(fps=fps)
anim.save('animation.mp4', writer=writer, dpi=10)
Add the audio to the newly generated video
video = VideoFileClip("animation.mp4")
audio = AudioFileClip("3.wav")
video = video.set_audio(audio)
video.write_videofile("output.mp4")