我有一种用于转录音频文件的火炬列车脚本。我有 2 个音频文件,其中一个具有 [1 1 16000] 形状,另一个具有 [1 1 16000] 形状。两个文件都是 10 秒。我无法处理该错误。如果您对音频有所了解,也许它会有所帮助。
文件位于路径中并且。在此之前,我会像 10sec+10sec+10sec+10sec+...+10sec 一样解析它们,如果你想看看我可以分享完整的代码。 当我使用“zorlu.m4a”运行代码时,它正在工作。但是当我用bell.m4a运行代码时,它给了我错误。
"""------------------------------blocks of code--------------------"""
# Define the path where the .wav files are located
wav_directory = os.getcwd()+"/wav" # Add your wav dir
# Define the output file name
output_file = os.path.join(wav_directory, "metadata.txt")
# Define the range of .wav files (1 to 165)
# Change the 165 to what ever amount of wav files you have if 100 the change it to 101
wav_files_range = range(1, int(len(ses)/parca_uzunlugu+1))
# Initialize the list to store file paths and transcripts
file_and_transcripts = []
# Initialize the wav2vec model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
# Iterate through the .wav files
for i in wav_files_range:
wav_file = os.path.join(wav_directory, f"{i}.wav")
# Check if the .wav file exists
if os.path.exists(wav_file):
# Recognize the speech in the .wav file
try:
waveform, sample_rate = torchaudio.load(wav_file)
print("waveform",type(waveform),waveform.shape)
waveform = waveform.squeeze() # Squeeze the batch dimension
print("waveform",type(waveform),waveform.shape)
resampler = torchaudio.transforms.Resample(
orig_freq=sample_rate, new_freq=16000)
print("waveform",type(waveform),waveform.shape)
waveform = resampler(waveform)
print("waveform",type(waveform),waveform.shape)
input_values = processor(
waveform, return_tensors="pt", sampling_rate=16000).input_values
print("waveform",type(waveform),waveform.shape)
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcript = processor.decode(predicted_ids[0])
except FileNotFoundError:
print(f"File not found: {wav_file}")
continue
# Append the desired path format and transcript to the list
file_and_transcripts.append(
f"{wav_directory}/{i}.wav|{transcript.capitalize()}.|{transcript.capitalize()}.")
else:
print(f"File not found: {wav_file}")
# Write the file paths and transcripts to the output file
with open(output_file, "w") as f:
for line in file_and_transcripts:
f.write(f"{line}\n")
print(f"File '{output_file}' created successfully.")
shutil.move(wav_directory+'/metadata.txt',wav_directory[:-4]+'/metadata.txt')
"""------------------------------blocks of code--------------------"""
Erors:
waveform <class 'torch.Tensor'> torch.Size([2, 441000])
waveform <class 'torch.Tensor'> torch.Size([2, 441000])
waveform <class 'torch.Tensor'> torch.Size([2, 441000])
waveform <class 'torch.Tensor'> torch.Size([2, 160000])
waveform <class 'torch.Tensor'> torch.Size([2, 160000])
--------i add a print line on torch/nn/modules/conv.py line 307------- <class 'torch.Tensor'> torch.Size([1, 1, 2, 160000])
Traceback (most recent call last):
File "/home/yunus/Desktop/piper/datasets/preprocess.py", line 87, in <module>
logits = model(input_values).logits
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 1962, in forward
outputs = self.wav2vec2(
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 1547, in forward
extract_features = self.feature_extractor(input_values)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 459, in forward
hidden_states = conv_layer(hidden_states)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 362, in forward
hidden_states = self.conv(hidden_states)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 311, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 307, in _conv_forward
return F.conv1d(input, weight, bias, self.stride,
RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 2, 160000]```
我发现了问题。我的第二个音频是立体声。这是关于如何解决它的解决方案。 如何在 Python 中将 WAV 从立体声转换为单声道?