使用 load_dataset() 加载 Mozilla Common Voice (v11) 数据集时,得到的数据集 (ds) 中 audio 的 array 字段是 numpy 数组。我不知道如何在自己的数据集中重现这一点。
如何将某个特征 (feature) 存储为 ndarray?
检查 Common Voice 数据集时:
> tt = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f'{data_args.train_split_name}[:15%]', # Load only the first %
cache_dir=model_args.cache_dir,
token=model_args.token,
)
> type(tt.select([0])['audio'][0]['path'])
<class 'str'>
> type(tt.select([0])['audio'][0]['array'])
<class 'numpy.ndarray'>
> type(tt.select([0])['path'][0]) # They repeat paths as a top level feature
<class 'str'>
但在我自己的代码中,我无法存储 numpy 数组,直到我发现可以使用
ds = ds.with_format('np')
,这样重新加载数据集时确实能得到 numpy 数组,但所有顶级特征 (feature) 最终也都会变成 numpy 数据类型(完整的测试/重现代码见下文):
> type(test_ds['path'][0])
<class 'numpy.str_'>
我只需要 audio -> array 这一项数据是一维 (1d) numpy 数组。
这里是创建数据集并重新加载它以检查类型的测试代码:
#!/usr/bin/env python
# Trying to save and reload a numpy array to/from a huggingface dataset
# The type of the loaded array must be a numpy array()
from datasets import Dataset, Features, Array2D, Sequence, Value
import numpy as np
# Two random 16000-sample arrays standing in for real audio clips.
audio_arrays = [np.random.rand(16000) for _ in range(2)]

# Intended schema: each audio entry holds the sample array plus the path of
# its source file. NOTE: this object is built but never passed to
# Dataset.from_dict() below, so the dataset's features are inferred instead.
# (An Array2D(shape=(None,), dtype="float32") variant for 'array' was also tried.)
audio_schema = Sequence({
    'array': Sequence(feature=Value('float32')),
    'path': Value('string'),
})
features = Features({
    'audio': audio_schema,
    'path': Value('string'),  # Path is redundant in common voice set also
})

# The top-level paths deliberately differ from the nested ones so the two
# storage locations can be told apart after reloading.
ddata = {
    'path': ['/foo0/', '/bar0/'],
    'audio': [
        {'array': clip, 'path': clip_path}
        for clip, clip_path in zip(audio_arrays, ('/foo1/', '/bar1/'))
    ],
}

# Build the dataset, switch its output format to NumPy, and write it to disk.
ds = Dataset.from_dict(ddata).with_format('np')
ds.save_to_disk('/tmp/ds.ds')

# Reload and report the Python types that come back for each field.
loaded_dataset = Dataset.load_from_disk('/tmp/ds.ds')
ld = loaded_dataset
au = ld['audio'][0]
ar = au['array']
print("Type of audio array:", type(ar))
print("Type of path:", type(ld['path'][0]))
print("Type of au path:", type(au['path']))
# Drop into the debugger so the loaded objects can be inspected interactively.
import ipdb; ipdb.set_trace(context=16); pass
明白了。 输出是:
{'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None)}
{'path': '/foo', 'array': array([0.31222534, 0.04180908, 0.84359741, ..., 0.01086426, 0.37417603,
0.14474487]), 'sampling_rate': 16000}
Saving the dataset (1/1 shards): 100%|███████████████████████████| 2/2 [00:00<00:00, 984.58 examples/s]
Type of path: <class 'str'>
Type of audio array: <class 'numpy.ndarray'>
Type of audio.path: <class 'str'>
注意:直接把音频数组的原始字节传入时会报错(格式无效)。因此我们使用 soundfile 库把这些数组转换成完整、合法的 WAV 文件字节表示。
要测试/重现的代码:
#!/usr/bin/env python
# Store audio snippets, without actual associated files, as dataset Audio() types.
# When loaded, the ['audio']['array'] items will be numpy ndarrays
from datasets import Dataset, Features, Array2D, Sequence, Value, Audio
import numpy as np
import sys
import soundfile as sf
import io
# Reference:
# Input: The Audio feature accepts as input:
# A str: Absolute path to the audio file (i.e. random access is allowed).
# A dict with the keys:
# path: String with relative path of the audio file to the archive file.
# bytes: Bytes content of the audio file.
# Convert the NumPy arrays to audio bytes in WAV format
def numpy_to_bytes(audio_array, sampling_rate=16000):
    """Encode a NumPy audio array as the bytes of a complete WAV file.

    Args:
        audio_array: Samples to encode; handed unchanged to ``sf.write``.
        sampling_rate: Sample rate passed to ``sf.write`` (default 16000).

    Returns:
        The full contents of the in-memory WAV file as ``bytes``.
    """
    wav_buffer = io.BytesIO()
    try:
        sf.write(wav_buffer, audio_array, samplerate=sampling_rate, format='WAV')
        # Take the snapshot before the buffer is closed; getvalue() on a
        # closed BytesIO raises ValueError.
        return wav_buffer.getvalue()
    finally:
        wav_buffer.close()
# One sample rate shared by the WAV encoding and the Audio feature, so the
# two cannot drift apart.
SAMPLING_RATE = 16000

# Two random float32 clips of different lengths; no audio files exist on disk.
audio_arrays = [
    np.random.rand(10000).astype('float32'),
    np.random.rand(8300).astype('float32'),
]
# The Audio feature is fed encoded file bytes here, so each array is wrapped
# into a complete WAV byte string first.
audio_bytes = [
    numpy_to_bytes(audio_array, sampling_rate=SAMPLING_RATE)
    for audio_array in audio_arrays
]

features = Features({
    'path': Value('string'),  # Path is redundant in common voice set also
    'audio': Audio(sampling_rate=SAMPLING_RATE),
})
ddata = {
    'path': ['/foo', '/bar'],
    'audio': [
        {'bytes': audio_bytes[0], 'path': '/foo'},
        {'bytes': audio_bytes[1], 'path': '/bar'},
    ],
}

ds = Dataset.from_dict(ddata, features=features)
# Alternatives tried during experimentation, left for reference:
# ds = Dataset.from_dict(ddata).cast_column("audio", Audio())
# ds = ds.with_format('np')
print(ds.features)
print(ds[0]['audio'])
ds.save_to_disk('/tmp/ds.ds')

# Reload from disk and report the Python type of each field of the first row.
loaded_dataset = Dataset.load_from_disk('/tmp/ds.ds')
ld = loaded_dataset[0]
au = ld['audio']
ar = au['array']
print("Type of path:", type(ld['path']))
print("Type of audio array:", type(ar))  # previously printed twice by mistake
print("Type of audio.path:", type(au['path']))
# Drop into the debugger so the loaded objects can be inspected interactively.
import ipdb; ipdb.set_trace(context=16); pass