AudioEffector 使用方法
作者: Moto Hira
本教程展示了如何使用 torchaudio.io.AudioEffector 对波形张量应用各种效果和编解码器。
本教程需要 FFmpeg 库。有关详细信息,请参阅 FFmpeg 依赖。
概述
AudioEffector 结合了由 StreamWriter 和 StreamReader 提供的内存编码、解码和过滤功能。
下图展示了该过程。

import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
2.6.0
2.6.0
from torchaudio.io import AudioEffector, CodecConfig
import matplotlib.pyplot as plt
from IPython.display import Audio
# Print the version of each FFmpeg library reported by torchaudio.
for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items():
    print(k, v)
libavcodec (60, 3, 100)
libavdevice (60, 1, 100)
libavfilter (9, 3, 100)
libavformat (60, 3, 100)
libavutil (58, 2, 100)
用法
要使用 AudioEffector,请使用 effect 和 format 实例化它,然后将波形传递给 apply() 或 stream() 方法。
effector = AudioEffector(effect=..., format=...,)
# Apply at once
applied = effector.apply(waveform, sample_rate)
apply 方法将效果和编解码器一次性应用于整个波形。因此,如果输入波形较长,且内存消耗是一个问题,可以使用 stream 方法逐块处理。
# Apply chunk by chunk
for applied_chunk in effector.stream(waveform, sample_rate, frames_per_chunk):
    ...
示例
src = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
waveform, sr = torchaudio.load(src, channels_first=False)
图库
def show(effect, *, stereo=False):
    """Apply ``effect`` to the tutorial waveform and plot a spectrogram per channel.

    Uses the module-level ``waveform`` and ``sr`` loaded earlier in the
    tutorial (loaded with ``channels_first=False``).

    Args:
        effect: Value forwarded to ``AudioEffector(effect=...)``. The tutorial
            passes ``None`` to show the unprocessed audio.
        stereo: If True, ``waveform`` is concatenated with itself along
            dim 1 before the effect is applied.

    Returns:
        An ``IPython.display.Audio`` object built from the processed waveform.
    """
    wf = torch.cat([waveform] * 2, dim=1) if stereo else waveform
    # A taller figure when two spectrograms are stacked.
    figsize = (6.4, 2.1 if stereo else 1.2)
    effector = AudioEffector(effect=effect, pad_end=False)
    result = effector.apply(wf, int(sr))
    num_channels = result.size(1)
    # squeeze=False keeps ``ax`` two-dimensional even with a single row,
    # so ``ax[i][0]`` indexing works for any channel count.
    f, ax = plt.subplots(num_channels, 1, squeeze=False, figsize=figsize, sharex=True)
    for i in range(num_channels):
        ax[i][0].specgram(result[:, i], Fs=sr)
    f.set_tight_layout(True)
    # Audio receives the transposed array of ``result``.
    return Audio(result.numpy().T, rate=sr)
原始音频
show(effect=None)

效果
速度(tempo)
https://ffmpeg.org/ffmpeg-filters.html#atempo
show("atempo=0.7")

show("atempo=1.8")

高通滤波器
https://ffmpeg.org/ffmpeg-filters.html#highpass
show("highpass=frequency=1500")

低通滤波器
https://ffmpeg.org/ffmpeg-filters.html#lowpass
show("lowpass=frequency=1000")

全通滤波器
https://ffmpeg.org/ffmpeg-filters.html#allpass
show("allpass")

带通滤波器
https://ffmpeg.org/ffmpeg-filters.html#bandpass
show("bandpass=frequency=3000")

带阻滤波器
https://ffmpeg.org/ffmpeg-filters.html#bandreject
show("bandreject=frequency=3000")

回声
https://ffmpeg.org/ffmpeg-filters.html#aecho
show("aecho=in_gain=0.8:out_gain=0.88:delays=6:decays=0.4")

show("aecho=in_gain=0.8:out_gain=0.88:delays=60:decays=0.4")

show("aecho=in_gain=0.8:out_gain=0.9:delays=1000:decays=0.3")

合唱效果
https://ffmpeg.org/ffmpeg-filters.html#chorus
show("chorus=0.5:0.9:50|60|40:0.4|0.32|0.3:0.25|0.4|0.3:2|2.3|1.3")

FFT 滤波器
https://ffmpeg.org/ffmpeg-filters.html#afftfilt
# fmt: off
show(
"afftfilt="
"real='re * (1-clip(b * (b/nb), 0, 1))':"
"imag='im * (1-clip(b * (b/nb), 0, 1))'"
)

show(
"afftfilt="
"real='hypot(re,im) * sin(0)':"
"imag='hypot(re,im) * cos(0)':"
"win_size=512:"
"overlap=0.75"
)

show(
"afftfilt="
"real='hypot(re,im) * cos(2 * 3.14 * (random(0) * 2-1))':"
"imag='hypot(re,im) * sin(2 * 3.14 * (random(1) * 2-1))':"
"win_size=128:"
"overlap=0.8"
)
# fmt: on

颤音效果
https://ffmpeg.org/ffmpeg-filters.html#vibrato
show("vibrato=f=10:d=0.8")

/pytorch/audio/ci_env/lib/python3.10/site-packages/IPython/lib/display.py:188: RuntimeWarning: invalid value encountered in cast
return scaled.astype("<h").tobytes(), nchan
震音效果(tremolo)
https://ffmpeg.org/ffmpeg-filters.html#tremolo
show("tremolo=f=8:d=0.8")

晶体化器
https://ffmpeg.org/ffmpeg-filters.html#crystalizer
show("crystalizer")

镶边效果
https://ffmpeg.org/ffmpeg-filters.html#flanger
show("flanger")

相位器
https://ffmpeg.org/ffmpeg-filters.html#aphaser
show("aphaser")

脉冲器
https://ffmpeg.org/ffmpeg-filters.html#apulsator
show("apulsator", stereo=True)

haas
https://ffmpeg.org/ffmpeg-filters.html#haas
show("haas")

编解码器
def show_multi(configs):
    """Apply several ``AudioEffector`` configurations and plot each result.

    Uses the module-level ``waveform`` and ``sr`` loaded earlier in the
    tutorial.

    Args:
        configs: Sequence of dicts; each dict is expanded as keyword
            arguments to ``AudioEffector`` (e.g. ``{"format": "ogg"}``).

    Returns:
        A list of ``IPython.display.Audio`` objects, one per configuration,
        in the same order as ``configs``.
    """
    results = [
        AudioEffector(**config).apply(waveform, int(sr)) for config in configs
    ]
    num_configs = len(configs)
    figsize = (6.4, 0.3 + num_configs * 0.9)
    # squeeze=False keeps ``axes`` a 2-D array; without it a single config
    # yields a bare Axes object that cannot be zipped with ``results``.
    f, axes = plt.subplots(num_configs, 1, squeeze=False, figsize=figsize, sharex=True)
    for result, ax in zip(results, axes[:, 0]):
        # Only the first channel of each result is plotted.
        ax.specgram(result[:, 0], Fs=sr)
    f.set_tight_layout(True)
    return [Audio(r.numpy().T, rate=sr) for r in results]
ogg
results = show_multi(
[
{"format": "ogg"},
{"format": "ogg", "encoder": "vorbis"},
{"format": "ogg", "encoder": "opus"},
]
)

ogg - 默认编码器 (flac)
results[0]
ogg - vorbis
results[1]
ogg - opus
results[2]
mp3
https://trac.ffmpeg.org/wiki/Encode/MP3
results = show_multi(
[
{"format": "mp3"},
{"format": "mp3", "codec_config": CodecConfig(compression_level=1)},
{"format": "mp3", "codec_config": CodecConfig(compression_level=9)},
{"format": "mp3", "codec_config": CodecConfig(bit_rate=192_000)},
{"format": "mp3", "codec_config": CodecConfig(bit_rate=8_000)},
{"format": "mp3", "codec_config": CodecConfig(qscale=9)},
{"format": "mp3", "codec_config": CodecConfig(qscale=1)},
]
)

默认
results[0]
compression_level=1
results[1]
compression_level=9
results[2]
比特率=192k
results[3]
比特率=8k
results[4]
qscale=9
results[5]
qscale=1
results[6]
标签: torchaudio.io