11# -*- coding: utf-8 -*-
22"""
3- Audio Data Augmentation
3+ ์ค๋์ค ๋ฐ์ดํฐ ์ฆ๊ฐ
44=======================
55
6- ``torchaudio`` provides a variety of ways to augment audio data.
6+ *์ญ์*: Lee Jong Bub <https://github.com/bub3690>
77
8- In this tutorial, we look into a way to apply effects, filters,
9- RIR (room impulse response) and codecs.
8+ ``torchaudio`` ๋ ์ค๋์ค ๋ฐ์ดํฐ๋ฅผ ์ฆ๊ฐ์ํค๋ ๋ค์ํ ๋ฐฉ๋ฒ๋ค์ ์ ๊ณตํฉ๋๋ค.
109
11- At the end, we synthesize noisy speech over phone from clean speech.
10+ ์ด ํํ ๋ฆฌ์ผ์์๋ ํจ๊ณผ, ํํฐ,
11+ ๊ณต๊ฐ ์ํ์ค ์๋ต(RIR, Room Impulse Response)๊ณผ ์ฝ๋ฑ์ ์ ์ฉํ๋ ๋ฐฉ๋ฒ์ ์ดํด๋ณด๊ฒ ์ต๋๋ค.
12+
13+ ํ๋จ๋ถ์์๋, ๊นจ๋ํ ์์ฑ์ผ๋ก ๋ถํฐ ํด๋ํฐ ๋๋จธ์ ์ก์์ด ๋ ์์ฑ์ ํฉ์ฑํ๊ฒ ์ต๋๋ค.
1214"""
1315
1416import torch
1921print (torchaudio .__version__ )
2022
2123######################################################################
22- # Preparation
24+ # ์ค๋น
2325# -----------
2426#
25- # First, we import the modules and download the audio assets we use in this tutorial .
27+ # ๋จผ์ , ๋ชจ๋์ ๋ถ๋ฌ์ค๊ณ ํํ ๋ฆฌ์ผ์ ์ฌ์ฉํ ์ค๋์ค ์๋ฃ๋ค์ ๋ค์ด๋ก๋ํฉ๋๋ค .
2628#
2729
2830import math
3941
4042
4143######################################################################
42- # Applying effects and filtering
44+ # ํจ๊ณผ์ ํํฐ๋ง ์ ์ฉํ๊ธฐ
4345# ------------------------------
4446#
45- # :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to
46- # those available in ``sox`` to Tensor objects and file object audio sources .
47+ # :py:func:`torchaudio.sox_effects` ๋ ``sox`` ์ ์ ์ฌํ ํํฐ๋ค์
48+ # ํ
์ ๊ฐ์ฒด๋ค๊ณผ ํ์ผ ๊ฐ์ฒด ์ค๋์ค ์์ค๋ค์ ์ง์ ์ ์ฉ ํด์ค๋๋ค .
4749#
48- # There are two functions for this :
50+ # ์ด๋ฅผ ์ํด ๋๊ฐ์ง ํจ์๊ฐ ์ฌ์ฉ๋ฉ๋๋ค :
4951#
50- # - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects
51- # to Tensor .
52- # - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to
53- # other audio sources .
52+ # - :py:func:`torchaudio.sox_effects.apply_effects_tensor` ๋ ํ
์์
53+ # ํจ๊ณผ๋ฅผ ์ ์ฉํฉ๋๋ค .
54+ # - :py:func:`torchaudio.sox_effects.apply_effects_file` ๋ ๋ค๋ฅธ ์ค๋์ค ์์ค๋ค์
55+ # ํจ๊ณผ๋ฅผ ์ ์ฉํฉ๋๋ค .
5456#
55- # Both functions accept effect definitions in the form
56- # ``List[List[str]]``.
57- # This is mostly consistent with how ``sox`` command works, but one caveat is
58- # that ``sox`` adds some effects automatically, whereas ``torchaudio``'s
59- # implementation does not.
57+ # ๋ ํจ์๋ค์ ํจ๊ณผ์ ์ ์๋ฅผ ``List[List[str]]`` ํํ๋ก ๋ฐ์๋ค์
๋๋ค.
58+ # ``sox`` ์ ์๋ํ๋ ๋ฐฉ๋ฒ์ด ๊ฑฐ์ ์ ์ฌํฉ๋๋ค. ํ์ง๋ง, ํ๊ฐ์ง ์ ์์ ์
59+ # ``sox`` ๋ ์๋์ผ๋ก ํจ๊ณผ๋ฅผ ์ถ๊ฐํ์ง๋ง, ``torchaudio`` ์ ๊ตฌํ์ ๊ทธ๋ ์ง ์๋ค๋ ์ ์
๋๋ค.
6060#
61- # For the list of available effects, please refer to `the sox
62- # documentation <http://sox.sourceforge.net/sox.html>`__.
61+ # ์ฌ์ฉ ๊ฐ๋ฅํ ํจ๊ณผ๋ค์ ๋ชฉ๋ก์ ์๊ณ ์ถ๋ค๋ฉด, `the sox
62+ # documentation <http://sox.sourceforge.net/sox.html>`__ ์ ์ฐธ์กฐํด์ฃผ์ธ์ .
6363#
64- # **Tip** If you need to load and resample your audio data on the fly,
65- # then you can use :py:func:`torchaudio.sox_effects.apply_effects_file`
66- # with effect ``"rate"``.
64+ # **Tip** ์ฆ์์ผ๋ก ์ค๋์ค ๋ฐ์ดํฐ ๋ก๋์ ๋ค์ ์ํ๋ง ํ๊ณ ์ถ๋ค๋ฉด,
65+ # ํจ๊ณผ ``"rate"`` ์ ํจ๊ป :py:func:`torchaudio.sox_effects.apply_effects_file` ์ ์ฌ์ฉํ์ธ์.
6766#
68- # **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a
69- # file-like object or path-like object.
70- # Similar to :py:func:`torchaudio.load`, when the audio format cannot be
71- # inferred from either the file extension or header, you can provide
72- # argument ``format`` to specify the format of the audio source.
67+ # **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` ๋ ํ์ผ ํํ์ ๊ฐ์ฒด ๋๋ ์ฃผ์ ํํ์ ๊ฐ์ฒด๋ฅผ ๋ฐ์ต๋๋ค.
68+ # :py:func:`torchaudio.load` ์ ์ ์ฌํ๊ฒ, ์ค๋์ค ํฌ๋งท์ด
69+ # ํ์ผ ํ์ฅ์๋ ํค๋๋ฅผ ํตํด ์ถ๋ก ๋ ์ ์์ผ๋ฉด,
70+ # ์ ๋ฌ์ธ์ ``format`` ์ ์ฃผ์ด, ์ค๋์ค ์์ค์ ํฌ๋งท์ ๊ตฌ์ฒดํ ํด์ค ์ ์์ต๋๋ค.
7371#
74- # **Note** This process is not differentiable .
72+ # **Note** ์ด ๊ณผ์ ์ ๋ฏธ๋ถ ๋ถ๊ฐ๋ฅํฉ๋๋ค .
7573#
7674
77- # Load the data
75+ # ๋ฐ์ดํฐ๋ฅผ ๋ถ๋ฌ์ต๋๋ค.
7876waveform1 , sample_rate1 = torchaudio .load (SAMPLE_WAV )
7977
80- # Define effects
78+ # ํจ๊ณผ๋ค์ ์ ์ํฉ๋๋ค.
8179effects = [
82- ["lowpass" , "-1" , "300" ], # apply single-pole lowpass filter
83- ["speed" , "0.8" ], # reduce the speed
84- # This only changes sample rate, so it is necessary to
85- # add `rate` effect with original sample rate after this .
80+ ["lowpass" , "-1" , "300" ], # ๋จ๊ทน ์ ์ฃผํ ํต๊ณผ ํํฐ๋ฅผ ์ ์ฉํฉ๋๋ค.
81+ ["speed" , "0.8" ], # ์๋๋ฅผ ๊ฐ์์ํต๋๋ค.
82+ # ์ด ๋ถ๋ถ์ ์ํ ๋ ์ดํธ๋ง ๋ณ๊ฒฝํ๊ธฐ์, ์ดํ์
83+ # ํ์์ ์ผ๋ก `rate` ํจ๊ณผ๋ฅผ ๊ธฐ์กด ์ํ ๋ ์ดํธ๋ก ์ฃผ์ด์ผํฉ๋๋ค .
8684 ["rate" , f"{ sample_rate1 } " ],
87- ["reverb" , "-w" ], # Reverberation gives some dramatic feeling
85+ ["reverb" , "-w" ], # ์ํฅ์ ์ฝ๊ฐ์ ๊ทน์ ์ธ ๋๋์ ์ค๋๋ค.
8886]
8987
90- # Apply effects
88+ # ํจ๊ณผ๋ค์ ์ ์ฉํฉ๋๋ค.
9189waveform2 , sample_rate2 = torchaudio .sox_effects .apply_effects_tensor (waveform1 , sample_rate1 , effects )
9290
9391print (waveform1 .shape , sample_rate1 )
9492print (waveform2 .shape , sample_rate2 )
9593
9694######################################################################
97- # Note that the number of frames and number of channels are different from
98- # those of the original after the effects are applied. Let's listen to the
99- # audio.
95+ # ํจ๊ณผ๊ฐ ์ ์ฉ๋๋ฉด, ํ๋ ์์ ์์ ์ฑ๋์ ์๋ ๊ธฐ์กด์ ์ ์ฉ๋ ๊ฒ๋ค๊ณผ ๋ฌ๋ผ์ง์ ์ฃผ์ํ์ธ์.
96+ # ์ด์ ์ค๋์ค๋ฅผ ๋ค์ด๋ด
์๋ค.
10097#
10198
10299def plot_waveform (waveform , sample_rate , title = "Waveform" , xlim = None ):
@@ -139,7 +136,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
139136 plt .show (block = False )
140137
141138######################################################################
142- # Original :
139+ # ๊ธฐ์กด :
143140# ~~~~~~~~~
144141#
145142
@@ -148,7 +145,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
148145Audio (waveform1 , rate = sample_rate1 )
149146
150147######################################################################
151- # Effects applied :
148+ # ํจ๊ณผ ์ ์ฉ ํ :
152149# ~~~~~~~~~~~~~~~~
153150#
154151
@@ -157,24 +154,22 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
157154Audio (waveform2 , rate = sample_rate2 )
158155
159156######################################################################
160- # Doesn't it sound more dramatic ?
157+ # ์ข ๋ ๊ทน์ ์ผ๋ก ๋ค๋ฆฌ์ง ์๋์ ?
161158#
162159
163160######################################################################
164- # Simulating room reverberation
161+ # ๋ฐฉ ์ํฅ ๋ชจ์ ์คํํ๊ธฐ
165162# -----------------------------
166163#
167164# `Convolution
168- # reverb <https://en.wikipedia.org/wiki/Convolution_reverb>`__ is a
169- # technique that's used to make clean audio sound as though it has been
170- # produced in a different environment.
165+ # reverb <https://en.wikipedia.org/wiki/Convolution_reverb>`__ ๋
166+ # ๊นจ๋ํ ์ค๋์ค๋ฅผ ๋ค๋ฅธ ํ๊ฒฝ์์ ์์ฑ๋ ๊ฒ์ฒ๋ผ ๋ง๋ค์ด์ฃผ๋ ๊ธฐ์ ์
๋๋ค.
171167#
172- # Using Room Impulse Response (RIR), for instance, we can make clean speech
173- # sound as though it has been uttered in a conference room .
168+ # ์๋ฅผ๋ค์ด, ๊ณต๊ฐ ์ํ์ค ์๋ต (RIR)์ ํ์ฉํ์ฌ, ๊นจ๋ํ ์์ฑ์
169+ # ๋ง์น ํ์์ค์์ ๋ฐ์๋ ๊ฒ์ฒ๋ผ ๋ง๋ค ์ ์์ต๋๋ค .
174170#
175- # For this process, we need RIR data. The following data are from the VOiCES
176- # dataset, but you can record your own - just turn on your microphone
177- # and clap your hands.
171+ # ์ด ๊ณผ์ ์ ์ํด์, RIR ๋ฐ์ดํฐ๊ฐ ํ์ํฉ๋๋ค. ๋ค์ ๋ฐ์ดํฐ๋ค์ VOiCES ๋ฐ์ดํฐ์
์์ ์์ต๋๋ค.
172+ # ํ์ง๋ง, ์ง์ ๋
น์ํ ์๋ ์์ต๋๋ค. - ์ง์ ๋ง์ดํฌ๋ฅผ ์ผ์๊ณ , ๋ฐ์๋ฅผ ์น์ธ์!
178173#
179174
180175rir_raw , sample_rate = torchaudio .load (SAMPLE_RIR )
@@ -183,8 +178,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
183178Audio (rir_raw , rate = sample_rate )
184179
185180######################################################################
186- # First, we need to clean up the RIR. We extract the main impulse, normalize
187- # the signal power, then flip along the time axis .
181+ # ๋จผ์ , RIR์ ๊นจ๋ํ๊ฒ ๋ง๋ค์ด์ค์ผํฉ๋๋ค. ์ฃผ์ํ ์ํ์ค๋ฅผ ์ถ์ถํ๊ณ ,
182+ # ์ ํธ ์ ๋ ฅ์ ์ ๊ทํ ํฉ๋๋ค. ๊ทธ๋ฆฌ๊ณ ๋์ ์๊ฐ์ถ์ ๋ค์ง์ด ์ค๋๋ค .
188183#
189184
190185rir = rir_raw [:, int (sample_rate * 1.01 ) : int (sample_rate * 1.3 )]
@@ -194,7 +189,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
194189plot_waveform (rir , sample_rate , title = "Room Impulse Response" )
195190
196191######################################################################
197- # Then, we convolve the speech signal with the RIR filter .
192+ # ๊ทธ ํ, RIR ํํฐ์ ์์ฑ ์ ํธ๋ฅผ ํฉ์ฑ๊ณฑ ํฉ๋๋ค .
198193#
199194
200195speech , _ = torchaudio .load (SAMPLE_SPEECH )
@@ -203,7 +198,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
203198augmented = torch .nn .functional .conv1d (speech_ [None , ...], RIR [None , ...])[0 ]
204199
205200######################################################################
206- # Original :
201+ # ๊ธฐ์กด :
207202# ~~~~~~~~~
208203#
209204
@@ -212,7 +207,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
212207Audio (speech , rate = sample_rate )
213208
214209######################################################################
215- # RIR applied :
210+ # RIR ์ ์ฉ ํ :
216211# ~~~~~~~~~~~~
217212#
218213
@@ -222,13 +217,12 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
222217
223218
224219######################################################################
225- # Adding background noise
220+ # ๋ฐฐ๊ฒฝ ์์ ์ถ๊ฐํ๊ธฐ
226221# -----------------------
227222#
228- # To add background noise to audio data, you can simply add a noise Tensor to
229- # the Tensor representing the audio data. A common method to adjust the
230- # intensity of noise is changing the Signal-to-Noise Ratio (SNR).
231- # [`wikipedia <https://en.wikipedia.org/wiki/Signal-to-noise_ratio>`__]
223+ # ์ค๋์ค ๋ฐ์ดํฐ์ ์์์ ์ถ๊ฐํ๊ธฐ ์ํด์, ๊ฐ๋จํ ์์ ํ
์๋ฅผ ์ค๋์ค ๋ฐ์ดํฐ ํ
์์ ๋ํ ์ ์์ต๋๋ค.
224+ # ์์์ ์ ๋๋ฅผ ์กฐ์ ํ๋ ํํ ๋ฐฉ๋ฒ์ ์ ํธ ๋ ์ก์๋น (SNR)๋ฅผ ๋ฐ๊พธ๋ ๊ฒ์
๋๋ค.
225+ # [`wikipedia <https://ko.wikipedia.org/wiki/%EC%8B%A0%ED%98%B8_%EB%8C%80_%EC%9E%A1%EC%9D%8C%EB%B9%84>`__]
232226#
233227# $$ \\mathrm{SNR} = \\frac{P_{signal}}{P_{noise}} $$
234228#
@@ -250,7 +244,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
250244 noisy_speeches .append ((scale * speech + noise ) / 2 )
251245
252246######################################################################
253- # Background noise :
247+ # ๋ฐฐ๊ฒฝ ์ก์ :
254248# ~~~~~~~~~~~~~~~~~
255249#
256250
@@ -290,13 +284,12 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
290284
291285
292286######################################################################
293- # Applying codec to Tensor object
287+ # ์ฝ๋ฑ์ ํ
์ ๊ฐ์ฒด์ ์ ์ฉํ๊ธฐ
294288# -------------------------------
295289#
296- # :py:func:`torchaudio.functional.apply_codec` can apply codecs to
297- # a Tensor object.
290+ # :py:func:`torchaudio.functional.apply_codec` ๋ ํ
์ ์ค๋ธ์ ํธ์ ์ฝ๋ฑ์ ์ ์ฉํฉ๋๋ค.
298291#
299- # **Note** This process is not differentiable .
292+ # **Note** ์ด ๊ณผ์ ์ ๋ฏธ๋ถ ๋ถ๊ฐ๋ฅํฉ๋๋ค .
300293#
301294
302295
@@ -349,29 +342,27 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
349342Audio (waveforms [2 ], rate = sample_rate )
350343
351344######################################################################
352- # Simulating a phone recording
345+ # ์ ํ ๋
น์ ๋ชจ์ ์คํํ๊ธฐ
353346# ---------------------------
354347#
355- # Combining the previous techniques, we can simulate audio that sounds
356- # like a person talking over a phone in an echoey room with people talking
357- # in the background.
348+ # ์ด์ ๊ธฐ์ ๋ค์ ํผํฉํ์ฌ, ๋ฐํฅ์๋ ๋ฐฉ์ ์ฌ๋๋ค์ด ์ด์ผ๊ธฐํ๋ ๋ฐฐ๊ฒฝ์์ ์ ํ ํตํํ๋
349+ # ๊ฒ ์ฒ๋ผ ๋ค๋ฆฌ๋ ์ค๋์ค๋ฅผ ๋ชจ์ ์คํํ ์ ์์ต๋๋ค.
358350#
359351
360352sample_rate = 16000
361353original_speech , sample_rate = torchaudio .load (SAMPLE_SPEECH )
362354
363355plot_specgram (original_speech , sample_rate , title = "Original" )
364356
365- # Apply RIR
357+ # RIR ์ ์ฉํ๊ธฐ
366358speech_ = torch .nn .functional .pad (original_speech , (RIR .shape [1 ] - 1 , 0 ))
367359rir_applied = torch .nn .functional .conv1d (speech_ [None , ...], RIR [None , ...])[0 ]
368360
369361plot_specgram (rir_applied , sample_rate , title = "RIR Applied" )
370362
371- # Add background noise
372- # Because the noise is recorded in the actual environment, we consider that
373- # the noise contains the acoustic feature of the environment. Therefore, we add
374- # the noise after RIR application.
363+ # ๋ฐฐ๊ฒฝ ์ก์ ์ถ๊ฐํ๊ธฐ
364+ # ์ก์์ด ์ค์ ํ๊ฒฝ์์ ๋
น์๋์๊ธฐ ๋๋ฌธ์, ์ก์์ด ํ๊ฒฝ์ ์ํฅ ํน์ง์ ๊ฐ์ง๊ณ ์๋ค๊ณ ๊ณ ๋ คํ์ต๋๋ค.
365+ # ๋ฐ๋ผ์, RIR ์ ์ฉ ํ์ ์ก์์ ์ถ๊ฐํ์ต๋๋ค
375366noise , _ = torchaudio .load (SAMPLE_NOISE )
376367noise = noise [:, : rir_applied .shape [1 ]]
377368
@@ -381,7 +372,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
381372
382373plot_specgram (bg_added , sample_rate , title = "BG noise added" )
383374
384- # Apply filtering and change sample rate
375+ # ํํฐ๋ง์ ์ ์ฉํ๊ณ ์ํ ๋ ์ดํธ ์์ ํ๊ธฐ
385376filtered , sample_rate2 = torchaudio .sox_effects .apply_effects_tensor (
386377 bg_added ,
387378 sample_rate ,
@@ -401,42 +392,42 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
401392
402393plot_specgram (filtered , sample_rate2 , title = "Filtered" )
403394
404- # Apply telephony codec
395+ # ์ ํ ์ฝ๋ฑ ์ ์ฉํ๊ธฐ
405396codec_applied = F .apply_codec (filtered , sample_rate2 , format = "gsm" )
406397
407398plot_specgram (codec_applied , sample_rate2 , title = "GSM Codec Applied" )
408399
409400
410401######################################################################
411- # Original speech :
402+ # ๊ธฐ์กด ์์ฑ :
412403# ~~~~~~~~~~~~~~~~
413404#
414405
415406Audio (original_speech , rate = sample_rate )
416407
417408######################################################################
418- # RIR applied :
409+ # RIR ์ ์ฉ ํ :
419410# ~~~~~~~~~~~~
420411#
421412
422413Audio (rir_applied , rate = sample_rate )
423414
424415######################################################################
425- # Background noise added :
416+ # ๋ฐฐ๊ฒฝ ์ก์ ์ถ๊ฐ ํ :
426417# ~~~~~~~~~~~~~~~~~~~~~~~
427418#
428419
429420Audio (bg_added , rate = sample_rate )
430421
431422######################################################################
432- # Filtered :
423+ # ํํฐ๋ง ์ ์ฉ ํ :
433424# ~~~~~~~~~
434425#
435426
436427Audio (filtered , rate = sample_rate2 )
437428
438429######################################################################
439- # Codec applied :
430+ # ์ฝ๋ฑ ์ ์ฉ ํ :
440431# ~~~~~~~~~~~~~
441432#
442433
0 commit comments