######################################################################
# Let’s check if a CUDA GPU is available and select our device. Running
# the network on a GPU will greatly decrease the training/testing runtime.
#

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


######################################################################
# Importing the Dataset
# ---------------------
#
# We will use the UrbanSound8K dataset to train our network. It is
# available for free `here <https://urbansounddataset.weebly.com/>`_ and contains
# 10 audio classes with over 8000 audio samples! Once you have downloaded
# the compressed dataset, extract it to your current working directory.
# First, we will look at the csv file that provides information about the
# individual sound files. ``pandas`` allows us to open the csv file and
# use ``.iloc`` to access the data within it.
#

csvData = pd.read_csv('./data/UrbanSound8K/metadata/UrbanSound8K.csv')
print(csvData.iloc[0, :])


######################################################################
# The 10 audio classes in the UrbanSound8K dataset are air_conditioner,
# car_horn, children_playing, dog_bark, drilling, engine_idling,
# gun_shot, jackhammer, siren, and street_music. Let’s play a couple files
# and see what they sound like. The first file is street music and the
# second is an air conditioner.
#

import IPython.display as ipd
ipd.Audio('./data/UrbanSound8K/audio/fold1/108041-9-0-5.wav')
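

######################################################################
# The second clip, the air conditioner, can be played the same way. Rather
# than hard-coding a file name, the snippet below looks up the first
# ``air_conditioner`` entry in the metadata, assuming the standard
# UrbanSound8K columns ``classID``, ``fold``, and ``slice_file_name``.
#

ac = csvData[csvData['classID'] == 0].iloc[0] #air_conditioner is classID 0
ipd.Audio('./data/UrbanSound8K/audio/fold{}/{}'.format(ac['fold'], ac['slice_file_name']))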


######################################################################
# Formatting the Data
# -------------------
#
# Now that we know the format of the csv file entries, we can construct
# our dataset. We will create a wrapper class for our dataset using
# ``torch.utils.data.Dataset`` that will handle loading the files and
# applying the formatting steps described below. When initialized, the
# class will store the file names, labels, and folder numbers of the audio
# files in the inputted folder list. The actual loading and formatting
# steps happen in the access function ``__getitem__``.
#
# In ``__getitem__``, we use ``torchaudio.load()`` to convert the wav
# files to tensors. ``torchaudio.load()`` returns a tuple containing the
# newly created tensor along with the sampling frequency of the audio file
# (44.1kHz for most UrbanSound8K clips). The dataset contains two audio
# channels, so ``torchaudio.transforms.DownmixMono()`` is used to reduce
# them to one. The network we will define expects a one-channel input of
# 32,000 samples, so each clip is downsampled by keeping every fifth sample
# of its first 160,000 samples, which yields audio at roughly 8kHz. Not
# every audio tensor is long enough to handle the downsampling, so these
# tensors will need to be padded with zeros. The minimum length that won’t
# require padding is 160,000 samples.
#

class UrbanSoundDataset(Dataset):
    #wrapper for the UrbanSound8K dataset
    # Argument List
    #  path to the UrbanSound8K csv file
    #  path to the UrbanSound8K audio files
    #  list of folders to use in the dataset

    def __init__(self, csv_path, file_path, folderList):
        csvData = pd.read_csv(csv_path)
        #initialize lists to hold file names, labels, and folder numbers
        self.file_names = []
        self.labels = []
        self.folders = []
        #loop through the csv entries and only add entries from folders in the folder list
        for i in range(0, len(csvData)):
            if csvData.iloc[i, 5] in folderList:
                self.file_names.append(csvData.iloc[i, 0])
                self.labels.append(csvData.iloc[i, 6])
                self.folders.append(csvData.iloc[i, 5])

        self.file_path = file_path
        self.mixer = torchaudio.transforms.DownmixMono() #UrbanSound8K uses two channels, this will convert them to one
        self.folderList = folderList

    def __getitem__(self, index):
        #format the file path and load the file
        path = self.file_path + "fold" + str(self.folders[index]) + "/" + self.file_names[index]
        sound = torchaudio.load(path) #returns a tuple of the audio tensor and its sampling frequency
        soundData = self.mixer(sound[0]) #downmix the two channels to one
        #pad or trim every clip to exactly 160,000 samples before downsampling
        tempData = torch.zeros([160000, 1]) #tempData accounts for audio clips that are too short
        if soundData.numel() < 160000:
            tempData[:soundData.numel()] = soundData[:]
        else:
            tempData[:] = soundData[:160000]

        soundData = tempData
        soundFormatted = torch.zeros([32000, 1])
        soundFormatted[:32000] = soundData[::5] #take every fifth sample of soundData
        soundFormatted = soundFormatted.permute(1, 0)
        return soundFormatted, self.labels[index]

    def __len__(self):
        return len(self.file_names)


csv_path = './data/UrbanSound8K/metadata/UrbanSound8K.csv'
file_path = './data/UrbanSound8K/audio/'

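
######################################################################
# With the wrapper class in place we can build the actual datasets and
# loaders. The conclusion below trains on 9 of the 10 folders and tests on
# the remaining one; the batch size and loader options here are reasonable
# choices rather than required values.
#

train_set = UrbanSoundDataset(csv_path, file_path, range(1, 10)) #folders 1-9 for training
test_set = UrbanSoundDataset(csv_path, file_path, [10]) #folder 10 held out for testing
print("Train set size: " + str(len(train_set)))
print("Test set size: " + str(len(test_set)))

#pin memory and use a worker process when batches will be moved to the GPU
kwargs = {'num_workers': 1, 'pin_memory': True} if device.type == 'cuda' else {}
train_loader = torch.utils.data.DataLoader(train_set, batch_size = 128, shuffle = True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = 128, shuffle = False, **kwargs)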


######################################################################
# Define the Network
# ------------------
#
# For this tutorial we will use a convolutional neural network to process
# the raw audio data. Usually more advanced transforms are applied to the
# audio data; however, CNNs can be used to accurately process the raw data.
# An important factor when working directly on the waveform is the
# receptive field of the first layer’s filters. Our first convolution uses
# a filter that is 80 samples long, so when processing audio sampled at
# 8kHz the receptive field is around 10ms (80 / 8000 = 0.01s). This size is
# similar to speech processing applications that often use receptive fields
# ranging from 20ms to 40ms.
#

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        #a long (80 sample, stride 4) first filter followed by three 3-sample
        #convolutions, each with batch norm and a max pool of 4; this turns the
        #1x32000 input into the 512x30 tensor expected by avgPool below
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(30) #input should be 512x30 so this outputs a 512x1
        self.fc1 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1) #swap the length and channel dimensions so fc1 sees 512 features
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)

model = Net()
model.to(device)
print(model)


######################################################################
# To train the network we will use the ``Adam``
# optimizer with weight decay set to 0.0001. At first, we will train with
# a learning rate of 0.01, but we will use a ``scheduler`` to decrease it
# to 0.001 during training.
#

optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)


######################################################################
# Training and Testing the Network
# --------------------------------
#
# Now let’s define a training function that will feed our training data
# into the model and perform the backward pass and optimization steps.
#

def train(model, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        data = data.requires_grad_() #set requires_grad to True for training
        output = model(data)
        output = output.permute(1, 0, 2) #original output dimensions are batchSizex1x10
        loss = F.nll_loss(output[0], target) #the loss function expects a batchSizex10 input
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0: #print training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


######################################################################
# Now that we have a training function, we need one for testing the
# network’s accuracy. We will set the model to ``eval()`` mode and then run
# inference on the test set. Calling ``eval()`` sets the training
# variable in all modules in the network to false. Certain layers like
# batch normalization and dropout layers behave differently during
# training, so this step is crucial for getting correct results.
#

def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        output = output.permute(1, 0, 2)
        pred = output.max(2)[1] #get the index of the max log-probability
        correct += pred.eq(target).cpu().sum().item()
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))


######################################################################
# Finally, we can train and test the network. We will train the network
# for thirty epochs, then reduce the learning rate and train for ten more
# epochs. The network will be tested after each epoch to see how the
# accuracy varies during the training.
#
# .. note:: Due to a build issue, we’ve reduced the number of epochs to 10.
#           Run this sample with 40 epochs locally to get the proper values.
#

log_interval = 20
for epoch in range(1, 11):
    if epoch == 31:
        print("First round of training complete. Setting learning rate to 0.001.")
        scheduler.step()
    train(model, epoch)
    test(model, epoch)


######################################################################
# Conclusion
# ----------
#
# If trained on 9 folders, the network should be more than 50% accurate by
# the end of the training process. Training on fewer folders will result in
# a lower overall accuracy but may be necessary if long runtimes are a
# problem. Greater accuracies can be achieved using deeper CNNs at the
# expense of a larger memory footprint.
#
# For more advanced audio applications, such as speech recognition,
# recurrent neural networks (RNNs) are commonly used. There are also other
# data preprocessing methods, such as finding the mel frequency cepstral
# coefficients (MFCC), that can reduce the size of the dataset.
#

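
######################################################################
# As a rough sketch of that last idea (not used anywhere above, and assuming
# a ``torchaudio`` version that provides ``transforms.MFCC``), MFCC features
# for one of the formatted clips could be computed like this:
#

mfcc_transform = torchaudio.transforms.MFCC(sample_rate = 8000) #the formatted clips are roughly 8kHz
waveform, label = train_set[0] #one formatted clip, shape 1x32000
mfcc = mfcc_transform(waveform)
print(mfcc.shape) #1 x n_mfcc x time, much smaller than the raw waveform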