@@ -211,6 +211,7 @@ def batchify(data, bsz):
211211# ``N`` is along dimension 1.
212212#
213213
214+ # In 'run'
214215 bptt = 35
215216 def get_batch (source , i ):
216217 seq_len = min (bptt , len (source ) - 1 - i )
@@ -246,6 +247,7 @@ def get_batch(source, i):
246247# allows the Pipe to work with only two partitions and avoid any
247248# cross-partition overheads.
248249
250+ # In 'run'
249251 ntokens = len (vocab .stoi ) # the size of vocabulary
250252 emsize = 4096 # embedding dimension
251253 nhid = 4096 # the dimension of the feedforward network model in nn.TransformerEncoder
@@ -317,6 +319,7 @@ def get_total_params(module: torch.nn.Module):
317319# function to scale all the gradient together to prevent exploding.
318320#
319321
322+ # In 'run'
320323 criterion = nn .CrossEntropyLoss ()
321324 lr = 5.0 # learning rate
322325 optimizer = torch .optim .SGD (model .parameters (), lr = lr )
@@ -380,6 +383,7 @@ def evaluate(eval_model, data_source):
380383# Loop over epochs. Save the model if the validation loss is the best
381384# we've seen so far. Adjust the learning rate after each epoch.
382385
386+ # In 'run'
383387 best_val_loss = float ("inf" )
384388 epochs = 3 # The number of epochs
385389 best_model = None
@@ -410,6 +414,7 @@ def evaluate(eval_model, data_source):
410414######################################################################
411415# Apply the best model to check the result with the test dataset.
412416
417+ # In 'run'
413418 test_loss = evaluate (best_model , test_data )
414419 print ('=' * 89 )
415420 print ('| End of training | test loss {:5.2f} | test ppl {:8.2f}' .format (
0 commit comments