@@ -441,20 +441,20 @@ def forward_step(self, input, hidden):
 # :alt:
 #
 #
-# Bahdanau attention, also known as additive attention, is a commonly used 
-# attention mechanism in sequence-to-sequence models, particularly in neural 
-# machine translation tasks. It was introduced by Bahdanau et al. in their 
-# paper titled `Neural Machine Translation by Jointly Learning to Align and Translate <https://arxiv.org/pdf/1409.0473.pdf>`__. 
-# This attention mechanism employs a learned alignment model to compute attention 
-# scores between the encoder and decoder hidden states. It utilizes a feed-forward 
+# Bahdanau attention, also known as additive attention, is a commonly used
+# attention mechanism in sequence-to-sequence models, particularly in neural
+# machine translation tasks. It was introduced by Bahdanau et al. in their
+# paper titled `Neural Machine Translation by Jointly Learning to Align and Translate <https://arxiv.org/pdf/1409.0473.pdf>`__.
+# This attention mechanism employs a learned alignment model to compute attention
+# scores between the encoder and decoder hidden states. It utilizes a feed-forward
 # neural network to calculate alignment scores.
 #
-# However, there are alternative attention mechanisms available, such as Luong attention, 
-# which computes attention scores by taking the dot product between the decoder hidden 
-# state and the encoder hidden states. It does not involve the non-linear transformation 
+# However, there are alternative attention mechanisms available, such as Luong attention,
+# which computes attention scores by taking the dot product between the decoder hidden
+# state and the encoder hidden states. It does not involve the non-linear transformation
 # used in Bahdanau attention.
 #
-# In this tutorial, we will be using Bahdanau attention. However, it would be a valuable 
+# In this tutorial, we will be using Bahdanau attention. However, it would be a valuable
 # exercise to explore modifying the attention mechanism to use Luong attention.
 
 class BahdanauAttention(nn.Module):
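For the Luong-attention exercise suggested above, here is a minimal, hedged sketch of what a dot-product scoring module could look like. It is not part of the tutorial's code; it only assumes the same shapes that ``BahdanauAttention`` works with, namely a ``(batch, 1, hidden)`` query and ``(batch, seq_len, hidden)`` keys:

import torch
import torch.nn as nn
import torch.nn.functional as F

class LuongDotAttention(nn.Module):
    """Illustrative dot-product (Luong-style) attention: no learned alignment network."""
    def forward(self, query, keys):
        # query: (batch, 1, hidden), keys: (batch, seq_len, hidden)
        scores = torch.bmm(query, keys.transpose(1, 2))  # (batch, 1, seq_len)
        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)               # (batch, 1, hidden)
        return context, weights

Because the scores are plain dot products, this variant has no parameters of its own, whereas Bahdanau attention learns the ``Wa``, ``Ua``, and ``Va`` projections.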
@@ -467,7 +467,7 @@ def __init__(self, hidden_size):
     def forward(self, query, keys):
         scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
         scores = scores.squeeze(2).unsqueeze(1)
-        
+
         weights = F.softmax(scores, dim=-1)
         context = torch.bmm(weights, keys)
 
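As a quick sanity check on the ``forward`` above, the following standalone snippet traces the tensor shapes through the module. The batch size, source length, and ``hidden_size`` are arbitrary illustrative values, and it assumes ``forward`` returns ``context, weights`` as in the tutorial:

import torch

attention = BahdanauAttention(hidden_size=8)
query = torch.randn(2, 1, 8)   # current decoder hidden state for a batch of 2
keys = torch.randn(2, 5, 8)    # encoder outputs for a source sequence of length 5

context, weights = attention(query, keys)
print(context.shape)   # torch.Size([2, 1, 8])  -- one context vector per example
print(weights.shape)   # torch.Size([2, 1, 5])  -- one weight per source position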
@@ -605,9 +605,9 @@ def get_dataloader(batch_size):
 # ``teacher_forcing_ratio`` up to use more of it.
 #
 
-def train_epoch(dataloader, encoder, decoder, encoder_optimizer, 
+def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
           decoder_optimizer, criterion):
-    
+
     total_loss = 0
     for data in dataloader:
         input_tensor, target_tensor = data
@@ -617,7 +617,7 @@ def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
 
         encoder_outputs, encoder_hidden = encoder(input_tensor)
         decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)
-        
+
         loss = criterion(
             decoder_outputs.view(-1, decoder_outputs.size(-1)),
             target_tensor.view(-1)
@@ -628,7 +628,7 @@ def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
         decoder_optimizer.step()
 
         total_loss += loss.item()
-    
+
     return total_loss / len(dataloader)
 
 
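The ``teacher_forcing_ratio`` mentioned in the comment at the top of this hunk is left as a suggestion by the tutorial; as written, the decoder teacher-forces whenever a ``target_tensor`` is passed in. A hedged sketch of how a ratio-based choice could look inside a decoding loop is shown below (the helper name and the ratio value are illustrative assumptions, not the tutorial's code):

import random
import torch

teacher_forcing_ratio = 0.5  # illustrative value

def choose_next_decoder_input(decoder_output, target_tensor, step):
    # With probability teacher_forcing_ratio, feed the ground-truth target token;
    # otherwise feed the decoder's own highest-scoring prediction.
    if target_tensor is not None and random.random() < teacher_forcing_ratio:
        return target_tensor[:, step].unsqueeze(1)    # (batch, 1) ground-truth token ids
    _, topi = decoder_output.topk(1)                  # decoder_output: (batch, 1, vocab)
    return topi.squeeze(-1).detach()                  # (batch, 1) predicted token ids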
@@ -671,7 +671,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
     plot_losses = []
     print_loss_total = 0  # Reset every print_every
     plot_loss_total = 0  # Reset every plot_every
-    
+
     encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
     decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
     criterion = nn.NLLLoss()
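One detail behind the ``criterion`` above: ``nn.NLLLoss`` expects log-probabilities rather than raw logits, so it pairs with a log-softmax over the decoder outputs. The short standalone check below illustrates the standard equivalence of ``log_softmax`` + ``NLLLoss`` with ``CrossEntropyLoss`` on raw logits (the tensors are made-up illustrative data):

import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10)             # 4 illustrative samples, 10 classes
targets = torch.tensor([1, 0, 3, 9])

nll = nn.NLLLoss()(F.log_softmax(logits, dim=-1), targets)
ce = nn.CrossEntropyLoss()(logits, targets)
print(torch.allclose(nll, ce))           # True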
@@ -680,7 +680,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
         loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
         print_loss_total += loss
         plot_loss_total += loss
-        
+
         if epoch % print_every == 0:
             print_loss_avg = print_loss_total / print_every
             print_loss_total = 0
@@ -691,7 +691,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
             plot_loss_avg = plot_loss_total / plot_every
             plot_losses.append(plot_loss_avg)
             plot_loss_total = 0
-    
+
     showPlot(plot_losses)
 
 ######################################################################
@@ -736,7 +736,7 @@ def evaluate(encoder, decoder, sentence, input_lang, output_lang):
 
         _, topi = decoder_outputs.topk(1)
         decoded_ids = topi.squeeze()
-        
+
         decoded_words = []
         for idx in decoded_ids:
             if idx.item() == EOS_token:
@@ -793,7 +793,9 @@ def evaluateRandomly(encoder, decoder, n=10):
 
 ######################################################################
 #
-
+# Set dropout layers to ``eval`` mode
+encoder.eval()
+decoder.eval()
 evaluateRandomly(encoder, decoder)
 
 
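The newly added ``encoder.eval()`` / ``decoder.eval()`` calls matter because both networks contain dropout layers, which behave differently in training and evaluation mode. A tiny standalone illustration (the tensor values are arbitrary, purely for demonstration):

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(1, 6)

drop.train()
print(drop(x))   # roughly half the entries zeroed, the rest scaled by 1/(1-p)

drop.eval()
print(drop(x))   # identical to x: dropout is a no-op in eval mode

If you go back to training afterwards, remember to switch the modules back with ``encoder.train()`` and ``decoder.train()``.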
@@ -807,7 +809,7 @@ def evaluateRandomly(encoder, decoder, n=10):
 # at each time step.
 #
 # You could simply run ``plt.matshow(attentions)`` to see attention output
-# displayed as a matrix. For a better viewing experience we will do the 
+# displayed as a matrix. For a better viewing experience we will do the
 # extra work of adding axes and labels:
 #
 
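The tutorial's own plotting helper follows this point in the file. As a rough, hedged sketch of what "adding axes and labels" to ``plt.matshow`` can look like (the function name, variable names, and sample tokens below are illustrative, not the tutorial's):

import matplotlib.pyplot as plt
import numpy as np

def show_attention_sketch(input_words, output_words, attentions):
    # attentions: 2D array, one row per output word, one column per input word
    fig, ax = plt.subplots()
    cax = ax.matshow(np.asarray(attentions), cmap='bone')
    fig.colorbar(cax)

    # one tick per word, labelled with the actual tokens
    ax.set_xticks(range(len(input_words)))
    ax.set_xticklabels(input_words, rotation=90)
    ax.set_yticks(range(len(output_words)))
    ax.set_yticklabels(output_words)
    plt.show()

# illustrative call with made-up attention weights
show_attention_sketch(['il', 'est', 'parti', '<EOS>'],
                      ['he', 'left', '<EOS>'],
                      np.random.rand(3, 4))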