Updated URLs for dataset (#1955)

Nayef211 · web-flow · commit bc11e8f247b4 · 2022-06-22T19:18:33.000-07:00
diff --git a/beginner_source/translation_transformer.py b/beginner_source/translation_transformer.py
@@ -24,10 +24,15 @@
 
 from torchtext.data.utils import get_tokenizer
 from torchtext.vocab import build_vocab_from_iterator
-from torchtext.datasets import Multi30k
+from torchtext.datasets import multi30k, Multi30k
 from typing import Iterable, List
 
 
+# The links to the original Multi30k dataset are broken, so override the train/valid download URLs used by torchtext.
+# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
+multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
+multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
+
 SRC_LANGUAGE = 'de'
 TGT_LANGUAGE = 'en'
 
@@ -37,6 +42,7 @@
 
 
 # Create source and target language tokenizer. Make sure to install the dependencies.
+# pip install -U torchdata
 # pip install -U spacy
 # python -m spacy download en_core_web_sm
 # python -m spacy download de_core_news_sm