File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 2424
2525from torchtext .data .utils import get_tokenizer
2626from torchtext .vocab import build_vocab_from_iterator
27- from torchtext .datasets import Multi30k
27+ from torchtext .datasets import multi30k , Multi30k
2828from typing import Iterable , List
2929
3030
31+ # The original Multi30k download links are broken, so override the dataset URLs with working replacements.
32+ # See https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info.
33+ multi30k .URL ["train" ] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
34+ multi30k .URL ["valid" ] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
35+
3136SRC_LANGUAGE = 'de'
3237TGT_LANGUAGE = 'en'
3338
3742
3843
3944# Create the source and target language tokenizers. Make sure to install the dependencies first.
45+ # pip install -U torchdata
4046# pip install -U spacy
4147# python -m spacy download en_core_web_sm
4248# python -m spacy download de_core_news_sm
You can’t perform that action at this time.
0 commit comments