Commit eb37057

Matthias Winkelmann authored and waterson committed
Fixed links to evaluation data in makefile (tensorflow#5402)
1 parent aec1fec commit eb37057

5 files changed: 17 additions & 12 deletions

research/swivel/.gitignore

Lines changed: 3 additions & 1 deletion
@@ -6,7 +6,9 @@ Mtruk.csv
 SimLex-999.zip
 analogy
 fastprep
-myz_naacl13_test_set.tgz
+*.dSYM
 questions-words.txt
+word_relationship.*
+tensorflow/
 rw.zip
 ws353simrel.tar.gz

research/swivel/README.md

Lines changed: 2 additions & 2 deletions
@@ -155,10 +155,10 @@ You can do some simple exploration using `nearest.py`:
     ...
 
 To evaluate the embeddings using common word similarity and analogy datasets,
-use `eval.mk` to retrieve the data sets and build the tools:
+use `eval.mk` to retrieve the data sets and build the tools. Note that wordsim is currently not compatible with Python 3.x.
 
     make -f eval.mk
-    ./wordsim.py -v vocab.txt -e vecs.bin *.ws.tab
+    ./wordsim.py --vocab vocab.txt --embeddings vecs.bin *.ws.tab
     ./analogy --vocab vocab.txt --embeddings vecs.bin *.an.tab
 
 The word similarity evaluation compares the embeddings' estimate of "similarity"
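The README now calls `wordsim.py` with the same long-form flags that the `analogy` binary already takes. The commit does not touch `wordsim.py` itself, so the following `argparse` stub is only a hypothetical sketch of the interface the updated command line implies; the real script's parser may look different.

    # Hypothetical sketch of the flag handling implied by the README change;
    # the actual wordsim.py in research/swivel may differ.
    import argparse

    parser = argparse.ArgumentParser(description='Word similarity evaluation.')
    parser.add_argument('--vocab', required=True,
                        help='text file listing one vocabulary token per line')
    parser.add_argument('--embeddings', required=True,
                        help='binary file of float32 row vectors (e.g. vecs.bin)')
    parser.add_argument('tab_files', nargs='+',
                        help='*.ws.tab files with word pairs and human scores')
    args = parser.parse_args()
    print(args.vocab, args.embeddings, len(args.tab_files))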

research/swivel/eval.mk

Lines changed: 11 additions & 8 deletions
@@ -59,9 +59,9 @@ simlex999.ws.tab: SimLex-999.zip
 mikolov.an.tab: questions-words.txt
 	egrep -v -E '^:' $^ | tr '[A-Z] ' '[a-z]\t' > $@
 
-msr.an.tab: myz_naacl13_test_set.tgz
-	tar Oxfz $^ test_set/word_relationship.questions | tr ' ' '\t' > /tmp/q
-	tar Oxfz $^ test_set/word_relationship.answers | cut -f2 -d ' ' > /tmp/a
+msr.an.tab: word_relationship.questions word_relationship.answers
+	cat word_relationship.questions | tr ' ' '\t' > /tmp/q
+	cat word_relationship.answers | cut -f2 -d ' ' > /tmp/a
 	paste /tmp/q /tmp/a > $@
 	rm -f /tmp/q /tmp/a
 
@@ -75,7 +75,7 @@ MEN.tar.gz:
 	wget http://clic.cimec.unitn.it/~elia.bruni/resources/MEN.tar.gz
 
 Mtruk.csv:
-	wget http://tx.technion.ac.il/~kirar/files/Mtruk.csv
+	wget http://www.kiraradinsky.com/files/Mtruk.csv
 
 rw.zip:
 	wget http://www-nlp.stanford.edu/~lmthang/morphoNLM/rw.zip
 
@@ -84,15 +84,18 @@ SimLex-999.zip:
 	wget http://www.cl.cam.ac.uk/~fh295/SimLex-999.zip
 
 questions-words.txt:
-	wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt
+	wget http://download.tensorflow.org/data/questions-words.txt
 
-myz_naacl13_test_set.tgz:
-	wget http://research.microsoft.com/en-us/um/people/gzweig/Pubs/myz_naacl13_test_set.tgz
+word_relationship.questions:
+	wget https://github.com/darshanhegde/SNLPProject/raw/master/word2vec/eval/word_relationship.questions
+
+word_relationship.answers:
+	wget https://github.com/darshanhegde/SNLPProject/raw/master/word2vec/eval/word_relationship.answers
 
 analogy: analogy.cc
 
 clean:
 	rm -f *.ws.tab *.an.tab analogy *.pyc
 
 distclean: clean
-	rm -f *.tgz *.tar.gz *.zip Mtruk.csv questions-words.txt
+	rm -f *.tgz *.tar.gz *.zip Mtruk.csv questions-words.txt word_relationship.{questions,answers}
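For readers who do not want to trace the shell pipeline, the rewritten `msr.an.tab` recipe can be read as: turn each question line into tab-separated columns, take the second space-separated field of each answer line, and paste the two files side by side. The Python sketch below is a rough equivalent, assuming the two `word_relationship.*` files are already in the working directory; it is illustrative only and not part of the commit.

    # Illustrative Python equivalent of the new msr.an.tab recipe (not in the commit).
    # Mirrors: tr ' ' '\t' on the questions, cut -f2 -d ' ' on the answers,
    # then paste the two streams into a single tab-separated file.
    with open('word_relationship.questions') as questions, \
         open('word_relationship.answers') as answers, \
         open('msr.an.tab', 'w') as out:
      for q_line, a_line in zip(questions, answers):
        question = q_line.rstrip('\n').replace(' ', '\t')
        fields = a_line.rstrip('\n').split(' ')
        answer = fields[1] if len(fields) > 1 else fields[0]  # like cut -f2 -d ' '
        out.write(question + '\t' + answer + '\n')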

research/swivel/swivel.py

File mode changed from 100644 to 100755.

research/swivel/vecs.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def __init__(self, vocab_filename, rows_filename, cols_filename=None):
         'unexpected file size for binary vector file %s' % rows_filename)
 
     # Memory map the rows.
-    dim = size / (4 * n)
+    dim = round(size / (4 * n))
     rows_mm = mmap.mmap(rows_fh.fileno(), 0, prot=mmap.PROT_READ)
     rows = np.matrix(
         np.frombuffer(rows_mm, dtype=np.float32).reshape(n, dim))
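The `vecs.py` change is a Python 3 fix: under Python 3, `/` is true division, so `size / (4 * n)` yields a float even when the division is exact, and recent NumPy versions reject a float dimension in `reshape`. Wrapping the expression in `round()` restores an integer (floor division with `//` would behave the same for exact multiples). A minimal sketch of the failure mode, with made-up sizes, purely to illustrate the semantics:

    import numpy as np

    n = 1000              # made-up vocabulary size
    size = 4 * n * 300    # bytes in a file of 300-dim float32 row vectors

    dim = size / (4 * n)            # Python 3 true division -> 300.0 (a float)
    # np.zeros(n * 300, np.float32).reshape(n, dim)   # TypeError on recent NumPy

    dim = round(size / (4 * n))     # the committed fix -> 300 (an int)
    rows = np.zeros(n * 300, np.float32).reshape(n, dim)
    print(rows.shape)               # (1000, 300)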
