coder1379
diff --git a/‎README.md‎
Lines changed: 30 additions & 13 deletions b/‎README.md‎
Lines changed: 30 additions & 13 deletions
diff --git a/‎benchmark.py‎
Lines changed: 79 additions & 0 deletions b/‎benchmark.py‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎demo.py‎
Lines changed: 1 addition & 12 deletions b/‎demo.py‎
Lines changed: 1 addition & 12 deletions
diff --git a/‎setup.py‎
Lines changed: 2 additions & 2 deletions b/‎setup.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎synonyms/__init__.py‎
Lines changed: 55 additions & 10 deletions b/‎synonyms/__init__.py‎
Lines changed: 55 additions & 10 deletions
diff --git a/‎synonyms/data/words.nearby.2.pklz‎
20.9 MB b/‎synonyms/data/words.nearby.2.pklz‎
20.9 MB
diff --git a/‎synonyms/data/words.nearby.gz‎ ‎synonyms/data/words.nearby.3.pklz‎synonyms/data/words.nearby.gz renamed to synonyms/data/words.nearby.3.pklz
12 MB b/‎synonyms/data/words.nearby.gz‎ ‎synonyms/data/words.nearby.3.pklz‎synonyms/data/words.nearby.gz renamed to synonyms/data/words.nearby.3.pklz
12 MB
diff --git a/‎synonyms/data/words.wc.gz‎
-784 KB b/‎synonyms/data/words.wc.gz‎
-784 KB
@@ -1,5 +1,6 @@
 # Synonyms
 Chinese Synonyms for Natural Language Processing and Understanding.
+最好的中文近义词工具包。
 
 ```synonyms```可以用于自然语言理解的很多任务：文本对齐，推荐算法，相似度计算，语义偏移等。
 
@@ -31,7 +32,7 @@ synonyms.nearby(人脸) = [
 095, 0.525344, 0.524009, 0.523101, 0.516046]]
 ```
 
-在OOV的情况下，返回  ```[[], []]```。
+在OOV的情况下，返回  ```[[], []]```，目前的字典大小: 125,792。
 
 ### synonyms#compare
 两个句子的相似度比较
@@ -81,28 +82,44 @@ assert synonyms.compare(sen1, sen2) == 0.0, "the similarity should be zero"
 
 ![](assets/2.png)
 
-## Similarity Demo
+## Demo
 ```
 $ pip install -r Requirements.txt
 $ python demo.py
->> Synonyms on loading ...
->> Synonyms vocabulary size: 125792
-Model loaded succeed
-人脸: [['图片', '图像', '通过观察', '数字图像', '几何图形', '脸部', '图象', '放大镜', '面孔', 'Mii'], [0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530
-095, 0.525344, 0.524009, 0.523101, 0.516046]]
-识别: [['辨识', '辨别', '辨认', '标识', '鉴别', '标记', '识别系统', '分辨', '检测', '区分'], [0.872249, 0.764099, 0.725761, 0.702918, 0.68861, 0.678132, 0.663
-829, 0.661863, 0.639442, 0.611004]]
 ```
 
 ## Data
 ```
-words.nearby.gz # 近义词汇源数据
-words.wc.gz     # 词频统计
+synonyms/data/words.nearby.x.pklz # compressed pickle object
+```
+
+Data is built from [wikidata-corpus](https://github.com/Samurais/wikidata-corpus).
+
+## Benchmark
+
+Tested with Python 3 on a MacBook Pro.
+
+```
+python benchmark.py
 ```
-View data with ```zmore```, ```zgrep```, ```zcat```.
 
-data is built based on https://github.com/Samurais/wikidata-corpus.
+```
+++++++++++ OS Name and version ++++++++++
+Platform: Darwin
+Kernel: 16.7.0
+Distro: ('', '', '')
+Architecture: ('64bit', '')
+
+++++++++++ CPU Cores ++++++++++
+Cores: 4
+CPU Load: 60
+
+++++++++++ System Memory ++++++++++
 
+meminfo 8GB
+
+Model loaded succeed
+>> Synonyms on loading ...
+synonyms#nearby: 100000 loops, best of 3 epochs: 0.209 usec per loop
+```
 
 ## 声明
 [Synonyms](https://github.com/shuzi/insuranceQA)发布证书 GPL 3.0。数据和程序可用于研究和商业产品，必须注明引用和地址，比如发布的任何媒体、期刊、杂志或博客等内容。
 
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#===============================================================================
+#
+# Copyright (c) 2017 <> All Rights Reserved
+#
+#
+# File: /Users/hain/ai/Synonyms/benchmark.py
+# Author: Hai Liang Wang
+# Date: 2017-10-21:11:26:53
+#
+#===============================================================================
+
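+"""
+Micro-benchmark for synonyms#nearby: report basic host information, then time the lookup with timeit.
+"""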
+from __future__ import print_function
+from __future__ import division
+
+__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
+__author__    = "Hai Liang Wang"
+__date__      = "2017-10-21:11:26:53"
+
+
+import os
+import sys
+import platform
+import multiprocessing
+curdir = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(curdir)
+
+if sys.version_info[0] < 3:
+    reload(sys)
+    sys.setdefaultencoding("utf-8")
+    # raise "Must be using Python 3"
+
+import timeit
+
+print("\nEnumerating Available System Resources...")
+
+print("\n++++++++++ OS Name and version ++++++++++")
+
+print("Platform:", platform.system())
+print("Kernel:", platform.release())
+print("Distro:", platform.linux_distribution())
+print("Architecture:", platform.architecture())
+
+print("\n++++++++++ CPU Cores ++++++++++")
+p = os.popen("ps aux|awk 'NR > 0{s +=$3};END{print s}'").read()
+print("Cores:", multiprocessing.cpu_count(), '\nCPU Load:', p)
+
+print("\n++++++++++ System Memory ++++++++++\n")
+
+def meminfo():
+    meminfo=dict()
+
+    with os.popen('cat /proc/meminfo') as f:
+        for line in f:
+            meminfo[line.split(':')[0]] = line.split(':')[1].strip()
+    return meminfo
+try:
+    meminfo = meminfo()
+    print('Total Memory: {0}'.format(meminfo['MemTotal']))
+    print('Free Memory: {0}'.format(meminfo['MemFree']))
+except:
+    print("meminfo unavailable")
+
+def main():
+    repeat = 3
+    number = 100000
+    unit = "usec" #  微秒
+    unittosec = {"usec": 1e6, "msec": 1000, "sec": 1}
+    result = timeit.repeat("synonyms.nearby('人脸')", "import synonyms", number=number, repeat=repeat)
+    print("%s: %d loops, best of %d epochs: %.3g %s per loop" %
+              ("synonyms#nearby", number, repeat,
+               min(result) / number * unittosec[unit], unit))
+
+if __name__ == '__main__':
+    main()
@@ -35,7 +35,6 @@
 import synonyms # https://github.com/huyingxi/Synonyms
 import numpy
 import unittest
-import thulac
 
 # run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
 class Test(unittest.TestCase):
@@ -61,17 +60,7 @@ def testSenSimilarity(self):
         assert synonyms.compare(sen1, sen2) > 0, "the similarity should be bigger then zero"
 
     def testNearbyWords(self):
-        thu1 = thulac.thulac() #默认模式
-        text = thu1.cut("人脸识别", text=True)  #进行一句话分词
-        words, tags = [], []
-        data = [x.rsplit('_', 1) for x in text.split()]
-        for _ in data:
-            assert len(_) == 2, "seg len should be 2"
-            words.append(_[0])
-            tags.append(_[1])
-        for (k,v) in enumerate(tags):
-            if v.startswith("n") or v.startswith("v"): # 去停，去标，去副词、形容词、代词 etc.
-                synonyms.display(words[k]) # synonyms.display calls synonyms.nearby
+        synonyms.display("人脸") # synonyms.display calls synonyms.nearby
 
 def test():
     unittest.main()
 
@@ -12,7 +12,7 @@
 """
 
 setup(name='synonyms',
-      version='1.3',
+      version='1.5',
       description='Chinese Synonyms for Natural Language Processing and Understanding',
       long_description=LONGDOC,
       author='Hai Liang Wang, Hu Ying Xi',
@@ -41,5 +41,5 @@
       install_requires=[
           'thulac==0.1.2',
       ],
-      package_data={'synonyms':['**/*gz', 'LICENSE']}
+      package_data={'synonyms':['**/*.pklz', 'LICENSE']}
 )
@@ -38,18 +38,33 @@
 
 import gzip
 import thulac # http://thulac.thunlp.org/
-from collections import defaultdict
+import shutil
 
-_vocab = defaultdict(lambda: [[], []])
+_vocab = dict()
 _size = 0
 _thulac = thulac.thulac() #默认模式
-_fin = []
-_fin_path = os.path.join(curdir, 'data', 'words.nearby.gz')
+_fin_path = os.path.join(curdir, os.path.pardir, 'tmp', 'words.nearby.gz')
+_fin_cached_vocab_path = os.path.join(curdir, 'data', 'words.nearby.%d.pklz' % PLT)
+
 if PLT == 2:
-    import io
-    _fin=io.TextIOWrapper(io.BufferedReader(gzip.open(_fin_path)), encoding='utf8', errors='ignore')
+    import cPickle as pickle
 else:
-    _fin=gzip.open(_fin_path,'rt', encoding='utf-8', errors = "ignore")
+    import pickle
+
+def dump_pickle_file(file_path, data):
+    if os.path.exists(file_path):
+        shutil.rmtree(file_path)
+    with gzip.open(file_path, "wb") as fout:
+        print("dump pickle file, version ", pickle.HIGHEST_PROTOCOL)
+        pickle.dump(data, fout, protocol=pickle.HIGHEST_PROTOCOL)
+        print("done.")
+
+def load_pickle_file(file_path):
+    if os.path.exists(file_path):
+        with gzip.open(file_path, "rb") as fin:
+            return pickle.load(fin)
+    else: 
+        return None
 
 def add_word_to_vocab(word, nearby, nearby_score):
     '''
@@ -67,6 +82,13 @@ def _build_vocab():
     '''
     Build vocab
     '''
+    _fin = []
+    if PLT == 2:
+        import io
+        _fin=io.TextIOWrapper(io.BufferedReader(gzip.open(_fin_path)), encoding='utf8', errors='ignore')
+    else:
+        _fin=gzip.open(_fin_path,'rt', encoding='utf-8', errors = "ignore")
+
     c = None # current word
     w = []   # word nearby 
     s = []   # score of word nearby
@@ -86,15 +108,36 @@ def _build_vocab():
     add_word_to_vocab(c, w, s) # add the last word
     print(">> Synonyms vocabulary size: %s" % _size)
 
+def _load_vocab():
+    '''
+    load vocab dict
+    '''
+    global _vocab
+    try:
+        o = load_pickle_file(_fin_cached_vocab_path)
+        if o is None:
+            _build_vocab()
+            dump_pickle_file(_fin_cached_vocab_path, _vocab)
+        else:
+            _vocab = o
+    except Exception as e:
+        '''
+        Just load the data without cached policy
+        '''
+        _build_vocab()
+
 # build on load
 print(">> Synonyms on loading ...")
-_build_vocab()
+_load_vocab()
 
 def nearby(word):
     '''
     Nearby word
     '''
-    return _vocab[word]
+    try:
+        return _vocab[word]
+    except KeyError as e:
+        return [[],[]]
 
 def _segment_words(sen):
     '''
@@ -144,12 +187,14 @@ def compare(s1, s2):
 def display(word):
     print("'%s'近义词：" % word)
     o = nearby(word)
+    assert len(o) == 2, "should contain 2 list"
+    if len(o[0]) == 0: print(" out of vocabulary")
     for k,v in enumerate(o[0]):
         print("  %d. %s:%s" %(k+1, v, o[1][k]))
 
-
 def main():
     display("人脸")
+    display("NOT_EXIST")
 
 if __name__ == '__main__':
     main()