Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 07de9b2

Browse files
committed
refactor with pkl, add benchmark
1 parent 6132372 commit 07de9b2

8 files changed

Lines changed: 167 additions & 37 deletions

File tree

README.md

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Synonyms
22
Chinese Synonyms for Natural Language Processing and Understanding.
3+
最好的中文近义词工具包。
34

45
```synonyms```可以用于自然语言理解的很多任务:文本对齐,推荐算法,相似度计算,语义偏移等。
56

@@ -31,7 +32,7 @@ synonyms.nearby(人脸) = [
3132
095, 0.525344, 0.524009, 0.523101, 0.516046]]
3233
```
3334

34-
在OOV的情况下,返回 ```[[], []]```
35+
在OOV的情况下,返回 ```[[], []]```,目前的字典大小: 125,792
3536

3637
### synonyms#compare
3738
两个句子的相似度比较
@@ -81,28 +82,44 @@ assert synonyms.compare(sen1, sen2) == 0.0, "the similarity should be zero"
8182

8283
![](assets/2.png)
8384

84-
## Similarity Demo
85+
## Demo
8586
```
8687
$ pip install -r Requirements.txt
8788
$ python demo.py
88-
>> Synonyms on loading ...
89-
>> Synonyms vocabulary size: 125792
90-
Model loaded succeed
91-
人脸: [['图片', '图像', '通过观察', '数字图像', '几何图形', '脸部', '图象', '放大镜', '面孔', 'Mii'], [0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530
92-
095, 0.525344, 0.524009, 0.523101, 0.516046]]
93-
识别: [['辨识', '辨别', '辨认', '标识', '鉴别', '标记', '识别系统', '分辨', '检测', '区分'], [0.872249, 0.764099, 0.725761, 0.702918, 0.68861, 0.678132, 0.663
94-
829, 0.661863, 0.639442, 0.611004]]
9589
```
9690

9791
## Data
9892
```
99-
words.nearby.gz # 近义词汇源数据
100-
words.wc.gz # 词频统计
93+
synonyms/data/words.nearby.x.pklz # compressed pickle object
94+
```
95+
96+
The data is built from [wikidata-corpus](https://github.com/Samurais/wikidata-corpus).
97+
98+
## Benchmark
99+
100+
Tested with Python 3 on a MacBook Pro.
101+
102+
```
103+
python benchmark.py
101104
```
102-
View data with ```zmore```, ```zgrep```, ```zcat```.
103105

104-
data is built based on https://github.com/Samurais/wikidata-corpus.
106+
++++++++++ OS Name and version ++++++++++
107+
Platform: Darwin
108+
Kernel: 16.7.0
109+
Distro: ('', '', '')
110+
Architecture: ('64bit', '')
111+
112+
++++++++++ CPU Cores ++++++++++
113+
Cores: 4
114+
CPU Load: 60
115+
116+
++++++++++ System Memory ++++++++++
105117

118+
meminfo 8GB
119+
120+
Model loaded succeed
121+
>> Synonyms on loading ...
122+
synonyms#nearby: 100000 loops, best of 3 epochs: 0.209 usec per loop
106123

107124
## 声明
108125
[Synonyms](https://github.com/huyingxi/Synonyms)发布证书 GPL 3.0。数据和程序可用于研究和商业产品,必须注明引用和地址,比如发布的任何媒体、期刊、杂志或博客等内容。

benchmark.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#===============================================================================
4+
#
5+
# Copyright (c) 2017 <> All Rights Reserved
6+
#
7+
#
8+
# File: /Users/hain/ai/Synonyms/benchmark.py
9+
# Author: Hai Liang Wang
10+
# Date: 2017-10-21:11:26:53
11+
#
12+
#===============================================================================
13+
14+
"""
15+
16+
"""
17+
from __future__ import print_function
18+
from __future__ import division
19+
20+
__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
21+
__author__ = "Hai Liang Wang"
22+
__date__ = "2017-10-21:11:26:53"
23+
24+
25+
import os
26+
import sys
27+
import platform
28+
import multiprocessing
29+
curdir = os.path.dirname(os.path.abspath(__file__))
30+
sys.path.append(curdir)
31+
32+
if sys.version_info[0] < 3:
33+
reload(sys)
34+
sys.setdefaultencoding("utf-8")
35+
# raise "Must be using Python 3"
36+
37+
import timeit
38+
39+
# Report the host the benchmark runs on so that timings can be compared.
print("\nEnumerating Available System Resources...")

print("\n++++++++++ OS Name and version ++++++++++")

print("Platform:", platform.system())
print("Kernel:", platform.release())
# platform.linux_distribution() was removed in Python 3.8; when it is missing,
# print the empty triple that it used to return on hosts without a distro.
_linux_distribution = getattr(platform, "linux_distribution", None)
print("Distro:", _linux_distribution() if _linux_distribution else ("", "", ""))
print("Architecture:", platform.architecture())

print("\n++++++++++ CPU Cores ++++++++++")
# Sum the third column of `ps aux` over every row (presumably %CPU; the
# header row is included in the sum because of `NR > 0`).
with os.popen("ps aux|awk 'NR > 0{s +=$3};END{print s}'") as pipe:
    p = pipe.read()
print("Cores:", multiprocessing.cpu_count(), '\nCPU Load:', p)

print("\n++++++++++ System Memory ++++++++++\n")
53+
54+
def meminfo():
    """Return the fields of ``/proc/meminfo`` as a dict.

    Each ``name: value`` line becomes one entry; the value is kept as the
    stripped text after the first colon (e.g. ``'8167848 kB'``).

    Returns:
        dict mapping field name to its value string. The dict is empty when
        ``/proc/meminfo`` cannot be read (for example on non-Linux hosts).
    """
    info = dict()
    try:
        # Read the file directly instead of spawning `cat` through a shell.
        with open('/proc/meminfo') as f:
            for line in f:
                key, sep, value = line.partition(':')
                if not sep:
                    # Not a "name: value" line; skip it rather than fail.
                    continue
                info[key] = value.strip()
    except (IOError, OSError):
        # No readable /proc/meminfo here: report "nothing known", which is
        # what the caller already treats as "meminfo unavailable".
        return dict()
    return info
61+
# Best-effort memory report: any failure just prints a placeholder line.
try:
    # Use a separate name so the meminfo() function is not overwritten by
    # its own result.
    mem = meminfo()
    print('Total Memory: {0}'.format(mem['MemTotal']))
    print('Free Memory: {0}'.format(mem['MemFree']))
except Exception:
    # `except Exception` (not a bare except) keeps Ctrl-C and SystemExit
    # working while still covering missing keys and unreadable meminfo.
    print("meminfo unavailable")
67+
68+
def main():
    """Time ``synonyms.nearby`` with ``timeit`` and print the best epoch.

    Runs 3 epochs of 100000 calls each and reports the fastest epoch as
    microseconds per call.
    """
    epochs = 3
    loops = 100000
    unit = "usec"  # microseconds
    per_second = {"usec": 1e6, "msec": 1000, "sec": 1}
    timings = timeit.repeat(
        stmt="synonyms.nearby('人脸')",
        setup="import synonyms",
        repeat=epochs,
        number=loops,
    )
    # Best epoch total -> seconds per call -> requested unit.
    best_per_loop = min(timings) / loops * per_second[unit]
    print("%s: %d loops, best of %d epochs: %.3g %s per loop" %
          ("synonyms#nearby", loops, epochs, best_per_loop, unit))
77+
78+
if __name__ == '__main__':
79+
main()

demo.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
import synonyms # https://github.com/huyingxi/Synonyms
3636
import numpy
3737
import unittest
38-
import thulac
3938

4039
# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
4140
class Test(unittest.TestCase):
@@ -61,17 +60,7 @@ def testSenSimilarity(self):
6160
assert synonyms.compare(sen1, sen2) > 0, "the similarity should be bigger then zero"
6261

6362
def testNearbyWords(self):
64-
thu1 = thulac.thulac() #默认模式
65-
text = thu1.cut("人脸识别", text=True) #进行一句话分词
66-
words, tags = [], []
67-
data = [x.rsplit('_', 1) for x in text.split()]
68-
for _ in data:
69-
assert len(_) == 2, "seg len should be 2"
70-
words.append(_[0])
71-
tags.append(_[1])
72-
for (k,v) in enumerate(tags):
73-
if v.startswith("n") or v.startswith("v"): # 去停,去标,去副词、形容词、代词 etc.
74-
synonyms.display(words[k]) # synonyms.display calls synonyms.nearby
63+
synonyms.display("人脸") # synonyms.display calls synonyms.nearby
7564

7665
def test():
7766
unittest.main()

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"""
1313

1414
setup(name='synonyms',
15-
version='1.3',
15+
version='1.5',
1616
description='Chinese Synonyms for Natural Language Processing and Understanding',
1717
long_description=LONGDOC,
1818
author='Hai Liang Wang, Hu Ying Xi',
@@ -41,5 +41,5 @@
4141
install_requires=[
4242
'thulac==0.1.2',
4343
],
44-
package_data={'synonyms':['**/*gz', 'LICENSE']}
44+
package_data={'synonyms':['**/*.pklz', 'LICENSE']}
4545
)

synonyms/__init__.py

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,18 +38,33 @@
3838

3939
import gzip
4040
import thulac # http://thulac.thunlp.org/
41-
from collections import defaultdict
41+
import shutil
4242

43-
_vocab = defaultdict(lambda: [[], []])
43+
_vocab = dict()
4444
_size = 0
4545
_thulac = thulac.thulac() #默认模式
46-
_fin = []
47-
_fin_path = os.path.join(curdir, 'data', 'words.nearby.gz')
46+
_fin_path = os.path.join(curdir, os.path.pardir, 'tmp', 'words.nearby.gz')
47+
_fin_cached_vocab_path = os.path.join(curdir, 'data', 'words.nearby.%d.pklz' % PLT)
48+
4849
if PLT == 2:
49-
import io
50-
_fin=io.TextIOWrapper(io.BufferedReader(gzip.open(_fin_path)), encoding='utf8', errors='ignore')
50+
import cPickle as pickle
5151
else:
52-
_fin=gzip.open(_fin_path,'rt', encoding='utf-8', errors = "ignore")
52+
import pickle
53+
54+
def dump_pickle_file(file_path, data):
    '''
    Write `data` to `file_path` as a gzip-compressed pickle.

    Anything already at `file_path` is removed first, so the call can be
    repeated to refresh an existing cache file. The highest pickle protocol
    of the running interpreter is used.
    '''
    if os.path.isdir(file_path):
        shutil.rmtree(file_path)
    elif os.path.exists(file_path):
        # shutil.rmtree only works on directories and raises on a regular
        # file, so a stale cache file has to be removed with os.remove.
        os.remove(file_path)
    with gzip.open(file_path, "wb") as fout:
        print("dump pickle file, version ", pickle.HIGHEST_PROTOCOL)
        pickle.dump(data, fout, protocol=pickle.HIGHEST_PROTOCOL)
    print("done.")
61+
62+
def load_pickle_file(file_path):
    '''
    Read a gzip-compressed pickle from `file_path`.

    Returns the unpickled object, or None when `file_path` does not exist.
    '''
    if not os.path.exists(file_path):
        return None
    # NOTE(review): pickle.load executes whatever the file encodes; only
    # point this at cache files the package itself produced.
    with gzip.open(file_path, "rb") as fin:
        return pickle.load(fin)
5368

5469
def add_word_to_vocab(word, nearby, nearby_score):
5570
'''
@@ -67,6 +82,13 @@ def _build_vocab():
6782
'''
6883
Build vocab
6984
'''
85+
_fin = []
86+
if PLT == 2:
87+
import io
88+
_fin=io.TextIOWrapper(io.BufferedReader(gzip.open(_fin_path)), encoding='utf8', errors='ignore')
89+
else:
90+
_fin=gzip.open(_fin_path,'rt', encoding='utf-8', errors = "ignore")
91+
7092
c = None # current word
7193
w = [] # word nearby
7294
s = [] # score of word nearby
@@ -86,15 +108,36 @@ def _build_vocab():
86108
add_word_to_vocab(c, w, s) # add the last word
87109
print(">> Synonyms vocabulary size: %s" % _size)
88110

111+
def _load_vocab():
    '''
    Load the vocab dict, preferring the cached pickle.

    If the cache at `_fin_cached_vocab_path` can be read, it becomes
    `_vocab`. Otherwise the vocab is built once with `_build_vocab()` and
    then written to the cache; a failure to read or write the cache is
    reported and never triggers a second build.
    '''
    global _vocab
    try:
        cached = load_pickle_file(_fin_cached_vocab_path)
    except Exception as e:
        # Unpickling can fail in many ways (truncated file, protocol
        # mismatch, ...); treat every one of them as "no usable cache".
        print(">> Synonyms cached vocabulary is not usable: %s" % e)
        cached = None

    if cached is not None:
        _vocab = cached
        return

    # No cache available: build from the source data without cached policy.
    _build_vocab()
    try:
        dump_pickle_file(_fin_cached_vocab_path, _vocab)
    except Exception as e:
        # The vocab is already in memory; a failed cache write only costs
        # a rebuild on the next import.
        print(">> Synonyms failed to cache vocabulary: %s" % e)
128+
89129
# build on load
90130
print(">> Synonyms on loading ...")
91-
_build_vocab()
131+
_load_vocab()
92132

93133
def nearby(word):
    '''
    Look up `word` in the vocab.

    Returns the stored entry for `word`, or a fresh `[[], []]` when the
    word is out of vocabulary.
    '''
    return _vocab.get(word, [[], []])
98141

99142
def _segment_words(sen):
100143
'''
@@ -144,12 +187,14 @@ def compare(s1, s2):
144187
def display(word):
    '''
    Print the nearby words of `word`, numbered from 1, each with its score.

    Prints an "out of vocabulary" line when `nearby` returns no words.
    '''
    print("'%s'近义词:" % word)
    result = nearby(word)
    assert len(result) == 2, "should contain 2 list"
    words, scores = result[0], result[1]
    if len(words) == 0:
        print(" out of vocabulary")
    for rank, candidate in enumerate(words, 1):
        # scores is indexed in step with words
        print(" %d. %s:%s" % (rank, candidate, scores[rank - 1]))
149194

150-
151195
def main():
    '''
    Show the nearby words for one known word and one missing word.
    '''
    for sample in ("人脸", "NOT_EXIST"):
        display(sample)
153198

154199
if __name__ == '__main__':
155200
main()

synonyms/data/words.nearby.2.pklz

20.9 MB
Binary file not shown.

synonyms/data/words.wc.gz

-784 KB
Binary file not shown.

0 commit comments

Comments
 (0)