Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 86e24d5

Browse files
committed
Merge branch 'master' of github.com:huyingxi/Synonyms
2 parents ae69e67 + eaf2d16 commit 86e24d5

2 files changed

Lines changed: 82 additions & 10 deletions

File tree

synonyms/data/stopwords.txt

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1595,4 +1595,73 @@
15951595
非特
15961596
非独
15971597
高兴
1598-
若果
1598+
若果
1599+
·
1600+
~
1601+
-
1602+
——
1603+
=
1604+
+
1605+
1606+
{
1607+
}
1608+
1609+
1610+
|
1611+
1612+
1613+
1614+
1615+
1616+
1617+
1618+
1619+
1620+
1621+
/
1622+
1623+
*
1624+
1625+
@
1626+
#
1627+
1628+
%
1629+
……
1630+
&
1631+
1632+
1633+
`
1634+
~
1635+
!
1636+
@
1637+
#
1638+
$
1639+
%
1640+
^
1641+
&
1642+
(
1643+
)
1644+
[
1645+
]
1646+
|
1647+
\
1648+
;
1649+
:
1650+
'
1651+
"
1652+
,
1653+
<
1654+
.
1655+
>
1656+
/
1657+
?
1658+
0
1659+
1
1660+
2
1661+
3
1662+
4
1663+
5
1664+
6
1665+
7
1666+
8
1667+
9

synonyms/synonyms.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def _load_w2v(model_file=_f_model, binary=True):
134134
print(">> Synonyms on loading vectors [%s] ..." % _f_model)
135135
_vectors = _load_w2v(model_file=_f_model)
136136

137-
def _get_wv(sentence):
137+
def _get_wv(sentence, ignore=False):
138138
'''
139139
get word2vec data by sentence
140140
sentence is segmented string.
@@ -151,10 +151,13 @@ def _get_wv(sentence):
151151
try:
152152
c.append(_vectors.word_vec(y_))
153153
except KeyError as error:
154-
logging.warn("not exist in w2v model: %s" % y_)
155-
# c.append(np.zeros((100,), dtype=float))
156-
random_state = np.random.RandomState(seed=(hash(y_) % (2**32 - 1)))
157-
c.append(random_state.uniform(low=-10.0, high=10.0, size=(100,)))
154+
if ignore:
155+
continue
156+
else:
157+
logging.warning("not exist in w2v model: %s" % y_)
158+
# c.append(np.zeros((100,), dtype=float))
159+
random_state = np.random.RandomState(seed=(hash(y_) % (2**32 - 1)))
160+
c.append(random_state.uniform(low=-10.0, high=10.0, size=(100,)))
158161
for n in syns:
159162
if n is None: continue
160163
try:
@@ -223,13 +226,13 @@ def _nearby_levenshtein_distance(s1, s2):
223226
s = np.sum(scores) / maxlen
224227
return s
225228

226-
def _similarity_distance(s1, s2):
229+
def _similarity_distance(s1, s2, ignore):
227230
'''
228231
compute similarity with distance measurement
229232
'''
230233
g = 0.0
231234
try:
232-
g_ = cosine(_flat_sum_array(_get_wv(s1)), _flat_sum_array(_get_wv(s2)))
235+
g_ = cosine(_flat_sum_array(_get_wv(s1, ignore)), _flat_sum_array(_get_wv(s2, ignore)))
233236
if is_digit(g_): g = g_
234237
except: pass
235238

@@ -275,7 +278,7 @@ def nearby(word):
275278
_cache_nearby[w] = (words, scores)
276279
return words, scores
277280

278-
def compare(s1, s2, seg=True):
281+
def compare(s1, s2, seg=True, ignore=False):
279282
'''
280283
compare similarity
281284
s1 : sentence1
@@ -291,7 +294,7 @@ def compare(s1, s2, seg=True):
291294
s1 = s1.split()
292295
s2 = s2.split()
293296
assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
294-
return _similarity_distance(s1, s2)
297+
return _similarity_distance(s1, s2, ignore)
295298

296299
def display(word):
297300
print("'%s'近义词:" % word)

0 commit comments

Comments
 (0)