@@ -134,7 +134,7 @@ def _load_w2v(model_file=_f_model, binary=True):
134134print (">> Synonyms on loading vectors [%s] ..." % _f_model )
135135_vectors = _load_w2v (model_file = _f_model )
136136
137- def _get_wv (sentence ):
137+ def _get_wv (sentence , ignore = False ):
138138 '''
139139 get word2vec data by sentence
140140 sentence is segmented string.
@@ -151,10 +151,13 @@ def _get_wv(sentence):
151151 try :
152152 c .append (_vectors .word_vec (y_ ))
153153 except KeyError as error :
154- logging .warn ("not exist in w2v model: %s" % y_ )
155- # c.append(np.zeros((100,), dtype=float))
156- random_state = np .random .RandomState (seed = (hash (y_ ) % (2 ** 32 - 1 )))
157- c .append (random_state .uniform (low = - 10.0 , high = 10.0 , size = (100 ,)))
154+ if ignore :
155+ continue
156+ else :
157+ logging .warning ("not exist in w2v model: %s" % y_ )
158+ # c.append(np.zeros((100,), dtype=float))
159+ random_state = np .random .RandomState (seed = (hash (y_ ) % (2 ** 32 - 1 )))
160+ c .append (random_state .uniform (low = - 10.0 , high = 10.0 , size = (100 ,)))
158161 for n in syns :
159162 if n is None : continue
160163 try :
@@ -223,13 +226,13 @@ def _nearby_levenshtein_distance(s1, s2):
223226 s = np .sum (scores ) / maxlen
224227 return s
225228
226- def _similarity_distance (s1 , s2 ):
229+ def _similarity_distance (s1 , s2 , ignore ):
227230 '''
228231 compute similarity with distance measurement
229232 '''
230233 g = 0.0
231234 try :
232- g_ = cosine (_flat_sum_array (_get_wv (s1 )), _flat_sum_array (_get_wv (s2 )))
235+ g_ = cosine (_flat_sum_array (_get_wv (s1 , ignore )), _flat_sum_array (_get_wv (s2 , ignore )))
233236 if is_digit (g_ ): g = g_
234237 except : pass
235238
@@ -275,7 +278,7 @@ def nearby(word):
275278 _cache_nearby [w ] = (words , scores )
276279 return words , scores
277280
278- def compare (s1 , s2 , seg = True ):
281+ def compare (s1 , s2 , seg = True , ignore = False ):
279282 '''
280283 compare similarity
281284 s1 : sentence1
@@ -291,7 +294,7 @@ def compare(s1, s2, seg=True):
291294 s1 = s1 .split ()
292295 s2 = s2 .split ()
293296 assert len (s1 ) > 0 and len (s2 ) > 0 , "The length of s1 and s2 should > 0."
294- return _similarity_distance (s1 , s2 )
297+ return _similarity_distance (s1 , s2 , ignore )
295298
296299def display (word ):
297300 print ("'%s'近义词:" % word )
0 commit comments