#!/usr/bin/env python
# -*- coding: utf-8 -*-
#=========================================================================
#
# Copyright (c) 2017 <> All Rights Reserved
#
#
# File: /Users/hain/ai/Synonyms/synonyms/__init__.py
# Author: Hai Liang Wang
# Date: 2017-09-27
#
#=========================================================================
"""
|
|
|
|
Chinese Synonyms for Natural Language Processing and Understanding.
|
|
|
|
"""
|
|
|
|
from __future__ import print_function
|
|
|
|
from __future__ import division
|
|
|
|
|
|
|
|
__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
|
2017-10-31 16:54:55 +08:00
|
|
|
__author__ = "Hu Ying Xi<>, Hai Liang Wang<hailiang.hl.wang@gmail.com>"
|
|
|
|
__date__ = "2017-09-27"
|
2017-09-27 15:27:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import numpy as np

curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

PLT = 2  # major version of the running Python interpreter

if sys.version_info[0] < 3:
    default_stdout = sys.stdout
    default_stderr = sys.stderr
    reload(sys)
    sys.stdout = default_stdout
    sys.stderr = default_stderr
    sys.setdefaultencoding("utf-8")
    # raise "Must be using Python 3"
else:
    PLT = 3

import json
import gzip
import shutil
from synonyms.word2vec import KeyedVectors
from synonyms.utils import any2utf8
from synonyms.utils import any2unicode
import jieba.posseg as _tokenizer
import jieba

'''
globals
'''
_vocab = dict()
_size = 0
_vectors = None
_stopwords = set()

'''
nearby
'''
def _load_vocab(file_path):
    '''
    load vocab dict
    '''
    global _vocab
    if PLT == 2:
        import io
        fin = io.TextIOWrapper(
            io.BufferedReader(
                gzip.open(file_path)),
            encoding='utf8',
            errors='ignore')
    else:
        fin = gzip.open(file_path, 'rt', encoding='utf-8', errors="ignore")

    _vocab = json.loads(fin.read())
    fin.close()

# build on load
print(">> Synonyms on loading vocab ...")
_load_vocab(os.path.join(curdir, "data", "words.nearby.json.gz"))

def nearby(word):
    '''
    Nearby words: return two aligned lists, [[words], [scores]].
    An out-of-vocabulary word yields two empty lists.
    '''
    try:
        return _vocab[any2unicode(word)]
    except KeyError as e:
        return [[], []]

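# Illustrative usage (neighbor words and scores come from the bundled vocabulary):
#
#   words, scores = nearby(u"人脸")
#   # words  -> a list of nearby words, e.g. [u"人脸", ...]
#   # scores -> a list of similarity scores aligned with `words`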

'''
similarity
'''

# stopwords
_fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt')
def _load_stopwords(file_path):
    '''
    load stop words
    '''
    global _stopwords
    if sys.version_info[0] < 3:
        words = open(file_path, 'r')
    else:
        words = open(file_path, 'r', encoding='utf-8')
    stopwords = words.readlines()
    words.close()
    for w in stopwords:
        _stopwords.add(any2unicode(w).strip())

print(">> Synonyms on loading stopwords ...")
_load_stopwords(_fin_stopwords_path)

def _segment_words(sen):
    '''
    segment words with jieba
    '''
    words, tags = [], []
    m = _tokenizer.cut(sen, HMM=True)  # HMM does a better job at recognizing new words
    for x in m:
        words.append(x.word)
        tags.append(x.flag)
    return words, tags

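# For example (illustrative; the actual result depends on jieba's dictionary):
#   _segment_words(u"今天天气不错")
#   # -> ([u"今天", u"天气", u"不错"], [u"t", u"n", u"a"])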
# vectors
_f_model = os.path.join(curdir, 'data', 'words.vector')
def _load_w2v(model_file=_f_model, binary=True):
    '''
    load word2vec model
    '''
    if not os.path.exists(model_file):
        print("model file not found: ", model_file)
        raise Exception("Model file does not exist.")
    return KeyedVectors.load_word2vec_format(
        model_file, binary=binary, unicode_errors='ignore')

print(">> Synonyms on loading vectors ...")
_vectors = _load_w2v(model_file=_f_model)

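# _vectors.word_vec(word) returns the word's embedding as a 1-D numpy array;
# the zero-vector fallbacks below assume the bundled 100-dimensional model.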
_sim_molecule = lambda x: np.sum(x, axis=0)  # numerator: sum the word vectors of a sentence

def _get_wv(sentence):
    '''
    get word2vec data by sentence
    sentence is segmented string.
    '''
    global _vectors
    vectors = []
    for y in sentence.split():
        y_ = any2unicode(y).strip()
        if y_ not in _stopwords:
            syns = nearby(y_)[0]
            # print("sentence %s word: %s" %(sentence, y_))
            # print("sentence %s word nearby: %s" %(sentence, " ".join(syns)))
            c = []
            try:
                c.append(_vectors.word_vec(y_))
            except KeyError as error:
                print("not exist in w2v model: %s" % y_)
                c.append(np.zeros((100,), dtype=float))
            for n in syns:
                if n is None: continue
                try:
                    v = _vectors.word_vec(any2unicode(n))
                except KeyError as error:
                    v = np.zeros((100,), dtype=float)
                c.append(v)
            # average the word's vector with its neighbors' vectors
            r = np.average(c, axis=0)
            vectors.append(r)
    return vectors

def _unigram_overlap(sentence1, sentence2):
    '''
    compute unigram overlap (Jaccard index of the two token sets)
    '''
    x = set(sentence1.split())
    y = set(sentence2.split())

    intersection = x & y
    union = x | y

    return float(len(intersection)) / float(len(union))

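# Example: for the segmented sentences "今天 天气 不错" and "今天 天气 很好",
# the token sets share 2 words out of 4 distinct ones, so the overlap is 0.5.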
def _levenshtein_distance(sentence1, sentence2):
    '''
    Return a similarity score derived from the Levenshtein distance
    between the two token sequences: 2 ** (-distance).
    Based on:
    http://rosettacode.org/wiki/Levenshtein_distance#Python
    '''
    first = sentence1.split()
    second = sentence2.split()
    if len(first) > len(second):
        first, second = second, first
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1],
                                              distances[index1 + 1],
                                              new_distances[-1])))
        distances = new_distances
    levenshtein = distances[-1]
    return 2 ** (-1 * levenshtein)

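# Example: "今天 天气 不错" vs. "今天 天气 很好" differ by a single token
# substitution, so the distance is 1 and the returned score is 2 ** -1 = 0.5.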

def _similarity_distance(s1, s2):
    '''
    compute similarity with distance measurement
    '''
    a = _sim_molecule(_get_wv(s1))
    b = _sim_molecule(_get_wv(s2))
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
    g = 1 / (np.linalg.norm(a - b) + 1)
    u = _levenshtein_distance(s1, s2)
    # combine the vector-space score and the edit-distance score with
    # empirical weights, then cap the result at 1.0
    r = g * 5 + u * 0.8
    r = min(r, 1.0)

    return float("%.3f" % r)

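# _similarity_distance example: with g = 0.1 and u = 0.5 the combined score is
# r = 0.1 * 5 + 0.5 * 0.8 = 0.9.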
def compare(s1, s2, seg=True):
    '''
    compare similarity
    s1 : sentence1
    s2 : sentence2
    seg : True  : the original sentences need to be cut with jieba.cut
          False : the original sentences have already been segmented
    '''
    assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
    if seg:
        s1 = ' '.join(jieba.cut(s1))
        s2 = ' '.join(jieba.cut(s2))
    return _similarity_distance(s1, s2)

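# Illustrative usage (exact scores depend on the bundled word vectors):
#
#   compare(u"人脸识别", u"人脸检测")               # -> a score in [0.0, 1.0]
#   compare(u"人脸 识别", u"人脸 检测", seg=False)  # input is already segmented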
def display(word):
    print("'%s'近义词:" % word)  # i.e. "synonyms of '<word>':"
    o = nearby(word)
    assert len(o) == 2, "should contain 2 list"
    if len(o[0]) == 0:
        print(" out of vocabulary")
    for k, v in enumerate(o[0]):
        print(" %d. %s:%s" % (k + 1, v, o[1][k]))

def main():
    display("人脸")
    display("NOT_EXIST")


if __name__ == '__main__':
    main()