Synonyms/synonyms/__init__.py

260 lines
6.5 KiB
Python
Raw Normal View History

2017-09-27 15:27:47 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2017-10-31 16:54:55 +08:00
#=========================================================================
2017-09-27 15:27:47 +08:00
#
# Copyright (c) 2017 <> All Rights Reserved
#
#
# File: /Users/hain/ai/Synonyms/synonyms/__init__.py
# Author: Hai Liang Wang
2017-09-28 22:47:20 +08:00
# Date: 2017-09-27
2017-09-27 15:27:47 +08:00
#
2017-10-31 16:54:55 +08:00
#=========================================================================
2017-09-27 15:27:47 +08:00
"""
Chinese Synonyms for Natural Language Processing and Understanding.
"""
from __future__ import print_function
from __future__ import division
__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
2017-10-31 16:54:55 +08:00
__author__ = "Hu Ying Xi<>, Hai Liang Wang<hailiang.hl.wang@gmail.com>"
__date__ = "2017-09-27"
2017-09-27 15:27:47 +08:00
import os
import sys
2017-10-31 10:54:31 +08:00
import numpy as np
2017-09-27 15:27:47 +08:00
# Directory containing this file; appended to sys.path below.
curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

# PLT holds the major Python version this module treats itself as running on:
# 2 for anything below Python 3, otherwise 3.
PLT = 2 if sys.version_info[0] < 3 else 3

if PLT == 2:
    # Python 2 only: keep the current standard streams, reload sys, put the
    # streams back, then switch the default encoding to UTF-8.
    default_stdout, default_stderr = sys.stdout, sys.stderr
    reload(sys)
    sys.stdout, sys.stderr = default_stdout, default_stderr
    sys.setdefaultencoding("utf-8")
2017-09-27 15:27:47 +08:00
2017-10-31 16:54:55 +08:00
import json
2017-09-27 15:27:47 +08:00
import gzip
2017-10-21 11:45:15 +08:00
import shutil
from synonyms.word2vec import KeyedVectors
from synonyms.utils import any2utf8
from synonyms.utils import any2unicode
2017-10-28 10:06:11 +08:00
import jieba.posseg as _tokenizer
2017-10-31 10:54:31 +08:00
import jieba
2017-09-27 15:27:47 +08:00
2017-10-31 17:17:01 +08:00
# ---------------------------------------------------------------------------
# globals: module-level state shared by the functions in this file
# ---------------------------------------------------------------------------
_vocab = {}           # rebound by _load_vocab with the parsed JSON content
_size = 0             # NOTE(review): not referenced in the visible code
_vectors = None       # assigned the loaded word2vec model further down
_stopwords = set()    # filled in place by _load_stopwords
2017-10-31 10:54:31 +08:00
2017-10-31 16:54:55 +08:00
'''
nearby
'''
def _load_vocab(file_path):
    '''
    Load the vocabulary dict.

    Reads the gzip-compressed, UTF-8 encoded JSON document at `file_path`
    (undecodable bytes are ignored) and rebinds the module-level `_vocab`
    to the parsed object.

    file_path : path of the gzipped JSON file.
    Returns None; the result is stored in `_vocab`.
    '''
    global _vocab
    if sys.version_info[0] < 3:
        # Python 2 branch: decode through an explicit text wrapper.
        import io
        fin = io.TextIOWrapper(
            io.BufferedReader(gzip.open(file_path)),
            encoding='utf8',
            errors='ignore')
    else:
        fin = gzip.open(file_path, 'rt', encoding='utf-8', errors="ignore")
    try:
        _vocab = json.loads(fin.read())
    finally:
        # Always release the file handle, even when parsing fails.
        fin.close()
2017-10-21 11:45:15 +08:00
2017-09-28 22:14:18 +08:00
# Build the vocabulary as soon as the module is imported.
print(">> Synonyms on loading vocab ...")
_load_vocab(
    os.path.join(curdir, "data", "words.nearby.json.gz"))
2017-09-28 22:14:18 +08:00
def nearby(word):
    '''
    Look up the nearby-words entry of `word`.

    The word is passed through any2unicode and used as a key into `_vocab`.
    Returns the stored entry, or [[], []] when a KeyError is raised
    (i.e. the word is not in the vocabulary).
    '''
    try:
        key = any2unicode(word)
        entry = _vocab[key]
    except KeyError:
        entry = [[], []]
    return entry
2017-09-28 22:14:18 +08:00
2017-10-31 17:17:01 +08:00
'''
similarity
'''
# Stop-word list: <this directory>/data/stopwords.txt
_fin_stopwords_path = os.path.join(curdir, "data", "stopwords.txt")
def _load_stopwords(file_path):
    '''
    Load stop words.

    Reads `file_path` line by line, converts each line with any2unicode,
    strips surrounding whitespace and adds the result to the module-level
    `_stopwords` set (the set is mutated in place, never rebound).

    file_path : path of a text file with one stop word per line; on
                Python 3 it is opened as UTF-8.
    Returns None.
    '''
    if sys.version_info[0] < 3:
        words = open(file_path, 'r')
    else:
        words = open(file_path, 'r', encoding='utf-8')
    try:
        # Stream the lines instead of materialising them with readlines().
        for w in words:
            _stopwords.add(any2unicode(w).strip())
    finally:
        # Always release the file handle, even if a line fails to convert.
        words.close()
# Fill the stop-word set at import time.
print(">> Synonyms on loading stopwords ...")
_load_stopwords(file_path=_fin_stopwords_path)
2017-10-16 22:29:51 +08:00
def _segment_words(sen):
    '''
    Segment a sentence with the jieba part-of-speech tokenizer.

    sen : the sentence to cut.
    Returns (words, tags): two lists of equal length holding, for every
    token produced by the tokenizer, its `word` and its `flag`.
    '''
    # HMM=True: better recognition of new words.
    pairs = [(token.word, token.flag)
             for token in _tokenizer.cut(sen, HMM=True)]
    words = [word for word, _ in pairs]
    tags = [flag for _, flag in pairs]
    return words, tags
2017-10-31 16:54:55 +08:00
# Word-vector model: <this directory>/data/words.vector
_f_model = os.path.join(curdir, "data", "words.vector")
def _load_w2v(model_file=_f_model, binary=True):
    '''
    Load the word2vec model.

    model_file : path of the model file (defaults to `_f_model`).
    binary     : forwarded to KeyedVectors.load_word2vec_format.
    Returns whatever KeyedVectors.load_word2vec_format returns; decoding
    errors in the file are ignored (unicode_errors='ignore').
    Raises Exception when `model_file` does not exist.
    '''
    if not os.path.exists(model_file):
        # Name the path that was looked up so a failed load is diagnosable.
        raise Exception("Model file does not exist: %s" % model_file)
    return KeyedVectors.load_word2vec_format(
        model_file, binary=binary, unicode_errors='ignore')
# Load the word vectors once, at import time.
print(">> Synonyms on loading vectors ...")
_vectors = _load_w2v(_f_model)
def _sim_molecule(x):
    '''
    Sum `x` along axis 0 (labelled the "numerator" by the original author).
    '''
    return np.sum(x, axis=0)
2017-10-31 10:54:31 +08:00
2017-10-31 16:54:55 +08:00
def _get_wv(sentence):
    '''
    Get word2vec data for a sentence.

    sentence : segmented string, i.e. tokens separated by whitespace.

    Every token that is not in `_stopwords` contributes one vector: the
    average (axis 0) of the token's own vector and the vectors of its
    nearby words. Anything missing from the model is replaced by a
    zero vector of length 100; a missing token is also reported on stdout.
    Returns the list of averaged vectors (empty if every token is a
    stop word or the sentence has no tokens).
    '''
    averaged = []
    for token in sentence.split():
        word = any2unicode(token).strip()
        if word in _stopwords:
            continue
        neighbours = nearby(word)[0]
        group = []
        # The token itself comes first.
        try:
            own = _vectors.word_vec(word)
        except KeyError:
            print("not exist in w2v model: %s" % word)
            own = np.zeros((100,), dtype=float)
        group.append(own)
        # Then each of its nearby words; None entries are skipped.
        for neighbour in neighbours:
            if neighbour is None:
                continue
            try:
                vec = _vectors.word_vec(any2unicode(neighbour))
            except KeyError:
                vec = np.zeros((100,), dtype=float)
            group.append(vec)
        averaged.append(np.average(group, axis=0))
    return averaged
def _unigram_overlap(sentence1, sentence2):
    '''
    Compute the unigram overlap of two segmented sentences.

    Both sentences are split on whitespace into sets of tokens; the result
    is len(intersection) / len(union) of those sets.

    Returns a float in [0.0, 1.0]. When neither sentence contains a token
    the union is empty and 0.0 is returned instead of dividing by zero.
    '''
    x = set(sentence1.split())
    y = set(sentence2.split())
    union = x | y
    if not union:
        # No tokens at all: nothing can overlap.
        return 0.0
    return float(len(x & y)) / float(len(union))
def _levenshtein_distance(sentence1, sentence2):
    '''
    Score two segmented sentences by their token-level edit distance.

    Both sentences are split on whitespace and the Levenshtein distance d
    between the two token lists is computed. The value returned is
    2 ** (-d): 1 for identical token lists, halving with every edit.
    Based on:
    http://rosettacode.org/wiki/Levenshtein_distance#Python
    '''
    shorter, longer = sentence1.split(), sentence2.split()
    if len(shorter) > len(longer):
        shorter, longer = longer, shorter
    # Row of the dynamic-programming table for the previous token of `longer`.
    previous = range(len(shorter) + 1)
    for row, token_long in enumerate(longer, 1):
        current = [row]
        for col, token_short in enumerate(shorter):
            if token_short == token_long:
                cost = previous[col]
            else:
                cost = 1 + min(previous[col], previous[col + 1], current[-1])
            current.append(cost)
        previous = current
    return 2 ** (-previous[-1])
2017-10-31 16:54:55 +08:00
def _similarity_distance(s1, s2):
    '''
    Compute similarity with a distance measurement.

    s1, s2 : segmented sentences (whitespace-separated tokens).

    Combines 1 / (1 + Euclidean norm of the difference between the summed
    word vectors of both sentences), weighted by 5, with the value of
    _levenshtein_distance, weighted by 0.8. The sum is capped at 1.0 and
    returned as a float rounded to three decimals.
    '''
    summed1 = _sim_molecule(_get_wv(s1))
    summed2 = _sim_molecule(_get_wv(s2))
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
    gap = np.linalg.norm(summed1 - summed2)
    vector_score = 1 / (gap + 1)
    edit_score = _levenshtein_distance(s1, s2)
    score = min(vector_score * 5 + edit_score * 0.8, 1.0)
    return float("%.3f" % score)
def compare(s1, s2, seg=True):
    '''
    Compare the similarity of two sentences.

    s1 : sentence1
    s2 : sentence2
    seg : True : the original sentences need jieba.cut
          False : the original sentences have already been cut
                  (tokens separated by whitespace).

    Returns the value of _similarity_distance for the two sentences.
    Raises AssertionError when s1 or s2 is empty.
    '''
    # Raised explicitly instead of using an `assert` statement so the check
    # is not stripped under `python -O`; the exception type is unchanged.
    if not (len(s1) > 0 and len(s2) > 0):
        raise AssertionError("The length of s1 and s2 should > 0.")
    if seg:
        s1 = ' '.join(jieba.cut(s1))
        s2 = ' '.join(jieba.cut(s2))
    return _similarity_distance(s1, s2)
2017-10-31 10:54:31 +08:00
2017-10-16 22:29:51 +08:00
2017-10-18 09:49:09 +08:00
def display(word):
    '''
    Print the nearby-words entry of `word`.

    Prints a header line, then " out of vocabulary" when the first list of
    the entry is empty, then one numbered line per item of the first list
    paired with the item at the same position in the second list.
    '''
    print("'%s'近义词:" % word)
    entry = nearby(word)
    assert len(entry) == 2, "should contain 2 list"
    first, second = entry[0], entry[1]
    if len(first) == 0:
        print(" out of vocabulary")
    for rank, item in enumerate(first, 1):
        print(" %d. %s:%s" % (rank, item, second[rank - 1]))
2017-10-18 09:49:09 +08:00
2017-09-28 21:56:24 +08:00
def main():
    '''
    Demo entry point: display the entries for two sample words.
    '''
    for sample in ("人脸", "NOT_EXIST"):
        display(sample)


if __name__ == '__main__':
    main()