Synonyms/synonyms/__init__.py
2017-10-31 17:17:01 +08:00

232 lines
5.5 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#=========================================================================
#
# Copyright (c) 2017 <> All Rights Reserved
#
#
# File: /Users/hain/ai/Synonyms/synonyms/__init__.py
# Author: Hai Liang Wang
# Date: 2017-09-27
#
#=========================================================================
"""
Chinese Synonyms for Natural Language Processing and Understanding.
"""
from __future__ import print_function
from __future__ import division
__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
__author__ = "Hu Ying Xi<>, Hai Liang Wang<hailiang.hl.wang@gmail.com>"
__date__ = "2017-09-27"
import os
import sys
import numpy as np
# Absolute directory of this file; the data files under "data/" are
# resolved relative to it further down in this module.
curdir = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): presumably added so the sibling modules imported below
# (word2vec, utils) resolve as top-level modules - confirm before removing.
sys.path.append(curdir)
# PLT holds the major version of the running interpreter: 2 or 3.
PLT = 2 if sys.version_info[0] < 3 else 3
if PLT == 2:
    # Python 2 only: switch the process default encoding to UTF-8.
    # The standard streams are saved before reload(sys) and put back
    # afterwards so the reload does not leave them replaced.
    default_stdout = sys.stdout
    default_stderr = sys.stderr
    reload(sys)
    sys.stdout = default_stdout
    sys.stderr = default_stderr
    sys.setdefaultencoding("utf-8")
import json
import gzip
import shutil
from word2vec import KeyedVectors
from utils import any2utf8
from utils import any2unicode
import jieba.posseg as _tokenizer
import jieba
'''
globals
'''
# Nearby-word table; replaced by _load_vocab() with the decoded JSON document.
_vocab = {}
_size = 0
# Word-vector model; assigned once the vectors are loaded further down.
_vectors = None
# Stop words; filled by _load_stopwords().
_stopwords = set()
'''
nearby
'''
def _load_vocab(file_path):
    '''
    Load the nearby-words vocabulary from a gzip-compressed JSON file.

    The decoded JSON document replaces the module-level ``_vocab``.

    file_path : path of the gzip-compressed, UTF-8 encoded JSON file.

    Bytes that are not valid UTF-8 are dropped (``errors='ignore'``),
    matching the previous behaviour.
    '''
    global _vocab
    # Read the raw bytes and decode once. This works identically on
    # Python 2 and Python 3, and the context manager guarantees that the
    # gzip handle is closed even when reading fails.
    with gzip.open(file_path, 'rb') as fin:
        raw = fin.read()
    _vocab = json.loads(raw.decode('utf-8', 'ignore'))
# Build on load: the vocabulary is read as an import-time side effect from
# data/words.nearby.json.gz next to this file, so importing the module fails
# if that file is missing.
print(">> Synonyms on loading vocab ...")
_load_vocab(os.path.join(curdir, "data", "words.nearby.json.gz"))
def nearby(word):
    '''
    Look up the nearby-word entry stored for ``word``.

    The word is passed through ``any2unicode`` and used as a key into the
    loaded vocabulary.

    Returns the stored entry, or ``[[], []]`` when the word is not in the
    vocabulary. NOTE(review): entries look like two parallel lists
    (words, then one value per word) - confirm against the data file.
    '''
    try:
        entry = _vocab[any2unicode(word)]
    except KeyError:
        entry = [[], []]
    return entry
'''
similarity
'''
# Stop words: path of the stop-word list shipped in the "data" directory
# next to this file.
_fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt')
def _load_stopwords(file_path):
    '''
    Load stop words into the module-level ``_stopwords`` set.

    file_path : path of a UTF-8 text file with one stop word per line.

    Each line is stripped of surrounding whitespace before it is added.
    '''
    global _stopwords
    # Local import: ``io`` is not among the module-level imports.
    import io
    # io.open yields unicode text on both Python 2 and Python 3, so the
    # result no longer depends on the platform default encoding, and the
    # context manager closes the file handle.
    with io.open(file_path, 'r', encoding='utf-8') as words:
        for line in words:
            _stopwords.add(line.strip())
# Stop words are loaded as an import-time side effect.
print(">> Synonyms on loading stopwords ...")
_load_stopwords(_fin_stopwords_path)
def _segment_words(sen):
    '''
    Segment a sentence with the jieba part-of-speech tokenizer.

    sen : the sentence to segment.

    Returns ``(words, tags)``: two parallel lists holding the ``word`` and
    ``flag`` attribute of every token produced by the tokenizer.
    '''
    # HMM=True: does a better job at recognising new words.
    tokens = list(_tokenizer.cut(sen, HMM=True))
    words = [token.word for token in tokens]
    tags = [token.flag for token in tokens]
    return words, tags
# Vectors: path of the word2vec model file shipped in the "data" directory
# next to this file.
_f_model = os.path.join(curdir, 'data', 'words.vector')
def _load_w2v(model_file=_f_model, binary=True):
    '''
    Load the word2vec model.

    model_file : path of the model file; defaults to the bundled one.
    binary     : forwarded to ``KeyedVectors.load_word2vec_format``.

    Returns whatever ``KeyedVectors.load_word2vec_format`` returns.
    Raises ``Exception`` when ``model_file`` does not exist.
    '''
    if not os.path.exists(model_file):
        # Name the offending path in the error; the old debug print only
        # showed the ``os.path`` module object, which said nothing useful.
        raise Exception("Model file does not exist: %s" % model_file)
    return KeyedVectors.load_word2vec_format(
        model_file, binary=binary, unicode_errors='ignore')
# The word vectors are loaded as an import-time side effect; _load_w2v
# raises if the model file is missing.
print(">> Synonyms on loading vectors ...")
_vectors = _load_w2v(model_file=_f_model)
def _sim_molecule(x):
    '''
    Sum ``x`` along axis 0 (the "numerator" term of the similarity).
    '''
    return np.sum(x, axis=0)
def _get_wv(sentence):
    '''
    Build one averaged vector per non-stop-word token of ``sentence``.

    sentence : an already segmented string; tokens are split on whitespace.

    For every token that is not a stop word, the token's own vector and
    the vectors of its nearby words are averaged. A word missing from the
    model contributes a zero vector of length 100 (a message is printed
    only when the token itself is missing).

    Returns the list of averaged vectors, one per kept token.
    '''
    averaged = []
    for token in sentence.split():
        word = any2unicode(token).strip()
        if word in _stopwords:
            continue
        neighbours = nearby(word)[0]
        candidates = []
        try:
            candidates.append(_vectors.word_vec(word))
        except KeyError:
            print("not exist in w2v model: %s" % word)
            candidates.append(np.zeros((100,), dtype=float))
        for neighbour in neighbours:
            if neighbour is None:
                continue
            try:
                vec = _vectors.word_vec(any2unicode(neighbour))
            except KeyError:
                vec = np.zeros((100,), dtype=float)
            candidates.append(vec)
        averaged.append(np.average(candidates, axis=0))
    return averaged
def _unigram_overlap(sentence1, sentence2):
    '''
    Compute the unigram overlap of two segmented sentences.

    Both sentences are split on whitespace into sets of tokens; the result
    is the size of the intersection divided by the size of the union.

    Returns a float in [0, 1]; 0.0 when neither sentence has any token
    (this used to raise ZeroDivisionError).
    '''
    x = set(sentence1.split())
    y = set(sentence2.split())
    union = x | y
    if not union:
        # Both inputs are empty or whitespace-only: nothing overlaps.
        return 0.0
    return float(len(x & y)) / float(len(union))
def _similarity_distance(s1, s2):
    '''
    Score the similarity of two segmented sentences.

    The score mixes two terms: 1 / (1 + norm of the difference between the
    summed word vectors of each sentence), weighted 0.8, and the unigram
    overlap of the two sentences, weighted 0.2.

    Returns the score as a float rounded to three decimals.
    '''
    summed1 = _sim_molecule(_get_wv(s1))
    summed2 = _sim_molecule(_get_wv(s2))
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
    distance = np.linalg.norm(summed1 - summed2)
    closeness = 1 / (distance + 1)
    overlap = _unigram_overlap(s1, s2)
    score = closeness * 0.8 + overlap * 0.2
    return float("%.3f" % score)
def compare(s1, s2, seg=True):
    '''
    compare similarity
    s1 : sentence1
    s2 : sentence2
    seg : True : The original sentences need jieba.cut
          False : The original sentences have been cut.

    Returns the similarity score computed by _similarity_distance.
    Raises AssertionError when s1 or s2 is empty.
    '''
    # Raise explicitly instead of using ``assert`` so the check still runs
    # under ``python -O``; the exception type and message are unchanged.
    if not (len(s1) > 0 and len(s2) > 0):
        raise AssertionError("The length of s1 and s2 should > 0.")
    if seg:
        s1 = ' '.join(jieba.cut(s1))
        s2 = ' '.join(jieba.cut(s2))
    return _similarity_distance(s1, s2)
def display(word):
    '''
    Print the nearby words of ``word`` as a numbered list.

    Each line shows the rank, the nearby word and the value stored at the
    same position in the second list of the entry. Prints
    "out of vocabulary" when the word has no nearby words.
    '''
    print("'%s'近义词:" % word)
    entry = nearby(word)
    assert len(entry) == 2, "should contain 2 list"
    candidates = entry[0]
    if len(candidates) < 1:
        print(" out of vocabulary")
    for rank, candidate in enumerate(candidates, 1):
        print(" %d. %s:%s" % (rank, candidate, entry[1][rank - 1]))
def main():
    '''
    Demo entry point: display one sample word and one unknown word.
    '''
    for sample in ("人脸", "NOT_EXIST"):
        display(sample)


if __name__ == '__main__':
    main()