#!/usr/bin/env python
# -*- coding: utf-8 -*-
#=========================================================================
#
# Copyright (c) 2017 <> All Rights Reserved
#
#
# File: /Users/hain/ai/Synonyms/synonyms/__init__.py
# Author: Hai Liang Wang
# Date: 2017-09-27
#
#=========================================================================
"""
|
|
|
|
Chinese Synonyms for Natural Language Processing and Understanding.
|
|
|
|
"""
|
|
|
|
from __future__ import print_function
|
|
|
|
from __future__ import division
|
|
|
|
|
|
|
|
__copyright__ = "Copyright (c) 2017 . All Rights Reserved"
|
2017-10-31 16:54:55 +08:00
|
|
|
__author__ = "Hu Ying Xi<>, Hai Liang Wang<hailiang.hl.wang@gmail.com>"
|
|
|
|
__date__ = "2017-09-27"
|
2017-09-27 15:27:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import numpy as np

curdir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(curdir)

PLT = 2  # major version of the running Python interpreter

if sys.version_info[0] < 3:
    default_stdout = sys.stdout
    default_stderr = sys.stderr
    reload(sys)
    sys.stdout = default_stdout
    sys.stderr = default_stderr
    sys.setdefaultencoding("utf-8")
    # raise "Must be using Python 3"
else:
    PLT = 3

import json
import gzip
import shutil
from synonyms.word2vec import KeyedVectors
from synonyms.utils import any2utf8
from synonyms.utils import any2unicode
import jieba.posseg as _tokenizer
import jieba

'''
globals
'''
_vocab = dict()
_size = 0
_vectors = None
_stopwords = set()

'''
nearby
'''
def _load_vocab(file_path):
    '''
    load vocab dict
    '''
    global _vocab
    if PLT == 2:
        import io
        fin = io.TextIOWrapper(
            io.BufferedReader(
                gzip.open(file_path)),
            encoding='utf8',
            errors='ignore')
    else:
        fin = gzip.open(file_path, 'rt', encoding='utf-8', errors="ignore")

    _vocab = json.loads(fin.read())
    fin.close()

# build on load
print(">> Synonyms on loading vocab ...")
_load_vocab(os.path.join(curdir, "data", "words.nearby.json.gz"))

def nearby(word):
    '''
    Nearby words: return two aligned lists, [[words], [scores]].
    An out-of-vocabulary word yields two empty lists.
    '''
    try:
        return _vocab[any2unicode(word)]
    except KeyError as e:
        return [[], []]

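# Illustrative usage (neighbor words and scores come from the bundled vocabulary):
#
#   words, scores = nearby(u"人脸")
#   # words  -> a list of nearby words, e.g. [u"人脸", ...]
#   # scores -> a list of similarity scores aligned with `words`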

'''
similarity
'''

# stopwords
_fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt')
def _load_stopwords(file_path):
    '''
    load stop words
    '''
    global _stopwords
    if sys.version_info[0] < 3:
        words = open(file_path, 'r')
    else:
        words = open(file_path, 'r', encoding='utf-8')
    stopwords = words.readlines()
    words.close()
    for w in stopwords:
        _stopwords.add(any2unicode(w).strip())

print(">> Synonyms on loading stopwords ...")
_load_stopwords(_fin_stopwords_path)

def _segment_words(sen):
    '''
    segment words with jieba
    '''
    words, tags = [], []
    m = _tokenizer.cut(sen, HMM=True)  # HMM does a better job at recognizing new words
    for x in m:
        words.append(x.word)
        tags.append(x.flag)
    return words, tags

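# For example (illustrative; the actual result depends on jieba's dictionary):
#   _segment_words(u"今天天气不错")
#   # -> ([u"今天", u"天气", u"不错"], [u"t", u"n", u"a"])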
# vectors
_f_model = os.path.join(curdir, 'data', 'words.vector')
def _load_w2v(model_file=_f_model, binary=True):
    '''
    load word2vec model
    '''
    if not os.path.exists(model_file):
        print("model file not found: ", model_file)
        raise Exception("Model file does not exist.")
    return KeyedVectors.load_word2vec_format(
        model_file, binary=binary, unicode_errors='ignore')

print(">> Synonyms on loading vectors ...")
_vectors = _load_w2v(model_file=_f_model)

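# _vectors.word_vec(word) returns the word's embedding as a 1-D numpy array;
# the zero-vector fallbacks below assume the bundled 100-dimensional model.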
_sim_molecule = lambda x: np.sum(x, axis=0)  # numerator: sum the word vectors of a sentence

def _get_wv(sentence):
    '''
    get word2vec data by sentence
    sentence is segmented string.
    '''
    global _vectors
    vectors = []
    for y in sentence.split():
        y_ = any2unicode(y).strip()
        if y_ not in _stopwords:
            syns = nearby(y_)[0]
            # print("sentence %s word: %s" %(sentence, y_))
            # print("sentence %s word nearby: %s" %(sentence, " ".join(syns)))
            c = []
            try:
                c.append(_vectors.word_vec(y_))
            except KeyError as error:
                print("not exist in w2v model: %s" % y_)
                c.append(np.zeros((100,), dtype=float))
            for n in syns:
                if n is None: continue
                try:
                    v = _vectors.word_vec(any2unicode(n))
                except KeyError as error:
                    v = np.zeros((100,), dtype=float)
                c.append(v)
            # average the word's vector with its neighbors' vectors
            r = np.average(c, axis=0)
            vectors.append(r)
    return vectors

def _unigram_overlap(sentence1, sentence2):
    '''
    compute unigram overlap (Jaccard index of the two token sets)
    '''
    x = set(sentence1.split())
    y = set(sentence2.split())

    intersection = x & y
    union = x | y

    return float(len(intersection)) / float(len(union))

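# Example: for the segmented sentences "今天 天气 不错" and "今天 天气 很好",
# the token sets share 2 words out of 4 distinct ones, so the overlap is 0.5.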
def _levenshtein_distance(sentence1, sentence2):
    '''
    Return a similarity score derived from the Levenshtein distance
    between the two token sequences: 2 ** (-distance).
    Based on:
    http://rosettacode.org/wiki/Levenshtein_distance#Python
    '''
    first = sentence1.split()
    second = sentence2.split()
    if len(first) > len(second):
        first, second = second, first
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1],
                                              distances[index1 + 1],
                                              new_distances[-1])))
        distances = new_distances
    levenshtein = distances[-1]
    return 2 ** (-1 * levenshtein)

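# Example: "今天 天气 不错" vs. "今天 天气 很好" differ by a single token
# substitution, so the distance is 1 and the returned score is 2 ** -1 = 0.5.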

def _similarity_distance(s1, s2):
    '''
    compute similarity with distance measurement
    '''
    a = _sim_molecule(_get_wv(s1))
    b = _sim_molecule(_get_wv(s2))
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
    g = 1 / (np.linalg.norm(a - b) + 1)
    u = _levenshtein_distance(s1, s2)
    # combine the vector-space score and the edit-distance score with
    # empirical weights, then cap the result at 1.0
    r = g * 5 + u * 0.8
    r = min(r, 1.0)

    return float("%.3f" % r)

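# _similarity_distance example: with g = 0.1 and u = 0.5 the combined score is
# r = 0.1 * 5 + 0.5 * 0.8 = 0.9.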
def compare(s1, s2, seg=True):
    '''
    compare similarity
    s1 : sentence1
    s2 : sentence2
    seg : True  : the original sentences need to be cut with jieba.cut
          False : the original sentences have already been segmented
    '''
    assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
    if seg:
        s1 = ' '.join(jieba.cut(s1))
        s2 = ' '.join(jieba.cut(s2))
    return _similarity_distance(s1, s2)

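# Illustrative usage (exact scores depend on the bundled word vectors):
#
#   compare(u"人脸识别", u"人脸检测")               # -> a score in [0.0, 1.0]
#   compare(u"人脸 识别", u"人脸 检测", seg=False)  # input is already segmented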
def display(word):
    print("'%s'近义词:" % word)  # i.e. "synonyms of '<word>':"
    o = nearby(word)
    assert len(o) == 2, "should contain 2 list"
    if len(o[0]) == 0:
        print(" out of vocabulary")
    for k, v in enumerate(o[0]):
        print(" %d. %s:%s" % (k + 1, v, o[1][k]))

def main():
    display("人脸")
    display("NOT_EXIST")


if __name__ == '__main__':
    main()