Use doc2vec to compute document similarity
The results are not very good; the corpus needs further preprocessing.
commit 83581cc272
52	doc2vec.py	Normal file
@@ -0,0 +1,52 @@
import gensim
import jieba
from gensim.models.doc2vec import Doc2Vec

# stop_text = open('stop_list.txt', 'r')
# stop_word = []
# for line in stop_text:
#     stop_word.append(line.strip())

TaggedDocument = gensim.models.doc2vec.TaggedDocument


def get_corpus():
    # Read the pre-segmented corpus (one space-separated document per line)
    # and wrap each document as a TaggedDocument tagged with its line index.
    with open("corpus_seg.txt", 'r') as doc:
        docs = doc.readlines()
    train_docs = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        word_list[-1] = word_list[-1].strip()  # strip the trailing newline from the last token
        train_docs.append(TaggedDocument(word_list, tags=[i]))
    return train_docs


def train(x_train, size=200, epoch_num=70):
    # PV-DM model; `size` is the gensim < 4.0 parameter name (renamed to `vector_size` in gensim 4.x).
    model_dm = Doc2Vec(min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
    model_dm.build_vocab(x_train)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=epoch_num)
    model_dm.save('model_doc2vec')
    return model_dm


def test():
    model_dm = Doc2Vec.load("model_doc2vec")
    text_test = u'武汉东湖新技术开发区人民检察院指控: 2013年4月27日21时许,被告人王某、连某经预谋后,窜至本区流芳高新四路联想工地内,窃取该处扣件若干欲离开时,被此处工地值班人员刘某发现并制止。被告人王某、连某遂共同用拳头、安全帽及啤酒瓶殴打刘某的头部、背部等处,致被害人刘某轻微伤,后共同逃离现场。 2013年11月1日,被告人王某被公安机关抓获。同年11月25日,被告人王某按照公安机关的安排,以打电话的方式联系被告人连某投案。到案后,上述二被告人共同赔偿被害人刘某人民币1.5万元,并获得谅解。 针对上述指控的事实,公诉机关当庭出示和宣读的证据有:1、抓获及破案经过;2、调解协议、谅解书、病历等书证;3、涉案物品照片;4、鉴定意见书;5、证人证言;6、被害人陈述;7、被告人的供述及辩解、讯问同步录音录像等。 公诉机关认为,被告人王某、连某以非法占有为目的,在实施盗窃行为时,为抗拒抓捕,当场使用暴力,致一人轻微伤,其行为均触犯了《中华人民共和国刑法》第二百六十九条、第二百六十三条的规定,应当以抢劫罪追究其刑事责任。案发后,被告人王某协助公安机关抓捕同案犯,具有《中华人民共和国刑法》第六十八条规定的情节;被告人连某主动投案,并如实供述自己的犯罪事实,具有《中华人民共和国刑法》第六十七条第一款规定的情节。'
    # Segment the query with jieba, infer its vector, and return the 10 most similar training documents.
    text_raw = list(jieba.cut(text_test))
    inferred_vector_dm = model_dm.infer_vector(text_raw)
    # gensim < 4.0 API; gensim 4.x uses model_dm.dv.most_similar(...)
    sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)
    return sims


if __name__ == '__main__':
    x_train = get_corpus()
    # Uncomment for the first run so that 'model_doc2vec' exists before test() loads it.
    # model_dm = train(x_train)
    sims = test()
    for count, sim in sims:
        sentence = x_train[count]
        words = ''
        for word in sentence[0]:
            words = words + word + ' '
        print(words, sim, len(sentence[0]))
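The commit message notes that the results are not great and that the corpus needs further processing. One plausible next step, hinted at by the commented-out lines at the top of doc2vec.py, is to filter the stop words in stop_list.txt (added in this commit) out of corpus_seg.txt before building the TaggedDocuments. A minimal sketch, assuming stop_list.txt holds one stop word per line and corpus_seg.txt is already space-segmented; load_stop_words and get_corpus_filtered are hypothetical helpers, not part of this commit:

import gensim

TaggedDocument = gensim.models.doc2vec.TaggedDocument


def load_stop_words(path='stop_list.txt'):
    # Hypothetical helper: load the stop list into a set for fast membership tests.
    with open(path, 'r') as f:
        return {line.strip() for line in f if line.strip()}


def get_corpus_filtered(corpus_path='corpus_seg.txt', stop_path='stop_list.txt'):
    # Same structure as get_corpus() above, but drops stop words and empty tokens.
    stop_words = load_stop_words(stop_path)
    train_docs = []
    with open(corpus_path, 'r') as doc:
        for i, text in enumerate(doc):
            word_list = [w for w in text.strip().split(' ')
                         if w and w not in stop_words]
            train_docs.append(TaggedDocument(word_list, tags=[i]))
    return train_docs

The same filtering would also need to be applied to the query text in test() before calling infer_vector, so that training and inference see the same vocabulary.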
1208	stop_list.txt	Executable file
File diff suppressed because it is too large