Macropodus/macropodus/summarize/topic_base/topic_lsi.py
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @time :2019/11/2 21:03
# @author :Mo
# @function :topic model of LSI
# @paper :Text summarization using Latent Semantic Analysis
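# Pipeline (as implemented below): split the document into sentences, tokenize
# and remove stop words, build a sentence-level TF-IDF matrix, factor it with
# truncated SVD (latent semantic analysis), then score sentences either within
# the single dominant latent topic (judge_topic truthy) or by each sentence's
# strongest topic (judge_topic falsy), returning the top-scoring sentences.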
from macropodus.preprocess.tools_ml import cut_sentence, macropodus_cut
from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit
from macropodus.data.words_common.stop_words import stop_words
# sklearn
from sklearn.decomposition import TruncatedSVD
import numpy as np


class LSISum:
    def __init__(self):
        # stop word list shipped with macropodus; it is a mapping, so .values() yields the words
        self.stop_words = stop_words.values()
        self.algorithm = 'lsi'
    def summarize(self, text, num=320, topic_min=5, judge_topic='all'):
        """
        Summarize with LSI: rank sentences by their loadings on latent topics
        extracted from a sentence-level TF-IDF matrix via truncated SVD.
        :param text: str or list, raw document or pre-split sentences
        :param num: int, maximum number of sentences to return
        :param topic_min: int, cap on the number of latent topics
        :param judge_topic: truthy -> rank within the single dominant topic;
                            falsy -> rank each sentence by its strongest topic
        :return: list of (score, sentence) tuples, highest score first
        """
        # split into sentences
        if isinstance(text, str):
            self.sentences = cut_sentence(text)
        elif isinstance(text, list):
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        len_sentences_cut = len(self.sentences)
        # tokenize each sentence
        sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                          if word.strip()] for sentence in self.sentences]
        # drop stop words, then re-join the tokens with spaces for the TF-IDF step
        self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
        self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
        # TF-IDF matrix over the sentences
        sen_tfidf = tfidf_fit(self.sentences_cut)
        # number of topics, an empirical heuristic: at most topic_min, at most
        # half the sentence count, and at least 1 so TruncatedSVD stays valid
        topic_num = max(1, min(topic_min, int(len(sentences_cut) / 2)))
        svd_tfidf = TruncatedSVD(n_components=topic_num, n_iter=32)
        res_svd_u = svd_tfidf.fit_transform(sen_tfidf.T)
        res_svd_v = svd_tfidf.components_
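        # Shape note (assuming tfidf_fit returns a sentences-by-terms matrix, as
        # the per-sentence indexing below implies): fitting on sen_tfidf.T makes
        # res_svd_u the term loadings, shape (n_terms, topic_num), which is
        # computed but never used, while res_svd_v = components_ has shape
        # (topic_num, n_sentences): row t is every sentence's loading on topic t.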
        if judge_topic:
            ### scheme 1: keep the top-k sentences of the single dominant topic
            ##################################################################################
            topic_t_score = np.sum(res_svd_v, axis=-1)
            # rank topic indices per column (one sentence, topic_num topics), row 0 = largest
            res_nmf_h_soft = res_svd_v.argsort(axis=0)[-topic_num:][::-1]
            # count the sentences for which each topic ranks highest
            exist = (res_nmf_h_soft <= 0) * 1.0
            factor = np.ones(res_nmf_h_soft.shape[1])
            topic_t_count = np.dot(exist, factor)
            # normalize counts and scores into shares so they can be added
            topic_t_count /= np.sum(topic_t_count, axis=-1)
            topic_t_score /= np.sum(topic_t_score, axis=-1)
            # pick the topic with the largest combined share of sentences and total score
            topic_t_tc = topic_t_count + topic_t_score
            topic_t_tc_argmax = np.argmax(topic_t_tc)
            # final sentence scores come from that dominant topic's row
            res_nmf_h_soft_argmax = res_svd_v[topic_t_tc_argmax].tolist()
            res_combine = {}
            for l in range(len_sentences_cut):
                res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
            score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
            #####################################################################################
        else:
            ### scheme 2: score each sentence by its strongest topic, whichever topic that is
            res_combine = {}
            for i in range(len_sentences_cut):
                res_row_i = res_svd_v[:, i]
                res_row_i_argmax = np.argmax(res_row_i)
                res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
            score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
        # return at most num sentences, never more than 60% of the original count
        num_min = min(num, int(len_sentences_cut * 0.6))
        return score_sen[0:num_min]
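
    # Hypothetical convenience wrapper, not part of the original API: a minimal
    # sketch that strips the scores from summarize()'s (score, sentence) tuples
    # and keeps only the ranked sentences.
    def summarize_sentences(self, text, num=320):
        return [sentence for _score, sentence in self.summarize(text, num=num)]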


if __name__ == '__main__':
    lsi = LSISum()
    doc = "多知网5月26日消息今日方直科技发公告拟用自有资金人民币1.2亿元," \
          "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \
          "该基金认缴出资总规模为人民币3.01亿元。" \
          "基金的出资方式具体如下出资进度方面基金合伙人的出资应于基金成立之日起四年内分四期缴足每期缴付7525万元" \
          "各基金合伙人每期按其出资比例缴付。合伙期限为11年投资目标为教育领域初创期或成长期企业。" \
          "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \
          "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \
          "方直科技2016年营业收入9691万元营业利润1432万元归属于普通股股东的净利润1847万元。多知网 黎珊)}}"
    summary = lsi.summarize(doc, num=8)
    for score_sentence in summary:
        print(score_sentence)
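
    # The summarizer also accepts pre-split sentences (the isinstance(text, list)
    # branch above); a minimal sketch, here also exercising scheme 2 by passing
    # a falsy judge_topic:
    sentences = cut_sentence(doc)
    for score_sentence in lsi.summarize(sentences, num=3, judge_topic=None):
        print(score_sentence)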