#!/usr/bin/python
# -*- coding: UTF-8 -*-
# @time     : 2019/11/2 21:03
# @author   : Mo
# @function : topic model of LSI
# @paper    : Text summarization using Latent Semantic Analysis
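# Pipeline: split the text into sentences, tokenize them and strip stop words, build
# a sentence-level TF-IDF matrix, factor it with a truncated SVD (the LSA step), then
# rank sentences by their weights in the resulting latent topics.
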
from macropodus.preprocess.tools_ml import cut_sentence, macropodus_cut
from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit
from macropodus.data.words_common.stop_words import stop_words
# sklearn
from sklearn.decomposition import TruncatedSVD
import numpy as np


class LSISum:
    def __init__(self):
        self.stop_words = stop_words.values()
        self.algorithm = 'lsi'

    def summarize(self, text, num=320, topic_min=5, judge_topic='all'):
        """
        Summarize text with LSI: rank sentences by their weights in the latent
        topics of a truncated SVD over the sentence TF-IDF matrix.
        :param text: str or list, the raw document or a pre-split list of sentences
        :param num: int, maximum number of sentences to return
        :param topic_min: int, upper bound on the number of latent topics
        :param judge_topic: truthy selects scheme 1 (rank within the single strongest topic);
                            falsy selects scheme 2 (score each sentence by its own best topic)
        :return: list of (score, sentence) tuples, sorted by score descending
        """
        # split into sentences
        if isinstance(text, str):
            self.sentences = cut_sentence(text)
        elif isinstance(text, list):
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        len_sentences_cut = len(self.sentences)
        # tokenize (extract_chinese keeps only the Chinese characters of each sentence)
        sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                          if word.strip()] for sentence in self.sentences]
        # remove stop words
        self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
        # each sentence becomes a space-joined token string, the input format tfidf_fit expects
        self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
        # TF-IDF vector of every sentence
        sen_tfidf = tfidf_fit(self.sentences_cut)
        # number of topics: an empirical cap, at most topic_min and at most half the sentence count
        topic_num = min(topic_min, int(len(sentences_cut) / 2))
        svd_tfidf = TruncatedSVD(n_components=topic_num, n_iter=32)
        # LSA step: factor the (term x sentence) matrix
        res_svd_u = svd_tfidf.fit_transform(sen_tfidf.T)  # term-topic weights
        res_svd_v = svd_tfidf.components_                 # topic-sentence weights, shape (topic_num, n_sentences)
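        # e.g. with 10 sentences and topic_num = 5, res_svd_v is a 5 x 10 matrix whose
        # entry [t, i] is the weight of sentence i in latent topic t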

        if judge_topic:
            ### scheme 1: take the top sentences of the single strongest topic
            ##################################################################################
            # total score of each topic
            topic_t_score = np.sum(res_svd_v, axis=-1)
            # sort topics per column (one sentence, topic_num topics); row 0 of the
            # reversed argsort is each sentence's strongest topic
            res_svd_rank = res_svd_v.argsort(axis=0)[-topic_num:][::-1]
            # count, for each topic, the sentences that have it as their strongest topic
            topic_t_count = np.bincount(res_svd_rank[0], minlength=topic_num).astype(np.float64)
            # normalize both statistics to shares
            topic_t_count /= np.sum(topic_t_count, axis=-1)
            topic_t_score /= np.sum(topic_t_score, axis=-1)
            # combine sentence-count share and total-score share; the largest sum picks the topic
            topic_t_tc = topic_t_count + topic_t_score
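            # e.g. if the count shares are [0.5, 0.3, 0.2] and the score shares are
            # [0.2, 0.5, 0.3], then topic_t_tc = [0.7, 0.8, 0.5] and topic 1 is chosen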
            topic_t_tc_argmax = np.argmax(topic_t_tc)
            # final score of each sentence: its weight in the winning topic
            res_svd_v_max = res_svd_v[topic_t_tc_argmax].tolist()
            res_combine = {}
            for l in range(len_sentences_cut):
                res_combine[self.sentences[l]] = res_svd_v_max[l]
            score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
            #####################################################################################
        else:
            ### scheme 2: score every sentence by its strongest topic, regardless of which topic that is
            res_combine = {}
            for i in range(len_sentences_cut):
                res_row_i = res_svd_v[:, i]
                res_row_i_argmax = np.argmax(res_row_i)
                res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
            score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
        # return at most num sentences, capped at 60% of the document
        num_min = min(num, int(len_sentences_cut * 0.6))
        return score_sen[0:num_min]


if __name__ == '__main__':
    lsi = LSISum()
    # sample input: a Chinese news snippet (the pipeline keeps Chinese characters only)
    doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \
          "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \
          "该基金认缴出资总规模为人民币3.01亿元。" \
          "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \
          "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \
          "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \
          "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \
          "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)"
    summary = lsi.summarize(doc, num=8)
    for i in summary:
        print(i)
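
    # summarize also accepts a pre-split list of sentences, e.g. via cut_sentence:
    summary_list = lsi.summarize(cut_sentence(doc), num=8)
    for i in summary_list:
        print(i)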