NewsSpider/ml/InverseIndex.py

# -*- coding: utf-8 -*-
import jieba
import jieba.analyse as analyse
import json
import sys
reload(sys)
sys.path.append("..")
sys.setdefaultencoding('utf-8')
import tools.Global as Global
from Cut import Cut
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tools.show import show
import numpy as np


class InverseIndex:
    def __init__(self):
        self.file_data = open(Global.title_dir)
        self.file_sw = open(Global.stopword_dir)
        #self.ii = open(Global.inverse_dir,'wb')
        self.stopword = []
        self.worddict = dict()

    #load stopword list
    def loadsw(self):
        while True:
            line = self.file_sw.readline()
            if not line:
                break
            # strip the trailing newline so membership tests against
            # segmented words actually match
            self.stopword.append(line.strip())
            print line,

    #load origin data (news.json, title.json) and build an in-memory
    #inverted index: term -> list of title ids (1-based line numbers)
    def CalcInverseIndex(self):
        self.loadsw()
        count = 0
        while True:
            line = self.file_data.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'], cut_all=False))
            count += 1
            for w in seg_list:
                if w not in self.worddict:
                    self.worddict[w] = []
                if w not in self.stopword:
                    print w,
                    self.worddict[w].append(count)
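
    # Hypothetical example of the result: after CalcInverseIndex runs,
    # self.worddict[u'北京'] == [3, 17, 42] would mean the term appears in
    # titles 3, 17 and 42 (stopwords keep an empty list and are never appended to).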

    def loadDataFromFile(self):
        doc = []
        f = open(Global.content_dir, 'r')
        while True:
            line = f.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'], cut_all=False))
            doc.append(seg_list)
        f.close()
        return doc

    def loadDataFromCutFile(self, totalnum):
        doc = []
        cut = Cut()
        # rows are 1-based; iterate up to totalnum inclusive so the last
        # document is not skipped (the loop still breaks on an empty row)
        for i in range(1, totalnum + 1):
            line = cut.getRow(i, Global.cutnews_dir, Global.filesize)
            if not line:
                break
            data = json.loads(line)
            # keep the top 20 keywords of each article as its document text
            keyword = analyse.extract_tags(data['content'], topK=20)
            seg = " ".join(keyword)
            print seg
            doc.append(seg)
        return doc
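
    # loadDataFromCutFile returns one space-joined keyword string per article,
    # which is the "one document per string" input CountVectorizer expects.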

    #calculate tf-idf over the keyword documents
    def CalcTFIDF(self):
        sh = show()
        count = sh.showcount()
        docArray = self.loadDataFromCutFile(count)
        #docArray = self.loadDataFromCutFile(10)
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
        print 'done'

        #write the term -> index mapping (indexdoc) to file
        i = 0
        indexdoc = dict()
        f = open(Global.inverse_dir + 'id.txt', 'wb')
        word = vectorizer.get_feature_names()
        for name in word:
            i += 1
            indexdoc[name] = i
        f.write(json.dumps(indexdoc))
        f.close()
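
        # At this point indexdoc maps every feature term to a 1-based id that
        # follows the column order of the tf-idf matrix, e.g. (hypothetical
        # terms) {u'奥运': 1, u'北京': 2, ...}.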

        colnum = tfidf.shape[1]
        #for i in range(0,colnum):
        #    filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
        #    f = open(filename,'a')
        #    idx_list = dict()
        #    for j in range(0,row):
        #        val = tfidf[j,i]
        #        if val > 0:
        #            idx_list[j+1] = val
        #    f.write(json.dumps(idx_list)+'\n')
        #    f.close()
        # i is the term (column) index; row is the row of a document whose weight is nonzero
        for i in range(0, colnum):
            filename = Global.inverse_dir + str(i / Global.filesize) + '.txt'
            coldata = tfidf.getcol(i)
            col_nonzero_index = np.nonzero(coldata)
            item_weight_dict = dict()
            for row in col_nonzero_index[0]:
                item_weight_dict[row + 1] = coldata[row, 0]
            # note: shards are opened in append mode, so delete any old
            # shard files before re-running CalcTFIDF
            f = open(filename, 'a')
            f.write(json.dumps(item_weight_dict) + '\n')
            f.close()
            print 'item ', i, 'calculate done'

    def WriteInverseIndex(self, mat):
        pass


#test
#ii = InverseIndex()
#ii.CalcTFIDF()
#ii.loadDataFromCutFile(20)
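
# The helper below is not part of the original module: it is a minimal sketch,
# assuming the on-disk layout written by CalcTFIDF above, of how a single-term
# query could be answered from the inverted index files. id.txt holds a JSON
# dict {term: 1-based id}; shard k (Global.inverse_dir + str(k) + '.txt') holds
# one JSON line per term column, in column order, each line mapping a 1-based
# document row to its tf-idf weight. The name lookup_term is hypothetical.
def lookup_term(term):
    with open(Global.inverse_dir + 'id.txt') as fid:
        indexdoc = json.load(fid)
    if term not in indexdoc:
        return {}
    col = indexdoc[term] - 1                   # back to the 0-based column index
    shard = Global.inverse_dir + str(col / Global.filesize) + '.txt'
    offset = col % Global.filesize             # 0-based line inside the shard
    with open(shard) as fshard:
        for lineno, line in enumerate(fshard):
            if lineno == offset:
                return json.loads(line)        # {doc row (as a string): weight}
    return {}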