修改稀疏矩阵读取方式,提高建立索引速度,前一次提交复杂度太高

This commit is contained in:
lzjqsdd 2016-05-05 21:29:25 +08:00
parent 996d126b69
commit 2626dd15e6
16 changed files with 42 additions and 17 deletions

View File

@ -12,6 +12,7 @@ from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tools.show import show
import numpy as np
class InverseIndex:
@ -82,7 +83,10 @@ class InverseIndex:
#calculate tf-idf
def CalcTFIDF(self):
docArray = self.loadDataFromCutFile(100)
sh = show()
count = sh.showcount()
docArray = self.loadDataFromCutFile(count)
#docArray = self.loadDataFromCutFile(10)
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
@ -99,18 +103,29 @@ class InverseIndex:
f.close()
colnum = tfidf.shape[1]
row = tfidf.shape[0]
#for i in range(0,colnum):
# filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
# f = open(filename,'a')
# idx_list = dict()
# for j in range(0,row):
# val = tfidf[j,i]
# if val > 0:
# idx_list[j+1] = val
# f.write(json.dumps(idx_list)+'\n')
# f.close()
# i is the term (column) index; row is the index of a document row whose weight for that term is non-zero
for i in range(0,colnum):
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
coldata = tfidf.getcol(i)
col_nonzero_index = np.nonzero(coldata)
item_weight_dict = dict()
for row in col_nonzero_index[0]:
item_weight_dict[row+1] = coldata[row][0].data[0]
f = open(filename,'a')
idx_list = dict()
for j in range(0,row):
val = tfidf[j,i]
if val > 0:
idx_list[j+1] = val
f.write(json.dumps(idx_list)+'\n')
f.write(json.dumps(item_weight_dict)+'\n')
f.close()
print 'item ',i,'calculate done'
def WriteInverseIndex(self,mat):
pass

View File

@ -19,7 +19,7 @@ class Search:
return kw_id
def getQueryItem(self,searchWord):
def QuerySingle(self,searchWord):
idx = self.kw_id[searchWord.decode('utf-8')]
cut = Cut()
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
@ -29,6 +29,15 @@ class Search:
data = json.loads(line)
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
def getQueryItem(self,searchWord):
idx = self.kw_id[searchWord.decode('utf-8')]
cut = Cut()
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
record =json.loads(ii_line)
for rec in record:
line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
data = json.loads(line)
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
def getInverseRecord(self,item):

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -7,9 +7,9 @@ from ml.Cut import Cut
s = show()
#s.showcount()
#s.shownews(1)
#s.showKeyWord()
s.showKeyWord()
#s.showitem(2608)
c = Cut()
line = c.getRow(50,Global.cutnews_origin_dir,Global.filesize)
s.showitem(line)
#c = Cut()
#line = c.getRow(50,Global.cutnews_origin_dir,Global.filesize)
#s.showitem(line)

View File

@ -30,6 +30,7 @@ class show:
c+=1
f.close()
print c
return c
def showitem(self,line):
data = json.loads(line)
@ -42,7 +43,7 @@ class show:
data = json.loads(line)
print 'load keyword done.'
print type(data)
for k in data.keys():
print k,data[k]
print len(data)
# for k in data.keys():
# print k,data[k]

Binary file not shown.