修改稀疏矩阵读取方式,提高建立索引速度,前一次提交复杂度太高
This commit is contained in:
parent
996d126b69
commit
2626dd15e6
@ -12,6 +12,7 @@ from sklearn import feature_extraction
|
||||
from sklearn.feature_extraction.text import TfidfTransformer
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from tools.show import show
|
||||
import numpy as np
|
||||
|
||||
|
||||
class InverseIndex:
|
||||
@ -82,7 +83,10 @@ class InverseIndex:
|
||||
|
||||
#calculate tf-idf
|
||||
def CalcTFIDF(self):
|
||||
docArray = self.loadDataFromCutFile(100)
|
||||
sh = show()
|
||||
count = sh.showcount()
|
||||
docArray = self.loadDataFromCutFile(count)
|
||||
#docArray = self.loadDataFromCutFile(10)
|
||||
vectorizer = CountVectorizer()
|
||||
transformer = TfidfTransformer()
|
||||
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
|
||||
@ -99,17 +103,28 @@ class InverseIndex:
|
||||
f.close()
|
||||
|
||||
colnum = tfidf.shape[1]
|
||||
row = tfidf.shape[0]
|
||||
#for i in range(0,colnum):
|
||||
# filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
||||
# f = open(filename,'a')
|
||||
# idx_list = dict()
|
||||
# for j in range(0,row):
|
||||
# val = tfidf[j,i]
|
||||
# if val > 0:
|
||||
# idx_list[j+1] = val
|
||||
# f.write(json.dumps(idx_list)+'\n')
|
||||
# f.close()
|
||||
#i表示词项的编号,row表示非零文档所在的行
|
||||
for i in range(0,colnum):
|
||||
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
||||
coldata = tfidf.getcol(i)
|
||||
col_nonzero_index = np.nonzero(coldata)
|
||||
item_weight_dict = dict()
|
||||
for row in col_nonzero_index[0]:
|
||||
item_weight_dict[row+1] = coldata[row][0].data[0]
|
||||
f = open(filename,'a')
|
||||
idx_list = dict()
|
||||
for j in range(0,row):
|
||||
val = tfidf[j,i]
|
||||
if val > 0:
|
||||
idx_list[j+1] = val
|
||||
f.write(json.dumps(idx_list)+'\n')
|
||||
f.write(json.dumps(item_weight_dict)+'\n')
|
||||
f.close()
|
||||
print 'item ',i,'calculate done'
|
||||
|
||||
|
||||
def WriteInverseIndex(self,mat):
|
||||
|
11
ml/Search.py
11
ml/Search.py
@ -19,6 +19,16 @@ class Search:
|
||||
return kw_id
|
||||
|
||||
|
||||
def QuerySingle(self,searchWord):
|
||||
idx = self.kw_id[searchWord.decode('utf-8')]
|
||||
cut = Cut()
|
||||
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
|
||||
record =json.loads(ii_line)
|
||||
for rec in record:
|
||||
line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
|
||||
data = json.loads(line)
|
||||
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
|
||||
|
||||
def getQueryItem(self,searchWord):
|
||||
idx = self.kw_id[searchWord.decode('utf-8')]
|
||||
cut = Cut()
|
||||
@ -30,7 +40,6 @@ class Search:
|
||||
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
|
||||
|
||||
|
||||
|
||||
def getInverseRecord(self,item):
|
||||
pass
|
||||
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -7,9 +7,9 @@ from ml.Cut import Cut
|
||||
s = show()
|
||||
#s.showcount()
|
||||
#s.shownews(1)
|
||||
#s.showKeyWord()
|
||||
s.showKeyWord()
|
||||
#s.showitem(2608)
|
||||
|
||||
c = Cut()
|
||||
line = c.getRow(50,Global.cutnews_origin_dir,Global.filesize)
|
||||
s.showitem(line)
|
||||
#c = Cut()
|
||||
#line = c.getRow(50,Global.cutnews_origin_dir,Global.filesize)
|
||||
#s.showitem(line)
|
||||
|
@ -30,6 +30,7 @@ class show:
|
||||
c+=1
|
||||
f.close()
|
||||
print c
|
||||
return c
|
||||
|
||||
def showitem(self,line):
|
||||
data = json.loads(line)
|
||||
@ -42,7 +43,7 @@ class show:
|
||||
data = json.loads(line)
|
||||
print 'load keyword done.'
|
||||
print type(data)
|
||||
for k in data.keys():
|
||||
print k,data[k]
|
||||
|
||||
print len(data)
|
||||
# for k in data.keys():
|
||||
# print k,data[k]
|
||||
|
||||
|
BIN
tools/show.pyc
BIN
tools/show.pyc
Binary file not shown.
Loading…
Reference in New Issue
Block a user