修改稀疏矩阵的读取方式,提高建立索引的速度(前一次提交的实现复杂度太高)
This commit is contained in:
parent
996d126b69
commit
2626dd15e6
@ -12,6 +12,7 @@ from sklearn import feature_extraction
|
|||||||
from sklearn.feature_extraction.text import TfidfTransformer
|
from sklearn.feature_extraction.text import TfidfTransformer
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from tools.show import show
|
from tools.show import show
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
class InverseIndex:
|
class InverseIndex:
|
||||||
@ -82,7 +83,10 @@ class InverseIndex:
|
|||||||
|
|
||||||
#calculate tf-idf
|
#calculate tf-idf
|
||||||
def CalcTFIDF(self):
|
def CalcTFIDF(self):
|
||||||
docArray = self.loadDataFromCutFile(100)
|
sh = show()
|
||||||
|
count = sh.showcount()
|
||||||
|
docArray = self.loadDataFromCutFile(count)
|
||||||
|
#docArray = self.loadDataFromCutFile(10)
|
||||||
vectorizer = CountVectorizer()
|
vectorizer = CountVectorizer()
|
||||||
transformer = TfidfTransformer()
|
transformer = TfidfTransformer()
|
||||||
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
|
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
|
||||||
@ -99,18 +103,29 @@ class InverseIndex:
|
|||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
colnum = tfidf.shape[1]
|
colnum = tfidf.shape[1]
|
||||||
row = tfidf.shape[0]
|
#for i in range(0,colnum):
|
||||||
|
# filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
||||||
|
# f = open(filename,'a')
|
||||||
|
# idx_list = dict()
|
||||||
|
# for j in range(0,row):
|
||||||
|
# val = tfidf[j,i]
|
||||||
|
# if val > 0:
|
||||||
|
# idx_list[j+1] = val
|
||||||
|
# f.write(json.dumps(idx_list)+'\n')
|
||||||
|
# f.close()
|
||||||
|
# i is the term (column) index; row is the row index of a document whose tf-idf weight for that term is non-zero
|
||||||
for i in range(0,colnum):
|
for i in range(0,colnum):
|
||||||
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
||||||
|
coldata = tfidf.getcol(i)
|
||||||
|
col_nonzero_index = np.nonzero(coldata)
|
||||||
|
item_weight_dict = dict()
|
||||||
|
for row in col_nonzero_index[0]:
|
||||||
|
item_weight_dict[row+1] = coldata[row][0].data[0]
|
||||||
f = open(filename,'a')
|
f = open(filename,'a')
|
||||||
idx_list = dict()
|
f.write(json.dumps(item_weight_dict)+'\n')
|
||||||
for j in range(0,row):
|
|
||||||
val = tfidf[j,i]
|
|
||||||
if val > 0:
|
|
||||||
idx_list[j+1] = val
|
|
||||||
f.write(json.dumps(idx_list)+'\n')
|
|
||||||
f.close()
|
f.close()
|
||||||
|
print 'item ',i,'calculate done'
|
||||||
|
|
||||||
|
|
||||||
def WriteInverseIndex(self,mat):
|
def WriteInverseIndex(self,mat):
|
||||||
pass
|
pass
|
||||||
|
11
ml/Search.py
11
ml/Search.py
@ -19,7 +19,7 @@ class Search:
|
|||||||
return kw_id
|
return kw_id
|
||||||
|
|
||||||
|
|
||||||
def getQueryItem(self,searchWord):
|
def QuerySingle(self,searchWord):
|
||||||
idx = self.kw_id[searchWord.decode('utf-8')]
|
idx = self.kw_id[searchWord.decode('utf-8')]
|
||||||
cut = Cut()
|
cut = Cut()
|
||||||
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
|
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
|
||||||
@ -29,6 +29,15 @@ class Search:
|
|||||||
data = json.loads(line)
|
data = json.loads(line)
|
||||||
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
|
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
|
||||||
|
|
||||||
|
def getQueryItem(self,searchWord):
|
||||||
|
idx = self.kw_id[searchWord.decode('utf-8')]
|
||||||
|
cut = Cut()
|
||||||
|
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
|
||||||
|
record =json.loads(ii_line)
|
||||||
|
for rec in record:
|
||||||
|
line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
|
||||||
|
data = json.loads(line)
|
||||||
|
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
|
||||||
|
|
||||||
|
|
||||||
def getInverseRecord(self,item):
|
def getInverseRecord(self,item):
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -7,9 +7,9 @@ from ml.Cut import Cut
|
|||||||
s = show()
|
s = show()
|
||||||
#s.showcount()
|
#s.showcount()
|
||||||
#s.shownews(1)
|
#s.shownews(1)
|
||||||
#s.showKeyWord()
|
s.showKeyWord()
|
||||||
#s.showitem(2608)
|
#s.showitem(2608)
|
||||||
|
|
||||||
c = Cut()
|
#c = Cut()
|
||||||
line = c.getRow(50,Global.cutnews_origin_dir,Global.filesize)
|
#line = c.getRow(50,Global.cutnews_origin_dir,Global.filesize)
|
||||||
s.showitem(line)
|
#s.showitem(line)
|
||||||
|
@ -30,6 +30,7 @@ class show:
|
|||||||
c+=1
|
c+=1
|
||||||
f.close()
|
f.close()
|
||||||
print c
|
print c
|
||||||
|
return c
|
||||||
|
|
||||||
def showitem(self,line):
|
def showitem(self,line):
|
||||||
data = json.loads(line)
|
data = json.loads(line)
|
||||||
@ -42,7 +43,7 @@ class show:
|
|||||||
data = json.loads(line)
|
data = json.loads(line)
|
||||||
print 'load keyword done.'
|
print 'load keyword done.'
|
||||||
print type(data)
|
print type(data)
|
||||||
for k in data.keys():
|
print len(data)
|
||||||
print k,data[k]
|
# for k in data.keys():
|
||||||
|
# print k,data[k]
|
||||||
|
|
||||||
|
BIN
tools/show.pyc
BIN
tools/show.pyc
Binary file not shown.
Loading…
Reference in New Issue
Block a user