写入索引
This commit is contained in:
parent
a732b75e58
commit
92e20e0957
BIN
ml/Cut.pyc
BIN
ml/Cut.pyc
Binary file not shown.
@ -78,16 +78,9 @@ class InverseIndex:
|
|||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
#save inverse table to file
|
|
||||||
def write2file(self):
|
|
||||||
for w in self.worddict:
|
|
||||||
ii.write(w+' '+str(worddict[w])+'\n')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#calculate tf-idf
|
#calculate tf-idf
|
||||||
def CalcTFIDF(self):
|
def CalcTFIDF(self):
|
||||||
docArray = self.loadDataFromCutFile(10000)
|
docArray = self.loadDataFromCutFile(100)
|
||||||
vectorizer = CountVectorizer()
|
vectorizer = CountVectorizer()
|
||||||
transformer = TfidfTransformer()
|
transformer = TfidfTransformer()
|
||||||
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
|
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
|
||||||
@ -97,9 +90,28 @@ class InverseIndex:
|
|||||||
indexdoc = dict()
|
indexdoc = dict()
|
||||||
f = open(Global.inverse_dir+'id.txt','wb')
|
f = open(Global.inverse_dir+'id.txt','wb')
|
||||||
for name in vectorizer.get_feature_names():
|
for name in vectorizer.get_feature_names():
|
||||||
i+=1
|
|
||||||
indexdoc[i] = name
|
indexdoc[i] = name
|
||||||
|
i+=1
|
||||||
f.write(json.dumps(indexdoc))
|
f.write(json.dumps(indexdoc))
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
colnum = tfidf.shape[1]
|
||||||
|
row = tfidf.shape[0]
|
||||||
|
for i in range(0,colnum):
|
||||||
|
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
||||||
|
f = open(filename,'a')
|
||||||
|
idx_list = dict()
|
||||||
|
for j in range(0,row):
|
||||||
|
val = tfidf[j,i]
|
||||||
|
if val > 0:
|
||||||
|
idx_list[j] = val
|
||||||
|
f.write(json.dumps(idx_list)+'\n')
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
|
def WriteInverseIndex(self,mat):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
#test
|
#test
|
||||||
ii = InverseIndex()
|
ii = InverseIndex()
|
||||||
|
Loading…
Reference in New Issue
Block a user