暂未解决数据问题

This commit is contained in:
lzjqsdd 2016-05-03 18:40:45 +08:00
parent 69dfb7b4b1
commit c5dd35c5aa
3 changed files with 22 additions and 10 deletions

View File

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import jieba
import jieba.analyse as analyse
import json
import sys
reload(sys)
@ -70,11 +71,12 @@ class InverseIndex:
if not line:
break
data = json.loads(line)
seg_list = jieba.cut(data['content'],cut_all=True)
for seg in seg_list:
seg=''.join(seg.split())
if(seg!='' and seg!="\n" and seg!="\n\n"):
doc.append(seg)
# seg_list = jieba.cut(data['content'],cut_all=True)
keyword = analyse.extract_tags(data['content'],topK=10)
seg = " ".join(keyword)
print seg
doc.append(seg)
return doc
@ -89,25 +91,26 @@ class InverseIndex:
i = 0
indexdoc = dict()
f = open(Global.inverse_dir+'id.txt','wb')
word = vectorizer.get_feature_names()
for name in vectorizer.get_feature_names():
indexdoc[name] = i
i+=1
f.write(json.dumps(indexdoc))
f.close()
colnum = tfidf.shape[1]
colnum = tfidf.shape[1]
row = tfidf.shape[0]
for i in range(0,colnum):
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
f = open(filename,'a'
idx_list = dict()
f = open(filename,'a')
idx_list = list()
for j in range(0,row):
val = tfidf[j,i]
if val > 0:
idx_list[j] = val
f.write(json.dumps(idx_list)+'\n')
f.close()
# Stub: accepts `mat` but the visible body is only `pass`, so nothing is
# written yet. NOTE(review): presumably meant to persist the inverse index
# built from `mat` -- confirm the intended behaviour before implementing.
def WriteInverseIndex(self,mat):
pass
@ -116,3 +119,4 @@ class InverseIndex:
# test: ad-hoc smoke run -- build an InverseIndex and call CalcTFIDF().
# NOTE(review): these statements appear to sit at module level, in which case
# they execute whenever the module is imported -- confirm that is intended.
ii = InverseIndex()
ii.CalcTFIDF()
# Alternative call, currently disabled:
#ii.loadDataFromCutFile(20)

View File

@ -1,8 +1,16 @@
import sys
sys.path.append("..")
from tools.show import show
import tools.Global as Global
from ml.Cut import Cut
# Manual inspection script: create the `show` helper and call its display
# methods; the commented-out calls are alternative views kept for convenience.
s = show()
#s.showcount()
#s.shownews(1)
s.showKeyWord()
#s.showKeyWord()
#s.showitem(2608)
# Fetch row 3176 with Cut.getRow, passing Global.cutnews_dir and
# Global.filesize, then hand the result to showitem.
# NOTE(review): presumably Global.filesize is the number of rows stored per
# file -- confirm against tools/Global.
c = Cut()
line = c.getRow(3176,Global.cutnews_dir,Global.filesize)
s.showitem(line)

Binary file not shown.