暂未解决数据问题 (The data problem is not yet resolved)

This commit is contained in:
lzjqsdd 2016-05-03 18:40:45 +08:00
parent 69dfb7b4b1
commit c5dd35c5aa
3 changed files with 22 additions and 10 deletions

View File

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import jieba import jieba
import jieba.analyse as analyse
import json import json
import sys import sys
reload(sys) reload(sys)
@ -70,11 +71,12 @@ class InverseIndex:
if not line: if not line:
break break
data = json.loads(line) data = json.loads(line)
seg_list = jieba.cut(data['content'],cut_all=True) # seg_list = jieba.cut(data['content'],cut_all=True)
for seg in seg_list:
seg=''.join(seg.split()) keyword = analyse.extract_tags(data['content'],topK=10)
if(seg!='' and seg!="\n" and seg!="\n\n"): seg = " ".join(keyword)
doc.append(seg) print seg
doc.append(seg)
return doc return doc
@ -89,25 +91,26 @@ class InverseIndex:
i = 0 i = 0
indexdoc = dict() indexdoc = dict()
f = open(Global.inverse_dir+'id.txt','wb') f = open(Global.inverse_dir+'id.txt','wb')
word = vectorizer.get_feature_names()
for name in vectorizer.get_feature_names(): for name in vectorizer.get_feature_names():
indexdoc[name] = i indexdoc[name] = i
i+=1 i+=1
f.write(json.dumps(indexdoc)) f.write(json.dumps(indexdoc))
f.close() f.close()
colnum = tfidf.shape[1] colnum = tfidf.shape[1]
row = tfidf.shape[0] row = tfidf.shape[0]
for i in range(0,colnum): for i in range(0,colnum):
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt' filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
f = open(filename,'a' f = open(filename,'a')
idx_list = dict() idx_list = list()
for j in range(0,row): for j in range(0,row):
val = tfidf[j,i] val = tfidf[j,i]
if val > 0: if val > 0:
idx_list[j] = val idx_list[j] = val
f.write(json.dumps(idx_list)+'\n') f.write(json.dumps(idx_list)+'\n')
f.close() f.close()
def WriteInverseIndex(self,mat): def WriteInverseIndex(self,mat):
pass pass
@ -116,3 +119,4 @@ class InverseIndex:
#test #test
ii = InverseIndex() ii = InverseIndex()
ii.CalcTFIDF() ii.CalcTFIDF()
#ii.loadDataFromCutFile(20)

View File

@ -1,8 +1,16 @@
import sys import sys
sys.path.append("..") sys.path.append("..")
from tools.show import show from tools.show import show
import tools.Global as Global
from ml.Cut import Cut
s = show() s = show()
#s.showcount() #s.showcount()
#s.shownews(1) #s.shownews(1)
s.showKeyWord() #s.showKeyWord()
#s.showitem(2608)
c = Cut()
line = c.getRow(3176,Global.cutnews_dir,Global.filesize)
s.showitem(line)

Binary file not shown.