暂未解决数据问题
This commit is contained in:
parent
69dfb7b4b1
commit
c5dd35c5aa
@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import jieba
|
import jieba
|
||||||
|
import jieba.analyse as analyse
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
reload(sys)
|
reload(sys)
|
||||||
@ -70,11 +71,12 @@ class InverseIndex:
|
|||||||
if not line:
|
if not line:
|
||||||
break
|
break
|
||||||
data = json.loads(line)
|
data = json.loads(line)
|
||||||
seg_list = jieba.cut(data['content'],cut_all=True)
|
# seg_list = jieba.cut(data['content'],cut_all=True)
|
||||||
for seg in seg_list:
|
|
||||||
seg=''.join(seg.split())
|
keyword = analyse.extract_tags(data['content'],topK=10)
|
||||||
if(seg!='' and seg!="\n" and seg!="\n\n"):
|
seg = " ".join(keyword)
|
||||||
doc.append(seg)
|
print seg
|
||||||
|
doc.append(seg)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
@ -89,25 +91,26 @@ class InverseIndex:
|
|||||||
i = 0
|
i = 0
|
||||||
indexdoc = dict()
|
indexdoc = dict()
|
||||||
f = open(Global.inverse_dir+'id.txt','wb')
|
f = open(Global.inverse_dir+'id.txt','wb')
|
||||||
|
word = vectorizer.get_feature_names()
|
||||||
for name in vectorizer.get_feature_names():
|
for name in vectorizer.get_feature_names():
|
||||||
indexdoc[name] = i
|
indexdoc[name] = i
|
||||||
i+=1
|
i+=1
|
||||||
f.write(json.dumps(indexdoc))
|
f.write(json.dumps(indexdoc))
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
colnum = tfidf.shape[1]
|
colnum = tfidf.shape[1]
|
||||||
row = tfidf.shape[0]
|
row = tfidf.shape[0]
|
||||||
for i in range(0,colnum):
|
for i in range(0,colnum):
|
||||||
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
||||||
f = open(filename,'a'
|
f = open(filename,'a')
|
||||||
idx_list = dict()
|
idx_list = list()
|
||||||
for j in range(0,row):
|
for j in range(0,row):
|
||||||
val = tfidf[j,i]
|
val = tfidf[j,i]
|
||||||
if val > 0:
|
if val > 0:
|
||||||
idx_list[j] = val
|
idx_list[j] = val
|
||||||
f.write(json.dumps(idx_list)+'\n')
|
f.write(json.dumps(idx_list)+'\n')
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
def WriteInverseIndex(self,mat):
|
def WriteInverseIndex(self,mat):
|
||||||
pass
|
pass
|
||||||
@ -116,3 +119,4 @@ class InverseIndex:
|
|||||||
#test
|
#test
|
||||||
ii = InverseIndex()
|
ii = InverseIndex()
|
||||||
ii.CalcTFIDF()
|
ii.CalcTFIDF()
|
||||||
|
#ii.loadDataFromCutFile(20)
|
||||||
|
@ -1,8 +1,16 @@
|
|||||||
import sys
|
import sys
|
||||||
sys.path.append("..")
|
sys.path.append("..")
|
||||||
from tools.show import show
|
from tools.show import show
|
||||||
|
import tools.Global as Global
|
||||||
|
from ml.Cut import Cut
|
||||||
|
|
||||||
s = show()
|
s = show()
|
||||||
#s.showcount()
|
#s.showcount()
|
||||||
#s.shownews(1)
|
#s.shownews(1)
|
||||||
s.showKeyWord()
|
#s.showKeyWord()
|
||||||
|
#s.showitem(2608)
|
||||||
|
|
||||||
|
c = Cut()
|
||||||
|
line = c.getRow(3176,Global.cutnews_dir,Global.filesize)
|
||||||
|
s.showitem(line)
|
||||||
|
|
||||||
|
BIN
tools/show.pyc
BIN
tools/show.pyc
Binary file not shown.
Loading…
Reference in New Issue
Block a user