提取关键词写入文件
This commit is contained in:
parent
84a89ec61d
commit
a732b75e58
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,3 +3,4 @@ data/news.db
|
||||
data/news.json
|
||||
data/title.json
|
||||
data/cutnews
|
||||
data/inversedata
|
||||
|
BIN
ml/Cut.pyc
BIN
ml/Cut.pyc
Binary file not shown.
@ -44,7 +44,7 @@ class InverseIndex:
|
||||
count+=1
|
||||
for w in seg_list:
|
||||
if w not in self.worddict:
|
||||
self.worddict[w] = []
|
||||
self.worddict[w] = []
|
||||
if w not in self.stopword:
|
||||
print w,
|
||||
self.worddict[w].append(count)
|
||||
@ -87,14 +87,20 @@ class InverseIndex:
|
||||
|
||||
#calculate tf-idf
|
||||
def CalcTFIDF(self):
|
||||
docArray = self.loadDataFromCutFile(100)
|
||||
docArray = self.loadDataFromCutFile(10000)
|
||||
vectorizer = CountVectorizer()
|
||||
transformer = TfidfTransformer()
|
||||
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
|
||||
print 'done'
|
||||
#write index-doc to file
|
||||
i = 0
|
||||
indexdoc = dict()
|
||||
f = open(Global.inverse_dir+'id.txt','wb')
|
||||
for name in vectorizer.get_feature_names():
|
||||
print name
|
||||
|
||||
i+=1
|
||||
indexdoc[i] = name
|
||||
f.write(json.dumps(indexdoc))
|
||||
|
||||
#test
|
||||
ii = InverseIndex()
|
||||
ii.CalcTFIDF()
|
||||
|
8
test/test_tool.py
Normal file
8
test/test_tool.py
Normal file
@ -0,0 +1,8 @@
|
||||
import sys
|
||||
sys.path.append("..")
|
||||
from tools.show import show
|
||||
|
||||
s = show()
|
||||
#s.showcount()
|
||||
#s.shownews(1)
|
||||
s.showKeyWord()
|
@ -3,6 +3,6 @@ title_dir = project_root+"data/title.json"
|
||||
content_dir=project_root+"data/news.json"
|
||||
db_dir = project_root+"data/news.db"
|
||||
stopword_dir=project_root+"data/stopword.txt"
|
||||
inverse_dir=project_root+"data/inversedata"
|
||||
cutnews_dir=project_root+"data/cutnews"
|
||||
inverse_dir=project_root+"data/inversedata/"
|
||||
cutnews_dir=project_root+"data/cutnews/"
|
||||
filesize = 100
|
||||
|
BIN
tools/Global.pyc
BIN
tools/Global.pyc
Binary file not shown.
@ -36,3 +36,13 @@ class show:
|
||||
print "-->",data['time'],data['title'],data['url'],data['content']
|
||||
|
||||
|
||||
def showKeyWord(self):
|
||||
f = open(Global.inverse_dir+'id.txt','r')
|
||||
line = f.readline()
|
||||
data = json.loads(line)
|
||||
print 'load keyword done.'
|
||||
print type(data)
|
||||
for k in data.keys():
|
||||
print k,data[k]
|
||||
|
||||
|
||||
|
BIN
tools/show.pyc
BIN
tools/show.pyc
Binary file not shown.
Loading…
Reference in New Issue
Block a user