提取关键词写入文件

This commit is contained in:
lzjqsdd 2016-04-29 17:56:36 +08:00
parent 84a89ec61d
commit a732b75e58
8 changed files with 31 additions and 6 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ data/news.db
data/news.json
data/title.json
data/cutnews
data/inversedata

Binary file not shown.

View File

@ -44,7 +44,7 @@ class InverseIndex:
count+=1
for w in seg_list:
if w not in self.worddict:
self.worddict[w] = []
self.worddict[w] = []
if w not in self.stopword:
print w,
self.worddict[w].append(count)
@ -87,14 +87,20 @@ class InverseIndex:
#calculate tf-idf
def CalcTFIDF(self):
docArray = self.loadDataFromCutFile(100)
docArray = self.loadDataFromCutFile(10000)
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
print 'done'
#write index-doc to file
i = 0
indexdoc = dict()
f = open(Global.inverse_dir+'id.txt','wb')
for name in vectorizer.get_feature_names():
print name
i+=1
indexdoc[i] = name
f.write(json.dumps(indexdoc))
#test
# Ad-hoc driver: these statements are at module top level, so they run
# whenever this file is executed.
# NOTE(review): they would also run on import -- confirm that is intended.
ii = InverseIndex()
ii.CalcTFIDF()

8
test/test_tool.py Normal file
View File

@ -0,0 +1,8 @@
# Manual check script for tools.show: prints the stored keyword index.
import sys

# Make the parent of the current working directory importable so that the
# `tools` package can be resolved.
sys.path.append("..")

from tools.show import show

viewer = show()
# Other manual checks, enable as needed:
#viewer.showcount()
#viewer.shownews(1)
viewer.showKeyWord()

View File

@ -3,6 +3,6 @@ title_dir = project_root+"data/title.json"
# Data locations, built by plain string concatenation onto project_root
# (project_root is not defined within this excerpt).
content_dir=project_root+"data/news.json"
db_dir = project_root+"data/news.db"
stopword_dir=project_root+"data/stopword.txt"
# Values without a trailing slash; both names are reassigned just below,
# so these two assignments have no lasting effect.
inverse_dir=project_root+"data/inversedata"
cutnews_dir=project_root+"data/cutnews"
# Directory paths end with "/" so that a file name can be appended
# directly, e.g. inverse_dir+'id.txt'.
inverse_dir=project_root+"data/inversedata/"
cutnews_dir=project_root+"data/cutnews/"
filesize = 100

Binary file not shown.

View File

@ -36,3 +36,13 @@ class show:
print "-->",data['time'],data['title'],data['url'],data['content']
def showKeyWord(self):
f = open(Global.inverse_dir+'id.txt','r')
line = f.readline()
data = json.loads(line)
print 'load keyword done.'
print type(data)
for k in data.keys():
print k,data[k]

Binary file not shown.