提取关键词写入文件

This commit is contained in:
lzjqsdd 2016-04-29 17:56:36 +08:00
parent 84a89ec61d
commit a732b75e58
8 changed files with 31 additions and 6 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ data/news.db
data/news.json data/news.json
data/title.json data/title.json
data/cutnews data/cutnews
data/inversedata

Binary file not shown.

View File

@ -87,13 +87,19 @@ class InverseIndex:
#calculate tf-idf #calculate tf-idf
def CalcTFIDF(self):
    """Fit the TF-IDF pipeline and write the feature-name index to disk.

    The input returned by ``self.loadDataFromCutFile(10000)`` is fed to a
    ``CountVectorizer`` followed by a ``TfidfTransformer``. The vectorizer's
    feature names are then written, as a single JSON object mapping a
    1-based index to each name, to ``Global.inverse_dir + 'id.txt'``.
    """
    docArray = self.loadDataFromCutFile(10000)
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    # NOTE(review): the resulting matrix is not used further in this method.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
    print('done')
    # write index-doc to file
    # Build the mapping before opening the file so that a failure here does
    # not leave a truncated id.txt behind.
    indexdoc = dict(enumerate(vectorizer.get_feature_names(), 1))
    # The context manager guarantees the handle is flushed and closed.
    with open(Global.inverse_dir+'id.txt','wb') as f:
        f.write(json.dumps(indexdoc))
#test #test
ii = InverseIndex() ii = InverseIndex()

8
test/test_tool.py Normal file
View File

@ -0,0 +1,8 @@
"""Manual check script: builds a ``show`` instance and calls ``showKeyWord()``."""
import os
import sys

# Anchor the import path to this file's parent directory instead of the
# current working directory, so the script also runs when launched from
# somewhere other than the directory containing it.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
from tools.show import show

s = show()
# Alternative checks, enabled by hand when needed.
#s.showcount()
#s.shownews(1)
s.showKeyWord()

View File

@ -3,6 +3,6 @@ title_dir = project_root+"data/title.json"
content_dir=project_root+"data/news.json" content_dir=project_root+"data/news.json"
db_dir = project_root+"data/news.db" db_dir = project_root+"data/news.db"
stopword_dir=project_root+"data/stopword.txt" stopword_dir=project_root+"data/stopword.txt"
inverse_dir=project_root+"data/inversedata" inverse_dir=project_root+"data/inversedata/"
cutnews_dir=project_root+"data/cutnews" cutnews_dir=project_root+"data/cutnews/"
filesize = 100 filesize = 100

Binary file not shown.

View File

@ -36,3 +36,13 @@ class show:
print "-->",data['time'],data['title'],data['url'],data['content'] print "-->",data['time'],data['title'],data['url'],data['content']
def showKeyWord(self):
    """Print every entry of the keyword index stored in ``id.txt``.

    Reads the first line of ``Global.inverse_dir + 'id.txt'``, parses it as
    JSON, prints a progress message and the parsed object's type, then
    prints each key together with its value.
    """
    # Use a context manager so the file handle is always released.
    with open(Global.inverse_dir+'id.txt','r') as f:
        line = f.readline()
    data = json.loads(line)
    print('load keyword done.')
    print(type(data))
    for k in data:
        # Single pre-formatted argument: same "key value" output as the
        # original two-item print statement.
        print('%s %s' % (k, data[k]))

Binary file not shown.