提取关键词写入文件
This commit is contained in:
parent
84a89ec61d
commit
a732b75e58
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,3 +3,4 @@ data/news.db
|
|||||||
data/news.json
|
data/news.json
|
||||||
data/title.json
|
data/title.json
|
||||||
data/cutnews
|
data/cutnews
|
||||||
|
data/inversedata
|
||||||
|
BIN
ml/Cut.pyc
BIN
ml/Cut.pyc
Binary file not shown.
@ -44,7 +44,7 @@ class InverseIndex:
|
|||||||
count+=1
|
count+=1
|
||||||
for w in seg_list:
|
for w in seg_list:
|
||||||
if w not in self.worddict:
|
if w not in self.worddict:
|
||||||
self.worddict[w] = []
|
self.worddict[w] = []
|
||||||
if w not in self.stopword:
|
if w not in self.stopword:
|
||||||
print w,
|
print w,
|
||||||
self.worddict[w].append(count)
|
self.worddict[w].append(count)
|
||||||
@ -87,14 +87,20 @@ class InverseIndex:
|
|||||||
|
|
||||||
#calculate tf-idf
|
#calculate tf-idf
|
||||||
def CalcTFIDF(self):
|
def CalcTFIDF(self):
|
||||||
docArray = self.loadDataFromCutFile(100)
|
docArray = self.loadDataFromCutFile(10000)
|
||||||
vectorizer = CountVectorizer()
|
vectorizer = CountVectorizer()
|
||||||
transformer = TfidfTransformer()
|
transformer = TfidfTransformer()
|
||||||
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
|
tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
|
||||||
print 'done'
|
print 'done'
|
||||||
|
#write index-doc to file
|
||||||
|
i = 0
|
||||||
|
indexdoc = dict()
|
||||||
|
f = open(Global.inverse_dir+'id.txt','wb')
|
||||||
for name in vectorizer.get_feature_names():
|
for name in vectorizer.get_feature_names():
|
||||||
print name
|
i+=1
|
||||||
|
indexdoc[i] = name
|
||||||
|
f.write(json.dumps(indexdoc))
|
||||||
|
|
||||||
#test
|
#test
|
||||||
ii = InverseIndex()
|
ii = InverseIndex()
|
||||||
ii.CalcTFIDF()
|
ii.CalcTFIDF()
|
||||||
|
8
test/test_tool.py
Normal file
8
test/test_tool.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append("..")
|
||||||
|
from tools.show import show
|
||||||
|
|
||||||
|
s = show()
|
||||||
|
#s.showcount()
|
||||||
|
#s.shownews(1)
|
||||||
|
s.showKeyWord()
|
@ -3,6 +3,6 @@ title_dir = project_root+"data/title.json"
|
|||||||
content_dir=project_root+"data/news.json"
|
content_dir=project_root+"data/news.json"
|
||||||
db_dir = project_root+"data/news.db"
|
db_dir = project_root+"data/news.db"
|
||||||
stopword_dir=project_root+"data/stopword.txt"
|
stopword_dir=project_root+"data/stopword.txt"
|
||||||
inverse_dir=project_root+"data/inversedata"
|
inverse_dir=project_root+"data/inversedata/"
|
||||||
cutnews_dir=project_root+"data/cutnews"
|
cutnews_dir=project_root+"data/cutnews/"
|
||||||
filesize = 100
|
filesize = 100
|
||||||
|
BIN
tools/Global.pyc
BIN
tools/Global.pyc
Binary file not shown.
@ -36,3 +36,13 @@ class show:
|
|||||||
print "-->",data['time'],data['title'],data['url'],data['content']
|
print "-->",data['time'],data['title'],data['url'],data['content']
|
||||||
|
|
||||||
|
|
||||||
|
def showKeyWord(self):
|
||||||
|
f = open(Global.inverse_dir+'id.txt','r')
|
||||||
|
line = f.readline()
|
||||||
|
data = json.loads(line)
|
||||||
|
print 'load keyword done.'
|
||||||
|
print type(data)
|
||||||
|
for k in data.keys():
|
||||||
|
print k,data[k]
|
||||||
|
|
||||||
|
|
||||||
|
BIN
tools/show.pyc
BIN
tools/show.pyc
Binary file not shown.
Loading…
Reference in New Issue
Block a user