This commit is contained in:
lzjqsdd 2016-04-27 23:21:28 +08:00
parent 6c9b4c5051
commit 9da1dac6e8
4 changed files with 31 additions and 1 deletions

24
ml/Cut.py Normal file
View File

@ -0,0 +1,24 @@
#encoding:utf8
import json
import sys
import Global
class Cut:
def __init__(self):
    """Set the chunk size and open the data file named by Global.data_dir."""
    # Every 30 news items go into one file.
    self.size = 30
    # Opened with open()'s default mode; the handle is kept on the instance.
    source_path = Global.data_dir
    self.file_data = open(source_path)
def cutfile(self,file):
num = 0
While(True):
line = self.file_data.readline()
if not line:
break
num+=1
filename = str(num/self.size)+'.txt'
if num%self.size != 0:

View File

@ -1,3 +1,4 @@
data_dir = "../news_spider/title.json" data_dir = "../news_spider/title.json"
db_dir = "../news_spider/news.db" db_dir = "../news_spider/news.db"
stopword_dir="./stopword.txt" stopword_dir="./stopword.txt"
inverse_dir="./ii.txt"

Binary file not shown.

View File

@ -12,6 +12,7 @@ class InverseIndex:
def __init__(self): def __init__(self):
self.file_data= open(Global.data_dir) self.file_data= open(Global.data_dir)
self.file_sw = open(Global.stopword_dir) self.file_sw = open(Global.stopword_dir)
self.ii = open(Global.inverse_dir,'wb')
self.stopword=[] self.stopword=[]
self.worddict = dict() self.worddict = dict()
@ -24,7 +25,6 @@ class InverseIndex:
print line, print line,
def loaddata(self): def loaddata(self):
self.loadsw() self.loadsw()
count=0 count=0
while True: while True:
@ -38,8 +38,13 @@ class InverseIndex:
if w not in self.worddict: if w not in self.worddict:
self.worddict[w] = [] self.worddict[w] = []
if w not in self.stopword: if w not in self.stopword:
print w,
self.worddict[w].append(count) self.worddict[w].append(count)
def write2file(self):
    """Write every entry of ``self.worddict`` to the ``self.ii`` file handle.

    One line is written per word, in the form ``<word> <value>`` followed by
    a newline, where ``<value>`` is ``str()`` of the entry stored for that
    word in ``self.worddict``. Nothing is written when the dictionary is
    empty. The handle is left open.
    """
    # The output handle and the dictionary are instance attributes; the
    # unqualified names ``ii`` and ``worddict`` are not locals of this method,
    # so they must be reached through ``self``.
    for w in self.worddict:
        self.ii.write(w+' '+str(self.worddict[w])+'\n')
ii = InverseIndex() ii = InverseIndex()
ii.loaddata() ii.loaddata()
ii.write2file()