none
This commit is contained in:
parent
6c9b4c5051
commit
9da1dac6e8
24
ml/Cut.py
Normal file
24
ml/Cut.py
Normal file
@ -0,0 +1,24 @@
|
||||
#encoding:utf8
|
||||
import json
|
||||
import sys
|
||||
import Global
|
||||
|
||||
class Cut:
    """Split the news title file into numbered chunk files.

    The input is the file named by ``Global.data_dir``; it is read one line
    at a time and every ``self.size`` consecutive lines are written to a
    file named ``<chunk index>.txt`` (``0.txt``, ``1.txt``, ...) in the
    current working directory.
    """

    def __init__(self):
        # every 30 news in a file
        self.size = 30
        self.file_data = open(Global.data_dir)

    def cutfile(self, file):
        """Write the lines of ``self.file_data`` into chunk files.

        NOTE(review): the original loop was unfinished (it ended on an
        ``if`` without a body); the write step below is reconstructed from
        the "every 30 news in a file" comment and the ``<index>.txt`` name
        expression -- confirm it matches the intended behaviour.

        Args:
            file: not used by the original code; kept so existing callers
                keep working.
        """
        num = 0      # number of lines already written
        out = None   # handle of the chunk file currently being filled
        try:
            while True:
                line = self.file_data.readline()
                if not line:
                    break

                # Start a new chunk file every ``self.size`` lines.  The
                # index is derived from the count *before* incrementing so
                # the first chunk holds a full ``self.size`` lines.
                if num % self.size == 0:
                    if out is not None:
                        out.close()
                    filename = str(num // self.size) + '.txt'
                    out = open(filename, 'w')

                out.write(line)
                num += 1
        finally:
            # Close the last chunk even if reading or writing failed.
            if out is not None:
                out.close()
|
||||
|
||||
|
@ -1,3 +1,4 @@
|
||||
# Paths shared by the ml scripts, written relative to the ml/ directory.

# News titles; opened for reading by Cut and InverseIndex.
data_dir = "../news_spider/title.json"

# Database file of the news spider (not referenced by the code shown here).
db_dir = "../news_spider/news.db"

# Stop-word list; opened for reading by InverseIndex.
stopword_dir = "./stopword.txt"

# Inverted-index output; opened for binary writing by InverseIndex.
inverse_dir = "./ii.txt"
|
||||
|
BIN
ml/Global.pyc
BIN
ml/Global.pyc
Binary file not shown.
@ -12,6 +12,7 @@ class InverseIndex:
|
||||
def __init__(self):
|
||||
self.file_data= open(Global.data_dir)
|
||||
self.file_sw = open(Global.stopword_dir)
|
||||
self.ii = open(Global.inverse_dir,'wb')
|
||||
self.stopword=[]
|
||||
self.worddict = dict()
|
||||
|
||||
@ -24,7 +25,6 @@ class InverseIndex:
|
||||
print line,
|
||||
|
||||
def loaddata(self):
|
||||
|
||||
self.loadsw()
|
||||
count=0
|
||||
while True:
|
||||
@ -38,8 +38,13 @@ class InverseIndex:
|
||||
if w not in self.worddict:
|
||||
self.worddict[w] = []
|
||||
if w not in self.stopword:
|
||||
print w,
|
||||
self.worddict[w].append(count)
|
||||
|
||||
def write2file(self):
    """Write the inverted index to the ``self.ii`` output file.

    One line is written per key of ``self.worddict``, in the form
    ``<word> <str() of the value stored for that word>``.

    The original body referenced bare ``ii`` and ``worddict``; neither is
    a local name, so the call failed at runtime.  Both are instance
    attributes and are now accessed through ``self``.
    """
    for w in self.worddict:
        self.ii.write(w + ' ' + str(self.worddict[w]) + '\n')
    # Push buffered lines to the file so the index is complete on return.
    self.ii.flush()
|
||||
|
||||
# Script driver: construct an InverseIndex, call loaddata() to fill it, then
# call write2file() to write the result out. Runs at import time as well as
# when the file is executed directly, because there is no __main__ guard.
ii = InverseIndex()
|
||||
ii.loaddata()
|
||||
ii.write2file()
|
||||
|
Loading…
Reference in New Issue
Block a user