none
This commit is contained in:
parent
6c9b4c5051
commit
9da1dac6e8
24
ml/Cut.py
Normal file
24
ml/Cut.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#encoding:utf8
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import Global
|
||||||
|
|
||||||
|
class Cut:
|
||||||
|
def __init__(self):
|
||||||
|
# every 30 news items go into one file
|
||||||
|
self.size = 30
|
||||||
|
self.file_data = open(Global.data_dir)
|
||||||
|
|
||||||
|
def cutfile(self,file):
|
||||||
|
|
||||||
|
num = 0
|
||||||
|
While(True):
|
||||||
|
line = self.file_data.readline()
|
||||||
|
if not line:
|
||||||
|
break
|
||||||
|
|
||||||
|
num+=1
|
||||||
|
filename = str(num/self.size)+'.txt'
|
||||||
|
if num%self.size != 0:
|
||||||
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
|||||||
data_dir = "../news_spider/title.json"
|
data_dir = "../news_spider/title.json"
|
||||||
db_dir = "../news_spider/news.db"
|
db_dir = "../news_spider/news.db"
|
||||||
stopword_dir="./stopword.txt"
|
stopword_dir="./stopword.txt"
|
||||||
|
inverse_dir="./ii.txt"
|
||||||
|
BIN
ml/Global.pyc
BIN
ml/Global.pyc
Binary file not shown.
@ -12,6 +12,7 @@ class InverseIndex:
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.file_data= open(Global.data_dir)
|
self.file_data= open(Global.data_dir)
|
||||||
self.file_sw = open(Global.stopword_dir)
|
self.file_sw = open(Global.stopword_dir)
|
||||||
|
self.ii = open(Global.inverse_dir,'wb')
|
||||||
self.stopword=[]
|
self.stopword=[]
|
||||||
self.worddict = dict()
|
self.worddict = dict()
|
||||||
|
|
||||||
@ -24,7 +25,6 @@ class InverseIndex:
|
|||||||
print line,
|
print line,
|
||||||
|
|
||||||
def loaddata(self):
|
def loaddata(self):
|
||||||
|
|
||||||
self.loadsw()
|
self.loadsw()
|
||||||
count=0
|
count=0
|
||||||
while True:
|
while True:
|
||||||
@ -38,8 +38,13 @@ class InverseIndex:
|
|||||||
if w not in self.worddict:
|
if w not in self.worddict:
|
||||||
self.worddict[w] = []
|
self.worddict[w] = []
|
||||||
if w not in self.stopword:
|
if w not in self.stopword:
|
||||||
|
print w,
|
||||||
self.worddict[w].append(count)
|
self.worddict[w].append(count)
|
||||||
|
|
||||||
|
def write2file(self):
    """Write the inverted index held in ``self.worddict`` to ``self.ii``.

    One line is emitted per key of ``self.worddict``, in the form
    ``<word> <str() of the list stored for that word>`` followed by a
    newline. ``self.ii`` is the output file handle opened in ``__init__``.
    """
    # Both the file handle and the dictionary are instance attributes; the
    # bare names ``ii`` / ``worddict`` would resolve to the module-level
    # InverseIndex instance (which has no ``write``) and an undefined name.
    for w in self.worddict:
        self.ii.write(w + ' ' + str(self.worddict[w]) + '\n')
    # The handle is not closed anywhere in the visible code, so push the
    # buffered data out explicitly once the whole index has been written.
    self.ii.flush()
|
||||||
|
|
||||||
ii = InverseIndex()
|
ii = InverseIndex()
|
||||||
ii.loaddata()
|
ii.loaddata()
|
||||||
|
ii.write2file()
|
||||||
|
Loading…
Reference in New Issue
Block a user