增加停用词
This commit is contained in:
parent
d94fb654dd
commit
6c9b4c5051
3
ml/Global.py
Normal file
3
ml/Global.py
Normal file
@ -0,0 +1,3 @@
|
||||
# Shared input locations for the ml scripts.
# All three values are relative paths, so they resolve against the process's
# current working directory at the time the file is opened.
# NOTE: despite the ``_dir`` suffix each value names a file, not a directory.

# Title dump; read line by line as JSON by the inverted-index builder.
data_dir = "../news_spider/title.json"

# Database file produced by the spider (presumably SQLite -- confirm).
db_dir = "../news_spider/news.db"

# Stop-word list, one word per line.
stopword_dir = "./stopword.txt"
|
BIN
ml/Global.pyc
Normal file
BIN
ml/Global.pyc
Normal file
Binary file not shown.
@ -2,25 +2,44 @@
|
||||
import jieba
|
||||
import json
|
||||
import sys
|
||||
|
||||
import Global
|
||||
reload(sys)
|
||||
sys.setdefaultencoding('utf-8')
|
||||
|
||||
|
||||
file = open('../news_spider/title.json')
|
||||
worddict = dict()
|
||||
count=0
|
||||
while True:
|
||||
line = file.readline()
|
||||
class InverseIndex:
    """Inverted index from title tokens to the records that contain them.

    The data file is read as one JSON object per line; each object's
    ``'title'`` is segmented with ``jieba.cut(..., cut_all=True)``.  Every
    token becomes a key of ``worddict`` whose value is the list of 1-based
    record numbers in which the token occurred (repeats are kept).  Tokens
    that are stop words still get a key, but their posting list stays empty.
    """

    def __init__(self):
        """Open the data file and the stop-word file named in ``Global``."""
        self.file_data = open(Global.data_dir)
        self.file_sw = open(Global.stopword_dir)
        # Stop words, one entry per non-empty line of the stop-word file.
        self.stopword = []
        # token -> list of record numbers.
        self.worddict = dict()

    def close(self):
        """Close both input files; safe to call more than once."""
        for handle in (self.file_data, self.file_sw):
            handle.close()

    def loadsw(self):
        """Append the stop words from ``file_sw`` to ``stopword``.

        The file is closed once it has been consumed; calling this method
        again afterwards is a no-op.
        """
        if self.file_sw.closed:
            return
        try:
            for line in self.file_sw:
                # Drop only the line terminator: tokens never carry one, so
                # an entry stored with its newline could never match.
                word = line.rstrip("\r\n")
                if not word:
                    continue
                # Python 2 yields byte strings here while jieba yields
                # unicode; decode so hashed lookups compare equal.  UTF-8
                # matches the default encoding this module installs.
                if isinstance(word, bytes):
                    word = word.decode("utf-8")
                self.stopword.append(word)
        finally:
            self.file_sw.close()

    def loaddata(self):
        """Load the stop words, then index every record of ``file_data``.

        Record numbers start at 1 and count only non-blank lines.  The data
        file is closed when indexing finishes or fails; a second call is a
        no-op.

        Raises:
            ValueError: if a non-blank line is not valid JSON.
            KeyError: if a record has no ``'title'`` key.
        """
        self.loadsw()
        if self.file_data.closed:
            return

        # Hash-based membership instead of scanning the list per token.
        stopwords = frozenset(self.stopword)
        count = 0
        try:
            for line in self.file_data:
                if not line.strip():
                    # Blank lines carry no record; skip instead of letting
                    # json.loads fail on them.
                    continue
                data = json.loads(line)
                count += 1
                for w in jieba.cut(data['title'], cut_all=True):
                    postings = self.worddict.setdefault(w, [])
                    if w not in stopwords:
                        postings.append(count)
        finally:
            self.file_data.close()
|
||||
|
||||
for i in worddict:
|
||||
print i
|
||||
|
||||
# Script driver: construct the index and populate it from the input files
# as soon as this module is executed.
ii = InverseIndex()
ii.loaddata()
|
||||
|
1208
ml/stopword.txt
Normal file
1208
ml/stopword.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,15 +0,0 @@
|
||||
import re
|
||||
import time
|
||||
|
||||
# Scratch check: extract a "YYYY-MM-DD HH:MM" timestamp from a noisy string
# and print it as whole seconds since the epoch.
timee = " - -- 2015-06-15 15:34 "

# Two-digit day-of-month strings in descending order (not used below).
day = ['31', '30', '29', '28', '27', '26', '25', '24', '23', '22', '21',
       '20', '19', '18', '17', '16', '15', '14', '13', '12', '11', '10',
       '09', '08', '07', '06', '05', '04', '03', '02', '01']

# Raw string so that ``\s`` reaches the regex engine without relying on
# Python passing an unrecognised string escape through unchanged.
pattern = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
tm = pattern.findall(timee)[0]

# mktime() interprets the parsed fields as local time, so the printed value
# depends on the machine's time zone.
a = time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M'))
# Parenthesised single-argument form prints identically on Python 2 and 3.
print(int(a))
|
Loading…
Reference in New Issue
Block a user