增加停用词

This commit is contained in:
lzjqsdd 2016-04-26 00:56:46 +08:00
parent d94fb654dd
commit 6c9b4c5051
5 changed files with 1247 additions and 32 deletions

3
ml/Global.py Normal file
View File

@ -0,0 +1,3 @@
# Path to the news-title data file; the indexer reads it line by line and
# parses each line as a JSON object with a 'title' key.
data_dir = "../news_spider/title.json"
# Path to the news database file (presumably SQLite, judging by the .db
# suffix -- TODO confirm; it is not read by the indexing code shown here).
db_dir = "../news_spider/news.db"
# Path to the stop-word list; the indexer reads one entry per line.
stopword_dir="./stopword.txt"

BIN
ml/Global.pyc Normal file

Binary file not shown.

View File

@ -2,25 +2,44 @@
import jieba import jieba
import json import json
import sys import sys
import Global
reload(sys) reload(sys)
sys.setdefaultencoding('utf-8') sys.setdefaultencoding('utf-8')
class InverseIndex:
    """Inverted index from segmented title words to record numbers.

    ``worddict`` maps each word produced by ``jieba.cut`` to the list of
    1-based line numbers (within the title data file) whose title contains
    that word.  Words listed in the stop-word file are left out entirely.
    """

    def __init__(self):
        """Open the data and stop-word files named in ``Global``.

        Raises:
            IOError: if either file cannot be opened.
        """
        self.file_data = open(Global.data_dir)
        self.file_sw = open(Global.stopword_dir)
        self.stopword = []
        self.worddict = dict()

    def loadsw(self):
        """Load the stop-word file into ``self.stopword``, one word per line.

        Each line is echoed to stdout as it is read.  The line terminator is
        removed before the word is stored; without that, no segmented token
        could ever compare equal to a stored stop word.  Empty lines are
        skipped.  The file is closed afterwards, and calling this method
        again is a no-op.
        """
        if self.file_sw.closed:
            return
        try:
            for line in self.file_sw:
                # Echo kept for parity with earlier behaviour; written via
                # sys.stdout so the statement is valid on Python 2 and 3.
                sys.stdout.write(line)
                word = line.rstrip('\r\n')
                if word:
                    self.stopword.append(word)
        finally:
            self.file_sw.close()

    def loaddata(self):
        """Load the stop words, then index every title in the data file.

        Every line of ``self.file_data`` must be a JSON object with a
        ``'title'`` key.  The title is segmented with ``jieba.cut`` in
        full mode, and the line number is appended to the posting list of
        each resulting word that is not a stop word.  The data file is
        closed afterwards, and calling this method again is a no-op.

        Raises:
            ValueError: if a line is not valid JSON.
            KeyError: if a record has no ``'title'`` key.
        """
        self.loadsw()
        if self.file_data.closed:
            return
        count = 0
        try:
            for line in self.file_data:
                data = json.loads(line)
                count += 1
                for w in jieba.cut(data['title'], cut_all=True):
                    # Test the stop list before touching worddict so stop
                    # words do not get an (empty) entry of their own.
                    # NOTE(review): membership is tested against the list
                    # rather than a set on purpose -- under Python 2 the
                    # tokens are presumably unicode while the stored stop
                    # words are byte strings, which can compare equal but
                    # do not hash alike.  Confirm before switching to a set.
                    if w in self.stopword:
                        continue
                    self.worddict.setdefault(w, []).append(count)
        finally:
            self.file_data.close()
# Build the index as soon as this module runs: create the index object (which
# opens the input files) and load the stop words and title data into it.
ii = InverseIndex()
ii.loaddata()

1208
ml/stopword.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,15 +0,0 @@
import re
import time

# Sample text: a "YYYY-MM-DD HH:MM" timestamp surrounded by filler characters.
timee = " - -- 2015-06-15 15:34 "
# Zero-padded day-of-month strings in descending order, '31' down to '01'.
day = list(map("{:02d}".format, range(31, 0, -1)))
# Raw string literal so the backslash in ``\s`` is passed to the regex engine
# unchanged instead of being treated as a (non-existent) string escape.
pattern = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}")
# First timestamp found in the sample; raises IndexError if there is none.
tm = pattern.findall(timee)[0]
# time.mktime interprets the parsed time as local time, so the resulting
# epoch value depends on the time zone of the machine running the script.
a = time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M'))
# Parenthesised form prints the same text under both Python 2 and Python 3.
print(int(a))