支持英文搜索,中文搜索编码存在问题
This commit is contained in:
parent
c5dd35c5aa
commit
d1824516d3
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,4 +3,5 @@ data/news.db
|
||||
data/news.json
|
||||
data/title.json
|
||||
data/cutnews
|
||||
data/orinews
|
||||
data/inversedata
|
||||
|
22
ml/Cut.py
22
ml/Cut.py
@ -35,6 +35,27 @@ class Cut:
|
||||
cut_file.write(json.dumps(data)+'\n')
|
||||
cut_file.close()
|
||||
num+=1
|
||||
|
||||
def cutfileWithoutCut(self,path,fliename,size):
    """Split ``fliename`` into numbered chunk files of ``size`` raw lines each.

    Lines are copied verbatim (no segmentation or re-encoding) into
    ``path/0.txt``, ``path/1.txt``, ... in input order.

    Args:
        path: Output directory; created if it does not exist.
        fliename: Path of the input file to split (parameter name kept
            as-is for backward compatibility with keyword callers).
        size: Maximum number of lines per chunk file; must be positive.

    Raises:
        ValueError: If ``size`` is not positive. Previously this looped
            forever, creating empty files, because no line was ever read.
        IOError/OSError: If the input cannot be opened or a chunk cannot
            be written.

    Note:
        When the input holds an exact multiple of ``size`` lines (including
        an empty input), one trailing empty chunk file is still created;
        this matches the original behaviour.
    """
    if size <= 0:
        raise ValueError('size must be a positive number of lines per chunk')

    # Read bytes and write bytes: the chunks are opened with 'wb', so the
    # input must not be decoded/newline-translated on the way through.
    with open(fliename,'rb') as file_data:
        # Loop-invariant: the directory only has to be ensured once.
        if not os.path.exists(path):
            os.makedirs(path)

        num = 0
        exhausted = False
        while not exhausted:
            cutfilename = path+'/'+str(num)+'.txt'
            # ``with`` guarantees the chunk is closed even if a write fails.
            with open(cutfilename,'wb') as cut_file:
                # Single-argument parenthesised form prints identically on
                # Python 2 (print statement) and Python 3 (print function).
                print('Generate:'+cutfilename+'...')
                for _ in range(size):
                    line = file_data.readline()
                    if not line:
                        # End of input: finish this chunk and stop.
                        exhausted = True
                        break
                    cut_file.write(line)
            num += 1
|
||||
def getRow(self,recordnum,path,size):
|
||||
filenum = (recordnum-1)/size
|
||||
linenum = (recordnum-1)%size+1
|
||||
@ -48,6 +69,7 @@ class Cut:
|
||||
|
||||
#test cutfile
|
||||
#c = Cut()
|
||||
#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize)
|
||||
#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
|
||||
|
||||
#test getRow
|
||||
|
BIN
ml/Cut.pyc
BIN
ml/Cut.pyc
Binary file not shown.
@ -73,7 +73,7 @@ class InverseIndex:
|
||||
data = json.loads(line)
|
||||
# seg_list = jieba.cut(data['content'],cut_all=True)
|
||||
|
||||
keyword = analyse.extract_tags(data['content'],topK=10)
|
||||
keyword = analyse.extract_tags(data['content'],topK=20)
|
||||
seg = " ".join(keyword)
|
||||
print seg
|
||||
doc.append(seg)
|
||||
@ -93,8 +93,8 @@ class InverseIndex:
|
||||
f = open(Global.inverse_dir+'id.txt','wb')
|
||||
word = vectorizer.get_feature_names()
|
||||
for name in vectorizer.get_feature_names():
|
||||
indexdoc[name] = i
|
||||
i+=1
|
||||
indexdoc[name] = i
|
||||
f.write(json.dumps(indexdoc))
|
||||
f.close()
|
||||
|
||||
@ -103,11 +103,11 @@ class InverseIndex:
|
||||
for i in range(0,colnum):
|
||||
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
||||
f = open(filename,'a')
|
||||
idx_list = list()
|
||||
idx_list = dict()
|
||||
for j in range(0,row):
|
||||
val = tfidf[j,i]
|
||||
if val > 0:
|
||||
idx_list[j] = val
|
||||
idx_list[j+1] = val
|
||||
f.write(json.dumps(idx_list)+'\n')
|
||||
f.close()
|
||||
|
||||
|
30
ml/Search.py
30
ml/Search.py
@ -1,12 +1,34 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys
|
||||
import json
|
||||
reload(sys)
|
||||
sys.path.append("..")
|
||||
sys.setdefaultencoding('utf-8')
|
||||
from Cut import Cut
|
||||
import tools.Global as Global
|
||||
|
||||
class Search:
|
||||
def __init__(self):
|
||||
pass
|
||||
self.kw_id = self.loadKW_ID()
|
||||
|
||||
def getQueryItem(self,InputItem):
|
||||
pass
|
||||
def loadKW_ID(self):
    """Load the keyword -> id mapping stored in ``id.txt``.

    Reads the first line of ``Global.inverse_dir + 'id.txt'``, parses it as
    a JSON object and returns a new dict whose keys are the UTF-8 encoded
    keywords and whose values are the ids stored in the file. Every entry
    is also printed to stdout, as before.

    Returns:
        dict: UTF-8 encoded keyword -> id as stored in the JSON object.

    Raises:
        IOError/OSError: If ``id.txt`` cannot be opened.
        ValueError: If the first line is not valid JSON (e.g. empty file).
    """
    # ``with`` closes the handle; the original leaked it.
    with open(Global.inverse_dir+'id.txt') as f:
        line = f.readline()

    # No ``encoding`` argument: UTF-8 is already the default on Python 2,
    # and the keyword was removed from json.loads in Python 3.9.
    kw_id = json.loads(line)

    kwid = dict()
    for ki in kw_id:
        # Keys are stored as UTF-8 byte strings.
        kwid[ki.encode('utf-8')] = kw_id[ki]

    # Dump of the loaded mapping, one "keyword id" pair per line.
    for i in kwid:
        print("%s %s" % (i, kwid[i]))
    return kwid
|
||||
|
||||
|
||||
def getQueryItem(self,searchWord):
    """Look up ``searchWord`` in the keyword index and print the matching record.

    The id stored for the keyword in ``self.kw_id`` is passed to
    ``Cut.getRow`` together with ``Global.cutnews_origin_dir`` and
    ``Global.filesize``; the returned line is parsed as JSON and its
    ``title``, ``time`` and ``content`` fields are printed.

    Args:
        searchWord: Keyword to search for, either as text (unicode) or as
            an already UTF-8 encoded byte string.

    Raises:
        KeyError: If the keyword is not present in ``self.kw_id``.
    """
    # loadKW_ID stores its keys as UTF-8 byte strings, while the caller
    # passes a decoded (unicode) word. ASCII words happened to compare equal
    # on Python 2, but non-ASCII (e.g. Chinese) words never matched. Encode
    # text input so both kinds of keyword resolve to the same key.
    key = searchWord
    if not isinstance(key, bytes):
        key = key.encode('utf-8')
    idx = self.kw_id[key]

    cut = Cut()
    # NOTE(review): getRow is assumed to return one JSON-encoded record line
    # -- confirm against Cut.getRow.
    line = cut.getRow(idx,Global.cutnews_origin_dir,Global.filesize)
    data = json.loads(line)

    # Same bytes as the original ``print a,'\n',b,'\n',c,'\n'`` statement
    # (each field followed by a space and a newline, then a blank line),
    # written so it also parses on Python 3.
    print("%s \n%s \n%s \n" % (data['title'], data['time'], data['content']))
|
||||
|
||||
def getInverseRecord(self,item):
|
||||
pass
|
||||
@ -14,3 +36,5 @@ class Search:
|
||||
def mergeInverseRecord(self,RecordList):
|
||||
pass
|
||||
|
||||
search = Search()
|
||||
search.getQueryItem(sys.argv[1].decode('utf-8'))
|
||||
|
@ -11,6 +11,5 @@ s = show()
|
||||
#s.showitem(2608)
|
||||
|
||||
c = Cut()
|
||||
line = c.getRow(3176,Global.cutnews_dir,Global.filesize)
|
||||
line = c.getRow(2,Global.cutnews_origin_dir,Global.filesize)
|
||||
s.showitem(line)
|
||||
|
||||
|
@ -5,4 +5,5 @@ db_dir = project_root+"data/news.db"
|
||||
stopword_dir=project_root+"data/stopword.txt"
|
||||
inverse_dir=project_root+"data/inversedata/"
|
||||
cutnews_dir=project_root+"data/cutnews/"
|
||||
cutnews_origin_dir=project_root+"data/orinews"
|
||||
filesize = 100
|
||||
|
BIN
tools/Global.pyc
BIN
tools/Global.pyc
Binary file not shown.
Binary file not shown.
BIN
tools/show.pyc
BIN
tools/show.pyc
Binary file not shown.
Loading…
Reference in New Issue
Block a user