支持英文搜索,中文搜索编码存在问题

This commit is contained in:
lzjqsdd 2016-05-04 00:46:51 +08:00
parent c5dd35c5aa
commit d1824516d3
10 changed files with 56 additions and 9 deletions

1
.gitignore vendored
View File

@ -3,4 +3,5 @@ data/news.db
data/news.json data/news.json
data/title.json data/title.json
data/cutnews data/cutnews
data/orinews
data/inversedata data/inversedata

View File

@ -35,6 +35,27 @@ class Cut:
cut_file.write(json.dumps(data)+'\n') cut_file.write(json.dumps(data)+'\n')
cut_file.close() cut_file.close()
num+=1 num+=1
def cutfileWithoutCut(self,path,fliename,size):
    """Split a file into numbered chunk files of at most `size` lines each.

    Lines are copied verbatim (no word segmentation) into
    ``path/0.txt``, ``path/1.txt``, ... in input order.

    Args:
        path: Output directory; created if it does not exist.
        fliename: Path of the input file to split (parameter name kept
            as-is for backward compatibility).
        size: Maximum number of lines per chunk file; must be positive.

    Raises:
        ValueError: If `size` is not positive. Without this guard the
            loop would never reach end-of-file and would create empty
            chunk files forever.

    Note:
        A chunk file is opened before its first line is read, so when the
        input holds an exact multiple of `size` lines (including zero
        lines) the last chunk file produced is empty.
    """
    if size <= 0:
        raise ValueError('size must be a positive number of lines')
    # Binary mode on both ends so every line is copied byte-for-byte.
    with open(fliename,'rb') as file_data:
        # The output directory only needs to be created once, not per chunk.
        if not os.path.exists(path):
            os.makedirs(path)
        num = 0
        finished = False
        while not finished:
            cutfilename = path+'/'+str(num)+'.txt'
            # `with` guarantees the chunk is closed even if a write fails.
            with open(cutfilename,'wb') as cut_file:
                print('Generate:'+cutfilename+'...')
                for _ in range(0,size):
                    line = file_data.readline()
                    if not line:
                        # End of input: finish this chunk and stop.
                        finished = True
                        break
                    cut_file.write(line)
            num+=1
def getRow(self,recordnum,path,size): def getRow(self,recordnum,path,size):
filenum = (recordnum-1)/size filenum = (recordnum-1)/size
linenum = (recordnum-1)%size+1 linenum = (recordnum-1)%size+1
@ -48,6 +69,7 @@ class Cut:
#test cutfile #test cutfile
#c = Cut() #c = Cut()
#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize)
#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize) #c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
#test getRow #test getRow

Binary file not shown.

View File

@ -73,7 +73,7 @@ class InverseIndex:
data = json.loads(line) data = json.loads(line)
# seg_list = jieba.cut(data['content'],cut_all=True) # seg_list = jieba.cut(data['content'],cut_all=True)
keyword = analyse.extract_tags(data['content'],topK=10) keyword = analyse.extract_tags(data['content'],topK=20)
seg = " ".join(keyword) seg = " ".join(keyword)
print seg print seg
doc.append(seg) doc.append(seg)
@ -93,8 +93,8 @@ class InverseIndex:
f = open(Global.inverse_dir+'id.txt','wb') f = open(Global.inverse_dir+'id.txt','wb')
word = vectorizer.get_feature_names() word = vectorizer.get_feature_names()
for name in vectorizer.get_feature_names(): for name in vectorizer.get_feature_names():
indexdoc[name] = i
i+=1 i+=1
indexdoc[name] = i
f.write(json.dumps(indexdoc)) f.write(json.dumps(indexdoc))
f.close() f.close()
@ -103,11 +103,11 @@ class InverseIndex:
for i in range(0,colnum): for i in range(0,colnum):
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt' filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
f = open(filename,'a') f = open(filename,'a')
idx_list = list() idx_list = dict()
for j in range(0,row): for j in range(0,row):
val = tfidf[j,i] val = tfidf[j,i]
if val > 0: if val > 0:
idx_list[j] = val idx_list[j+1] = val
f.write(json.dumps(idx_list)+'\n') f.write(json.dumps(idx_list)+'\n')
f.close() f.close()

View File

@ -1,12 +1,34 @@
# -*- coding: utf-8 -*-
import sys import sys
import json import json
reload(sys)
sys.path.append("..")
sys.setdefaultencoding('utf-8')
from Cut import Cut
import tools.Global as Global
class Search: class Search:
def __init__(self): def __init__(self):
pass self.kw_id = self.loadKW_ID()
def getQueryItem(self,InputItem): def loadKW_ID(self):
pass f = open(Global.inverse_dir+'id.txt')
line = f.readline()
kw_id = json.loads(line, encoding='utf-8')
kwid = dict()
for ki in kw_id:
kwid[ki.encode('utf-8')] = kw_id[ki]
for i in kwid:
print i,kwid[i]
return kwid
def getQueryItem(self,searchWord):
idx = self.kw_id[searchWord]
cut = Cut()
line = cut.getRow(idx,Global.cutnews_origin_dir,Global.filesize)
data = json.loads(line)
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
def getInverseRecord(self,item): def getInverseRecord(self,item):
pass pass
@ -14,3 +36,5 @@ class Search:
def mergeInverseRecord(self,RecordList): def mergeInverseRecord(self,RecordList):
pass pass
search = Search()
search.getQueryItem(sys.argv[1].decode('utf-8'))

View File

@ -11,6 +11,5 @@ s = show()
#s.showitem(2608) #s.showitem(2608)
c = Cut() c = Cut()
line = c.getRow(3176,Global.cutnews_dir,Global.filesize) line = c.getRow(2,Global.cutnews_origin_dir,Global.filesize)
s.showitem(line) s.showitem(line)

View File

@ -5,4 +5,5 @@ db_dir = project_root+"data/news.db"
stopword_dir=project_root+"data/stopword.txt" stopword_dir=project_root+"data/stopword.txt"
inverse_dir=project_root+"data/inversedata/" inverse_dir=project_root+"data/inversedata/"
cutnews_dir=project_root+"data/cutnews/" cutnews_dir=project_root+"data/cutnews/"
cutnews_origin_dir=project_root+"data/orinews"
filesize = 100 filesize = 100

Binary file not shown.

Binary file not shown.

Binary file not shown.