支持英文搜索,中文搜索编码存在问题
This commit is contained in:
parent
c5dd35c5aa
commit
d1824516d3
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,4 +3,5 @@ data/news.db
|
|||||||
data/news.json
|
data/news.json
|
||||||
data/title.json
|
data/title.json
|
||||||
data/cutnews
|
data/cutnews
|
||||||
|
data/orinews
|
||||||
data/inversedata
|
data/inversedata
|
||||||
|
22
ml/Cut.py
22
ml/Cut.py
@ -35,6 +35,27 @@ class Cut:
|
|||||||
cut_file.write(json.dumps(data)+'\n')
|
cut_file.write(json.dumps(data)+'\n')
|
||||||
cut_file.close()
|
cut_file.close()
|
||||||
num+=1
|
num+=1
|
||||||
|
|
||||||
|
def cutfileWithoutCut(self,path,fliename,size):
    """Split a line-oriented file into numbered chunk files, unmodified.

    Lines of ``fliename`` are copied byte-for-byte, ``size`` lines at a
    time, into ``path/0.txt``, ``path/1.txt``, ... in input order.  The
    directory ``path`` is created when it does not exist.  No chunk file
    is created unless at least one line goes into it.

    Args:
        path: Output directory for the chunk files.
        fliename: Path of the input file (one record per line).
        size: Maximum number of lines per chunk file; must be >= 1.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        # A non-positive size would never consume input and loop forever.
        raise ValueError('size must be a positive number of lines')
    if not os.path.exists(path):
        os.makedirs(path)
    num = 0
    # Binary mode on both ends keeps every line's bytes intact.
    with open(fliename,'rb') as file_data:
        line = file_data.readline()
        # A chunk is opened only while a line is pending, so an input whose
        # length is a multiple of ``size`` leaves no empty trailing file.
        while line:
            cutfilename = path+'/'+str(num)+'.txt'
            print('Generate:'+cutfilename+'...')
            with open(cutfilename,'wb') as cut_file:
                written = 0
                while line and written < size:
                    cut_file.write(line)
                    written += 1
                    line = file_data.readline()
            num += 1
|
||||||
def getRow(self,recordnum,path,size):
|
def getRow(self,recordnum,path,size):
|
||||||
filenum = (recordnum-1)/size
|
filenum = (recordnum-1)/size
|
||||||
linenum = (recordnum-1)%size+1
|
linenum = (recordnum-1)%size+1
|
||||||
@ -48,6 +69,7 @@ class Cut:
|
|||||||
|
|
||||||
#test cutfile
|
#test cutfile
|
||||||
#c = Cut()
|
#c = Cut()
|
||||||
|
#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize)
|
||||||
#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
|
#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
|
||||||
|
|
||||||
#test getRow
|
#test getRow
|
||||||
|
BIN
ml/Cut.pyc
BIN
ml/Cut.pyc
Binary file not shown.
@ -73,7 +73,7 @@ class InverseIndex:
|
|||||||
data = json.loads(line)
|
data = json.loads(line)
|
||||||
# seg_list = jieba.cut(data['content'],cut_all=True)
|
# seg_list = jieba.cut(data['content'],cut_all=True)
|
||||||
|
|
||||||
keyword = analyse.extract_tags(data['content'],topK=10)
|
keyword = analyse.extract_tags(data['content'],topK=20)
|
||||||
seg = " ".join(keyword)
|
seg = " ".join(keyword)
|
||||||
print seg
|
print seg
|
||||||
doc.append(seg)
|
doc.append(seg)
|
||||||
@ -93,8 +93,8 @@ class InverseIndex:
|
|||||||
f = open(Global.inverse_dir+'id.txt','wb')
|
f = open(Global.inverse_dir+'id.txt','wb')
|
||||||
word = vectorizer.get_feature_names()
|
word = vectorizer.get_feature_names()
|
||||||
for name in vectorizer.get_feature_names():
|
for name in vectorizer.get_feature_names():
|
||||||
indexdoc[name] = i
|
|
||||||
i+=1
|
i+=1
|
||||||
|
indexdoc[name] = i
|
||||||
f.write(json.dumps(indexdoc))
|
f.write(json.dumps(indexdoc))
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
@ -103,11 +103,11 @@ class InverseIndex:
|
|||||||
for i in range(0,colnum):
|
for i in range(0,colnum):
|
||||||
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
|
||||||
f = open(filename,'a')
|
f = open(filename,'a')
|
||||||
idx_list = list()
|
idx_list = dict()
|
||||||
for j in range(0,row):
|
for j in range(0,row):
|
||||||
val = tfidf[j,i]
|
val = tfidf[j,i]
|
||||||
if val > 0:
|
if val > 0:
|
||||||
idx_list[j] = val
|
idx_list[j+1] = val
|
||||||
f.write(json.dumps(idx_list)+'\n')
|
f.write(json.dumps(idx_list)+'\n')
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
30
ml/Search.py
30
ml/Search.py
@ -1,12 +1,34 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
|
reload(sys)
|
||||||
|
sys.path.append("..")
|
||||||
|
sys.setdefaultencoding('utf-8')
|
||||||
|
from Cut import Cut
|
||||||
|
import tools.Global as Global
|
||||||
|
|
||||||
class Search:
|
class Search:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
self.kw_id = self.loadKW_ID()
|
||||||
|
|
||||||
def getQueryItem(self,InputItem):
|
def loadKW_ID(self):
    """Load the keyword -> id table from ``Global.inverse_dir + 'id.txt'``.

    The first line of that file is parsed as a JSON object.  The table is
    re-keyed by the UTF-8 encoded form of each keyword, every
    ``keyword id`` pair is printed, and the re-keyed dict is returned.

    Returns:
        dict mapping each UTF-8 encoded keyword to the value stored for it.
    """
    # ``with`` releases the handle even if reading fails.
    with open(Global.inverse_dir+'id.txt') as f:
        line = f.readline()
    # UTF-8 is already the json default, so no explicit ``encoding``
    # argument is passed (newer json versions reject it).
    kw_id = json.loads(line)
    kwid = dict()
    for ki in kw_id:
        kwid[ki.encode('utf-8')] = kw_id[ki]
    for i in kwid:
        print('%s %s' % (i, kwid[i]))
    return kwid
|
||||||
|
|
||||||
|
|
||||||
|
def getQueryItem(self,searchWord):
    """Look up ``searchWord`` in the keyword table and print its record.

    The id stored for the keyword is passed to ``Cut.getRow`` together
    with ``Global.cutnews_origin_dir`` and ``Global.filesize``; the
    returned line is parsed as JSON and its ``title``, ``time`` and
    ``content`` fields are printed.

    Args:
        searchWord: Keyword to search for, as text or UTF-8 bytes.

    Raises:
        KeyError: If the keyword is not in ``self.kw_id`` in any form.
    """
    # The table may be keyed by UTF-8 bytes while the query arrives as
    # decoded text (or the reverse); equal non-ASCII words in different
    # string types never match as dict keys, so try both forms.
    candidates = [searchWord]
    for convert in ('encode', 'decode'):
        method = getattr(searchWord, convert, None)
        if method is None:
            continue
        try:
            candidates.append(method('utf-8'))
        except UnicodeError:
            # Not convertible in this direction; the other forms still apply.
            pass
    for key in candidates:
        if key in self.kw_id:
            idx = self.kw_id[key]
            break
    else:
        raise KeyError(searchWord)
    # NOTE(review): idx is whatever id.txt stores for the keyword, and it
    # is used here as a record number -- confirm that is intended.
    cut = Cut()
    line = cut.getRow(idx,Global.cutnews_origin_dir,Global.filesize)
    data = json.loads(line)
    # The spaces before each newline reproduce the original print output.
    print('%s \n%s \n%s \n' % (data['title'], data['time'], data['content']))
|
||||||
|
|
||||||
def getInverseRecord(self,item):
|
def getInverseRecord(self,item):
|
||||||
pass
|
pass
|
||||||
@ -14,3 +36,5 @@ class Search:
|
|||||||
def mergeInverseRecord(self,RecordList):
|
def mergeInverseRecord(self,RecordList):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
search = Search()
|
||||||
|
search.getQueryItem(sys.argv[1].decode('utf-8'))
|
||||||
|
@ -11,6 +11,5 @@ s = show()
|
|||||||
#s.showitem(2608)
|
#s.showitem(2608)
|
||||||
|
|
||||||
c = Cut()
|
c = Cut()
|
||||||
line = c.getRow(3176,Global.cutnews_dir,Global.filesize)
|
line = c.getRow(2,Global.cutnews_origin_dir,Global.filesize)
|
||||||
s.showitem(line)
|
s.showitem(line)
|
||||||
|
|
||||||
|
@ -5,4 +5,5 @@ db_dir = project_root+"data/news.db"
|
|||||||
stopword_dir=project_root+"data/stopword.txt"
|
stopword_dir=project_root+"data/stopword.txt"
|
||||||
inverse_dir=project_root+"data/inversedata/"
|
inverse_dir=project_root+"data/inversedata/"
|
||||||
cutnews_dir=project_root+"data/cutnews/"
|
cutnews_dir=project_root+"data/cutnews/"
|
||||||
|
cutnews_origin_dir=project_root+"data/orinews"
|
||||||
filesize = 100
|
filesize = 100
|
||||||
|
BIN
tools/Global.pyc
BIN
tools/Global.pyc
Binary file not shown.
Binary file not shown.
BIN
tools/show.pyc
BIN
tools/show.pyc
Binary file not shown.
Loading…
Reference in New Issue
Block a user