Add phrase query

This commit is contained in:
lzjqsdd 2016-05-05 23:04:30 +08:00
parent 2626dd15e6
commit 4692601b5b
4 changed files with 57 additions and 29 deletions

View File

@ -29,7 +29,7 @@ class Cut:
flag = 1
break
data = json.loads(line)
seg_list = jieba.cut(data['content'],cut_all=True)
seg_list = jieba.cut(data['content'],cut_all=False)
result = ' '.join(seg_list)
data['content'] = result
cut_file.write(json.dumps(data)+'\n')

Binary file not shown.

View File

@ -42,7 +42,7 @@ class InverseIndex:
if not line:
break
data = json.loads(line)
seg_list = list(jieba.cut(data['title'], cut_all=True))
seg_list = list(jieba.cut(data['title'], cut_all=False))
count+=1
for w in seg_list:
if w not in self.worddict:
@ -59,7 +59,7 @@ class InverseIndex:
if not line:
break
data = json.loads(line)
seg_list = list(jieba.cut(data['title'],cut_all=True))
seg_list = list(jieba.cut(data['title'],cut_all=False))
doc.append(seg_list)
return doc
@ -72,8 +72,6 @@ class InverseIndex:
if not line:
break
data = json.loads(line)
# seg_list = jieba.cut(data['content'],cut_all=True)
keyword = analyse.extract_tags(data['content'],topK=20)
seg = " ".join(keyword)
print seg
@ -132,6 +130,6 @@ class InverseIndex:
#test
ii = InverseIndex()
ii.CalcTFIDF()
#ii = InverseIndex()
#ii.CalcTFIDF()
#ii.loadDataFromCutFile(20)

View File

@ -19,32 +19,62 @@ class Search:
return kw_id
def QuerySingle(self,searchWord):
idx = self.kw_id[searchWord.decode('utf-8')]
#返回文档号
def QuerySingle(self,searchWord,ishow):
if self.kw_id.has_key(searchWord.decode('utf-8')):
idx = self.kw_id[searchWord.decode('utf-8')]
cut = Cut()
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
record =json.loads(ii_line)
if ishow:
for rec in record:
line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
data = json.loads(line)
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
#返回单个词项对应的倒排记录表
return record
else:
if isshow:
print 'Not Exists Record!'
#调用该函数后需要对结果进行判断
return dict()
#'与'查询:先分词,再合并倒排记录,不考虑权重,返回文档号
def QueryPhrase(self,searchPhrase,ishow = True):
words = jieba.cut(searchPhrase.decode('utf-8'),cut_all=False)
cut = Cut()
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
record =json.loads(ii_line)
for rec in record:
line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
data = json.loads(line)
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
result = set(range(1,100000))
for word in words:
if not self.kw_id.has_key(word):
print 'Not Exist Record'
return set()
idx = self.kw_id[word]
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
record =json.loads(ii_line)
re = set()
for rec in record:
re.add(int(rec))
result = result & re
print result
if ishow:
if len(result) == 0:
print 'Not Exists Record!'
else:
for rst in result:
line = cut.getRow(int(rst),Global.cutnews_origin_dir,Global.filesize)
data = json.loads(line)
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
return result
def getQueryItem(self,searchWord):
idx = self.kw_id[searchWord.decode('utf-8')]
cut = Cut()
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
record =json.loads(ii_line)
for rec in record:
line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
data = json.loads(line)
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
def getInverseRecord(self,item):
#返回热点新闻
def QueryHotNews(self):
pass
def mergeInverseRecord(self,RecordList):
#返回最新新闻
def QueryByTime(self):
pass
search = Search()
search.getQueryItem(sys.argv[1])
#search.QueryPhrase(sys.argv[1])
search.QueryPhrase(sys.argv[1])