Add phrase query
This commit is contained in:
parent
2626dd15e6
commit
4692601b5b
@ -29,7 +29,7 @@ class Cut:
|
||||
flag = 1
|
||||
break
|
||||
data = json.loads(line)
|
||||
seg_list = jieba.cut(data['content'],cut_all=True)
|
||||
seg_list = jieba.cut(data['content'],cut_all=False)
|
||||
result = ' '.join(seg_list)
|
||||
data['content'] = result
|
||||
cut_file.write(json.dumps(data)+'\n')
|
||||
|
BIN
ml/Cut.pyc
BIN
ml/Cut.pyc
Binary file not shown.
@ -42,7 +42,7 @@ class InverseIndex:
|
||||
if not line:
|
||||
break
|
||||
data = json.loads(line)
|
||||
seg_list = list(jieba.cut(data['title'], cut_all=True))
|
||||
seg_list = list(jieba.cut(data['title'], cut_all=False))
|
||||
count+=1
|
||||
for w in seg_list:
|
||||
if w not in self.worddict:
|
||||
@ -59,7 +59,7 @@ class InverseIndex:
|
||||
if not line:
|
||||
break
|
||||
data = json.loads(line)
|
||||
seg_list = list(jieba.cut(data['title'],cut_all=True))
|
||||
seg_list = list(jieba.cut(data['title'],cut_all=False))
|
||||
doc.append(seg_list)
|
||||
return doc
|
||||
|
||||
@ -72,8 +72,6 @@ class InverseIndex:
|
||||
if not line:
|
||||
break
|
||||
data = json.loads(line)
|
||||
# seg_list = jieba.cut(data['content'],cut_all=True)
|
||||
|
||||
keyword = analyse.extract_tags(data['content'],topK=20)
|
||||
seg = " ".join(keyword)
|
||||
print seg
|
||||
@ -132,6 +130,6 @@ class InverseIndex:
|
||||
|
||||
|
||||
#test
|
||||
ii = InverseIndex()
|
||||
ii.CalcTFIDF()
|
||||
#ii = InverseIndex()
|
||||
#ii.CalcTFIDF()
|
||||
#ii.loadDataFromCutFile(20)
|
||||
|
74
ml/Search.py
74
ml/Search.py
@ -19,32 +19,62 @@ class Search:
|
||||
return kw_id
|
||||
|
||||
|
||||
def QuerySingle(self,searchWord):
|
||||
idx = self.kw_id[searchWord.decode('utf-8')]
|
||||
#返回文档号
|
||||
def QuerySingle(self,searchWord,ishow=True):
    """Look up a single term and return its inverted-index record.

    searchWord -- UTF-8 encoded byte string; it is decoded before being
                  looked up in self.kw_id.
    ishow      -- when true, print title, time and content of every entry
                  in the record, or a notice when the term is unknown.
                  Defaults to True, matching QueryPhrase.

    Returns the JSON-decoded inverted-index row for the term. When the term
    is not in self.kw_id an empty dict is returned instead, so callers must
    check for an empty result.
    """
    # Decode once; the original decoded the same argument twice.
    key = searchWord.decode('utf-8')

    if key not in self.kw_id:
        # The original tested the undefined name `isshow` here, which raised
        # NameError for every unknown term.
        if ishow:
            print('Not Exists Record!')
        return dict()

    idx = self.kw_id[key]
    cut = Cut()
    ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
    record = json.loads(ii_line)

    if ishow:
        for rec in record:
            line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
            data = json.loads(line)
            # One pre-formatted string: title, time and content each on
            # their own line, followed by a blank line.
            print('%s \n%s \n%s \n' % (data['title'],data['time'],data['content']))

    # Inverted-index record (posting list) for the single term.
    return record
|
||||
|
||||
|
||||
#'与'查询:先分词,再合并倒排记录,不考虑权重,返回文档号
|
||||
def QueryPhrase(self,searchPhrase,ishow = True):
|
||||
words = jieba.cut(searchPhrase.decode('utf-8'),cut_all=False)
|
||||
cut = Cut()
|
||||
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
|
||||
record =json.loads(ii_line)
|
||||
for rec in record:
|
||||
line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
|
||||
data = json.loads(line)
|
||||
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
|
||||
result = set(range(1,100000))
|
||||
for word in words:
|
||||
if not self.kw_id.has_key(word):
|
||||
print 'Not Exist Record'
|
||||
return set()
|
||||
idx = self.kw_id[word]
|
||||
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
|
||||
record =json.loads(ii_line)
|
||||
re = set()
|
||||
for rec in record:
|
||||
re.add(int(rec))
|
||||
result = result & re
|
||||
print result
|
||||
if ishow:
|
||||
if len(result) == 0:
|
||||
print 'Not Exists Record!'
|
||||
else:
|
||||
for rst in result:
|
||||
line = cut.getRow(int(rst),Global.cutnews_origin_dir,Global.filesize)
|
||||
data = json.loads(line)
|
||||
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
|
||||
return result
|
||||
|
||||
def getQueryItem(self,searchWord):
|
||||
idx = self.kw_id[searchWord.decode('utf-8')]
|
||||
cut = Cut()
|
||||
ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
|
||||
record =json.loads(ii_line)
|
||||
for rec in record:
|
||||
line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
|
||||
data = json.loads(line)
|
||||
print data['title'],'\n',data['time'],'\n',data['content'],'\n'
|
||||
|
||||
|
||||
def getInverseRecord(self,item):
|
||||
#返回热点新闻
|
||||
def QueryHotNews(self):
|
||||
pass
|
||||
|
||||
def mergeInverseRecord(self,RecordList):
|
||||
#返回最新新闻
|
||||
def QueryByTime(self):
|
||||
pass
|
||||
|
||||
search = Search()
|
||||
search.getQueryItem(sys.argv[1])
|
||||
#search.QueryPhrase(sys.argv[1])
|
||||
search.QueryPhrase(sys.argv[1])
|
||||
|
Loading…
Reference in New Issue
Block a user