diff --git a/ml/Cut.py b/ml/Cut.py
index 756de13..897861f 100644
--- a/ml/Cut.py
+++ b/ml/Cut.py
@@ -29,7 +29,7 @@ class Cut:
                     flag = 1
                     break
                 data = json.loads(line)
-                seg_list = jieba.cut(data['content'],cut_all=True)
+                seg_list = jieba.cut(data['content'],cut_all=False)
                 result = ' '.join(seg_list)
                 data['content'] = result
                 cut_file.write(json.dumps(data)+'\n')
diff --git a/ml/Cut.pyc b/ml/Cut.pyc
index 2c41f81..c6ef393 100644
Binary files a/ml/Cut.pyc and b/ml/Cut.pyc differ
diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py
index d2c3e8e..9323219 100644
--- a/ml/InverseIndex.py
+++ b/ml/InverseIndex.py
@@ -42,7 +42,7 @@ class InverseIndex:
             if not line:
                 break
             data = json.loads(line)
-            seg_list = list(jieba.cut(data['title'], cut_all=True))
+            seg_list = list(jieba.cut(data['title'], cut_all=False))
             count+=1
             for w in seg_list:
                 if w not in self.worddict:
@@ -59,7 +59,7 @@ class InverseIndex:
             if not line:
                 break
             data = json.loads(line)
-            seg_list = list(jieba.cut(data['title'],cut_all=True))
+            seg_list = list(jieba.cut(data['title'],cut_all=False))
             doc.append(seg_list)
         return doc
 
@@ -72,8 +72,6 @@ class InverseIndex:
             if not line:
                 break
             data = json.loads(line)
-#            seg_list = jieba.cut(data['content'],cut_all=True)
-
             keyword = analyse.extract_tags(data['content'],topK=20)
             seg = " ".join(keyword)
             print seg
@@ -132,6 +130,6 @@ class InverseIndex:
 
 
 #test
-ii = InverseIndex()
-ii.CalcTFIDF()
+#ii = InverseIndex()
+#ii.CalcTFIDF()
 #ii.loadDataFromCutFile(20)
diff --git a/ml/Search.py b/ml/Search.py
index f25413e..ce74d4e 100644
--- a/ml/Search.py
+++ b/ml/Search.py
@@ -19,32 +19,62 @@ class Search:
         return kw_id
 
 
-    def QuerySingle(self,searchWord):
-        idx = self.kw_id[searchWord.decode('utf-8')]
-        cut = Cut()
-        ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
-        record =json.loads(ii_line)
-        for rec in record:
-            line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
-            data = json.loads(line)
-            print data['title'],'\n',data['time'],'\n',data['content'],'\n'
+    # returns document ids
+    def QuerySingle(self,searchWord,ishow):
+        if self.kw_id.has_key(searchWord.decode('utf-8')):
+            idx = self.kw_id[searchWord.decode('utf-8')]
+            cut = Cut()
+            ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
+            record =json.loads(ii_line)
+            if ishow:
+                for rec in record:
+                    line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
+                    data = json.loads(line)
+                    print data['title'],'\n',data['time'],'\n',data['content'],'\n'
+            # return the postings list of this single term
+            return record
+        else:
+            if ishow:
+                print 'Not Exists Record!'
+            # callers must check the returned value
+            return dict()
 
-    def getQueryItem(self,searchWord):
-        idx = self.kw_id[searchWord.decode('utf-8')]
+
+    # AND query: segment the phrase, then intersect the postings lists (weights ignored); returns document ids
+    def QueryPhrase(self,searchPhrase,ishow = True):
+        words = jieba.cut(searchPhrase.decode('utf-8'),cut_all=False)
         cut = Cut()
-        ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
-        record =json.loads(ii_line)
-        for rec in record:
-            line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize)
-            data = json.loads(line)
-            print data['title'],'\n',data['time'],'\n',data['content'],'\n'
+        result = set(range(1,100000))
+        for word in words:
+            if not self.kw_id.has_key(word):
+                print 'Not Exist Record'
+                return set()
+            idx = self.kw_id[word]
+            ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize)
+            record =json.loads(ii_line)
+            re = set()
+            for rec in record:
+                re.add(int(rec))
+            result = result & re
+        print result
+        if ishow:
+            if len(result) == 0:
+                print 'Not Exists Record!'
+            else:
+                for rst in result:
+                    line = cut.getRow(int(rst),Global.cutnews_origin_dir,Global.filesize)
+                    data = json.loads(line)
+                    print data['title'],'\n',data['time'],'\n',data['content'],'\n'
+        return result
 
-
-    def getInverseRecord(self,item):
+    # return trending news
+    def QueryHotNews(self):
         pass
 
-    def mergeInverseRecord(self,RecordList):
+    # return the latest news
+    def QueryByTime(self):
         pass
-
+
 search = Search()
-search.getQueryItem(sys.argv[1])
+#search.QueryPhrase(sys.argv[1])
+search.QueryPhrase(sys.argv[1])
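
For reference, a minimal standalone sketch of the AND-style postings intersection that the new QueryPhrase performs. The in-memory postings dict below stands in for the inverted-index files that the real code reads through Cut and Global, and query_and is an illustrative name, not a function in this repo.

# -*- coding: utf-8 -*-
# Toy postings: term -> set of document ids (the real code loads these from disk).
postings = {
    u'体育': set([1, 4, 7, 9]),
    u'新闻': set([2, 4, 9, 12]),
}

def query_and(words, postings):
    # Intersect the postings of every query term; any missing term empties an AND query.
    result = None
    for word in words:
        docs = postings.get(word)
        if not docs:
            return set()
        result = docs if result is None else (result & docs)
    return result if result is not None else set()

print(query_and([u'体育', u'新闻'], postings))  # -> documents 4 and 9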