From d1824516d39382399ed3fb3dad2738ecffc9392d Mon Sep 17 00:00:00 2001 From: lzjqsdd Date: Wed, 4 May 2016 00:46:51 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=AF=E6=8C=81=E8=8B=B1=E6=96=87=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=EF=BC=8C=E4=B8=AD=E6=96=87=E6=90=9C=E7=B4=A2=E7=BC=96?= =?UTF-8?q?=E7=A0=81=E5=AD=98=E5=9C=A8=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + ml/Cut.py | 22 ++++++++++++++++++++++ ml/Cut.pyc | Bin 1946 -> 2350 bytes ml/InverseIndex.py | 8 ++++---- ml/Search.py | 30 +++++++++++++++++++++++++++--- test/test_tool.py | 3 +-- tools/Global.py | 1 + tools/Global.pyc | Bin 442 -> 494 bytes tools/__init__.pyc | Bin 107 -> 107 bytes tools/show.pyc | Bin 1730 -> 1730 bytes 10 files changed, 56 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index e8c7a12..c44ff6c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ data/news.db data/news.json data/title.json data/cutnews +data/orinews data/inversedata diff --git a/ml/Cut.py b/ml/Cut.py index 701a081..0974f87 100644 --- a/ml/Cut.py +++ b/ml/Cut.py @@ -35,6 +35,27 @@ class Cut: cut_file.write(json.dumps(data)+'\n') cut_file.close() num+=1 + + def cutfileWithoutCut(self,path,fliename,size): + file_data = open(fliename,'r') + num = 0 + flag = 0 + while True: + if flag == 1: + break + if not os.path.exists(path): + os.makedirs(path) + cutfilename = path+'/'+str(num)+'.txt' + cut_file = open(cutfilename,'wb') + print 'Generate:'+cutfilename+'...' + for i in range(0,size): + line = file_data.readline() + if not line: + flag = 1 + break + cut_file.write(line) + cut_file.close() + num+=1 def getRow(self,recordnum,path,size): filenum = (recordnum-1)/size linenum = (recordnum-1)%size+1 @@ -48,6 +69,7 @@ class Cut: #test cutfile #c = Cut() +#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize) #c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize) #test getRow diff --git a/ml/Cut.pyc b/ml/Cut.pyc index 470c343079802a4f9508f1f98fcfa0bd3e7b47a0..0cb63acab8b494e5e8f076a17429196ede492353 100644 GIT binary patch delta 539 zcmZXQJ4*vW5XWb7m%H4Vhl$S^qftZ;2^(w`QBe>Zi?s*{780XI;w!fSg+Pi3)?vGJ z7PdBGX>BL?6+{bbzko9rgM}>f+ufP}JeW@6A!QriG57w}YuitlZ$Hn(4h*;f*bRf+ z@yzz;niT~AAs)9dWBk7Lttk_II=XoH)>Mp@o2#9VoKH)pw!MrbCssCD*KwK?MoYB=`)Wu9OV(k9C{nvvO@; zl^1>13n4t7M{-EIjNr?o%NQfhNHF>tNyZ2xP3Yz}gMLBE7)LUUaYmLg!I)&^7*mXC zf=AYc-`!Nxu$%e~|`7#t)h6?CNHeld*5c3#)dh-b05K9#Op>`lAkADrTnw^+jgga?Q)2RD4o^n)$#*$q KiSf`RPCWntHe12~ diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py index b562531..f49faa8 100644 --- a/ml/InverseIndex.py +++ b/ml/InverseIndex.py @@ -73,7 +73,7 @@ class InverseIndex: data = json.loads(line) # seg_list = jieba.cut(data['content'],cut_all=True) - keyword = analyse.extract_tags(data['content'],topK=10) + keyword = analyse.extract_tags(data['content'],topK=20) seg = " ".join(keyword) print seg doc.append(seg) @@ -93,8 +93,8 @@ class InverseIndex: f = open(Global.inverse_dir+'id.txt','wb') word = vectorizer.get_feature_names() for name in vectorizer.get_feature_names(): - indexdoc[name] = i i+=1 + indexdoc[name] = i f.write(json.dumps(indexdoc)) f.close() @@ -103,11 +103,11 @@ class InverseIndex: for i in range(0,colnum): filename = Global.inverse_dir+str(i/Global.filesize)+'.txt' f = open(filename,'a') - idx_list = list() + idx_list = dict() for j in range(0,row): val = tfidf[j,i] if val > 0: - idx_list[j] = val + idx_list[j+1] = val f.write(json.dumps(idx_list)+'\n') f.close() diff --git a/ml/Search.py b/ml/Search.py index e103287..61ab677 100644 --- a/ml/Search.py +++ b/ml/Search.py @@ -1,12 +1,34 @@ +# -*- coding: utf-8 -*- import sys import json +reload(sys) +sys.path.append("..") +sys.setdefaultencoding('utf-8') +from Cut import Cut +import tools.Global as Global class Search: def __init__(self): - pass + self.kw_id = self.loadKW_ID() - def getQueryItem(self,InputItem): - pass + def loadKW_ID(self): + f = open(Global.inverse_dir+'id.txt') + line = f.readline() + kw_id = json.loads(line, encoding='utf-8') + kwid = dict() + for ki in kw_id: + kwid[ki.encode('utf-8')] = kw_id[ki] + for i in kwid: + print i,kwid[i] + return kwid + + + def getQueryItem(self,searchWord): + idx = self.kw_id[searchWord] + cut = Cut() + line = cut.getRow(idx,Global.cutnews_origin_dir,Global.filesize) + data = json.loads(line) + print data['title'],'\n',data['time'],'\n',data['content'],'\n' def getInverseRecord(self,item): pass @@ -14,3 +36,5 @@ class Search: def mergeInverseRecord(self,RecordList): pass +search = Search() +search.getQueryItem(sys.argv[1].decode('utf-8')) diff --git a/test/test_tool.py b/test/test_tool.py index d63ac06..1aff0de 100644 --- a/test/test_tool.py +++ b/test/test_tool.py @@ -11,6 +11,5 @@ s = show() #s.showitem(2608) c = Cut() -line = c.getRow(3176,Global.cutnews_dir,Global.filesize) +line = c.getRow(2,Global.cutnews_origin_dir,Global.filesize) s.showitem(line) - diff --git a/tools/Global.py b/tools/Global.py index 1059182..4b1a84a 100644 --- a/tools/Global.py +++ b/tools/Global.py @@ -5,4 +5,5 @@ db_dir = project_root+"data/news.db" stopword_dir=project_root+"data/stopword.txt" inverse_dir=project_root+"data/inversedata/" cutnews_dir=project_root+"data/cutnews/" +cutnews_origin_dir=project_root+"data/orinews" filesize = 100 diff --git a/tools/Global.pyc b/tools/Global.pyc index 0927dabc64e55af7be313c9503e1d74652dee762..0b96661591952d68d6156998217c20a221abe698 100644 GIT binary patch delta 134 zcmdnR{Ek_g`7Lkb5&6bC~JCqu9X z*TkGg$zmR$qLjpvME(4t%)Hd{;>;AFfS(5E#IN~6LO^D6X$eR;9w?ojnKwC}(NjbK KD9pylg#rQEnjCKc delta 87 zcmaFIyo*_y`7