From 2626dd15e6e9e3a2264aff7e4a66d55d8685a10f Mon Sep 17 00:00:00 2001 From: lzjqsdd Date: Thu, 5 May 2016 21:29:25 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=A8=80=E7=96=8F=E7=9F=A9?= =?UTF-8?q?=E9=98=B5=E8=AF=BB=E5=8F=96=E6=96=B9=E5=BC=8F=EF=BC=8C=E6=8F=90?= =?UTF-8?q?=E9=AB=98=E5=BB=BA=E7=AB=8B=E7=B4=A2=E5=BC=95=E9=80=9F=E5=BA=A6?= =?UTF-8?q?=EF=BC=8C=E5=89=8D=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4=E5=A4=8D?= =?UTF-8?q?=E6=9D=82=E5=BA=A6=E5=A4=AA=E9=AB=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ml/InverseIndex.py | 33 +++++++++++++----- ml/Search.py | 11 +++++- news_spider/news_spider/__init__.pyc | Bin 150 -> 154 bytes news_spider/news_spider/commands/__init__.pyc | Bin 163 -> 163 bytes news_spider/news_spider/commands/crawlall.pyc | Bin 2490 -> 2490 bytes news_spider/news_spider/items.pyc | Bin 782 -> 782 bytes news_spider/news_spider/pipelines.pyc | Bin 2411 -> 2427 bytes news_spider/news_spider/rotateuseragent.pyc | Bin 3303 -> 3303 bytes news_spider/news_spider/settings.pyc | Bin 647 -> 679 bytes news_spider/news_spider/spiders/NetEase.pyc | Bin 2929 -> 2929 bytes news_spider/news_spider/spiders/Tencent.pyc | Bin 2918 -> 2918 bytes .../news_spider/spiders/TouTiaoSpider.pyc | Bin 3042 -> 3042 bytes news_spider/news_spider/spiders/__init__.pyc | Bin 162 -> 162 bytes test/test_tool.py | 8 ++--- tools/show.py | 7 ++-- tools/show.pyc | Bin 1730 -> 1698 bytes 16 files changed, 42 insertions(+), 17 deletions(-) diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py index f49faa8..d2c3e8e 100644 --- a/ml/InverseIndex.py +++ b/ml/InverseIndex.py @@ -12,6 +12,7 @@ from sklearn import feature_extraction from sklearn.feature_extraction.text import TfidfTransformer from sklearn.feature_extraction.text import CountVectorizer from tools.show import show +import numpy as np class InverseIndex: @@ -82,7 +83,10 @@ class InverseIndex: #calculate tf-idf def CalcTFIDF(self): - docArray = self.loadDataFromCutFile(100) + sh = show() + count = sh.showcount() + docArray = self.loadDataFromCutFile(count) + #docArray = self.loadDataFromCutFile(10) vectorizer = CountVectorizer() transformer = TfidfTransformer() tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray)) @@ -99,18 +103,29 @@ class InverseIndex: f.close() colnum = tfidf.shape[1] - row = tfidf.shape[0] + #for i in range(0,colnum): + # filename = Global.inverse_dir+str(i/Global.filesize)+'.txt' + # f = open(filename,'a') + # idx_list = dict() + # for j in range(0,row): + # val = tfidf[j,i] + # if val > 0: + # idx_list[j+1] = val + # f.write(json.dumps(idx_list)+'\n') + # f.close() + #i表示词项的编号,row表示非零文档所在的行 for i in range(0,colnum): filename = Global.inverse_dir+str(i/Global.filesize)+'.txt' + coldata = tfidf.getcol(i) + col_nonzero_index = np.nonzero(coldata) + item_weight_dict = dict() + for row in col_nonzero_index[0]: + item_weight_dict[row+1] = coldata[row][0].data[0] f = open(filename,'a') - idx_list = dict() - for j in range(0,row): - val = tfidf[j,i] - if val > 0: - idx_list[j+1] = val - f.write(json.dumps(idx_list)+'\n') + f.write(json.dumps(item_weight_dict)+'\n') f.close() - + print 'item ',i,'calculate done' + def WriteInverseIndex(self,mat): pass diff --git a/ml/Search.py b/ml/Search.py index 610ed00..f25413e 100644 --- a/ml/Search.py +++ b/ml/Search.py @@ -19,7 +19,7 @@ class Search: return kw_id - def getQueryItem(self,searchWord): + def QuerySingle(self,searchWord): idx = self.kw_id[searchWord.decode('utf-8')] cut = Cut() ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize) @@ -29,6 +29,15 @@ class Search: data = json.loads(line) print data['title'],'\n',data['time'],'\n',data['content'],'\n' + def getQueryItem(self,searchWord): + idx = self.kw_id[searchWord.decode('utf-8')] + cut = Cut() + ii_line = cut.getInverseIndexRow(idx,Global.inverse_dir,Global.filesize) + record =json.loads(ii_line) + for rec in record: + line = cut.getRow(int(rec),Global.cutnews_origin_dir,Global.filesize) + data = json.loads(line) + print data['title'],'\n',data['time'],'\n',data['content'],'\n' def getInverseRecord(self,item): diff --git a/news_spider/news_spider/__init__.pyc b/news_spider/news_spider/__init__.pyc index cf9c6610a5cc1780010b7154bbce24e16a559162..2bce9186a3090cbb805015a62ea154b797e933f7 100644 GIT binary patch delta 29 lcmbQnIE#^;`7}HJ46YV5e{Zh+|gC~aD0|0q^2!H?p delta 25 gcmbQmIE|5=`7~jDk%mlRn delta 15 WcmZ3?xR{Zh`7~jDnIt32^ diff --git a/news_spider/news_spider/commands/crawlall.pyc b/news_spider/news_spider/commands/crawlall.pyc index 40e640b98fb290a810cd14c499fe1465d2c2a712..ae5b405c896eb168499543a68e5107acb0165f20 100644 GIT binary patch delta 16 Xcmdlbyi1s!`75 diff --git a/news_spider/news_spider/items.pyc b/news_spider/news_spider/items.pyc index 83ae39426a94678af08e99031d66abfedae4f9fc..bc15fc687739301eb6e8f776106920556640dabf 100644 GIT binary patch delta 16 XcmeBU>tkbQ{>;nev`b?nJ3BJ~CGZ3z delta 16 XcmeBU>tkbQ{>;nezDIH+J3BJ~CDa5P diff --git a/news_spider/news_spider/pipelines.pyc b/news_spider/news_spider/pipelines.pyc index 638ac8005d8cc0082f66ebeceb14b4621af38a42..0f114640d1e7cc9b26649e922fb9338d00a49d7f 100644 GIT binary patch delta 52 xcmaDY^jnCX`7;jBDf(#4{dV2aPi6x2po241qSa^hxMO0bcSO8ll3-15` delta 36 pcmew@^je6W`7;jA|oD2*Mo0S>aSXj6~EK^oD767-*2ekkI diff --git a/news_spider/news_spider/rotateuseragent.pyc b/news_spider/news_spider/rotateuseragent.pyc index 4b640fc599704631c21635210102e68dfc612e68..b2c9f8982b97410b475ade5721d5ac9761c9f414 100644 GIT binary patch delta 16 XcmaDZ`CO8n`7<@SVG%W?) delta 16 XcmaDZ`CO8n`7<@SVG(ZL7 diff --git a/news_spider/news_spider/settings.pyc b/news_spider/news_spider/settings.pyc index 31c6588568336b65473c6acaaa72fba8c67a391a..f0d5ab32667bedf3bd6873dfbea28870f6d0385f 100644 GIT binary patch delta 154 zcmZo?UCt`a{F#@_X_rQLG6NJa18D~!F7}-$EoYL#%n-%Gkjcc5!p)Gv!eExd%83y&7Gq~f;Q)$pGNf<;#eh6+hF}eziRmt^nIK~(uD;30F}an|SBMYDbMX)N^YM3d liFa}Jag3aNk#T{92vC@fiIb6yk&TZ{l9`d6k&7Ed0stu#8Rq~1 delta 121 zcmZ3^+RiG?{F#^Q;617EWCkc;2GR~dTSAs4iw{HNZ|yE0eM^u!5Z8Xb6qBGyUEBtc><&F{;9ZGyVly delta 16 Xcmew;_EC(T`7{;9ZG`9t9 diff --git a/news_spider/news_spider/spiders/Tencent.pyc b/news_spider/news_spider/spiders/Tencent.pyc index ec17540452df86de865e987bce1f3e7ed6b8301d..27b724d26bece266e06b27c1496a77fed92ae276 100644 GIT binary patch delta 16 XcmaDR_DqbO`7E_S@V5Ek*>+ diff --git a/news_spider/news_spider/spiders/__init__.pyc b/news_spider/news_spider/spiders/__init__.pyc index da4d3b8cd51596408a317c305396a716be4fff9d..d39c8e71973030669c57eeb5dea9cecc9f1e620a 100644 GIT binary patch delta 15 WcmZ3)xQLOR`7HTG8QeW4|6)lMW(KOrNzKy$sSM%+5dxDf tS#&2aW91jo0CJ0YfCK}h7|?uvMpj1d$){P37^NovW3^)BnQY0{2>^TA8+-r& delta 196 zcmZ3)dx)2v`7?wvlaU8VvM};7a!uaCd_X}7 zC{fG;mgZ+nW&|==ftVSHoq@PGbMh1x7nKS{h8kvuEOv%w1_qZ{VTM{3h8h+Icc2)M zl63bd^kWF0{Fo(`3uI7sYGtto>*PRIakdhWdD)Y*nH45?u=0zkgG}TD5)6!DKxgnX avNCcr3Qu0mYQ!i#`5vnkBll!UwoU*^kRb5@