From a732b75e5814f2ac68bca0b27297ade22306f888 Mon Sep 17 00:00:00 2001 From: lzjqsdd Date: Fri, 29 Apr 2016 17:56:36 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E5=8F=96=E5=85=B3=E9=94=AE=E8=AF=8D?= =?UTF-8?q?=E5=86=99=E5=85=A5=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + ml/Cut.pyc | Bin 2043 -> 1946 bytes ml/InverseIndex.py | 14 ++++++++++---- test/test_tool.py | 8 ++++++++ tools/Global.py | 4 ++-- tools/Global.pyc | Bin 469 -> 442 bytes tools/show.py | 10 ++++++++++ tools/show.pyc | Bin 1277 -> 1730 bytes 8 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 test/test_tool.py diff --git a/.gitignore b/.gitignore index a68e4ed..e8c7a12 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ data/news.db data/news.json data/title.json data/cutnews +data/inversedata diff --git a/ml/Cut.pyc b/ml/Cut.pyc index 9a7effe7b9cf817f4df74b711fffca006f37fa45..9834f0810689d5f2fc522bb7d87de27ab15a009f 100644 GIT binary patch delta 84 zcmey(KZ{?Q`7flgPm8156IzR_d_hf#o$A4mcrldvWyBLI@gBx?Ww diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py index a7e4435..eb272cc 100644 --- a/ml/InverseIndex.py +++ b/ml/InverseIndex.py @@ -44,7 +44,7 @@ class InverseIndex: count+=1 for w in seg_list: if w not in self.worddict: - self.worddict[w] = [] + self.worddict[w] = [] if w not in self.stopword: print w, self.worddict[w].append(count) @@ -87,14 +87,20 @@ class InverseIndex: #calculate tf-idf def CalcTFIDF(self): - docArray = self.loadDataFromCutFile(100) + docArray = self.loadDataFromCutFile(10000) vectorizer = CountVectorizer() transformer = TfidfTransformer() tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray)) print 'done' + #write index-doc to file + i = 0 + indexdoc = dict() + f = open(Global.inverse_dir+'id.txt','wb') for name in vectorizer.get_feature_names(): - print name - + i+=1 + indexdoc[i] = name + f.write(json.dumps(indexdoc)) + #test ii = InverseIndex() ii.CalcTFIDF() diff --git a/test/test_tool.py b/test/test_tool.py new file mode 100644 index 0000000..db7d52e --- /dev/null +++ b/test/test_tool.py @@ -0,0 +1,8 @@ +import sys +sys.path.append("..") +from tools.show import show + +s = show() +#s.showcount() +#s.shownews(1) +s.showKeyWord() diff --git a/tools/Global.py b/tools/Global.py index 2d2fc03..1059182 100644 --- a/tools/Global.py +++ b/tools/Global.py @@ -3,6 +3,6 @@ title_dir = project_root+"data/title.json" content_dir=project_root+"data/news.json" db_dir = project_root+"data/news.db" stopword_dir=project_root+"data/stopword.txt" -inverse_dir=project_root+"data/inversedata" -cutnews_dir=project_root+"data/cutnews" +inverse_dir=project_root+"data/inversedata/" +cutnews_dir=project_root+"data/cutnews/" filesize = 100 diff --git a/tools/Global.pyc b/tools/Global.pyc index 2a843307031db4a61b91311b34a6b10c100f0bc9..0927dabc64e55af7be313c9503e1d74652dee762 100644 GIT binary patch delta 68 zcmcc0yo;HQ`7~1F8ZH3=AoWC5if(d1a|Z#i<~6F%L{IxwIrNwY+%Z+sVrM oKr#J{{M=OioT{wC;*=Eq^vsfs(j",data['time'],data['title'],data['url'],data['content'] + def showKeyWord(self): + f = open(Global.inverse_dir+'id.txt','r') + line = f.readline() + data = json.loads(line) + print 'load keyword done.' + print type(data) + for k in data.keys(): + print k,data[k] + + diff --git a/tools/show.pyc b/tools/show.pyc index 774e104204ab2ad0ae0506a02b66146f41d94289..90695ee3032eee94c5daa01bf556c841d2870930 100644 GIT binary patch delta 701 zcmah{!A{#i5S{g|oth*CiUdL&pj>bTm?J&)0#YJX%3*UrAXNylmTH$`$S#CPiE>DP zfaca)rT#$GAE^2Rd;wp<8HssIS}v71%$u3r@$=2inhiuO#Jmr^s|6h3<<|0 z*vGB6Tv?vjE&H+Tre{6(g0E5MF8jQS8tf#d#j)Uww}5nq!{!Vu#tACCj0M&(3TWWP z6_SP{fXA}tFO za+A6%r?HTQrjzo>tbEFP-Lhj=OnI8631kd4hINI(!Ib>;GV9pv(BvIGg`0m)>1oX1 z1X*zS-3ae4&>Z go_Kxr-_N5#6LVMyj6__It8qBqSMA9^IKENo1H>nBLI3~& delta 244 zcmX@a`)l{gV9roMQdr ijQn!Fg32I}0h~Zw%mO6X7&&<-TeAfd