From c5dd35c5aa2f7a0c965589873bde466aa05730f0 Mon Sep 17 00:00:00 2001 From: lzjqsdd Date: Tue, 3 May 2016 18:40:45 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9A=82=E6=9C=AA=E8=A7=A3=E5=86=B3=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ml/InverseIndex.py | 22 +++++++++++++--------- test/test_tool.py | 10 +++++++++- tools/show.pyc | Bin 1730 -> 1730 bytes 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py index e98ba3c..b562531 100644 --- a/ml/InverseIndex.py +++ b/ml/InverseIndex.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import jieba +import jieba.analyse as analyse import json import sys reload(sys) @@ -70,11 +71,12 @@ class InverseIndex: if not line: break data = json.loads(line) - seg_list = jieba.cut(data['content'],cut_all=True) - for seg in seg_list: - seg=''.join(seg.split()) - if(seg!='' and seg!="\n" and seg!="\n\n"): - doc.append(seg) +# seg_list = jieba.cut(data['content'],cut_all=True) + + keyword = analyse.extract_tags(data['content'],topK=10) + seg = " ".join(keyword) + print seg + doc.append(seg) return doc @@ -89,25 +91,26 @@ class InverseIndex: i = 0 indexdoc = dict() f = open(Global.inverse_dir+'id.txt','wb') + word = vectorizer.get_feature_names() for name in vectorizer.get_feature_names(): indexdoc[name] = i i+=1 f.write(json.dumps(indexdoc)) f.close() - colnum = tfidf.shape[1] + colnum = tfidf.shape[1] row = tfidf.shape[0] for i in range(0,colnum): filename = Global.inverse_dir+str(i/Global.filesize)+'.txt' - f = open(filename,'a' - idx_list = dict() + f = open(filename,'a') + idx_list = list() for j in range(0,row): val = tfidf[j,i] if val > 0: idx_list[j] = val f.write(json.dumps(idx_list)+'\n') f.close() - + def WriteInverseIndex(self,mat): pass @@ -116,3 +119,4 @@ class InverseIndex: #test ii = InverseIndex() ii.CalcTFIDF() +#ii.loadDataFromCutFile(20) diff --git a/test/test_tool.py b/test/test_tool.py index db7d52e..d63ac06 100644 --- a/test/test_tool.py +++ b/test/test_tool.py @@ -1,8 +1,16 @@ import sys sys.path.append("..") from tools.show import show +import tools.Global as Global +from ml.Cut import Cut s = show() #s.showcount() #s.shownews(1) -s.showKeyWord() +#s.showKeyWord() +#s.showitem(2608) + +c = Cut() +line = c.getRow(3176,Global.cutnews_dir,Global.filesize) +s.showitem(line) + diff --git a/tools/show.pyc b/tools/show.pyc index 90695ee3032eee94c5daa01bf556c841d2870930..60765fbeebeee11b2e475a0fb7f029d79686c93b 100644 GIT binary patch delta 16 XcmX@adx)2v`7_T6j%D?0?u