From d94fb654ddfc9ae249c9225b56f909776e2243a2 Mon Sep 17 00:00:00 2001
From: lzjqsdd
Date: Mon, 25 Apr 2016 23:58:51 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=80=92=E6=8E=92=E5=AD=97?=
 =?UTF-8?q?=E5=85=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (not part of the commit message):
- The input file is now closed by a with-block and no longer bound to the
  name `file`, which shadowed the Python 2 builtin.
- Whitespace-only input lines are skipped instead of failing in json.loads;
  posting ids remain the 1-based line numbers of title.json.
- The blob id on the index line below is the one recorded for the
  pre-review content.

 ml/InverseIndex.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 ml/InverseIndex.py

diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py
new file mode 100644
index 0000000..e756cd8
--- /dev/null
+++ b/ml/InverseIndex.py
@@ -0,0 +1,33 @@
+#encoding=utf-8
+"""Build an inverted index of news titles and print every indexed term.
+
+Each line of ../news_spider/title.json is parsed as one JSON object whose
+'title' value is segmented by jieba with cut_all=True. worddict maps each
+segment to the 1-based line numbers of the titles that produced it.
+"""
+import jieba
+import json
+import sys
+
+# Python 2 only. NOTE(review): presumably set so that the segments can be
+# printed without an explicit encode; confirm before removing.
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+
+worddict = dict()
+count = 0
+# The with-block closes the input file even if a line fails to parse.
+with open('../news_spider/title.json') as title_file:
+    for count, line in enumerate(title_file, 1):
+        if not line.strip():
+            # A blank line carries no record and json.loads would reject it.
+            continue
+        data = json.loads(line)
+        # One posting per segment: a segment emitted twice for the same
+        # title is listed twice under that line number.
+        for w in jieba.cut(data['title'], cut_all=True):
+            worddict.setdefault(w, []).append(count)
+
+for i in worddict:
+    print i