#encoding=utf-8
"""Build an inverted index over news titles and print every indexed token.

Input is ``../news_spider/title.json`` (path relative to the working
directory), read one JSON document per line; each document must have a
``title`` key.  Every title is segmented with ``jieba.cut(..., cut_all=True)``
and each resulting token is mapped to the 1-based numbers of the lines it
came from.

After the run, ``worddict`` maps token -> list of line numbers and ``count``
holds the number of lines processed.  Only the tokens (the dictionary keys)
are printed, one per line.
"""
import jieba
import json
import sys

# Python 2 only: `reload` restores sys.setdefaultencoding (removed from the
# module at interpreter start-up) so the process-wide default encoding can be
# switched to UTF-8.
# NOTE(review): presumably needed so printing non-ASCII tokens does not fail
# when stdout has no encoding (e.g. when piped) -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')

TITLE_PATH = '../news_spider/title.json'

worddict = dict()  # token -> list of 1-based line numbers containing it
count = 0          # number of lines processed so far (doubles as the doc id)

# `with` guarantees the file is closed even if a line fails to parse.
with open(TITLE_PATH) as title_file:
    for line in title_file:
        data = json.loads(line)
        count += 1
        # A token occurring several times in one title is appended once per
        # occurrence, so a posting list may repeat the same line number.
        for w in jieba.cut(data['title'], cut_all=True):
            worddict.setdefault(w, []).append(count)

# Parenthesised single-argument form prints the same value under Python 2.
for i in worddict:
    print(i)