# -*- coding: utf-8 -*-
import jieba
import jieba.analyse as analyse
import json
import sys

reload(sys)
sys.path.append("..")
sys.setdefaultencoding('utf-8')

import tools.Global as Global
from Cut import Cut
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tools.show import show
import numpy as np


class InverseIndex:

    def __init__(self):
        self.file_data = open(Global.title_dir)
        self.file_sw = open(Global.stopword_dir)
        #self.ii = open(Global.inverse_dir,'wb')
        self.stopword = []
        self.worddict = dict()

    # load stopword list
    def loadsw(self):
        while True:
            line = self.file_sw.readline()
            if not line:
                break
            # strip the trailing newline so later membership tests against
            # segmented words actually match
            self.stopword.append(line.strip())
            print line,

    # load origin data: news.json, title.json
    def CalcInverseIndex(self):
        self.loadsw()
        count = 0
        while True:
            line = self.file_data.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'], cut_all=False))
            count += 1
            for w in seg_list:
                # skip stopwords; record the 1-based title number for every
                # other term
                if w not in self.stopword:
                    if w not in self.worddict:
                        self.worddict[w] = []
                    print w,
                    self.worddict[w].append(count)
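
    # A minimal sketch of what CalcInverseIndex leaves behind (the actual
    # terms and posting lists depend on the titles in Global.title_dir):
    # each term maps to the 1-based numbers of the titles containing it, e.g.
    #   self.worddict == {u'足球': [3, 17], u'经济': [5], ...}
    # (illustrative values, not real data).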

    def loadDataFromFile(self):
        doc = []
        f = open(Global.content_dir, 'r')
        while True:
            line = f.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'], cut_all=False))
            doc.append(seg_list)
        f.close()
        return doc
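
    # Note: loadDataFromFile returns a list of token lists (one per line of
    # Global.content_dir), e.g. [[u'央行', u'降息'], [u'球队', u'夺冠'], ...];
    # the tokens shown are illustrative, not taken from the data.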

    def loadDataFromCutFile(self, totalnum):
        doc = []
        cut = Cut()
        for i in range(1, totalnum):
            line = cut.getRow(i, Global.cutnews_dir, Global.filesize)
            if not line:
                break
            data = json.loads(line)
            # keep each article's top-20 tf-idf keywords as its document text
            keyword = analyse.extract_tags(data['content'], topK=20)
            seg = " ".join(keyword)
            print seg
            doc.append(seg)
        return doc
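
    # CountVectorizer expects each document as a single string and splits it
    # on its token pattern, which is why the keywords are space-joined above;
    # a returned entry might look like u"奥运 金牌 游泳" (illustrative values,
    # not actual data).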

    # calculate tf-idf
    def CalcTFIDF(self):
        sh = show()
        # total number of documents, as reported by tools.show
        count = sh.showcount()
        docArray = self.loadDataFromCutFile(count)
        #docArray = self.loadDataFromCutFile(10)
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        # rows of tfidf are documents, columns are terms
        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
        print 'done'

        # write the term -> id map to file (ids are 1-based)
        indexdoc = dict()
        f = open(Global.inverse_dir + 'id.txt', 'wb')
        for i, name in enumerate(vectorizer.get_feature_names()):
            indexdoc[name] = i + 1
        f.write(json.dumps(indexdoc))
        f.close()

        colnum = tfidf.shape[1]

        # i is the term (column) index; row indexes the documents whose
        # tf-idf weight for that term is nonzero
        for i in range(0, colnum):
            filename = Global.inverse_dir + str(i / Global.filesize) + '.txt'
            coldata = tfidf.getcol(i)
            col_nonzero_index = np.nonzero(coldata)
            item_weight_dict = dict()
            for row in col_nonzero_index[0]:
                # 1-based document id -> tf-idf weight
                item_weight_dict[row + 1] = coldata[row, 0]
            f = open(filename, 'a')
            f.write(json.dumps(item_weight_dict) + '\n')
            f.close()
            print 'item', i, 'calculate done'
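
    # A minimal lookup sketch (an addition, not in the original module),
    # assuming CalcTFIDF has run exactly once: id.txt maps each term to a
    # 1-based id, and the postings of column i sit on line (i % Global.filesize)
    # of the file named (i / Global.filesize).txt. Rerunning CalcTFIDF appends
    # to the files and would break this line layout.
    def LookupTerm(self, term):
        with open(Global.inverse_dir + 'id.txt') as f:
            indexdoc = json.load(f)
        i = indexdoc[term] - 1
        with open(Global.inverse_dir + str(i / Global.filesize) + '.txt') as f:
            for _ in range(i % Global.filesize):
                f.readline()
            # JSON object of the form {"doc_id": tfidf_weight, ...}
            return json.loads(f.readline())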

    def WriteInverseIndex(self, mat):
        pass


#test
#ii = InverseIndex()
#ii.CalcTFIDF()
#ii.loadDataFromCutFile(20)
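
# A minimal runnable entry point, equivalent to the test lines above and
# assuming the data files configured in tools.Global exist:
if __name__ == '__main__':
    ii = InverseIndex()
    ii.CalcTFIDF()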