Add TF-IDF calculation; organize the search structure
parent df183760f8
commit fd7e81c2be
1 .gitignore (vendored)

@@ -5,3 +5,4 @@ news_spider/title.json
news_spider/news.json
news_spider/news.db
ml/inversedata
ml/cutnews
14 ml/Cut.py

@@ -30,19 +30,21 @@ class Cut:
        cut_file.close()
        num+=1

    def getRow(self,recordnum,path,size):
        filenum = recordnum/size
        linenum = recordnum%size
        filenum = (recordnum-1)/size
        linenum = (recordnum-1)%size+1
        cutfilename = path+'/'+str(filenum)+'.txt'
        print cutfilename
        print cutfilename,linenum
        linecache.clearcache()
        line = linecache.getline(cutfilename,linenum)
        print line
        linecache.clearcache()
        data = json.loads(line)
        #print data['title'],data['time']
        return line

#test cutfile
#c = Cut()
#c.cutfile(Global.inverse_dir,Global.data_dir,Global.filesize)
#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)

#test getRow
#c = Cut()
#c.getRow(107,Global.inverse_dir,Global.filesize)
#c.getRow(200,Global.cutnews_dir,Global.filesize)
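The getRow change above fixes an off-by-one in mapping a record number to its chunk file and line: with records and lines numbered from 1 and chunk files named 0.txt, 1.txt, ..., a record number that is an exact multiple of the chunk size used to resolve to line 0 of the next file. A small sketch of the corrected arithmetic; the helper `locate` is illustrative, not part of the repo:

```python
# Illustrative helper, not from the repo: map a 1-based record number to
# (chunk file index, 1-based line number), matching the new getRow formula.
def locate(recordnum, size):
    filenum = (recordnum - 1) // size     # which chunk file (0.txt, 1.txt, ...)
    linenum = (recordnum - 1) % size + 1  # which line inside that file
    return filenum, linenum

# With size = 100: record 100 now maps to file 0, line 100
# (the old formula gave file 1, line 0), and record 107 maps to file 1, line 7.
assert locate(100, 100) == (0, 100)
assert locate(107, 100) == (1, 7)
```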
BIN ml/Cut.pyc (new file)
Binary file not shown.
ml/Global.py

@@ -1,5 +1,7 @@
data_dir = "../news_spider/title.json"
title_dir = "../news_spider/title.json"
content_dir="../news_spider/news.json"
db_dir = "../news_spider/news.db"
stopword_dir="./stopword.txt"
inverse_dir="./inversedata"
cutnews_dir="./cutnews"
filesize = 100
BIN ml/Global.pyc
Binary file not shown.
@@ -1,8 +1,12 @@
#encoding=utf-8
# -*- coding: utf-8 -*-
import jieba
import json
import sys
import Global
from Cut import Cut
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
reload(sys)
sys.setdefaultencoding('utf-8')

@@ -10,12 +14,13 @@ sys.setdefaultencoding('utf-8')
class InverseIndex:

    def __init__(self):
        self.file_data= open(Global.data_dir)
        self.file_data= open(Global.title_dir)
        self.file_sw = open(Global.stopword_dir)
        self.ii = open(Global.inverse_dir,'wb')
        #self.ii = open(Global.inverse_dir,'wb')
        self.stopword=[]
        self.worddict = dict()

    #load stopword list
    def loadsw(self):
        while True:
            line = self.file_sw.readline()
@@ -24,7 +29,8 @@ class InverseIndex:
            self.stopword.append(line)
            print line,

    def loaddata(self):
    #load origin data: news.json, title.json
    def CalcInverseIndex(self):
        self.loadsw()
        count=0
        while True:
@@ -41,10 +47,52 @@ class InverseIndex:
                print w,
                self.worddict[w].append(count)

    def loadDataFromFile(self):
        doc = []
        f = open(Global.content_dir,'r')
        while True:
            line = f.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'],cut_all=True))
            doc.append(seg_list)
        return doc

    def loadDataFromCutFile(self,totalnum):
        doc = []
        cut = Cut()
        for i in range(1,totalnum):
            line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
            if not line:
                break
            data = json.loads(line)
            seg_list = jieba.cut(data['content'],cut_all=True)
            for seg in seg_list:
                seg=''.join(seg.split())
                if(seg!='' and seg!="\n" and seg!="\n\n"):
                    doc.append(seg)
        return doc

    #save inverse table to file
    def write2file(self):
        for w in self.worddict:
            self.ii.write(w+' '+str(self.worddict[w])+'\n')

    #calculate tf-idf
    def CalcTFIDF(self):
        docArray = self.loadDataFromCutFile(100)
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
        print 'done'
        for name in vectorizer.get_feature_names():
            print name

#test
ii = InverseIndex()
ii.loaddata()
ii.write2file()
ii.CalcTFIDF()
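CalcTFIDF above fits CountVectorizer and TfidfTransformer but only prints the vocabulary; the tf-idf weights themselves are what the README's keyword-extraction step needs. A hedged sketch of turning those weights into per-document keywords, assuming each entry of `docs` is one document's tokens joined by spaces (as committed, loadDataFromCutFile returns individual tokens, so they would need joining per document first); `top_keywords` is an illustrative name, not project code:

```python
# Hedged sketch, not part of the commit: rank each document's terms by tf-idf.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def top_keywords(docs, k=5):
    """docs: list of strings, one document per entry, tokens separated by spaces."""
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(docs))
    terms = vectorizer.get_feature_names()     # column index -> term
    keywords = []
    for row in tfidf.toarray():                # one dense row per document
        ranked = sorted(range(len(row)), key=lambda j: row[j], reverse=True)
        keywords.append([terms[j] for j in ranked[:k] if row[j] > 0])
    return keywords
```

Note that CountVectorizer's default token pattern drops single-character tokens, which matters for Chinese text; the tokenizer settings the project will use are not shown in this commit.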
30 ml/README.md (new file)

@@ -0,0 +1,30 @@
# News Retrieval System Framework

## Backend
### News crawling
- Step 1: Crawl NetEase, Toutiao, and Tencent news and save it in JSON format.
- Step 2: The crawled data is large, so it is split across files, with 100 news documents per file.
  **By default, news records within a file are numbered from 1 to the end.**
### 构建索引
|
||||
- 根据tf-idf提取每篇新闻文档的关键词。
|
||||
- 建立倒排索引,以小文件进行存储,因此需要维护两个词典
|
||||
- `词项-编号`,以json格式存储到文件,启动系统初加载到字典中。
|
||||
- `词项-文档编号`,(即倒排记录),查询时按需进行加载。
|
||||
- 避免一次性读写,分批次读取新闻内容,建好索引写入文件,对于已经出现的词项更新倒排记录。
|
||||
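A rough sketch of how those two dictionaries could be persisted, assuming JSON for the term table and plain-text chunk files for the posting lists; the file names and formats here are assumptions for illustration, not necessarily what the project uses:

```python
# Illustration only: persist the term table and chunked posting lists.
import json

# 1) term -> term id: one JSON object, loaded into a dict at startup.
term_ids = {u"新闻": 0, u"检索": 1}
with open("term_ids.json", "w") as f:
    json.dump(term_ids, f)

# 2) term id -> posting list (document ids), kept in small chunk files so a
#    query only loads the chunks containing the terms it actually needs.
postings = {0: [1, 5, 12], 1: [5, 9]}
with open("postings_0.txt", "w") as f:
    for term_id in sorted(postings):
        f.write("%d %s\n" % (term_id, json.dumps(postings[term_id])))
```

Keeping the posting lists in small chunk files keeps startup cheap: only the term table has to stay resident in memory.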

### Search
- Step 1: Split the input word or sentence and look up each term separately.
- Step 2: Merge the posting lists returned for the terms into the final result set. **Optimization: merge in order of increasing document frequency** (see the sketch after this list).
- Step 3: Optionally sort the results by time.
- Step 4: Return a content summary and the URL for each hit.
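A sketch of the Step 2 merge, treating the query as an AND of its terms and intersecting the shortest posting lists first so intermediate results stay small; `merge_postings` is an illustrative helper, not code from this commit:

```python
# Illustration only: intersect posting lists, lowest document frequency first.
def merge_postings(posting_lists):
    if not posting_lists:
        return []
    ordered = sorted(posting_lists, key=len)   # smallest posting list first
    result = set(ordered[0])
    for plist in ordered[1:]:
        result &= set(plist)
        if not result:                         # early exit: no common document
            break
    return sorted(result)

# e.g. merge_postings([[1, 5, 12], [5, 9], [2, 5]]) == [5]
```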

### ~~Similar-news recommendation~~
**To be decided.**

## Frontend
- Tentatively developed with the lightweight web.py framework (a minimal route sketch follows below).
- Page structure:
  - Search home page
  - Search results page
  - News page (with a sidebar section recommending similar news)
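A minimal web.py sketch of that page structure; the routes, class names, and placeholder responses are assumptions for illustration, not the project's actual handlers:

```python
# Illustration only: a bare-bones web.py app with the three pages listed above.
import web

urls = (
    '/', 'Index',              # search home page
    '/search', 'Results',      # search results page
    r'/news/(\d+)', 'News',    # single news page
)

class Index:
    def GET(self):
        return "search form placeholder"

class Results:
    def GET(self):
        query = web.input(q="").q
        return "results placeholder for: " + query

class News:
    def GET(self, news_id):
        return "news page placeholder for id " + news_id

if __name__ == "__main__":
    web.application(urls, globals()).run()
```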
16 ml/Search.py (new file)

@@ -0,0 +1,16 @@
import sys
import json

class Search:
    def __init__(self):
        pass

    def getQueryItem(self,InputItem):
        pass

    def getInverseRecord(self,item):
        pass

    def mergeInverseRecord(self,RecordList):
        pass
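Search is committed as a stub. One possible way getQueryItem could split a query, reusing jieba as the rest of the project does; this is a hedged guess at intent, not the author's implementation:

```python
# Hedged sketch only: split a query string into lookup terms with jieba.
import jieba

def get_query_items(query):
    items = []
    for term in jieba.cut_for_search(query):
        term = term.strip()
        if term:
            items.append(term)
    return items

# e.g. get_query_items(u"腾讯新闻") yields the individual terms to look up
# in the inverted index, one posting-list fetch per term.
```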
26 ml/div.py (deleted)

@@ -1,26 +0,0 @@
# -*- coding: utf-8 -*-
import jieba
import json
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

class DivideWord:
    def __init__(self):
        pass
    def parse(self):
        file = open('../news_spider/title.json')

        while True:
            line = file.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'], cut_all=True))
            for w in seg_list:
                print w.encode('utf-8'),
            print '\n'

dw = DivideWord()
dw.parse()