Add tf-idf calculation and organize the search structure
commit fd7e81c2be
parent df183760f8
.gitignore (vendored): 1 changed line
@@ -5,3 +5,4 @@ news_spider/title.json
 news_spider/news.json
 news_spider/news.db
 ml/inversedata
+ml/cutnews
ml/Cut.py: 14 changed lines
@@ -30,19 +30,21 @@ class Cut:
        cut_file.close()
        num+=1
    def getRow(self,recordnum,path,size):
-        filenum = recordnum/size
-        linenum = recordnum%size
+        filenum = (recordnum-1)/size
+        linenum = (recordnum-1)%size+1
        cutfilename = path+'/'+str(filenum)+'.txt'
-        print cutfilename
+        print cutfilename,linenum
        linecache.clearcache()
        line = linecache.getline(cutfilename,linenum)
-        print line
+        linecache.clearcache()
+        data = json.loads(line)
+        #print data['title'],data['time']
        return line
 
 #test cutfile
 #c = Cut()
-#c.cutfile(Global.inverse_dir,Global.data_dir,Global.filesize)
+#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
 
 #test getRow
 #c = Cut()
-#c.getRow(107,Global.inverse_dir,Global.filesize)
+#c.getRow(200,Global.cutnews_dir,Global.filesize)
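The changed getRow lines above map a 1-based record number onto a cut file and a 1-based line inside it (each cut file holds `size` records). A minimal sketch of that mapping, for illustration only and not part of the commit; `//` stands in for the integer division that Python 2's `/` performs here:

```python
def locate(recordnum, size):
    """Map a 1-based record number to (file index, 1-based line number)."""
    filenum = (recordnum - 1) // size      # which <filenum>.txt holds the record
    linenum = (recordnum - 1) % size + 1   # which line inside that file
    return filenum, linenum

# With size = 100: record 100 -> file 0, line 100; record 101 -> file 1, line 1.
print(locate(200, 100))   # (1, 100)
```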
ml/Cut.pyc (new file, binary): Binary file not shown.
ml/Global.py
@@ -1,5 +1,7 @@
-data_dir = "../news_spider/title.json"
+title_dir = "../news_spider/title.json"
+content_dir="../news_spider/news.json"
 db_dir = "../news_spider/news.db"
 stopword_dir="./stopword.txt"
 inverse_dir="./inversedata"
+cutnews_dir="./cutnews"
 filesize = 100
ml/Global.pyc (binary): Binary file not shown.
@@ -1,8 +1,12 @@
-#encoding=utf-8
+# -*- coding: utf-8 -*-
 import jieba
 import json
 import sys
 import Global
+from Cut import Cut
+from sklearn import feature_extraction
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
 reload(sys)
 sys.setdefaultencoding('utf-8')
 
@@ -10,12 +14,13 @@ sys.setdefaultencoding('utf-8')
 class InverseIndex:
 
     def __init__(self):
-        self.file_data= open(Global.data_dir)
+        self.file_data= open(Global.title_dir)
         self.file_sw = open(Global.stopword_dir)
-        self.ii = open(Global.inverse_dir,'wb')
+        #self.ii = open(Global.inverse_dir,'wb')
         self.stopword=[]
         self.worddict = dict()
 
+    #load stopword list
     def loadsw(self):
         while True:
             line = self.file_sw.readline()
@@ -24,7 +29,8 @@ class InverseIndex:
             self.stopword.append(line)
             print line,
 
-    def loaddata(self):
+    #load origin data:news.json,title.json
+    def CalcInverseIndex(self):
         self.loadsw()
         count=0
         while True:
@@ -41,10 +47,52 @@ class InverseIndex:
                 print w,
                 self.worddict[w].append(count)
 
+    def loadDataFromFile(self):
+        doc = []
+        f = open(Global.content_dir,'r')
+        while True:
+            line = f.readline()
+            if not line:
+                break
+            data = json.loads(line)
+            seg_list = list(jieba.cut(data['title'],cut_all=True))
+            doc.append(seg_list)
+        return doc
+
+
+    def loadDataFromCutFile(self,totalnum):
+        doc = []
+        cut = Cut()
+        for i in range(1,totalnum):
+            line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
+            if not line:
+                break
+            data = json.loads(line)
+            seg_list = jieba.cut(data['content'],cut_all=True)
+            for seg in seg_list:
+                seg=''.join(seg.split())
+                if(seg!='' and seg!="\n" and seg!="\n\n"):
+                    doc.append(seg)
+        return doc
+
+
+    #save inverse table to file
     def write2file(self):
         for w in self.worddict:
             ii.write(w+' '+str(worddict[w])+'\n')
 
+
+
+    #calculate tf-idf
+    def CalcTFIDF(self):
+        docArray = self.loadDataFromCutFile(100)
+        vectorizer = CountVectorizer()
+        transformer = TfidfTransformer()
+        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
+        print 'done'
+        for name in vectorizer.get_feature_names():
+            print name
+
+#test
 ii = InverseIndex()
-ii.loaddata()
-ii.write2file()
+ii.CalcTFIDF()
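The new CalcTFIDF feeds jieba output through scikit-learn's CountVectorizer and TfidfTransformer. CountVectorizer treats each element of its input list as one document, so the usual pattern is to join the tokens of one article into a single space-separated string. A minimal sketch with hypothetical in-line documents (not repository code):

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Hypothetical, already-segmented documents: one space-joined string per article.
docs = [
    "北京 新闻 发布 会议",
    "北京 体育 比赛 新闻",
]

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(docs)            # term-count matrix, one row per document
tfidf = TfidfTransformer().fit_transform(counts)   # tf-idf weights, same shape

terms = vectorizer.get_feature_names_out()         # get_feature_names() on older scikit-learn
for row in range(tfidf.shape[0]):
    weights = tfidf[row].toarray().ravel()
    top = weights.argsort()[::-1][:2]              # the 2 highest-weighted terms of this document
    print([(terms[i], round(float(weights[i]), 3)) for i in top])
```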
ml/README.md (new file): 30 lines
@@ -0,0 +1,30 @@
# News retrieval system framework

## Backend

### Crawling news
- Step 1: Crawl news from NetEase, Toutiao, and Tencent, and save it in JSON format.
- Step 2: Because the crawled content is fairly large, split it across multiple files, each holding 100 news documents.

++**News items inside the files are numbered consecutively starting from 1 by default.**++

### Building the index
- Extract keywords for each news document using tf-idf.
- Build an inverted index stored in small files, which requires maintaining two dictionaries (sketched just below):
  - `term -> term id`: stored to a file as JSON and loaded into an in-memory dict when the system starts.
  - `term -> document ids` (i.e. the postings): loaded on demand at query time.
- To avoid reading and writing everything in one pass, read the news content in batches, write finished index parts to files, and update the postings of terms that have already been seen.

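A small sketch of the two dictionaries described above (file names and layout are assumptions, not repository code): a `term -> term id` map that is persisted as JSON and kept in memory, and per-term postings that can be written out in pieces and loaded on demand.

```python
import json

term_dict = {}   # term -> term id, loaded at startup
postings = {}    # term id -> list of document ids (the postings), loaded on demand

def add_document(doc_id, terms):
    """Register one document's terms in the in-memory index."""
    for term in set(terms):
        term_id = term_dict.setdefault(term, len(term_dict))
        postings.setdefault(term_id, []).append(doc_id)

def save_term_dict(path="term_dict.json"):   # assumed file name
    with open(path, "w") as f:
        json.dump(term_dict, f, ensure_ascii=False)

add_document(1, ["北京", "新闻"])
add_document(2, ["北京", "比赛"])
save_term_dict()
print(postings[term_dict["北京"]])   # [1, 2]
```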
### Searching
- Step 1: Split the input word or sentence into terms and look each term up separately.
- Step 2: Merge the postings returned for each term to obtain the final result set. **Optimization: merge in order of increasing document frequency** (a merge sketch follows the Search.py stubs below).
- Step 3: Optionally, sort the results by time.
- Step 4: Return a content summary and the URL link for each result.

### ~~Similar news recommendation~~
**++To be decided++**

## Frontend
- Tentatively developed with the lightweight web.py framework (see the sketch below).
- Page structure:
  - Search home page
  - Search results page
  - News page (with a similar-news recommendation section in the sidebar)
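For the frontend, a minimal web.py sketch of the two pages mentioned above (URLs, class names, and the `q` query parameter are assumptions, not repository code):

```python
import web

urls = (
    '/', 'Index',          # search home page
    '/search', 'Results',  # search results page
)

class Index:
    def GET(self):
        return "news search home page"

class Results:
    def GET(self):
        query = web.input(q="").q   # query string from the search form, e.g. /search?q=北京
        return "results for: " + query

if __name__ == "__main__":
    app = web.application(urls, globals())
    app.run()
```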
ml/Search.py (new file): 16 lines
@@ -0,0 +1,16 @@
import sys
import json

class Search:
    def __init__(self):
        pass

    def getQueryItem(self,InputItem):
        pass

    def getInverseRecord(self,item):
        pass

    def mergeInverseRecord(self,RecordList):
        pass
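mergeInverseRecord above is still a stub; the README's Step 2 suggests intersecting the per-term postings starting from the term with the smallest document frequency so that intermediate results stay small. A sketch of that merge (illustration only, not repository code):

```python
def merge_postings(record_list):
    """Intersect posting lists, visiting the shortest (rarest term) lists first."""
    if not record_list:
        return []
    ordered = sorted(record_list, key=len)   # smallest document frequency first
    result = set(ordered[0])
    for postings in ordered[1:]:
        result &= set(postings)
        if not result:                       # early exit once the intersection is empty
            break
    return sorted(result)

print(merge_postings([[1, 2, 5, 9], [2, 5], [2, 5, 7, 9, 12]]))   # [2, 5]
```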
ml/div.py (deleted): 26 lines
@@ -1,26 +0,0 @@
-# -*- coding: utf-8 -*-
-import jieba
-import json
-import sys
-
-reload(sys)
-sys.setdefaultencoding('utf-8')
-
-class DivideWord:
-    def __init__(self):
-        pass
-    def parse(self):
-        file = open('../news_spider/title.json')
-
-        while True:
-            line = file.readline()
-            if not line:
-                break
-            data = json.loads(line)
-            seg_list = list(jieba.cut(data['title'], cut_all=True))
-            for w in seg_list:
-                print w.encode('utf-8'),
-            print '\n'
-
-dw = DivideWord()
-dw.parse()