Add tf-idf calculation and organize the search structure

This commit is contained in:
lzjqsdd 2016-04-28 22:26:14 +08:00
parent df183760f8
commit fd7e81c2be
9 changed files with 112 additions and 39 deletions

1
.gitignore vendored
View File

@@ -5,3 +5,4 @@ news_spider/title.json
 news_spider/news.json
 news_spider/news.db
 ml/inversedata
+ml/cutnews

View File

@@ -30,19 +30,21 @@ class Cut:
             cut_file.close()
             num+=1
 
     def getRow(self,recordnum,path,size):
-        filenum = recordnum/size
-        linenum = recordnum%size
+        filenum = (recordnum-1)/size
+        linenum = (recordnum-1)%size+1
         cutfilename = path+'/'+str(filenum)+'.txt'
-        print cutfilename
+        print cutfilename,linenum
         linecache.clearcache()
         line = linecache.getline(cutfilename,linenum)
-        print line
+        linecache.clearcache()
+        data = json.loads(line)
+        #print data['title'],data['time']
         return line
 
 #test cutfile
 #c = Cut()
-#c.cutfile(Global.inverse_dir,Global.data_dir,Global.filesize)
+#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
 #test getRow
 #c = Cut()
-#c.getRow(107,Global.inverse_dir,Global.filesize)
+#c.getRow(200,Global.cutnews_dir,Global.filesize)

BIN
ml/Cut.pyc Normal file

Binary file not shown.

View File

@@ -1,5 +1,7 @@
-data_dir = "../news_spider/title.json"
+title_dir = "../news_spider/title.json"
+content_dir="../news_spider/news.json"
 db_dir = "../news_spider/news.db"
 stopword_dir="./stopword.txt"
 inverse_dir="./inversedata"
+cutnews_dir="./cutnews"
 filesize = 100

Binary file not shown.

View File

@@ -1,8 +1,12 @@
-#encoding=utf-8
+# -*- coding: utf-8 -*-
 import jieba
 import json
 import sys
 import Global
+from Cut import Cut
+from sklearn import feature_extraction
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
 
 reload(sys)
 sys.setdefaultencoding('utf-8')
@@ -10,12 +14,13 @@ sys.setdefaultencoding('utf-8')
 
 class InverseIndex:
     def __init__(self):
-        self.file_data= open(Global.data_dir)
+        self.file_data= open(Global.title_dir)
         self.file_sw = open(Global.stopword_dir)
-        self.ii = open(Global.inverse_dir,'wb')
+        #self.ii = open(Global.inverse_dir,'wb')
         self.stopword=[]
         self.worddict = dict()
 
+    #load stopword list
     def loadsw(self):
         while True:
             line = self.file_sw.readline()
@@ -24,7 +29,8 @@ class InverseIndex:
             self.stopword.append(line)
             print line,
 
-    def loaddata(self):
+    #load origin data:news.json,title.json
+    def CalcInverseIndex(self):
         self.loadsw()
         count=0
         while True:
@@ -41,10 +47,52 @@ class InverseIndex:
                     print w,
                     self.worddict[w].append(count)
 
+    def loadDataFromFile(self):
+        doc = []
+        f = open(Global.content_dir,'r')
+        while True:
+            line = f.readline()
+            if not line:
+                break
+            data = json.loads(line)
+            seg_list = list(jieba.cut(data['title'],cut_all=True))
+            doc.append(seg_list)
+        return doc
+
+    def loadDataFromCutFile(self,totalnum):
+        doc = []
+        cut = Cut()
+        for i in range(1,totalnum):
+            line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
+            if not line:
+                break
+            data = json.loads(line)
+            seg_list = jieba.cut(data['content'],cut_all=True)
+            for seg in seg_list:
+                seg=''.join(seg.split())
+                if(seg!='' and seg!="\n" and seg!="\n\n"):
+                    doc.append(seg)
+        return doc
+
+    #save inverse table to file
     def write2file(self):
         for w in self.worddict:
             ii.write(w+' '+str(worddict[w])+'\n')
 
+    #calculate tf-idf
+    def CalcTFIDF(self):
+        docArray = self.loadDataFromCutFile(100)
+        vectorizer = CountVectorizer()
+        transformer = TfidfTransformer()
+        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
+        print 'done'
+        for name in vectorizer.get_feature_names():
+            print name
+
+#test
 ii = InverseIndex()
-ii.loaddata()
-ii.write2file()
+ii.CalcTFIDF()

30
ml/README.md Normal file
View File

@@ -0,0 +1,30 @@
# News Retrieval System Framework
## Backend
### Crawling news
- Step 1: Crawl NetEase headline news and Tencent news and save them in JSON format.
- Step 2: Because the crawled content is large, split it into chunk files, each holding 100 news documents.
**News records within the files are numbered from 1 onward by default** (see the sketch below).
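The chunk-file arithmetic implied by this numbering is what `Cut.getRow` in this commit relies on. A minimal sketch, assuming 100 records per file and a hypothetical helper name `locate_record` that is not in the commit:

```python
FILESIZE = 100  # matches Global.filesize

def locate_record(recordnum, size=FILESIZE):
    # record n lives in chunk file (n-1)//size, at line (n-1)%size + 1
    filenum = (recordnum - 1) // size     # which chunk file, counted from 0
    linenum = (recordnum - 1) % size + 1  # 1-based line within that file
    return filenum, linenum

# locate_record(1)   -> (0, 1)
# locate_record(100) -> (0, 100)
# locate_record(101) -> (1, 1)
```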
### Building the index
- Extract keywords for each news document with tf-idf.
- Build an inverted index stored across small files, which requires maintaining two dictionaries:
    - `term-id`: stored in a JSON file and loaded into an in-memory dictionary when the system starts.
    - `term-document ids` (the inverse records, i.e. postings): loaded on demand at query time.
- Avoid one-shot reads and writes: read the news content in batches, write the index to file as it is built, and update the inverse records of terms that have already appeared. A sketch of the two-dictionary layout follows.
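A minimal sketch of the layout described above, with hypothetical helper names `build_index` and `save_index` that are not part of the commit; the term-id map stays small enough to keep in memory, while inverse records are written one per line so individual terms can be loaded on demand:

```python
# -*- coding: utf-8 -*-
import json

def build_index(docs):
    """docs: iterable of (doc_id, [terms]) pairs."""
    term_ids = {}   # term -> term id (the `term-id` dictionary)
    postings = {}   # term id -> [doc ids] (the `term-document ids` records)
    for doc_id, terms in docs:
        for term in set(terms):
            tid = term_ids.setdefault(term, len(term_ids))
            postings.setdefault(tid, []).append(doc_id)
    return term_ids, postings

def save_index(term_ids, postings, term_file, postings_file):
    # term-id map as a single JSON object, loaded once at startup
    with open(term_file, 'w') as f:
        json.dump(term_ids, f)
    # one inverse record per line, so a single term can be fetched on demand
    with open(postings_file, 'w') as f:
        for tid in sorted(postings):
            f.write(json.dumps({'id': tid, 'docs': postings[tid]}) + '\n')
```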
### Retrieval
- Step 1: Split the input word or sentence and look up each term separately.
- Step 2: Merge the inverse records returned for each term to obtain the final result set. **Optimization: merge in order of ascending document frequency** (see the sketch after this list).
- Step 3: Optionally sort the results by time.
- Step 4: Return a content summary and the url link.
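A minimal sketch of the Step 2 merge, using a hypothetical `merge_postings` helper that is not in the commit; intersecting the shortest postings list first keeps the intermediate result small:

```python
def merge_postings(record_lists):
    # record_lists: one list of document ids per query term
    if not record_lists:
        return []
    # ascending document frequency == shortest postings list first
    record_lists = sorted(record_lists, key=len)
    result = set(record_lists[0])
    for records in record_lists[1:]:
        result &= set(records)
        if not result:  # no document contains every term, stop early
            break
    return sorted(result)

# merge_postings([[1, 5, 9], [1, 2, 5, 7, 9], [5, 9, 20]]) -> [5, 9]
```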
### ~~Similar-news recommendation~~
**To be decided.**
## Front-end
- Tentatively developed with the lightweight webpy framework (a minimal routing sketch follows after this list).
- Page structure:
    - Search home page
    - Search results page
    - News page (with a sidebar section recommending similar news)
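A minimal web.py routing sketch for these three pages; the URLs and handler names are placeholders, not part of the commit:

```python
import web

urls = (
    '/', 'Index',             # search home page
    '/search', 'Results',     # search results page
    '/news/(\d+)', 'News',    # news page, e.g. /news/42
)

class Index:
    def GET(self):
        return "search home"

class Results:
    def GET(self):
        query = web.input(q='').q  # the ?q=... query parameter
        return "results for: " + query

class News:
    def GET(self, news_id):
        return "news item " + news_id

if __name__ == '__main__':
    app = web.application(urls, globals())
    app.run()
```

web.py maps each URL regex in `urls` to a handler class and calls its `GET` method to render the page, which fits the three-page structure listed above.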

16
ml/Search.py Normal file
View File

@@ -0,0 +1,16 @@
import sys
import json

class Search:
    def __init__(self):
        pass

    def getQueryItem(self,InputItem):
        pass

    def getInverseRecord(self,item):
        pass

    def mergeInverseRecord(self,RecordList):
        pass

View File

@@ -1,26 +0,0 @@
# -*- coding: utf-8 -*-
import jieba
import json
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

class DivideWord:
    def __init__(self):
        pass

    def parse(self):
        file = open('../news_spider/title.json')
        while True:
            line = file.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'], cut_all=True))
            for w in seg_list:
                print w.encode('utf-8'),
            print '\n'

dw = DivideWord()
dw.parse()