Add TF-IDF calculation; organize the search structure
parent df183760f8
commit fd7e81c2be
1 .gitignore (vendored)

@@ -5,3 +5,4 @@ news_spider/title.json
news_spider/news.json
news_spider/news.db
ml/inversedata
ml/cutnews
14 ml/Cut.py

@@ -30,19 +30,21 @@ class Cut:
        cut_file.close()
        num+=1

    def getRow(self,recordnum,path,size):
        filenum = recordnum/size
        linenum = recordnum%size
        filenum = (recordnum-1)/size
        linenum = (recordnum-1)%size+1
        cutfilename = path+'/'+str(filenum)+'.txt'
        print cutfilename
        print cutfilename,linenum
        linecache.clearcache()
        line = linecache.getline(cutfilename,linenum)
        print line
        linecache.clearcache()
        data = json.loads(line)
        #print data['title'],data['time']
        return line

#test cutfile
#c = Cut()
#c.cutfile(Global.inverse_dir,Global.data_dir,Global.filesize)
#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)

#test getRow
#c = Cut()
#c.getRow(107,Global.inverse_dir,Global.filesize)
#c.getRow(200,Global.cutnews_dir,Global.filesize)
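The getRow change above fixes an off-by-one in mapping a record number to its chunk file and line: with records and lines numbered from 1 and chunk files named 0.txt, 1.txt, ..., a record number that is an exact multiple of the chunk size used to resolve to line 0 of the next file. A small sketch of the corrected arithmetic; the helper `locate` is illustrative, not part of the repo:

```python
# Illustrative helper, not from the repo: map a 1-based record number to
# (chunk file index, 1-based line number), matching the new getRow formula.
def locate(recordnum, size):
    filenum = (recordnum - 1) // size     # which chunk file (0.txt, 1.txt, ...)
    linenum = (recordnum - 1) % size + 1  # which line inside that file
    return filenum, linenum

# With size = 100: record 100 now maps to file 0, line 100
# (the old formula gave file 1, line 0), and record 107 maps to file 1, line 7.
assert locate(100, 100) == (0, 100)
assert locate(107, 100) == (1, 7)
```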
BIN ml/Cut.pyc (new file)
Binary file not shown.
ml/Global.py

@@ -1,5 +1,7 @@
data_dir = "../news_spider/title.json"
title_dir = "../news_spider/title.json"
content_dir="../news_spider/news.json"
db_dir = "../news_spider/news.db"
stopword_dir="./stopword.txt"
inverse_dir="./inversedata"
cutnews_dir="./cutnews"
filesize = 100
BIN ml/Global.pyc
Binary file not shown.
@@ -1,8 +1,12 @@
#encoding=utf-8
# -*- coding: utf-8 -*-
import jieba
import json
import sys
import Global
from Cut import Cut
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
reload(sys)
sys.setdefaultencoding('utf-8')

@@ -10,12 +14,13 @@ sys.setdefaultencoding('utf-8')
class InverseIndex:

    def __init__(self):
        self.file_data= open(Global.data_dir)
        self.file_data= open(Global.title_dir)
        self.file_sw = open(Global.stopword_dir)
        self.ii = open(Global.inverse_dir,'wb')
        #self.ii = open(Global.inverse_dir,'wb')
        self.stopword=[]
        self.worddict = dict()

    #load stopword list
    def loadsw(self):
        while True:
            line = self.file_sw.readline()
@@ -24,7 +29,8 @@ class InverseIndex:
            self.stopword.append(line)
            print line,

    def loaddata(self):
    #load origin data: news.json, title.json
    def CalcInverseIndex(self):
        self.loadsw()
        count=0
        while True:
@@ -41,10 +47,52 @@ class InverseIndex:
                print w,
                self.worddict[w].append(count)

    def loadDataFromFile(self):
        doc = []
        f = open(Global.content_dir,'r')
        while True:
            line = f.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'],cut_all=True))
            doc.append(seg_list)
        return doc

    def loadDataFromCutFile(self,totalnum):
        doc = []
        cut = Cut()
        for i in range(1,totalnum):
            line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
            if not line:
                break
            data = json.loads(line)
            seg_list = jieba.cut(data['content'],cut_all=True)
            for seg in seg_list:
                seg=''.join(seg.split())
                if(seg!='' and seg!="\n" and seg!="\n\n"):
                    doc.append(seg)
        return doc

    #save inverse table to file
    def write2file(self):
        for w in self.worddict:
            self.ii.write(w+' '+str(self.worddict[w])+'\n')

    #calculate tf-idf
    def CalcTFIDF(self):
        docArray = self.loadDataFromCutFile(100)
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
        print 'done'
        for name in vectorizer.get_feature_names():
            print name

#test
ii = InverseIndex()
ii.loaddata()
ii.write2file()
ii.CalcTFIDF()
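CalcTFIDF above fits CountVectorizer and TfidfTransformer but only prints the vocabulary; the tf-idf weights themselves are what the README's keyword-extraction step needs. A hedged sketch of turning those weights into per-document keywords, assuming each entry of `docs` is one document's tokens joined by spaces (as committed, loadDataFromCutFile returns individual tokens, so they would need joining per document first); `top_keywords` is an illustrative name, not project code:

```python
# Hedged sketch, not part of the commit: rank each document's terms by tf-idf.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def top_keywords(docs, k=5):
    """docs: list of strings, one document per entry, tokens separated by spaces."""
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(docs))
    terms = vectorizer.get_feature_names()     # column index -> term
    keywords = []
    for row in tfidf.toarray():                # one dense row per document
        ranked = sorted(range(len(row)), key=lambda j: row[j], reverse=True)
        keywords.append([terms[j] for j in ranked[:k] if row[j] > 0])
    return keywords
```

Note that CountVectorizer's default token pattern drops single-character tokens, which matters for Chinese text; the tokenizer settings the project will use are not shown in this commit.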
30 ml/README.md (new file)

@@ -0,0 +1,30 @@
# News Retrieval System Framework

## Backend
### News crawling
- Step 1: Crawl NetEase, Toutiao, and Tencent news and save it in JSON format.
- Step 2: The crawled data is large, so it is split across files, with 100 news documents per file.
  **By default, news records within a file are numbered from 1 to the end.**
### 构建索引
|
||||
- 根据tf-idf提取每篇新闻文档的关键词。
|
||||
- 建立倒排索引,以小文件进行存储,因此需要维护两个词典
|
||||
- `词项-编号`,以json格式存储到文件,启动系统初加载到字典中。
|
||||
- `词项-文档编号`,(即倒排记录),查询时按需进行加载。
|
||||
- 避免一次性读写,分批次读取新闻内容,建好索引写入文件,对于已经出现的词项更新倒排记录。
|
||||
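A rough sketch of how those two dictionaries could be persisted, assuming JSON for the term table and plain-text chunk files for the posting lists; the file names and formats here are assumptions for illustration, not necessarily what the project uses:

```python
# Illustration only: persist the term table and chunked posting lists.
import json

# 1) term -> term id: one JSON object, loaded into a dict at startup.
term_ids = {u"新闻": 0, u"检索": 1}
with open("term_ids.json", "w") as f:
    json.dump(term_ids, f)

# 2) term id -> posting list (document ids), kept in small chunk files so a
#    query only loads the chunks containing the terms it actually needs.
postings = {0: [1, 5, 12], 1: [5, 9]}
with open("postings_0.txt", "w") as f:
    for term_id in sorted(postings):
        f.write("%d %s\n" % (term_id, json.dumps(postings[term_id])))
```

Keeping the posting lists in small chunk files keeps startup cheap: only the term table has to stay resident in memory.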

### Search
- Step 1: Split the input word or sentence and look up each term separately.
- Step 2: Merge the posting lists returned for the terms into the final result set. **Optimization: merge in order of increasing document frequency** (see the sketch after this list).
- Step 3: Optionally sort the results by time.
- Step 4: Return a content summary and the URL for each hit.
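A sketch of the Step 2 merge, treating the query as an AND of its terms and intersecting the shortest posting lists first so intermediate results stay small; `merge_postings` is an illustrative helper, not code from this commit:

```python
# Illustration only: intersect posting lists, lowest document frequency first.
def merge_postings(posting_lists):
    if not posting_lists:
        return []
    ordered = sorted(posting_lists, key=len)   # smallest posting list first
    result = set(ordered[0])
    for plist in ordered[1:]:
        result &= set(plist)
        if not result:                         # early exit: no common document
            break
    return sorted(result)

# e.g. merge_postings([[1, 5, 12], [5, 9], [2, 5]]) == [5]
```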

### ~~Similar-news recommendation~~
**To be decided.**

## Frontend
- Tentatively developed with the lightweight web.py framework (a minimal route sketch follows below).
- Page structure:
  - Search home page
  - Search results page
  - News page (with a sidebar section recommending similar news)
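A minimal web.py sketch of that page structure; the routes, class names, and placeholder responses are assumptions for illustration, not the project's actual handlers:

```python
# Illustration only: a bare-bones web.py app with the three pages listed above.
import web

urls = (
    '/', 'Index',              # search home page
    '/search', 'Results',      # search results page
    r'/news/(\d+)', 'News',    # single news page
)

class Index:
    def GET(self):
        return "search form placeholder"

class Results:
    def GET(self):
        query = web.input(q="").q
        return "results placeholder for: " + query

class News:
    def GET(self, news_id):
        return "news page placeholder for id " + news_id

if __name__ == "__main__":
    web.application(urls, globals()).run()
```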
16 ml/Search.py (new file)

@@ -0,0 +1,16 @@
import sys
import json

class Search:
    def __init__(self):
        pass

    def getQueryItem(self,InputItem):
        pass

    def getInverseRecord(self,item):
        pass

    def mergeInverseRecord(self,RecordList):
        pass
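Search is committed as a stub. One possible way getQueryItem could split a query, reusing jieba as the rest of the project does; this is a hedged guess at intent, not the author's implementation:

```python
# Hedged sketch only: split a query string into lookup terms with jieba.
import jieba

def get_query_items(query):
    items = []
    for term in jieba.cut_for_search(query):
        term = term.strip()
        if term:
            items.append(term)
    return items

# e.g. get_query_items(u"腾讯新闻") yields the individual terms to look up
# in the inverted index, one posting-list fetch per term.
```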
26 ml/div.py (deleted)

@@ -1,26 +0,0 @@
# -*- coding: utf-8 -*-
import jieba
import json
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

class DivideWord:
    def __init__(self):
        pass
    def parse(self):
        file = open('../news_spider/title.json')

        while True:
            line = file.readline()
            if not line:
                break
            data = json.loads(line)
            seg_list = list(jieba.cut(data['title'], cut_all=True))
            for w in seg_list:
                print w.encode('utf-8'),
            print '\n'

dw = DivideWord()
dw.parse()