Add tf-idf calculation and organize the search structure
commit fd7e81c2be
parent df183760f8
.gitignore (vendored): 1 changed line
@@ -5,3 +5,4 @@ news_spider/title.json
 news_spider/news.json
 news_spider/news.db
 ml/inversedata
+ml/cutnews
ml/Cut.py: 14 changed lines
@@ -30,19 +30,21 @@ class Cut:
        cut_file.close()
        num+=1
    def getRow(self,recordnum,path,size):
-        filenum = recordnum/size
-        linenum = recordnum%size
+        filenum = (recordnum-1)/size
+        linenum = (recordnum-1)%size+1
        cutfilename = path+'/'+str(filenum)+'.txt'
-        print cutfilename
+        print cutfilename,linenum
        linecache.clearcache()
        line = linecache.getline(cutfilename,linenum)
-        print line
+        linecache.clearcache()
+        data = json.loads(line)
+        #print data['title'],data['time']
        return line
 
 #test cutfile
 #c = Cut()
-#c.cutfile(Global.inverse_dir,Global.data_dir,Global.filesize)
+#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
 
 #test getRow
 #c = Cut()
-#c.getRow(107,Global.inverse_dir,Global.filesize)
+#c.getRow(200,Global.cutnews_dir,Global.filesize)
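The changed getRow lines above map a 1-based record number onto a cut file and a 1-based line inside it (each cut file holds `size` records). A minimal sketch of that mapping, for illustration only and not part of the commit; `//` stands in for the integer division that Python 2's `/` performs here:

```python
def locate(recordnum, size):
    """Map a 1-based record number to (file index, 1-based line number)."""
    filenum = (recordnum - 1) // size      # which <filenum>.txt holds the record
    linenum = (recordnum - 1) % size + 1   # which line inside that file
    return filenum, linenum

# With size = 100: record 100 -> file 0, line 100; record 101 -> file 1, line 1.
print(locate(200, 100))   # (1, 100)
```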
ml/Cut.pyc (new file, binary): Binary file not shown.
ml/Global.py
@@ -1,5 +1,7 @@
-data_dir = "../news_spider/title.json"
+title_dir = "../news_spider/title.json"
+content_dir="../news_spider/news.json"
 db_dir = "../news_spider/news.db"
 stopword_dir="./stopword.txt"
 inverse_dir="./inversedata"
+cutnews_dir="./cutnews"
 filesize = 100
ml/Global.pyc (binary): Binary file not shown.
@@ -1,8 +1,12 @@
-#encoding=utf-8
+# -*- coding: utf-8 -*-
 import jieba
 import json
 import sys
 import Global
+from Cut import Cut
+from sklearn import feature_extraction
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
 reload(sys)
 sys.setdefaultencoding('utf-8')
 
@@ -10,12 +14,13 @@ sys.setdefaultencoding('utf-8')
 class InverseIndex:
 
     def __init__(self):
-        self.file_data= open(Global.data_dir)
+        self.file_data= open(Global.title_dir)
         self.file_sw = open(Global.stopword_dir)
-        self.ii = open(Global.inverse_dir,'wb')
+        #self.ii = open(Global.inverse_dir,'wb')
         self.stopword=[]
         self.worddict = dict()
 
+    #load stopword list
     def loadsw(self):
         while True:
             line = self.file_sw.readline()
@@ -24,7 +29,8 @@ class InverseIndex:
             self.stopword.append(line)
             print line,
 
-    def loaddata(self):
+    #load origin data:news.json,title.json
+    def CalcInverseIndex(self):
         self.loadsw()
         count=0
         while True:
@@ -41,10 +47,52 @@ class InverseIndex:
                 print w,
                 self.worddict[w].append(count)
 
+    def loadDataFromFile(self):
+        doc = []
+        f = open(Global.content_dir,'r')
+        while True:
+            line = f.readline()
+            if not line:
+                break
+            data = json.loads(line)
+            seg_list = list(jieba.cut(data['title'],cut_all=True))
+            doc.append(seg_list)
+        return doc
+
+
+    def loadDataFromCutFile(self,totalnum):
+        doc = []
+        cut = Cut()
+        for i in range(1,totalnum):
+            line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
+            if not line:
+                break
+            data = json.loads(line)
+            seg_list = jieba.cut(data['content'],cut_all=True)
+            for seg in seg_list:
+                seg=''.join(seg.split())
+                if(seg!='' and seg!="\n" and seg!="\n\n"):
+                    doc.append(seg)
+        return doc
+
+
+    #save inverse table to file
     def write2file(self):
         for w in self.worddict:
             ii.write(w+' '+str(worddict[w])+'\n')
 
+
+
+    #calculate tf-idf
+    def CalcTFIDF(self):
+        docArray = self.loadDataFromCutFile(100)
+        vectorizer = CountVectorizer()
+        transformer = TfidfTransformer()
+        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
+        print 'done'
+        for name in vectorizer.get_feature_names():
+            print name
+
+#test
 ii = InverseIndex()
-ii.loaddata()
-ii.write2file()
+ii.CalcTFIDF()
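The new CalcTFIDF feeds jieba output through scikit-learn's CountVectorizer and TfidfTransformer. CountVectorizer treats each element of its input list as one document, so the usual pattern is to join the tokens of one article into a single space-separated string. A minimal sketch with hypothetical in-line documents (not repository code):

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Hypothetical, already-segmented documents: one space-joined string per article.
docs = [
    "北京 新闻 发布 会议",
    "北京 体育 比赛 新闻",
]

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(docs)            # term-count matrix, one row per document
tfidf = TfidfTransformer().fit_transform(counts)   # tf-idf weights, same shape

terms = vectorizer.get_feature_names_out()         # get_feature_names() on older scikit-learn
for row in range(tfidf.shape[0]):
    weights = tfidf[row].toarray().ravel()
    top = weights.argsort()[::-1][:2]              # the 2 highest-weighted terms of this document
    print([(terms[i], round(float(weights[i]), 3)) for i in top])
```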
ml/README.md (new file): 30 lines
@@ -0,0 +1,30 @@
# News retrieval system framework

## Backend

### Crawling news
- Step 1: Crawl news from NetEase, Toutiao, and Tencent, and save it in JSON format.
- Step 2: Because the crawled content is fairly large, split it across multiple files, each holding 100 news documents.

++**News items inside the files are numbered consecutively starting from 1 by default.**++

### Building the index
- Extract keywords for each news document using tf-idf.
- Build an inverted index stored in small files, which requires maintaining two dictionaries (sketched just below):
  - `term -> term id`: stored to a file as JSON and loaded into an in-memory dict when the system starts.
  - `term -> document ids` (i.e. the postings): loaded on demand at query time.
- To avoid reading and writing everything in one pass, read the news content in batches, write finished index parts to files, and update the postings of terms that have already been seen.

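A small sketch of the two dictionaries described above (file names and layout are assumptions, not repository code): a `term -> term id` map that is persisted as JSON and kept in memory, and per-term postings that can be written out in pieces and loaded on demand.

```python
import json

term_dict = {}   # term -> term id, loaded at startup
postings = {}    # term id -> list of document ids (the postings), loaded on demand

def add_document(doc_id, terms):
    """Register one document's terms in the in-memory index."""
    for term in set(terms):
        term_id = term_dict.setdefault(term, len(term_dict))
        postings.setdefault(term_id, []).append(doc_id)

def save_term_dict(path="term_dict.json"):   # assumed file name
    with open(path, "w") as f:
        json.dump(term_dict, f, ensure_ascii=False)

add_document(1, ["北京", "新闻"])
add_document(2, ["北京", "比赛"])
save_term_dict()
print(postings[term_dict["北京"]])   # [1, 2]
```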
### Searching
- Step 1: Split the input word or sentence into terms and look each term up separately.
- Step 2: Merge the postings returned for each term to obtain the final result set. **Optimization: merge in order of increasing document frequency** (a merge sketch follows the Search.py stubs below).
- Step 3: Optionally, sort the results by time.
- Step 4: Return a content summary and the URL link for each result.

### ~~Similar news recommendation~~
**++To be decided++**

## Frontend
- Tentatively developed with the lightweight web.py framework (see the sketch below).
- Page structure:
  - Search home page
  - Search results page
  - News page (with a similar-news recommendation section in the sidebar)
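For the frontend, a minimal web.py sketch of the two pages mentioned above (URLs, class names, and the `q` query parameter are assumptions, not repository code):

```python
import web

urls = (
    '/', 'Index',          # search home page
    '/search', 'Results',  # search results page
)

class Index:
    def GET(self):
        return "news search home page"

class Results:
    def GET(self):
        query = web.input(q="").q   # query string from the search form, e.g. /search?q=北京
        return "results for: " + query

if __name__ == "__main__":
    app = web.application(urls, globals())
    app.run()
```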
ml/Search.py (new file): 16 lines
@@ -0,0 +1,16 @@
import sys
import json

class Search:
    def __init__(self):
        pass

    def getQueryItem(self,InputItem):
        pass

    def getInverseRecord(self,item):
        pass

    def mergeInverseRecord(self,RecordList):
        pass
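mergeInverseRecord above is still a stub; the README's Step 2 suggests intersecting the per-term postings starting from the term with the smallest document frequency so that intermediate results stay small. A sketch of that merge (illustration only, not repository code):

```python
def merge_postings(record_list):
    """Intersect posting lists, visiting the shortest (rarest term) lists first."""
    if not record_list:
        return []
    ordered = sorted(record_list, key=len)   # smallest document frequency first
    result = set(ordered[0])
    for postings in ordered[1:]:
        result &= set(postings)
        if not result:                       # early exit once the intersection is empty
            break
    return sorted(result)

print(merge_postings([[1, 2, 5, 9], [2, 5], [2, 5, 7, 9, 12]]))   # [2, 5]
```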
ml/div.py (deleted): 26 lines
@@ -1,26 +0,0 @@
-# -*- coding: utf-8 -*-
-import jieba
-import json
-import sys
-
-reload(sys)
-sys.setdefaultencoding('utf-8')
-
-class DivideWord:
-    def __init__(self):
-        pass
-    def parse(self):
-        file = open('../news_spider/title.json')
-
-        while True:
-            line = file.readline()
-            if not line:
-                break
-            data = json.loads(line)
-            seg_list = list(jieba.cut(data['title'], cut_all=True))
-            for w in seg_list:
-                print w.encode('utf-8'),
-            print '\n'
-
-dw = DivideWord()
-dw.parse()