diff --git a/.gitignore b/.gitignore
index bc2ae0e..581c6da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ news_spider/title.json
 news_spider/news.json
 news_spider/news.db
 ml/inversedata
+ml/cutnews
diff --git a/ml/Cut.py b/ml/Cut.py
index 674fda9..35486f1 100644
--- a/ml/Cut.py
+++ b/ml/Cut.py
@@ -30,19 +30,21 @@ class Cut:
             cut_file.close()
             num+=1
     def getRow(self,recordnum,path,size):
-        filenum = recordnum/size
-        linenum = recordnum%size
+        filenum = (recordnum-1)/size
+        linenum = (recordnum-1)%size+1
         cutfilename = path+'/'+str(filenum)+'.txt'
-        print cutfilename
+        print cutfilename,linenum
         linecache.clearcache()
         line = linecache.getline(cutfilename,linenum)
-        print line
+        linecache.clearcache()
+        #data = json.loads(line)
+        #print data['title'],data['time']
         return line
 
 #test cutfile
 #c = Cut()
-#c.cutfile(Global.inverse_dir,Global.data_dir,Global.filesize)
+#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
 
 #test getRow
 #c = Cut()
-#c.getRow(107,Global.inverse_dir,Global.filesize)
+#c.getRow(200,Global.cutnews_dir,Global.filesize)
diff --git a/ml/Cut.pyc b/ml/Cut.pyc
new file mode 100644
index 0000000..b844e73
Binary files /dev/null and b/ml/Cut.pyc differ
diff --git a/ml/Global.py b/ml/Global.py
index 4cb68ff..db3ffeb 100644
--- a/ml/Global.py
+++ b/ml/Global.py
@@ -1,5 +1,7 @@
-data_dir = "../news_spider/title.json"
+title_dir = "../news_spider/title.json"
+content_dir="../news_spider/news.json"
 db_dir = "../news_spider/news.db"
 stopword_dir="./stopword.txt"
 inverse_dir="./inversedata"
+cutnews_dir="./cutnews"
 filesize = 100
diff --git a/ml/Global.pyc b/ml/Global.pyc
index fa87777..e79e2d3 100644
Binary files a/ml/Global.pyc and b/ml/Global.pyc differ
diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py
index 847ddfc..92930dc 100644
--- a/ml/InverseIndex.py
+++ b/ml/InverseIndex.py
@@ -1,8 +1,12 @@
-#encoding=utf-8
+# -*- coding: utf-8 -*-
 import jieba
 import json
 import sys
 import Global
+from Cut import Cut
+from sklearn import feature_extraction
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
 
 reload(sys)
 sys.setdefaultencoding('utf-8')
@@ -10,12 +14,13 @@ sys.setdefaultencoding('utf-8')
 
 class InverseIndex:
     def __init__(self):
-        self.file_data= open(Global.data_dir)
+        self.file_data= open(Global.title_dir)
         self.file_sw = open(Global.stopword_dir)
-        self.ii = open(Global.inverse_dir,'wb')
+        #self.ii = open(Global.inverse_dir,'wb')
         self.stopword=[]
         self.worddict = dict()
 
+    #load the stopword list
     def loadsw(self):
         while True:
             line = self.file_sw.readline()
@@ -24,7 +29,8 @@ class InverseIndex:
             if not line:
                 break
             self.stopword.append(line)
             print line,
-    def loaddata(self):
+    #load the raw data: news.json, title.json
+    def CalcInverseIndex(self):
         self.loadsw()
         count=0
         while True:
@@ -41,10 +47,52 @@
                 print w,
                 self.worddict[w].append(count)
 
+    def loadDataFromFile(self):
+        doc = []
+        f = open(Global.content_dir,'r')
+        while True:
+            line = f.readline()
+            if not line:
+                break
+            data = json.loads(line)
+            seg_list = list(jieba.cut(data['title'],cut_all=True))
+            doc.append(seg_list)
+        return doc
+
+
+    def loadDataFromCutFile(self,totalnum):
+        doc = []
+        cut = Cut()
+        for i in range(1,totalnum+1):
+            line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
+            if not line:
+                break
+            data = json.loads(line)
+            seg_list = jieba.cut(data['content'],cut_all=True)
+            for seg in seg_list:
+                seg=''.join(seg.split())
+                if seg!='':
+                    doc.append(seg)
+        return doc
+
+
+    #save the inverse index to file
     def write2file(self):
         for w in self.worddict:
             ii.write(w+' '+str(worddict[w])+'\n')
+
+
+    #calculate tf-idf
+    def CalcTFIDF(self):
+        docArray = self.loadDataFromCutFile(100)
+        vectorizer = CountVectorizer()
+        transformer = TfidfTransformer()
+        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
+        print 'done'
+        for name in vectorizer.get_feature_names():
+            print name
+
+#test
 ii = InverseIndex()
-ii.loaddata()
-ii.write2file()
+ii.CalcTFIDF()
diff --git a/ml/README.md b/ml/README.md
new file mode 100644
index 0000000..2294d15
--- /dev/null
+++ b/ml/README.md
@@ -0,0 +1,30 @@
+# News Retrieval System Framework
+
+## Backend
+### News crawling
+- Step 1: Crawl news from NetEase, Toutiao, and Tencent and save it in JSON format.
+- Step 2: Since the crawled data is large, split it into files of 100 news documents each.
+**By default, news records within each file are numbered consecutively starting from 1.**
+
+### Index construction
+- Extract keywords for each news document with tf-idf.
+- Build an inverted index stored in small files, which means maintaining two dictionaries:
+  - `term -> term id`, stored as JSON and loaded into memory at system startup.
+  - `term -> document ids` (the postings), loaded on demand at query time.
+- Avoid one-shot reads and writes: read the news in batches, write the index to file as it is built, and update the postings of terms that have already been seen.
+
+### Retrieval
+- Step 1: Segment the input word or sentence and query each term separately.
+- Step 2: Merge the postings returned for each term into the final result set. **Optimization: merge in order of increasing document frequency.**
+- Step 3: Consider sorting the results by time.
+- Step 4: Return a content summary and the URL.
+
+### ~~Similar news recommendation~~
+**TBD**
+
+## Frontend
+- Tentatively built on the lightweight web.py framework.
+- Page structure:
+  - Search home page
+  - Search results page
+  - News page (with a sidebar recommending similar news)
\ No newline at end of file
diff --git a/ml/Search.py b/ml/Search.py
new file mode 100644
index 0000000..e103287
--- /dev/null
+++ b/ml/Search.py
@@ -0,0 +1,16 @@
+import sys
+import json
+
+class Search:
+    def __init__(self):
+        pass
+
+    def getQueryItem(self,InputItem):
+        pass
+
+    def getInverseRecord(self,item):
+        pass
+
+    def mergeInverseRecord(self,RecordList):
+        pass
+
diff --git a/ml/div.py b/ml/div.py
deleted file mode 100644
index 0984854..0000000
--- a/ml/div.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# -*- coding: utf-8 -*-
-import jieba
-import json
-import sys
-
-reload(sys)
-sys.setdefaultencoding('utf-8')
-
-class DivideWord:
-    def __init__(self):
-        pass
-    def parse(self):
-        file = open('../news_spider/title.json')
-
-        while True:
-            line = file.readline()
-            if not line:
-                break
-            data = json.loads(line)
-            seg_list = list(jieba.cut(data['title'], cut_all=True))
-            for w in seg_list:
-                print w.encode('utf-8'),
-            print '\n'
-
-dw = DivideWord()
-dw.parse()
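
Search.py above only stubs out the retrieval flow. As a rough illustration of the merge step described in the README (intersect the postings returned for each term, smallest document frequency first), a minimal sketch could look like the following; it assumes each record is a plain list of document ids, which the stubs do not yet pin down.

# -*- coding: utf-8 -*-
# Sketch of mergeInverseRecord: intersect postings lists, visiting the
# shortest list (lowest document frequency) first so the candidate set
# shrinks as quickly as possible. The record layout is an assumption.

def mergeInverseRecord(RecordList):
    if not RecordList:
        return []
    ordered = sorted(RecordList, key=len)   # smallest document frequency first
    result = set(ordered[0])
    for record in ordered[1:]:
        result &= set(record)
        if not result:                      # early exit on empty intersection
            break
    return sorted(result)

# example: three terms with postings of different document frequency
print mergeInverseRecord([[1, 3, 5, 9], [3, 5], [2, 3, 5, 8, 9]])
# -> [3, 5]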
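
The README's two-dictionary scheme (term-to-id kept in memory as JSON, postings loaded on demand) is likewise not implemented in this change. A minimal sketch of the lookup side, reusing the bucketing arithmetic from Cut.getRow, might look like this; the file names (termdict.json, numbered .txt files) and the bucket layout are assumptions, not part of the patch.

# -*- coding: utf-8 -*-
# Sketch of on-demand postings lookup. The term dictionary is loaded once
# at startup; postings live in small files of `bucket` lines each, using
# the same (recordnum-1)/size arithmetic as Cut.getRow. All file names
# here are hypothetical.
import json

class PostingsStore:
    def __init__(self, path, bucket=100):
        self.path = path
        self.bucket = bucket                # postings lines per file
        f = open(path + '/termdict.json')
        self.termid = json.load(f)          # term -> 1-based term id
        f.close()

    def postings(self, term):
        tid = self.termid.get(term)
        if tid is None:
            return []
        filenum = (tid - 1) / self.bucket   # which small file
        linenum = (tid - 1) % self.bucket   # which line inside it
        f = open('%s/%d.txt' % (self.path, filenum))
        lines = f.readlines()
        f.close()
        return json.loads(lines[linenum])   # list of document ids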