diff --git a/.gitignore b/.gitignore index e8c7a12..c44ff6c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ data/news.db data/news.json data/title.json data/cutnews +data/orinews data/inversedata diff --git a/ml/Cut.py b/ml/Cut.py index 701a081..0974f87 100644 --- a/ml/Cut.py +++ b/ml/Cut.py @@ -35,6 +35,27 @@ class Cut: cut_file.write(json.dumps(data)+'\n') cut_file.close() num+=1 + + def cutfileWithoutCut(self,path,fliename,size): + file_data = open(fliename,'r') + num = 0 + flag = 0 + while True: + if flag == 1: + break + if not os.path.exists(path): + os.makedirs(path) + cutfilename = path+'/'+str(num)+'.txt' + cut_file = open(cutfilename,'wb') + print 'Generate:'+cutfilename+'...' + for i in range(0,size): + line = file_data.readline() + if not line: + flag = 1 + break + cut_file.write(line) + cut_file.close() + num+=1 def getRow(self,recordnum,path,size): filenum = (recordnum-1)/size linenum = (recordnum-1)%size+1 @@ -48,6 +69,7 @@ class Cut: #test cutfile #c = Cut() +#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize) #c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize) #test getRow diff --git a/ml/Cut.pyc b/ml/Cut.pyc index 470c343..0cb63ac 100644 Binary files a/ml/Cut.pyc and b/ml/Cut.pyc differ diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py index b562531..f49faa8 100644 --- a/ml/InverseIndex.py +++ b/ml/InverseIndex.py @@ -73,7 +73,7 @@ class InverseIndex: data = json.loads(line) # seg_list = jieba.cut(data['content'],cut_all=True) - keyword = analyse.extract_tags(data['content'],topK=10) + keyword = analyse.extract_tags(data['content'],topK=20) seg = " ".join(keyword) print seg doc.append(seg) @@ -93,8 +93,8 @@ class InverseIndex: f = open(Global.inverse_dir+'id.txt','wb') word = vectorizer.get_feature_names() for name in vectorizer.get_feature_names(): - indexdoc[name] = i i+=1 + indexdoc[name] = i f.write(json.dumps(indexdoc)) f.close() @@ -103,11 +103,11 @@ class InverseIndex: for i in range(0,colnum): filename = Global.inverse_dir+str(i/Global.filesize)+'.txt' f = open(filename,'a') - idx_list = list() + idx_list = dict() for j in range(0,row): val = tfidf[j,i] if val > 0: - idx_list[j] = val + idx_list[j+1] = val f.write(json.dumps(idx_list)+'\n') f.close() diff --git a/ml/Search.py b/ml/Search.py index e103287..61ab677 100644 --- a/ml/Search.py +++ b/ml/Search.py @@ -1,12 +1,34 @@ +# -*- coding: utf-8 -*- import sys import json +reload(sys) +sys.path.append("..") +sys.setdefaultencoding('utf-8') +from Cut import Cut +import tools.Global as Global class Search: def __init__(self): - pass + self.kw_id = self.loadKW_ID() - def getQueryItem(self,InputItem): - pass + def loadKW_ID(self): + f = open(Global.inverse_dir+'id.txt') + line = f.readline() + kw_id = json.loads(line, encoding='utf-8') + kwid = dict() + for ki in kw_id: + kwid[ki.encode('utf-8')] = kw_id[ki] + for i in kwid: + print i,kwid[i] + return kwid + + + def getQueryItem(self,searchWord): + idx = self.kw_id[searchWord] + cut = Cut() + line = cut.getRow(idx,Global.cutnews_origin_dir,Global.filesize) + data = json.loads(line) + print data['title'],'\n',data['time'],'\n',data['content'],'\n' def getInverseRecord(self,item): pass @@ -14,3 +36,5 @@ class Search: def mergeInverseRecord(self,RecordList): pass +search = Search() +search.getQueryItem(sys.argv[1].decode('utf-8')) diff --git a/test/test_tool.py b/test/test_tool.py index d63ac06..1aff0de 100644 --- a/test/test_tool.py +++ b/test/test_tool.py @@ -11,6 +11,5 @@ s = show() #s.showitem(2608) c = Cut() -line = c.getRow(3176,Global.cutnews_dir,Global.filesize) +line = c.getRow(2,Global.cutnews_origin_dir,Global.filesize) s.showitem(line) - diff --git a/tools/Global.py b/tools/Global.py index 1059182..4b1a84a 100644 --- a/tools/Global.py +++ b/tools/Global.py @@ -5,4 +5,5 @@ db_dir = project_root+"data/news.db" stopword_dir=project_root+"data/stopword.txt" inverse_dir=project_root+"data/inversedata/" cutnews_dir=project_root+"data/cutnews/" +cutnews_origin_dir=project_root+"data/orinews" filesize = 100 diff --git a/tools/Global.pyc b/tools/Global.pyc index 0927dab..0b96661 100644 Binary files a/tools/Global.pyc and b/tools/Global.pyc differ diff --git a/tools/__init__.pyc b/tools/__init__.pyc index 1d15046..6dd09e6 100644 Binary files a/tools/__init__.pyc and b/tools/__init__.pyc differ diff --git a/tools/show.pyc b/tools/show.pyc index 60765fb..9a2865e 100644 Binary files a/tools/show.pyc and b/tools/show.pyc differ