支持英文搜索，中文搜索编码存在问题

2016-05-04 00:46:51 +08:00 · 2016-05-04 00:46:51 +08:00 · d1824516d3
commit d1824516d3
parent c5dd35c5aa
10 changed files with 56 additions and 9 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ data/news.db
 data/news.json
 data/title.json
 data/cutnews
+data/orinews
 data/inversedata
--- a/ml/Cut.py
+++ b/ml/Cut.py
@@ -35,6 +35,27 @@ class Cut:
 				cut_file.write(json.dumps(data)+'\n')
 			cut_file.close()
 			num+=1
+
+	def cutfileWithoutCut(self,path,fliename,size):
+		file_data = open(fliename,'r')
+		num = 0
+		flag = 0
+		while True:
+			if flag == 1:
+				break
+			if not os.path.exists(path):
+				os.makedirs(path)
+			cutfilename = path+'/'+str(num)+'.txt'
+			cut_file = open(cutfilename,'wb')
+			print 'Generate:'+cutfilename+'...'
+			for i in range(0,size):
+				line = file_data.readline()
+				if not line:
+					flag = 1
+					break
+				cut_file.write(line)
+			cut_file.close()
+			num+=1
 	def getRow(self,recordnum,path,size):
 		filenum = (recordnum-1)/size
 		linenum = (recordnum-1)%size+1
@@ -48,6 +69,7 @@ class Cut:

 #test cutfile
 #c = Cut()
+#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize)
 #c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)

 #test getRow
--- a/ml/Cut.pyc
+++ b/ml/Cut.pyc
--- a/ml/InverseIndex.py
+++ b/ml/InverseIndex.py
@@ -73,7 +73,7 @@ class InverseIndex:
 			data = json.loads(line)
 #	seg_list = jieba.cut(data['content'],cut_all=True)

-			keyword = analyse.extract_tags(data['content'],topK=10)
+			keyword = analyse.extract_tags(data['content'],topK=20)
 			seg = " ".join(keyword)
 			print seg
 			doc.append(seg)
@@ -93,8 +93,8 @@ class InverseIndex:
 		f = open(Global.inverse_dir+'id.txt','wb')
 		word = vectorizer.get_feature_names()
 		for name in vectorizer.get_feature_names():
-			indexdoc[name] = i
 			i+=1
+			indexdoc[name] = i
 		f.write(json.dumps(indexdoc))
 		f.close()
 		
@@ -103,11 +103,11 @@ class InverseIndex:
 		for i in range(0,colnum):
 			filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
 			f = open(filename,'a')
-			idx_list = list()
+			idx_list = dict()
 			for j in range(0,row):
 				val = tfidf[j,i]
 				if val > 0:
-					idx_list[j] = val
+					idx_list[j+1] = val
 			f.write(json.dumps(idx_list)+'\n')
 			f.close()

--- a/ml/Search.py
+++ b/ml/Search.py
@@ -1,12 +1,34 @@
+# -*- coding: utf-8 -*- 
 import sys
 import json
+reload(sys)
+sys.path.append("..")
+sys.setdefaultencoding('utf-8')
+from Cut import Cut
+import tools.Global as Global

 class Search:
 	def __init__(self):
-		pass
+		self.kw_id = self.loadKW_ID()

-	def getQueryItem(self,InputItem):
-		pass
+	def loadKW_ID(self):
+		f = open(Global.inverse_dir+'id.txt')
+		line = f.readline()
+		kw_id = json.loads(line, encoding='utf-8')
+		kwid = dict()
+		for ki in kw_id:
+			kwid[ki.encode('utf-8')] = kw_id[ki]
+		for i in kwid:
+		 	print i,kwid[i]
+		return kwid
+
+
+	def getQueryItem(self,searchWord):
+		idx = self.kw_id[searchWord]
+		cut = Cut()
+		line = cut.getRow(idx,Global.cutnews_origin_dir,Global.filesize)
+		data = json.loads(line)
+		print data['title'],'\n',data['time'],'\n',data['content'],'\n'

 	def getInverseRecord(self,item):
 		pass
@@ -14,3 +36,5 @@ class Search:
 	def mergeInverseRecord(self,RecordList):
 		pass

+search = Search()
+search.getQueryItem(sys.argv[1].decode('utf-8'))
--- a/test/test_tool.py
+++ b/test/test_tool.py
@@ -11,6 +11,5 @@ s = show()
 #s.showitem(2608)

 c = Cut()
-line = c.getRow(3176,Global.cutnews_dir,Global.filesize)
+line = c.getRow(2,Global.cutnews_origin_dir,Global.filesize)
 s.showitem(line)
-
--- a/tools/Global.py
+++ b/tools/Global.py
@@ -5,4 +5,5 @@ db_dir = project_root+"data/news.db"
 stopword_dir=project_root+"data/stopword.txt"
 inverse_dir=project_root+"data/inversedata/"
 cutnews_dir=project_root+"data/cutnews/"
+cutnews_origin_dir=project_root+"data/orinews"
 filesize = 100
--- a/tools/Global.pyc
+++ b/tools/Global.pyc
--- a/tools/init.pyc
+++ b/tools/init.pyc
--- a/tools/show.pyc
+++ b/tools/show.pyc