From d1824516d39382399ed3fb3dad2738ecffc9392d Mon Sep 17 00:00:00 2001
From: lzjqsdd <lzj7179@163.com>
Date: Wed, 4 May 2016 00:46:51 +0800
Subject: [PATCH] =?UTF-8?q?=E6=94=AF=E6=8C=81=E8=8B=B1=E6=96=87=E6=90=9C?=
 =?UTF-8?q?=E7=B4=A2=EF=BC=8C=E4=B8=AD=E6=96=87=E6=90=9C=E7=B4=A2=E7=BC=96?=
 =?UTF-8?q?=E7=A0=81=E5=AD=98=E5=9C=A8=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore         |   1 +
 ml/Cut.py          |  22 ++++++++++++++++++++++
 ml/Cut.pyc         | Bin 1946 -> 2350 bytes
 ml/InverseIndex.py |   8 ++++----
 ml/Search.py       |  30 +++++++++++++++++++++++++++---
 test/test_tool.py  |   3 +--
 tools/Global.py    |   1 +
 tools/Global.pyc   | Bin 442 -> 494 bytes
 tools/__init__.pyc | Bin 107 -> 107 bytes
 tools/show.pyc     | Bin 1730 -> 1730 bytes
 10 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index e8c7a12..c44ff6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ data/news.db
 data/news.json
 data/title.json
 data/cutnews
+data/orinews
 data/inversedata
diff --git a/ml/Cut.py b/ml/Cut.py
index 701a081..0974f87 100644
--- a/ml/Cut.py
+++ b/ml/Cut.py
@@ -35,6 +35,27 @@ class Cut:
 				cut_file.write(json.dumps(data)+'\n')
 			cut_file.close()
 			num+=1
+
+	def cutfileWithoutCut(self,path,fliename,size):
+		"""Copy the lines of `fliename` unchanged into path/0.txt, path/1.txt, ... with at most `size` lines per file."""
+		if size <= 0:
+			raise ValueError('size must be positive')  # size <= 0 never consumed input and looped forever
+		num = 0
+		done = False
+		with open(fliename,'r') as file_data:  # closed on exit; the handle used to be leaked
+			if not os.path.exists(path):
+				os.makedirs(path)
+			while not done:
+				cutfilename = path+'/'+str(num)+'.txt'
+				with open(cutfilename,'wb') as cut_file:
+					print 'Generate:'+cutfilename+'...'
+					for i in range(0,size):
+						line = file_data.readline()
+						if not line:
+							done = True
+							break
+						cut_file.write(line)
+				num+=1
 	def getRow(self,recordnum,path,size):
 		filenum = (recordnum-1)/size
 		linenum = (recordnum-1)%size+1
@@ -48,6 +69,7 @@ class Cut:
 
 #test cutfile
 #c = Cut()
+#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize)
 #c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
 
 #test getRow
diff --git a/ml/Cut.pyc b/ml/Cut.pyc
index 470c343079802a4f9508f1f98fcfa0bd3e7b47a0..0cb63acab8b494e5e8f076a17429196ede492353 100644
GIT binary patch
delta 539
zcmZXQJ4*vW5XWb7m%H4Vhl$S^qftZ;2^(w`QBe>Zi?s*{780XI;w!fSg+Pi3)?vGJ
z7PdBGX>BL?6+{bbzko9rgM}>f+ufP}JeW@6A!QriG57w}YuitlZ$Hn(4h*;f*bRf+
z@yzz;niT~AAs)9dWBk7Lttk_I<Tl9G$^0+SO`M3lfI_H!fWF70<SHl|4noi{pcR6R
zy@Jk$a>I=XoH)>Mp@o2#9VoKH)pw!MrbCssCD*KwK?MoYB=`)Wu9OV(k9C{nvvO@;
zl^1>13n4t7M{-EIjNr?o%NQfhNHF>tNyZ2xP3Yz}gMLBE7)LUUaYmLg!I)&^7*mXC
zf=AYc-`!N<v|3oND{*nHxJ{|94lC7iv7(O8>xu$%<n(7b3lnJ3aWIM|ru36JfXT-<
za~XSkUa*F;ODs<X@;Ih&l;kemE+u=GE?M&{Tu=(a-zdTB-#n75@OJzNff^f5+8NaE
P))sm?Z)baLtG4(7b-_{*

delta 317
zcmZ1{G>e~|`7<w9qK5KB_N|P{6ZfjH1O;m_PkiU7tjWN@pr4VSo2s8vl~q`rlA@oU
zS&~tjr0<tnUL0JInUY$x*_2V5nHYtW!&rhg?`1P(v?Rh*{oEXV=h70rg31y$pc~Rt
zOM>#t)h6?CNHeld*5c3#)dh-b05K9#Op>`lAkADrTnw^+jgga?Q)2RD4o^n)$#*$q
KiSf`RPCWntHe12~

diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py
index b562531..f49faa8 100644
--- a/ml/InverseIndex.py
+++ b/ml/InverseIndex.py
@@ -73,7 +73,7 @@ class InverseIndex:
 			data = json.loads(line)
 #	seg_list = jieba.cut(data['content'],cut_all=True)
 
-			keyword = analyse.extract_tags(data['content'],topK=10)
+			keyword = analyse.extract_tags(data['content'],topK=20)
 			seg = " ".join(keyword)
 			print seg
 			doc.append(seg)
@@ -93,8 +93,8 @@ class InverseIndex:
 		f = open(Global.inverse_dir+'id.txt','wb')
 		word = vectorizer.get_feature_names()
 		for name in vectorizer.get_feature_names():
-			indexdoc[name] = i
 			i+=1
+			indexdoc[name] = i
 		f.write(json.dumps(indexdoc))
 		f.close()
 		
@@ -103,11 +103,11 @@ class InverseIndex:
 		for i in range(0,colnum):
 			filename = Global.inverse_dir+str(i/Global.filesize)+'.txt'
 			f = open(filename,'a')
-			idx_list = list()
+			idx_list = dict()
 			for j in range(0,row):
 				val = tfidf[j,i]
 				if val > 0:
-					idx_list[j] = val
+					idx_list[j+1] = val
 			f.write(json.dumps(idx_list)+'\n')
 			f.close()
 
diff --git a/ml/Search.py b/ml/Search.py
index e103287..61ab677 100644
--- a/ml/Search.py
+++ b/ml/Search.py
@@ -1,12 +1,34 @@
+# -*- coding: utf-8 -*- 
 import sys
 import json
+reload(sys)
+sys.path.append("..")
+sys.setdefaultencoding('utf-8')
+from Cut import Cut
+import tools.Global as Global
 
 class Search:
 	def __init__(self):
-		pass
+		self.kw_id = self.loadKW_ID()
 
-	def getQueryItem(self,InputItem):
-		pass
+	def loadKW_ID(self):
+		"""Load the keyword -> id map from id.txt in Global.inverse_dir, print it, and return it keyed by UTF-8 byte strings."""
+		with open(Global.inverse_dir+'id.txt') as f:  # closed on exit; the handle used to be leaked
+			kw_id = json.loads(f.readline(), encoding='utf-8')  # the whole map is one JSON line
+		kwid = dict()
+		for ki in kw_id:
+			kwid[ki.encode('utf-8')] = kw_id[ki]
+		for i in kwid:
+			print i,kwid[i]
+		return kwid
+
+
+	def getQueryItem(self,searchWord):
+		"""Print title, time and content of the record whose number kw_id maps searchWord to; KeyError if the word is unknown."""
+		if isinstance(searchWord, unicode):
+			searchWord = searchWord.encode('utf-8')  # kw_id keys are UTF-8 bytes (see loadKW_ID); non-ASCII unicode never matched
+		data = json.loads(Cut().getRow(self.kw_id[searchWord],Global.cutnews_origin_dir,Global.filesize))
+		print data['title'],'\n',data['time'],'\n',data['content'],'\n'
 
 	def getInverseRecord(self,item):
 		pass
@@ -14,3 +36,5 @@ class Search:
 	def mergeInverseRecord(self,RecordList):
 		pass
 
+search = Search()
+search.getQueryItem(sys.argv[1].decode('utf-8'))
diff --git a/test/test_tool.py b/test/test_tool.py
index d63ac06..1aff0de 100644
--- a/test/test_tool.py
+++ b/test/test_tool.py
@@ -11,6 +11,5 @@ s = show()
 #s.showitem(2608)
 
 c = Cut()
-line = c.getRow(3176,Global.cutnews_dir,Global.filesize)
+line = c.getRow(2,Global.cutnews_origin_dir,Global.filesize)
 s.showitem(line)
-
diff --git a/tools/Global.py b/tools/Global.py
index 1059182..4b1a84a 100644
--- a/tools/Global.py
+++ b/tools/Global.py
@@ -5,4 +5,5 @@ db_dir = project_root+"data/news.db"
 stopword_dir=project_root+"data/stopword.txt"
 inverse_dir=project_root+"data/inversedata/"
 cutnews_dir=project_root+"data/cutnews/"
+cutnews_origin_dir=project_root+"data/orinews/"  # trailing slash, consistent with the sibling *_dir paths
 filesize = 100
diff --git a/tools/Global.pyc b/tools/Global.pyc
index 0927dabc64e55af7be313c9503e1d74652dee762..0b96661591952d68d6156998217c20a221abe698 100644
GIT binary patch
delta 134
zcmdnR{Ek_g`7<xq(-Ruu$qZ1y1f(5+xHxR0w5wPu149ZsgLo7>Lkb5&6bC~JCqu9X
z*TkGg$zmR$qLjpvME(4t%)Hd{;>;AFfS(5E#IN~6LO^D6X$eR;9w?ojnKwC}(NjbK
KD9pylg#rQEnjCKc

delta 87
zcmaFIyo*_y`7<w<nUZpNG6NJa0ci&yF7}xy?aGtF&Je}Ukix+btid_4tdS!#1t{gG
X!7*8YF@N$jMi(JIAfJtq3kd=Mu96Ll

diff --git a/tools/__init__.pyc b/tools/__init__.pyc
index 1d150466ee1597e9b874d4d55296ffd11047dfd4..6dd09e6c48885dca517820b07c521479f02fc2fb 100644
GIT binary patch
delta 13
Ucmd1KW@rA)%jL97V<LMB02~(t3jhEB

delta 13
Ucmd1KW@rA)%QfMU(nR(Y03G`TTL1t6

diff --git a/tools/show.pyc b/tools/show.pyc
index 60765fbeebeee11b2e475a0fb7f029d79686c93b..9a2865ed403e2d5fa89d892aec523e143a5e64a7 100644
GIT binary patch
delta 15
WcmX@adx)2f`7<w<)2@wdd)NRev;`*s

delta 15
WcmX@adx)2f`7<w<W9~+_J!}9dEd<#B