From 9da1dac6e8585602706fcf4d46e6e51f9dfca7a4 Mon Sep 17 00:00:00 2001 From: lzjqsdd Date: Wed, 27 Apr 2016 23:21:28 +0800 Subject: [PATCH] none --- ml/Cut.py | 24 ++++++++++++++++++++++++ ml/Global.py | 1 + ml/Global.pyc | Bin 270 -> 307 bytes ml/InverseIndex.py | 7 ++++++- 4 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 ml/Cut.py diff --git a/ml/Cut.py b/ml/Cut.py new file mode 100644 index 0000000..630f2dc --- /dev/null +++ b/ml/Cut.py @@ -0,0 +1,24 @@ +#encoding:utf8 +import json +import sys +import Global + +class Cut: + def __init__(self): +#every 30 news in a flie + self.size = 30 + self.file_data = open(Global.data_dir) + + def cutfile(self,file): + + num = 0 + While(True): + line = self.file_data.readline() + if not line: + break + + num+=1 + filename = str(num/self.size)+'.txt' + if num%self.size != 0: + + diff --git a/ml/Global.py b/ml/Global.py index aa10081..f13f78e 100644 --- a/ml/Global.py +++ b/ml/Global.py @@ -1,3 +1,4 @@ data_dir = "../news_spider/title.json" db_dir = "../news_spider/news.db" stopword_dir="./stopword.txt" +inverse_dir="./ii.txt" diff --git a/ml/Global.pyc b/ml/Global.pyc index 6a914e7f1ae3af4673586db18578734587258566..7bb9ddaeab82d342eebbab10cf5f2fc0d4a0e6ab 100644 GIT binary patch delta 115 zcmeBU+RS9b{F#@_<+@yWG6NJa0%->zE|vijDGUrzK$4Lmijg6Oi6M%KA%&SCikTsW zg&|mjb)uQSKrshUnVx=Trd~-!iJu0`#10*v5^kVaW?orpQE}?T4PjhtKrS022m%0e CLKL0= delta 81 zcmdnY)W>AT{F#^Qx`$kNG6NJa0%->zE*1k4DGUrzK$4Lmijg6Oi6M%KA%&SCSc7Gv ZjX$%W2J^((Iuox4aj*bYvN5tT0st-M4Kx4% diff --git a/ml/InverseIndex.py b/ml/InverseIndex.py index b985422..847ddfc 100644 --- a/ml/InverseIndex.py +++ b/ml/InverseIndex.py @@ -12,6 +12,7 @@ class InverseIndex: def __init__(self): self.file_data= open(Global.data_dir) self.file_sw = open(Global.stopword_dir) + self.ii = open(Global.inverse_dir,'wb') self.stopword=[] self.worddict = dict() @@ -24,7 +25,6 @@ class InverseIndex: print line, def loaddata(self): - self.loadsw() count=0 while True: @@ -38,8 +38,13 @@ class InverseIndex: if w not in self.worddict: self.worddict[w] = [] if w not in self.stopword: + print w, self.worddict[w].append(count) + def write2file(self): + for w in self.worddict: + ii.write(w+' '+str(worddict[w])+'\n') ii = InverseIndex() ii.loaddata() +ii.write2file()