NewsSpider/ml/Cut.py
2016-05-05 23:04:30 +08:00

90 lines
2.1 KiB
Python

# -*- coding: utf-8 -*-
import json
import sys
sys.path.append("..")
import tools.Global as Global
import os
import linecache
import jieba
class Cut:
    """Split a JSON-lines file into numbered chunk files and read records back.

    Chunk files are named ``<path>/<n>.txt`` with ``n`` starting at 0, and each
    holds at most ``size`` lines.  Records are addressed by a 1-based
    ``recordnum``; record ``r`` lives in chunk ``(r - 1) // size`` at line
    ``(r - 1) % size + 1``.
    """

    def __init__(self):
        pass

    @staticmethod
    def _chunk_path(path, filenum):
        """Return the path of chunk number ``filenum`` inside ``path``."""
        # Plain concatenation (not os.path.join) so the name that is printed
        # and handed to linecache is exactly what callers have always seen.
        return path + '/' + str(filenum) + '.txt'

    @staticmethod
    def _segment_line(line):
        """Word-segment the ``content`` field of one JSON line with jieba.

        ``line`` is the raw UTF-8 encoded line.  Returns the re-serialised
        JSON object followed by a newline, as bytes.
        """
        data = json.loads(line.decode('utf-8'))
        data['content'] = ' '.join(jieba.cut(data['content'], cut_all=False))
        # json.dumps escapes non-ASCII by default, so the text is pure ASCII.
        return (json.dumps(data) + '\n').encode('ascii')

    def _split(self, path, fliename, size, transform=None):
        """Write ``fliename`` into chunk files of at most ``size`` lines.

        ``transform`` maps each raw input line (bytes) to the bytes written;
        ``None`` copies lines unchanged.
        """
        if size <= 0:
            # range(size) would be empty, end-of-file would never be seen and
            # the loop below would create empty chunk files forever.
            raise ValueError('size must be a positive integer, got %r' % (size,))
        # Binary mode on both sides: lines are copied byte-for-byte and any
        # decoding is done explicitly by the transform.
        with open(fliename, 'rb') as file_data:
            if not os.path.exists(path):
                os.makedirs(path)
            num = 0
            exhausted = False
            while not exhausted:
                cutfilename = self._chunk_path(path, num)
                with open(cutfilename, 'wb') as cut_file:
                    print('Generate:' + cutfilename + '...')
                    for _ in range(size):
                        line = file_data.readline()
                        if not line:
                            exhausted = True
                            break
                        if transform is not None:
                            line = transform(line)
                        cut_file.write(line)
                num += 1

    def cutfile(self, path, fliename, size):
        """Split ``fliename`` into chunks, word-segmenting each ``content``.

        Every input line must be a JSON object with a ``content`` key; its
        value is replaced by the jieba tokens joined with single spaces.

        Args:
            path: Output directory; created if it does not exist.
            fliename: Path of the input JSON-lines file.
            size: Maximum number of lines per chunk; must be positive.

        Raises:
            ValueError: If ``size`` is not positive or a line is not valid JSON.

        Note:
            When the input length is an exact multiple of ``size`` (including
            an empty input) a final empty chunk file is also written.
        """
        self._split(path, fliename, size, self._segment_line)

    def cutfileWithoutCut(self, path, fliename, size):
        """Split ``fliename`` into chunks, copying every line unchanged.

        Args:
            path: Output directory; created if it does not exist.
            fliename: Path of the input file.
            size: Maximum number of lines per chunk; must be positive.

        Raises:
            ValueError: If ``size`` is not positive.

        Note:
            When the input length is an exact multiple of ``size`` (including
            an empty input) a final empty chunk file is also written.
        """
        self._split(path, fliename, size)

    def _read_record(self, recordnum, path, size):
        """Return the raw line of 1-based record ``recordnum`` from the chunks."""
        # Floor division keeps the chunk number an integer on Python 2 and 3.
        filenum = (recordnum - 1) // size
        linenum = (recordnum - 1) % size + 1
        cutfilename = self._chunk_path(path, filenum)
        print('%s %s' % (cutfilename, linenum))
        # Clear before reading so a chunk rewritten since the last call is not
        # served from a stale cache, and after so the chunk is not kept in memory.
        linecache.clearcache()
        line = linecache.getline(cutfilename, linenum)
        linecache.clearcache()
        # linecache returns '' for a missing file or line; parsing makes that
        # (and any corrupt record) fail loudly instead of returning garbage.
        json.loads(line)
        return line

    def getInverseIndexRow(self, recordnum, path, size):
        """Return the raw JSON line of record ``recordnum`` (1-based).

        Args:
            recordnum: 1-based record number across all chunks.
            path: Directory holding the chunk files.
            size: Lines per chunk used when the chunks were written.

        Returns:
            The line as read by ``linecache`` (still a JSON string, not parsed).

        Raises:
            ValueError: If the record does not exist or is not valid JSON.
        """
        return self._read_record(recordnum, path, size)

    def getRow(self, recordnum, path, size):
        """Return the raw JSON line of record ``recordnum`` (1-based).

        Args:
            recordnum: 1-based record number across all chunks.
            path: Directory holding the chunk files.
            size: Lines per chunk used when the chunks were written.

        Returns:
            The line as read by ``linecache`` (still a JSON string, not parsed).

        Raises:
            ValueError: If the record does not exist or is not valid JSON.
        """
        return self._read_record(recordnum, path, size)
#test cutfile
#c = Cut()
#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize)
#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
#test getRow
#c = Cut()
#c.getRow(200,Global.cutnews_dir,Global.filesize)