2016-04-28 12:38:50 +08:00
|
|
|
# -*- coding: utf-8 -*-
|
2016-04-27 23:21:28 +08:00
|
|
|
import json
|
|
|
|
import sys
|
2016-04-29 16:08:50 +08:00
|
|
|
sys.path.append("..")
|
|
|
|
import tools.Global as Global
|
2016-04-28 12:38:50 +08:00
|
|
|
import os
|
|
|
|
import linecache
|
2016-04-29 16:08:50 +08:00
|
|
|
import jieba
|
2016-04-27 23:21:28 +08:00
|
|
|
|
|
|
|
class Cut:
    """Split a line-oriented JSON file into numbered chunk files and read rows back.

    Chunk files are named ``<path>/0.txt``, ``<path>/1.txt``, ... and each one
    holds at most ``size`` lines of the input, in input order.  Record numbers
    used by the lookup methods are 1-based, so record ``n`` lives on line
    ``(n - 1) % size + 1`` of chunk ``(n - 1) // size``.
    """

    def __init__(self):
        pass

    @staticmethod
    def _chunk_path(path, index):
        """Return the file name of chunk number ``index`` inside ``path``."""
        return path + '/' + str(index) + '.txt'

    @staticmethod
    def _segment_record(line):
        """Segment the ``content`` field of one JSON line with jieba.

        Returns the re-serialised record as bytes terminated by a newline.
        """
        data = json.loads(line.decode('utf-8'))
        seg_list = jieba.cut(data['content'], cut_all=False)
        data['content'] = ' '.join(seg_list)
        # json.dumps escapes non-ASCII by default, so the encode is lossless
        # and lets the same code write to a binary handle on Python 2 and 3.
        return (json.dumps(data) + '\n').encode('utf-8')

    def _split(self, path, fliename, size, transform):
        """Stream ``fliename`` into chunk files of ``size`` lines under ``path``.

        ``transform`` receives each raw input line (bytes) and returns the
        bytes to write.  A chunk file is only created once there is a line to
        put into it, so no empty trailing chunk is produced.
        """
        if size < 1:
            # range(0, size) would be empty, and the old loop then created
            # empty chunk files forever.
            raise ValueError('size must be a positive integer, got %r' % (size,))

        cut_file = None
        try:
            with open(fliename, 'rb') as file_data:
                if not os.path.exists(path):
                    os.makedirs(path)
                for index, line in enumerate(file_data):
                    if index % size == 0:
                        # Current chunk is full (or this is the first line):
                        # roll over to the next numbered file.
                        if cut_file is not None:
                            cut_file.close()
                        cutfilename = self._chunk_path(path, index // size)
                        cut_file = open(cutfilename, 'wb')
                        print('Generate:' + cutfilename + '...')
                    cut_file.write(transform(line))
        finally:
            # Close the last chunk even when a transform or write fails.
            if cut_file is not None:
                cut_file.close()

    def _read_row(self, recordnum, path, size):
        """Return the raw chunk line that holds 1-based record ``recordnum``."""
        # Floor division keeps the chunk index an int under true division too.
        filenum = (recordnum - 1) // size
        linenum = (recordnum - 1) % size + 1
        cutfilename = self._chunk_path(path, filenum)
        print('%s %s' % (cutfilename, linenum))
        # Clear the cache before and after so a regenerated chunk is never
        # served stale and the chunk contents are not kept in memory.
        linecache.clearcache()
        line = linecache.getline(cutfilename, linenum)
        linecache.clearcache()
        # Validation only: linecache returns '' for a missing file or line,
        # which json.loads rejects with ValueError.
        json.loads(line)
        return line

    def cutfile(self, path, fliename, size):
        """Split ``fliename`` into chunks, word-segmenting each record's content.

        Every input line must be a JSON object with a ``content`` key; that
        value is replaced by its jieba tokens joined with single spaces.

        :param path: output directory, created if it does not exist.
        :param fliename: input file with one JSON object per line.
        :param size: maximum number of records per chunk file (>= 1).
        :raises ValueError: if ``size`` is smaller than 1 or a line is not
            valid JSON.
        """
        self._split(path, fliename, size, self._segment_record)

    def cutfileWithoutCut(self, path, fliename, size):
        """Split ``fliename`` into chunks, copying every line unchanged.

        Lines are copied byte-for-byte, including their line endings.

        :param path: output directory, created if it does not exist.
        :param fliename: input file, one record per line.
        :param size: maximum number of lines per chunk file (>= 1).
        :raises ValueError: if ``size`` is smaller than 1.
        """
        self._split(path, fliename, size, lambda line: line)

    def getInverseIndexRow(self, recordnum, path, size):
        """Return the raw JSON line for 1-based record ``recordnum``.

        Behaves exactly like :meth:`getRow`.

        :param recordnum: 1-based record number.
        :param path: directory holding the chunk files.
        :param size: number of records per chunk used when splitting.
        :raises ValueError: if the record does not exist or is not valid JSON.
        """
        return self._read_row(recordnum, path, size)

    def getRow(self, recordnum, path, size):
        """Return the raw JSON line for 1-based record ``recordnum``.

        :param recordnum: 1-based record number.
        :param path: directory holding the chunk files.
        :param size: number of records per chunk used when splitting.
        :raises ValueError: if the record does not exist or is not valid JSON.
        """
        return self._read_row(recordnum, path, size)
|
|
|
|
|
|
|
|
#test cutfile
|
|
|
|
#c = Cut()
|
2016-05-04 00:46:51 +08:00
|
|
|
#c.cutfileWithoutCut(Global.cutnews_origin_dir,Global.content_dir,Global.filesize)
|
2016-04-28 22:26:14 +08:00
|
|
|
#c.cutfile(Global.cutnews_dir,Global.content_dir,Global.filesize)
|
2016-04-28 12:38:50 +08:00
|
|
|
|
|
|
|
#test getRow
|
|
|
|
#c = Cut()
|
2016-04-28 22:26:14 +08:00
|
|
|
#c.getRow(200,Global.cutnews_dir,Global.filesize)
|