Update text_tools.py

This commit is contained in:
yongzhuo 2019-05-12 09:49:24 +08:00 committed by GitHub
parent dc9940d880
commit 873cd8ac9b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -131,69 +131,6 @@ def get_syboml(ques):
answer = re.sub("[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》[\]|{}【】~@#¥%…&*\/\-—_]+", " ", ques).strip() answer = re.sub("[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》[\]|{}【】~@#¥%…&*\/\-—_]+", " ", ques).strip()
return answer return answer
def xlsRead(sheetName=None, cols=0, fileXlsPath=None):
    """Read every row of one worksheet from an .xls file using xlrd.

    Args:
        sheetName: Worksheet selector. A string selects the sheet by name,
            an int selects it by position, and None (the default) selects
            the first sheet of the workbook.
        cols: Not used by this function; kept so existing callers that pass
            it keep working.
        fileXlsPath: Path of the workbook, handed to ``xlrd.open_workbook``.

    Returns:
        A list with one entry per row of the sheet, each entry being the
        result of ``sheet.row_values(i)``.
    """
    workbook = xlrd.open_workbook(fileXlsPath)
    # Fetch the sheet by index or by name. The default of None used to be
    # passed straight to sheet_by_name, which cannot match any sheet.
    if sheetName is None:
        sheet = workbook.sheet_by_index(0)
    elif isinstance(sheetName, int):
        sheet = workbook.sheet_by_index(sheetName)
    else:
        sheet = workbook.sheet_by_name(sheetName)
    return [sheet.row_values(i) for i in range(sheet.nrows)]
def xlsxWrite(sheetName, writeList, fileXlsName):
    """Write rows into a new worksheet of a fresh workbook using openpyxl.

    Args:
        sheetName: Title of the worksheet that receives the rows.
        writeList: Iterable of rows; each row is passed to ``sheet.append``.
        fileXlsName: Destination path handed to ``Workbook.save``.
    """
    wb = Workbook()
    # A new Workbook already holds a default sheet named "Sheet" (a workbook
    # created in Office 2016 provides "Sheet1" instead). That default sheet
    # is not removed here. ``sheetnames`` replaces the deprecated
    # ``get_sheet_names()`` and yields the same list of titles.
    print('{}'.format(wb.sheetnames))
    sheet = wb.create_sheet(sheetName)
    for row in writeList:
        sheet.append(row)
    wb.save(fileXlsName)
def txtRead(filePath, encodeType='utf-8'):
    """Read a text file and return its lines.

    Reading is best-effort: if opening or decoding fails, the error message
    is logged and whatever lines were read so far are returned.

    Args:
        filePath: Path of the file to read.
        encodeType: Text encoding passed to ``open``.

    Returns:
        A list of the lines read, each keeping its trailing newline; empty
        if the file could not be opened.
    """
    listLine = []
    try:
        # ``with`` closes the handle even when a read fails part-way through.
        with open(filePath, 'r', encoding=encodeType) as fh:
            # Append line by line so a mid-file error keeps the partial result.
            for line in fh:
                listLine.append(line)
    except Exception as e:
        logger.info(str(e))
    # Returning here rather than from a ``finally`` clause lets
    # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
    return listLine
def txtWrite(listLine, filePath, type='w', encodeType='utf-8'):
    """Write a sequence of strings to a text file.

    Writing is best-effort: if opening or writing fails, the error message
    is logged and the function returns normally.

    Args:
        listLine: Iterable of strings passed to ``writelines``; no newlines
            are added.
        filePath: Path of the file to write.
        type: File mode passed to ``open`` (for example 'w' or 'a'). The
            name shadows the builtin but is kept for keyword callers.
        encodeType: Text encoding passed to ``open``.
    """
    try:
        # ``with`` closes the handle even when writelines raises.
        with open(filePath, type, encoding=encodeType) as fh:
            fh.writelines(listLine)
    except Exception as e:
        logger.info(str(e))
# -*- coding: cp936 -*- # -*- coding: cp936 -*-
def strQ2B(ustring): def strQ2B(ustring):
@ -320,6 +257,15 @@ def load_word2vec_model(model_path, binary_type=True, encoding_type = 'utf-8', l
return word2vec_model return word2vec_model
def text_preprocess(text):
    """Normalise a piece of text for later processing.

    Text that is empty or only whitespace is returned unchanged. Any other
    text is passed through ``strQ2B`` and then ``getChinese1`` (both defined
    elsewhere in this module), and the result is lower-cased.
    """
    if not text.strip():
        return text
    converted = strQ2B(text)
    return getChinese1(converted).lower()
#todo #句子改写,同义词替换,去停用词等 #todo #句子改写,同义词替换,去停用词等