chip rel first commit

loujie0822 2020-08-12 00:54:33 +08:00
parent a88565cf85
commit afce3517f9


@@ -9,9 +9,14 @@ Authors: daisongtai(daisongtai@baidu.com)
Date: 2019/5/29 6:38 PM
"""
from __future__ import print_function
import sys
import re
import io
from transformers import BertTokenizer
max_seq_length = 500
tokenizer = BertTokenizer.from_pretrained('transformer_cpt/bert', do_lower_case=True)
LHan = [
[0x2E80, 0x2E99], # Han # So [26] CJK RADICAL REPEAT, CJK RADICAL RAP
@@ -53,8 +58,10 @@ CN_PUNCTS = [(0x3002, "。"), (0xFF1F, "？"), (0xFF01, "！"), (0xFF0C, "，"),
(0xFF09, "）"), (0x3014, "〔"), (0x3015, "〕"), (0x3010, "【"),
(0x3011, "】"), (0x2014, "—"), (0x2026, "…"), (0x2013, "–"),
(0xFF0E, "．"), (0x300A, "《"), (0x300B, "》"), (0x3008, "〈"),
(0x3009, "〉"), (0x2015, "―"), (0xff0d, "－"), (0x0020, " "),(0xFF5E, "～")]
#(0xFF5E, "～"),
(0x2460, "①"), (0x2461, "②"), (0x2462, "③"), (0x2463, "④"),
(0x2464, "⑤"), (0x2465, "⑥"), (0x2466, "⑦"), (0x2467, "⑧"), (0x2468, "⑨"), (0x2469, "⑩"),
(0x3009, "〉"), (0x2015, "―"), (0xff0d, "－"), (0x0020, " "), (0xFF5E, "～")]
# (0xFF5E, "～"),
EN_PUNCTS = [[0x0021, 0x002F], [0x003A, 0x0040], [0x005B, 0x0060],
[0x007B, 0x007E]]
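These codepoint tables back the character test that the __main__ block below calls as extractor.is_chinese_or_punct(). The extractor class itself is not part of the hunks in this commit, so what follows is only a minimal sketch of how such a check could consult the tables; the function name and the abbreviated ranges are assumptions, not code from this repository.

# Hypothetical sketch (not from this commit): how LHan / CN_PUNCTS / EN_PUNCTS
# style tables can drive a "Chinese character or punctuation?" test.
LHAN_RANGES = [(0x2E80, 0x2E99), (0x4E00, 0x9FFF)]                     # abbreviated Han ranges
CN_PUNCT_CODEPOINTS = {0x3002, 0xFF1F, 0xFF01, 0xFF0C, 0xFF5E}         # abbreviated CJK punctuation
EN_PUNCT_RANGES = [(0x0021, 0x002F), (0x003A, 0x0040), (0x005B, 0x0060), (0x007B, 0x007E)]

def is_chinese_or_punct_sketch(char):
    # True if the single character is a Han character or a listed punctuation mark.
    cp = ord(char)
    if any(lo <= cp <= hi for lo, hi in LHAN_RANGES):
        return True
    if cp in CN_PUNCT_CODEPOINTS:
        return True
    return any(lo <= cp <= hi for lo, hi in EN_PUNCT_RANGES)

print(is_chinese_or_punct_sketch("患"), is_chinese_or_punct_sketch("B"))   # True False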
@@ -121,12 +128,12 @@ if __name__ == '__main__':
    # print("―", extractor.is_chinese_or_punct("―"))
    # print("-", extractor.is_chinese_or_punct("-"))
    text_raw="'ymx510335'"
    text_raw = "1%2%ALL患者有Burkitt淋巴瘤的形态学及免疫学特征即FAB形态学分类中的L3型常伴有腹部或其余部位的包块性疾病可以将之视为极晚期Burkitt淋巴瘤对这类患儿应采用晚期Burkitt淋巴瘤的治疗方案。"
    sub_text = []
    buff = ""
    flag_en=False
    flag_digit=False
    flag_en = False
    flag_digit = False
    for char in text_raw:
        if extractor.is_chinese_or_punct(char):
            if buff != "":
@@ -140,19 +147,36 @@ if __name__ == '__main__':
                if buff != "" and flag_en:
                    sub_text.append(buff)
                    buff = ""
                    flag_en =False
                    flag_en = False
                flag_digit = True
                buff +=char
                buff += char
            else:
                if buff != "" and flag_digit:
                    sub_text.append(buff)
                    buff = ""
                    flag_digit =False
                    flag_digit = False
                flag_en = True
                buff +=char
                buff += char
    if buff != "":
        sub_text.append(buff)
    tok_to_orig_start_index = []
    tok_to_orig_end_index = []
    tokens = []
    text_tmp = ''
    for (i, token) in enumerate(sub_text):
        sub_tokens = tokenizer.tokenize(token) if token != ' ' else []
        text_tmp += token
        for sub_token in sub_tokens:
            tok_to_orig_start_index.append(len(text_tmp) - len(token))
            tok_to_orig_end_index.append(len(text_tmp) - 1)
            tokens.append(sub_token)
            if len(tokens) >= max_seq_length - 2:
                break
        else:
            continue
        break
    print(sub_text)
    print(tokens)
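The block added above walks the pre-split chunks, wordpiece-tokenizes each one, and records for every token the first and last character index of the chunk it came from in the original string, stopping once max_seq_length - 2 tokens are collected so [CLS] and [SEP] still fit. Below is a self-contained sketch of the same idea; the pretrained model path and the example input are placeholders, not values from this commit.

from transformers import BertTokenizer

def tokenize_with_offsets(sub_text, tokenizer, max_seq_length=500):
    # Map every wordpiece back to the character span of the chunk it came from,
    # mirroring the loop in the diff above (coarse, chunk-level offsets).
    tok_to_orig_start_index, tok_to_orig_end_index, tokens = [], [], []
    text_tmp = ''
    for token in sub_text:
        sub_tokens = tokenizer.tokenize(token) if token != ' ' else []
        text_tmp += token
        for sub_token in sub_tokens:
            tok_to_orig_start_index.append(len(text_tmp) - len(token))
            tok_to_orig_end_index.append(len(text_tmp) - 1)
            tokens.append(sub_token)
            if len(tokens) >= max_seq_length - 2:     # leave room for [CLS] and [SEP]
                return tokens, tok_to_orig_start_index, tok_to_orig_end_index
    return tokens, tok_to_orig_start_index, tok_to_orig_end_index

# Placeholder usage (any Chinese BERT vocab works here):
# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
# tokens, starts, ends = tokenize_with_offsets(['患者', '有', 'Burkitt', '淋巴瘤'], tokenizer)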