DeepIE/utils/extract_chinese_and_punct.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019 Baidu.com, Inc. All Rights Reserved
#
"""
requirements:
Authors: daisongtai(daisongtai@baidu.com)
Date: 2019/5/29 6:38 PM
"""
from __future__ import print_function
import re
from transformers import BertTokenizer
max_seq_length = 500
tokenizer = BertTokenizer.from_pretrained('transformer_cpt/bert', do_lower_case=True)
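# NOTE: 'transformer_cpt/bert' is a repo-local checkpoint directory expected
# to contain the BERT vocab; if it is unavailable, a public checkpoint such
# as 'bert-base-chinese' should work as a drop-in substitute.
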
LHan = [
    [0x2E80, 0x2E99],    # Han # So    [26] CJK RADICAL REPEAT, CJK RADICAL RAP
    [0x2E9B, 0x2EF3],    # Han # So    [89] CJK RADICAL CHOKE, CJK RADICAL C-SIMPLIFIED TURTLE
    [0x2F00, 0x2FD5],    # Han # So   [214] KANGXI RADICAL ONE, KANGXI RADICAL FLUTE
    0x3005,              # Han # Lm         IDEOGRAPHIC ITERATION MARK
    0x3007,              # Han # Nl         IDEOGRAPHIC NUMBER ZERO
    [0x3021, 0x3029],    # Han # Nl     [9] HANGZHOU NUMERAL ONE, HANGZHOU NUMERAL NINE
    [0x3038, 0x303A],    # Han # Nl     [3] HANGZHOU NUMERAL TEN, HANGZHOU NUMERAL THIRTY
    0x303B,              # Han # Lm         VERTICAL IDEOGRAPHIC ITERATION MARK
    [0x3400, 0x4DB5],    # Han # Lo  [6582] CJK UNIFIED IDEOGRAPH-3400, CJK UNIFIED IDEOGRAPH-4DB5
    [0x4E00, 0x9FC3],    # Han # Lo [20932] CJK UNIFIED IDEOGRAPH-4E00, CJK UNIFIED IDEOGRAPH-9FC3
    [0xF900, 0xFA2D],    # Han # Lo   [302] CJK COMPATIBILITY IDEOGRAPH-F900, CJK COMPATIBILITY IDEOGRAPH-FA2D
    [0xFA30, 0xFA6A],    # Han # Lo    [59] CJK COMPATIBILITY IDEOGRAPH-FA30, CJK COMPATIBILITY IDEOGRAPH-FA6A
    [0xFA70, 0xFAD9],    # Han # Lo   [106] CJK COMPATIBILITY IDEOGRAPH-FA70, CJK COMPATIBILITY IDEOGRAPH-FAD9
    [0x20000, 0x2A6D6],  # Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000, CJK UNIFIED IDEOGRAPH-2A6D6
    [0x2F800, 0x2FA1D],  # Han # Lo   [542] CJK COMPATIBILITY IDEOGRAPH-2F800, CJK COMPATIBILITY IDEOGRAPH-2FA1D
]
# Chinese punctuation and enumeration marks; each entry pairs a code point
# with its literal character.
CN_PUNCTS = [(0x3002, "。"), (0xFF1F, "？"), (0xFF01, "！"), (0xFF0C, "，"),
             (0x3001, "、"), (0xFF1B, "；"), (0xFF1A, "："), (0x300C, "「"),
             (0x300D, "」"), (0x300E, "『"), (0x300F, "』"), (0x2018, "‘"),
             (0x2019, "’"), (0x201C, "“"), (0x201D, "”"), (0xFF08, "（"),
             (0xFF09, "）"), (0x3014, "〔"), (0x3015, "〕"), (0x3010, "【"),
             (0x3011, "】"), (0x2014, "—"), (0x2026, "…"), (0x2013, "–"),
             (0xFF0E, "．"), (0x300A, "《"), (0x300B, "》"), (0x3008, "〈"),
             (0x2460, "①"), (0x2461, "②"), (0x2462, "③"), (0x2463, "④"),
             (0x2464, "⑤"), (0x2465, "⑥"), (0x2466, "⑦"), (0x2467, "⑧"),
             (0x2468, "⑨"), (0x2469, "⑩"), (0x3009, "〉"), (0x2015, "―"),
             (0xFF0D, "－"), (0x0020, " "), (0xFF5E, "～")]
# (0xFF5E, "～"),
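
# ASCII punctuation, matched as four contiguous ranges:
# !-/ (0x21-0x2F), :-@ (0x3A-0x40), [-` (0x5B-0x60), {-~ (0x7B-0x7E).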
EN_PUNCTS = [[0x0021, 0x002F], [0x003A, 0x0040], [0x005B, 0x0060],
             [0x007B, 0x007E]]


class ChineseAndPunctuationExtractor(object):
    def __init__(self):
        self.chinese_re = self.build_re()

    def is_chinese_or_punct(self, c):
        # True when the single character c falls inside the compiled
        # Han/punctuation character class.
        return bool(self.chinese_re.match(c))

    def build_re(self):
        L = []
        for i in LHan:
            if isinstance(i, list):
                f, t = i
                try:
                    L.append('%s-%s' % (chr(f), chr(t)))
                except ValueError:
                    # Narrow Python build: code points above 0xFFFF need
                    # surrogate pairs, so skip this range.
                    pass
            else:
                try:
                    L.append(chr(i))
                except ValueError:
                    pass

        for j, _ in CN_PUNCTS:
            try:
                L.append(chr(j))
            except ValueError:
                pass

        for f, t in EN_PUNCTS:
            # EN_PUNCTS ranges are all ASCII, so chr() cannot fail here.
            L.append('%s-%s' % (chr(f), chr(t)))

        RE = '[%s]' % ''.join(L)
        # print('RE:', RE.encode('utf-8'))
        return re.compile(RE, re.UNICODE)
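
# Minimal usage sketch (illustrative):
#   extractor = ChineseAndPunctuationExtractor()
#   extractor.is_chinese_or_punct('中')  # True:  CJK unified ideograph
#   extractor.is_chinese_or_punct('，')  # True:  listed in CN_PUNCTS
#   extractor.is_chinese_or_punct(',')   # True:  ASCII range in EN_PUNCTS
#   extractor.is_chinese_or_punct('a')   # False: plain Latin letter
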
if __name__ == '__main__':
    extractor = ChineseAndPunctuationExtractor()
    # for c in "韩邦庆（1856～1894），曾用名寄，字子云，别署太仙、大一山人、花也怜侬、三庆":
    #     if extractor.is_chinese_or_punct(c):
    #         print(c, 'yes')
    #     else:
    #         print(c, "no")
    #
    # print("～", extractor.is_chinese_or_punct("～"))
    # print("~", extractor.is_chinese_or_punct("~"))
    # print("―", extractor.is_chinese_or_punct("―"))
    # print("-", extractor.is_chinese_or_punct("-"))
    text_raw = "3抗甲状腺球蛋白及抗甲状腺微粒体抗体TGA与TPO在桥本甲状腺炎患者血清中高滴度TGA90%95%TPO检测也有相应诊断价值"

    sub_text = []
    buff = ""
    flag_en = False
    flag_digit = False
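    # Split text_raw into sub_text: every Chinese character or punctuation
    # mark becomes its own one-character segment, while consecutive digits
    # and consecutive other characters (e.g. Latin letters) are buffered
    # into multi-character segments.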
    for char in text_raw:
        if extractor.is_chinese_or_punct(char):
            if buff != "":
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
            flag_en = False
            flag_digit = False
        else:
            if re.compile(r'\d').match(char):
                if buff != "" and flag_en:
                    sub_text.append(buff)
                    buff = ""
                    flag_en = False
                flag_digit = True
                buff += char
            else:
                if buff != "" and flag_digit:
                    sub_text.append(buff)
                    buff = ""
                    flag_digit = False
                flag_en = True
                buff += char
    if buff != "":
        sub_text.append(buff)
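    # Tokenize each segment with the BERT wordpiece tokenizer and record, for
    # every wordpiece, its start/end character offsets in the original text,
    # truncating once max_seq_length - 2 tokens have been produced (leaving
    # room for the [CLS] and [SEP] special tokens).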
    tok_to_orig_start_index = []
    tok_to_orig_end_index = []
    tokens = []
    text_tmp = ''
    for (i, token) in enumerate(sub_text):
        sub_tokens = tokenizer.tokenize(token) if token != ' ' else []
        text_tmp += token
        for sub_token in sub_tokens:
            tok_to_orig_start_index.append(len(text_tmp) - len(token))
            tok_to_orig_end_index.append(len(text_tmp) - 1)
            tokens.append(sub_token)
            if len(tokens) >= max_seq_length - 2:
                break
        else:
            continue
        break

    print(sub_text)
    print(tokens)
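    # Illustrative output shape (exact wordpieces depend on the loaded vocab):
    # sub_text mixes single Chinese characters with grouped digit/letter runs,
    # e.g. ['3', '抗', ..., 'TGA', '与', 'TPO', ...]; tokens is the matching
    # (lower-cased) wordpiece sequence.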