DeepIE/utils/extract_chinese_and_punct.py
2020-04-19 20:57:15 +08:00

159 lines
5.2 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019 Baidu.com, Inc. All Rights Reserved
#
"""
requirements:
Authors: daisongtai(daisongtai@baidu.com)
Date: 2019/5/29 6:38 PM
"""
from __future__ import print_function
import sys
import re
import io
# Code points of the Han script, in ascending order.
#
# Entry shapes (build_re tells them apart with isinstance(..., list)):
#   [first, last]  -> an inclusive range of code points
#   int            -> a single code point
#
# Each entry is annotated with its Unicode general category, the number of
# code points covered (ranges only), and the first/last character names.
LHan = [
    # So [26]: CJK RADICAL REPEAT .. CJK RADICAL RAP
    [0x2E80, 0x2E99],
    # So [89]: CJK RADICAL CHOKE .. CJK RADICAL C-SIMPLIFIED TURTLE
    [0x2E9B, 0x2EF3],
    # So [214]: KANGXI RADICAL ONE .. KANGXI RADICAL FLUTE
    [0x2F00, 0x2FD5],
    # Lm: IDEOGRAPHIC ITERATION MARK
    0x3005,
    # Nl: IDEOGRAPHIC NUMBER ZERO
    0x3007,
    # Nl [9]: HANGZHOU NUMERAL ONE .. HANGZHOU NUMERAL NINE
    [0x3021, 0x3029],
    # Nl [3]: HANGZHOU NUMERAL TEN .. HANGZHOU NUMERAL THIRTY
    [0x3038, 0x303A],
    # Lm: VERTICAL IDEOGRAPHIC ITERATION MARK
    0x303B,
    # Lo [6582]: CJK UNIFIED IDEOGRAPH-3400 .. CJK UNIFIED IDEOGRAPH-4DB5
    [0x3400, 0x4DB5],
    # Lo [20932]: CJK UNIFIED IDEOGRAPH-4E00 .. CJK UNIFIED IDEOGRAPH-9FC3
    [0x4E00, 0x9FC3],
    # Lo [302]: CJK COMPATIBILITY IDEOGRAPH-F900 .. -FA2D
    [0xF900, 0xFA2D],
    # Lo [59]: CJK COMPATIBILITY IDEOGRAPH-FA30 .. -FA6A
    [0xFA30, 0xFA6A],
    # Lo [106]: CJK COMPATIBILITY IDEOGRAPH-FA70 .. -FAD9
    [0xFA70, 0xFAD9],
    # Lo [42711]: CJK UNIFIED IDEOGRAPH-20000 .. CJK UNIFIED IDEOGRAPH-2A6D6
    [0x20000, 0x2A6D6],
    # Lo [542]: CJK COMPATIBILITY IDEOGRAPH-2F800 .. -2FA1D
    [0x2F800, 0x2FA1D],
]
# Chinese / full-width punctuation (plus the ASCII space) as
# (code point, label) pairs.  build_re unpacks only the code point and
# ignores the label.
# NOTE(review): every label except the one for the space is an empty string
# in this copy -- presumably the literal character was intended; confirm
# against the upstream file before relying on the labels.
CN_PUNCTS = [
    (0x3002, ""),   # IDEOGRAPHIC FULL STOP
    (0xFF1F, ""),   # FULLWIDTH QUESTION MARK
    (0xFF01, ""),   # FULLWIDTH EXCLAMATION MARK
    (0xFF0C, ""),   # FULLWIDTH COMMA
    (0x3001, ""),   # IDEOGRAPHIC COMMA
    (0xFF1B, ""),   # FULLWIDTH SEMICOLON
    (0xFF1A, ""),   # FULLWIDTH COLON
    (0x300C, ""),   # LEFT CORNER BRACKET
    (0x300D, ""),   # RIGHT CORNER BRACKET
    (0x300E, ""),   # LEFT WHITE CORNER BRACKET
    (0x300F, ""),   # RIGHT WHITE CORNER BRACKET
    (0x2018, ""),   # LEFT SINGLE QUOTATION MARK
    (0x2019, ""),   # RIGHT SINGLE QUOTATION MARK
    (0x201C, ""),   # LEFT DOUBLE QUOTATION MARK
    (0x201D, ""),   # RIGHT DOUBLE QUOTATION MARK
    (0xFF08, ""),   # FULLWIDTH LEFT PARENTHESIS
    (0xFF09, ""),   # FULLWIDTH RIGHT PARENTHESIS
    (0x3014, ""),   # LEFT TORTOISE SHELL BRACKET
    (0x3015, ""),   # RIGHT TORTOISE SHELL BRACKET
    (0x3010, ""),   # LEFT BLACK LENTICULAR BRACKET
    (0x3011, ""),   # RIGHT BLACK LENTICULAR BRACKET
    (0x2014, ""),   # EM DASH
    (0x2026, ""),   # HORIZONTAL ELLIPSIS
    (0x2013, ""),   # EN DASH
    (0xFF0E, ""),   # FULLWIDTH FULL STOP
    (0x300A, ""),   # LEFT DOUBLE ANGLE BRACKET
    (0x300B, ""),   # RIGHT DOUBLE ANGLE BRACKET
    (0x3008, ""),   # LEFT ANGLE BRACKET
    (0x3009, ""),   # RIGHT ANGLE BRACKET
    (0x2015, ""),   # HORIZONTAL BAR
    (0xFF0D, ""),   # FULLWIDTH HYPHEN-MINUS
    (0x0020, " "),  # SPACE
    (0xFF5E, ""),   # FULLWIDTH TILDE
]
#(0xFF5E, ""),
# ASCII punctuation as inclusive [first, last] code-point ranges; together
# they cover every printable ASCII character that is not a letter, digit
# or the space.
EN_PUNCTS = [
    [0x21, 0x2F],  # '!' .. '/'
    [0x3A, 0x40],  # ':' .. '@'
    [0x5B, 0x60],  # '[' .. '`'
    [0x7B, 0x7E],  # '{' .. '~'
]
class ChineseAndPunctuationExtractor(object):
    """Recognise Han characters and Chinese/ASCII punctuation.

    One regular-expression character class is assembled from the
    module-level tables ``LHan``, ``CN_PUNCTS`` and ``EN_PUNCTS`` when the
    instance is created; every lookup reuses that compiled pattern.
    """

    def __init__(self):
        self.chinese_re = self.build_re()

    def is_chinese_or_punct(self, c):
        """Tell whether ``c`` begins with a character from the tables.

        The pattern is applied with ``match``, which anchors at the start
        of the string, so only the first character of ``c`` is examined.
        An empty string yields ``False``.

        Args:
            c: The text to test, normally a single character.

        Returns:
            bool: ``True`` on a match, ``False`` otherwise.
        """
        return self.chinese_re.match(c) is not None

    @staticmethod
    def _literal(code_point):
        """Return ``code_point`` as an escaped character-class member."""
        return re.escape(chr(code_point))

    @staticmethod
    def _char_range(first, last):
        """Return the escaped, inclusive ``first-last`` character-class range."""
        return '%s-%s' % (re.escape(chr(first)), re.escape(chr(last)))

    def build_re(self):
        """Compile the character class that covers every table entry.

        Each member is passed through ``re.escape`` so that a table entry
        which is special inside ``[...]`` (``]``, ``\\``, ``^``, ``-``)
        cannot change the meaning of the pattern.

        Returns:
            The compiled pattern ``[...]`` (flag ``re.UNICODE``).

        Raises:
            ValueError: If an ``EN_PUNCTS`` range cannot be converted to
                characters.
        """
        members = []

        for entry in LHan:
            try:
                if isinstance(entry, list):
                    first, last = entry
                    members.append(self._char_range(first, last))
                else:
                    members.append(self._literal(entry))
            except (ValueError, OverflowError):
                # Best effort: a narrow Python build cannot represent code
                # points above 0xFFFF without surrogate pairs, so such
                # entries are skipped.
                continue

        for code_point, _label in CN_PUNCTS:
            try:
                members.append(self._literal(code_point))
            except (ValueError, OverflowError):
                continue

        for first, last in EN_PUNCTS:
            try:
                members.append(self._char_range(first, last))
            except (TypeError, ValueError, OverflowError):
                # ASCII ranges are always representable, so a failure here
                # means the table itself is broken: report it loudly.
                raise ValueError(
                    'invalid EN_PUNCTS range: %r' % ([first, last],))

        return re.compile('[%s]' % ''.join(members), re.UNICODE)
if __name__ == '__main__':
    # Demo: split a raw string into tokens.
    #   * every Han/punctuation character becomes a token of its own;
    #   * the remaining characters are grouped into maximal runs that are
    #     either all digits or all non-digits.
    extractor = ChineseAndPunctuationExtractor()
    # Raw string avoids the invalid "\d" escape; compiled once, outside the
    # per-character loop.
    digit_re = re.compile(r'\d')

    text_raw = "'ymx510335'"
    sub_text = []
    buff = ""
    # Kind of the run currently held in ``buff``; only meaningful while
    # ``buff`` is non-empty.
    buff_is_digit = False
    for char in text_raw:
        if extractor.is_chinese_or_punct(char):
            # Flush the pending run, then emit the character by itself.
            if buff:
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
            continue

        char_is_digit = digit_re.match(char) is not None
        # A switch between digit and non-digit closes the current run.
        if buff and char_is_digit != buff_is_digit:
            sub_text.append(buff)
            buff = ""
        buff_is_digit = char_is_digit
        buff += char

    # Emit whatever is left once the input is exhausted.
    if buff:
        sub_text.append(buff)
    print(sub_text)