Match internal links in external links
See attardi/wikiextractor/issues/#55
This commit is contained in:
parent
6af9c283eb
commit
7a5b5e5765
@ -2035,8 +2035,10 @@ wgUrlProtocols = [
|
||||
# \p{Zs} is the Unicode 'separator, space' category. It covers the space 0x20
|
||||
# as well as U+3000 (IDEOGRAPHIC SPACE), for bug 19052
|
||||
EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
|
||||
ANCHOR_CLASS = r'[^][\x00-\x08\x0a-\x1F]'
|
||||
ExtLinkBracketedRegex = re.compile(
|
||||
'\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
|
||||
'\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)' +
|
||||
r'\s*((?:' + ANCHOR_CLASS + r'|\[\[' + ANCHOR_CLASS + r'+\]\])' + r'*?)\]',
|
||||
re.S | re.U)
|
||||
EXT_IMAGE_REGEX = re.compile(
|
||||
r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
|
||||
|
Loading…
Reference in New Issue
Block a user