Match internal links in external links
See attardi/wikiextractor issue #55.
This commit is contained in:
parent
6af9c283eb
commit
7a5b5e5765
@ -2035,8 +2035,10 @@ wgUrlProtocols = [
|
|||||||
# \p{Zs} is the Unicode 'separator, space' category. It covers the space 0x20
|
# \p{Zs} is the Unicode 'separator, space' category. It covers the space 0x20
|
||||||
# as well as U+3000 (IDEOGRAPHIC SPACE), for bug 19052
|
# as well as U+3000 (IDEOGRAPHIC SPACE), for bug 19052
|
||||||
EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
|
EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
|
||||||
|
ANCHOR_CLASS = r'[^][\x00-\x08\x0a-\x1F]'
|
||||||
ExtLinkBracketedRegex = re.compile(
|
ExtLinkBracketedRegex = re.compile(
|
||||||
'\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
|
'\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)' +
|
||||||
|
r'\s*((?:' + ANCHOR_CLASS + r'|\[\[' + ANCHOR_CLASS + r'+\]\])' + r'*?)\]',
|
||||||
re.S | re.U)
|
re.S | re.U)
|
||||||
EXT_IMAGE_REGEX = re.compile(
|
EXT_IMAGE_REGEX = re.compile(
|
||||||
r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
|
r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
|
||||||
|
Loading…
Reference in New Issue
Block a user