See ChangeLog.

This commit is contained in:
Giuseppe Attardi 2015-04-22 12:42:42 +02:00
parent 4e07a6c149
commit 363dd47666
2 changed files with 402 additions and 59 deletions

View File

@ -1,3 +1,9 @@
2015-04-22 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (replaceInternalLinks): function for replacing
internal links, modeled after MediaWiki original.
(replaceExternalLinks): revised taking into account the former.
2015-04-20 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (findBalanced): dropped optional arguments.

View File

@ -2,9 +2,9 @@
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 2.26 (Apr 20, 2015)
# Version: 2.27 (Apr 21, 2015)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
# Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
# Antonio Fuschetto (fuschett@aol.com), University of Pisa
#
# Contributors:
# Leonardo Souza (lsouza@amtera.com.br)
@ -47,21 +47,20 @@ collecting template definitions.
"""
import sys, os.path
import re, random
import argparse
import re # TODO use regex when it will be standard
import argparse, random
from itertools import izip, izip_longest
import logging, traceback
import urllib
import bz2
import codecs
from htmlentitydefs import name2codepoint
import urllib
import Queue, threading, multiprocessing
#===========================================================================
# Program version
version = '2.26'
version = '2.27'
### PARAMS ####################################################################
@ -302,7 +301,7 @@ class Template(list):
return ''.join([tpl.subst(params, extractor, depth) for tpl in self])
def __str__(self):
return ''.join([str(x) for x in self])
return ''.join([unicode(x) for x in self])
class TemplateText(unicode):
"""Fixed text of template"""
@ -1398,21 +1397,317 @@ def dropSpans(spans, text):
res += text[offset:]
return res
# Match interwiki links, | separates parameters.
# First parameter is displayed, also trailing concatenated text included
# in display, e.g. 's' for plural).
#
# ----------------------------------------------------------------------
# WikiLinks
# See https://www.mediawiki.org/wiki/Help:Links#Internal_links
# Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc.
# We first expand inner ones, than remove enclosing ones.
# Deal also with: [[Help:IPA for Catalan|[anˈdɔra]]]
# Also: [[Help:IPA for Catalan|[andora]]]
parametrizedLink = re.compile(r'\[\[[^\]]*?]]')
def replaceInternalLinks(text):
"""
Replaces external links of the form:
[[title |...|label]]trail
# Function applied to wikiLinks
def make_anchor_tag(link, trail):
braq = 2 if link[1] == '[' else 1
pipe = link.find('|', braq)
title = link[braq:-braq] if pipe < 0 else link[braq:pipe]
with title concatenated with trail, when present, e.g. 's' for plural.
"""
# call this after removal of external links, so we need not worry about
# triple closing ]]].
cur = 0
res = ''
for s,e in findBalanced(text, ['[['], [']]']):
m = tailRE.match(text, e)
if m:
trail = m.group(0)
end = m.end()
else:
trail = ''
end = e
inner = text[s+2:e-2]
# find first |
pipe = inner.find('|')
if pipe < 0:
title = inner
label = title
else:
title = inner[:pipe].rstrip()
# find last |
curp = pipe+1
for s,e in findBalanced(inner, ['[['], [']]']):
last = inner.rfind('|', curp, s)
if last >= 0:
pipe = last # advance
curp = e
label = inner[pipe+1:].strip()
res += text[cur:s] + makeInternalLink(title, label) + trail
cur = end
return res + text[cur:]
# the official version is a method in class Parser, similar to this:
# def replaceInternalLinks2(text):
# global wgExtraInterlanguageLinkPrefixes
# # the % is needed to support urlencoded titles as well
# tc = Title::legalChars() + '#%'
# # Match a link having the form [[namespace:link|alternate]]trail
# e1 = re.compile("([%s]+)(?:\\|(.+?))?]](.*)" % tc, re.S | re.D)
# # Match cases where there is no "]]", which might still be images
# e1_img = re.compile("([%s]+)\\|(.*)" % tc, re.S | re.D)
# holders = LinkHolderArray(self)
# # split the entire text string on occurrences of [[
# iterBrackets = re.compile('[[').finditer(text)
# m in iterBrackets.next()
# # get the first element (all text up to first [[)
# s = text[:m.start()]
# cur = m.end()
# line = s
# useLinkPrefixExtension = self.getTargetLanguage().linkPrefixExtension()
# e2 = None
# if useLinkPrefixExtension:
# # Match the end of a line for a word that is not followed by whitespace,
# # e.g. in the case of "The Arab al[[Razi]]", "al" will be matched
# global wgContLang
# charset = wgContLang.linkPrefixCharset()
# e2 = re.compile("((?>.*[^charset]|))(.+)", re.S | re.D | re.U)
# if self.mTitle is None:
# raise MWException(__METHOD__ + ": \self.mTitle is null\n")
# nottalk = not self.mTitle.isTalkPage()
# if useLinkPrefixExtension:
# m = e2.match(s)
# if m:
# first_prefix = m.group(2)
# else:
# first_prefix = false
# else:
# prefix = ''
# useSubpages = self.areSubpagesAllowed()
# for m in iterBrackets:
# line = text[cur:m.start()]
# cur = m.end()
# # TODO: Check for excessive memory usage
# if useLinkPrefixExtension:
# m = e2.match(e2)
# if m:
# prefix = m.group(2)
# s = m.group(1)
# else:
# prefix = ''
# # first link
# if first_prefix:
# prefix = first_prefix
# first_prefix = False
# might_be_img = False
# m = e1.match(line)
# if m: # page with normal label or alt
# label = m.group(2)
# # If we get a ] at the beginning of m.group(3) that means we have a link that is something like:
# # [[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row fucks up,
# # the real problem is with the e1 regex
# # See bug 1300.
# #
# # Still some problems for cases where the ] is meant to be outside punctuation,
# # and no image is in sight. See bug 2095.
# #
# if label and m.group(3)[0] == ']' and '[' in label:
# label += ']' # so that replaceExternalLinks(label) works later
# m.group(3) = m.group(3)[1:]
# # fix up urlencoded title texts
# if '%' in m.group(1):
# # Should anchors '#' also be rejected?
# m.group(1) = str_replace(array('<', '>'), array('&lt', '&gt'), rawurldecode(m.group(1)))
# trail = m.group(3)
# else:
# m = e1_img.match(line):
# if m:
# # Invalid, but might be an image with a link in its caption
# might_be_img = true
# label = m.group(2)
# if '%' in m.group(1):
# m.group(1) = rawurldecode(m.group(1))
# trail = ""
# else: # Invalid form; output directly
# s += prefix + '[[' + line
# continue
# origLink = m.group(1)
# # Dont allow internal links to pages containing
# # PROTO: where PROTO is a valid URL protocol these
# # should be external links.
# if (preg_match('/^(?i:' + self.mUrlProtocols + ')/', origLink)) {
# s += prefix + '[[' + line
# continue
# }
# # Make subpage if necessary
# if useSubpages:
# link = self.maybeDoSubpageLink(origLink, label)
# else:
# link = origLink
# noforce = origLink[0] != ':'
# if not noforce:
# # Strip off leading ':'
# link = link[1:]
# nt = Title::newFromText(self.mStripState.unstripNoWiki(link))
# if nt is None:
# s += prefix + '[[' + line
# continue
# ns = nt.getNamespace()
# iw = nt.getInterwiki()
# if might_be_img { # if this is actually an invalid link
# if (ns == NS_FILE and noforce) { # but might be an image
# found = False
# while True:
# # look at the next 'line' to see if we can close it there
# next_line = iterBrakets.next()
# if not next_line:
# break
# m = explode(']]', next_line, 3)
# if m.lastindex == 3:
# # the first ]] closes the inner link, the second the image
# found = True
# label += "[[%s]]%s" % (m.group(0), m.group(1))
# trail = m.group(2)
# break
# elif m.lastindex == 2:
# # if there is exactly one ]] that is fine, we will keep looking
# label += "[[{m[0]}]]{m.group(1)}"
# else:
# # if next_line is invalid too, we need look no further
# label += '[[' + next_line
# break
# if not found:
# # we couldnt find the end of this imageLink, so output it raw
# # but dont ignore what might be perfectly normal links in the text we ve examined
# holders.merge(self.replaceInternalLinks2(label))
# s += "{prefix}[[%s|%s" % (link, text)
# # note: no trail, because without an end, there *is* no trail
# continue
# } else: # it is not an image, so output it raw
# s += "{prefix}[[%s|%s" % (link, text)
# # note: no trail, because without an end, there *is* no trail
# continue
# }
# wasblank = (text == '')
# if wasblank:
# text = link
# else:
# # Bug 4598 madness. Handle the quotes only if they come from the alternate part
# # [[Lista d''e paise d''o munno]] . <a href="...">Lista d''e paise d''o munno</a>
# # [[Criticism of Harry Potter|Criticism of ''Harry Potter'']]
# # . <a href="Criticism of Harry Potter">Criticism of <i>Harry Potter</i></a>
# text = self.doQuotes(text)
# # Link not escaped by : , create the various objects
# if noforce and not nt.wasLocalInterwiki():
# # Interwikis
# if iw and mOptions.getInterwikiMagic() and nottalk and (
# Language::fetchLanguageName(iw, None, 'mw') or
# in_array(iw, wgExtraInterlanguageLinkPrefixes)):
# # Bug 24502: filter duplicates
# if iw not in mLangLinkLanguages:
# self.mLangLinkLanguages[iw] = True
# self.mOutput.addLanguageLink(nt.getFullText())
# s = rstrip(s + prefix)
# s += strip(trail, "\n") == '' ? '': prefix + trail
# continue
# if ns == NS_FILE:
# if not wfIsBadImage(nt.getDBkey(), self.mTitle):
# if wasblank:
# # if no parameters were passed, text
# # becomes something like "File:Foo.png",
# # which we dont want to pass on to the
# # image generator
# text = ''
# else:
# # recursively parse links inside the image caption
# # actually, this will parse them in any other parameters, too,
# # but it might be hard to fix that, and it doesnt matter ATM
# text = self.replaceExternalLinks(text)
# holders.merge(self.replaceInternalLinks2(text))
# # cloak any absolute URLs inside the image markup, so replaceExternalLinks() wont touch them
# s += prefix + self.armorLinks(
# self.makeImage(nt, text, holders)) + trail
# else:
# s += prefix + trail
# continue
# if ns == NS_CATEGORY:
# s = rstrip(s + "\n") # bug 87
# if wasblank:
# sortkey = self.getDefaultSort()
# else:
# sortkey = text
# sortkey = Sanitizer::decodeCharReferences(sortkey)
# sortkey = str_replace("\n", '', sortkey)
# sortkey = self.getConverterLanguage().convertCategoryKey(sortkey)
# self.mOutput.addCategory(nt.getDBkey(), sortkey)
# s += strip(prefix + trail, "\n") == '' ? '' : prefix + trail
# continue
# }
# }
# # Self-link checking. For some languages, variants of the title are checked in
# # LinkHolderArray::doVariants() to allow batching the existence checks necessary
# # for linking to a different variant.
# if ns != NS_SPECIAL and nt.equals(self.mTitle) and !nt.hasFragment():
# s += prefix + Linker::makeSelfLinkObj(nt, text, '', trail)
# continue
# # NS_MEDIA is a pseudo-namespace for linking directly to a file
# # @todo FIXME: Should do batch file existence checks, see comment below
# if ns == NS_MEDIA:
# # Give extensions a chance to select the file revision for us
# options = []
# descQuery = False
# Hooks::run('BeforeParserFetchFileAndTitle',
# [this, nt, &options, &descQuery])
# # Fetch and register the file (file title may be different via hooks)
# file, nt = self.fetchFileAndTitle(nt, options)
# # Cloak with NOPARSE to avoid replacement in replaceExternalLinks
# s += prefix + self.armorLinks(
# Linker::makeMediaLinkFile(nt, file, text)) + trail
# continue
# # Some titles, such as valid special pages or files in foreign repos, should
# # be shown as bluelinks even though they are not included in the page table
# #
# # @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do
# # batch file existence checks for NS_FILE and NS_MEDIA
# if iw == '' and nt.isAlwaysKnown():
# self.mOutput.addLink(nt)
# s += self.makeKnownLinkHolder(nt, text, array(), trail, prefix)
# else:
# # Links will be added to the output link list after checking
# s += holders.makeHolder(nt, text, array(), trail, prefix)
# }
# return holders
def makeInternalLink(title, label):
colon = title.find(':')
if colon > 0 and title[:colon] not in acceptedNamespaces:
return ''
@ -1421,23 +1716,90 @@ def make_anchor_tag(link, trail):
colon2 = title.find(':', colon+1)
if colon2 > 1 and title[colon+1:colon2] not in acceptedNamespaces:
return ''
# find anchor
if pipe < 0:
anchor = title
if Extractor.keepLinks:
return '<a href="%s">%s</a>' % (urllib.quote(title.encode('utf-8')), anchor)
else:
# find last |
cur = pipe
for s,e in findBalanced(link[pipe:-braq], ['[[', '['], [']]', ']']):
pipe = link.find('|', cur, s)
if pipe < 0:
pipe = cur
anchor = link[pipe+1:-braq]
anchor += trail
return label
# ----------------------------------------------------------------------
# External links
# from: https://doc.wikimedia.org/mediawiki-core/master/php/DefaultSettings_8php_source.html
wgUrlProtocols = [
'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://',
'https://', 'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:',
'nntp://', 'redis://', 'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://',
'svn://', 'tel:', 'telnet://', 'urn:', 'worldwind://', 'xmpp:', '//'
]
# from: https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html
# Constants needed for external link processing
# Everything except bracket, space, or control characters
# \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
# as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
ExtLinkBracketedRegex = re.compile('\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]', re.S | re.U)
EXT_IMAGE_REGEX = re.compile(
r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
/([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""",
re.X | re.S | re.U)
def replaceExternalLinks(text):
s = ''
cur = 0
for m in ExtLinkBracketedRegex.finditer(text):
s += text[cur:m.start()]
cur = m.end()
url = m.group(1)
label = m.group(3)
# # The characters '<' and '>' (which were escaped by
# # removeHTMLtags()) should not be included in
# # URLs, per RFC 2396.
# m2 = re.search('&(lt|gt);', url)
# if m2:
# link = url[m2.end():] + ' ' + link
# url = url[0:m2.end()]
# If the link text is an image URL, replace it with an <img> tag
# This happened by accident in the original parser, but some people used it extensively
m = EXT_IMAGE_REGEX.match(label)
if m:
label = makeExternalImage(label)
# Use the encoded URL
# This means that users can paste URLs directly into the text
# Funny characters like ö aren't valid in URLs anyway
# This was changed in August 2004
s += makeExternalLink(url, label) #+ trail
return s + text[cur:]
# Function applied to wikiLinks
def makeExternalLink(title, anchor):
colon = title.find(':')
if colon > 0 and title[:colon] not in acceptedNamespaces:
return ''
if colon == 0:
# drop also :File:
colon2 = title.find(':', colon+1)
if colon2 > 1 and title[colon+1:colon2] not in acceptedNamespaces:
return ''
if Extractor.keepLinks:
return '<a href="%s">%s</a>' % (urllib.quote(title.encode('utf-8')), anchor)
else:
return anchor
def makeExternalImage(url, alt=''):
if Extractor.keepLinks:
return '<img src="%s" alt="%s">' % (url, alt)
else:
return alt
# ----------------------------------------------------------------------
# match tail after wikilink
@ -1464,36 +1826,11 @@ def clean(extractor, text):
# Drop tables
text = dropNested(text, r'{\|', r'\|}')
# Expand links
res = ''
cur = 0
# This is too slow.
# for m in wikiLink.finditer(text):
# res += text[cur:m.start()] + make_anchor_tag(m)
# cur = m.end()
# text = res + text[cur:]
# Matches also: [[Help:IPA for Spanish|[a'ðoβe]]]
# May be nested:
# [http://www.perseus.tufts.edu .&lt;/ref&gt;&lt;ref&gt;[http://www.merriam-webster.com], [[Merriam-Webster]].&lt;/ref&gt;&lt;ref&gt;
# replace external links
text = replaceExternalLinks(text)
for s,e in findBalanced(text, ['[[', '['], [']]', ']']):
m = tailRE.match(text, e)
if m:
trail = m.group(0)
end = m.end()
else:
trail = ''
end = e
res += text[cur:s] + make_anchor_tag(text[s:e], trail)
cur = end
text = res + text[cur:]
# Drop all remaining ones
text = parametrizedLink.sub('', text)
# Handle external links
text = externalLink.sub(r'\1', text)
text = externalLinkNoAnchor.sub('', text)
# replace internal links
text = replaceInternalLinks(text)
# drop MagicWords behavioral switches
text = magicWordsRE.sub('', text)