See ChangeLog.

This commit is contained in:
Giuseppe Attardi 2015-04-15 03:43:02 +02:00
parent f4e416ba3b
commit d6491e0763
2 changed files with 30 additions and 20 deletions

View File

@ -2,6 +2,8 @@
* WikiExtractor.py (expandTemplates): increase depth only when
calling expandTemplate()
(define_template): removed \n in onlyincludeAccumulator, drop
<noinclude> always.
2015-04-14 Giuseppe Attardi <attardi@di.unipi.it>

View File

@ -479,6 +479,10 @@ def findMatchingBraces(text, ldelim):
# {{{{ }}}} -> { {{{ }}} }
# {{{{{ }}}}} -> {{ {{{ }}} }}
# {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|...}}
# Handle:
# {{{{{|safesubst:}}}#Invoke:String|replace|{{{1|{{{{{|safesubst:}}}PAGENAME}}}}}|%s+%([^%(]-%)$||plain=false}}
reOpen = re.compile('{' * ldelim) # inner
reClose = re.compile('([{]{2,})|(}{2,})') # at least 2
cur = 0
@ -497,26 +501,26 @@ def findMatchingBraces(text, ldelim):
if m2.lastindex == 1:
npar += len(m2.group(1))
else:
clen = len(m2.group(2))
if clen < npar:
npar -= clen
if npar == 1 and clen == ldelim and text[m1.start()+ldelim]=='{':
close = len(m2.group(2))
if close < npar:
npar -= close
if npar < ldelim and close == ldelim and all([text[i]=='{' for i in range(m1.start(),m1.start()+npar)]):
# spurious {
yield m1.start()+1, end
yield m1.start()+npar, end
cur = end
break
else:
# resolve ambiguities
if ldelim == 3:
if clen > 3 and all([text[i]=='{' for i in range(m1.start()+3,m1.start()+clen)]):
yield m1.start()+clen-3, end-clen+3
if close > 3 and all([text[i]=='{' for i in range(m1.start()+3,m1.start()+close)]):
yield m1.start()+close-3, end-close+3
elif text[m1.start()+3] == '{' and text[m1.start()+4] != '{':
# spurious {
yield m1.start()+1, end
else: # clen >= npar
yield m1.start(), end-clen+npar
else: # close >= npar
yield m1.start(), end-close+npar
else: # ldelim == 2
yield m1.start(), end-clen+npar
yield m1.start(), end-close+npar
cur = end
break
@ -732,7 +736,7 @@ def expandTemplate(body, depth):
parts = splitParameters(body)
# title is the portion before the first |
#logging.debug('TITLE ' + str(depth) + ' ' + parts[0].strip())
logging.debug('TITLE ' + parts[0].strip())
title = expandTemplates(parts[0].strip(), depth)
# SUBST
@ -844,6 +848,7 @@ def substParameters(body, params, depth, subst_depth=0):
# {{{italics|{{{italic|}}}
# {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|
#
for s,e in findMatchingBraces(body, 3):
# invoke substParameter on outer {{{}}}
result += body[start:s] + substParameter(body[s+3:e-3],
@ -1183,18 +1188,17 @@ def define_template(title, page):
# remove comments
text = comment.sub('', text)
# eliminate <noinclude> fragments
text = reNoinclude.sub('', text)
# eliminate unterminated <noinclude> elements
text = re.sub(r'<noinclude\s*>.*$', '', text, flags=re.DOTALL)
onlyincludeAccumulator = ''
for m in re.finditer('<onlyinclude>(.*?)</onlyinclude>', text, re.DOTALL):
onlyincludeAccumulator += m.group(1) + "\n"
onlyincludeAccumulator += m.group(1)
if onlyincludeAccumulator:
text = onlyincludeAccumulator
else:
# If there are no <onlyinclude> fragments, simply eliminate
# <noinclude> fragments and keep <includeonly> ones.
text = reNoinclude.sub('', text)
# eliminate unterminated <noinclude> elements
text = re.sub(r'<noinclude\s*>.*$', '', text, flags=re.DOTALL)
text = reIncludeonly.sub('', text)
if text:
@ -1283,11 +1287,13 @@ def dropSpans(spans, text):
parametrizedLink = re.compile(r'\[\[[^\]]*?]]')
wikiLink = re.compile(r'\[\[([^|]*)(?:\|([^|]*?))*?]]')
wikiLink = re.compile(r'\[\[([^|]*)(?:\|([^|]*))*]]')
# Function applied to wikiLinks
def make_anchor_tag(link, trail):
match = wikiLink.match(link)
if not match: # single []
return link+trail
link = match.group(1)
colon = link.find(':')
if colon > 0 and link[:colon] not in acceptedNamespaces:
@ -1334,7 +1340,8 @@ def clean(text):
# res += text[cur:m.start()] + make_anchor_tag(m)
# cur = m.end()
# text = res + text[cur:]
for s,e in findBalanced(text, ['[['], [']]']):
# Matches also: [[Help:IPA for Spanish|[a'ðoβe]]]
for s,e in findBalanced(text, ['[[', '['], [']]', ']']):
m = tailRE.match(text, e)
if m:
trail = m.group(0)
@ -1469,6 +1476,7 @@ def compact(text):
page.append("<li>%s</li>" % line[1:])
else:
continue
# Drop residuals of lists
elif line[0] in '{|' or line[-1] in '}':
continue