Restored template expansion and HTML preserving flag.
This commit is contained in:
parent
1053fe2030
commit
05b5cc7e51
@ -79,7 +79,6 @@ knownNamespaces = set(['Template'])
|
||||
# The namespace used for template definitions
|
||||
# It is the name associated with namespace key=10 in the siteinfo header.
|
||||
templateNamespace = ''
|
||||
templatePrefix = ''
|
||||
|
||||
##
|
||||
# The namespace used for module definitions
|
||||
@ -196,8 +195,7 @@ def load_templates(file, output_file=None):
|
||||
Load templates from :param file:.
|
||||
:param output_file: file where to save templates and modules.
|
||||
"""
|
||||
global templateNamespace, templatePrefix
|
||||
templatePrefix = templateNamespace + ':'
|
||||
global templateNamespace
|
||||
global moduleNamespace, modulePrefix
|
||||
modulePrefix = moduleNamespace + ':'
|
||||
articles = 0
|
||||
@ -220,6 +218,13 @@ def load_templates(file, output_file=None):
|
||||
page = []
|
||||
elif tag == 'title':
|
||||
title = m.group(3)
|
||||
if not output_file and not templateNamespace: # do not know it yet
|
||||
# we reconstruct it from the first title
|
||||
colon = title.find(':')
|
||||
if colon > 1:
|
||||
templateNamespace = title[:colon]
|
||||
Extractor.templatePrefix = title[:colon + 1]
|
||||
# FIXME: should reconstruct also moduleNamespace
|
||||
elif tag == 'text':
|
||||
inText = True
|
||||
line = line[m.start(3):m.end(3)]
|
||||
@ -233,18 +238,11 @@ def load_templates(file, output_file=None):
|
||||
elif inText:
|
||||
page.append(line)
|
||||
elif tag == '/page':
|
||||
if not output_file and not templateNamespace: # do not know it yet
|
||||
# we reconstruct it from the first title
|
||||
colon = title.find(':')
|
||||
if colon > 1:
|
||||
templateNamespace = title[:colon]
|
||||
templatePrefix = title[:colon + 1]
|
||||
# FIXME: should reconstruct also moduleNamespace
|
||||
if title.startswith(templatePrefix):
|
||||
if title.startswith(Extractor.templatePrefix):
|
||||
define_template(title, page)
|
||||
templates += 1
|
||||
# save templates and modules to file
|
||||
if output_file and (title.startswith(templatePrefix) or
|
||||
if output_file and (title.startswith(Extractor.templatePrefix) or
|
||||
title.startswith(modulePrefix)):
|
||||
output.write('<page>\n')
|
||||
output.write(' <title>%s</title>\n' % title)
|
||||
@ -279,6 +277,63 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
|
||||
return open(filename, mode, encoding=encoding)
|
||||
|
||||
|
||||
def collect_pages(text):
    """
    Scan a wikipedia dump and generate the pages that should be extracted.

    :param text: the text of a wikipedia file dump, as an iterable of lines.
    :return: a generator of tuples (id, revid, title, page), where page is
        the list of lines collected from the <text> element of the article.

    A page is generated only when its title has no namespace or a namespace
    listed in acceptedNamespaces, its id differs from the id of the last
    generated page, it is not a redirect and it is not a template page.

    Relies on the module-level tagRE, acceptedNamespaces and templateNamespace.
    """
    # we collect individual lines, since str.join() is significantly faster
    # than concatenation
    page = []
    page_id = ''
    revid = ''
    last_id = ''
    title = ''  # defined up front: a <page> without <title> must not fail
    in_text = False
    redirect = False
    for line in text:
        if '<' not in line:  # faster than doing re.search()
            if in_text:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not page_id:
            page_id = m.group(3)
        elif tag == 'id' and page_id:  # <revision> <id></id> </revision>
            # NOTE(review): any later <id> of the page also lands here and
            # overwrites revid -- confirm this is the intended value.
            revid = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'redirect':
            redirect = True
        elif tag == 'text':
            in_text = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4:  # open-close
                in_text = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            in_text = False
        elif in_text:
            page.append(line)
        elif tag == '/page':
            colon = title.find(':')
            # The namespace test is parenthesized on its own: 'and' binds
            # tighter than 'or', so without the parentheses a title lacking a
            # namespace would skip the duplicate, redirect and template tests.
            in_accepted_namespace = (colon < 0 or
                                     title[:colon] in acceptedNamespaces)
            # str.startswith('') is always True: an empty (not yet known)
            # templateNamespace must not classify every page as a template.
            is_template = (bool(templateNamespace) and
                           title.startswith(templateNamespace))
            if (in_accepted_namespace and page_id != last_id and
                    not redirect and not is_template):
                yield (page_id, revid, title, page)
                last_id = page_id
            # reset the per-page state, whether the page was generated or not
            page_id = ''
            revid = ''
            title = ''
            page = []
            in_text = False
            redirect = False
|
||||
|
||||
|
||||
def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
process_count, html_safe):
|
||||
"""
|
||||
@ -290,7 +345,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
:param process_count: number of extraction processes to spawn.
|
||||
"""
|
||||
global knownNamespaces
|
||||
global templateNamespace, templatePrefix
|
||||
global templateNamespace
|
||||
global moduleNamespace, modulePrefix
|
||||
|
||||
urlbase = '' # This is obtained from <siteinfo>
|
||||
@ -313,7 +368,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
knownNamespaces.add(m.group(3))
|
||||
if re.search('key="10"', line):
|
||||
templateNamespace = m.group(3)
|
||||
templatePrefix = templateNamespace + ':'
|
||||
Extractor.templatePrefix = templateNamespace + ':'
|
||||
elif re.search('key="828"', line):
|
||||
moduleNamespace = m.group(3)
|
||||
modulePrefix = moduleNamespace + ':'
|
||||
@ -383,56 +438,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
|
||||
# we collect individual lines, since str.join() is significantly faster
|
||||
# than concatenation
|
||||
page = []
|
||||
id = ''
|
||||
revid = ''
|
||||
last_id = ''
|
||||
|
||||
ordinal = 0 # page count
|
||||
inText = False
|
||||
redirect = False
|
||||
for line in input:
|
||||
if '<' not in line: # faster than doing re.search()
|
||||
if inText:
|
||||
page.append(line)
|
||||
continue
|
||||
m = tagRE.search(line)
|
||||
if not m:
|
||||
continue
|
||||
tag = m.group(2)
|
||||
if tag == 'page':
|
||||
page = []
|
||||
redirect = False
|
||||
elif tag == 'id' and not id:
|
||||
id = m.group(3)
|
||||
elif tag == 'id' and id: # <revision> <id></id> </revision>
|
||||
revid = m.group(3)
|
||||
elif tag == 'title':
|
||||
title = m.group(3)
|
||||
elif tag == 'redirect':
|
||||
redirect = True
|
||||
elif tag == 'text':
|
||||
inText = True
|
||||
line = line[m.start(3):m.end(3)]
|
||||
page.append(line)
|
||||
if m.lastindex == 4: # open-close
|
||||
inText = False
|
||||
elif tag == '/text':
|
||||
if m.group(1):
|
||||
page.append(m.group(1))
|
||||
inText = False
|
||||
elif inText:
|
||||
page.append(line)
|
||||
elif tag == '/page':
|
||||
colon = title.find(':')
|
||||
if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
|
||||
not redirect and not title.startswith(templateNamespace)):
|
||||
for id, revid, title, page in collect_pages(input):
|
||||
job = (id, revid, urlbase, title, page, ordinal)
|
||||
jobs_queue.put(job) # goes to any available extract_process
|
||||
last_id = id
|
||||
ordinal += 1
|
||||
id = ''
|
||||
revid = ''
|
||||
page = []
|
||||
|
||||
input.close()
|
||||
|
||||
@ -467,7 +478,7 @@ def extract_process(jobs_queue, output_queue, html_safe):
|
||||
:html_safe: whether to convert entities in text to HTML.
|
||||
"""
|
||||
while True:
|
||||
job = jobs_queue.get() # job is (id, revid, urlbase, title, page, ordinal)
|
||||
job = jobs_queue.get() # job is (id, revid, urlbase, title, page)
|
||||
if job:
|
||||
out = StringIO() # memory buffer
|
||||
Extractor(*job[:-1]).extract(out, html_safe) # (id, urlbase, title, page)
|
||||
@ -479,7 +490,8 @@ def extract_process(jobs_queue, output_queue, html_safe):
|
||||
|
||||
|
||||
def reduce_process(output_queue, output):
|
||||
"""Pull finished article text, write series of files (or stdout)
|
||||
"""
|
||||
Pull finished article text, write series of files (or stdout)
|
||||
:param output_queue: text to be output.
|
||||
:param output: file object where to print.
|
||||
"""
|
||||
@ -515,7 +527,7 @@ minFileSize = 200 * 1024
|
||||
|
||||
|
||||
def main():
|
||||
global urlbase, acceptedNamespaces
|
||||
global acceptedNamespaces
|
||||
global expand_templates, templateCache
|
||||
|
||||
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
||||
@ -609,24 +621,10 @@ def main():
|
||||
with open(args.templates) as file:
|
||||
load_templates(file)
|
||||
|
||||
with open(input_file) as file:
|
||||
page = file.read()
|
||||
ids = re.findall(r'<id>(\d*?)</id>', page)
|
||||
id = ids[0] if ids else ''
|
||||
revid = ids[1] if len(ids) > 1 else ''
|
||||
m = re.search(r'<title>(.*?)</title>', page)
|
||||
if m:
|
||||
title = m.group(1)
|
||||
else:
|
||||
logging.error('Missing title element')
|
||||
return
|
||||
m = re.search(r'<base>(.*?)</base>', page)
|
||||
if m:
|
||||
base = m.group(1)
|
||||
urlbase = base[:base.rfind("/")]
|
||||
else:
|
||||
urlbase = ''
|
||||
Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
|
||||
with open(input_file) as input:
|
||||
for id, revid, title, page in collect_pages(input):
|
||||
Extractor(id, revid, urlbase, title, page).extract(sys.stdout)
|
||||
return
|
||||
|
||||
output_path = args.output
|
||||
|
@ -26,6 +26,7 @@ from urllib.parse import quote as urlencode
|
||||
from html.entities import name2codepoint
|
||||
import logging
|
||||
import time
|
||||
import pdb # DEBUG
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
@ -81,6 +82,7 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
|
||||
if expand_templates:
|
||||
# expand templates
|
||||
# See: http://www.mediawiki.org/wiki/Help:Templates
|
||||
pdb.set_trace() # DEBUG
|
||||
text = extractor.expandTemplates(text)
|
||||
else:
|
||||
# Drop transclusions (template, parser functions)
|
||||
@ -199,7 +201,12 @@ def compact(text, mark_headers=False):
|
||||
for line in text.split('\n'):
|
||||
|
||||
if not line:
|
||||
if len(listLevel): # implies Extractor.HtmlFormatting
|
||||
for c in reversed(listLevel):
|
||||
page.append(listClose[c])
|
||||
listLevel = ''
|
||||
continue
|
||||
|
||||
# Handle section titles
|
||||
m = section.match(line)
|
||||
if m:
|
||||
@ -227,36 +234,35 @@ def compact(text, mark_headers=False):
|
||||
page.append(title)
|
||||
# handle indents
|
||||
elif line[0] == ':':
|
||||
# page.append(line.lstrip(':*#;'))
|
||||
continue
|
||||
page.append(line.lstrip(':'))
|
||||
# handle lists
|
||||
elif line[0] in '*#;:':
|
||||
# @see https://www.mediawiki.org/wiki/Help:Formatting
|
||||
elif line[0] in '*#;':
|
||||
if Extractor.HtmlFormatting:
|
||||
i = 0
|
||||
for c, n in zip_longest(listLevel, line, fillvalue=''):
|
||||
if not n or n not in '*#;:':
|
||||
if c:
|
||||
page.append(listClose[c])
|
||||
listLevel = listLevel[:-1]
|
||||
continue
|
||||
else:
|
||||
# close extra levels
|
||||
l = 0
|
||||
for c in listLevel:
|
||||
if l < len(line) and c != line[l]:
|
||||
for extra in reversed(listLevel[l:]):
|
||||
page.append(listClose[extra])
|
||||
listLevel = listLevel[:l]
|
||||
break
|
||||
# n != ''
|
||||
if c != n and (not c or (c not in ';:' and n not in ';:')):
|
||||
if c:
|
||||
# close level
|
||||
page.append(listClose[c])
|
||||
listLevel = listLevel[:-1]
|
||||
listLevel += n
|
||||
page.append(listOpen[n])
|
||||
i += 1
|
||||
n = line[i - 1] # last list char
|
||||
line = line[i:].strip()
|
||||
if line: # FIXME: n is '"'
|
||||
page.append(listItem[n] % line)
|
||||
l += 1
|
||||
if l < len(line) and line[l] in '*#;:':
|
||||
# add new level (only one, no jumps)
|
||||
# FIXME: handle jumping levels
|
||||
type = line[l]
|
||||
page.append(listOpen[type])
|
||||
listLevel += type
|
||||
line = line[l+1:].strip()
|
||||
else:
|
||||
# continue on same level
|
||||
type = line[l-1]
|
||||
line = line[l:].strip()
|
||||
page.append(listItem[type] % line)
|
||||
else:
|
||||
continue
|
||||
elif len(listLevel):
|
||||
elif len(listLevel): # implies Extractor.HtmlFormatting
|
||||
for c in reversed(listLevel):
|
||||
page.append(listClose[c])
|
||||
listLevel = []
|
||||
@ -786,6 +792,114 @@ spaces = re.compile(r' {2,}')
|
||||
# Matches dots
|
||||
dots = re.compile(r'\.{4,}')
|
||||
|
||||
# ======================================================================
|
||||
|
||||
class Template(list):
    """
    A Template is a list of TemplateText or TemplateArgs
    """

    @classmethod
    def parse(cls, body):
        """
        Split :param body: into fixed text and template arguments.

        :param body: the text of a template.
        :return: an instance of this class, alternating TemplateText and
            TemplateArg elements and ending with the leftover TemplateText.
        """
        tpl = cls()  # cls rather than Template, so that subclasses parse to themselves
        # we must handle nesting, s.a.
        # {{{1|{{PAGENAME}}}
        # {{{italics|{{{italic|}}}
        # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|
        #
        start = 0
        for s, e in findMatchingBraces(body, 3):
            tpl.append(TemplateText(body[start:s]))
            # drop the enclosing '{{{' and '}}}'
            tpl.append(TemplateArg(body[s + 3:e - 3]))
            start = e
        tpl.append(TemplateText(body[start:]))  # leftover
        return tpl

    def subst(self, params, extractor, depth=0):
        """
        Substitute parameter values in each element of this template.

        :param params: passed unchanged to the subst() of each element.
        :param extractor: supplies maxParameterRecursionLevels and counts
            exceeded recursions in recursion_exceeded_3_errs.
        :param depth: current recursion depth.
        :return: the concatenation of the substituted elements, or '' when
            :param depth: exceeds extractor.maxParameterRecursionLevels.
        """
        # We perform parameter substitutions recursively.
        # We also limit the maximum number of iterations to avoid too long or
        # even endless loops (in case of malformed input).

        # :see: http://meta.wikimedia.org/wiki/Help:Expansion#Distinction_between_variables.2C_parser_functions.2C_and_templates
        #
        # Parameter values are assigned to parameters in two (?) passes.
        # Therefore a parameter name in a template can depend on the value of
        # another parameter of the same template, regardless of the order in
        # which they are specified in the template call, for example, using
        # Template:ppp containing "{{{{{{p}}}}}}", {{ppp|p=q|q=r}} and even
        # {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
        # "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.

        #logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)

        if depth > extractor.maxParameterRecursionLevels:
            extractor.recursion_exceeded_3_errs += 1
            return ''

        return ''.join([tpl.subst(params, extractor, depth) for tpl in self])

    def __str__(self):
        """Return the concatenation of the string form of the elements."""
        return ''.join([str(x) for x in self])
|
||||
|
||||
|
||||
class TemplateText(str):
    """Fixed text of template"""

    def subst(self, params, extractor, depth=0):
        """
        Fixed text has nothing to substitute.

        The parameters mirror those of Template.subst and are not used.

        :return: this text, unchanged.
        """
        return self
|
||||
|
||||
|
||||
class TemplateArg:
    """
    parameter to a template.
    Has a name and a default value, both of which are Templates.
    """

    def __init__(self, parameter):
        """
        :param parameter: the parts of a tplarg.
        """
        # the parameter name itself might contain templates, e.g.:
        #   appointe{{#if:{{{appointer14|}}}|r|d}}14|
        #   4|{{{{{subst|}}}CURRENTYEAR}}

        # any parts in a tplarg after the first (the parameter default) are
        # ignored, and an equals sign in the first part is treated as plain text.
        #logging.debug('TemplateArg %s', parameter)

        parts = splitParts(parameter)
        self.name = Template.parse(parts[0])
        if len(parts) > 1:
            # This parameter has a default value
            self.default = Template.parse(parts[1])
        else:
            self.default = None

    def __str__(self):
        """Return the argument in its '{{{name|default}}}' source form."""
        # compare with None: an explicitly given empty default is still a default
        if self.default is not None:
            return '{{{%s|%s}}}' % (self.name, self.default)
        else:
            return '{{{%s}}}' % self.name

    def subst(self, params, extractor, depth=0):
        """
        Substitute value for this argument from dict :param params:
        Use :param extractor: to evaluate expressions for name and default.
        Limit substitution to the maximum :param depth:.

        :return: the value in :param params: for the expanded parameter name,
            else the expanded default value, else ''.
        """
        # the parameter name itself might contain templates, e.g.:
        # appointe{{#if:{{{appointer14|}}}|r|d}}14|
        paramName = self.name.subst(params, extractor, depth + 1)
        paramName = extractor.expandTemplates(paramName)
        res = ''
        if paramName in params:
            res = params[paramName]  # use parameter value specified in template invocation
        elif self.default is not None:  # use the default value
            defaultValue = self.default.subst(params, extractor, depth + 1)
            res = extractor.expandTemplates(defaultValue)
        #logging.debug('subst arg %d %s -> %s' % (depth, paramName, res))
        return res
|
||||
|
||||
|
||||
# ======================================================================
|
||||
|
||||
substWords = 'subst:|safesubst:'
|
||||
@ -811,6 +925,10 @@ class Extractor():
|
||||
# Whether to produce json instead of the default <doc> output format.
|
||||
toJson = False
|
||||
|
||||
##
|
||||
# Obtained from TemplateNamespace
|
||||
templatePrefix = ''
|
||||
|
||||
def __init__(self, id, revid, urlbase, title, page):
|
||||
"""
|
||||
:param page: a list of lines.
|
||||
@ -827,12 +945,13 @@ class Extractor():
|
||||
self.recursion_exceeded_3_errs = 0 # parameter recursion
|
||||
self.template_title_errs = 0
|
||||
|
||||
def clean_text(self, text, mark_headers=False, expand_templates=False,
|
||||
def clean_text(self, text, mark_headers=False, expand_templates=True,
|
||||
html_safe=True):
|
||||
"""
|
||||
:param mark_headers: True to distinguish headers from paragraphs
|
||||
e.g. "## Section 1"
|
||||
"""
|
||||
self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
|
||||
self.magicWords['pagename'] = self.title
|
||||
self.magicWords['fullpagename'] = self.title
|
||||
self.magicWords['currentyear'] = time.strftime('%Y')
|
||||
@ -978,7 +1097,11 @@ class Extractor():
|
||||
# The '=' might occur within an HTML attribute:
|
||||
# "<ref name=value"
|
||||
# but we stop at first.
|
||||
m = re.match(' *([^=]*?) *=(.*)', param, re.DOTALL)
|
||||
|
||||
# The '=' might occur within quotes:
|
||||
# ''''<span lang="pt-pt" xml:lang="pt-pt">cénicas</span>'''
|
||||
|
||||
m = re.match(" *([^=']*?) *=(.*)", param, re.DOTALL)
|
||||
if m:
|
||||
# This is a named parameter. This case also handles parameter
|
||||
# assignments like "2=xxx", where the number of an unnamed
|
||||
@ -1273,7 +1396,7 @@ def findMatchingBraces(text, ldelim=0):
|
||||
|
||||
if ldelim: # 2-3
|
||||
reOpen = re.compile('[{]{%d,}' % ldelim) # at least ldelim
|
||||
reNext = re.compile('[{]{2,}|}{2,}') # at least 2
|
||||
reNext = re.compile('[{]{2,}|}{2,}') # at least 2 open or close bracces
|
||||
else:
|
||||
reOpen = re.compile('{{2,}|\[{2,}')
|
||||
reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}') # at least 2
|
||||
@ -1439,7 +1562,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
|
||||
# space]], but having in the system a redirect page with an empty title
|
||||
# causes numerous problems, so we'll live happier without it.
|
||||
if templateTitle:
|
||||
return templatePrefix + ucfirst(templateTitle)
|
||||
return Extractor.templatePrefix + ucfirst(templateTitle)
|
||||
else:
|
||||
return '' # caller may log as error
|
||||
|
||||
@ -1489,7 +1612,7 @@ def sharp_expr(expr):
|
||||
expr = re.sub('mod', '%', expr)
|
||||
expr = re.sub('\bdiv\b', '/', expr)
|
||||
expr = re.sub('\bround\b', '|ROUND|', expr)
|
||||
return unicode(eval(expr))
|
||||
return str(eval(expr))
|
||||
except:
|
||||
return '<span class="error"></span>'
|
||||
|
||||
@ -1675,7 +1798,7 @@ def callParserFunction(functionName, args, frame):
|
||||
reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
|
||||
reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)
|
||||
|
||||
# These are built before spawning processes, hence thay are shared.
|
||||
# These are built before spawning processes, hence they are shared.
|
||||
templates = {}
|
||||
redirects = {}
|
||||
# cache of parser templates
|
||||
|
@ -46,7 +46,8 @@ tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
|
||||
def process_data(input_file, id, templates=False):
|
||||
"""
|
||||
:param input_file: name of the wikipedia dump file.
|
||||
:param id: article id
|
||||
:param id: article id.
|
||||
:param templates: whether article is a template.
|
||||
"""
|
||||
|
||||
if input_file.lower().endswith(".bz2"):
|
||||
@ -105,9 +106,9 @@ def main():
|
||||
parser.add_argument("--id", default="1",
|
||||
help="article number")
|
||||
parser.add_argument("--template", action="store_true",
|
||||
help="template number")
|
||||
help="whether article is a template")
|
||||
parser.add_argument("-v", "--version", action="version",
|
||||
version='%(prog)s ' + version,
|
||||
version='%(prog)s ' + __version__,
|
||||
help="print program version")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
Loading…
Reference in New Issue
Block a user