Restored template expansion and HTML preserving flag.

Giuseppe Attardi 2022-03-07 10:31:01 +01:00
parent 1053fe2030
commit 05b5cc7e51
3 changed files with 241 additions and 119 deletions

View File

@@ -79,7 +79,6 @@ knownNamespaces = set(['Template'])
# The namespace used for template definitions
# It is the name associated with namespace key=10 in the siteinfo header.
templateNamespace = ''
templatePrefix = ''
##
# The namespace used for module definitions
@@ -196,8 +195,7 @@ def load_templates(file, output_file=None):
    Load templates from :param file:.
    :param output_file: file where to save templates and modules.
    """
    global templateNamespace, templatePrefix
    templatePrefix = templateNamespace + ':'
    global templateNamespace
    global moduleNamespace, modulePrefix
    modulePrefix = moduleNamespace + ':'
    articles = 0
@@ -220,6 +218,13 @@ def load_templates(file, output_file=None):
            page = []
        elif tag == 'title':
            title = m.group(3)
            if not output_file and not templateNamespace: # do not know it yet
                # we reconstruct it from the first title
                colon = title.find(':')
                if colon > 1:
                    templateNamespace = title[:colon]
                    Extractor.templatePrefix = title[:colon + 1]
            # FIXME: should reconstruct also moduleNamespace
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
@@ -233,18 +238,11 @@ def load_templates(file, output_file=None):
        elif inText:
            page.append(line)
        elif tag == '/page':
            if not output_file and not templateNamespace: # do not know it yet
                # we reconstruct it from the first title
                colon = title.find(':')
                if colon > 1:
                    templateNamespace = title[:colon]
                    templatePrefix = title[:colon + 1]
            # FIXME: should reconstruct also moduleNamespace
            if title.startswith(templatePrefix):
            if title.startswith(Extractor.templatePrefix):
                define_template(title, page)
                templates += 1
            # save templates and modules to file
            if output_file and (title.startswith(templatePrefix) or
            if output_file and (title.startswith(Extractor.templatePrefix) or
                                title.startswith(modulePrefix)):
                output.write('<page>\n')
                output.write(' <title>%s</title>\n' % title)
@@ -279,6 +277,63 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
        return open(filename, mode, encoding=encoding)


def collect_pages(text):
    """
    :param text: the text of a wikipedia file dump.
    """
    # we collect individual lines, since str.join() is significantly faster
    # than concatenation
    page = []
    id = ''
    revid = ''
    last_id = ''
    inText = False
    redirect = False
    for line in text:
        if '<' not in line: # faster than doing re.search()
            if inText:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'id' and id: # <revision> <id></id> </revision>
            revid = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'redirect':
            redirect = True
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4: # open-close
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            colon = title.find(':')
            if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                    not redirect and not title.startswith(templateNamespace)):
                yield (id, revid, title, page)
                last_id = id
            id = ''
            revid = ''
            page = []
            inText = False
            redirect = False
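
For reference, a minimal sketch (not from the commit) of the tag matching that collect_pages relies on, reusing the tagRE pattern defined at the top of the file; the sample dump lines are invented:

import re

# same pattern as the module-level tagRE:
# group(2) is the tag name, group(3) the text between the open and close tags
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')

samples = [
    '    <title>Template:Infobox person</title>',     # invented sample lines
    '    <id>12345</id>',
    '    <text xml:space="preserve">Some wikitext',
]
for sample in samples:
    m = tagRE.search(sample)
    print(m.group(2), repr(m.group(3)))
# -> title 'Template:Infobox person' / id '12345' / text 'Some wikitext'
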
def process_dump(input_file, template_file, out_file, file_size, file_compress,
                 process_count, html_safe):
    """
@@ -290,7 +345,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    :param process_count: number of extraction processes to spawn.
    """
    global knownNamespaces
    global templateNamespace, templatePrefix
    global templateNamespace
    global moduleNamespace, modulePrefix
    urlbase = '' # This is obtained from <siteinfo>
@@ -313,7 +368,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
            knownNamespaces.add(m.group(3))
            if re.search('key="10"', line):
                templateNamespace = m.group(3)
                templatePrefix = templateNamespace + ':'
                Extractor.templatePrefix = templateNamespace + ':'
            elif re.search('key="828"', line):
                moduleNamespace = m.group(3)
                modulePrefix = moduleNamespace + ':'
@@ -383,56 +438,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    # we collect individual lines, since str.join() is significantly faster
    # than concatenation
    page = []
    id = ''
    revid = ''
    last_id = ''
    ordinal = 0 # page count
    inText = False
    redirect = False
    for line in input:
        if '<' not in line: # faster than doing re.search()
            if inText:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'id' and id: # <revision> <id></id> </revision>
            revid = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'redirect':
            redirect = True
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4: # open-close
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            colon = title.find(':')
            if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                    not redirect and not title.startswith(templateNamespace)):
                job = (id, revid, urlbase, title, page, ordinal)
                jobs_queue.put(job) # goes to any available extract_process
                last_id = id
                ordinal += 1
            id = ''
            revid = ''
            page = []
    for id, revid, title, page in collect_pages(input):
        job = (id, revid, urlbase, title, page, ordinal)
        jobs_queue.put(job) # goes to any available extract_process
        ordinal += 1
    input.close()
@@ -467,7 +478,7 @@ def extract_process(jobs_queue, output_queue, html_safe):
    :html_safe: whether to convert entities in text to HTML.
    """
    while True:
        job = jobs_queue.get() # job is (id, revid, urlbase, title, page, ordinal)
        job = jobs_queue.get() # job is (id, revid, urlbase, title, page)
        if job:
            out = StringIO() # memory buffer
            Extractor(*job[:-1]).extract(out, html_safe) # (id, urlbase, title, page)
@@ -479,7 +490,8 @@ def extract_process(jobs_queue, output_queue, html_safe):
def reduce_process(output_queue, output):
    """Pull finished article text, write series of files (or stdout)
    """
    Pull finished article text, write series of files (or stdout)
    :param output_queue: text to be output.
    :param output: file object where to print.
    """
@@ -515,7 +527,7 @@ minFileSize = 200 * 1024
def main():
    global urlbase, acceptedNamespaces
    global acceptedNamespaces
    global expand_templates, templateCache
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
@@ -609,24 +621,10 @@ def main():
                with open(args.templates) as file:
                    load_templates(file)
        with open(input_file) as file:
            page = file.read()
            ids = re.findall(r'<id>(\d*?)</id>', page)
            id = ids[0] if ids else ''
            revid = ids[1] if len(ids) > 1 else ''
            m = re.search(r'<title>(.*?)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            m = re.search(r'<base>(.*?)</base>', page)
            if m:
                base = m.group(1)
                urlbase = base[:base.rfind("/")]
            else:
                urlbase = ''
            Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
        urlbase = ''
        with open(input_file) as input:
            for id, revid, title, page in collect_pages(input):
                Extractor(id, revid, urlbase, title, page).extract(sys.stdout)
        return

    output_path = args.output

View File

@@ -26,6 +26,7 @@ from urllib.parse import quote as urlencode
from html.entities import name2codepoint
import logging
import time
import pdb # DEBUG
# ----------------------------------------------------------------------
@@ -81,6 +82,7 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
    if expand_templates:
        # expand templates
        # See: http://www.mediawiki.org/wiki/Help:Templates
        pdb.set_trace() # DEBUG
        text = extractor.expandTemplates(text)
    else:
        # Drop transclusions (template, parser functions)
@@ -199,7 +201,12 @@ def compact(text, mark_headers=False):
    for line in text.split('\n'):
        if not line:
            if len(listLevel): # implies Extractor.HtmlFormatting
                for c in reversed(listLevel):
                    page.append(listClose[c])
                listLevel = ''
            continue
        # Handle section titles
        m = section.match(line)
        if m:
@@ -227,36 +234,35 @@ def compact(text, mark_headers=False):
                page.append(title)
        # handle indents
        elif line[0] == ':':
            # page.append(line.lstrip(':*#;'))
            continue
            page.append(line.lstrip(':'))
        # handle lists
        elif line[0] in '*#;:':
        # @see https://www.mediawiki.org/wiki/Help:Formatting
        elif line[0] in '*#;':
            if Extractor.HtmlFormatting:
                i = 0
                for c, n in zip_longest(listLevel, line, fillvalue=''):
                    if not n or n not in '*#;:':
                        if c:
                            page.append(listClose[c])
                            listLevel = listLevel[:-1]
                            continue
                        else:
                            break
                    # n != ''
                    if c != n and (not c or (c not in ';:' and n not in ';:')):
                        if c:
                            # close level
                            page.append(listClose[c])
                            listLevel = listLevel[:-1]
                        listLevel += n
                        page.append(listOpen[n])
                    i += 1
                n = line[i - 1] # last list char
                line = line[i:].strip()
                if line: # FIXME: n is '"'
                    page.append(listItem[n] % line)
                # close extra levels
                l = 0
                for c in listLevel:
                    if l < len(line) and c != line[l]:
                        for extra in reversed(listLevel[l:]):
                            page.append(listClose[extra])
                        listLevel = listLevel[:l]
                        break
                    l += 1
                if l < len(line) and line[l] in '*#;:':
                    # add new level (only one, no jumps)
                    # FIXME: handle jumping levels
                    type = line[l]
                    page.append(listOpen[type])
                    listLevel += type
                    line = line[l+1:].strip()
                else:
                    # continue on same level
                    type = line[l-1]
                    line = line[l:].strip()
                page.append(listItem[type] % line)
            else:
                continue
        elif len(listLevel):
        elif len(listLevel): # implies Extractor.HtmlFormatting
            for c in reversed(listLevel):
                page.append(listClose[c])
            listLevel = []
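
The branch above keeps the currently open list levels in listLevel and emits matching open and close tags when Extractor.HtmlFormatting is set. A self-contained sketch of that bookkeeping (not from the commit; the toy OPEN/CLOSE/ITEM tables stand in for listOpen, listClose and listItem, and the ';'/':' definition-list cases are omitted):

OPEN = {'*': '<ul>', '#': '<ol>'}
CLOSE = {'*': '</ul>', '#': '</ol>'}
ITEM = '<li>%s</li>'

def render_lists(lines):
    out, level = [], ''
    for line in lines:
        marks = ''
        while line[:1] in OPEN:              # leading '*'/'#' markers
            marks += line[0]
            line = line[1:]
        while level and not marks.startswith(level):
            out.append(CLOSE[level[-1]])     # close levels that are no longer open
            level = level[:-1]
        for c in marks[len(level):]:
            out.append(OPEN[c])              # open the missing levels
            level += c
        out.append(ITEM % line.strip() if marks else line)
    out.extend(CLOSE[c] for c in reversed(level))
    return out

print(render_lists(['* one', '* two', '** nested', '# renumbered']))
# ['<ul>', '<li>one</li>', '<li>two</li>', '<ul>', '<li>nested</li>',
#  '</ul>', '</ul>', '<ol>', '<li>renumbered</li>', '</ol>']
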
@@ -786,6 +792,114 @@ spaces = re.compile(r' {2,}')
# Matches dots
dots = re.compile(r'\.{4,}')
# ======================================================================
class Template(list):
    """
    A Template is a list of TemplateText or TemplateArgs
    """

    @classmethod
    def parse(cls, body):
        tpl = Template()
        # we must handle nesting, s.a.
        # {{{1|{{PAGENAME}}}
        # {{{italics|{{{italic|}}}
        # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|
        #
        start = 0
        for s,e in findMatchingBraces(body, 3):
            tpl.append(TemplateText(body[start:s]))
            tpl.append(TemplateArg(body[s+3:e-3]))
            start = e
        tpl.append(TemplateText(body[start:])) # leftover
        return tpl

    def subst(self, params, extractor, depth=0):
        # We perform parameter substitutions recursively.
        # We also limit the maximum number of iterations to avoid too long or
        # even endless loops (in case of malformed input).
        # :see: http://meta.wikimedia.org/wiki/Help:Expansion#Distinction_between_variables.2C_parser_functions.2C_and_templates
        #
        # Parameter values are assigned to parameters in two (?) passes.
        # Therefore a parameter name in a template can depend on the value of
        # another parameter of the same template, regardless of the order in
        # which they are specified in the template call, for example, using
        # Template:ppp containing "{{{{{{p}}}}}}", {{ppp|p=q|q=r}} and even
        # {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
        # "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.
        #logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
        if depth > extractor.maxParameterRecursionLevels:
            extractor.recursion_exceeded_3_errs += 1
            return ''
        return ''.join([tpl.subst(params, extractor, depth) for tpl in self])

    def __str__(self):
        return ''.join([str(x) for x in self])


class TemplateText(str):
    """Fixed text of template"""

    def subst(self, params, extractor, depth):
        return self


class TemplateArg():
    """
    parameter to a template.
    Has a name and a default value, both of which are Templates.
    """

    def __init__(self, parameter):
        """
        :param parameter: the parts of a tplarg.
        """
        # the parameter name itself might contain templates, e.g.:
        # appointe{{#if:{{{appointer14|}}}|r|d}}14|
        # 4|{{{{{subst|}}}CURRENTYEAR}}
        # any parts in a tplarg after the first (the parameter default) are
        # ignored, and an equals sign in the first part is treated as plain text.
        #logging.debug('TemplateArg %s', parameter)
        parts = splitParts(parameter)
        self.name = Template.parse(parts[0])
        if len(parts) > 1:
            # This parameter has a default value
            self.default = Template.parse(parts[1])
        else:
            self.default = None

    def __str__(self):
        if self.default:
            return '{{{%s|%s}}}' % (self.name, self.default)
        else:
            return '{{{%s}}}' % self.name
    def subst(self, params, extractor, depth):
        """
        Substitute value for this argument from dict :param params:
        Use :param extractor: to evaluate expressions for name and default.
        Limit substitution to the maximum :param depth:.
        """
        # the parameter name itself might contain templates, e.g.:
        # appointe{{#if:{{{appointer14|}}}|r|d}}14|
        paramName = self.name.subst(params, extractor, depth+1)
        paramName = extractor.expandTemplates(paramName)
        res = ''
        if paramName in params:
            res = params[paramName] # use parameter value specified in template invocation
        elif self.default: # use the default value
            defaultValue = self.default.subst(params, extractor, depth+1)
            res = extractor.expandTemplates(defaultValue)
        #logging.debug('subst arg %d %s -> %s' % (depth, paramName, res))
        return res
# ======================================================================
substWords = 'subst:|safesubst:'
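
The parameter handling described in the comments of Template.subst and TemplateArg above is easiest to see on a tiny body. A hedged sketch, assuming the module is importable as wikiextractor.extract; StubExtractor is a made-up stand-in for the real Extractor, just enough to exercise the default-value path:

from wikiextractor.extract import Template

class StubExtractor:                      # hypothetical stand-in, not the real class
    maxParameterRecursionLevels = 16
    recursion_exceeded_3_errs = 0
    def expandTemplates(self, text):
        return text                       # no transclusion in this sketch

tpl = Template.parse('Hello {{{name|world}}}!')
print(tpl)                                                # Hello {{{name|world}}}!
print(tpl.subst({'name': 'Wikipedia'}, StubExtractor()))  # Hello Wikipedia!
print(tpl.subst({}, StubExtractor()))                     # Hello world!
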
@@ -811,6 +925,10 @@ class Extractor():
    # Whether to produce json instead of the default <doc> output format.
    toJson = False

    ##
    # Obtained from TemplateNamespace
    templatePrefix = ''

    def __init__(self, id, revid, urlbase, title, page):
        """
        :param page: a list of lines.
@@ -827,12 +945,13 @@ class Extractor():
        self.recursion_exceeded_3_errs = 0 # parameter recursion
        self.template_title_errs = 0

    def clean_text(self, text, mark_headers=False, expand_templates=False,
    def clean_text(self, text, mark_headers=False, expand_templates=True,
                   html_safe=True):
        """
        :param mark_headers: True to distinguish headers from paragraphs
          e.g. "## Section 1"
        """
        self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
        self.magicWords['pagename'] = self.title
        self.magicWords['fullpagename'] = self.title
        self.magicWords['currentyear'] = time.strftime('%Y')
@@ -978,7 +1097,11 @@ class Extractor():
            # The '=' might occur within an HTML attribute:
            # "&lt;ref name=value"
            # but we stop at first.
            m = re.match(' *([^=]*?) *=(.*)', param, re.DOTALL)
            # The '=' might occur within quotes:
            # ''''<span lang="pt-pt" xml:lang="pt-pt">cénicas</span>'''
            m = re.match(" *([^=']*?) *=(.*)", param, re.DOTALL)
            if m:
                # This is a named parameter. This case also handles parameter
                # assignments like "2=xxx", where the number of an unnamed
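
The effect of the tightened pattern above: an apostrophe before the first '=' now stops a positional value that merely contains '=' (for example bold markup around an HTML span) from being misread as a named parameter. A small check, not from the commit, with invented parameter strings:

import re

old = re.compile(' *([^=]*?) *=(.*)', re.DOTALL)
new = re.compile(" *([^=']*?) *=(.*)", re.DOTALL)

named = 'name=value'
positional = "'''<span lang=\"pt-pt\">cénicas</span>'''"

print(bool(old.match(named)), bool(new.match(named)))            # True True
print(bool(old.match(positional)), bool(new.match(positional)))  # True False
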
@@ -1273,7 +1396,7 @@ def findMatchingBraces(text, ldelim=0):
    if ldelim: # 2-3
        reOpen = re.compile('[{]{%d,}' % ldelim) # at least ldelim
        reNext = re.compile('[{]{2,}|}{2,}') # at least 2
        reNext = re.compile('[{]{2,}|}{2,}') # at least 2 open or close braces
    else:
        reOpen = re.compile('{{2,}|\[{2,}')
        reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}') # at least 2
@@ -1439,7 +1562,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
    # space]], but having in the system a redirect page with an empty title
    # causes numerous problems, so we'll live happier without it.
    if templateTitle:
        return templatePrefix + ucfirst(templateTitle)
        return Extractor.templatePrefix + ucfirst(templateTitle)
    else:
        return '' # caller may log as error
@@ -1489,7 +1612,7 @@ def sharp_expr(expr):
        expr = re.sub('mod', '%', expr)
        expr = re.sub('\bdiv\b', '/', expr)
        expr = re.sub('\bround\b', '|ROUND|', expr)
        return unicode(eval(expr))
        return str(eval(expr))
    except:
        return '<span class="error"></span>'
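
For context, sharp_expr backs the {{#expr:}} parser function by rewriting MediaWiki operators into Python and evaluating the result. A reduced sketch of that idea (not the exact code above: raw-string word boundaries, and only the mod and div operators):

import re

def eval_sharp_expr(expr):
    expr = re.sub(r'\bmod\b', '%', expr)   # MediaWiki 'mod' -> Python '%'
    expr = re.sub(r'\bdiv\b', '/', expr)   # MediaWiki 'div' -> Python '/'
    try:
        return str(eval(expr))
    except Exception:
        return '<span class="error"></span>'

print(eval_sharp_expr('3 + 4 * 2'))   # 11
print(eval_sharp_expr('7 mod 3'))     # 1
print(eval_sharp_expr('8 div 2'))     # 4.0
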
@@ -1675,7 +1798,7 @@ def callParserFunction(functionName, args, frame):
reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)
# These are built before spawning processes, hence thay are shared.
# These are built before spawning processes, hence they are shared.
templates = {}
redirects = {}
# cache of parser templates

View File

@@ -46,7 +46,8 @@ tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
def process_data(input_file, id, templates=False):
    """
    :param input_file: name of the wikipedia dump file.
    :param id: article id
    :param id: article id.
    :param templates: whether article is a template.
    """
    if input_file.lower().endswith(".bz2"):
@@ -105,9 +106,9 @@ def main():
    parser.add_argument("--id", default="1",
                        help="article number")
    parser.add_argument("--template", action="store_true",
                        help="template number")
                        help="whether article is a template")
    parser.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        version='%(prog)s ' + __version__,
                        help="print program version")
    args = parser.parse_args()