diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 99a39ad..feab143 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -79,7 +79,6 @@ knownNamespaces = set(['Template'])
 # The namespace used for template definitions
 # It is the name associated with namespace key=10 in the siteinfo header.
 templateNamespace = ''
-templatePrefix = ''
 
 ##
 # The namespace used for module definitions
@@ -196,8 +195,7 @@ def load_templates(file, output_file=None):
     Load templates from :param file:.
     :param output_file: file where to save templates and modules.
     """
-    global templateNamespace, templatePrefix
-    templatePrefix = templateNamespace + ':'
+    global templateNamespace
     global moduleNamespace, modulePrefix
     modulePrefix = moduleNamespace + ':'
     articles = 0
@@ -220,6 +218,13 @@ def load_templates(file, output_file=None):
             page = []
         elif tag == 'title':
             title = m.group(3)
+            if not output_file and not templateNamespace:  # do not know it yet
+                # we reconstruct it from the first title
+                colon = title.find(':')
+                if colon > 1:
+                    templateNamespace = title[:colon]
+                    Extractor.templatePrefix = title[:colon + 1]
+                # FIXME: should reconstruct also moduleNamespace
         elif tag == 'text':
             inText = True
             line = line[m.start(3):m.end(3)]
@@ -233,18 +238,11 @@ def load_templates(file, output_file=None):
         elif inText:
             page.append(line)
         elif tag == '/page':
-            if not output_file and not templateNamespace:  # do not know it yet
-                # we reconstruct it from the first title
-                colon = title.find(':')
-                if colon > 1:
-                    templateNamespace = title[:colon]
-                    templatePrefix = title[:colon + 1]
-            # FIXME: should reconstruct also moduleNamespace
-            if title.startswith(templatePrefix):
+            if title.startswith(Extractor.templatePrefix):
                 define_template(title, page)
                 templates += 1
             # save templates and modules to file
-            if output_file and (title.startswith(templatePrefix) or
+            if output_file and (title.startswith(Extractor.templatePrefix) or
                                 title.startswith(modulePrefix)):
                 output.write('<page>\n')
                 output.write('   <title>%s</title>\n' % title)
@@ -279,6 +277,63 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
         return open(filename, mode, encoding=encoding)
 
 
+def collect_pages(text):
+    """
+    :param text: the text of a Wikipedia file dump.
+ """ + # we collect individual lines, since str.join() is significantly faster + # than concatenation + page = [] + id = '' + revid = '' + last_id = '' + inText = False + redirect = False + for line in text: + if '<' not in line: # faster than doing re.search() + if inText: + page.append(line) + continue + m = tagRE.search(line) + if not m: + continue + tag = m.group(2) + if tag == 'page': + page = [] + redirect = False + elif tag == 'id' and not id: + id = m.group(3) + elif tag == 'id' and id: # + revid = m.group(3) + elif tag == 'title': + title = m.group(3) + elif tag == 'redirect': + redirect = True + elif tag == 'text': + inText = True + line = line[m.start(3):m.end(3)] + page.append(line) + if m.lastindex == 4: # open-close + inText = False + elif tag == '/text': + if m.group(1): + page.append(m.group(1)) + inText = False + elif inText: + page.append(line) + elif tag == '/page': + colon = title.find(':') + if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and + not redirect and not title.startswith(templateNamespace)): + yield (id, revid, title, page) + last_id = id + id = '' + revid = '' + page = [] + inText = False + redirect = False + + def process_dump(input_file, template_file, out_file, file_size, file_compress, process_count, html_safe): """ @@ -290,7 +345,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, :param process_count: number of extraction processes to spawn. """ global knownNamespaces - global templateNamespace, templatePrefix + global templateNamespace global moduleNamespace, modulePrefix urlbase = '' # This is obtained from @@ -313,7 +368,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, knownNamespaces.add(m.group(3)) if re.search('key="10"', line): templateNamespace = m.group(3) - templatePrefix = templateNamespace + ':' + Extractor.templatePrefix = templateNamespace + ':' elif re.search('key="828"', line): moduleNamespace = m.group(3) modulePrefix = moduleNamespace + ':' @@ -383,56 +438,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, # we collect individual lines, since str.join() is significantly faster # than concatenation - page = [] - id = '' - revid = '' - last_id = '' + ordinal = 0 # page count - inText = False - redirect = False - for line in input: - if '<' not in line: # faster than doing re.search() - if inText: - page.append(line) - continue - m = tagRE.search(line) - if not m: - continue - tag = m.group(2) - if tag == 'page': - page = [] - redirect = False - elif tag == 'id' and not id: - id = m.group(3) - elif tag == 'id' and id: # - revid = m.group(3) - elif tag == 'title': - title = m.group(3) - elif tag == 'redirect': - redirect = True - elif tag == 'text': - inText = True - line = line[m.start(3):m.end(3)] - page.append(line) - if m.lastindex == 4: # open-close - inText = False - elif tag == '/text': - if m.group(1): - page.append(m.group(1)) - inText = False - elif inText: - page.append(line) - elif tag == '/page': - colon = title.find(':') - if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and - not redirect and not title.startswith(templateNamespace)): - job = (id, revid, urlbase, title, page, ordinal) - jobs_queue.put(job) # goes to any available extract_process - last_id = id - ordinal += 1 - id = '' - revid = '' - page = [] + for id, revid, title, page in collect_pages(input): + job = (id, revid, urlbase, title, page, ordinal) + jobs_queue.put(job) # goes to any available extract_process 
+        ordinal += 1
 
     input.close()
 
@@ -467,7 +478,7 @@ def extract_process(jobs_queue, output_queue, html_safe):
     :html_safe: whether to convert entities in text to HTML.
     """
     while True:
-        job = jobs_queue.get()  # job is (id, revid, urlbase, title, page, ordinal)
+        job = jobs_queue.get()  # job is (id, revid, urlbase, title, page)
         if job:
             out = StringIO()  # memory buffer
             Extractor(*job[:-1]).extract(out, html_safe)  # (id, urlbase, title, page)
@@ -479,7 +490,8 @@ def extract_process(jobs_queue, output_queue, html_safe):
 
 
 def reduce_process(output_queue, output):
-    """Pull finished article text, write series of files (or stdout)
+    """
+    Pull finished article text, write series of files (or stdout)
     :param output_queue: text to be output.
     :param output: file object where to print.
     """
@@ -515,7 +527,7 @@ minFileSize = 200 * 1024
 
 def main():
-    global urlbase, acceptedNamespaces
+    global acceptedNamespaces
     global expand_templates, templateCache
 
     parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
@@ -609,24 +621,10 @@ def main():
             with open(args.templates) as file:
                 load_templates(file)
 
-        with open(input_file) as file:
-            page = file.read()
-            ids = re.findall(r'<id>(\d*?)</id>', page)
-            id = ids[0] if ids else ''
-            revid = ids[1] if len(ids) > 1 else ''
-            m = re.search(r'<title>(.*?)</title>', page)
-            if m:
-                title = m.group(1)
-            else:
-                logging.error('Missing title element')
-                return
-            m = re.search(r'<base>(.*?)</base>', page)
-            if m:
-                base = m.group(1)
-                urlbase = base[:base.rfind("/")]
-            else:
-                urlbase = ''
-        Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
+        urlbase = ''
+        with open(input_file) as input:
+            for id, revid, title, page in collect_pages(input):
+                Extractor(id, revid, urlbase, title, page).extract(sys.stdout)
         return
 
     output_path = args.output
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index 2180dc4..5a3bd5a 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -199,7 +199,12 @@ def compact(text, mark_headers=False):
 
     for line in text.split('\n'):
         if not line:
+            if len(listLevel):  # implies Extractor.HtmlFormatting
+                for c in reversed(listLevel):
+                    page.append(listClose[c])
+                listLevel = ''
             continue
+
         # Handle section titles
         m = section.match(line)
         if m:
@@ -227,36 +232,35 @@ def compact(text, mark_headers=False):
             page.append(title)
         # handle indents
         elif line[0] == ':':
-            # page.append(line.lstrip(':*#;'))
-            continue
+            page.append(line.lstrip(':'))
         # handle lists
-        elif line[0] in '*#;:':
+        # @see https://www.mediawiki.org/wiki/Help:Formatting
+        elif line[0] in '*#;':
             if Extractor.HtmlFormatting:
-                i = 0
-                for c, n in zip_longest(listLevel, line, fillvalue=''):
-                    if not n or n not in '*#;:':
-                        if c:
-                            page.append(listClose[c])
-                            listLevel = listLevel[:-1]
-                            continue
-                        else:
-                            break
-                    # n != ''
-                    if c != n and (not c or (c not in ';:' and n not in ';:')):
-                        if c:
-                            # close level
-                            page.append(listClose[c])
-                            listLevel = listLevel[:-1]
-                        listLevel += n
-                        page.append(listOpen[n])
-                    i += 1
-                n = line[i - 1]  # last list char
-                line = line[i:].strip()
-                if line:  # FIXME: n is '"'
-                    page.append(listItem[n] % line)
+                # close extra levels
+                l = 0
+                for c in listLevel:
+                    if l < len(line) and c != line[l]:
+                        for extra in reversed(listLevel[l:]):
+                            page.append(listClose[extra])
+                        listLevel = listLevel[:l]
+                        break
+                    l += 1
+                if l < len(line) and line[l] in '*#;:':
+                    # add new level (only one, no jumps)
+                    # FIXME: handle jumping levels
+                    type = line[l]
+                    page.append(listOpen[type])
+                    listLevel += type
+                    line = line[l+1:].strip()
+                else:
+                    # continue on same level
+                    type = line[l-1]
+                    line = line[l:].strip()
+                page.append(listItem[type] % line)
             else:
                 continue
-        elif len(listLevel):
+        elif len(listLevel):  # implies Extractor.HtmlFormatting
             for c in reversed(listLevel):
                 page.append(listClose[c])
             listLevel = []
@@ -786,6 +790,114 @@ spaces = re.compile(r' {2,}')
 
 # Matches dots
 dots = re.compile(r'\.{4,}')
 
+# ======================================================================
+
+class Template(list):
+    """
+    A Template is a list of TemplateText or TemplateArgs
+    """
+
+    @classmethod
+    def parse(cls, body):
+        tpl = Template()
+        # we must handle nesting, such as:
+        #   {{{1|{{PAGENAME}}}
+        #   {{{italics|{{{italic|}}}
+        #   {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|
+        #
+        start = 0
+        for s, e in findMatchingBraces(body, 3):
+            tpl.append(TemplateText(body[start:s]))
+            tpl.append(TemplateArg(body[s+3:e-3]))
+            start = e
+        tpl.append(TemplateText(body[start:]))  # leftover
+        return tpl
+
+    def subst(self, params, extractor, depth=0):
+        # We perform parameter substitutions recursively.
+        # We also limit the maximum number of iterations to avoid too long or
+        # even endless loops (in case of malformed input).
+
+        # :see: http://meta.wikimedia.org/wiki/Help:Expansion#Distinction_between_variables.2C_parser_functions.2C_and_templates
+        #
+        # Parameter values are assigned to parameters in two (?) passes.
+        # Therefore a parameter name in a template can depend on the value of
+        # another parameter of the same template, regardless of the order in
+        # which they are specified in the template call, for example, using
+        # Template:ppp containing "{{{{{{p}}}}}}", {{ppp|p=q|q=r}} and even
+        # {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
+        # "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.
+
+        #logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
+
+        if depth > extractor.maxParameterRecursionLevels:
+            extractor.recursion_exceeded_3_errs += 1
+            return ''
+
+        return ''.join([tpl.subst(params, extractor, depth) for tpl in self])
+
+    def __str__(self):
+        return ''.join([str(x) for x in self])
+
+
+class TemplateText(str):
+    """Fixed text of template"""
+
+    def subst(self, params, extractor, depth):
+        return self
+
+
+class TemplateArg():
+    """
+    Parameter to a template.
+    Has a name and a default value, both of which are Templates.
+    """
+    def __init__(self, parameter):
+        """
+        :param parameter: the parts of a tplarg.
+        """
+        # the parameter name itself might contain templates, e.g.:
+        #   appointe{{#if:{{{appointer14|}}}|r|d}}14|
+        #   4|{{{{{subst|}}}CURRENTYEAR}}
 
+        # any parts in a tplarg after the first (the parameter default) are
+        # ignored, and an equals sign in the first part is treated as plain text.
+        #logging.debug('TemplateArg %s', parameter)
+
+        parts = splitParts(parameter)
+        self.name = Template.parse(parts[0])
+        if len(parts) > 1:
+            # This parameter has a default value
+            self.default = Template.parse(parts[1])
+        else:
+            self.default = None
+
+    def __str__(self):
+        if self.default:
+            return '{{{%s|%s}}}' % (self.name, self.default)
+        else:
+            return '{{{%s}}}' % self.name
+
+    def subst(self, params, extractor, depth):
+        """
+        Substitute value for this argument from dict :param params:
+        Use :param extractor: to evaluate expressions for name and default.
+        Limit substitution to the maximum :param depth:.
+        """
+        # the parameter name itself might contain templates, e.g.:
+        #   appointe{{#if:{{{appointer14|}}}|r|d}}14|
+        paramName = self.name.subst(params, extractor, depth+1)
+        paramName = extractor.expandTemplates(paramName)
+        res = ''
+        if paramName in params:
+            res = params[paramName]  # use parameter value specified in template invocation
+        elif self.default:  # use the default value
+            defaultValue = self.default.subst(params, extractor, depth+1)
+            res = extractor.expandTemplates(defaultValue)
+        #logging.debug('subst arg %d %s -> %s' % (depth, paramName, res))
+        return res
+
 
 # ======================================================================
 
 substWords = 'subst:|safesubst:'
@@ -811,6 +923,10 @@ class Extractor():
     # Whether to produce json instead of the default output format.
     toJson = False
 
+    ##
+    # Obtained from TemplateNamespace
+    templatePrefix = ''
+
     def __init__(self, id, revid, urlbase, title, page):
         """
         :param page: a list of lines.
@@ -827,12 +943,13 @@ class Extractor():
         self.recursion_exceeded_3_errs = 0  # parameter recursion
         self.template_title_errs = 0
 
-    def clean_text(self, text, mark_headers=False, expand_templates=False,
+    def clean_text(self, text, mark_headers=False, expand_templates=True,
                    html_safe=True):
         """
         :param mark_headers: True to distinguish headers from paragraphs
          e.g. "## Section 1"
         """
+        self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
         self.magicWords['pagename'] = self.title
         self.magicWords['fullpagename'] = self.title
         self.magicWords['currentyear'] = time.strftime('%Y')
@@ -978,7 +1095,11 @@ class Extractor():
         # The '=' might occurr within an HTML attribute:
         #   "<ref name=value"
         # but we stop at first.
-        m = re.match(' *([^=]*?) *=(.*)', param, re.DOTALL)
+
+        # The '=' might occur within quotes:
+        #   ''''cénicas'''
+
+        m = re.match(" *([^=']*?) *=(.*)", param, re.DOTALL)
         if m:
             # This is a named parameter.  This case also handles parameter
             # assignments like "2=xxx", where the number of an unnamed
@@ -1273,7 +1394,7 @@ def findMatchingBraces(text, ldelim=0):
     if ldelim:  # 2-3
         reOpen = re.compile('[{]{%d,}' % ldelim)  # at least ldelim
-        reNext = re.compile('[{]{2,}|}{2,}')  # at least 2
+        reNext = re.compile('[{]{2,}|}{2,}')  # at least 2 open or close braces
     else:
         reOpen = re.compile('{{2,}|\[{2,}')
         reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}')  # at least 2
@@ -1439,7 +1560,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
     # space]], but having in the system a redirect page with an empty title
     # causes numerous problems, so we'll live happier without it.
     if templateTitle:
-        return templatePrefix + ucfirst(templateTitle)
+        return Extractor.templatePrefix + ucfirst(templateTitle)
     else:
         return ''  # caller may log as error
@@ -1489,7 +1610,7 @@ def sharp_expr(expr):
         expr = re.sub('mod', '%', expr)
         expr = re.sub('\bdiv\b', '/', expr)
         expr = re.sub('\bround\b', '|ROUND|', expr)
-        return unicode(eval(expr))
+        return str(eval(expr))
     except:
         return ''
@@ -1675,7 +1796,7 @@ def callParserFunction(functionName, args, frame):
 reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
 reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)
 
-# These are built before spawning processes, hence thay are shared.
+# These are built before spawning processes, hence they are shared.
 templates = {}
 redirects = {}
 # cache of parser templates
diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py
index 1e40410..83b6758 100755
--- a/wikiextractor/extractPage.py
+++ b/wikiextractor/extractPage.py
@@ -46,7 +46,8 @@ tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
 
 def process_data(input_file, id, templates=False):
     """
     :param input_file: name of the wikipedia dump file.
-    :param id: article id
+    :param id: article id.
+    :param templates: whether article is a template.
     """
 
     if input_file.lower().endswith(".bz2"):
@@ -105,9 +106,9 @@ def main():
     parser.add_argument("--id", default="1",
                         help="article number")
     parser.add_argument("--template", action="store_true",
-                        help="template number")
+                        help="whether article is a template")
     parser.add_argument("-v", "--version", action="version",
-                        version='%(prog)s ' + version,
+                        version='%(prog)s ' + __version__,
                         help="print program version")
 
     args = parser.parse_args()
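
The core refactoring above extracts the page-scanning loop of process_dump()
into the collect_pages() generator, so the same scanner now feeds both the
multi-process pipeline and the new single-document path in main(). A minimal
usage sketch, assuming the module-level names from this diff (decode_open,
collect_pages) and a hypothetical dump path:

    from wikiextractor.WikiExtractor import decode_open, collect_pages

    with decode_open('dump.xml.bz2') as input:   # path is hypothetical
        for id, revid, title, page in collect_pages(input):
            # page is a list of raw wikitext lines; join lazily, since
            # str.join() is faster than repeated concatenation
            print(id, revid, title, len(''.join(page)))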
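
The rewritten list handling in compact() treats listLevel as a stack of the
currently open list markers ('*', '#', ';', ':'): levels that no longer match
the line's prefix are closed, and at most one new level is opened per line
(see the FIXME about jumping levels). A standalone sketch of that stack
discipline, with assumed tag tables (the real listOpen, listClose and
listItem mappings are defined elsewhere in extract.py):

    listOpen = {'*': '<ul>', '#': '<ol>', ';': '<dl>', ':': '<dl>'}
    listClose = {'*': '</ul>', '#': '</ol>', ';': '</dl>', ':': '</dl>'}
    listItem = {'*': '<li>%s</li>', '#': '<li>%s</li>',
                ';': '<dt>%s</dt>', ':': '<dd>%s</dd>'}

    def lists_to_html(lines):
        page, level = [], ''
        for line in lines:
            # keep the longest common prefix of open levels and markers
            l = 0
            while l < len(level) and l < len(line) and level[l] == line[l]:
                l += 1
            for c in reversed(level[l:]):        # close mismatched levels
                page.append(listClose[c])
            level = level[:l]
            if l < len(line) and line[l] in '*#;:':  # open one new level
                level += line[l]
                page.append(listOpen[line[l]])
                l += 1
            if level:
                page.append(listItem[level[-1]] % line[l:].strip())
        for c in reversed(level):                # close what remains open
            page.append(listClose[c])
        return '\n'.join(page)

    print(lists_to_html(['* a', '*# b', '* c']))
    # <ul> <li>a</li> <ol> <li>b</li> </ol> <li>c</li> </ul>, one tag per line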
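
The new Template/TemplateArg classes implement MediaWiki's tplarg rule:
{{{name|default}}} takes the value supplied by the caller when present,
otherwise its default, and both the name and the default may themselves
require expansion. A simplified, non-recursive sketch of just the fallback
rule (the real classes also parse nesting via findMatchingBraces and guard
recursion depth):

    import re

    tplarg = re.compile(r'{{{([^{}|]*)(?:\|([^{}]*))?}}}')

    def subst_params(body, params):
        def repl(m):
            name, default = m.group(1), m.group(2) or ''
            return params.get(name, default)
        return tplarg.sub(repl, body)

    print(subst_params('Hello {{{name|world}}}!', {}))                # Hello world!
    print(subst_params('Hello {{{name|world}}}!', {'name': 'Wiki'}))  # Hello Wiki!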
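
The parameter-parsing regex change (the hunk at -978 above) excludes
apostrophes from the name part, so an '=' hidden inside wiki quote markup no
longer turns an unnamed parameter into a bogus named one. A small
demonstration with a hypothetical parameter value:

    import re

    old = re.compile(' *([^=]*?) *=(.*)', re.DOTALL)
    new = re.compile(" *([^=']*?) *=(.*)", re.DOTALL)

    param = "'''word''' = 1"               # hypothetical parameter text
    print(old.match(param).group(1))       # '''word''' -- parsed as a name
    print(new.match(param))                # None -- stays an unnamed param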