From 8f1b434a80608e1e313d38d263ed7c79c9ee75a9 Mon Sep 17 00:00:00 2001 From: attardi Date: Tue, 24 Jan 2023 19:02:27 +0100 Subject: [PATCH] Fixed handling of templates. --- wikiextractor/WikiExtractor.py | 20 ++++++++++---------- wikiextractor/extract.py | 17 ++++++++++------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index c195a19..830235d 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # ============================================================================= -# Version: 3.0 (July 22, 2020) +# Version: 3.0 (January 24, 2023) # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa # # Contributors: @@ -17,7 +17,7 @@ # Nick Ulven (nulven@github) # # ============================================================================= -# Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it). +# Copyright (c) 2009-2023. Giuseppe Attardi (attardi@di.unipi.it). # ============================================================================= # This file is part of Tanl. # @@ -68,7 +68,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces # =========================================================================== # Program version -__version__ = '3.0.6' +__version__ = '3.0.7' ## # Defined in @@ -194,6 +194,7 @@ def load_templates(file, output_file=None): """ Load templates from :param file:. :param output_file: file where to save templates and modules. + :return: number of templates loaded. """ global templateNamespace global moduleNamespace, modulePrefix @@ -335,7 +336,7 @@ def collect_pages(text): def process_dump(input_file, template_file, out_file, file_size, file_compress, - process_count, html_safe): + process_count, html_safe, expand_templates=True): """ :param input_file: name of the wikipedia dump file; '-' to read from stdin :param template_file: optional file with template definitions. @@ -343,6 +344,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, :param file_size: max size of each extracted file, or None for no max (one file) :param file_compress: whether to compress files with bzip. :param process_count: number of extraction processes to spawn. + :html_safe: whether to convert entities in text to HTML. + :param expand_templates: whether to expand templates. """ global knownNamespaces global templateNamespace @@ -528,7 +531,7 @@ minFileSize = 200 * 1024 def main(): global acceptedNamespaces - global expand_templates, templateCache + global templateCache parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.RawDescriptionHelpFormatter, @@ -555,7 +558,7 @@ def main(): help="accepted namespaces") groupP.add_argument("--templates", help="use or create file containing templates") - groupP.add_argument("--no-templates", action="store_false", + groupP.add_argument("--no-templates", action="store_true", help="Do not expand templates") groupP.add_argument("--html-safe", default=True, help="use to produce HTML safe output within ...") @@ -582,8 +585,6 @@ def main(): Extractor.keepLinks = True Extractor.to_json = args.json - expand_templates = args.no_templates - try: power = 'kmg'.find(args.bytes[-1].lower()) + 1 # 0 bytes means put a single article per file. @@ -636,8 +637,7 @@ def main(): return process_dump(input_file, args.templates, output_path, file_size, - args.compress, args.processes, args.html_safe) - + args.compress, args.processes, args.html_safe, not args.no_templates) if __name__ == '__main__': main() diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py index 5a3bd5a..a00e23d 100644 --- a/wikiextractor/extract.py +++ b/wikiextractor/extract.py @@ -26,7 +26,6 @@ from urllib.parse import quote as urlencode from html.entities import name2codepoint import logging import time -import pdb # DEBUG # ---------------------------------------------------------------------- @@ -82,7 +81,6 @@ def clean(extractor, text, expand_templates=False, html_safe=True): if expand_templates: # expand templates # See: http://www.mediawiki.org/wiki/Help:Templates - pdb.set_trace() # DEBUG text = extractor.expandTemplates(text) else: # Drop transclusions (template, parser functions) @@ -830,7 +828,7 @@ class Template(list): # {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing # "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s. - #logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self) + logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self) if depth > extractor.maxParameterRecursionLevels: extractor.recursion_exceeded_3_errs += 1 @@ -952,6 +950,7 @@ class Extractor(): e.g. "## Section 1" """ self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))] + #self.magicWords['namespacenumber'] = '0' # for article, self.magicWords['pagename'] = self.title self.magicWords['fullpagename'] = self.title self.magicWords['currentyear'] = time.strftime('%Y') @@ -1008,7 +1007,7 @@ class Extractor(): # Expand templates maxTemplateRecursionLevels = 30 - maxParameterRecursionLevels = 10 + maxParameterRecursionLevels = 16 # check for template beginning reOpen = re.compile('(? %s %s', functionName, ret) + # logging.debug('parserFunction> %s %s', args[1], ret) return ret if functionName in parserFunctions: ret = parserFunctions[functionName](*args) - # logging.debug('parserFunction> %s %s', functionName, ret) + # logging.debug('parserFunction> %s(%s) %s', functionName, args, ret) return ret except: return "" # FIXME: fix errors @@ -1851,6 +1854,6 @@ def define_template(title, page): text = reIncludeonly.sub('', text) if text: - if title in templates: + if title in templates and templates[title] != text: logging.warn('Redefining: %s', title) templates[title] = text