Fixed handling of templates.

This commit is contained in:
attardi 2023-01-24 19:02:27 +01:00
parent f0ca16c3e9
commit 8f1b434a80
2 changed files with 20 additions and 17 deletions

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# ============================================================================= # =============================================================================
# Version: 3.0 (July 22, 2020) # Version: 3.0 (January 24, 2023)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
# #
# Contributors: # Contributors:
@ -17,7 +17,7 @@
# Nick Ulven (nulven@github) # Nick Ulven (nulven@github)
# #
# ============================================================================= # =============================================================================
# Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it). # Copyright (c) 2009-2023. Giuseppe Attardi (attardi@di.unipi.it).
# ============================================================================= # =============================================================================
# This file is part of Tanl. # This file is part of Tanl.
# #
@ -68,7 +68,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
# =========================================================================== # ===========================================================================
# Program version # Program version
__version__ = '3.0.6' __version__ = '3.0.7'
## ##
# Defined in <siteinfo> # Defined in <siteinfo>
@ -194,6 +194,7 @@ def load_templates(file, output_file=None):
""" """
Load templates from :param file:. Load templates from :param file:.
:param output_file: file where to save templates and modules. :param output_file: file where to save templates and modules.
:return: number of templates loaded.
""" """
global templateNamespace global templateNamespace
global moduleNamespace, modulePrefix global moduleNamespace, modulePrefix
@ -335,7 +336,7 @@ def collect_pages(text):
def process_dump(input_file, template_file, out_file, file_size, file_compress, def process_dump(input_file, template_file, out_file, file_size, file_compress,
process_count, html_safe): process_count, html_safe, expand_templates=True):
""" """
:param input_file: name of the wikipedia dump file; '-' to read from stdin :param input_file: name of the wikipedia dump file; '-' to read from stdin
:param template_file: optional file with template definitions. :param template_file: optional file with template definitions.
@ -343,6 +344,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
:param file_size: max size of each extracted file, or None for no max (one file) :param file_size: max size of each extracted file, or None for no max (one file)
:param file_compress: whether to compress files with bzip. :param file_compress: whether to compress files with bzip.
:param process_count: number of extraction processes to spawn. :param process_count: number of extraction processes to spawn.
:param html_safe: whether to convert entities in text to HTML.
:param expand_templates: whether to expand templates.
""" """
global knownNamespaces global knownNamespaces
global templateNamespace global templateNamespace
@ -528,7 +531,7 @@ minFileSize = 200 * 1024
def main(): def main():
global acceptedNamespaces global acceptedNamespaces
global expand_templates, templateCache global templateCache
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
@ -555,7 +558,7 @@ def main():
help="accepted namespaces") help="accepted namespaces")
groupP.add_argument("--templates", groupP.add_argument("--templates",
help="use or create file containing templates") help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false", groupP.add_argument("--no-templates", action="store_true",
help="Do not expand templates") help="Do not expand templates")
groupP.add_argument("--html-safe", default=True, groupP.add_argument("--html-safe", default=True,
help="use to produce HTML safe output within <doc>...</doc>") help="use to produce HTML safe output within <doc>...</doc>")
@ -582,8 +585,6 @@ def main():
Extractor.keepLinks = True Extractor.keepLinks = True
Extractor.to_json = args.json Extractor.to_json = args.json
expand_templates = args.no_templates
try: try:
power = 'kmg'.find(args.bytes[-1].lower()) + 1 power = 'kmg'.find(args.bytes[-1].lower()) + 1
# 0 bytes means put a single article per file. # 0 bytes means put a single article per file.
@ -636,8 +637,7 @@ def main():
return return
process_dump(input_file, args.templates, output_path, file_size, process_dump(input_file, args.templates, output_path, file_size,
args.compress, args.processes, args.html_safe) args.compress, args.processes, args.html_safe, not args.no_templates)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -26,7 +26,6 @@ from urllib.parse import quote as urlencode
from html.entities import name2codepoint from html.entities import name2codepoint
import logging import logging
import time import time
import pdb # DEBUG
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
@ -82,7 +81,6 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
if expand_templates: if expand_templates:
# expand templates # expand templates
# See: http://www.mediawiki.org/wiki/Help:Templates # See: http://www.mediawiki.org/wiki/Help:Templates
pdb.set_trace() # DEBUG
text = extractor.expandTemplates(text) text = extractor.expandTemplates(text)
else: else:
# Drop transclusions (template, parser functions) # Drop transclusions (template, parser functions)
@ -830,7 +828,7 @@ class Template(list):
# {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing # {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
# "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s. # "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.
#logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self) logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
if depth > extractor.maxParameterRecursionLevels: if depth > extractor.maxParameterRecursionLevels:
extractor.recursion_exceeded_3_errs += 1 extractor.recursion_exceeded_3_errs += 1
@ -952,6 +950,7 @@ class Extractor():
e.g. "## Section 1" e.g. "## Section 1"
""" """
self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))] self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
#self.magicWords['namespacenumber'] = '0' # for article,
self.magicWords['pagename'] = self.title self.magicWords['pagename'] = self.title
self.magicWords['fullpagename'] = self.title self.magicWords['fullpagename'] = self.title
self.magicWords['currentyear'] = time.strftime('%Y') self.magicWords['currentyear'] = time.strftime('%Y')
@ -1008,7 +1007,7 @@ class Extractor():
# Expand templates # Expand templates
maxTemplateRecursionLevels = 30 maxTemplateRecursionLevels = 30
maxParameterRecursionLevels = 10 maxParameterRecursionLevels = 16
# check for template beginning # check for template beginning
reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL) reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)
@ -1764,6 +1763,8 @@ parserFunctions = {
'int': lambda string, *rest: str(int(string)), 'int': lambda string, *rest: str(int(string)),
'padleft': lambda char, width, string: string.ljust(char, int(pad)), # CHECK_ME
} }
@ -1771,6 +1772,8 @@ def callParserFunction(functionName, args, frame):
""" """
Parser functions have similar syntax as templates, except that Parser functions have similar syntax as templates, except that
the first argument is everything after the first colon. the first argument is everything after the first colon.
:param functionName: name of the parser function
:param args: the arguments to the function
:return: the result of the invocation, None in case of failure. :return: the result of the invocation, None in case of failure.
http://meta.wikimedia.org/wiki/Help:ParserFunctions http://meta.wikimedia.org/wiki/Help:ParserFunctions
@ -1780,11 +1783,11 @@ def callParserFunction(functionName, args, frame):
if functionName == '#invoke': if functionName == '#invoke':
# special handling of frame # special handling of frame
ret = sharp_invoke(args[0].strip(), args[1].strip(), frame) ret = sharp_invoke(args[0].strip(), args[1].strip(), frame)
# logging.debug('parserFunction> %s %s', functionName, ret) # logging.debug('parserFunction> %s %s', args[1], ret)
return ret return ret
if functionName in parserFunctions: if functionName in parserFunctions:
ret = parserFunctions[functionName](*args) ret = parserFunctions[functionName](*args)
# logging.debug('parserFunction> %s %s', functionName, ret) # logging.debug('parserFunction> %s(%s) %s', functionName, args, ret)
return ret return ret
except: except:
return "" # FIXME: fix errors return "" # FIXME: fix errors
@ -1851,6 +1854,6 @@ def define_template(title, page):
text = reIncludeonly.sub('', text) text = reIncludeonly.sub('', text)
if text: if text:
if title in templates: if title in templates and templates[title] != text:
logging.warn('Redefining: %s', title) logging.warn('Redefining: %s', title)
templates[title] = text templates[title] = text