Fixed handling of templates.
This commit is contained in:
parent
f0ca16c3e9
commit
8f1b434a80
@ -2,7 +2,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Version: 3.0 (July 22, 2020)
|
# Version: 3.0 (January 24, 2023)
|
||||||
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
|
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
|
||||||
#
|
#
|
||||||
# Contributors:
|
# Contributors:
|
||||||
@ -17,7 +17,7 @@
|
|||||||
# Nick Ulven (nulven@github)
|
# Nick Ulven (nulven@github)
|
||||||
#
|
#
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it).
|
# Copyright (c) 2009-2023. Giuseppe Attardi (attardi@di.unipi.it).
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# This file is part of Tanl.
|
# This file is part of Tanl.
|
||||||
#
|
#
|
||||||
@ -68,7 +68,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
|
|||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
|
|
||||||
# Program version
|
# Program version
|
||||||
__version__ = '3.0.6'
|
__version__ = '3.0.7'
|
||||||
|
|
||||||
##
|
##
|
||||||
# Defined in <siteinfo>
|
# Defined in <siteinfo>
|
||||||
@ -194,6 +194,7 @@ def load_templates(file, output_file=None):
|
|||||||
"""
|
"""
|
||||||
Load templates from :param file:.
|
Load templates from :param file:.
|
||||||
:param output_file: file where to save templates and modules.
|
:param output_file: file where to save templates and modules.
|
||||||
|
:return: number of templates loaded.
|
||||||
"""
|
"""
|
||||||
global templateNamespace
|
global templateNamespace
|
||||||
global moduleNamespace, modulePrefix
|
global moduleNamespace, modulePrefix
|
||||||
@ -335,7 +336,7 @@ def collect_pages(text):
|
|||||||
|
|
||||||
|
|
||||||
def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||||
process_count, html_safe):
|
process_count, html_safe, expand_templates=True):
|
||||||
"""
|
"""
|
||||||
:param input_file: name of the wikipedia dump file; '-' to read from stdin
|
:param input_file: name of the wikipedia dump file; '-' to read from stdin
|
||||||
:param template_file: optional file with template definitions.
|
:param template_file: optional file with template definitions.
|
||||||
@ -343,6 +344,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
|||||||
:param file_size: max size of each extracted file, or None for no max (one file)
|
:param file_size: max size of each extracted file, or None for no max (one file)
|
||||||
:param file_compress: whether to compress files with bzip.
|
:param file_compress: whether to compress files with bzip.
|
||||||
:param process_count: number of extraction processes to spawn.
|
:param process_count: number of extraction processes to spawn.
|
||||||
|
:param html_safe: whether to convert entities in text to HTML.
|
||||||
|
:param expand_templates: whether to expand templates.
|
||||||
"""
|
"""
|
||||||
global knownNamespaces
|
global knownNamespaces
|
||||||
global templateNamespace
|
global templateNamespace
|
||||||
@ -528,7 +531,7 @@ minFileSize = 200 * 1024
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
global acceptedNamespaces
|
global acceptedNamespaces
|
||||||
global expand_templates, templateCache
|
global templateCache
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
@ -555,7 +558,7 @@ def main():
|
|||||||
help="accepted namespaces")
|
help="accepted namespaces")
|
||||||
groupP.add_argument("--templates",
|
groupP.add_argument("--templates",
|
||||||
help="use or create file containing templates")
|
help="use or create file containing templates")
|
||||||
groupP.add_argument("--no-templates", action="store_false",
|
groupP.add_argument("--no-templates", action="store_true",
|
||||||
help="Do not expand templates")
|
help="Do not expand templates")
|
||||||
groupP.add_argument("--html-safe", default=True,
|
groupP.add_argument("--html-safe", default=True,
|
||||||
help="use to produce HTML safe output within <doc>...</doc>")
|
help="use to produce HTML safe output within <doc>...</doc>")
|
||||||
@ -582,8 +585,6 @@ def main():
|
|||||||
Extractor.keepLinks = True
|
Extractor.keepLinks = True
|
||||||
Extractor.to_json = args.json
|
Extractor.to_json = args.json
|
||||||
|
|
||||||
expand_templates = args.no_templates
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
power = 'kmg'.find(args.bytes[-1].lower()) + 1
|
power = 'kmg'.find(args.bytes[-1].lower()) + 1
|
||||||
# 0 bytes means put a single article per file.
|
# 0 bytes means put a single article per file.
|
||||||
@ -636,8 +637,7 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
process_dump(input_file, args.templates, output_path, file_size,
|
process_dump(input_file, args.templates, output_path, file_size,
|
||||||
args.compress, args.processes, args.html_safe)
|
args.compress, args.processes, args.html_safe, not args.no_templates)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
@ -26,7 +26,6 @@ from urllib.parse import quote as urlencode
|
|||||||
from html.entities import name2codepoint
|
from html.entities import name2codepoint
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
import pdb # DEBUG
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
@ -82,7 +81,6 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
|
|||||||
if expand_templates:
|
if expand_templates:
|
||||||
# expand templates
|
# expand templates
|
||||||
# See: http://www.mediawiki.org/wiki/Help:Templates
|
# See: http://www.mediawiki.org/wiki/Help:Templates
|
||||||
pdb.set_trace() # DEBUG
|
|
||||||
text = extractor.expandTemplates(text)
|
text = extractor.expandTemplates(text)
|
||||||
else:
|
else:
|
||||||
# Drop transclusions (template, parser functions)
|
# Drop transclusions (template, parser functions)
|
||||||
@ -830,7 +828,7 @@ class Template(list):
|
|||||||
# {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
|
# {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
|
||||||
# "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.
|
# "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.
|
||||||
|
|
||||||
#logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
|
logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
|
||||||
|
|
||||||
if depth > extractor.maxParameterRecursionLevels:
|
if depth > extractor.maxParameterRecursionLevels:
|
||||||
extractor.recursion_exceeded_3_errs += 1
|
extractor.recursion_exceeded_3_errs += 1
|
||||||
@ -952,6 +950,7 @@ class Extractor():
|
|||||||
e.g. "## Section 1"
|
e.g. "## Section 1"
|
||||||
"""
|
"""
|
||||||
self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
|
self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
|
||||||
|
#self.magicWords['namespacenumber'] = '0' # for article,
|
||||||
self.magicWords['pagename'] = self.title
|
self.magicWords['pagename'] = self.title
|
||||||
self.magicWords['fullpagename'] = self.title
|
self.magicWords['fullpagename'] = self.title
|
||||||
self.magicWords['currentyear'] = time.strftime('%Y')
|
self.magicWords['currentyear'] = time.strftime('%Y')
|
||||||
@ -1008,7 +1007,7 @@ class Extractor():
|
|||||||
# Expand templates
|
# Expand templates
|
||||||
|
|
||||||
maxTemplateRecursionLevels = 30
|
maxTemplateRecursionLevels = 30
|
||||||
maxParameterRecursionLevels = 10
|
maxParameterRecursionLevels = 16
|
||||||
|
|
||||||
# check for template beginning
|
# check for template beginning
|
||||||
reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)
|
reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)
|
||||||
@ -1764,6 +1763,8 @@ parserFunctions = {
|
|||||||
|
|
||||||
'int': lambda string, *rest: str(int(string)),
|
'int': lambda string, *rest: str(int(string)),
|
||||||
|
|
||||||
|
'padleft': lambda char, width, string: string.ljust(char, int(pad)), # CHECK_ME
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1771,6 +1772,8 @@ def callParserFunction(functionName, args, frame):
|
|||||||
"""
|
"""
|
||||||
Parser functions have similar syntax as templates, except that
|
Parser functions have similar syntax as templates, except that
|
||||||
the first argument is everything after the first colon.
|
the first argument is everything after the first colon.
|
||||||
|
:param functionName: name of the parser function
|
||||||
|
:param args: the arguments to the function
|
||||||
:return: the result of the invocation, None in case of failure.
|
:return: the result of the invocation, None in case of failure.
|
||||||
|
|
||||||
http://meta.wikimedia.org/wiki/Help:ParserFunctions
|
http://meta.wikimedia.org/wiki/Help:ParserFunctions
|
||||||
@ -1780,11 +1783,11 @@ def callParserFunction(functionName, args, frame):
|
|||||||
if functionName == '#invoke':
|
if functionName == '#invoke':
|
||||||
# special handling of frame
|
# special handling of frame
|
||||||
ret = sharp_invoke(args[0].strip(), args[1].strip(), frame)
|
ret = sharp_invoke(args[0].strip(), args[1].strip(), frame)
|
||||||
# logging.debug('parserFunction> %s %s', functionName, ret)
|
# logging.debug('parserFunction> %s %s', args[1], ret)
|
||||||
return ret
|
return ret
|
||||||
if functionName in parserFunctions:
|
if functionName in parserFunctions:
|
||||||
ret = parserFunctions[functionName](*args)
|
ret = parserFunctions[functionName](*args)
|
||||||
# logging.debug('parserFunction> %s %s', functionName, ret)
|
# logging.debug('parserFunction> %s(%s) %s', functionName, args, ret)
|
||||||
return ret
|
return ret
|
||||||
except:
|
except:
|
||||||
return "" # FIXME: fix errors
|
return "" # FIXME: fix errors
|
||||||
@ -1851,6 +1854,6 @@ def define_template(title, page):
|
|||||||
text = reIncludeonly.sub('', text)
|
text = reIncludeonly.sub('', text)
|
||||||
|
|
||||||
if text:
|
if text:
|
||||||
if title in templates:
|
if title in templates and templates[title] != text:
|
||||||
logging.warn('Redefining: %s', title)
|
logging.warn('Redefining: %s', title)
|
||||||
templates[title] = text
|
templates[title] = text
|
||||||
|
Loading…
Reference in New Issue
Block a user