Restored template expansion and HTML preserving flag.

Giuseppe Attardi 2022-03-07 10:31:01 +01:00
parent 1053fe2030
commit 05b5cc7e51
3 changed files with 241 additions and 119 deletions

View File

@@ -79,7 +79,6 @@ knownNamespaces = set(['Template'])
# The namespace used for template definitions
# It is the name associated with namespace key=10 in the siteinfo header.
templateNamespace = ''
templatePrefix = ''
##
# The namespace used for module definitions
@@ -196,8 +195,7 @@ def load_templates(file, output_file=None):
    Load templates from :param file:.
    :param output_file: file where to save templates and modules.
    """
    global templateNamespace, templatePrefix
    templatePrefix = templateNamespace + ':'
    global templateNamespace
    global moduleNamespace, modulePrefix
    modulePrefix = moduleNamespace + ':'
    articles = 0
@@ -220,6 +218,13 @@ def load_templates(file, output_file=None):
            page = []
        elif tag == 'title':
            title = m.group(3)
            if not output_file and not templateNamespace: # do not know it yet
                # we reconstruct it from the first title
                colon = title.find(':')
                if colon > 1:
                    templateNamespace = title[:colon]
                    Extractor.templatePrefix = title[:colon + 1]
            # FIXME: should reconstruct also moduleNamespace
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
@@ -233,18 +238,11 @@ def load_templates(file, output_file=None):
        elif inText:
            page.append(line)
        elif tag == '/page':
            if not output_file and not templateNamespace: # do not know it yet
                # we reconstruct it from the first title
                colon = title.find(':')
                if colon > 1:
                    templateNamespace = title[:colon]
                    templatePrefix = title[:colon + 1]
            # FIXME: should reconstruct also moduleNamespace
            if title.startswith(templatePrefix):
            if title.startswith(Extractor.templatePrefix):
                define_template(title, page)
                templates += 1
            # save templates and modules to file
            if output_file and (title.startswith(templatePrefix) or
            if output_file and (title.startswith(Extractor.templatePrefix) or
                                title.startswith(modulePrefix)):
                output.write('<page>\n')
                output.write(' <title>%s</title>\n' % title)
@@ -279,6 +277,63 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
        return open(filename, mode, encoding=encoding)


def collect_pages(text):
    """
    :param text: the text of a wikipedia file dump.
    """
    # we collect individual lines, since str.join() is significantly faster
    # than concatenation
    page = []
    id = ''
    revid = ''
    last_id = ''
    inText = False
    redirect = False
    for line in text:
        if '<' not in line: # faster than doing re.search()
            if inText:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'id' and id: # <revision> <id></id> </revision>
            revid = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'redirect':
            redirect = True
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4: # open-close
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            colon = title.find(':')
            if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                    not redirect and not title.startswith(templateNamespace)):
                yield (id, revid, title, page)
                last_id = id
            id = ''
            revid = ''
            page = []
            inText = False
            redirect = False
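
For reference, a minimal sketch (not from the commit) of the tag matching that collect_pages relies on, reusing the tagRE pattern defined at the top of the file; the sample dump lines are invented:

import re

# same pattern as the module-level tagRE:
# group(2) is the tag name, group(3) the text between the open and close tags
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')

samples = [
    '    <title>Template:Infobox person</title>',     # invented sample lines
    '    <id>12345</id>',
    '    <text xml:space="preserve">Some wikitext',
]
for sample in samples:
    m = tagRE.search(sample)
    print(m.group(2), repr(m.group(3)))
# -> title 'Template:Infobox person' / id '12345' / text 'Some wikitext'
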
def process_dump(input_file, template_file, out_file, file_size, file_compress,
                 process_count, html_safe):
    """
@@ -290,7 +345,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    :param process_count: number of extraction processes to spawn.
    """
    global knownNamespaces
    global templateNamespace, templatePrefix
    global templateNamespace
    global moduleNamespace, modulePrefix
    urlbase = '' # This is obtained from <siteinfo>
@@ -313,7 +368,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
            knownNamespaces.add(m.group(3))
            if re.search('key="10"', line):
                templateNamespace = m.group(3)
                templatePrefix = templateNamespace + ':'
                Extractor.templatePrefix = templateNamespace + ':'
            elif re.search('key="828"', line):
                moduleNamespace = m.group(3)
                modulePrefix = moduleNamespace + ':'
@@ -383,56 +438,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    # we collect individual lines, since str.join() is significantly faster
    # than concatenation
    page = []
    id = ''
    revid = ''
    last_id = ''
    ordinal = 0 # page count
    inText = False
    redirect = False
    for line in input:
        if '<' not in line: # faster than doing re.search()
            if inText:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'id' and id: # <revision> <id></id> </revision>
            revid = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'redirect':
            redirect = True
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4: # open-close
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            colon = title.find(':')
            if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                    not redirect and not title.startswith(templateNamespace)):
                job = (id, revid, urlbase, title, page, ordinal)
                jobs_queue.put(job) # goes to any available extract_process
                last_id = id
                ordinal += 1
            id = ''
            revid = ''
            page = []
    for id, revid, title, page in collect_pages(input):
        job = (id, revid, urlbase, title, page, ordinal)
        jobs_queue.put(job) # goes to any available extract_process
        ordinal += 1
    input.close()
@@ -467,7 +478,7 @@ def extract_process(jobs_queue, output_queue, html_safe):
    :html_safe: whether to convert entities in text to HTML.
    """
    while True:
        job = jobs_queue.get() # job is (id, revid, urlbase, title, page, ordinal)
        job = jobs_queue.get() # job is (id, revid, urlbase, title, page)
        if job:
            out = StringIO() # memory buffer
            Extractor(*job[:-1]).extract(out, html_safe) # (id, urlbase, title, page)
@@ -479,7 +490,8 @@ def extract_process(jobs_queue, output_queue, html_safe):
def reduce_process(output_queue, output):
    """Pull finished article text, write series of files (or stdout)
    """
    Pull finished article text, write series of files (or stdout)
    :param output_queue: text to be output.
    :param output: file object where to print.
    """
@@ -515,7 +527,7 @@ minFileSize = 200 * 1024
def main():
    global urlbase, acceptedNamespaces
    global acceptedNamespaces
    global expand_templates, templateCache
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
@@ -609,24 +621,10 @@ def main():
                with open(args.templates) as file:
                    load_templates(file)
        with open(input_file) as file:
            page = file.read()
            ids = re.findall(r'<id>(\d*?)</id>', page)
            id = ids[0] if ids else ''
            revid = ids[1] if len(ids) > 1 else ''
            m = re.search(r'<title>(.*?)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            m = re.search(r'<base>(.*?)</base>', page)
            if m:
                base = m.group(1)
                urlbase = base[:base.rfind("/")]
            else:
                urlbase = ''
            Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
        urlbase = ''
        with open(input_file) as input:
            for id, revid, title, page in collect_pages(input):
                Extractor(id, revid, urlbase, title, page).extract(sys.stdout)
        return

    output_path = args.output

View File

@@ -26,6 +26,7 @@ from urllib.parse import quote as urlencode
from html.entities import name2codepoint
import logging
import time
import pdb # DEBUG
# ----------------------------------------------------------------------
@@ -81,6 +82,7 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
    if expand_templates:
        # expand templates
        # See: http://www.mediawiki.org/wiki/Help:Templates
        pdb.set_trace() # DEBUG
        text = extractor.expandTemplates(text)
    else:
        # Drop transclusions (template, parser functions)
@@ -199,7 +201,12 @@ def compact(text, mark_headers=False):
    for line in text.split('\n'):
        if not line:
            if len(listLevel): # implies Extractor.HtmlFormatting
                for c in reversed(listLevel):
                    page.append(listClose[c])
                listLevel = ''
            continue
        # Handle section titles
        m = section.match(line)
        if m:
@@ -227,36 +234,35 @@ def compact(text, mark_headers=False):
                page.append(title)
        # handle indents
        elif line[0] == ':':
            # page.append(line.lstrip(':*#;'))
            continue
            page.append(line.lstrip(':'))
        # handle lists
        elif line[0] in '*#;:':
        # @see https://www.mediawiki.org/wiki/Help:Formatting
        elif line[0] in '*#;':
            if Extractor.HtmlFormatting:
                i = 0
                for c, n in zip_longest(listLevel, line, fillvalue=''):
                    if not n or n not in '*#;:':
                        if c:
                            page.append(listClose[c])
                            listLevel = listLevel[:-1]
                            continue
                        else:
                            break
                    # n != ''
                    if c != n and (not c or (c not in ';:' and n not in ';:')):
                        if c:
                            # close level
                            page.append(listClose[c])
                            listLevel = listLevel[:-1]
                        listLevel += n
                        page.append(listOpen[n])
                    i += 1
                n = line[i - 1] # last list char
                line = line[i:].strip()
                if line: # FIXME: n is '"'
                    page.append(listItem[n] % line)
                # close extra levels
                l = 0
                for c in listLevel:
                    if l < len(line) and c != line[l]:
                        for extra in reversed(listLevel[l:]):
                            page.append(listClose[extra])
                        listLevel = listLevel[:l]
                        break
                    l += 1
                if l < len(line) and line[l] in '*#;:':
                    # add new level (only one, no jumps)
                    # FIXME: handle jumping levels
                    type = line[l]
                    page.append(listOpen[type])
                    listLevel += type
                    line = line[l+1:].strip()
                else:
                    # continue on same level
                    type = line[l-1]
                    line = line[l:].strip()
                page.append(listItem[type] % line)
            else:
                continue
        elif len(listLevel):
        elif len(listLevel): # implies Extractor.HtmlFormatting
            for c in reversed(listLevel):
                page.append(listClose[c])
            listLevel = []
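
The branch above keeps the currently open list levels in listLevel and emits matching open and close tags when Extractor.HtmlFormatting is set. A self-contained sketch of that bookkeeping (not from the commit; the toy OPEN/CLOSE/ITEM tables stand in for listOpen, listClose and listItem, and the ';'/':' definition-list cases are omitted):

OPEN = {'*': '<ul>', '#': '<ol>'}
CLOSE = {'*': '</ul>', '#': '</ol>'}
ITEM = '<li>%s</li>'

def render_lists(lines):
    out, level = [], ''
    for line in lines:
        marks = ''
        while line[:1] in OPEN:              # leading '*'/'#' markers
            marks += line[0]
            line = line[1:]
        while level and not marks.startswith(level):
            out.append(CLOSE[level[-1]])     # close levels that are no longer open
            level = level[:-1]
        for c in marks[len(level):]:
            out.append(OPEN[c])              # open the missing levels
            level += c
        out.append(ITEM % line.strip() if marks else line)
    out.extend(CLOSE[c] for c in reversed(level))
    return out

print(render_lists(['* one', '* two', '** nested', '# renumbered']))
# ['<ul>', '<li>one</li>', '<li>two</li>', '<ul>', '<li>nested</li>',
#  '</ul>', '</ul>', '<ol>', '<li>renumbered</li>', '</ol>']
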
@@ -786,6 +792,114 @@ spaces = re.compile(r' {2,}')
# Matches dots
dots = re.compile(r'\.{4,}')
# ======================================================================
class Template(list):
    """
    A Template is a list of TemplateText or TemplateArgs
    """

    @classmethod
    def parse(cls, body):
        tpl = Template()
        # we must handle nesting, s.a.
        # {{{1|{{PAGENAME}}}
        # {{{italics|{{{italic|}}}
        # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|
        #
        start = 0
        for s,e in findMatchingBraces(body, 3):
            tpl.append(TemplateText(body[start:s]))
            tpl.append(TemplateArg(body[s+3:e-3]))
            start = e
        tpl.append(TemplateText(body[start:])) # leftover
        return tpl

    def subst(self, params, extractor, depth=0):
        # We perform parameter substitutions recursively.
        # We also limit the maximum number of iterations to avoid too long or
        # even endless loops (in case of malformed input).
        # :see: http://meta.wikimedia.org/wiki/Help:Expansion#Distinction_between_variables.2C_parser_functions.2C_and_templates
        #
        # Parameter values are assigned to parameters in two (?) passes.
        # Therefore a parameter name in a template can depend on the value of
        # another parameter of the same template, regardless of the order in
        # which they are specified in the template call, for example, using
        # Template:ppp containing "{{{{{{p}}}}}}", {{ppp|p=q|q=r}} and even
        # {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
        # "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.
        #logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
        if depth > extractor.maxParameterRecursionLevels:
            extractor.recursion_exceeded_3_errs += 1
            return ''
        return ''.join([tpl.subst(params, extractor, depth) for tpl in self])

    def __str__(self):
        return ''.join([str(x) for x in self])


class TemplateText(str):
    """Fixed text of template"""

    def subst(self, params, extractor, depth):
        return self


class TemplateArg():
    """
    parameter to a template.
    Has a name and a default value, both of which are Templates.
    """

    def __init__(self, parameter):
        """
        :param parameter: the parts of a tplarg.
        """
        # the parameter name itself might contain templates, e.g.:
        # appointe{{#if:{{{appointer14|}}}|r|d}}14|
        # 4|{{{{{subst|}}}CURRENTYEAR}}
        # any parts in a tplarg after the first (the parameter default) are
        # ignored, and an equals sign in the first part is treated as plain text.
        #logging.debug('TemplateArg %s', parameter)
        parts = splitParts(parameter)
        self.name = Template.parse(parts[0])
        if len(parts) > 1:
            # This parameter has a default value
            self.default = Template.parse(parts[1])
        else:
            self.default = None

    def __str__(self):
        if self.default:
            return '{{{%s|%s}}}' % (self.name, self.default)
        else:
            return '{{{%s}}}' % self.name
    def subst(self, params, extractor, depth):
        """
        Substitute value for this argument from dict :param params:
        Use :param extractor: to evaluate expressions for name and default.
        Limit substitution to the maximum :param depth:.
        """
        # the parameter name itself might contain templates, e.g.:
        # appointe{{#if:{{{appointer14|}}}|r|d}}14|
        paramName = self.name.subst(params, extractor, depth+1)
        paramName = extractor.expandTemplates(paramName)
        res = ''
        if paramName in params:
            res = params[paramName] # use parameter value specified in template invocation
        elif self.default: # use the default value
            defaultValue = self.default.subst(params, extractor, depth+1)
            res = extractor.expandTemplates(defaultValue)
        #logging.debug('subst arg %d %s -> %s' % (depth, paramName, res))
        return res
# ======================================================================
substWords = 'subst:|safesubst:'
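
The parameter handling described in the comments of Template.subst and TemplateArg above is easiest to see on a tiny body. A hedged sketch, assuming the module is importable as wikiextractor.extract; StubExtractor is a made-up stand-in for the real Extractor, just enough to exercise the default-value path:

from wikiextractor.extract import Template

class StubExtractor:                      # hypothetical stand-in, not the real class
    maxParameterRecursionLevels = 16
    recursion_exceeded_3_errs = 0
    def expandTemplates(self, text):
        return text                       # no transclusion in this sketch

tpl = Template.parse('Hello {{{name|world}}}!')
print(tpl)                                                # Hello {{{name|world}}}!
print(tpl.subst({'name': 'Wikipedia'}, StubExtractor()))  # Hello Wikipedia!
print(tpl.subst({}, StubExtractor()))                     # Hello world!
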
@@ -811,6 +925,10 @@ class Extractor():
    # Whether to produce json instead of the default <doc> output format.
    toJson = False

    ##
    # Obtained from TemplateNamespace
    templatePrefix = ''

    def __init__(self, id, revid, urlbase, title, page):
        """
        :param page: a list of lines.
@@ -827,12 +945,13 @@ class Extractor():
        self.recursion_exceeded_3_errs = 0 # parameter recursion
        self.template_title_errs = 0

    def clean_text(self, text, mark_headers=False, expand_templates=False,
    def clean_text(self, text, mark_headers=False, expand_templates=True,
                   html_safe=True):
        """
        :param mark_headers: True to distinguish headers from paragraphs
          e.g. "## Section 1"
        """
        self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
        self.magicWords['pagename'] = self.title
        self.magicWords['fullpagename'] = self.title
        self.magicWords['currentyear'] = time.strftime('%Y')
@@ -978,7 +1097,11 @@ class Extractor():
            # The '=' might occur within an HTML attribute:
            # "&lt;ref name=value"
            # but we stop at first.
            m = re.match(' *([^=]*?) *=(.*)', param, re.DOTALL)
            # The '=' might occur within quotes:
            # ''''<span lang="pt-pt" xml:lang="pt-pt">cénicas</span>'''
            m = re.match(" *([^=']*?) *=(.*)", param, re.DOTALL)
            if m:
                # This is a named parameter. This case also handles parameter
                # assignments like "2=xxx", where the number of an unnamed
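
The effect of the tightened pattern above: an apostrophe before the first '=' now stops a positional value that merely contains '=' (for example bold markup around an HTML span) from being misread as a named parameter. A small check, not from the commit, with invented parameter strings:

import re

old = re.compile(' *([^=]*?) *=(.*)', re.DOTALL)
new = re.compile(" *([^=']*?) *=(.*)", re.DOTALL)

named = 'name=value'
positional = "'''<span lang=\"pt-pt\">cénicas</span>'''"

print(bool(old.match(named)), bool(new.match(named)))            # True True
print(bool(old.match(positional)), bool(new.match(positional)))  # True False
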
@@ -1273,7 +1396,7 @@ def findMatchingBraces(text, ldelim=0):
    if ldelim: # 2-3
        reOpen = re.compile('[{]{%d,}' % ldelim) # at least ldelim
        reNext = re.compile('[{]{2,}|}{2,}') # at least 2
        reNext = re.compile('[{]{2,}|}{2,}') # at least 2 open or close braces
    else:
        reOpen = re.compile('{{2,}|\[{2,}')
        reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}') # at least 2
@@ -1439,7 +1562,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
    # space]], but having in the system a redirect page with an empty title
    # causes numerous problems, so we'll live happier without it.
    if templateTitle:
        return templatePrefix + ucfirst(templateTitle)
        return Extractor.templatePrefix + ucfirst(templateTitle)
    else:
        return '' # caller may log as error
@@ -1489,7 +1612,7 @@ def sharp_expr(expr):
        expr = re.sub('mod', '%', expr)
        expr = re.sub('\bdiv\b', '/', expr)
        expr = re.sub('\bround\b', '|ROUND|', expr)
        return unicode(eval(expr))
        return str(eval(expr))
    except:
        return '<span class="error"></span>'
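
For context, sharp_expr backs the {{#expr:}} parser function by rewriting MediaWiki operators into Python and evaluating the result. A reduced sketch of that idea (not the exact code above: raw-string word boundaries, and only the mod and div operators):

import re

def eval_sharp_expr(expr):
    expr = re.sub(r'\bmod\b', '%', expr)   # MediaWiki 'mod' -> Python '%'
    expr = re.sub(r'\bdiv\b', '/', expr)   # MediaWiki 'div' -> Python '/'
    try:
        return str(eval(expr))
    except Exception:
        return '<span class="error"></span>'

print(eval_sharp_expr('3 + 4 * 2'))   # 11
print(eval_sharp_expr('7 mod 3'))     # 1
print(eval_sharp_expr('8 div 2'))     # 4.0
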
@@ -1675,7 +1798,7 @@ def callParserFunction(functionName, args, frame):
reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)
# These are built before spawning processes, hence thay are shared.
# These are built before spawning processes, hence they are shared.
templates = {}
redirects = {}
# cache of parser templates

View File

@@ -46,7 +46,8 @@ tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
def process_data(input_file, id, templates=False):
    """
    :param input_file: name of the wikipedia dump file.
    :param id: article id
    :param id: article id.
    :param templates: whether article is a template.
    """
    if input_file.lower().endswith(".bz2"):
@@ -105,9 +106,9 @@ def main():
    parser.add_argument("--id", default="1",
                        help="article number")
    parser.add_argument("--template", action="store_true",
                        help="template number")
                        help="whether article is a template")
    parser.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        version='%(prog)s ' + __version__,
                        help="print program version")
    args = parser.parse_args()