Better handling of encoding.

attardi 2020-12-05 19:27:01 +01:00
parent 0933664d70
commit a2e078f3be
3 changed files with 54 additions and 49 deletions

View File

@@ -49,7 +49,6 @@ collecting template definitions.
 import argparse
 import bz2
-import fileinput
 import logging
 import os.path
 import re # TODO use regex when it will be standard
@@ -112,7 +111,7 @@ modules = {
 # Output
-class NextFile(object):
+class NextFile():
 """
 Synchronous generation of next available file name.
@@ -143,7 +142,7 @@ class NextFile(object):
 return '%s/wiki_%02d' % (self._dirname(), self.file_index)
-class OutputSplitter(object):
+class OutputSplitter():
 """
 File-like object, that splits output to multiple files of a given max size.
@@ -203,7 +202,7 @@ def load_templates(file, output_file=None):
 if output_file:
 output = open(output_file, 'w')
 for line in file:
-line = line.decode('utf-8')
+#line = line.decode('utf-8')
 if '<' not in line: # faster than doing re.search()
 if inText:
 page.append(line)
@@ -238,18 +237,18 @@ def load_templates(file, output_file=None):
 # FIXME: should reconstruct also moduleNamespace
 if title.startswith(templatePrefix):
 define_template(title, page)
-templates += 1
 # save templates and modules to file
 if output_file and (title.startswith(templatePrefix) or
 title.startswith(modulePrefix)):
 output.write('<page>\n')
-output.write(' <title>%s</title>\n' % title)
+output.write(' <title>%s</title>\n' % title.encode('utf-8'))
 output.write(' <ns>10</ns>\n')
 output.write(' <text>')
 for line in page:
-output.write(line)
+output.write(line.encode('utf-8'))
 output.write(' </text>\n')
 output.write('</page>\n')
+templates += 1
 page = []
 articles += 1
 if articles % 100000 == 0:
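Background for the encode/decode churn in this hunk and the ones below: in Python 3, a file opened in text mode reads and writes str, while a binary-mode file deals in bytes, so whether an explicit .encode('utf-8') or .decode('utf-8') is needed depends entirely on how the stream was opened. A small self-contained illustration, using a throwaway file name:

    import os
    import tempfile

    path = os.path.join(tempfile.gettempdir(), 'encoding_demo.txt')  # throwaway file

    with open(path, 'w', encoding='utf-8') as out:   # text mode: write str
        out.write('título\n')

    with open(path, 'rb') as raw:                    # binary mode: read bytes
        data = raw.read()

    assert data == 'título\n'.encode('utf-8')
    assert data.decode('utf-8') == 'título\n'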
@@ -260,6 +259,20 @@ def load_templates(file, output_file=None):
 return templates
+def decode_open(filename, mode='rt', encoding='utf-8'):
+"""
+Open a file, decode and decompress, depending on extension `gz`, or 'bz2`.
+"""
+ext = os.path.splitext(filename)[1]
+if ext == '.gz':
+import gzip
+return gzip.open(filename, mode)
+elif ext == '.bz2':
+return bz2.open(filename, mode=mode, encoding=encoding)
+else:
+return open(filename, mode, encoding=encoding)
 def process_dump(input_file, template_file, out_file, file_size, file_compress,
 process_count):
 """
@@ -275,14 +288,11 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 global templateNamespace, templatePrefix
 global moduleNamespace, modulePrefix
-if input_file == '-':
-input = sys.stdin
-else:
-input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+input = decode_open(input_file)
 # collect siteinfo
 for line in input:
-line = line.decode('utf-8')
+line = line #.decode('utf-8')
 m = tagRE.search(line)
 if not m:
 continue
@@ -308,7 +318,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 template_load_start = default_timer()
 if template_file and os.path.exists(template_file):
 logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
-file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
+file = decode_open(template_file)
 templates = load_templates(file)
 file.close()
 else:
@@ -318,7 +328,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
 templates = load_templates(input, template_file)
 input.close()
-input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+input = decode_open(input_file)
 template_load_elapsed = default_timer() - template_load_start
 logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed)
@@ -370,7 +380,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 inText = False
 redirect = False
 for line in input:
-line = line.decode('utf-8')
+#line = line.decode('utf-8')
 if '<' not in line: # faster than doing re.search()
 if inText:
 page.append(line)
@@ -402,8 +412,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 page.append(line)
 elif tag == '/page':
 colon = title.find(':')
-if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
-not redirect and not title.startswith(templateNamespace):
+if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
+not redirect and not title.startswith(templateNamespace)):
 job = (id, title, page, ordinal)
 jobs_queue.put(job) # goes to any available extract_process
 last_id = id
@@ -539,7 +549,7 @@ def main():
 args = parser.parse_args()
 Extractor.keepLinks = args.links
-Extractor.toHTML = args.html
+Extractor.HtmlFormatting = args.html
 if args.html:
 Extractor.keepLinks = True
@@ -583,7 +593,7 @@ def main():
 load_templates(file)
 with open(input_file) as file:
-page = file.read().decode('utf-8')
+page = file.read()#.decode('utf-8')
 m = re.search(r'<id>(.*)</id>', page)
 id = m.group(1) if m else 0
 m = re.search(r'<title>(.*)</title>', page)

View File

@@ -73,6 +73,11 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
 """
 Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
 @see https://www.mediawiki.org/wiki/Help:Formatting
+:param extractor: the Extractor t use.
+:param text: the text to clean.
+:param expand_templates: whether to perform template expansion.
+:param escape_doc: whether to convert special characters to HTML entities.
+@return: the cleaned text.
 """
 if expand_templates:
@@ -107,7 +112,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
 text = res + unescape(text[cur:])
 # Handle bold/italic/quote
-if extractor.toHTML:
+if extractor.HtmlFormatting:
 text = bold_italic.sub(r'<b>\1</b>', text)
 text = bold.sub(r'<b>\1</b>', text)
 text = italic.sub(r'<i>\1</i>', text)
@@ -146,7 +151,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
 for tag in discardElements:
 text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
-if not extractor.toHTML:
+if not extractor.HtmlFormatting:
 # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
 text = unescape(text)
@@ -170,7 +175,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
 text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
 text = text.replace(',,', ',').replace(',.', '.')
 if escape_doc:
-text = html.escape(text)
+text = html.escape(text, quote=False)
 return text
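The quote=False change just above only affects quotation marks: html.escape still converts &, < and > to entities, but leaves " and ' as they are. A quick check in plain Python, independent of the extractor:

    from html import escape

    s = 'AT&T says "x < y"'
    assert escape(s) == 'AT&amp;T says &quot;x &lt; y&quot;'     # default escapes quotes too
    assert escape(s, quote=False) == 'AT&amp;T says "x &lt; y"'  # quotes left untouched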
@@ -202,7 +207,7 @@ def compact(text, mark_headers=False):
 if m:
 title = m.group(2)
 lev = len(m.group(1))
-if Extractor.toHTML:
+if Extractor.HtmlFormatting:
 page.append("<h%d>%s</h%d>" % (lev, title, lev))
 if title and title[-1] not in '!?':
 title += '.'
@@ -212,7 +217,7 @@ def compact(text, mark_headers=False):
 headers[lev] = title
 # drop previous headers
-headers = { k:v for k,v in headers.items() if k > lev }
+headers = { k:v for k,v in headers.items() if k <= lev }
 emptySection = True
 continue
 # Handle page title
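The filter change above matters when a new heading of level lev is encountered: keeping k <= lev retains the enclosing (shallower) headings and drops the stale deeper ones, which is what the '# drop previous headers' comment describes. A small illustration of the new behaviour, with made-up section titles:

    headers = {1: 'History.', 2: 'Middle Ages.', 3: 'Sources.'}

    # A new level-2 heading arrives: the level-3 sub-heading is stale and dropped,
    # the level-1 ancestor is kept, and level 2 is overwritten.
    lev, title = 2, 'Early modern period.'
    headers[lev] = title
    headers = {k: v for k, v in headers.items() if k <= lev}
    assert headers == {1: 'History.', 2: 'Early modern period.'}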
@@ -228,7 +233,7 @@ def compact(text, mark_headers=False):
 continue
 # handle lists
 elif line[0] in '*#;:':
-if Extractor.toHTML:
+if Extractor.HtmlFormatting:
 i = 0
 for c, n in zip_longest(listLevel, line, fillvalue=''):
 if not n or n not in '*#;:':
@@ -282,13 +287,6 @@ def compact(text, mark_headers=False):
 return page
-def handle_unicode(entity):
-numeric_code = int(entity[2:-1])
-if numeric_code >= 0x10000:
-return ''
-return unichr(numeric_code)
 # ----------------------------------------------------------------------
 def dropNested(text, openDelim, closeDelim):
@@ -503,7 +501,7 @@ def makeInternalLink(title, label):
 # variables
-class MagicWords(object):
+class MagicWords():
 """
 One copy in each Extractor.
@@ -726,11 +724,11 @@ def unescape(text):
 try:
 if text[1] == "#": # character reference
 if text[2] == "x":
-return unichr(int(code[1:], 16))
+return chr(int(code[1:], 16))
 else:
-return unichr(int(code))
+return chr(int(code))
 else: # named entity
-return unichr(name2codepoint[code])
+return chr(name2codepoint[code])
 except:
 return text # leave as is
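The unichr → chr swaps above (and the removal of handle_unicode earlier) are the Python 3 spelling of the same operation: chr covers the full Unicode range, so numeric and named character references resolve directly to str. For instance:

    from html.entities import name2codepoint

    assert chr(int('233')) == 'é'                  # decimal reference, e.g. &#233;
    assert chr(int('E9', 16)) == 'é'               # hexadecimal reference, e.g. &#xE9;
    assert chr(name2codepoint['eacute']) == 'é'    # named entity, e.g. &eacute;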
@@ -795,8 +793,7 @@ dots = re.compile(r'\.{4,}')
 substWords = 'subst:|safesubst:'
-class Extractor(object):
+class Extractor():
 """
 An extraction task on a article.
 """
@@ -809,8 +806,8 @@ class Extractor(object):
 keepSections = True
 ##
-# Whether to output HTML instead of text
-toHTML = False
+# Whether to output text with HTML formatting elements in <doc> files.
+HtmlFormatting = False
 def __init__(self, id, title, page):
 """
@@ -846,7 +843,7 @@ class Extractor(object):
 text = compact(text, mark_headers=mark_headers)
 return text
-def extract(self, out):
+def extract(self, out, escape_doc=True):
 """
 :param out: a memory file.
 """
@@ -860,7 +857,7 @@ class Extractor(object):
 footer = "\n</doc>\n"
 out.write(header)
-text = self.clean_text(text)
+text = self.clean_text(text, escape_doc=escape_doc)
 for line in text:
 out.write(line)
@@ -1443,7 +1440,7 @@ def normalizeNamespace(ns):
 # https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
-class Infix:
+class Infix():
 """Infix operators.
 The calling sequence for the infix is:

View File

@@ -34,7 +34,7 @@ import bz2
 # Program version
-version = '3.0'
+__version__ = '3.0.3'
 # ----------------------------------------------------------------------
 # READER
@@ -49,15 +49,13 @@ def process_data(input_file, id, templates=False):
 :param id: article id
 """
-if input_file.lower().endswith("bz2"):
-opener = bz2.BZ2File
-else:
-opener = open
+opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open
 input = opener(input_file)
 page = []
 for line in input:
-line = line.decode('utf-8')
 if '<' not in line: # faster than doing re.search()
 if page:
 page.append(line)
@@ -103,7 +101,7 @@ def main():
 description=__doc__)
 parser.add_argument("input",
 help="XML wiki dump file")
-parser.add_argument("--id", default="",
+parser.add_argument("--id", default="1",
 help="article number")
 parser.add_argument("--template", action="store_true",
 help="template number")