From a2e078f3beb571e2ceee84c0ac42d19e220ed339 Mon Sep 17 00:00:00 2001
From: attardi
Date: Sat, 5 Dec 2020 19:27:01 +0100
Subject: [PATCH] Better handling of encoding.

---
 wikiextractor/WikiExtractor.py | 48 ++++++++++++++++++++--------------
 wikiextractor/extract.py       | 45 +++++++++++++++----------------
 wikiextractor/extractPage.py   | 10 +++----
 3 files changed, 54 insertions(+), 49 deletions(-)

diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 27056e2..b4a6ed1 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -49,7 +49,6 @@ collecting template definitions.
 
 import argparse
 import bz2
-import fileinput
 import logging
 import os.path
 import re  # TODO use regex when it will be standard
@@ -112,7 +111,7 @@ modules = {
 # Output
 
 
-class NextFile(object):
+class NextFile():
 
     """
     Synchronous generation of next available file name.
@@ -143,7 +142,7 @@
         return '%s/wiki_%02d' % (self._dirname(), self.file_index)
 
 
-class OutputSplitter(object):
+class OutputSplitter():
 
     """
     File-like object, that splits output to multiple files of a given max size.
@@ -203,7 +202,7 @@ def load_templates(file, output_file=None):
     if output_file:
         output = open(output_file, 'w')
     for line in file:
-        line = line.decode('utf-8')
+        #line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if inText:
                 page.append(line)
@@ -238,18 +237,18 @@
             # FIXME: should reconstruct also moduleNamespace
             if title.startswith(templatePrefix):
                 define_template(title, page)
+                templates += 1
             # save templates and modules to file
             if output_file and (title.startswith(templatePrefix) or
                                 title.startswith(modulePrefix)):
                 output.write('<page>\n')
-                output.write('   <title>%s</title>\n' % title)
+                output.write('   <title>%s</title>\n' % title.encode('utf-8'))
                 output.write('   <ns>10</ns>\n')
                 output.write('   <text>')
                 for line in page:
-                    output.write(line)
+                    output.write(line.encode('utf-8'))
                 output.write('   </text>\n')
                 output.write('</page>\n')
-                templates += 1
             page = []
             articles += 1
             if articles % 100000 == 0:
@@ -260,6 +259,20 @@
     return templates
 
 
+def decode_open(filename, mode='rt', encoding='utf-8'):
+    """
+    Open a file, decoding and decompressing it depending on its extension (`gz` or `bz2`).
+ """ + ext = os.path.splitext(filename)[1] + if ext == '.gz': + import gzip + return gzip.open(filename, mode) + elif ext == '.bz2': + return bz2.open(filename, mode=mode, encoding=encoding) + else: + return open(filename, mode, encoding=encoding) + + def process_dump(input_file, template_file, out_file, file_size, file_compress, process_count): """ @@ -275,14 +288,11 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, global templateNamespace, templatePrefix global moduleNamespace, modulePrefix - if input_file == '-': - input = sys.stdin - else: - input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + input = decode_open(input_file) # collect siteinfo for line in input: - line = line.decode('utf-8') + line = line #.decode('utf-8') m = tagRE.search(line) if not m: continue @@ -308,7 +318,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, template_load_start = default_timer() if template_file and os.path.exists(template_file): logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file) - file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed) + file = decode_open(template_file) templates = load_templates(file) file.close() else: @@ -318,7 +328,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file) templates = load_templates(input, template_file) input.close() - input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + input = decode_open(input_file) template_load_elapsed = default_timer() - template_load_start logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed) @@ -370,7 +380,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, inText = False redirect = False for line in input: - line = line.decode('utf-8') + #line = line.decode('utf-8') if '<' not in line: # faster than doing re.search() if inText: page.append(line) @@ -402,8 +412,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, page.append(line) elif tag == '/page': colon = title.find(':') - if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \ - not redirect and not title.startswith(templateNamespace): + if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and + not redirect and not title.startswith(templateNamespace)): job = (id, title, page, ordinal) jobs_queue.put(job) # goes to any available extract_process last_id = id @@ -539,7 +549,7 @@ def main(): args = parser.parse_args() Extractor.keepLinks = args.links - Extractor.toHTML = args.html + Extractor.HtmlFormatting = args.html if args.html: Extractor.keepLinks = True @@ -583,7 +593,7 @@ def main(): load_templates(file) with open(input_file) as file: - page = file.read().decode('utf-8') + page = file.read()#.decode('utf-8') m = re.search(r'(.*)', page) id = m.group(1) if m else 0 m = re.search(r'(.*)', page) diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py index c0d9d75..5dd2a93 100644 --- a/wikiextractor/extract.py +++ b/wikiextractor/extract.py @@ -73,6 +73,11 @@ def clean(extractor, text, expand_templates=False, escape_doc=True): """ Transforms wiki markup. 
     If the command line flag --escapedoc is set then the text is also escaped
     @see https://www.mediawiki.org/wiki/Help:Formatting
+    :param extractor: the Extractor to use.
+    :param text: the text to clean.
+    :param expand_templates: whether to perform template expansion.
+    :param escape_doc: whether to convert special characters to HTML entities.
+    @return: the cleaned text.
     """
     if expand_templates:
@@ -107,7 +112,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
         text = res + unescape(text[cur:])
 
     # Handle bold/italic/quote
-    if extractor.toHTML:
+    if extractor.HtmlFormatting:
         text = bold_italic.sub(r'<b>\1</b>', text)
         text = bold.sub(r'<b>\1</b>', text)
         text = italic.sub(r'<i>\1</i>', text)
@@ -146,7 +151,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     for tag in discardElements:
         text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
 
-    if not extractor.toHTML:
+    if not extractor.HtmlFormatting:
         # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
         text = unescape(text)
 
@@ -170,7 +175,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
     if escape_doc:
-        text = html.escape(text)
+        text = html.escape(text, quote=False)
     return text
 
 
@@ -202,7 +207,7 @@ def compact(text, mark_headers=False):
         if m:
             title = m.group(2)
             lev = len(m.group(1))
-            if Extractor.toHTML:
+            if Extractor.HtmlFormatting:
                 page.append("<h%d>%s</h%d>" % (lev, title, lev))
             if title and title[-1] not in '!?':
                 title += '.'
@@ -212,7 +217,7 @@ def compact(text, mark_headers=False):
             headers[lev] = title
 
             # drop previous headers
-            headers = { k:v for k,v in headers.items() if k > lev }
+            headers = { k:v for k,v in headers.items() if k <= lev }
             emptySection = True
             continue
         # Handle page title
@@ -228,7 +233,7 @@ def compact(text, mark_headers=False):
             continue
         # handle lists
         elif line[0] in '*#;:':
-            if Extractor.toHTML:
+            if Extractor.HtmlFormatting:
                 i = 0
                 for c, n in zip_longest(listLevel, line, fillvalue=''):
                     if not n or n not in '*#;:':
@@ -282,13 +287,6 @@ def compact(text, mark_headers=False):
     return page
 
 
-def handle_unicode(entity):
-    numeric_code = int(entity[2:-1])
-    if numeric_code >= 0x10000:
-        return ''
-    return unichr(numeric_code)
-
-
 # ----------------------------------------------------------------------
 
 def dropNested(text, openDelim, closeDelim):
@@ -503,7 +501,7 @@
 # variables
 
 
-class MagicWords(object):
+class MagicWords():
 
     """
     One copy in each Extractor.
@@ -726,11 +724,11 @@ def unescape(text):
         try:
             if text[1] == "#":  # character reference
                 if text[2] == "x":
-                    return unichr(int(code[1:], 16))
+                    return chr(int(code[1:], 16))
                 else:
-                    return unichr(int(code))
+                    return chr(int(code))
             else:  # named entity
-                return unichr(name2codepoint[code])
+                return chr(name2codepoint[code])
         except:
             return text  # leave as is
 
@@ -795,8 +793,7 @@ dots = re.compile(r'\.{4,}')
 substWords = 'subst:|safesubst:'
 
 
-class Extractor(object):
-
+class Extractor():
     """
     An extraction task on a article.
     """
@@ -809,8 +806,8 @@
     keepSections = True
 
     ##
-    # Whether to output HTML instead of text
-    toHTML = False
+    # Whether to output text with HTML formatting elements in files.
+    HtmlFormatting = False
 
     def __init__(self, id, title, page):
         """
@@ -846,7 +843,7 @@
             text = compact(text, mark_headers=mark_headers)
         return text
 
-    def extract(self, out):
+    def extract(self, out, escape_doc=True):
         """
         :param out: a memory file.
         """
@@ -860,7 +857,7 @@
 
         footer = "\n</doc>\n"
 
         out.write(header)
-        text = self.clean_text(text)
+        text = self.clean_text(text, escape_doc=escape_doc)
         for line in text:
             out.write(line)
@@ -1443,7 +1440,7 @@ def normalizeNamespace(ns):
 # https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
 
 
-class Infix:
+class Infix():
 
     """Infix operators.
     The calling sequence for the infix is:
diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py
index 89a7081..e73dcd9 100755
--- a/wikiextractor/extractPage.py
+++ b/wikiextractor/extractPage.py
@@ -34,7 +34,7 @@ import bz2
 
 # Program version
-version = '3.0'
+__version__ = '3.0.3'
 
 
 # ----------------------------------------------------------------------
 # READER
@@ -49,15 +49,13 @@ def process_data(input_file, id, templates=False):
     :param id: article id
     """
 
-    if input_file.lower().endswith("bz2"):
-        opener = bz2.BZ2File
-    else:
-        opener = open
+    opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open
 
     input = opener(input_file)
 
     page = []
     for line in input:
+        line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if page:
                 page.append(line)
@@ -103,7 +101,7 @@ def main():
                                      description=__doc__)
     parser.add_argument("input",
                         help="XML wiki dump file")
-    parser.add_argument("--id", default="",
+    parser.add_argument("--id", default="1",
                         help="article number")
     parser.add_argument("--template", action="store_true",
                         help="template number")
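
Usage note (trailing text, not part of the diff above): decode_open is the piece
that lets the readers above drop their per-line .decode('utf-8') calls, since it
returns text-mode line iterators for plain, .gz and .bz2 dumps alike. A minimal,
self-contained sketch of how a caller sees it; the dump file name is made up, and
unlike the hunk above this version also forwards `encoding` to gzip.open:

    import bz2
    import gzip
    import os.path

    def decode_open(filename, mode='rt', encoding='utf-8'):
        """Open a plain, .gz or .bz2 file and decode it to text."""
        ext = os.path.splitext(filename)[1]
        if ext == '.gz':
            # Assumption: pass encoding here too; the patch omits it for .gz.
            return gzip.open(filename, mode, encoding=encoding)
        elif ext == '.bz2':
            return bz2.open(filename, mode=mode, encoding=encoding)
        return open(filename, mode, encoding=encoding)

    # Hypothetical file name; compressed and plain dumps read identically.
    with decode_open('dump.xml.bz2') as input:
        for line in input:       # each line is already a str, not bytes
            if '<' not in line:  # same fast-path test the extractor uses
                continue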