From ff9a70cd6d11c7438ef7551a5a3fa173f1e3f3ab Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Sat, 13 Jul 2019 18:21:43 +0200
Subject: [PATCH] Force 'utf-8' encoding without relying on platform-dependent
 default

On Windows, the default encoding is 'cp1252', so reading a UTF-8 input file raises a UnicodeDecodeError.

Fixes #89, fixes #144, fixes #165
---
 WikiExtractor.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/WikiExtractor.py b/WikiExtractor.py
index 730b3ba..791a71e 100755
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@@ -2865,10 +2865,23 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     :param process_count: number of extraction processes to spawn.
     """
 
+    def hook_compressed_encoded(encoding):
+        """Build a fileinput openhook that decodes plain/.gz/.bz2 input as text."""
+        import bz2
+        import gzip
+        openers = {'.gz': gzip.open, '.bz2': bz2.open}
+
+        def hook(filename, mode):
+            # gzip/bz2 read a bare 'r' as binary and then reject `encoding`.
+            text_mode = mode if 't' in mode or 'b' in mode else mode + 't'
+            opener = openers.get(os.path.splitext(filename)[1], open)
+            return opener(filename, text_mode, encoding=encoding)
+        return hook
+
     if input_file == '-':
         input = sys.stdin
     else:
-        input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+        input = fileinput.FileInput(input_file, openhook=hook_compressed_encoded('utf-8'))
 
     # collect siteinfo
     for line in input: