def hook_compressed_encoded(encoding):
    """Build a ``fileinput`` open hook that decodes text with *encoding*.

    The returned hook picks the opener from the file extension: ``.gz``
    files are opened with ``gzip``, ``.bz2`` files with ``bz2`` and anything
    else with the builtin ``open``. The encoding is passed explicitly so
    reading does not rely on the platform-dependent default (on Windows the
    default is 'cp1252', which raises ``UnicodeDecodeError`` on such input).

    :param encoding: name of the text encoding to decode with, e.g. 'utf-8'.
    :return: a callable ``hook(filename, mode)`` that returns an open file
        object; text modes yield ``str`` lines, binary modes yield ``bytes``.
    """
    def hook(filename, mode):
        if 'b' in mode:
            # Binary streams are not decoded, and every opener used below
            # rejects an ``encoding`` argument in binary mode.
            open_mode = mode
            text_args = {}
        else:
            # gzip.open and bz2.open interpret a bare 'r' as *binary*; the
            # explicit 't' is required for them to accept ``encoding``.
            open_mode = mode if 't' in mode else mode + 't'
            text_args = {'encoding': encoding}

        ext = os.path.splitext(filename)[1]
        if ext == '.gz':
            import gzip
            return gzip.open(filename, open_mode, **text_args)
        if ext == '.bz2':
            import bz2
            return bz2.open(filename, open_mode, **text_args)
        return open(filename, open_mode, **text_args)

    return hook