Force 'utf-8' encoding without relying on platform-dependent default

On Windows, the default encoding is typically 'cp1252', so reading a UTF-8 encoded dump with the platform default raises a UnicodeDecodeError.

Fix #89 #144 #165
This commit is contained in:
Albert Villanova del Moral 2019-07-13 18:21:43 +02:00
parent 3162bb6c3c
commit ff9a70cd6d

View File

@@ -2865,10 +2865,23 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
:param process_count: number of extraction processes to spawn.
"""
def hook_compressed_encoded(encoding):
    """Build a ``fileinput`` open-hook that decodes its input with *encoding*.

    The returned hook picks the opener from the file extension: ``.gz`` is
    opened with ``gzip``, ``.bz2`` with ``bz2``, and anything else with the
    builtin ``open``.

    :param encoding: text encoding used to decode the file (e.g. 'utf-8').
    :return: a callable ``hook(filename, mode)`` returning an open file
        object, suitable for the ``openhook`` argument of
        ``fileinput.FileInput``.
    """
    def hook(filename, mode):
        """Open *filename* in *mode*, decoding text with the bound encoding."""
        if 'b' in mode:
            # Binary modes accept no encoding: passing one raises ValueError
            # in open(), gzip.open() and bz2.open() alike.
            text_kwargs = {}
            compressed_mode = mode
        else:
            text_kwargs = {'encoding': encoding}
            # gzip.open/bz2.open treat a mode without 't' (such as the 'r'
            # that fileinput passes by default) as *binary*, and then reject
            # the encoding argument. Request text mode explicitly.
            compressed_mode = mode if 't' in mode else mode + 't'

        ext = os.path.splitext(filename)[1]
        if ext == '.gz':
            import gzip
            return gzip.open(filename, compressed_mode, **text_kwargs)
        elif ext == '.bz2':
            import bz2
            return bz2.open(filename, compressed_mode, **text_kwargs)
        else:
            return open(filename, mode, **text_kwargs)
    return hook
if input_file == '-':
input = sys.stdin
else:
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
input = fileinput.FileInput(input_file, openhook=hook_compressed_encoded('utf-8'))
# collect siteinfo
for line in input: