Merge pull request #183 from albertvillanova/fix-encoding

Force 'utf-8' encoding without relying on platform-dependent default
This commit is contained in:
Giuseppe Attardi 2020-07-22 12:18:17 +02:00 committed by GitHub
commit 6408a430fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -2865,10 +2865,23 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
:param process_count: number of extraction processes to spawn. :param process_count: number of extraction processes to spawn.
""" """
def hook_compressed_encoded(encoding):
    """Build a ``fileinput`` open hook that decodes input with *encoding*.

    The returned hook picks an opener from the file extension: ``.gz`` uses
    ``gzip.open``, ``.bz2`` uses ``bz2.open``, anything else uses the
    built-in ``open``.

    :param encoding: text encoding passed to whichever opener is used.
    :return: a ``hook(filename, mode)`` callable returning an open file object.
    """
    def hook(filename, mode):
        ext = os.path.splitext(filename)[1]
        if ext == '.gz':
            import gzip
            opener = gzip.open
        elif ext == '.bz2':
            import bz2
            opener = bz2.open
        else:
            return open(filename, mode, encoding=encoding)
        # Unlike the built-in open(), gzip.open/bz2.open treat a mode without
        # 't' (e.g. the 'r' that fileinput passes) as binary and then reject
        # the `encoding` argument with ValueError. Ask for text mode
        # explicitly unless the caller already chose 't' or 'b'.
        if 't' not in mode and 'b' not in mode:
            mode += 't'
        return opener(filename, mode, encoding=encoding)
    return hook
if input_file == '-': if input_file == '-':
input = sys.stdin input = sys.stdin
else: else:
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) input = fileinput.FileInput(input_file, openhook=hook_compressed_encoded('utf-8'))
# collect siteinfo # collect siteinfo
for line in input: for line in input: