Use bz2.open.

This commit is contained in:
attardi 2020-12-05 20:13:46 +01:00
parent a2e078f3be
commit 3150f604e9
2 changed files with 8 additions and 7 deletions

View File

@@ -62,7 +62,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
# ===========================================================================
# Program version
__version__ = '3.0.3'
__version__ = '3.0.4'
##
# Defined in <siteinfo>
@@ -266,7 +266,7 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
ext = os.path.splitext(filename)[1]
if ext == '.gz':
import gzip
return gzip.open(filename, mode)
return gzip.open(filename, mode, encoding=encoding)
elif ext == '.bz2':
return bz2.open(filename, mode=mode, encoding=encoding)
else:

View File

@@ -34,7 +34,7 @@ import bz2
# Program version
__version__ = '3.0.3'
__version__ = '3.0.4'
# ----------------------------------------------------------------------
# READER
@@ -49,13 +49,14 @@ def process_data(input_file, id, templates=False):
:param id: article id
"""
opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open
input = opener(input_file)
if input_file.lower().endswith(".bz2"):
input = bz2.open(input_file, mode='rt', encoding='utf-8')
else:
input = open(input_file)
page = []
for line in input:
line = line.decode('utf-8')
line = line
if '<' not in line: # faster than doing re.search()
if page:
page.append(line)