diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index b4a6ed1..7d7ec15 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -62,7 +62,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces # =========================================================================== # Program version -__version__ = '3.0.3' +__version__ = '3.0.4' ## # Defined in @@ -266,7 +266,7 @@ def decode_open(filename, mode='rt', encoding='utf-8'): ext = os.path.splitext(filename)[1] if ext == '.gz': import gzip - return gzip.open(filename, mode) + return gzip.open(filename, mode, encoding=encoding) elif ext == '.bz2': return bz2.open(filename, mode=mode, encoding=encoding) else: diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py index e73dcd9..9a10d8d 100755 --- a/wikiextractor/extractPage.py +++ b/wikiextractor/extractPage.py @@ -34,7 +34,7 @@ import bz2 # Program version -__version__ = '3.0.3' +__version__ = '3.0.4' # ---------------------------------------------------------------------- # READER @@ -49,13 +49,14 @@ def process_data(input_file, id, templates=False): :param id: article id """ - opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open - - input = opener(input_file) + if input_file.lower().endswith(".bz2"): + input = bz2.open(input_file, mode='rt', encoding='utf-8') + else: + input = open(input_file) page = [] for line in input: - line = line.decode('utf-8') + line = line if '<' not in line: # faster than doing re.search() if page: page.append(line)