From 3150f604e9481f50633238956719d9af41d97acf Mon Sep 17 00:00:00 2001 From: attardi Date: Sat, 5 Dec 2020 20:13:46 +0100 Subject: [PATCH] Use bz2.open. --- wikiextractor/WikiExtractor.py | 4 ++-- wikiextractor/extractPage.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index b4a6ed1..7d7ec15 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -62,7 +62,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces # =========================================================================== # Program version -__version__ = '3.0.3' +__version__ = '3.0.4' ## # Defined in @@ -266,7 +266,7 @@ def decode_open(filename, mode='rt', encoding='utf-8'): ext = os.path.splitext(filename)[1] if ext == '.gz': import gzip - return gzip.open(filename, mode) + return gzip.open(filename, mode, encoding=encoding) elif ext == '.bz2': return bz2.open(filename, mode=mode, encoding=encoding) else: diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py index e73dcd9..9a10d8d 100755 --- a/wikiextractor/extractPage.py +++ b/wikiextractor/extractPage.py @@ -34,7 +34,7 @@ import bz2 # Program version -__version__ = '3.0.3' +__version__ = '3.0.4' # ---------------------------------------------------------------------- # READER @@ -49,13 +49,14 @@ def process_data(input_file, id, templates=False): :param id: article id """ - opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open - - input = opener(input_file) + if input_file.lower().endswith(".bz2"): + input = bz2.open(input_file, mode='rt', encoding='utf-8') + else: + input = open(input_file) page = [] for line in input: - line = line.decode('utf-8') + line = line if '<' not in line: # faster than doing re.search() if page: page.append(line)