See ChangeLog.

This commit is contained in:
attardi 2017-01-15 09:08:35 +01:00
parent e00eacb372
commit 6660973646
2 changed files with 13 additions and 5 deletions

View File

@ -1,3 +1,8 @@
2017-01-15 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (process_dump): use text_type to decide whether
to use decode('utf-8').
2016-10-29 Giuseppe Attardi <attardi@di.unipi.it>
* setup.py: use scripts instead of console_scripts.

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# =============================================================================
# Version: 2.67 (Jan 4, 2017)
# Version: 2.68 (Jan 15, 2017)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# Contributors:
@ -66,6 +66,7 @@ from timeit import default_timer
PY2 = sys.version_info[0] == 2
# Python 2.7 compatibility
if PY2:
from urllib import quote
from htmlentitydefs import name2codepoint
@ -83,7 +84,7 @@ else:
# ===========================================================================
# Program version
version = '2.67'
version = '2.68'
## PARAMS ####################################################################
@ -2392,7 +2393,9 @@ def compact(text):
for line in text.split('\n'):
if not line:
if not line: # collapse empty lines
if page and page[-1]:
page.append('')
continue
# Handle section titles
m = section.match(line)
@ -2637,7 +2640,7 @@ def pages_from(input):
redirect = False
title = None
for line in input:
if PY2: line = line.decode('utf-8')
if text_type == unicode: line = line.decode('utf-8')
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
@ -2707,7 +2710,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
# collect siteinfo
for line in input:
if PY2: line = line.decode('utf-8')
if text_type == unicode: line = line.decode('utf-8')
m = tagRE.search(line)
if not m:
continue