See ChangeLog.

This commit is contained in:
Giuseppe Attardi 2015-05-06 16:08:27 +02:00
parent b44b750056
commit d5cca5da43
2 changed files with 11 additions and 6 deletions

View File

@@ -1,3 +1,8 @@
2015-05-06 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (main): fixed args.namespaces (was args.ns).
(makeInternalLink): use label rather than anchor in the generated link.
(compact): use fillvalue=' ' in izip_longest.
2015-04-26 Giuseppe Attardi <attardi@di.unipi.it> 2015-04-26 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (clean): use re.U when matching \W or chinese * WikiExtractor.py (clean): use re.U when matching \W or chinese

View File

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# #
# ============================================================================= # =============================================================================
# Version: 2.32 (Apr 26, 2015) # Version: 2.33 (May 6, 2015)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa # Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
# #
# Contributors: # Contributors:
@@ -60,7 +60,7 @@ import Queue, threading, multiprocessing
#=========================================================================== #===========================================================================
# Program version # Program version
version = '2.32' version = '2.33'
### PARAMS #################################################################### ### PARAMS ####################################################################
@@ -1761,7 +1761,7 @@ def makeInternalLink(title, label):
if colon2 > 1 and title[colon+1:colon2] not in acceptedNamespaces: if colon2 > 1 and title[colon+1:colon2] not in acceptedNamespaces:
return '' return ''
if Extractor.keepLinks: if Extractor.keepLinks:
return '<a href="%s">%s</a>' % (urllib.quote(title.encode('utf-8')), anchor) return '<a href="%s">%s</a>' % (urllib.quote(title.encode('utf-8')), label)
else: else:
return label return label
@@ -2009,7 +2009,7 @@ def compact(text):
elif line[0] in '*#;:': elif line[0] in '*#;:':
if Extractor.toHTML: if Extractor.toHTML:
i = 0 i = 0
for c,n in izip_longest(listLevel, line): for c,n in izip_longest(listLevel, line, fillvalue=' '):
if n not in '*#;:': if n not in '*#;:':
if c: if c:
page.append(listClose[c]) page.append(listClose[c])
@@ -2022,7 +2022,7 @@ def compact(text):
# close level # close level
page.append(listClose[c]) page.append(listClose[c])
listLevel = listLevel[:-1] listLevel = listLevel[:-1]
listLevel = listLevel + n listLevel += n
page.append(listOpen[n]) page.append(listOpen[n])
i += 1 i += 1
n = line[i-1] n = line[i-1]
@@ -2406,7 +2406,7 @@ def main():
return return
if args.namespaces: if args.namespaces:
acceptedNamespaces = set(args.ns.split(',')) acceptedNamespaces = set(args.namespaces.split(','))
FORMAT = '%(levelname)s: %(message)s' FORMAT = '%(levelname)s: %(message)s'
logging.basicConfig(format=FORMAT) logging.basicConfig(format=FORMAT)