See ChangeLog/

This commit is contained in:
Giuseppe Attardi 2015-04-19 11:32:39 +02:00
parent 1485e2b5bc
commit 858670beeb
2 changed files with 46 additions and 9 deletions

View File

@ -1,6 +1,9 @@
2015-04-19 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (clean): use dropNext to drop discardElements.
(discardElements): added div.
(compact): avoid duplicated header line with option --sections.
(compact): skip empty list items.
2015-04-18 Giuseppe Attardi <attardi@di.unipi.it>

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 2.20 (Apr 18, 2015)
# Version: 2.22 (Apr 19, 2015)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
# Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
#
@ -61,7 +61,7 @@ import Queue, threading, multiprocessing
#===========================================================================
# Program version
version = '2.20'
version = '2.22'
### PARAMS ####################################################################
@ -97,7 +97,7 @@ acceptedNamespaces = ['w', 'wiktionary', 'wikt']
#
discardElements = [
'gallery', 'timeline', 'noinclude', 'pre',
'table', 'tr', 'td', 'th', 'caption',
'table', 'tr', 'td', 'th', 'caption', 'div',
'form', 'input', 'select', 'option', 'textarea',
'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
'ref', 'references', 'img', 'imagemap', 'source', 'small'
@ -615,7 +615,7 @@ class Extractor(object):
# any parts in a tplarg after the first (the parameter default) are
# ignored, and an equals sign in the first part is treated as plain text.
logging.debug(' subst %s %s' % (parameter, str(params)))
#logging.debug(' subst %s' % parameter)
parts = splitParameters(parameter)
if len(parts) > 1:
@ -954,6 +954,29 @@ class MagicWords(object):
def __setitem__(self, name, value):
self.values[name] = value
# MediaWiki behaviour switches: double-underscore tokens that are stripped
# from the extracted text. Each switch is listed exactly once so the
# alternation pattern built from this list has no redundant branches.
switches = [
    '__NOTOC__',
    '__FORCETOC__',
    '__TOC__',
    '__NEWSECTIONLINK__',
    '__NONEWSECTIONLINK__',
    '__NOGALLERY__',
    '__HIDDENCAT__',
    '__NOCONTENTCONVERT__',
    '__NOCC__',
    '__NOTITLECONVERT__',
    '__NOTC__',
    '__START__',
    '__END__',
    '__INDEX__',
    '__NOINDEX__',
    '__STATICREDIRECT__',
    '__DISAMBIG__',
]
# Alternation pattern matching any single entry of MagicWords.switches;
# clean() uses it to delete those tokens from the text.
magicWordsRE = re.compile('|'.join(MagicWords.switches))
# ----------------------------------------------------------------------
# parser functions utilities
@ -1392,6 +1415,10 @@ tailRE = re.compile('\w*')
expand_templates = True
def clean(extractor, text):
"""
Transforms wiki markup.
@see https://www.mediawiki.org/wiki/Help:Formatting
"""
if (expand_templates):
# expand templates
@ -1441,6 +1468,9 @@ def clean(extractor, text):
# residuals of unbalanced quotes
text = text.replace("'''", '').replace("''", '&quot;')
# drop MagicWords behavioral switches
text = magicWordsRE.sub('', text)
################ Process HTML ###############
# turn into HTML
@ -1495,6 +1525,7 @@ def clean(extractor, text):
text = text.replace(',,', ',').replace(',.', '.')
return text
# skip level 1, it is page name level
section = re.compile(r'(==+)\s*(.*?)\s*\1')
def compact(text):
@ -1538,7 +1569,9 @@ def compact(text):
# handle lists
elif line[0] in '*#;':
if keepSections:
page.append("<li>%s</li>" % line.lstrip(line[0]))
line = line.lstrip(line[0]).strip()
if line:
page.append("<li>%s</li>" % line)
else:
continue
@ -1549,10 +1582,11 @@ def compact(text):
elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
continue
elif len(headers):
items = headers.items()
items.sort()
for (i, v) in items:
page.append(v)
if not keepSections:
items = headers.items()
items.sort()
for (i, v) in items:
page.append(v)
headers.clear()
page.append(line) # first line
emptySection = False