See ChangeLog/

2015-04-19 11:32:39 +02:00 · 2015-04-19 11:32:39 +02:00 · 858670beeb
commit 858670beeb
parent 1485e2b5bc
2 changed files with 46 additions and 9 deletions
--- a/3
+++ b/3
@@ -1,6 +1,9 @@
 2015-04-19  Giuseppe Attardi  <attardi@di.unipi.it>

 	* WikiExtractor.py (clean): use dropNext to drop discardElements.
+	(discardElements): added div.
+	(compact): avoid duplicated header line with option --sections.
+	(compact): skip empty list items.

 2015-04-18  Giuseppe Attardi  <attardi@di.unipi.it>

--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # =============================================================================
-#  Version: 2.20 (Apr 18, 2015)
+#  Version: 2.22 (Apr 19, 2015)
 #  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
 #	   Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
 #
@@ -61,7 +61,7 @@ import Queue, threading, multiprocessing
 #===========================================================================

 # Program version
-version = '2.20'
+version = '2.22'

 ### PARAMS ####################################################################

@@ -97,7 +97,7 @@ acceptedNamespaces = ['w', 'wiktionary', 'wikt']
 #
 discardElements = [
        'gallery', 'timeline', 'noinclude', 'pre',
-        'table', 'tr', 'td', 'th', 'caption',
+        'table', 'tr', 'td', 'th', 'caption', 'div',
        'form', 'input', 'select', 'option', 'textarea',
        'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
        'ref', 'references', 'img', 'imagemap', 'source', 'small'
@@ -615,7 +615,7 @@ class Extractor(object):

        # any parts in a tplarg after the first (the parameter default) are
        # ignored, and an equals sign in the first part is treated as plain text.
-        logging.debug(' subst %s %s' % (parameter, str(params)))
+        #logging.debug(' subst %s' % parameter)

        parts = splitParameters(parameter)
        if len(parts) > 1:
@@ -954,6 +954,29 @@ class MagicWords(object):
    def __setitem__(self, name, value):
        self.values[name] = value

+    switches = [
+        '__NOTOC__',
+        '__FORCETOC__',
+        '__TOC__',
+        '__TOC__',
+        '__NEWSECTIONLINK__',
+        '__NONEWSECTIONLINK__',
+        '__NOGALLERY__',
+        '__HIDDENCAT__',
+        '__NOCONTENTCONVERT__',
+        '__NOCC__',
+        '__NOTITLECONVERT__',
+        '__NOTC__',
+        '__START__',
+        '__END__',
+        '__INDEX__',
+        '__NOINDEX__',
+        '__STATICREDIRECT__',
+        '__DISAMBIG__'
+        ]
+
+magicWordsRE = re.compile('|'.join(MagicWords.switches))
+
 # ----------------------------------------------------------------------
 # parser functions utilities

@@ -1392,6 +1415,10 @@ tailRE = re.compile('\w*')
 expand_templates = True

 def clean(extractor, text):
+    """
+    Transforms wiki markup.
+    @see https://www.mediawiki.org/wiki/Help:Formatting
+    """

    if (expand_templates):
        # expand templates
@@ -1441,6 +1468,9 @@ def clean(extractor, text):
    # residuals of unbalanced quotes
    text = text.replace("'''", '').replace("''", '&quot;')

+    # drop MagicWords behavioral switches
+    text = magicWordsRE.sub('', text)
+
    ################ Process HTML ###############

    # turn into HTML
@@ -1495,6 +1525,7 @@ def clean(extractor, text):
    text = text.replace(',,', ',').replace(',.', '.')
    return text

+# skip level 1: it is the page name level
 section = re.compile(r'(==+)\s*(.*?)\s*\1')

 def compact(text):
@@ -1538,7 +1569,9 @@ def compact(text):
        # handle lists
        elif line[0] in '*#;':
            if keepSections:
-                page.append("<li>%s</li>" % line.lstrip(line[0]))
+                line = line.lstrip(line[0]).strip()
+                if line:
+                    page.append("<li>%s</li>" % line)
            else:
                continue

@@ -1549,10 +1582,11 @@ def compact(text):
        elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
            continue
        elif len(headers):
-            items = headers.items()
-            items.sort()
-            for (i, v) in items:
-                page.append(v)
+            if not keepSections:
+                items = headers.items()
+                items.sort()
+                for (i, v) in items:
+                    page.append(v)
            headers.clear()
            page.append(line)   # first line
            emptySection = False