See ChangeLog.

2016-02-12 23:31:21 +01:00 · 2016-02-12 23:31:21 +01:00 · 834cad6a35
commit 834cad6a35
parent b04760ecd8
3 changed files with 32 additions and 25 deletions
--- a/3
+++ b/3
@ -3,6 +3,9 @@
 	* WikiExtractor.py (reduce_process): moved here creation of OutputSplitter.
 	(compact): Extractor.keepLists allows preserving lists in output.
 	(main): added new option --lists for preserving lists in output.
+	(compact): reset listLevel when entering a new section.
+	(ignoredTags): removed 'div' from here, since it is in discardElements.
+	(ignoredTags): moved 'sub' and 'sup' to discardElements.

 2016-02-11  Giuseppe Attardi  <attardi@di.unipi.it>

--- a/README.md
+++ b/README.md
@ -57,6 +57,7 @@ Each file will contains several documents in this [document format](http://media
    Processing:
      --html                produce HTML output, subsumes --links
      -l, --links           preserve links
+      --lists               preserve lists
      -ns ns1,ns2, --namespaces ns1,ns2
 			    accepted namespaces
      --templates TEMPLATES
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-

 # =============================================================================
-#  Version: 2.46 (February 11, 2016)
+#  Version: 2.48 (February 12, 2016)
 #  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
 #
 #  Contributors:
@ -66,7 +66,7 @@ from timeit import default_timer
 # ===========================================================================

 # Program version
-version = '2.46'
+version = '2.48'

 ## PARAMS ####################################################################

@ -106,7 +106,8 @@ discardElements = [
    'table', 'tr', 'td', 'th', 'caption', 'div',
    'form', 'input', 'select', 'option', 'textarea',
    'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
-    'ref', 'references', 'img', 'imagemap', 'source', 'small'
+    'ref', 'references', 'img', 'imagemap', 'source', 'small',
+    'sub', 'sup'
 ]

 # This is obtained from <siteinfo>
@ -158,10 +159,10 @@ selfClosingTags = ('br', 'hr', 'nobr', 'ref', 'references', 'nowiki')
 # These tags are dropped, keeping their content.
 # handle 'a' separately, depending on keepLinks
 ignoredTags = (
-    'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em',
+    'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
    'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki',
    'p', 'plaintext', 's', 'span', 'strike', 'strong',
-    'sub', 'sup', 'tt', 'u', 'var'
+    'tt', 'u', 'var'
 )

 placeholder_tags = {'math': 'formula', 'code': 'codice'}
@ -1514,8 +1515,8 @@ def dropNested(text, openDelim, closeDelim):
    openRE = re.compile(openDelim, re.IGNORECASE)
    closeRE = re.compile(closeDelim, re.IGNORECASE)
    # partition text in separate blocks { } { }
-    spans = []  # pairs (s, e) for each partition
-    nest = 0  # nesting level
+    spans = []                  # pairs (s, e) for each partition
+    nest = 0                    # nesting level
    start = openRE.search(text, 0)
    if not start:
        return text
@ -1523,8 +1524,8 @@ def dropNested(text, openDelim, closeDelim):
    next = start
    while end:
        next = openRE.search(text, next.end())
-        if not next:  # termination
-            while nest:  # close all pending
+        if not next:            # termination
+            while nest:         # close all pending
                nest -= 1
                end0 = closeRE.search(text, end.end())
                if end0:
@ -1540,7 +1541,7 @@ def dropNested(text, openDelim, closeDelim):
                # try closing more
                last = end.end()
                end = closeRE.search(text, end.end())
-                if not end:  # unbalanced
+                if not end:     # unbalanced
                    if spans:
                        span = (spans[0][0], last)
                    else:
@ -1552,7 +1553,7 @@ def dropNested(text, openDelim, closeDelim):
                # advance start, find next close
                start = next
                end = closeRE.search(text, next.end())
-                break  # { }
+                break           # { }
        if next != start:
            # { { }
            nest += 1
@ -1568,7 +1569,7 @@ def dropSpans(spans, text):
    res = ''
    offset = 0
    for s, e in spans:
-        if offset <= s:  # handle nesting
+        if offset <= s:         # handle nesting
            if offset < s:
                res += text[offset:s]
            offset = e
@ -2111,13 +2112,13 @@ listItem = {'*': '<li>%s</li>', '#': '<li>%s</<li>', ';': '<dt>%s</dt>',

 def compact(text):
    """Deal with headers, lists, empty sections, residuals of tables.
-    :param text: convert to HTML
+    :param text: convert to HTML.
    """

-    page = []  # list of paragraph
-    headers = {}  # Headers for unfilled sections
+    page = []             # list of paragraph
+    headers = {}          # Headers for unfilled sections
    emptySection = False  # empty sections are discarded
-    listLevel = []  # nesting of lists
+    listLevel = []        # nesting of lists

    for line in text.split('\n'):

@ -2127,20 +2128,21 @@ def compact(text):
        m = section.match(line)
        if m:
            title = m.group(2)
-            lev = len(m.group(1))
+            lev = len(m.group(1)) # header level
            if Extractor.toHTML:
                page.append("<h%d>%s</h%d>" % (lev, title, lev))
            if title and title[-1] not in '!?':
-                title += '.'
+                title += '.'    # terminate sentence.
            headers[lev] = title
            # drop previous headers
            for i in headers.keys():
                if i > lev:
                    del headers[i]
            emptySection = True
+            listLevel = []
            continue
        # Handle page title
-        if line.startswith('++'):
+        elif line.startswith('++'):
            title = line[2:-2]
            if title:
                if title[-1] not in '!?':
@ -2153,8 +2155,10 @@ def compact(text):
        # handle lists
        elif line[0] in '*#;:':
            i = 0
+            # c: current level char
+            # n: next level char
            for c, n in izip_longest(listLevel, line, fillvalue=''):
-                if not n or n not in '*#;:':
+                if not n or n not in '*#;:': # shorter or different
                    if c:
                        if Extractor.toHTML:
                            page.append(listClose[c])
@ -2183,6 +2187,7 @@ def compact(text):
                    bullet = '1. ' if n == '#' else '- '
                    page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line)
        elif len(listLevel):
+            page.append(line)
            if Extractor.toHTML:
                for c in reversed(listLevel):
                    page.append(listClose[c])
@ -2204,11 +2209,9 @@ def compact(text):
            page.append(line)  # first line
            emptySection = False
        elif not emptySection:
-            page.append(line)
-            # dangerous
-            # # Drop preformatted
-            # elif line[0] == ' ':
-            #     continue
+            # Drop preformatted
+            if line[0] != ' ':  # dangerous
+                page.append(line)

    return page