See ChangeLog.

This commit is contained in:
attardi 2016-02-12 23:31:21 +01:00
parent b04760ecd8
commit 834cad6a35
3 changed files with 32 additions and 25 deletions

View File

@ -3,6 +3,9 @@
* WikiExtractor.py (reduce_process): moved here creation of OutputSplitter.
(compact): Extractor.keepLists allows preserving lists in output.
(main): added new option --lists for preserving lists in output.
(compact): reset listLevel when entering a new section.
(ignoredTags): removed 'div' from here, since it is in discardedTags.
(ignoredTags): moved 'sub' and 'sup' to discardedTags.
2016-02-11 Giuseppe Attardi <attardi@di.unipi.it>

View File

@ -57,6 +57,7 @@ Each file will contain several documents in this [document format](http://media
Processing:
--html produce HTML output, subsumes --links
-l, --links preserve links
--lists preserve lists
-ns ns1,ns2, --namespaces ns1,ns2
accepted namespaces
--templates TEMPLATES

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# =============================================================================
# Version: 2.46 (February 11, 2016)
# Version: 2.48 (February 12, 2016)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# Contributors:
@ -66,7 +66,7 @@ from timeit import default_timer
# ===========================================================================
# Program version
version = '2.46'
version = '2.48'
## PARAMS ####################################################################
@ -106,7 +106,8 @@ discardElements = [
'table', 'tr', 'td', 'th', 'caption', 'div',
'form', 'input', 'select', 'option', 'textarea',
'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
'ref', 'references', 'img', 'imagemap', 'source', 'small'
'ref', 'references', 'img', 'imagemap', 'source', 'small',
'sub', 'sup'
]
# This is obtained from <siteinfo>
@ -158,10 +159,10 @@ selfClosingTags = ('br', 'hr', 'nobr', 'ref', 'references', 'nowiki')
# These tags are dropped, keeping their content.
# handle 'a' separately, depending on keepLinks
ignoredTags = (
'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em',
'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki',
'p', 'plaintext', 's', 'span', 'strike', 'strong',
'sub', 'sup', 'tt', 'u', 'var'
'tt', 'u', 'var'
)
placeholder_tags = {'math': 'formula', 'code': 'codice'}
@ -1514,8 +1515,8 @@ def dropNested(text, openDelim, closeDelim):
openRE = re.compile(openDelim, re.IGNORECASE)
closeRE = re.compile(closeDelim, re.IGNORECASE)
# partition text in separate blocks { } { }
spans = [] # pairs (s, e) for each partition
nest = 0 # nesting level
spans = [] # pairs (s, e) for each partition
nest = 0 # nesting level
start = openRE.search(text, 0)
if not start:
return text
@ -1523,8 +1524,8 @@ def dropNested(text, openDelim, closeDelim):
next = start
while end:
next = openRE.search(text, next.end())
if not next: # termination
while nest: # close all pending
if not next: # termination
while nest: # close all pending
nest -= 1
end0 = closeRE.search(text, end.end())
if end0:
@ -1540,7 +1541,7 @@ def dropNested(text, openDelim, closeDelim):
# try closing more
last = end.end()
end = closeRE.search(text, end.end())
if not end: # unbalanced
if not end: # unbalanced
if spans:
span = (spans[0][0], last)
else:
@ -1552,7 +1553,7 @@ def dropNested(text, openDelim, closeDelim):
# advance start, find next close
start = next
end = closeRE.search(text, next.end())
break # { }
break # { }
if next != start:
# { { }
nest += 1
@ -1568,7 +1569,7 @@ def dropSpans(spans, text):
res = ''
offset = 0
for s, e in spans:
if offset <= s: # handle nesting
if offset <= s: # handle nesting
if offset < s:
res += text[offset:s]
offset = e
@ -2111,13 +2112,13 @@ listItem = {'*': '<li>%s</li>', '#': '<li>%s</<li>', ';': '<dt>%s</dt>',
def compact(text):
"""Deal with headers, lists, empty sections, residuals of tables.
:param text: convert to HTML
:param text: convert to HTML.
"""
page = [] # list of paragraph
headers = {} # Headers for unfilled sections
page = [] # list of paragraph
headers = {} # Headers for unfilled sections
emptySection = False # empty sections are discarded
listLevel = [] # nesting of lists
listLevel = [] # nesting of lists
for line in text.split('\n'):
@ -2127,20 +2128,21 @@ def compact(text):
m = section.match(line)
if m:
title = m.group(2)
lev = len(m.group(1))
lev = len(m.group(1)) # header level
if Extractor.toHTML:
page.append("<h%d>%s</h%d>" % (lev, title, lev))
if title and title[-1] not in '!?':
title += '.'
title += '.' # terminate sentence.
headers[lev] = title
# drop previous headers
for i in headers.keys():
if i > lev:
del headers[i]
emptySection = True
listLevel = []
continue
# Handle page title
if line.startswith('++'):
elif line.startswith('++'):
title = line[2:-2]
if title:
if title[-1] not in '!?':
@ -2153,8 +2155,10 @@ def compact(text):
# handle lists
elif line[0] in '*#;:':
i = 0
# c: current level char
# n: next level char
for c, n in izip_longest(listLevel, line, fillvalue=''):
if not n or n not in '*#;:':
if not n or n not in '*#;:': # shorter or different
if c:
if Extractor.toHTML:
page.append(listClose[c])
@ -2183,6 +2187,7 @@ def compact(text):
bullet = '1. ' if n == '#' else '- '
page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line)
elif len(listLevel):
page.append(line)
if Extractor.toHTML:
for c in reversed(listLevel):
page.append(listClose[c])
@ -2204,11 +2209,9 @@ def compact(text):
page.append(line) # first line
emptySection = False
elif not emptySection:
page.append(line)
# dangerous
# # Drop preformatted
# elif line[0] == ' ':
# continue
# Drop preformatted
if line[0] != ' ': # dangerous
page.append(line)
return page