Added new flags
--discard_elements - allowing you to customise which elements are discarded --ignored_tags - allowing you to customise which tags are ignored --keep_tables - allows the contents of the tables in the original to articel to be retained. This does not render html tables
This commit is contained in:
parent
b5d97f64f9
commit
e835e8c004
3
.gitignore
vendored
3
.gitignore
vendored
@ -86,4 +86,5 @@ ENV/
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
|
||||
|
||||
# Go files used for local testing
|
||||
*.go
|
||||
|
@ -87,6 +87,12 @@ Each file will contains several documents in this [document format](http://media
|
||||
--filter_disambig_pages
|
||||
Remove pages from output that contain disabmiguation
|
||||
markup (default=False)
|
||||
-it, --ignored_tags
|
||||
comma separated list of tags that will be dropped, keeping their content
|
||||
-de, --discard_elements
|
||||
comma separated list of elements that will be removed from the article text
|
||||
--keep_tables
|
||||
Preserve tables in the output article text (default=False)
|
||||
|
||||
Special:
|
||||
-q, --quiet suppress reporting progress info
|
||||
|
@ -117,18 +117,6 @@ moduleNamespace = ''
|
||||
acceptedNamespaces = ['w', 'wiktionary', 'wikt']
|
||||
|
||||
|
||||
##
|
||||
# Drop these elements from article text
|
||||
#
|
||||
discardElements = [
|
||||
'gallery', 'timeline', 'noinclude', 'pre',
|
||||
'table', 'tr', 'td', 'th', 'caption', 'div',
|
||||
'form', 'input', 'select', 'option', 'textarea',
|
||||
'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
|
||||
'ref', 'references', 'img', 'imagemap', 'source', 'small',
|
||||
'sub', 'sup', 'indicator'
|
||||
]
|
||||
|
||||
# This is obtained from <siteinfo>
|
||||
urlbase = ''
|
||||
|
||||
@ -137,6 +125,10 @@ urlbase = ''
|
||||
filter_disambig_pages = False
|
||||
filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}")
|
||||
|
||||
##
|
||||
# Drop tables from the article
|
||||
keep_tables = False
|
||||
|
||||
##
|
||||
# page filtering logic -- remove templates, undesired xml namespaces, and disambiguation pages
|
||||
def keepPage(ns, page):
|
||||
@ -192,15 +184,6 @@ def get_url(uid):
|
||||
|
||||
selfClosingTags = ('br', 'hr', 'nobr', 'ref', 'references', 'nowiki')
|
||||
|
||||
# These tags are dropped, keeping their content.
|
||||
# handle 'a' separately, depending on keepLinks
|
||||
ignoredTags = (
|
||||
'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
|
||||
'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
|
||||
'p', 'plaintext', 's', 'span', 'strike', 'strong',
|
||||
'tt', 'u', 'var'
|
||||
)
|
||||
|
||||
placeholder_tags = {'math': 'formula', 'code': 'codice'}
|
||||
|
||||
|
||||
@ -285,10 +268,6 @@ def ignoreTag(tag):
|
||||
right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
|
||||
ignored_tag_patterns.append((left, right))
|
||||
|
||||
|
||||
for tag in ignoredTags:
|
||||
ignoreTag(tag)
|
||||
|
||||
# Match selfClosing HTML tags
|
||||
selfClosing_tag_patterns = [
|
||||
re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags
|
||||
@ -546,8 +525,11 @@ class Extractor(object):
|
||||
#
|
||||
text = self.transform(text)
|
||||
text = self.wiki2text(text)
|
||||
|
||||
text = compact(self.clean(text))
|
||||
# the text is still present
|
||||
# NOTE: Something in the combination of clean and compact is dropping the tables
|
||||
# If we remove these calls thant the data is there but the odd wikiml formatting type characters remain
|
||||
text = self.clean(text)
|
||||
text = compact(text)
|
||||
footer = "\n</doc>\n"
|
||||
if sum(len(line) for line in text) < Extractor.min_text_length:
|
||||
return
|
||||
@ -584,7 +566,7 @@ class Extractor(object):
|
||||
res += self.transform1(wikitext[cur:])
|
||||
return res
|
||||
|
||||
|
||||
|
||||
def transform1(self, text):
|
||||
"""Transform text not containing <nowiki>"""
|
||||
if Extractor.expand_templates:
|
||||
@ -612,8 +594,9 @@ class Extractor(object):
|
||||
|
||||
# Drop tables
|
||||
# first drop residual templates, or else empty parameter |} might look like end of table.
|
||||
text = dropNested(text, r'{{', r'}}')
|
||||
text = dropNested(text, r'{\|', r'\|}')
|
||||
if not keep_tables:
|
||||
text = dropNested(text, r'{{', r'}}')
|
||||
text = dropNested(text, r'{\|', r'\|}')
|
||||
|
||||
# Handle bold/italic/quote
|
||||
if self.toHTML:
|
||||
@ -647,7 +630,6 @@ class Extractor(object):
|
||||
res += unescape(text[cur:m.start()]) + m.group(1)
|
||||
cur = m.end()
|
||||
text = res + unescape(text[cur:])
|
||||
|
||||
return text
|
||||
|
||||
|
||||
@ -705,6 +687,10 @@ class Extractor(object):
|
||||
text = re.sub('(\[\(«) ', r'\1', text)
|
||||
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
||||
text = text.replace(',,', ',').replace(',.', '.')
|
||||
if keep_tables:
|
||||
text = re.sub(r'!(?:\s)?style=\"width:(?:\d+)%;\"', r'', text)
|
||||
text = text.replace('|-', '')
|
||||
text = text.replace('|', '')
|
||||
if Extractor.toHTML:
|
||||
text = cgi.escape(text)
|
||||
return text
|
||||
@ -878,7 +864,6 @@ class Extractor(object):
|
||||
return ''
|
||||
|
||||
logging.debug('%*sEXPAND %s', self.frame.depth, '', body)
|
||||
|
||||
parts = splitParts(body)
|
||||
# title is the portion before the first |
|
||||
title = parts[0].strip()
|
||||
@ -1043,6 +1028,7 @@ def splitParts(paramsList):
|
||||
sep = '|'
|
||||
parameters = []
|
||||
cur = 0
|
||||
|
||||
for s, e in findMatchingBraces(paramsList):
|
||||
par = paramsList[cur:s].split(sep)
|
||||
if par:
|
||||
@ -1327,7 +1313,7 @@ def string_find(args):
|
||||
return source.find(pattern, start) + 1 # lua is 1-based
|
||||
else:
|
||||
return (re.compile(pattern).search(source, start) or -1) + 1
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Module:Roman
|
||||
# http://en.wikipedia.org/w/index.php?title=Module:Roman
|
||||
@ -1337,11 +1323,11 @@ def string_find(args):
|
||||
def roman_main(args):
|
||||
"""Convert first arg to roman numeral if <= 5000 else :return: second arg."""
|
||||
num = int(float(args.get('1')))
|
||||
|
||||
|
||||
# Return a message for numbers too big to be expressed in Roman numerals.
|
||||
if 0 > num or num >= 5000:
|
||||
return args.get('2', 'N/A')
|
||||
|
||||
|
||||
def toRoman(n, romanNumeralMap):
|
||||
"""convert integer to Roman numeral"""
|
||||
result = ""
|
||||
@ -1356,7 +1342,7 @@ def roman_main(args):
|
||||
(1000, "M"),
|
||||
(900, "CM"), (500, "D"), (400, "CD"), (100, "C"),
|
||||
(90, "XC"), (50, "L"), (40, "XL"), (10, "X"),
|
||||
(9, "IX"), (5, "V"), (4, "IV"), (1, "I")
|
||||
(9, "IX"), (5, "V"), (4, "IV"), (1, "I")
|
||||
)
|
||||
return toRoman(num, smallRomans)
|
||||
|
||||
@ -2489,7 +2475,6 @@ def compact(text):
|
||||
# Drop preformatted
|
||||
if line[0] != ' ': # dangerous
|
||||
page.append(line)
|
||||
|
||||
return page
|
||||
|
||||
|
||||
@ -2885,7 +2870,7 @@ def reduce_process(output_queue, spool_length,
|
||||
output = sys.stdout if PY2 else sys.stdout.buffer
|
||||
if file_compress:
|
||||
logging.warn("writing to stdout, so no output compression (use an external tool)")
|
||||
|
||||
|
||||
interval_start = default_timer()
|
||||
# FIXME: use a heap
|
||||
spool = {} # collected pages
|
||||
@ -2926,8 +2911,9 @@ def reduce_process(output_queue, spool_length,
|
||||
minFileSize = 200 * 1024
|
||||
|
||||
def main():
|
||||
global urlbase, acceptedNamespaces, filter_disambig_pages
|
||||
global urlbase, acceptedNamespaces, filter_disambig_pages, keep_tables
|
||||
global templateCache
|
||||
global discardElements
|
||||
|
||||
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
@ -2964,6 +2950,12 @@ def main():
|
||||
help="Minimum expanded text length required to write document (default=%(default)s)")
|
||||
groupP.add_argument("--filter_disambig_pages", action="store_true", default=filter_disambig_pages,
|
||||
help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
|
||||
groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
|
||||
help="comma separated list of tags that will be dropped, keeping their content")
|
||||
groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
|
||||
help="comma separated list of elements that will be removed from the article text")
|
||||
groupP.add_argument("--keep_tables", action="store_true", default=keep_tables,
|
||||
help="Preserve tables in the output article text (default=%(default)s)")
|
||||
default_process_count = max(1, cpu_count() - 1)
|
||||
parser.add_argument("--processes", type=int, default=default_process_count,
|
||||
help="Number of processes to use (default %(default)s)")
|
||||
@ -2992,6 +2984,7 @@ def main():
|
||||
|
||||
Extractor.expand_templates = args.no_templates
|
||||
filter_disambig_pages = args.filter_disambig_pages
|
||||
keep_tables = args.keep_tables
|
||||
|
||||
try:
|
||||
power = 'kmg'.find(args.bytes[-1].lower()) + 1
|
||||
@ -3005,6 +2998,33 @@ def main():
|
||||
if args.namespaces:
|
||||
acceptedNamespaces = set(args.namespaces.split(','))
|
||||
|
||||
# ignoredTags and discardElemets have default values already supplied, if passed in the defaults are overwritten
|
||||
if args.ignored_tags:
|
||||
ignoredTags = set(args.ignored_tags.split(','))
|
||||
else:
|
||||
ignoredTags = [
|
||||
'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
|
||||
'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
|
||||
'p', 'plaintext', 's', 'span', 'strike', 'strong',
|
||||
'tt', 'u', 'var'
|
||||
]
|
||||
|
||||
# 'a' tag is handled separately
|
||||
for tag in ignoredTags:
|
||||
ignoreTag(tag)
|
||||
|
||||
if args.discard_elements:
|
||||
discardElements = set(args.discard_elements.split(','))
|
||||
else:
|
||||
discardElements = [
|
||||
'gallery', 'timeline', 'noinclude', 'pre',
|
||||
'table', 'tr', 'td', 'th', 'caption', 'div',
|
||||
'form', 'input', 'select', 'option', 'textarea',
|
||||
'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
|
||||
'ref', 'references', 'img', 'imagemap', 'source', 'small',
|
||||
'sub', 'sup', 'indicator'
|
||||
]
|
||||
|
||||
FORMAT = '%(levelname)s: %(message)s'
|
||||
logging.basicConfig(format=FORMAT)
|
||||
|
||||
@ -3014,6 +3034,7 @@ def main():
|
||||
if args.debug:
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
input_file = args.input
|
||||
|
||||
if not Extractor.keepLinks:
|
||||
|
Loading…
Reference in New Issue
Block a user