Added new flags

--discard_elements - allowing you to customise which elements are discarded
--ignored_tags - allowing you to customise which tags are ignored
--keep_tables - allows the contents of the tables in the original to articel to be retained. This does not render html tables
This commit is contained in:
Nathan Davies 2017-01-23 14:18:21 +00:00
parent b5d97f64f9
commit e835e8c004
4 changed files with 69 additions and 41 deletions

3
.gitignore vendored
View File

@ -86,4 +86,5 @@ ENV/
# Spyder project settings
.spyderproject
# Go files used for local testing
*.go

View File

@ -87,6 +87,12 @@ Each file will contains several documents in this [document format](http://media
--filter_disambig_pages
Remove pages from output that contain disabmiguation
markup (default=False)
-it, --ignored_tags
comma separated list of tags that will be dropped, keeping their content
-de, --discard_elements
comma separated list of elements that will be removed from the article text
--keep_tables
Preserve tables in the output article text (default=False)
Special:
-q, --quiet suppress reporting progress info

View File

@ -117,18 +117,6 @@ moduleNamespace = ''
acceptedNamespaces = ['w', 'wiktionary', 'wikt']
##
# Drop these elements from article text
#
discardElements = [
'gallery', 'timeline', 'noinclude', 'pre',
'table', 'tr', 'td', 'th', 'caption', 'div',
'form', 'input', 'select', 'option', 'textarea',
'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
'ref', 'references', 'img', 'imagemap', 'source', 'small',
'sub', 'sup', 'indicator'
]
# This is obtained from <siteinfo>
urlbase = ''
@ -137,6 +125,10 @@ urlbase = ''
filter_disambig_pages = False
filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}")
##
# Drop tables from the article
keep_tables = False
##
# page filtering logic -- remove templates, undesired xml namespaces, and disambiguation pages
def keepPage(ns, page):
@ -192,15 +184,6 @@ def get_url(uid):
selfClosingTags = ('br', 'hr', 'nobr', 'ref', 'references', 'nowiki')
# These tags are dropped, keeping their content.
# handle 'a' separately, depending on keepLinks
ignoredTags = (
'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
'p', 'plaintext', 's', 'span', 'strike', 'strong',
'tt', 'u', 'var'
)
placeholder_tags = {'math': 'formula', 'code': 'codice'}
@ -285,10 +268,6 @@ def ignoreTag(tag):
right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
ignored_tag_patterns.append((left, right))
for tag in ignoredTags:
ignoreTag(tag)
# Match selfClosing HTML tags
selfClosing_tag_patterns = [
re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags
@ -546,8 +525,11 @@ class Extractor(object):
#
text = self.transform(text)
text = self.wiki2text(text)
text = compact(self.clean(text))
# the text is still present
# NOTE: Something in the combination of clean and compact is dropping the tables
# If we remove these calls thant the data is there but the odd wikiml formatting type characters remain
text = self.clean(text)
text = compact(text)
footer = "\n</doc>\n"
if sum(len(line) for line in text) < Extractor.min_text_length:
return
@ -584,7 +566,7 @@ class Extractor(object):
res += self.transform1(wikitext[cur:])
return res
def transform1(self, text):
"""Transform text not containing <nowiki>"""
if Extractor.expand_templates:
@ -612,8 +594,9 @@ class Extractor(object):
# Drop tables
# first drop residual templates, or else empty parameter |} might look like end of table.
text = dropNested(text, r'{{', r'}}')
text = dropNested(text, r'{\|', r'\|}')
if not keep_tables:
text = dropNested(text, r'{{', r'}}')
text = dropNested(text, r'{\|', r'\|}')
# Handle bold/italic/quote
if self.toHTML:
@ -647,7 +630,6 @@ class Extractor(object):
res += unescape(text[cur:m.start()]) + m.group(1)
cur = m.end()
text = res + unescape(text[cur:])
return text
@ -705,6 +687,10 @@ class Extractor(object):
text = re.sub('(\[\(«) ', r'\1', text)
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
text = text.replace(',,', ',').replace(',.', '.')
if keep_tables:
text = re.sub(r'!(?:\s)?style=\"width:(?:\d+)%;\"', r'', text)
text = text.replace('|-', '')
text = text.replace('|', '')
if Extractor.toHTML:
text = cgi.escape(text)
return text
@ -878,7 +864,6 @@ class Extractor(object):
return ''
logging.debug('%*sEXPAND %s', self.frame.depth, '', body)
parts = splitParts(body)
# title is the portion before the first |
title = parts[0].strip()
@ -1043,6 +1028,7 @@ def splitParts(paramsList):
sep = '|'
parameters = []
cur = 0
for s, e in findMatchingBraces(paramsList):
par = paramsList[cur:s].split(sep)
if par:
@ -1327,7 +1313,7 @@ def string_find(args):
return source.find(pattern, start) + 1 # lua is 1-based
else:
return (re.compile(pattern).search(source, start) or -1) + 1
# ----------------------------------------------------------------------
# Module:Roman
# http://en.wikipedia.org/w/index.php?title=Module:Roman
@ -1337,11 +1323,11 @@ def string_find(args):
def roman_main(args):
"""Convert first arg to roman numeral if <= 5000 else :return: second arg."""
num = int(float(args.get('1')))
# Return a message for numbers too big to be expressed in Roman numerals.
if 0 > num or num >= 5000:
return args.get('2', 'N/A')
def toRoman(n, romanNumeralMap):
"""convert integer to Roman numeral"""
result = ""
@ -1356,7 +1342,7 @@ def roman_main(args):
(1000, "M"),
(900, "CM"), (500, "D"), (400, "CD"), (100, "C"),
(90, "XC"), (50, "L"), (40, "XL"), (10, "X"),
(9, "IX"), (5, "V"), (4, "IV"), (1, "I")
(9, "IX"), (5, "V"), (4, "IV"), (1, "I")
)
return toRoman(num, smallRomans)
@ -2489,7 +2475,6 @@ def compact(text):
# Drop preformatted
if line[0] != ' ': # dangerous
page.append(line)
return page
@ -2885,7 +2870,7 @@ def reduce_process(output_queue, spool_length,
output = sys.stdout if PY2 else sys.stdout.buffer
if file_compress:
logging.warn("writing to stdout, so no output compression (use an external tool)")
interval_start = default_timer()
# FIXME: use a heap
spool = {} # collected pages
@ -2926,8 +2911,9 @@ def reduce_process(output_queue, spool_length,
minFileSize = 200 * 1024
def main():
global urlbase, acceptedNamespaces, filter_disambig_pages
global urlbase, acceptedNamespaces, filter_disambig_pages, keep_tables
global templateCache
global discardElements
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
@ -2964,6 +2950,12 @@ def main():
help="Minimum expanded text length required to write document (default=%(default)s)")
groupP.add_argument("--filter_disambig_pages", action="store_true", default=filter_disambig_pages,
help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
help="comma separated list of tags that will be dropped, keeping their content")
groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
help="comma separated list of elements that will be removed from the article text")
groupP.add_argument("--keep_tables", action="store_true", default=keep_tables,
help="Preserve tables in the output article text (default=%(default)s)")
default_process_count = max(1, cpu_count() - 1)
parser.add_argument("--processes", type=int, default=default_process_count,
help="Number of processes to use (default %(default)s)")
@ -2992,6 +2984,7 @@ def main():
Extractor.expand_templates = args.no_templates
filter_disambig_pages = args.filter_disambig_pages
keep_tables = args.keep_tables
try:
power = 'kmg'.find(args.bytes[-1].lower()) + 1
@ -3005,6 +2998,33 @@ def main():
if args.namespaces:
acceptedNamespaces = set(args.namespaces.split(','))
# ignoredTags and discardElemets have default values already supplied, if passed in the defaults are overwritten
if args.ignored_tags:
ignoredTags = set(args.ignored_tags.split(','))
else:
ignoredTags = [
'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
'p', 'plaintext', 's', 'span', 'strike', 'strong',
'tt', 'u', 'var'
]
# 'a' tag is handled separately
for tag in ignoredTags:
ignoreTag(tag)
if args.discard_elements:
discardElements = set(args.discard_elements.split(','))
else:
discardElements = [
'gallery', 'timeline', 'noinclude', 'pre',
'table', 'tr', 'td', 'th', 'caption', 'div',
'form', 'input', 'select', 'option', 'textarea',
'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
'ref', 'references', 'img', 'imagemap', 'source', 'small',
'sub', 'sup', 'indicator'
]
FORMAT = '%(levelname)s: %(message)s'
logging.basicConfig(format=FORMAT)
@ -3014,6 +3034,7 @@ def main():
if args.debug:
logger.setLevel(logging.DEBUG)
input_file = args.input
if not Extractor.keepLinks:

View File

@ -11,7 +11,7 @@ setup(
'database dump',
author='Giuseppe Attardi',
author_email='attardi@di.unipi.it',
version='2.66',
version='2.69',
url='https://github.com/attardi/wikiextractor',