Added new flags

--discard_elements - allows you to customise which elements are discarded; --ignored_tags - allows you to customise which tags are ignored; --keep_tables - allows the contents of the tables in the original article to be retained. This does not render HTML tables.
2017-01-23 14:18:21 +00:00 · 2017-01-23 14:18:21 +00:00 · e835e8c004
commit e835e8c004
parent b5d97f64f9
4 changed files with 69 additions and 41 deletions
--- a/.gitignore
+++ b/.gitignore
@ -86,4 +86,5 @@ ENV/
 # Spyder project settings
 .spyderproject

-
+# Go files used for local testing
+*.go
--- a/README.md
+++ b/README.md
@ -87,6 +87,12 @@ Each file will contains several documents in this [document format](http://media
      --filter_disambig_pages
                            Remove pages from output that contain disabmiguation
                            markup (default=False)
+      -it, --ignored_tags
+                            comma separated list of tags that will be dropped, keeping their content
+      -de, --discard_elements
+                            comma separated list of elements that will be removed from the article text
+      --keep_tables
+                            Preserve tables in the output article text (default=False)

    Special:
      -q, --quiet           suppress reporting progress info
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@ -117,18 +117,6 @@ moduleNamespace = ''
 acceptedNamespaces = ['w', 'wiktionary', 'wikt']


-##
-# Drop these elements from article text
-#
-discardElements = [
-    'gallery', 'timeline', 'noinclude', 'pre',
-    'table', 'tr', 'td', 'th', 'caption', 'div',
-    'form', 'input', 'select', 'option', 'textarea',
-    'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
-    'ref', 'references', 'img', 'imagemap', 'source', 'small',
-    'sub', 'sup', 'indicator'
-]
-
 # This is obtained from <siteinfo>
 urlbase = ''

@ -137,6 +125,10 @@ urlbase = ''
 filter_disambig_pages = False
 filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}")

+##
+# Keep tables in the article text (tables are dropped when this is False)
+keep_tables = False
+
 ##
 # page filtering logic -- remove templates, undesired xml namespaces, and disambiguation pages
 def keepPage(ns, page):
@ -192,15 +184,6 @@ def get_url(uid):

 selfClosingTags = ('br', 'hr', 'nobr', 'ref', 'references', 'nowiki')

-# These tags are dropped, keeping their content.
-# handle 'a' separately, depending on keepLinks
-ignoredTags = (
-    'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
-    'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
-    'p', 'plaintext', 's', 'span', 'strike', 'strong',
-    'tt', 'u', 'var'
-)
-
 placeholder_tags = {'math': 'formula', 'code': 'codice'}


@ -285,10 +268,6 @@ def ignoreTag(tag):
    right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
    ignored_tag_patterns.append((left, right))

-
-for tag in ignoredTags:
-    ignoreTag(tag)
-
 # Match selfClosing HTML tags
 selfClosing_tag_patterns = [
    re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags
@ -546,8 +525,11 @@ class Extractor(object):
        #
        text = self.transform(text)
        text = self.wiki2text(text)
-
-        text = compact(self.clean(text))
+        # the text is still present
+        # NOTE: Something in the combination of clean and compact is dropping the tables
+        # If we remove these calls then the data is there, but the odd wikiml formatting characters remain
+        text = self.clean(text)
+        text = compact(text)
        footer = "\n</doc>\n"
        if sum(len(line) for line in text) < Extractor.min_text_length:
            return
@ -584,7 +566,7 @@ class Extractor(object):
        res += self.transform1(wikitext[cur:])
        return res

-        
+
    def transform1(self, text):
        """Transform text not containing <nowiki>"""
        if Extractor.expand_templates:
@ -612,8 +594,9 @@ class Extractor(object):

        # Drop tables
        # first drop residual templates, or else empty parameter |} might look like end of table.
-        text = dropNested(text, r'{{', r'}}')
-        text = dropNested(text, r'{\|', r'\|}')
+        if not keep_tables:
+            text = dropNested(text, r'{{', r'}}')
+            text = dropNested(text, r'{\|', r'\|}')

        # Handle bold/italic/quote
        if self.toHTML:
@ -647,7 +630,6 @@ class Extractor(object):
            res += unescape(text[cur:m.start()]) + m.group(1)
            cur = m.end()
        text = res + unescape(text[cur:])
-
        return text


@ -705,6 +687,10 @@ class Extractor(object):
        text = re.sub('(\[\(«) ', r'\1', text)
        text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
        text = text.replace(',,', ',').replace(',.', '.')
+        if keep_tables:
+            text = re.sub(r'!(?:\s)?style=\"width:(?:\d+)%;\"', r'', text)
+            text = text.replace('|-', '')
+            text = text.replace('|', '')
        if Extractor.toHTML:
            text = cgi.escape(text)
        return text
@ -878,7 +864,6 @@ class Extractor(object):
            return ''

        logging.debug('%*sEXPAND %s', self.frame.depth, '', body)
-
        parts = splitParts(body)
        # title is the portion before the first |
        title = parts[0].strip()
@ -1043,6 +1028,7 @@ def splitParts(paramsList):
    sep = '|'
    parameters = []
    cur = 0
+
    for s, e in findMatchingBraces(paramsList):
        par = paramsList[cur:s].split(sep)
        if par:
@ -1327,7 +1313,7 @@ def string_find(args):
        return source.find(pattern, start) + 1 # lua is 1-based
    else:
        return (re.compile(pattern).search(source, start) or -1) + 1
-        
+
 # ----------------------------------------------------------------------
 # Module:Roman
 # http://en.wikipedia.org/w/index.php?title=Module:Roman
@ -1337,11 +1323,11 @@ def string_find(args):
 def roman_main(args):
    """Convert first arg to roman numeral if <= 5000 else :return: second arg."""
    num = int(float(args.get('1')))
- 
+
    # Return a message for numbers too big to be expressed in Roman numerals.
    if 0 > num or num >= 5000:
        return args.get('2', 'N/A')
- 
+
    def toRoman(n, romanNumeralMap):
        """convert integer to Roman numeral"""
        result = ""
@ -1356,7 +1342,7 @@ def roman_main(args):
        (1000, "M"),
        (900, "CM"), (500, "D"), (400, "CD"), (100, "C"),
        (90, "XC"), (50, "L"), (40, "XL"), (10, "X"),
-        (9, "IX"), (5, "V"), (4, "IV"), (1, "I") 
+        (9, "IX"), (5, "V"), (4, "IV"), (1, "I")
    )
    return toRoman(num, smallRomans)

@ -2489,7 +2475,6 @@ def compact(text):
            # Drop preformatted
            if line[0] != ' ':  # dangerous
                page.append(line)
-
    return page


@ -2885,7 +2870,7 @@ def reduce_process(output_queue, spool_length,
        output = sys.stdout if PY2 else sys.stdout.buffer
        if file_compress:
            logging.warn("writing to stdout, so no output compression (use an external tool)")
-    
+
    interval_start = default_timer()
    # FIXME: use a heap
    spool = {}        # collected pages
@ -2926,8 +2911,9 @@ def reduce_process(output_queue, spool_length,
 minFileSize = 200 * 1024

 def main():
-    global urlbase, acceptedNamespaces, filter_disambig_pages
+    global urlbase, acceptedNamespaces, filter_disambig_pages, keep_tables
    global templateCache
+    global discardElements

    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
@ -2964,6 +2950,12 @@ def main():
                        help="Minimum expanded text length required to write document (default=%(default)s)")
    groupP.add_argument("--filter_disambig_pages", action="store_true", default=filter_disambig_pages,
                        help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
+    groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
+                        help="comma separated list of tags that will be dropped, keeping their content")
+    groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
+                        help="comma separated list of elements that will be removed from the article text")
+    groupP.add_argument("--keep_tables", action="store_true", default=keep_tables,
+                        help="Preserve tables in the output article text (default=%(default)s)")
    default_process_count = max(1, cpu_count() - 1)
    parser.add_argument("--processes", type=int, default=default_process_count,
                        help="Number of processes to use (default %(default)s)")
@ -2992,6 +2984,7 @@ def main():

    Extractor.expand_templates = args.no_templates
    filter_disambig_pages = args.filter_disambig_pages
+    keep_tables = args.keep_tables

    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
@ -3005,6 +2998,33 @@ def main():
    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

+    # ignoredTags and discardElements have default values; if a list is passed in, it replaces the defaults
+    if args.ignored_tags:
+        ignoredTags = set(args.ignored_tags.split(','))
+    else:
+        ignoredTags = [
+            'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
+            'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
+            'p', 'plaintext', 's', 'span', 'strike', 'strong',
+            'tt', 'u', 'var'
+        ]
+
+    # 'a' tag is handled separately
+    for tag in ignoredTags:
+        ignoreTag(tag)
+
+    if args.discard_elements:
+        discardElements = set(args.discard_elements.split(','))
+    else:
+        discardElements = [
+            'gallery', 'timeline', 'noinclude', 'pre',
+            'table', 'tr', 'td', 'th', 'caption', 'div',
+            'form', 'input', 'select', 'option', 'textarea',
+            'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
+            'ref', 'references', 'img', 'imagemap', 'source', 'small',
+            'sub', 'sup', 'indicator'
+        ]
+
    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

@ -3014,6 +3034,7 @@ def main():
    if args.debug:
        logger.setLevel(logging.DEBUG)

+
    input_file = args.input

    if not Extractor.keepLinks:
--- a/setup.py
+++ b/setup.py
@ -11,7 +11,7 @@ setup(
                'database dump',
    author='Giuseppe Attardi',
    author_email='attardi@di.unipi.it',
-    version='2.66',
+    version='2.69',

    url='https://github.com/attardi/wikiextractor',