See ChangeLog.

2015-04-19 13:17:48 +02:00 · 2015-04-19 13:17:48 +02:00 · 66caade3ad
commit 66caade3ad
parent 858670beeb
2 changed files with 23 additions and 8 deletions
--- a/3
+++ b/3
@@ -4,6 +4,9 @@
 	(discardElements): added div.
 	(compact): avoid duplicated header line with option --sections.
 	(compact): skip empty list items.
+	(main): changed logger format.
+	(ignoredTags): added abbr.
+	(clean): handle extension SyntaxHighlight.

 2015-04-18  Giuseppe Attardi  <attardi@di.unipi.it>

--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # =============================================================================
-#  Version: 2.22 (Apr 19, 2015)
+#  Version: 2.23 (Apr 19, 2015)
 #  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
 #	   Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
 #
@@ -61,7 +61,7 @@ import Queue, threading, multiprocessing
 #===========================================================================

 # Program version
-version = '2.22'
+version = '2.23'

 ### PARAMS ####################################################################

@@ -151,7 +151,7 @@ selfClosingTags = [ 'br', 'hr', 'nobr', 'ref', 'references', 'nowiki' ]
 # These tags are dropped, keeping their content.
 # handle 'a' separately, depending on keepLinks
 ignoredTags = [
-    'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em',
+    'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em',
    'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki',
    'p', 'plaintext', 's', 'span', 'strike', 'strong',
    'sub', 'sup', 'tt', 'u', 'var'
@@ -1412,6 +1412,8 @@ def make_anchor_tag(link, trail):
 # match tail after wikilink
 tailRE = re.compile('\w*')

+syntaxhighlight = re.compile('&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlight&gt;', re.DOTALL)
+
 expand_templates = True

 def clean(extractor, text):
@@ -1473,10 +1475,14 @@ def clean(extractor, text):

    ################ Process HTML ###############

-    # turn into HTML
-    text = unescape(text)
-    # do it again (&amp;nbsp;)
-    text = unescape(text)
+    # turn into HTML, except for the content of <syntaxhighlight>
+    res = ''
+    cur = 0
+    for m in syntaxhighlight.finditer(text):
+        end = m.end()
+        res += unescape(text[cur:m.start()]) + m.group(1)
+        cur = end
+    text = res

    # Collect spans

@@ -1504,6 +1510,9 @@ def clean(extractor, text):
    for tag in discardElements:
        text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)

+    # Turn into HTML what is left (&amp;nbsp;) and <syntaxhighlight>
+    text = unescape(text)
+
    # Expand placeholders
    for pattern, placeholder in placeholder_tag_patterns:
        index = 1
@@ -1731,7 +1740,7 @@ def load_templates(file, output_file=None):
            page = []
            articles += 1
            if articles % 10000 == 0:
-                logging.info("Preprocessed: %d pages" % articles)
+                logging.info("Preprocessed %d pages" % articles)

 def process_dump(input_file, template_file, outdir, file_size, file_compress, threads):
    """
@@ -1942,6 +1951,9 @@ def main():
    if args.namespaces:
        acceptedNamespaces = set(args.ns.split(','))

+    FORMAT = '%(levelname)s: %(message)s'
+    logging.basicConfig(format=FORMAT)
+
    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)