Merged PR from Seth Cleveland.

attardi 2016-06-19 13:10:36 +02:00
parent f9b8e8ac02
commit 0f703c0aae
3 changed files with 127 additions and 29 deletions

ChangeLog

@@ -2,6 +2,12 @@
* WikiExtractor.py: support for Python 3 by orangain@gmail.com
2016-03-23 Seth Cleveland <scleveland@turnitin.com>
* WikiExtractor.py: added filtering options -- minimum text length, XML
namespace, and disambiguation pages. Added an option to print the
revision id with each document.
2016-03-23 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (compact): properly emit section headers when
Extractor.keepSections is set.

README.md

@@ -33,11 +33,25 @@ The script is invoked with a Wikipedia dump file as an argument.
The output is stored in several files of similar size in a given directory.
Each file will contain several documents in this [document format](http://medialab.di.unipi.it/wiki/Document_Format).
usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--html] [-l]
                        [-ns ns1,ns2] [-s] [--templates TEMPLATES]
                        [--no-templates] [--processes PROCESSES] [-q] [--debug]
                        [-a] [-v]
                        input
usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--html] [-l] [-s]
                        [--lists] [-ns ns1,ns2] [-xns ns1,ns2]
                        [--templates TEMPLATES] [--no-templates] [--escapedoc]
                        [-r] [--min_text_length MIN_TEXT_LENGTH]
                        [--filter_disambig_pages] [--processes PROCESSES] [-q]
                        [--debug] [-a] [-v]
                        input
Wikipedia Extractor:
Extracts and cleans text from a Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" revid="" url="" title="">
...
</doc>
Template expansion requires preprocessing the whole dump first and
collecting template definitions.
positional arguments:
  input                 XML wiki dump file
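For instance, a run that keeps revision ids, restricts extraction to the main
article namespace, drops disambiguation pages, and skips very short documents
could look like this (file names are illustrative):

    python WikiExtractor.py -o extracted -r -xns 0 --filter_disambig_pages \
        --min_text_length 100 enwiki-latest-pages-articles.xml.bz2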
@@ -48,7 +62,7 @@ Each file will contain several documents in this [document format](http://media
Output:
  -o OUTPUT, --output OUTPUT
                        a directory where to store the extracted files (or '-' for dumping to
                        directory for extracted files (or '-' for dumping to
                        stdout)
  -b n[KMG], --bytes n[KMG]
                        maximum bytes per output file (default 1M)
@@ -57,21 +71,33 @@ Each file will contain several documents in this [document format](http://media
Processing:
  --html                produce HTML output, subsumes --links
  -l, --links           preserve links
  -s, --sections        preserve sections
  --lists               preserve lists
  -ns ns1,ns2, --namespaces ns1,ns2
                        accepted namespaces
                        accepted link namespaces
  -xns ns1,ns2, --xml_namespaces ns1,ns2
                        accepted page xml namespaces -- 0 for main/articles
  --templates TEMPLATES
                        use or create file containing templates
  --no-templates        Do not expand templates
  --escapedoc           escape the contents of the output
                        <doc>...</doc>
  -r, --revision        Include the document revision id (default=False)
  --min_text_length MIN_TEXT_LENGTH
                        Minimum expanded text length required to write
                        document (default=0)
  --filter_disambig_pages
                        Remove pages from output that contain disambiguation
                        markup (default=False)

Special:
  -q, --quiet           suppress reporting progress info
  --debug               print debug info
  -a, --article         analyze a file containing a single article (debug option)
  -a, --article         analyze a file containing a single article (debug
                        option)
  -v, --version         print program version
Saving templates to a file will speed up extraction the next time, assuming
template definitions have not changed.
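For example, a first run can save the template definitions and later runs can
reuse them (paths are illustrative):

    python WikiExtractor.py --templates enwiki-templates.txt -o extracted \
        enwiki-latest-pages-articles.xml.bz2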

WikiExtractor.py

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# =============================================================================
# Version: 2.56 (June 19, 2016)
# Version: 2.57 (June 19, 2016)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# Contributors:
@@ -15,6 +15,7 @@
# Wim Muskee (wimmuskee@gmail.com)
# Radics Geza (radicsge@gmail.com)
# orangain (orangain@gmail.com)
# Seth Cleveland (scleveland@turnitin.com)
#
# =============================================================================
# Copyright (c) 2011-2016. Giuseppe Attardi (attardi@di.unipi.it).
@@ -39,7 +40,7 @@ Extracts and cleans text from a Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" url="" title="">
<doc id="" revid="" url="" title="">
...
</doc>
@@ -81,7 +82,7 @@ else:
# ===========================================================================
# Program version
version = '2.56'
version = '2.57'
## PARAMS ####################################################################
@@ -106,13 +107,22 @@ templatePrefix = ''
moduleNamespace = ''
##
# Recognize only these namespaces
# Recognize only these namespaces in links
# w: Internal links to the Wikipedia
# wiktionary: Wiki dictionary
# wikt: shortcut for Wiktionary
#
acceptedNamespaces = ['w', 'wiktionary', 'wikt']
##
# Desired XML namespaces to extract -- allXMLNamespaces (the empty set) means
# everything; use the integers from
# https://en.wikipedia.org/wiki/Wikipedia:Namespace to filter pages
allXMLNamespaces = set()
acceptedXMLNamespaces = allXMLNamespaces
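# Note: the empty set doubles as an "accept everything" sentinel; keepPage
# below compares against allXMLNamespaces with `is`, so filtering only kicks
# in once acceptedXMLNamespaces has been rebound to a different set object.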
##
# Drop these elements from article text
#
@@ -128,6 +138,27 @@ discardElements = [
# This is obtained from <siteinfo>
urlbase = ''
##
# Filter disambiguation pages
filter_disambig_pages = False
filter_disambig_page_pattern = re.compile(r"{{disambig(uation)?(\|[^}]*)?}}")
##
# Page filtering logic -- remove templates, undesired XML namespaces, and
# disambiguation pages
def keepPage(ns, page):
    # remove modules and templates from output
    if ns in templateKeys:
        return False
    # filter this page based on namespace, unless we want all namespaces
    if not (acceptedXMLNamespaces is allXMLNamespaces) and ns not in acceptedXMLNamespaces:
        return False
    # remove disambig pages if desired
    if filter_disambig_pages:
        for line in page:
            if filter_disambig_page_pattern.match(line):
                return False
    return True
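A minimal sketch of how the disambiguation pattern behaves (inputs are
illustrative; note that match() anchors at the start of the line, so only
lines beginning with the template are caught):

    import re
    pattern = re.compile(r"{{disambig(uation)?(\|[^}]*)?}}")
    bool(pattern.match("{{disambiguation}}"))       # True
    bool(pattern.match("{{disambig|geodis}}"))      # True
    bool(pattern.match("See {{disambiguation}}."))  # False: not at line start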
def get_url(uid):
    return "%s?curid=%s" % (urlbase, uid)
@@ -433,14 +464,22 @@ class Extractor(object):
    # Whether to expand templates
    expand_templates = True
    ##
    # Print the Wikipedia article revision id
    print_revision = False
    def __init__(self, id, title, lines):
    ##
    # Minimum expanded text length required to print document
    min_text_length = 0
    def __init__(self, id, revid, title, lines):
        """
        :param id: id of page.
        :param title: title of page.
        :param lines: a list of lines.
        """
        self.id = id
        self.revid = revid
        self.title = title
        self.text = ''.join(lines)
        self.magicWords = MagicWords()
@@ -456,7 +495,10 @@
"""
logging.debug("%s\t%s", self.id, self.title)
url = get_url(self.id)
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
if Extractor.print_revision:
header = '<doc id="%s" revid="%s" url="%s" title="%s">\n' % (self.id, self.revid, url, self.title)
else:
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
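        # With print_revision enabled the header might read, e.g.
        # (values illustrative):
        #   <doc id="12" revid="123456" url="http://en.wikipedia.org/wiki?curid=12" title="Anarchism">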
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        self.magicWords['pagename'] = self.title
@@ -466,10 +508,12 @@
        self.magicWords['currentday'] = time.strftime('%d')
        self.magicWords['currenthour'] = time.strftime('%H')
        self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
        text = self.clean()
        text = [line.encode('utf-8') for line in compact(self.clean())]
        footer = "\n</doc>\n"
        if sum(len(line) for line in text) < Extractor.min_text_length:
            return
        out.write(header)
        for line in compact(text):
        for line in text:
            out.write(line)
            out.write('\n')
        out.write(footer)
@@ -2338,7 +2382,7 @@ def load_templates(file, output_file=None):
    if output_file:
        output = codecs.open(output_file, 'wb', 'utf-8')
    for page_count, page_data in enumerate(pages_from(file)):
        id, title, ns, page = page_data
        id, revid, title, ns, page = page_data
        if not output_file and (not templateNamespace or
                                not moduleNamespace):  # do not know it yet
            # reconstruct templateNamespace and moduleNamespace from the first title
@@ -2383,8 +2427,10 @@ def pages_from(input):
    id = None
    ns = '0'
    last_id = None
    revid = None
    inText = False
    redirect = False
    title = None
    for line in input:
        line = line.decode('utf-8')
        if '<' not in line:  # faster than doing re.search()
@@ -2398,8 +2444,10 @@
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not id:  # skip nested <id>
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'id' and id:
            revid = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'ns':
@@ -2420,10 +2468,12 @@
            page.append(line)
        elif tag == '/page':
            if id != last_id and not redirect:
                yield (id, title, ns, page)
                yield (id, revid, title, ns, page)
                last_id = id
                ns = '0'
            id = None
            revid = None
            title = None
            page = []
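The two <id> branches above reflect the MediaWiki export format, where the
first <id>, directly under <page>, is the page id and the second, inside
<revision>, is the revision id. A stripped-down page looks roughly like this
(values illustrative):

    <page>
      <title>Anarchism</title>
      <ns>0</ns>
      <id>12</id>                <!-- page id: first <id> seen -->
      <revision>
        <id>123456</id>          <!-- revision id: second <id> seen -->
        <text>...</text>
      </revision>
    </page>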
@@ -2441,6 +2491,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    global knownNamespaces
    global templateNamespace, templatePrefix
    global moduleNamespace, modulePrefix
    global allXMLNamespaces, acceptedXMLNamespaces
    if input_file == '-':
        input = sys.stdin
@@ -2533,8 +2584,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    # Mapper process
    page_num = 0
    for page_data in pages_from(input):
        id, title, ns, page = page_data
        if ns not in templateKeys:
        id, revid, title, ns, page = page_data
        if keepPage(ns, page):
            # slow down
            delay = 0
            if spool_length.value > max_spool_length:
@@ -2544,7 +2595,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
                    delay += 10
            if delay:
                logging.info('Delay %ds', delay)
            job = (id, title, page, page_num)
            job = (id, revid, title, page, page_num)
            jobs_queue.put(job)  # goes to any available extract_process
            page_num += 1
        page = None  # free memory
@@ -2583,15 +2634,16 @@ def extract_process(i, jobs_queue, output_queue):
    while True:
        job = jobs_queue.get()  # job is (id, revid, title, page, page_num)
        if job:
            id, title, page, page_num = job
            id, revid, title, page, page_num = job
            try:
                e = Extractor(*job[:3])  # (id, title, page)
                e = Extractor(*job[:4])  # (id, revid, title, page)
                page = None  # free memory
                e.extract(out)
                text = out.getvalue()
            except:
                text = ''
                logging.error('Processing page: %s %s', id, title)
            output_queue.put((page_num, text))
            out.truncate(0)
            out.seek(0)
@@ -2661,7 +2713,7 @@ minFileSize = 200 * 1024
def main():
    global urlbase, acceptedNamespaces
    global urlbase, acceptedNamespaces, acceptedXMLNamespaces, filter_disambig_pages
    global templateCache, escape_doc
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
@@ -2688,13 +2740,21 @@ def main():
groupP.add_argument("--lists", action="store_true",
help="preserve lists")
groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
help="accepted namespaces")
help="accepted link namespaces")
groupP.add_argument("-xns", "--xml_namespaces", default="", metavar="ns1,ns2",
help="accepted page xml namespaces -- 0 for main/articles")
groupP.add_argument("--templates",
help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false",
help="Do not expand templates")
groupP.add_argument("--escapedoc", action="store_true",
help="use to escape the contents of the output <doc>...</doc>")
groupP.add_argument("-r", "--revision", action="store_true", default=Extractor.print_revision,
help="Include the document revision id (default=%(default)s)")
groupP.add_argument("--min_text_length", type=int, default=Extractor.min_text_length,
help="Minimum expanded text length required to write document (default=%(default)s)")
groupP.add_argument("--filter_disambig_pages", action="store_true", default=filter_disambig_pages,
help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
default_process_count = cpu_count() - 1
parser.add_argument("--processes", type=int, default=default_process_count,
help="Number of processes to use (default %(default)s)")
@@ -2716,11 +2776,14 @@ def main():
    Extractor.keepSections = args.sections
    Extractor.keepLists = args.lists
    Extractor.toHTML = args.html
    Extractor.print_revision = args.revision
    Extractor.min_text_length = args.min_text_length
    if args.html:
        Extractor.keepLinks = True
    Extractor.expand_templates = args.no_templates
    escape_doc = args.escapedoc
    filter_disambig_pages = args.filter_disambig_pages
    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
@@ -2734,6 +2797,9 @@ def main():
    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))
    if args.xml_namespaces:
        acceptedXMLNamespaces = set([unicode(ns) for ns in args.xml_namespaces.split(',')])
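    # Sketch: with -xns 0,14 this yields acceptedXMLNamespaces == set([u'0', u'14'])
    # (unicode is the Python 2 built-in). These strings are compared against the
    # ns values pages_from reads from each page's <ns> element, so 0 keeps main
    # articles and 14 keeps category pages.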
    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)
@@ -2760,8 +2826,8 @@ def main():
        file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
        for page_data in pages_from(file):
            id, title, ns, page = page_data
            Extractor(id, title, page).extract(sys.stdout)
            id, revid, title, ns, page = page_data
            Extractor(id, revid, title, page).extract(sys.stdout)
        file.close()
        return