Add json output

This commit adds a new output format to the program, namely json. When
invoked with the --json flag, the program will write several files with
several pages per file (as before) with one json object per line
representing a single page.

The information contained in this json object is the same as in the
default format, but is somewhat more straightforward to parse for other
tools.

Both the running time and the compressed output size are the same as for
the default format. The following is a simple benchmark on an i5
machine using as input
enwiki-20170120-pages-meta-current1.xml-p000000010p000030303.bz2:

$ ./WikiExtractor.py -o xml --compress --no-templates input.bz2
INFO: Finished 3-process extraction of 15084 articles in 335.7s (44.9 art/s)
$ ./WikiExtractor.py -o json --json --compress --no-templates input.bz2
INFO: Finished 3-process extraction of 15084 articles in 336.8s (44.8 art/s)
$ du -sh json xml
69M     json
69M     xml
This commit is contained in:
Matteo Ceccarello 2017-02-10 10:36:04 +01:00
parent ea9c368e52
commit 7ae45fcff7

View File

@ -44,8 +44,15 @@ Each file will contain several documents in the format:
...
</doc>
If the program is invoked with the --json flag, then each file will
contain several documents formatted as json objects, one per line, with
the following structure
{"id": "", "revid": "", "url":"", "title": "", "text": "..."}
Template expansion requires preprocessing first the whole dump and
collecting template definitions.
"""
from __future__ import unicode_literals, division
@ -60,6 +67,7 @@ import logging
import os.path
import re # TODO use regex when it will be standard
import time
import json
from io import StringIO
from multiprocessing import Queue, Process, Value, cpu_count
from timeit import default_timer
@ -464,6 +472,10 @@ class Extractor(object):
# Whether to output HTML instead of text
toHTML = False
##
# Whether to write json instead of the xml-like default output format
write_json = False
##
# Whether to expand templates
expand_templates = True
@ -497,22 +509,55 @@ class Extractor(object):
self.recursion_exceeded_3_errs = 0 # parameter recursion
self.template_title_errs = 0
def write_output(self, out, text):
    """
    Write one extracted page to *out*, either as a single-line JSON
    object or in the default ``<doc>`` XML-like format, depending on
    the ``Extractor.write_json`` class flag.

    :param out: a memory file
    :param text: the text of the page
    """
    to_stdout = out == sys.stdout  # option -a or -o -
    page_url = get_url(self.id)
    if Extractor.write_json:
        payload = {
            'id': self.id,
            'url': page_url,
            'title': self.title,
            'text': "\n".join(text)
        }
        if Extractor.print_revision:
            payload['revid'] = self.revid
        # json.dump(data, out) is deliberately avoided: the serialized
        # string may still need encoding when the output is sys.stdout
        serialized = json.dumps(payload, ensure_ascii=False)
        if to_stdout:
            serialized = serialized.encode('utf-8')
        out.write(serialized)
        out.write('\n')
    else:
        if Extractor.print_revision:
            header = '<doc id="%s" revid="%s" url="%s" title="%s">\n' % (self.id, self.revid, page_url, self.title)
        else:
            header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, page_url, self.title)
        if to_stdout:
            header = header.encode('utf-8')
        out.write(header)
        for line in text:
            out.write(line.encode('utf-8') if to_stdout else line)
            out.write('\n')
        out.write("\n</doc>\n")
def extract(self, out):
"""
:param out: a memory file.
"""
logging.info('%s\t%s', self.id, self.title)
url = get_url(self.id)
if Extractor.print_revision:
header = '<doc id="%s" revid="%s" url="%s" title="%s">\n' % (self.id, self.revid, url, self.title)
else:
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
# Separate header from text with a newline.
if self.toHTML:
header += '<h1>' + self.title + '</h1>\n'
title_str = '<h1>' + self.title + '</h1>'
else:
header += self.title + '\n\n'
title_str = self.title + '\n'
# https://www.mediawiki.org/wiki/Help:Magic_words
self.magicWords['PAGENAME'] = self.title
self.magicWords['FULLPAGENAME'] = self.title
@ -533,18 +578,13 @@ class Extractor(object):
text = self.transform(text)
text = self.wiki2text(text)
text = compact(self.clean(text))
footer = "\n</doc>\n"
text = [title_str] + text
if sum(len(line) for line in text) < Extractor.min_text_length:
return
if out == sys.stdout: # option -a or -o -
header = header.encode('utf-8')
out.write(header)
for line in text:
if out == sys.stdout: # option -a or -o -
line = line.encode('utf-8')
out.write(line)
out.write('\n')
out.write(footer)
self.write_output(out, text)
errs = (self.template_title_errs,
self.recursion_exceeded_1_errs,
self.recursion_exceeded_2_errs,
@ -2954,6 +2994,9 @@ def main():
metavar="n[KMG]")
groupO.add_argument("-c", "--compress", action="store_true",
help="compress output files using bzip")
groupO.add_argument("--json", action="store_true",
help="write output in json format instead of the default one")
groupP = parser.add_argument_group('Processing')
groupP.add_argument("--html", action="store_true",
@ -3003,6 +3046,7 @@ def main():
Extractor.keepSections = args.sections
Extractor.keepLists = args.lists
Extractor.toHTML = args.html
Extractor.write_json = args.json
Extractor.print_revision = args.revision
Extractor.min_text_length = args.min_text_length
if args.html: