See ChangeLog.

commit fc89e2514e
parent 49464c0210

ChangeLog
@@ -1,3 +1,8 @@
2016-02-04  Giuseppe Attardi  <attardi@di.unipi.it>

	* cirrus-extract.py: added.
	* README.md: added mention of Cirrus extract.

2015-11-20  Giuseppe Attardi  <attardi@di.unipi.it>

	* WikiExtractor.py (makeExternalLink): fixed.

README.md
@@ -1,12 +1,22 @@

# WikiExtractor

[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).

The tool is written in Python and requires Python 2.7; it has no dependencies beyond the standard library.

**Warning**: problems have been reported on Windows due to poor support for `StringIO` in its Python implementation.

**Warning**: problems have been reported on Windows with the use of multiprocessing.

For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).

# Wikipedia Cirrus Extractor

`cirrus-extract.py` is a version of the script that performs extraction from a Wikipedia Cirrus dump.
Cirrus dumps contain text with templates already expanded.

Cirrus dumps are available at [http://dumps.wikimedia.org/other/cirrussearch/](http://dumps.wikimedia.org/other/cirrussearch/).
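
Each page in a Cirrus content dump is stored as a pair of JSON lines: an index line followed by a content line, which is the format `cirrus-extract.py` parses (see its `process_dump` function below). The following is only an illustrative sketch; the dump filename is a placeholder:

```python
import gzip
import json

# Placeholder filename: any *-cirrussearch-content.json.gz dump from the URL above.
with gzip.open('wiki-cirrussearch-content.json.gz') as f:
    while True:
        index_line = f.readline()
        if not index_line:
            break
        index = json.loads(index_line)        # {"index": {"_type": "page", "_id": "..."}}
        content = json.loads(f.readline())    # {"namespace": 0, "title": ..., "text": ..., ...}
        if index['index']['_type'] == 'page' and content['namespace'] == 0:
            print(content['title'])
```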

# Details

WikiExtractor performs template expansion by preprocessing the whole dump and extracting template definitions.

The latest version includes the following performance improvements:

cirrus-extract.py (new executable file, 246 lines)
@@ -0,0 +1,246 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# =============================================================================
#  Version: 1.00 (December 15, 2015)
#  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# =============================================================================
#  Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
#  This file is part of Tanl.
#
#  Tanl is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License, version 3,
#  as published by the Free Software Foundation.
#
#  Tanl is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

"""Wikipedia Cirrus Extractor:
Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:

    <doc id="" url="" title="">
        ...
    </doc>

"""

import sys, os.path, time
import re
import json
import argparse
import bz2
import gzip
import logging

# Program version
version = '1.00'

# Base of the URLs generated for extracted pages (hard-coded to the Italian Wikipedia).
urlbase = 'http://it.wikipedia.org/'

# ----------------------------------------------------------------------

class NextFile(object):
    """
    Synchronous generation of next available file name.
    """

    filesPerDir = 100

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    def _dirname(self):
        char1 = self.dir_index % 26
        char2 = self.dir_index / 26 % 26
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def _filepath(self):
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)
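
# Example of the layout produced by NextFile (assuming an output directory named
# 'extracted'): extracted/AA/wiki_00 ... extracted/AA/wiki_99, then extracted/AB/wiki_00, ...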

class OutputSplitter(object):
    """
    File-like object that splits output to multiple files of a given max size.
    """

    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: a NextFile object from which to obtain filenames
            to use.
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
        """
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write(self, data):
        self.reserve(len(data))
        self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        if self.compress:
            return bz2.BZ2File(filename + '.bz2', 'w')
        else:
            return open(filename, 'w')
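
# Typical use of OutputSplitter (mirroring process_dump below): output is rolled over
# to a new file once the current one would exceed max_file_size.
#   output = OutputSplitter(NextFile('extracted'), max_file_size=1024 * 1024, compress=True)
#   output.write(page.encode('utf-8'))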

# ----------------------------------------------------------------------

class Extractor(object):
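    # Note: this class is not used by process_dump() below; it relies on
    # get_url(), clean() and compact(), which are not defined in this file.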

    def extract(self, out):
        """
        :param out: output file.
        """
        logging.debug("%s\t%s", self.id, self.title)
        text = ''.join(self.page)
        url = get_url(self.id)
        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        header = header.encode('utf-8')
        footer = "\n</doc>\n"
        out.write(header)
        text = clean(self, text)
        for line in compact(text):
            out.write(line.encode('utf-8'))
            out.write('\n')
        out.write(footer)

def process_dump(input_file, out_file, file_size, file_compress):
    """
    :param input_file: name of the Wikipedia Cirrus dump file; '-' to read from stdin
    :param out_file: directory where to store extracted data, or '-' for stdout
    :param file_size: max size of each extracted file, or None for no max (one file)
    :param file_compress: whether to compress files with bzip.
    """

    if input_file == '-':
        input = sys.stdin
    else:
        input = gzip.open(input_file)

    if out_file == '-':
        output = sys.stdout
        if file_compress:
            logging.warn("writing to stdout, so no output compression (use external tool)")
    else:
        nextFile = NextFile(out_file)
        output = OutputSplitter(nextFile, file_size, file_compress)

    # Process the dump: each page is a pair of JSON lines in the format
    # {"index":{"_type":"page","_id":"3825914"}}
    # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...}
    while True:
        line = input.readline()
        if not line:
            break
        index = json.loads(line)
        content = json.loads(input.readline())
        type = index['index']['_type']
        id = index['index']['_id']
        if type == 'page' and content['namespace'] == 0:
            title = content['title']
            text = content['text']
            # drop references:
            # ^ The Penguin Dictionary
            text = re.sub(r' \^ .*', '', text)
            url = urlbase + 'wiki?curid=' + id
            header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title)
            page = header + title + '\n\n' + text + '\n</doc>\n'
            output.write(page.encode('utf-8'))

# ----------------------------------------------------------------------

# Minimum size of output files
minFileSize = 200 * 1024

def main():
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="Cirrus JSON wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        # e.g. "1M" -> 1 * 1024**2 = 1048576 bytes
        file_size = int(args.bytes[:-1]) * 1024 ** power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)

    input_file = args.input

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, output_path, file_size, args.compress)


if __name__ == '__main__':
    main()
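
# Example invocation (the dump filename is only a placeholder):
#   python cirrus-extract.py wiki-cirrussearch-content.json.gz -o extracted -b 1M -c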