Upgrade to Python 3.3+.

attardi 2020-07-22 14:12:37 +02:00
parent 6408a430fc
commit 62bdbe6106
12 changed files with 2570 additions and 39899 deletions

.gitignore

@@ -1,93 +1,52 @@
local/
tmp/
### https://raw.github.com/github/gitignore/c699a4f4684e9e294c9c550f820ca330f019b6f9/python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
# Packages
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
.tox
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# Flask instance folder
instance/
/docs/_build
.idea
*.iml
# Scrapy stuff:
.scrapy
.travis-solo
G*
*.db
*.mdb
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Editor files
*.idea
# Vim
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~

README.md

@@ -1,7 +1,8 @@
# WikiExtractor
[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).
-The tool is written in Python and requires Python 2.7 or Python 3.3+ but no additional library.
+The tool is written in Python and requires Python 3.7 but no additional library.
**Warning**: problems have been reported on Windows, due to poor support for `StringIO` in the Python implementation on that platform.
For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
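
A minimal invocation sketch (the dump file name and output directory are placeholders; only options defined by the script's own argument parser further down in this diff are used):

```python
# Hedged example: run the extractor on a dump via subprocess.
# Assumes it is run from the repository root so the wikiextractor package is importable.
import subprocess

subprocess.run([
    "python", "scripts/WikiExtractor.py",
    "enwiki-latest-pages-articles.xml.bz2",  # placeholder input dump (bz2/gz handled via fileinput)
    "-o", "extracted",                       # output directory ('-' for stdout)
    "-b", "1M",                              # max bytes per output file
    "--processes", "4",                      # number of extraction processes
], check=True)
```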

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -11,15 +11,15 @@
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License, version 3,
+# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
+# GNU Affero General Public License for more details.
#
-# You should have received a copy of the GNU General Public License
+# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
@@ -43,7 +43,7 @@ import gzip
import logging
# Program version
-version = '1.00'
+version = '3.0'
urlbase = 'http://it.wikipedia.org/'

scripts/WikiExtractor.py (new executable file, 608 lines)

@@ -0,0 +1,608 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# =============================================================================
# Version: 3.0 (July 22, 2020)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# Contributors:
# Antonio Fuschetto (fuschett@aol.com)
# Leonardo Souza (lsouza@amtera.com.br)
# Juan Manuel Caicedo (juan@cavorite.com)
# Humberto Pereira (begini@gmail.com)
# Siegfried-A. Gevatter (siegfried@gevatter.com)
# Pedro Assis (pedroh2306@gmail.com)
# Wim Muskee (wimmuskee@gmail.com)
# Radics Geza (radicsge@gmail.com)
#
# =============================================================================
# Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
"""Wikipedia Extractor:
Extracts and cleans text from a Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" url="" title="">
...
</doc>
This version performs template expansion by preprocessing the whole dump and
collecting template definitions.
"""
import argparse
import bz2
import codecs
import fileinput
import logging
import os.path
import re # TODO use regex when it will be standard
import sys
from io import StringIO
from multiprocessing import Queue, Process, cpu_count
from timeit import default_timer
from wikiextractor.extract import Extractor, ignoreTag
# ===========================================================================
# Program version
version = '3.0'
##
# Defined in <siteinfo>
# 'Template' is included by default, for use when loading an external template file.
knownNamespaces = set(['Template'])
##
# The namespace used for template definitions
# It is the name associated with namespace key=10 in the siteinfo header.
templateNamespace = ''
templatePrefix = ''
##
# The namespace used for module definitions
# It is the name associated with namespace key=828 in the siteinfo header.
moduleNamespace = ''
# This is obtained from <siteinfo>
urlbase = ''
# ----------------------------------------------------------------------
# Modules
# Only minimal support
# FIXME: import Lua modules.
modules = {
'convert': {
'convert': lambda x, u, *rest: x + ' ' + u, # no conversion
}
}
# ----------------------------------------------------------------------
# Expand using WikiMedia API
# import json
# def expandTemplates(text):
# """Expand templates invoking MediaWiki API"""
#     text = urllib.parse.quote(text)
# base = urlbase[:urlbase.rfind('/')]
# url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text
#     exp = json.loads(urllib.request.urlopen(url).read())
# return exp['expandtemplates']['*']
# ------------------------------------------------------------------------------
# Output
class NextFile(object):
"""
Synchronous generation of next available file name.
"""
filesPerDir = 100
def __init__(self, path_name):
self.path_name = path_name
self.dir_index = -1
self.file_index = -1
def next(self):
self.file_index = (self.file_index + 1) % NextFile.filesPerDir
if self.file_index == 0:
self.dir_index += 1
dirname = self._dirname()
if not os.path.isdir(dirname):
os.makedirs(dirname)
return self._filepath()
def _dirname(self):
char1 = self.dir_index % 26
        char2 = self.dir_index // 26 % 26  # integer division (Python 3)
return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
def _filepath(self):
return '%s/wiki_%02d' % (self._dirname(), self.file_index)
class OutputSplitter(object):
"""
    File-like object that splits output into multiple files of a given max size.
"""
def __init__(self, nextFile, max_file_size=0, compress=True):
"""
:param nextFile: a NextFile object from which to obtain filenames
to use.
:param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
"""
self.nextFile = nextFile
self.compress = compress
self.max_file_size = max_file_size
self.file = self.open(self.nextFile.next())
def reserve(self, size):
if self.file.tell() + size > self.max_file_size:
self.close()
self.file = self.open(self.nextFile.next())
    def write(self, data):
        self.reserve(len(data))
        if self.compress:
            self.file.write(data.encode('utf-8'))  # BZ2File is opened in binary mode and expects bytes
        else:
            self.file.write(data)
def close(self):
self.file.close()
def open(self, filename):
if self.compress:
return bz2.BZ2File(filename + '.bz2', 'w')
else:
return open(filename, 'w')
# ----------------------------------------------------------------------
# READER
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
# groups: 1 = text before the tag, 2 = tag name, 3 = element text, 4 = trailing tag
def load_templates(file, output_file=None):
"""
Load templates from :param file:.
:param output_file: file where to save templates and modules.
"""
global templateNamespace, templatePrefix
templatePrefix = templateNamespace + ':'
global moduleNamespace, modulePrefix
modulePrefix = moduleNamespace + ':'
articles = 0
page = []
inText = False
if output_file:
output = codecs.open(output_file, 'wb', 'utf-8')
for line in file:
        line = line.decode('utf-8') if isinstance(line, bytes) else line  # hook_compressed may yield bytes
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
continue
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'page':
page = []
elif tag == 'title':
title = m.group(3)
elif tag == 'text':
inText = True
line = line[m.start(3):m.end(3)]
page.append(line)
if m.lastindex == 4: # open-close
inText = False
elif tag == '/text':
if m.group(1):
page.append(m.group(1))
inText = False
elif inText:
page.append(line)
elif tag == '/page':
if not output_file and not templateNamespace: # do not know it yet
# we reconstruct it from the first title
colon = title.find(':')
if colon > 1:
templateNamespace = title[:colon]
templatePrefix = title[:colon + 1]
# FIXME: should reconstruct also moduleNamespace
if title.startswith(templatePrefix):
define_template(title, page)
# save templates and modules to file
if output_file and (title.startswith(templatePrefix) or
title.startswith(modulePrefix)):
output.write('<page>\n')
output.write(' <title>%s</title>\n' % title)
output.write(' <ns>10</ns>\n')
output.write(' <text>')
for line in page:
output.write(line)
output.write(' </text>\n')
output.write('</page>\n')
page = []
articles += 1
if articles % 100000 == 0:
logging.info("Preprocessed %d pages", articles)
if output_file:
output.close()
logging.info("Saved %d templates to '%s'", len(templates), output_file)
def process_dump(input_file, template_file, out_file, file_size, file_compress,
process_count):
"""
:param input_file: name of the wikipedia dump file; '-' to read from stdin
:param template_file: optional file with template definitions.
:param out_file: directory where to store extracted data, or '-' for stdout
:param file_size: max size of each extracted file, or None for no max (one file)
:param file_compress: whether to compress files with bzip.
:param process_count: number of extraction processes to spawn.
"""
global urlbase
global knownNamespaces
global templateNamespace, templatePrefix
global moduleNamespace, modulePrefix
if input_file == '-':
input = sys.stdin
else:
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
# collect siteinfo
for line in input:
        line = line.decode('utf-8') if isinstance(line, bytes) else line  # stdin yields str, compressed files yield bytes
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'base':
# discover urlbase from the xml dump file
# /mediawiki/siteinfo/base
base = m.group(3)
urlbase = base[:base.rfind("/")]
elif tag == 'namespace':
knownNamespaces.add(m.group(3))
if re.search('key="10"', line):
templateNamespace = m.group(3)
templatePrefix = templateNamespace + ':'
elif re.search('key="828"', line):
moduleNamespace = m.group(3)
modulePrefix = moduleNamespace + ':'
elif tag == '/siteinfo':
break
if expand_templates:
# preprocess
template_load_start = default_timer()
if template_file and os.path.exists(template_file):
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
load_templates(file)
file.close()
else:
if input_file == '-':
# can't scan then reset stdin; must error w/ suggestion to specify template_file
raise ValueError("to use templates with stdin dump, must supply explicit template-file")
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
load_templates(input, template_file)
input.close()
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
template_load_elapsed = default_timer() - template_load_start
logging.info("Loaded %d templates in %.1fs", len(templates), template_load_elapsed)
if out_file == '-':
output = sys.stdout
if file_compress:
            logging.warning("writing to stdout, so no output compression (use an external tool)")
else:
nextFile = NextFile(out_file)
output = OutputSplitter(nextFile, file_size, file_compress)
# process pages
logging.info("Starting page extraction from %s.", input_file)
extract_start = default_timer()
# Parallel Map/Reduce:
# - pages to be processed are dispatched to workers
# - a reduce process collects the results, sort them and print them.
maxsize = 10 * process_count
# output queue
output_queue = Queue(maxsize=maxsize)
# Reduce job that sorts and prints output
reduce = Process(target=reduce_process, args=(output_queue, output))
reduce.start()
# initialize jobs queue
jobs_queue = Queue(maxsize=maxsize)
# start worker processes
logging.info("Using %d extract processes.", process_count)
workers = []
    for _ in range(max(1, process_count)):
extractor = Process(target=extract_process,
args=(jobs_queue, output_queue))
extractor.daemon = True # only live while parent process lives
extractor.start()
workers.append(extractor)
# Mapper process
# we collect individual lines, since str.join() is significantly faster
# than concatenation
page = []
id = None
last_id = None
ordinal = 0 # page count
inText = False
redirect = False
for line in input:
        line = line.decode('utf-8') if isinstance(line, bytes) else line  # hook_compressed may yield bytes
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
continue
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'page':
page = []
redirect = False
elif tag == 'id' and not id:
id = m.group(3)
elif tag == 'title':
title = m.group(3)
elif tag == 'redirect':
redirect = True
elif tag == 'text':
inText = True
line = line[m.start(3):m.end(3)]
page.append(line)
if m.lastindex == 4: # open-close
inText = False
elif tag == '/text':
if m.group(1):
page.append(m.group(1))
inText = False
elif inText:
page.append(line)
elif tag == '/page':
colon = title.find(':')
if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
not redirect and not title.startswith(templateNamespace):
job = (id, title, page, ordinal)
jobs_queue.put(job) # goes to any available extract_process
last_id = id
ordinal += 1
id = None
page = []
input.close()
# signal termination
for _ in workers:
jobs_queue.put(None)
# wait for workers to terminate
for w in workers:
w.join()
# signal end of work to reduce process
output_queue.put(None)
# wait for it to finish
reduce.join()
if output != sys.stdout:
output.close()
extract_duration = default_timer() - extract_start
extract_rate = ordinal / extract_duration
logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)",
process_count, ordinal, extract_duration, extract_rate)
# ----------------------------------------------------------------------
# Multiprocess support
def extract_process(jobs_queue, output_queue):
"""Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
:param jobs_queue: where to get jobs.
:param output_queue: where to queue extracted text for output.
"""
while True:
job = jobs_queue.get() # job is (id, title, page, ordinal)
if job:
out = StringIO() # memory buffer
Extractor(*job[:3]).extract(out) # (id, title, page)
text = out.getvalue()
output_queue.put((job[3], text)) # (ordinal, extracted_text)
out.close()
else:
break
def reduce_process(output_queue, output):
"""Pull finished article text, write series of files (or stdout)
:param output_queue: text to be output.
:param output: file object where to print.
"""
interval_start = default_timer()
period = 100000
# FIXME: use a heap
ordering_buffer = {} # collected pages
next_ordinal = 0 # sequence number of pages
while True:
if next_ordinal in ordering_buffer:
output.write(ordering_buffer.pop(next_ordinal))
next_ordinal += 1
# progress report
if next_ordinal % period == 0:
interval_rate = period / (default_timer() - interval_start)
logging.info("Extracted %d articles (%.1f art/s)",
next_ordinal, interval_rate)
interval_start = default_timer()
else:
# mapper puts None to signal finish
pair = output_queue.get()
if not pair:
break
ordinal, text = pair
ordering_buffer[ordinal] = text
# ----------------------------------------------------------------------
# Minimum size of output files
minFileSize = 200 * 1024
def main():
global urlbase, acceptedNamespaces
global expand_templates, templateCache, escape_doc
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=__doc__)
parser.add_argument("input",
help="XML wiki dump file")
groupO = parser.add_argument_group('Output')
groupO.add_argument("-o", "--output", default="text",
help="directory for extracted files (or '-' for dumping to stdout)")
groupO.add_argument("-b", "--bytes", default="1M",
help="maximum bytes per output file (default %(default)s)",
metavar="n[KMG]")
groupO.add_argument("-c", "--compress", action="store_true",
help="compress output files using bzip")
groupP = parser.add_argument_group('Processing')
groupP.add_argument("--html", action="store_true",
help="produce HTML output, subsumes --links")
groupP.add_argument("-l", "--links", action="store_true",
help="preserve links")
groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
help="accepted namespaces")
groupP.add_argument("--templates",
help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false",
help="Do not expand templates")
groupP.add_argument("--escapedoc", action="store_true",
help="use to escape the contents of the output <doc>...</doc>")
default_process_count = cpu_count() - 1
parser.add_argument("--processes", type=int, default=default_process_count,
help="Number of processes to use (default %(default)s)")
groupS = parser.add_argument_group('Special')
groupS.add_argument("-q", "--quiet", action="store_true",
help="suppress reporting progress info")
groupS.add_argument("--debug", action="store_true",
help="print debug info")
groupS.add_argument("-a", "--article", action="store_true",
help="analyze a file containing a single article (debug option)")
groupS.add_argument("-v", "--version", action="version",
version='%(prog)s ' + version,
help="print program version")
args = parser.parse_args()
Extractor.keepLinks = args.links
Extractor.toHTML = args.html
if args.html:
Extractor.keepLinks = True
expand_templates = args.no_templates
escape_doc = args.escapedoc
try:
power = 'kmg'.find(args.bytes[-1].lower()) + 1
file_size = int(args.bytes[:-1]) * 1024 ** power
if file_size < minFileSize:
raise ValueError()
except ValueError:
logging.error('Insufficient or invalid size: %s', args.bytes)
return
if args.namespaces:
acceptedNamespaces = set(args.namespaces.split(','))
FORMAT = '%(levelname)s: %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger()
if not args.quiet:
logger.setLevel(logging.INFO)
if args.debug:
logger.setLevel(logging.DEBUG)
input_file = args.input
if not Extractor.keepLinks:
ignoreTag('a')
# sharing cache of parser templates is too slow:
# manager = Manager()
# templateCache = manager.dict()
if args.article:
if args.templates:
if os.path.exists(args.templates):
with open(args.templates) as file:
load_templates(file)
with open(input_file) as file:
            page = file.read()  # text mode already returns str in Python 3
m = re.search(r'<id>(.*)</id>', page)
id = m.group(1) if m else 0
m = re.search(r'<title>(.*)</title>', page)
if m:
title = m.group(1)
else:
logging.error('Missing title element')
return
Extractor(id, title, [page]).extract(sys.stdout)
return
output_path = args.output
if output_path != '-' and not os.path.isdir(output_path):
try:
os.makedirs(output_path)
        except OSError:
logging.error('Could not create: %s', output_path)
return
process_dump(input_file, args.templates, output_path, file_size,
args.compress, args.processes)
if __name__ == '__main__':
main()
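
As a side note, a small sketch (not from the script itself) of the output layout implied by the NextFile class above, assuming the default filesPerDir of 100: directories are named AA, AB, ..., each holding files wiki_00 through wiki_99.

```python
# Illustration only: mirrors NextFile's directory/file naming scheme.
import os

def nth_output_path(base, n, files_per_dir=100):
    """Return the path NextFile would assign to the n-th output file (0-based)."""
    dir_index = n // files_per_dir
    char1 = dir_index % 26
    char2 = dir_index // 26 % 26
    dirname = '%c%c' % (ord('A') + char2, ord('A') + char1)
    return os.path.join(base, dirname, 'wiki_%02d' % (n % files_per_dir))

print(nth_output_path('text', 0))    # text/AA/wiki_00
print(nth_output_path('text', 205))  # text/AC/wiki_05
```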

scripts/__init__.py (new empty file)

scripts/extractPage.py (new executable file, 125 lines)

@@ -0,0 +1,125 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 3.0 (July 22, 2020)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
# =============================================================================
# Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
"""Wikipedia Page Extractor:
Extracts a single page from a Wikipedia dump file.
"""
import sys, os.path
import re, random
import argparse
# itertools.izip was removed in Python 3; the built-in zip is already lazy
import logging, traceback
import urllib.request
import bz2, gzip
from html.entities import name2codepoint  # Python 3 name for htmlentitydefs
import queue, threading, multiprocessing  # Python 3 name for Queue
# Program version
version = '3.0'
# ----------------------------------------------------------------------
# READER
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
#tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)')
# (groups: 1 = text before the tag, 2 = tag name, 3 = element text)
def process_data(input_file, id, templates=False):
"""
:param input_file: name of the wikipedia dump file.
:param id: article id
"""
if input_file.lower().endswith("bz2"):
opener = bz2.BZ2File
else:
opener = open
input = opener(input_file)
page = []
for line in input:
        line = line.decode('utf-8') if isinstance(line, bytes) else line  # bz2 yields bytes, plain files yield str
if '<' not in line: # faster than doing re.search()
if page:
page.append(line)
continue
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'page':
page = []
page.append(line)
inArticle = False
elif tag == 'id':
curid = m.group(3)
if id == curid:
page.append(line)
inArticle = True
elif not inArticle and not templates:
page = []
elif tag == 'title':
if templates:
if m.group(3).startswith('Template:'):
page.append(line)
else:
page = []
else:
page.append(line)
elif tag == '/page':
if page:
page.append(line)
                print(''.join(page))
if not templates:
break
page = []
elif page:
page.append(line)
input.close()
def main():
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=__doc__)
parser.add_argument("input",
help="XML wiki dump file")
parser.add_argument("--id", default="",
help="article number")
parser.add_argument("--template", action="store_true",
help="template number")
parser.add_argument("-v", "--version", action="version",
version='%(prog)s ' + version,
help="print program version")
args = parser.parse_args()
process_data(args.input, args.id, args.template)
if __name__ == '__main__':
main()
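
A short usage sketch for the process_data function above (the dump path and article id are placeholders); it prints the matching &lt;page&gt; element to stdout:

```python
# Hypothetical example; assumes the repository root is on sys.path.
from scripts.extractPage import process_data

process_data("enwiki-latest-pages-articles.xml.bz2", "12345")
```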

setup.py (new file, 25 lines)

@@ -0,0 +1,25 @@
from setuptools import setup
import re
from scripts.WikiExtractor import version
def to_semantic_version(version):
if re.match(r'^\d+\.\d+$', version):
return version + '.0'
return version
setup(
name='wikiextractor',
version=to_semantic_version(version),
description='A tool for extracting plain text from Wikipedia dumps',
packages=[
'wikiextractor'
],
install_requires=[
],
tests_require=[
'nose>=1.0',
],
test_suite='nose.collector',
)


wikiextractor/clean.py (new file, 48 lines)

@@ -0,0 +1,48 @@
# =============================================================================
# Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags
def clean_markup(markup, keep_links=False, ignore_headers=True):
"""
Clean Wikimarkup to produce plaintext.
    :param keep_links: Set to True to keep internal and external links.
    :param ignore_headers: if set to True, the output list will not contain
        headers, only regular paragraphs.
    Returns a list of paragraphs (plain strings).
"""
if not keep_links:
ignoreTag('a')
extractor = Extractor(0, '', [])
# returns a list of strings
paragraphs = extractor.clean_text(markup,
mark_headers=True,
expand_templates=False,
escape_doc=True)
resetIgnoredTags()
if ignore_headers:
        paragraphs = [p for p in paragraphs if not p.startswith('## ')]  # a list, not a lazy filter object
return paragraphs
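
A usage sketch for clean_markup (the sample markup string is made up; assumes the wikiextractor package is importable):

```python
# Hypothetical example: clean a snippet of wiki markup into plain paragraphs.
from wikiextractor.clean import clean_markup

sample = "'''Python''' is a [[programming language]] created by [[Guido van Rossum]]."
for paragraph in clean_markup(sample, keep_links=False):
    print(paragraph)
```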

wikiextractor/extract.py (new file, 1726 lines)

File diff suppressed because it is too large