From 6d0577ef10e48959de709ca05cfa3c2b195f4087 Mon Sep 17 00:00:00 2001
From: attardi <attardi@di.unipi.it>
Date: Mon, 15 Feb 2016 01:22:38 +0100
Subject: [PATCH] See ChangeLog.

---
 ChangeLog        |  16 ++
 WikiExtractor.py | 484 +++++++++++++++++++++++------------------------
 extractPage.py   |  33 +++-
 3 files changed, 278 insertions(+), 255 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 80d0d3c..1893682 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2016-02-15  Giuseppe Attardi  <attardi@di.unipi.it>
+
+	* WikiExtractor.py (Extractor.clean): turned into a method.
+
+2016-02-14  Giuseppe Attardi  <attardi@di.unipi.it>
+
+	* WikiExtractor.py (load_templates): also save definitions in the
+	templates dictionary.
+	(pages_from): common iterator for extracting pages from the dump,
+	used for analyzing pages, templates and single articles alike.
+
+2016-02-13  Giuseppe Attardi  <attardi@di.unipi.it>
+
+	* WikiExtractor.py (reduce_process): close the output file here.
+
+	* extractPage.py (process_data): allow a range of ids.
+
 2016-02-12  Giuseppe Attardi  <attardi@di.unipi.it>
 
 	* WikiExtractor.py (reduce_process): moved here creation of OutputSplitter.
diff --git a/WikiExtractor.py b/WikiExtractor.py
index 7b8fc13..d9427ba 100755
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # =============================================================================
-#  Version: 2.48 (February 12, 2016)
+#  Version: 2.49 (February 14, 2016)
 #  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
 #
 #  Contributors:
@@ -46,6 +46,7 @@
 This version performs template expansion by preprocessing the whole dump and
 collecting template definitions.
 """
+import sys
 import argparse
 import bz2
 import codecs
@@ -54,19 +55,18 @@
 import fileinput
 import logging
 import os.path
 import re  # TODO use regex when it will be standard
-import sys
 import time
 import urllib
 from cStringIO import StringIO
 from htmlentitydefs import name2codepoint
 from itertools import izip, izip_longest
-from multiprocessing import Queue, Process, cpu_count
+from multiprocessing import Queue, Process, Array, cpu_count
 from timeit import default_timer
 
 
 # ===========================================================================
 
 # Program version
-version = '2.48'
+version = '2.49'
 
 ## PARAMS ####################################################################
 
@@ -394,7 +394,6 @@ class TemplateArg(object):
 
 substWords = 'subst:|safesubst:'
 
-
 class Extractor(object):
     """
     An extraction task on an article.
@@ -415,13 +414,20 @@ class Extractor(object):
     # Whether to output HTML instead of text
     toHTML = False
 
-    def __init__(self, id, title, page):
+    ##
+    # Whether to expand templates
+    expand_templates = True
+
+
+    def __init__(self, id, title, lines):
         """
-        :param page: a list of lines.
+        :param id: id of page.
+        :param title: title of page.
+        :param lines: a list of lines.
         """
         self.id = id
         self.title = title
-        self.page = page
+        self.text = ''.join(lines)
         self.magicWords = MagicWords()
         self.frame = []
         self.recursion_exceeded_1_errs = 0  # template recursion within expandTemplates()
@@ -433,8 +439,7 @@ class Extractor(object):
         """
         :param out: a memory file.
         """
-        logging.debug("%s\t%s", self.id, self.title)
-        text = ''.join(self.page)
+        logging.info("%s\t%s", self.id, self.title)
         url = get_url(self.id)
         header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
         # Separate header from text with a newline.
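
Each article is thus emitted as a single <doc> element between the header and footer built above; with a hypothetical id, url and title, the output of extract() has this shape:

    <doc id="12" url="http://en.wikipedia.org/wiki?curid=12" title="Anarchism">
    ...cleaned article text, one paragraph per line...
    </doc>
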
@@ -447,7 +452,7 @@ class Extractor(object):
         self.magicWords['currentday'] = time.strftime('%d')
         self.magicWords['currenthour'] = time.strftime('%H')
         self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
-        text = clean(self, text)
+        text = self.clean()
         footer = "\n</doc>\n"
         out.write(header)
         for line in compact(text):
@@ -462,6 +467,112 @@ class Extractor(object):
             logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)",
                          self.title, self.id, *errs)
 
+    def clean(self):
+        """
+        Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped.
+        @see https://www.mediawiki.org/wiki/Help:Formatting
+        """
+        text = self.text
+        self.text = ''  # save memory
+        if Extractor.expand_templates:
+            # expand templates
+            # See: http://www.mediawiki.org/wiki/Help:Templates
+            text = self.expandTemplates(text)
+        else:
+            # Drop transclusions (template, parser functions)
+            text = dropNested(text, r'{{', r'}}')
+
+        # Drop tables
+        text = dropNested(text, r'{\|', r'\|}')
+
+        # replace external links
+        text = replaceExternalLinks(text)
+
+        # replace internal links
+        text = replaceInternalLinks(text)
+
+        # drop MagicWords behavioral switches
+        text = magicWordsRE.sub('', text)
+
+        # ############### Process HTML ###############
+
+        # turn into HTML, except for the content of <syntaxhighlight>
+        res = ''
+        cur = 0
+        for m in syntaxhighlight.finditer(text):
+            end = m.end()
+            res += unescape(text[cur:m.start()]) + m.group(1)
+            cur = end
+        text = res + unescape(text[cur:])
+
+        # Handle bold/italic/quote
+        if self.toHTML:
+            text = bold_italic.sub(r'<b>\1</b>', text)
+            text = bold.sub(r'<b>\1</b>', text)
+            text = italic.sub(r'<i>\1</i>', text)
+        else:
+            text = bold_italic.sub(r'\1', text)
+            text = bold.sub(r'\1', text)
+            text = italic_quote.sub(r'"\1"', text)
+            text = italic.sub(r'"\1"', text)
+            text = quote_quote.sub(r'"\1"', text)
+        # residuals of unbalanced quotes
+        text = text.replace("'''", '').replace("''", '"')
+
+        # Collect spans
+
+        spans = []
+        # Drop HTML comments
+        for m in comment.finditer(text):
+            spans.append((m.start(), m.end()))
+
+        # Drop self-closing tags
+        for pattern in selfClosing_tag_patterns:
+            for m in pattern.finditer(text):
+                spans.append((m.start(), m.end()))
+
+        # Drop ignored tags
+        for left, right in ignored_tag_patterns:
+            for m in left.finditer(text):
+                spans.append((m.start(), m.end()))
+            for m in right.finditer(text):
+                spans.append((m.start(), m.end()))
+
+        # Bulk remove all spans
+        text = dropSpans(spans, text)
+
+        # Drop discarded elements
+        for tag in discardElements:
+            text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
+
+        if not self.toHTML:
+            # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
+            text = unescape(text)
+
+        # Expand placeholders
+        for pattern, placeholder in placeholder_tag_patterns:
+            index = 1
+            for match in pattern.finditer(text):
+                text = text.replace(match.group(), '%s_%d' % (placeholder, index))
+                index += 1
+
+        text = text.replace('<<', u'«').replace('>>', u'»')
+
+        #############################################
+
+        # Cleanup text
+        text = text.replace('\t', ' ')
+        text = spaces.sub(' ', text)
+        text = dots.sub('...', text)
+        text = re.sub(u' (,:\.\)\]»)', r'\1', text)
+        text = re.sub(u'(\[\(«) ', r'\1', text)
+        text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
+        text = text.replace(',,', ',').replace(',.', '.')
+        if escape_doc:
+            text = cgi.escape(text)
+        return text
+
 
 # ----------------------------------------------------------------------
 # Expand templates
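
A minimal sketch of the new method in use (assumes WikiExtractor.py is importable and its other module globals keep their defaults; the fragment and id are invented):

    import WikiExtractor
    from WikiExtractor import Extractor

    WikiExtractor.escape_doc = False      # normally set by main()
    Extractor.expand_templates = False    # skip the template-expansion pass
    e = Extractor('1', u'Test',
                  [u"'''Anarchism''' is a [[political philosophy]]<!--x-->{{fact}}.\n"])
    print e.clean()    # -> Anarchism is a political philosophy.

With expansion disabled, dropNested() removes the {{fact}} transclusion, the HTML comment span is dropped, the internal link is replaced by its anchor text, and the bold quotes are stripped.
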
@@ -1993,114 +2104,6 @@ tailRE = re.compile('\w+')
 
 syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL)
 
-expand_templates = True
-
-
-def clean(extractor, text):
-    """
-    Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped.
-    @see https://www.mediawiki.org/wiki/Help:Formatting
-    """
-
-    if expand_templates:
-        # expand templates
-        # See: http://www.mediawiki.org/wiki/Help:Templates
-        text = extractor.expandTemplates(text)
-    else:
-        # Drop transclusions (template, parser functions)
-        text = dropNested(text, r'{{', r'}}')
-
-    # Drop tables
-    text = dropNested(text, r'{\|', r'\|}')
-
-    # replace external links
-    text = replaceExternalLinks(text)
-
-    # replace internal links
-    text = replaceInternalLinks(text)
-
-    # drop MagicWords behavioral switches
-    text = magicWordsRE.sub('', text)
-
-    # ############### Process HTML ###############
-
-    # turn into HTML, except for the content of <syntaxhighlight>
-    res = ''
-    cur = 0
-    for m in syntaxhighlight.finditer(text):
-        end = m.end()
-        res += unescape(text[cur:m.start()]) + m.group(1)
-        cur = end
-    text = res + unescape(text[cur:])
-
-    # Handle bold/italic/quote
-    if extractor.toHTML:
-        text = bold_italic.sub(r'<b>\1</b>', text)
-        text = bold.sub(r'<b>\1</b>', text)
-        text = italic.sub(r'<i>\1</i>', text)
-    else:
-        text = bold_italic.sub(r'\1', text)
-        text = bold.sub(r'\1', text)
-        text = italic_quote.sub(r'"\1"', text)
-        text = italic.sub(r'"\1"', text)
-        text = quote_quote.sub(r'"\1"', text)
-    # residuals of unbalanced quotes
-    text = text.replace("'''", '').replace("''", '"')
-
-    # Collect spans
-
-    spans = []
-    # Drop HTML comments
-    for m in comment.finditer(text):
-        spans.append((m.start(), m.end()))
-
-    # Drop self-closing tags
-    for pattern in selfClosing_tag_patterns:
-        for m in pattern.finditer(text):
-            spans.append((m.start(), m.end()))
-
-    # Drop ignored tags
-    for left, right in ignored_tag_patterns:
-        for m in left.finditer(text):
-            spans.append((m.start(), m.end()))
-        for m in right.finditer(text):
-            spans.append((m.start(), m.end()))
-
-    # Bulk remove all spans
-    text = dropSpans(spans, text)
-
-    # Drop discarded elements
-    for tag in discardElements:
-        text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
-
-    if not extractor.toHTML:
-        # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
-        text = unescape(text)
-
-    # Expand placeholders
-    for pattern, placeholder in placeholder_tag_patterns:
-        index = 1
-        for match in pattern.finditer(text):
-            text = text.replace(match.group(), '%s_%d' % (placeholder, index))
-            index += 1
-
-    text = text.replace('<<', u'«').replace('>>', u'»')
-
-    #############################################
-
-    # Cleanup text
-    text = text.replace('\t', ' ')
-    text = spaces.sub(' ', text)
-    text = dots.sub('...', text)
-    text = re.sub(u' (,:\.\)\]»)', r'\1', text)
-    text = re.sub(u'(\[\(«) ', r'\1', text)
-    text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
-    text = text.replace(',,', ',').replace(',.', '.')
-    if escape_doc:
-        text = cgi.escape(text)
-    return text
-
-
 # skip level 1, it is page name level
 section = re.compile(r'(==+)\s*(.*?)\s*\1')
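
The heading pattern above relies on the backreference \1 to make the closing run of '=' match the opening one, and '==+' deliberately skips level-1 headings (the page title). A standalone check:

    import re
    section = re.compile(r'(==+)\s*(.*?)\s*\1')
    m = section.match('=== History ===')
    print m.group(1), m.group(2)    # === History
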
@@ -2308,13 +2311,57 @@ def load_templates(file, output_file=None):
     templatePrefix = templateNamespace + ':'
     global moduleNamespace, modulePrefix
     modulePrefix = moduleNamespace + ':'
-    articles = 0
-    page = []
-    ns = '0'
-    inText = False
     if output_file:
         output = codecs.open(output_file, 'wb', 'utf-8')
-    for line in file:
+    for page_count, page_data in enumerate(pages_from(file)):
+        id, title, ns, page = page_data
+        if not output_file and (not templateNamespace or
+                                not moduleNamespace):  # do not know it yet
+            # reconstruct templateNamespace and moduleNamespace from the first title
+            if ns in templateKeys:
+                colon = title.find(':')
+                if colon > 1:
+                    if ns == '10':
+                        templateNamespace = title[:colon]
+                        templatePrefix = title[:colon + 1]
+                    elif ns == '828':
+                        moduleNamespace = title[:colon]
+                        modulePrefix = title[:colon + 1]
+        if ns in templateKeys:
+            text = ''.join(page)
+            define_template(title, text)
+            # save templates and modules to file
+            if output_file:
+                output.write('<page>\n')
+                output.write('   <title>%s</title>\n' % title)
+                output.write('   <ns>%s</ns>\n' % ns)
+                output.write('   <id>%s</id>\n' % id)
+                output.write('   <text>')
+                for line in page:
+                    output.write(line)
+                output.write('   </text>\n')
+                output.write('</page>\n')
+        if page_count and page_count % 100000 == 0:
+            logging.info("Preprocessed %d pages", page_count)
+    if output_file:
+        output.close()
+        logging.info("Saved %d templates to '%s'", len(templates), output_file)
+
+
+def pages_from(input):
+    """
+    Scans input extracting pages.
+    :return: (id, title, namespace, page), where page is a list of lines.
+    """
+    # we collect individual lines, since str.join() is significantly faster
+    # than concatenation
+    page = []
+    id = None
+    ns = '0'
+    last_id = None
+    inText = False
+    redirect = False
+    for line in input:
         line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if inText:
@@ -2326,10 +2373,15 @@ def load_templates(file, output_file=None):
             tag = m.group(2)
             if tag == 'page':
                 page = []
+                redirect = False
+            elif tag == 'id' and not id:
+                id = m.group(3)
             elif tag == 'title':
                 title = m.group(3)
             elif tag == 'ns':
                 ns = m.group(3)
+            elif tag == 'redirect':
+                redirect = True
             elif tag == 'text':
                 inText = True
                 line = line[m.start(3):m.end(3)]
@@ -2343,37 +2395,12 @@ def load_templates(file, output_file=None):
         elif inText:
             page.append(line)
         elif tag == '/page':
-            if not output_file and not templateNamespace:  # do not know it yet
-                # reconstruct templateNamespace and moduleNamespace from the first title
-                if ns in templateKeys:
-                    colon = title.find(':')
-                    if colon > 1:
-                        if ns == '10':
-                            templateNamespace = title[:colon]
-                            templatePrefix = title[:colon + 1]
-                        elif ns == '828':
-                            moduleNamespace = title[:colon]
-                            modulePrefix = title[:colon + 1]
-            if ns in templateKeys:
-                define_template(title, page)
-                # save templates and modules to file
-                if output_file:
-                    output.write('<page>\n')
-                    output.write('   <title>%s</title>\n' % title)
-                    output.write('   <ns>%s</ns>\n' % ns)
-                    output.write('   <text>')
-                    for line in page:
-                        output.write(line)
-                    output.write('   </text>\n')
-                    output.write('</page>\n')
+            if id != last_id and not redirect:
+                yield (id, title, ns, page)
+                last_id = id
+                ns = '0'
+            id = None
             page = []
-            ns = '0'
-            articles += 1
-            if articles % 100000 == 0:
-                logging.info("Preprocessed %d pages", articles)
-    if output_file:
-        output.close()
-        logging.info("Saved %d templates to '%s'", len(templates), output_file)
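
A sketch of driving the new pages_from() iterator directly (hypothetical dump filename; assumes WikiExtractor.py is importable):

    import bz2
    from WikiExtractor import pages_from

    input = bz2.BZ2File('enwiki-pages-articles.xml.bz2')
    for id, title, ns, page in pages_from(input):
        if ns == '0':                        # articles only; templates are ns 10
            print id, title.encode('utf-8'), len(page)
    input.close()

Redirects and repeated ids are already filtered inside the generator, so each page is yielded at most once.
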
 
 
 def process_dump(input_file, template_file, out_file, file_size, file_compress,
@@ -2419,7 +2446,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
         elif tag == '/siteinfo':
             break
 
-    if expand_templates:
+    if Extractor.expand_templates:
         # preprocess
         template_load_start = default_timer()
         if template_file and os.path.exists(template_file):
@@ -2452,17 +2479,22 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     if out_file == '-':
         out_file = None
 
-    # Reduce job that sorts and prints output
-    reduce = Process(target=reduce_process,
-                     args=(output_queue, out_file, file_size, file_compress))
+
+    worker_count = max(1, process_count)
+
+    # reduce job that sorts and prints output
+    reduce = Process(target=reduce_process,
+                     args=(output_queue,
+                           out_file, file_size, file_compress))
     reduce.start()
 
     # initialize jobs queue
     jobs_queue = Queue(maxsize=maxsize)
 
     # start worker processes
-    logging.info("Using %d extract processes.", process_count)
+    logging.info("Using %d extract processes.", worker_count)
     workers = []
-    for _ in xrange(max(1, process_count)):
+    for i in xrange(worker_count):
         extractor = Process(target=extract_process,
                             args=(jobs_queue, output_queue))
         extractor.daemon = True  # only live while parent process lives
@@ -2470,59 +2502,14 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
         workers.append(extractor)
 
     # Mapper process
-
-    # we collect individual lines, since str.join() is significantly faster
-    # than concatenation
-    page = []
-    id = None
-    ns = '0'
-    last_id = None
-    ordinal = 0  # page count
-    inText = False
-    redirect = False
-    for line in input:
-        line = line.decode('utf-8')
-        if '<' not in line:  # faster than doing re.search()
-            if inText:
-                page.append(line)
-            continue
-        m = tagRE.search(line)
-        if not m:
-            continue
-        tag = m.group(2)
-        if tag == 'page':
-            page = []
-            redirect = False
-        elif tag == 'id' and not id:
-            id = m.group(3)
-        elif tag == 'title':
-            title = m.group(3)
-        elif tag == 'ns':
-            ns = m.group(3)
-        elif tag == 'redirect':
-            redirect = True
-        elif tag == 'text':
-            inText = True
-            line = line[m.start(3):m.end(3)]
-            page.append(line)
-            if m.lastindex == 4:  # open-close
-                inText = False
-        elif tag == '/text':
-            if m.group(1):
-                page.append(m.group(1))
-            inText = False
-        elif inText:
-            page.append(line)
-        elif tag == '/page':
-            if id != last_id and not redirect and ns not in templateKeys:
-                job = (id, title, page, ordinal)
-                jobs_queue.put(job)  # goes to any available extract_process
-                logging.info('%s\t%s', id, title)
-                last_id = id
-                ns = '0'
-                ordinal += 1
-            id = None
-            page = []
+    page_num = 0
+    for page_data in pages_from(input):
+        id, title, ns, page = page_data
+        if ns not in templateKeys:
+            job = (id, title, page, page_num)
+            # logging.info('Put: %s %s', id, page_num)  # DEBUG
+            jobs_queue.put(job)  # goes to any available extract_process
+            page_num += 1
 
     input.close()
@@ -2538,12 +2525,10 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     # wait for it to finish
     reduce.join()
 
-    if output != sys.stdout:
-        output.close()
     extract_duration = default_timer() - extract_start
-    extract_rate = ordinal / extract_duration
+    extract_rate = page_num / extract_duration
     logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)",
-                 process_count, ordinal, extract_duration, extract_rate)
+                 process_count, page_num, extract_duration, extract_rate)
 
 
 # ----------------------------------------------------------------------
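
In miniature, the process layout wired up above is one mapper feeding a jobs queue, worker_count extractors, and a single reduce process that restores page order. A self-contained toy version (not the patch's code; all names invented):

    from multiprocessing import Process, Queue

    def worker(jobs, results):
        while True:
            job = jobs.get()
            if job is None:                 # poison pill, like the None used above
                break
            page_num, text = job
            results.put((page_num, text.upper()))  # stands in for Extractor.extract()

    def reducer(results, expected):
        collected, next_page = {}, 0
        while next_page < expected:
            if next_page in collected:      # emit strictly in page_num order
                print collected.pop(next_page)
                next_page += 1
            else:
                page_num, text = results.get()
                collected[page_num] = text

    if __name__ == '__main__':
        jobs, results = Queue(), Queue()
        pages = ['alpha', 'beta', 'gamma', 'delta']
        workers = [Process(target=worker, args=(jobs, results)) for _ in range(2)]
        for w in workers:
            w.start()
        red = Process(target=reducer, args=(results, len(pages)))
        red.start()
        for num, text in enumerate(pages):
            jobs.put((num, text))
        for _ in workers:
            jobs.put(None)
        for w in workers:
            w.join()
        red.join()
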
""" while True: - job = jobs_queue.get() # job is (id, title, page, ordinal) + job = jobs_queue.get() # job is (id, title, page, page_num) if job: - out = StringIO() # memory buffer - Extractor(*job[:3]).extract(out) # (id, title, page) - text = out.getvalue() - output_queue.put((job[3], text)) # (ordinal, extracted_text) + id, title, page, page_num = job + # logging.info('Got: %s %s', id, page_num) # DEBUG + out = StringIO() # memory buffer + try: + Extractor(*job[:3]).extract(out) # (id, title, page) + text = out.getvalue() + except: + text = '' + logging.error('Processing page: %s %s', id, title) + # logging.info('Done: %s %s', id, page_num) # DEBUG + output_queue.put((page_num, text)) out.close() else: break -def reduce_process(output_queue, out_file=None, file_size=0, file_compress=True): +period = 100000 # progress report period +def reduce_process(output_queue, + out_file=None, file_size=0, file_compress=True): """Pull finished article text, write series of files (or stdout) :param output_queue: text to be output. :param out_file: filename where to print. @@ -2584,27 +2578,33 @@ def reduce_process(output_queue, out_file=None, file_size=0, file_compress=True) logging.warn("writing to stdout, so no output compression (use an external tool)") interval_start = default_timer() - period = 100000 # FIXME: use a heap - ordering_buffer = {} # collected pages - next_ordinal = 0 # sequence number of pages + collected_pages = {} # collected pages + next_page = 0 # sequence numbering of page while True: - if next_ordinal in ordering_buffer: - output.write(ordering_buffer.pop(next_ordinal)) - next_ordinal += 1 + if next_page in collected_pages: + output.write(collected_pages.pop(next_page)) + next_page += 1 # progress report - if next_ordinal % period == 0: + if next_page % period == 0: interval_rate = period / (default_timer() - interval_start) logging.info("Extracted %d articles (%.1f art/s)", - next_ordinal, interval_rate) + next_page, interval_rate) interval_start = default_timer() else: # mapper puts None to signal finish pair = output_queue.get() if not pair: break - ordinal, text = pair - ordering_buffer[ordinal] = text + page_num, text = pair + collected_pages[page_num] = text + # FIXME: if an extractor dies, process stalls; the other processes + # continue to produce pairs, filling up memory. 
+            if len(collected_pages) > 200:  # DEBUG
+                logging.debug('Collected %d, wait: %d, %d', len(collected_pages),
+                              next_page, next_page == page_num)
+    if output != sys.stdout:
+        output.close()
 
 
 # ----------------------------------------------------------------------
 
@@ -2615,7 +2615,7 @@ minFileSize = 200 * 1024
 
 def main():
     global urlbase, acceptedNamespaces
-    global expand_templates, templateCache, escape_doc
+    global templateCache, escape_doc
 
     parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                      formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -2669,7 +2669,7 @@ def main():
 
     if args.html:
         Extractor.keepLinks = True
 
-    expand_templates = args.no_templates
+    Extractor.expand_templates = args.no_templates
     escape_doc = args.escapedoc
 
     try:
@@ -2708,17 +2708,11 @@ def main():
             with open(args.templates) as file:
                 load_templates(file)
 
-        with open(input_file) as file:
-            page = file.read().decode('utf-8')
-            m = re.search(r'<id>(.*)</id>', page)
-            id = m.group(1) if m else 0
-            m = re.search(r'<title>(.*)</title>', page)
-            if m:
-                title = m.group(1)
-            else:
-                logging.error('Missing title element')
-                return
-            Extractor(id, title, [page]).extract(sys.stdout)
+        file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+        for page_data in pages_from(file):
+            id, title, ns, page = page_data
+            Extractor(id, title, page).extract(sys.stdout)
+        file.close()
         return
 
     output_path = args.output
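
Since the single-article branch now reads its input through fileinput with hook_compressed, a saved page may be plain, .gz or .bz2. A hypothetical invocation (file name invented):

    python WikiExtractor.py --article --no-templates saved_page.xml.bz2 > extracted.txt
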
diff --git a/extractPage.py b/extractPage.py
index 06458e1..29dfef2 100755
--- a/extractPage.py
+++ b/extractPage.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # =============================================================================
-#  Version: 2.8 (Jan 10, 2015)
+#  Version: 2.9 (Feb 13, 2016)
 #  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
 # =============================================================================
@@ -39,7 +39,7 @@ import Queue, threading, multiprocessing
 
 
 # Program version
-version = '2.8'
+version = '2.9'
 
 # ----------------------------------------------------------------------
 # READER
 
@@ -48,10 +48,11 @@ tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
 #tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)')
 #                    1     2            3
 
-def process_data(input_file, id, templates=False):
+def process_data(input_file, ids, templates=False):
     """
     :param input_file: name of the wikipedia dump file.
-    :param id: article id
+    :param ids: article ids (single id or range first-last).
+    :param templates: whether to also collect templates.
     """
 
     if input_file.lower().endswith("bz2"):
@@ -60,8 +61,16 @@ def process_data(input_file, ids, templates=False):
         opener = open
 
     input = opener(input_file)
 
+    print '<mediawiki>'
+    rang = ids.split('-')
+    first = int(rang[0])
+    if len(rang) == 1:
+        last = first
+    else:
+        last = int(rang[1])
     page = []
+    curid = 0
     for line in input:
         line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
@@ -76,11 +85,13 @@ def process_data(input_file, ids, templates=False):
                 page = []
             page.append(line)
             inArticle = False
-        elif tag == 'id':
-            curid = m.group(3)
-            if id == curid:
+        elif tag == 'id' and not curid:  # the first <id> is the page id; others follow
+            curid = int(m.group(3))
+            if first <= curid <= last:
                 page.append(line)
                 inArticle = True
+            elif curid > last and not templates:
+                break
         elif not inArticle and not templates:
             page = []
         elif tag == 'title':
@@ -95,12 +106,14 @@ def process_data(input_file, ids, templates=False):
             if page:
                 page.append(line)
                 print ''.join(page).encode('utf-8')
-            if not templates:
+            if not templates and curid == last:
                 break
+            curid = 0
             page = []
         elif page:
             page.append(line)
 
+    print '</mediawiki>'
     input.close()
 
 def main():
@@ -110,9 +123,9 @@ def main():
     parser.add_argument("input",
                         help="XML wiki dump file")
     parser.add_argument("--id", default="",
-                        help="article number")
+                        help="article number, or range first-last")
     parser.add_argument("--template", action="store_true",
-                        help="template number")
+                        help="also extract all templates")
     parser.add_argument("-v", "--version", action="version",
                         version='%(prog)s ' + version,
                         help="print program version")
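
A usage sketch of the new id-range support (dump name and ids invented); since the pages are now wrapped in a <mediawiki> element, the result is a well-formed fragment that WikiExtractor.py can consume directly:

    python extractPage.py --id 12 dump.xml.bz2 > page12.xml
    python extractPage.py --id 12-15 dump.xml.bz2 > pages.xml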