Factor all info that needs to be passed to subprocesses into an "options" variable
For things to work properly on Windows (and to make communication between processes clearer in general), the parent process should communicate with the subprocess functions only via their arguments, not via shared global variables. This change takes all the global variables that used to be implicitly shared with the subprocesses and puts them into a single "options" object (a dict-like SimpleNamespace), which is then passed to the subprocess functions. The "options" object includes not only the values of command-line arguments (e.g., "--no-templates") but also values such as the URL base and the template definitions that are precomputed before the main extraction begins.
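The hand-off works as in the minimal sketch below (simplified names, not the actual WikiExtractor code): the parent builds a picklable SimpleNamespace, passes it as a Process argument, and each child re-binds its own module-level `options` so existing code that reads `options.<field>` keeps working. On Windows, multiprocessing uses the "spawn" start method, so children re-import the module and never see globals mutated by the parent; everything must travel through the Process arguments.

```python
from multiprocessing import Process, Queue
from types import SimpleNamespace

# Module-level defaults; main() overrides these from parsed CLI args.
options = SimpleNamespace(expand_templates=True, urlbase='')

def extract_process(opts, jobs_queue):
    # Re-bind the module-level name inside the child so all existing
    # code that reads options.<field> sees the parent's configuration.
    global options
    options = opts
    while True:
        job = jobs_queue.get()
        if job is None:  # sentinel marks end of work
            break
        # ... CPU/regex-heavy extraction here, consulting options ...

if __name__ == '__main__':
    options.urlbase = 'http://en.wikipedia.org/wiki'  # e.g. discovered from <siteinfo>
    jobs = Queue()
    worker = Process(target=extract_process, args=(options, jobs))
    worker.start()  # on Windows this spawns a fresh interpreter
    jobs.put(None)
    worker.join()
```

Re-binding the global in the child, rather than threading the object through every call, keeps the diff small: the many existing references only change from bare globals to `options.<field>`.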
parent f6f80e2350
commit 19d358eee8

WikiExtractor.py (450 changes)
@@ -82,88 +82,169 @@ if PY2:
     range = xrange  # Overwrite by Python 3 name
     chr = unichr  # Overwrite by Python 3 name
     text_type = unicode

+    class SimpleNamespace(object):
+        def __init__(self, **kwargs):
+            self.__dict__.update(kwargs)
+        def __repr__(self):
+            keys = sorted(self.__dict__)
+            items = ("{}={!r}".format(k, self.__dict__[k]) for k in keys)
+            return "{}({})".format(type(self).__name__, ", ".join(items))
+        def __eq__(self, other):
+            return self.__dict__ == other.__dict__
 else:
     from urllib.parse import quote
     from html.entities import name2codepoint
     from itertools import zip_longest
+    from types import SimpleNamespace
     text_type = str


 # ===========================================================================

 # Program version
-version = '2.71'
+version = '2.70'

 ## PARAMS ####################################################################

-##
-# Defined in <siteinfo>
-# We include as default Template, when loading external template file.
-knownNamespaces = set(['Template'])
+options = SimpleNamespace(
+
+    ##
+    # Defined in <siteinfo>
+    # We include as default Template, when loading external template file.
+    knownNamespaces=set(['Template']),
+
+    ##
+    # The namespace used for template definitions
+    # It is the name associated with namespace key=10 in the siteinfo header.
+    templateNamespace='',
+    templatePrefix='',
+
+    ##
+    # The namespace used for module definitions
+    # It is the name associated with namespace key=828 in the siteinfo header.
+    moduleNamespace='',
+
+    ##
+    # Recognize only these namespaces in links
+    # w: Internal links to the Wikipedia
+    # wiktionary: Wiki dictionary
+    # wikt: shortcut for Wiktionary
+    #
+    acceptedNamespaces=['w', 'wiktionary', 'wikt'],
+
+    # This is obtained from <siteinfo>
+    urlbase='',
+
+    ##
+    # Filter disambiguation pages
+    filter_disambig_pages=False,
+
+    ##
+    # Drop tables from the article
+    keep_tables=False,
+
+    ##
+    # Whether to preserve links in output
+    keepLinks=False,
+
+    ##
+    # Whether to preserve section titles
+    keepSections=True,
+
+    ##
+    # Whether to preserve lists
+    keepLists=False,
+
+    ##
+    # Whether to output HTML instead of text
+    toHTML=False,
+
+    ##
+    # Whether to write json instead of the xml-like default output format
+    write_json=False,
+
+    ##
+    # Whether to expand templates
+    expand_templates=True,
+
+    ##
+    # Whether to escape doc content
+    escape_doc=False,
+
+    ##
+    # Print the wikipedia article revision
+    print_revision=False,
+
+    ##
+    # Minimum expanded text length required to print document
+    min_text_length=0,
+
+    # Shared objects holding templates, redirects and cache
+    templates={},
+    redirects={},
+    # cache of parser templates
+    # FIXME: sharing this with a Manager slows down.
+    templateCache={},
+
+    # Elements to ignore/discard
+    ignored_tag_patterns=[],
+
+    discardElements=[
+        'gallery', 'timeline', 'noinclude', 'pre',
+        'table', 'tr', 'td', 'th', 'caption', 'div',
+        'form', 'input', 'select', 'option', 'textarea',
+        'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
+        'ref', 'references', 'img', 'imagemap', 'source', 'small',
+        'sub', 'sup', 'indicator'
+    ],
+)
+
+"""
+globals
+-----
+discardElements
+knownNamespaces
+moduleNamespace, modulePrefix
+redirects
+templateCache
+templateNamespace, templatePrefix
+templates
+urlbase
+urlbase, acceptedNamespaces, filter_disambig_pages, keep_tables
+wgContLang
+wgExtraInterlanguageLinkPrefixes
+
+Extractor attributes
+----
+expand_templates
+keepLinks
+keepLists
+keepSections
+min_text_length
+print_revision
+toHTML
+write_json
+"""

 ##
 # Keys for Template and Module namespaces
 templateKeys = set(['10', '828'])

-##
-# The namespace used for template definitions
-# It is the name associated with namespace key=10 in the siteinfo header.
-templateNamespace = ''
-templatePrefix = ''
-
-##
-# The namespace used for module definitions
-# It is the name associated with namespace key=828 in the siteinfo header.
-moduleNamespace = ''
-
-##
-# Recognize only these namespaces in links
-# w: Internal links to the Wikipedia
-# wiktionary: Wiki dictionary
-# wikt: shortcut for Wiktionary
-#
-acceptedNamespaces = ['w', 'wiktionary', 'wikt']
-
-
-# This is obtained from <siteinfo>
-urlbase = ''
-
-##
-# Filter disambiguation pages
-filter_disambig_pages = False
 # Regex for identifying disambig pages
 filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}")

-##
-# Drop tables from the article
-keep_tables = False
-
-##
-# Ignored tags
-ignoredTags = set([
-    'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
-    'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
-    'p', 'plaintext', 's', 'span', 'strike', 'strong',
-    'tt', 'u', 'var'
-])
-
-##
-# Elements to be discarded
-discardElements = set([
-    'gallery', 'timeline', 'noinclude', 'pre',
-    'table', 'tr', 'td', 'th', 'caption', 'div',
-    'form', 'input', 'select', 'option', 'textarea',
-    'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
-    'ref', 'references', 'img', 'imagemap', 'source', 'small',
-    'sub', 'sup', 'indicator'
-])

 ##
 # page filtering logic -- remove templates, undesired xml namespaces, and disambiguation pages
 def keepPage(ns, page):
     if ns != '0':  # Article
         return False
     # remove disambig pages if desired
-    if filter_disambig_pages:
+    if options.filter_disambig_pages:
         for line in page:
             if filter_disambig_page_pattern.match(line):
                 return False
@@ -171,7 +252,7 @@ def keepPage(ns, page):


 def get_url(uid):
-    return "%s?curid=%s" % (urlbase, uid)
+    return "%s?curid=%s" % (options.urlbase, uid)


 # =========================================================================
@@ -232,7 +313,7 @@ def normalizeTitle(title):
         rest = m.group(3)

         ns = normalizeNamespace(prefix)
-        if ns in knownNamespaces:
+        if ns in options.knownNamespaces:
             # If the prefix designates a known namespace, then it might be
             # followed by optional whitespace that should be removed to get
             # the canonical page name
@@ -287,14 +368,10 @@ comment = re.compile(r'<!--.*?-->', re.DOTALL)
 nowiki = re.compile(r'<nowiki>.*?</nowiki>')


-# Match ignored tags
-ignored_tag_patterns = []
-
-
 def ignoreTag(tag):
     left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | re.DOTALL)  # both <ref> and <reference>
     right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
-    ignored_tag_patterns.append((left, right))
+    options.ignored_tag_patterns.append((left, right))

 # Match selfClosing HTML tags
 selfClosing_tag_patterns = [
@@ -475,42 +552,6 @@ class Extractor(object):
     """
     An extraction task on an article.
     """
-    ##
-    # Whether to preserve links in output
-    keepLinks = False
-
-    ##
-    # Whether to preserve section titles
-    keepSections = True
-
-    ##
-    # Whether to preserve lists
-    keepLists = False
-
-    ##
-    # Whether to output HTML instead of text
-    toHTML = False
-
-    ##
-    # Whether to write json instead of the xml-like default output format
-    write_json = False
-
-    ##
-    # Whether to expand templates
-    expand_templates = True
-
-    ##
-    # Whether to escape doc content
-    escape_doc = False
-
-    ##
-    # Print the wikipedia article revision
-    print_revision = False
-
-    ##
-    # Minimum expanded text length required to print document
-    min_text_length = 0
-
     def __init__(self, id, revid, title, lines):
         """
         :param id: id of page.
@@ -534,14 +575,14 @@ class Extractor(object):
         :param text: the text of the page
         """
         url = get_url(self.id)
-        if Extractor.write_json:
+        if options.write_json:
             json_data = {
                 'id': self.id,
                 'url': url,
                 'title': self.title,
                 'text': "\n".join(text)
             }
-            if Extractor.print_revision:
+            if options.print_revision:
                 json_data['revid'] = self.revid
             # We don't use json.dump(data, out) because we want to be
             # able to encode the string if the output is sys.stdout
@@ -551,7 +592,7 @@ class Extractor(object):
             out.write(out_str)
             out.write('\n')
         else:
-            if Extractor.print_revision:
+            if options.print_revision:
                 header = '<doc id="%s" revid="%s" url="%s" title="%s">\n' % (self.id, self.revid, url, self.title)
             else:
                 header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
@@ -573,7 +614,7 @@ class Extractor(object):
         logging.info('%s\t%s', self.id, self.title)

         # Separate header from text with a newline.
-        if self.toHTML:
+        if options.toHTML:
             title_str = '<h1>' + self.title + '</h1>'
         else:
             title_str = self.title + '\n'
@@ -599,7 +640,7 @@ class Extractor(object):
         text = compact(self.clean(text))
         text = [title_str] + text

-        if sum(len(line) for line in text) < Extractor.min_text_length:
+        if sum(len(line) for line in text) < options.min_text_length:
             return

         self.write_output(out, text)
@@ -631,7 +672,7 @@ class Extractor(object):

     def transform1(self, text):
         """Transform text not containing <nowiki>"""
-        if Extractor.expand_templates:
+        if options.expand_templates:
             # expand templates
             # See: http://www.mediawiki.org/wiki/Help:Templates
             return self.expand(text)
@@ -657,12 +698,12 @@ class Extractor(object):

         # Drop tables
         # first drop residual templates, or else empty parameter |} might look like end of table.
-        if not keep_tables:
+        if not options.keep_tables:
             text = dropNested(text, r'{{', r'}}')
             text = dropNested(text, r'{\|', r'\|}')

         # Handle bold/italic/quote
-        if self.toHTML:
+        if options.toHTML:
             text = bold_italic.sub(r'<b>\1</b>', text)
             text = bold.sub(r'<b>\1</b>', text)
             text = italic.sub(r'<i>\1</i>', text)
@@ -714,7 +755,7 @@ class Extractor(object):
             spans.append((m.start(), m.end()))

         # Drop ignored tags
-        for left, right in ignored_tag_patterns:
+        for left, right in options.ignored_tag_patterns:
             for m in left.finditer(text):
                 spans.append((m.start(), m.end()))
             for m in right.finditer(text):
@@ -724,10 +765,10 @@ class Extractor(object):
         text = dropSpans(spans, text)

         # Drop discarded elements
-        for tag in discardElements:
+        for tag in options.discardElements:
             text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)

-        if not self.toHTML:
+        if not options.toHTML:
             # Turn into text what is left (&nbsp;) and <syntaxhighlight>
             text = unescape(text)
@@ -750,7 +791,7 @@ class Extractor(object):
         text = re.sub('(\[\(«) ', r'\1', text)
         text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
         text = text.replace(',,', ',').replace(',.', '.')
-        if keep_tables:
+        if options.keep_tables:
             # the following regular expressions are used to remove the wikiml characters around table structures
             # yet keep the content. The order here is important: we remove certain markup like {| first and
             # then the future html attributes such as 'style'. Finally we drop the remaining '|-' that delimits cells.
@@ -758,7 +799,7 @@ class Extractor(object):
             text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text)
             text = text.replace('|-', '')
             text = text.replace('|', '')
-        if Extractor.toHTML:
+        if options.toHTML:
             text = cgi.escape(text)
         return text

@@ -981,18 +1022,18 @@ class Extractor(object):
             self.template_title_errs += 1
             return ''

-        redirected = redirects.get(title)
+        redirected = options.redirects.get(title)
         if redirected:
             title = redirected

         # get the template
-        if title in templateCache:
-            template = templateCache[title]
-        elif title in templates:
-            template = Template.parse(templates[title])
+        if title in options.templateCache:
+            template = options.templateCache[title]
+        elif title in options.templates:
+            template = Template.parse(options.templates[title])
             # add it to cache
-            templateCache[title] = template
-            del templates[title]
+            options.templateCache[title] = template
+            del options.templates[title]
         else:
             # The page being included could not be identified
             logging.debug('%*s<EXPAND %s %s', self.frame.depth, '', title, '')
@@ -1607,7 +1648,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
         # colon found but not in the first position - check if it
         # designates a known namespace
         prefix = normalizeNamespace(m.group(1))
-        if prefix in knownNamespaces:
+        if prefix in options.knownNamespaces:
             return prefix + ucfirst(m.group(2))
     # The title of the page being included is NOT in the main namespace and
     # lacks any other explicit designation of the namespace - therefore, it
@@ -1621,7 +1662,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
     # space]], but having in the system a redirect page with an empty title
     # causes numerous problems, so we'll live happier without it.
     if templateTitle:
-        return templatePrefix + ucfirst(templateTitle)
+        return options.templatePrefix + ucfirst(templateTitle)
     else:
         return ''  # caller may log as error
@@ -1880,28 +1921,17 @@ def callParserFunction(functionName, args, extractor):
 reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
 reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)

-# These are built before spawning processes, hence they are shared.
-templates = {}
-redirects = {}
-# cache of parser templates
-# FIXME: sharing this with a Manager slows down.
-templateCache = {}
-

 def define_template(title, page):
     """
     Adds a template defined in the :param page:.
     @see https://en.wikipedia.org/wiki/Help:Template#Noinclude.2C_includeonly.2C_and_onlyinclude
     """
-    global templates
-    global redirects
-
     # title = normalizeTitle(title)

     # check for redirects
     m = re.match('#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE)
     if m:
-        redirects[title] = m.group(1)  # normalizeTitle(m.group(1))
+        options.redirects[title] = m.group(1)  # normalizeTitle(m.group(1))
         return

     text = unescape(''.join(page))
@@ -1933,9 +1963,9 @@ def define_template(title, page):
     text = reIncludeonly.sub('', text)

     if text:
-        if title in templates:
+        if title in options.templates:
             logging.warn('Redefining: %s', title)
-        templates[title] = text
+        options.templates[title] = text


 # ----------------------------------------------------------------------
@@ -2325,14 +2355,14 @@ def replaceInternalLinks(text):

 def makeInternalLink(title, label):
     colon = title.find(':')
-    if colon > 0 and title[:colon] not in acceptedNamespaces:
+    if colon > 0 and title[:colon] not in options.acceptedNamespaces:
         return ''
     if colon == 0:
         # drop also :File:
         colon2 = title.find(':', colon + 1)
-        if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
+        if colon2 > 1 and title[colon + 1:colon2] not in options.acceptedNamespaces:
             return ''
-    if Extractor.keepLinks:
+    if options.keepLinks:
         return '<a href="%s">%s</a>' % (quote(title.encode('utf-8')), label)
     else:
         return label
@@ -2410,14 +2440,14 @@ def replaceExternalLinks(text):

 def makeExternalLink(url, anchor):
     """Function applied to wikiLinks"""
-    if Extractor.keepLinks:
+    if options.keepLinks:
         return '<a href="%s">%s</a>' % (quote(url.encode('utf-8')), anchor)
     else:
         return anchor


 def makeExternalImage(url, alt=''):
-    if Extractor.keepLinks:
+    if options.keepLinks:
         return '<img src="%s" alt="%s">' % (url, alt)
     else:
         return alt
@@ -2454,7 +2484,7 @@ def compact(text):
         # if there is an opening list, close it if we see an empty line
         if len(listLevel):
             page.append(line)
-            if Extractor.toHTML:
+            if options.toHTML:
                 for c in reversed(listLevel):
                     page.append(listClose[c])
                 listLevel = []
@@ -2467,7 +2497,7 @@ def compact(text):
         if m:
             title = m.group(2)
             lev = len(m.group(1))  # header level
-            if Extractor.toHTML:
+            if options.toHTML:
                 page.append("<h%d>%s</h%d>" % (lev, title, lev))
             if title and title[-1] not in '!?':
                 title += '.'  # terminate sentence.
@@ -2499,7 +2529,7 @@ def compact(text):
             for c, n in zip_longest(listLevel, line, fillvalue=''):
                 if not n or n not in '*#;:':  # shorter or different
                     if c:
-                        if Extractor.toHTML:
+                        if options.toHTML:
                             page.append(listClose[c])
                         listLevel = listLevel[:-1]
                         listCount = listCount[:-1]
@@ -2510,19 +2540,19 @@ def compact(text):
                 if c != n and (not c or (c not in ';:' and n not in ';:')):
                     if c:
                         # close level
-                        if Extractor.toHTML:
+                        if options.toHTML:
                             page.append(listClose[c])
                         listLevel = listLevel[:-1]
                         listCount = listCount[:-1]
                     listLevel += n
                     listCount.append(0)
-                    if Extractor.toHTML:
+                    if options.toHTML:
                         page.append(listOpen[n])
             i += 1
             n = line[i - 1]  # last list char
             line = line[i:].strip()
             if line:  # FIXME: n is '"'
-                if Extractor.keepLists:
+                if options.keepLists:
                     # emit open sections
                     items = sorted(headers.items())
                     for _, v in items:
@@ -2532,10 +2562,10 @@ def compact(text):
                     listCount[i - 1] += 1
                     bullet = '%d. ' % listCount[i - 1] if n == '#' else '- '
                     page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line)
-                elif Extractor.toHTML:
+                elif options.toHTML:
                     page.append(listItem[n] % line)
         elif len(listLevel):
-            if Extractor.toHTML:
+            if options.toHTML:
                 for c in reversed(listLevel):
                     page.append(listClose[c])
             listLevel = []
@@ -2549,7 +2579,7 @@ def compact(text):
         elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
             continue
         elif len(headers):
-            if Extractor.keepSections:
+            if options.keepSections:
                 items = sorted(headers.items())
                 for i, v in items:
                     page.append(v)
@@ -2653,26 +2683,25 @@ def load_templates(file, output_file=None):
     Load templates from :param file:.
     :param output_file: file where to save templates and modules.
     """
-    global templateNamespace, templatePrefix
-    templatePrefix = templateNamespace + ':'
-    global moduleNamespace, modulePrefix
-    modulePrefix = moduleNamespace + ':'
+    options.templatePrefix = options.templateNamespace + ':'
+    options.modulePrefix = options.moduleNamespace + ':'

     if output_file:
         output = codecs.open(output_file, 'wb', 'utf-8')
     for page_count, page_data in enumerate(pages_from(file)):
         id, revid, title, ns, page = page_data
-        if not output_file and (not templateNamespace or
-                                not moduleNamespace):  # do not know it yet
+        if not output_file and (not options.templateNamespace or
+                                not options.moduleNamespace):  # do not know it yet
             # reconstruct templateNamespace and moduleNamespace from the first title
             if ns in templateKeys:
                 colon = title.find(':')
                 if colon > 1:
                     if ns == '10':
-                        templateNamespace = title[:colon]
-                        templatePrefix = title[:colon + 1]
+                        options.templateNamespace = title[:colon]
+                        options.templatePrefix = title[:colon + 1]
                     elif ns == '828':
-                        moduleNamespace = title[:colon]
-                        modulePrefix = title[:colon + 1]
+                        options.moduleNamespace = title[:colon]
+                        options.modulePrefix = title[:colon + 1]
         if ns in templateKeys:
             text = ''.join(page)
             define_template(title, text)
@@ -2691,7 +2720,7 @@ def load_templates(file, output_file=None):
     logging.info("Preprocessed %d pages", page_count)
     if output_file:
         output.close()
-        logging.info("Saved %d templates to '%s'", len(templates), output_file)
+        logging.info("Saved %d templates to '%s'", len(options.templates), output_file)


 def pages_from(input):
@@ -2768,10 +2797,6 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     :param file_compress: whether to compress files with bzip.
     :param process_count: number of extraction processes to spawn.
     """
-    global urlbase
-    global knownNamespaces
-    global templateNamespace, templatePrefix
-    global moduleNamespace, modulePrefix

     if input_file == '-':
         input = sys.stdin
@@ -2790,19 +2815,19 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
             # discover urlbase from the xml dump file
             # /mediawiki/siteinfo/base
             base = m.group(3)
-            urlbase = base[:base.rfind("/")]
+            options.urlbase = base[:base.rfind("/")]
         elif tag == 'namespace':
-            knownNamespaces.add(m.group(3))
+            options.knownNamespaces.add(m.group(3))
             if re.search('key="10"', line):
-                templateNamespace = m.group(3)
-                templatePrefix = templateNamespace + ':'
+                options.templateNamespace = m.group(3)
+                options.templatePrefix = options.templateNamespace + ':'
             elif re.search('key="828"', line):
-                moduleNamespace = m.group(3)
-                modulePrefix = moduleNamespace + ':'
+                options.moduleNamespace = m.group(3)
+                options.modulePrefix = options.moduleNamespace + ':'
         elif tag == '/siteinfo':
             break

-    if Extractor.expand_templates:
+    if options.expand_templates:
         # preprocess
         template_load_start = default_timer()
         if template_file:
@@ -2822,7 +2847,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
             input.close()
             input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
         template_load_elapsed = default_timer() - template_load_start
-        logging.info("Loaded %d templates in %.1fs", len(templates), template_load_elapsed)
+        logging.info("Loaded %d templates in %.1fs", len(options.templates), template_load_elapsed)

     # process pages
     logging.info("Starting page extraction from %s.", input_file)
@@ -2848,7 +2873,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,

     # reduce job that sorts and prints output
     reduce = Process(target=reduce_process,
-                     args=(output_queue, spool_length,
+                     args=(options, output_queue, spool_length,
                            out_file, file_size, file_compress))
     reduce.start()
@@ -2860,7 +2885,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     workers = []
     for i in range(worker_count):
         extractor = Process(target=extract_process,
-                            args=(i, jobs_queue, output_queue))
+                            args=(options, i, jobs_queue, output_queue))
         extractor.daemon = True  # only live while parent process lives
         extractor.start()
         workers.append(extractor)
@@ -2908,13 +2933,21 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 # Multiprocess support


-def extract_process(i, jobs_queue, output_queue):
+def extract_process(opts, i, jobs_queue, output_queue):
     """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
     :param i: process id.
     :param jobs_queue: where to get jobs.
     :param output_queue: where to queue extracted text for output.
     """
+
+    global options
+    options = opts
+
+    createLogger(options.quiet, options.debug)
+
     out = StringIO()  # memory buffer

     while True:
         job = jobs_queue.get()  # job is (id, title, page, page_num)
         if job:
@@ -2938,7 +2971,7 @@ def extract_process(i, jobs_queue, output_queue):


 report_period = 10000  # progress report period
-def reduce_process(output_queue, spool_length,
+def reduce_process(opts, output_queue, spool_length,
                    out_file=None, file_size=0, file_compress=True):
     """Pull finished article text, write series of files (or stdout)
     :param output_queue: text to be output.
@@ -2948,6 +2981,11 @@ def reduce_process(output_queue, spool_length,
     :param file_compress: whether to compress output.
     """

+    global options
+    options = opts
+
+    createLogger(options.quiet, options.debug)
+
     if out_file:
         nextFile = NextFile(out_file)
         output = OutputSplitter(nextFile, file_size, file_compress)
@@ -2996,9 +3034,6 @@ def reduce_process(output_queue, spool_length,
 minFileSize = 200 * 1024

 def main():
-    global urlbase, acceptedNamespaces, filter_disambig_pages, keep_tables
-    global templateCache
-    global discardElements, ignoredTags

     parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                      formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -3032,17 +3067,17 @@ def main():
                         help="use or create file containing templates")
     groupP.add_argument("--no-templates", action="store_false",
                         help="Do not expand templates")
-    groupP.add_argument("-r", "--revision", action="store_true", default=Extractor.print_revision,
+    groupP.add_argument("-r", "--revision", action="store_true", default=options.print_revision,
                         help="Include the document revision id (default=%(default)s)")
-    groupP.add_argument("--min_text_length", type=int, default=Extractor.min_text_length,
+    groupP.add_argument("--min_text_length", type=int, default=options.min_text_length,
                         help="Minimum expanded text length required to write document (default=%(default)s)")
-    groupP.add_argument("--filter_disambig_pages", action="store_true", default=filter_disambig_pages,
+    groupP.add_argument("--filter_disambig_pages", action="store_true", default=options.filter_disambig_pages,
                         help="Remove pages from output that contain disambiguation markup (default=%(default)s)")
     groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
                         help="comma separated list of tags that will be dropped, keeping their content")
     groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
                         help="comma separated list of elements that will be removed from the article text")
-    groupP.add_argument("--keep_tables", action="store_true", default=keep_tables,
+    groupP.add_argument("--keep_tables", action="store_true", default=options.keep_tables,
                         help="Preserve tables in the output article text (default=%(default)s)")
     default_process_count = max(1, cpu_count() - 1)
     parser.add_argument("--processes", type=int, default=default_process_count,
@@ -3061,19 +3096,19 @@ def main():

     args = parser.parse_args()

-    Extractor.keepLinks = args.links
-    Extractor.keepSections = args.sections
-    Extractor.keepLists = args.lists
-    Extractor.toHTML = args.html
-    Extractor.write_json = args.json
-    Extractor.print_revision = args.revision
-    Extractor.min_text_length = args.min_text_length
+    options.keepLinks = args.links
+    options.keepSections = args.sections
+    options.keepLists = args.lists
+    options.toHTML = args.html
+    options.write_json = args.json
+    options.print_revision = args.revision
+    options.min_text_length = args.min_text_length
     if args.html:
-        Extractor.keepLinks = True
+        options.keepLinks = True

-    Extractor.expand_templates = args.no_templates
-    filter_disambig_pages = args.filter_disambig_pages
-    keep_tables = args.keep_tables
+    options.expand_templates = args.no_templates
+    options.filter_disambig_pages = args.filter_disambig_pages
+    options.keep_tables = args.keep_tables

     try:
         power = 'kmg'.find(args.bytes[-1].lower()) + 1
@@ -3085,32 +3120,37 @@ def main():
         return

     if args.namespaces:
-        acceptedNamespaces = set(args.namespaces.split(','))
+        options.acceptedNamespaces = set(args.namespaces.split(','))

+    # ignoredTags and discardElements have default values already supplied; if passed in, the defaults are overwritten
     if args.ignored_tags:
         ignoredTags = set(args.ignored_tags.split(','))
+    else:
+        ignoredTags = [
+            'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
+            'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
+            'p', 'plaintext', 's', 'span', 'strike', 'strong',
+            'tt', 'u', 'var'
+        ]

     # 'a' tag is handled separately
     for tag in ignoredTags:
         ignoreTag(tag)

     if args.discard_elements:
-        discardElements = set(args.discard_elements.split(','))
+        options.discardElements = set(args.discard_elements.split(','))

     FORMAT = '%(levelname)s: %(message)s'
     logging.basicConfig(format=FORMAT)

-    logger = logging.getLogger()
-    if not args.quiet:
-        logger.setLevel(logging.INFO)
-    if args.debug:
-        logger.setLevel(logging.DEBUG)
+    options.quiet = args.quiet
+    options.debug = args.debug
+
+    createLogger(options.quiet, options.debug)

     input_file = args.input

-    if not Extractor.keepLinks:
+    if not options.keepLinks:
         ignoreTag('a')

     # sharing cache of parser templates is too slow:
@@ -3141,6 +3181,12 @@ def main():
     process_dump(input_file, args.templates, output_path, file_size,
                  args.compress, args.processes)

+
+def createLogger(quiet, debug):
+    logger = logging.getLogger()
+    if not quiet:
+        logger.setLevel(logging.INFO)
+    if debug:
+        logger.setLevel(logging.DEBUG)
+

 if __name__ == '__main__':
     main()