Factor all info that needs to be passed to subprocesses into "options" variable

For extraction to work properly on Windows (where multiprocessing spawns fresh interpreter processes rather than forking, so module-level globals set by the parent are not inherited by its children), and to make the communication between processes clearer in general, the parent process should communicate with the subprocess functions only via their arguments, not via shared global variables. This change takes all the global variables that used to be implicitly shared with the subprocesses and puts them into a single "options" object (a dict-like SimpleNamespace), which is then passed to the subprocess functions.

The "options" object includes not only the values of command-line arguments (e.g., "--no-templates") but also stuff like the "URL base" and template definitions that are precomputed before the main extraction begins.
Author: BrenBarn
Date:   2017-02-26 11:49:00 -08:00
parent f6f80e2350
commit 19d358eee8


@@ -82,88 +82,169 @@ if PY2:
     range = xrange  # Overwrite by Python 3 name
     chr = unichr  # Overwrite by Python 3 name
     text_type = unicode
+
+    class SimpleNamespace (object):
+
+        def __init__ (self, **kwargs):
+            self.__dict__.update(kwargs)
+
+        def __repr__ (self):
+            keys = sorted(self.__dict__)
+            items = ("{}={!r}".format(k, self.__dict__[k]) for k in keys)
+            return "{}({})".format(type(self).__name__, ", ".join(items))
+
+        def __eq__ (self, other):
+            return self.__dict__ == other.__dict__
 else:
     from urllib.parse import quote
     from html.entities import name2codepoint
     from itertools import zip_longest
+    from types import SimpleNamespace
     text_type = str

 # ===========================================================================

 # Program version
-version = '2.71'
+version = '2.70'

 ## PARAMS ####################################################################

-##
-# Defined in <siteinfo>
-# We include as default Template, when loading external template file.
-knownNamespaces = set(['Template'])
+options = SimpleNamespace(
+
+    ##
+    # Defined in <siteinfo>
+    # We include as default Template, when loading external template file.
+    knownNamespaces=set(['Template']),
+
+    ##
+    # The namespace used for template definitions
+    # It is the name associated with namespace key=10 in the siteinfo header.
+    templateNamespace='',
+    templatePrefix='',
+
+    ##
+    # The namespace used for module definitions
+    # It is the name associated with namespace key=828 in the siteinfo header.
+    moduleNamespace='',
+
+    ##
+    # Recognize only these namespaces in links
+    # w: Internal links to the Wikipedia
+    # wiktionary: Wiki dictionary
+    # wikt: shortcut for Wiktionary
+    #
+    acceptedNamespaces=['w', 'wiktionary', 'wikt'],
+
+    # This is obtained from <siteinfo>
+    urlbase='',
+
+    ##
+    # Filter disambiguation pages
+    filter_disambig_pages=False,
+
+    ##
+    # Drop tables from the article
+    keep_tables=False,
+
+    ##
+    # Whether to preserve links in output
+    keepLinks=False,
+
+    ##
+    # Whether to preserve section titles
+    keepSections=True,
+
+    ##
+    # Whether to preserve lists
+    keepLists=False,
+
+    ##
+    # Whether to output HTML instead of text
+    toHTML=False,
+
+    ##
+    # Whether to write json instead of the xml-like default output format
+    write_json=False,
+
+    ##
+    # Whether to expand templates
+    expand_templates=True,
+
+    ##
+    ## Whether to escape doc content
+    escape_doc=False,
+
+    ##
+    # Print the wikipedia article revision
+    print_revision=False,
+
+    ##
+    # Minimum expanded text length required to print document
+    min_text_length=0,
+
+    # Shared objects holding templates, redirects and cache
+    templates={},
+    redirects={},
+    # cache of parser templates
+    # FIXME: sharing this with a Manager slows down.
+    templateCache={},
+
+    # Elements to ignore/discard
+    ignored_tag_patterns = [],
+
+    discardElements = [
+        'gallery', 'timeline', 'noinclude', 'pre',
+        'table', 'tr', 'td', 'th', 'caption', 'div',
+        'form', 'input', 'select', 'option', 'textarea',
+        'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
+        'ref', 'references', 'img', 'imagemap', 'source', 'small',
+        'sub', 'sup', 'indicator'
+    ],
+)
"""
globals
-----
discardElements
knownNamespaces
moduleNamespace, modulePrefix
moduleNamespace, modulePrefix
redirects
templateCache
templateNamespace, templatePrefix
templateNamespace, templatePrefix
templates
urlbase
urlbase, acceptedNamespaces, filter_disambig_pages, keep_tables
wgContLang
wgExtraInterlanguageLinkPrefixes
Extractor attributes
----
expand_templates
keepLinks
keepLists
keepSections
min_text_length
print_revision
toHTML
write_json
"""

 ##
 # Keys for Template and Module namespaces
 templateKeys = set(['10', '828'])

-##
-# The namespace used for template definitions
-# It is the name associated with namespace key=10 in the siteinfo header.
-templateNamespace = ''
-templatePrefix = ''
-
-##
-# The namespace used for module definitions
-# It is the name associated with namespace key=828 in the siteinfo header.
-moduleNamespace = ''
-
-##
-# Recognize only these namespaces in links
-# w: Internal links to the Wikipedia
-# wiktionary: Wiki dictionary
-# wikt: shortcut for Wiktionary
-#
-acceptedNamespaces = ['w', 'wiktionary', 'wikt']
-
-# This is obtained from <siteinfo>
-urlbase = ''
-
-##
-# Filter disambiguation pages
-filter_disambig_pages = False

 # Regex for identifying disambig pages
 filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}")

-##
-# Drop tables from the article
-keep_tables = False
-
-##
-# Ignored tags
-ignoredTags = set([
-    'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
-    'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
-    'p', 'plaintext', 's', 'span', 'strike', 'strong',
-    'tt', 'u', 'var'
-])
-
-##
-# Elements to be discarded
-discardElements = set([
-    'gallery', 'timeline', 'noinclude', 'pre',
-    'table', 'tr', 'td', 'th', 'caption', 'div',
-    'form', 'input', 'select', 'option', 'textarea',
-    'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
-    'ref', 'references', 'img', 'imagemap', 'source', 'small',
-    'sub', 'sup', 'indicator'
-])

 ##
 # page filtering logic -- remove templates, undesired xml namespaces, and disambiguation pages
 def keepPage(ns, page):
     if ns != '0':               # Article
         return False
     # remove disambig pages if desired
-    if filter_disambig_pages:
+    if options.filter_disambig_pages:
         for line in page:
             if filter_disambig_page_pattern.match(line):
                 return False
@@ -171,7 +252,7 @@ def keepPage(ns, page):

 def get_url(uid):
-    return "%s?curid=%s" % (urlbase, uid)
+    return "%s?curid=%s" % (options.urlbase, uid)


 # =========================================================================
@@ -232,7 +313,7 @@ def normalizeTitle(title):
         rest = m.group(3)

         ns = normalizeNamespace(prefix)
-        if ns in knownNamespaces:
+        if ns in options.knownNamespaces:
             # If the prefix designates a known namespace, then it might be
             # followed by optional whitespace that should be removed to get
             # the canonical page name
@@ -287,14 +368,10 @@ comment = re.compile(r'<!--.*?-->', re.DOTALL)

 nowiki = re.compile(r'<nowiki>.*?</nowiki>')

-# Match ignored tags
-ignored_tag_patterns = []

 def ignoreTag(tag):
     left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | re.DOTALL)  # both <ref> and <reference>
     right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
-    ignored_tag_patterns.append((left, right))
+    options.ignored_tag_patterns.append((left, right))

 # Match selfClosing HTML tags
 selfClosing_tag_patterns = [
@@ -475,42 +552,6 @@ class Extractor(object):
     """
     An extraction task on an article.
     """
-    ##
-    # Whether to preserve links in output
-    keepLinks = False
-
-    ##
-    # Whether to preserve section titles
-    keepSections = True
-
-    ##
-    # Whether to preserve lists
-    keepLists = False
-
-    ##
-    # Whether to output HTML instead of text
-    toHTML = False
-
-    ##
-    # Whether to write json instead of the xml-like default output format
-    write_json = False
-
-    ##
-    # Whether to expand templates
-    expand_templates = True
-
-    ##
-    ## Whether to escape doc content
-    escape_doc = False
-
-    ##
-    # Print the wikipedia article revision
-    print_revision = False
-
-    ##
-    # Minimum expanded text length required to print document
-    min_text_length = 0

     def __init__(self, id, revid, title, lines):
         """
         :param id: id of page.
@@ -534,14 +575,14 @@ class Extractor(object):
         :param text: the text of the page
         """
         url = get_url(self.id)
-        if Extractor.write_json:
+        if options.write_json:
             json_data = {
                 'id': self.id,
                 'url': url,
                 'title': self.title,
                 'text': "\n".join(text)
             }
-            if Extractor.print_revision:
+            if options.print_revision:
                 json_data['revid'] = self.revid
             # We don't use json.dump(data, out) because we want to be
             # able to encode the string if the output is sys.stdout
@@ -551,7 +592,7 @@
             out.write(out_str)
             out.write('\n')
         else:
-            if Extractor.print_revision:
+            if options.print_revision:
                 header = '<doc id="%s" revid="%s" url="%s" title="%s">\n' % (self.id, self.revid, url, self.title)
             else:
                 header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
@@ -573,7 +614,7 @@
         logging.info('%s\t%s', self.id, self.title)

         # Separate header from text with a newline.
-        if self.toHTML:
+        if options.toHTML:
             title_str = '<h1>' + self.title + '</h1>'
         else:
             title_str = self.title + '\n'
@@ -599,7 +640,7 @@
         text = compact(self.clean(text))
         text = [title_str] + text

-        if sum(len(line) for line in text) < Extractor.min_text_length:
+        if sum(len(line) for line in text) < options.min_text_length:
             return

         self.write_output(out, text)
@@ -631,7 +672,7 @@

     def transform1(self, text):
         """Transform text not containing <nowiki>"""
-        if Extractor.expand_templates:
+        if options.expand_templates:
             # expand templates
             # See: http://www.mediawiki.org/wiki/Help:Templates
             return self.expand(text)
@@ -657,12 +698,12 @@
         # Drop tables
         # first drop residual templates, or else empty parameter |} might look like end of table.
-        if not keep_tables:
+        if not options.keep_tables:
             text = dropNested(text, r'{{', r'}}')
             text = dropNested(text, r'{\|', r'\|}')

         # Handle bold/italic/quote
-        if self.toHTML:
+        if options.toHTML:
             text = bold_italic.sub(r'<b>\1</b>', text)
             text = bold.sub(r'<b>\1</b>', text)
             text = italic.sub(r'<i>\1</i>', text)
@@ -714,7 +755,7 @@
                 spans.append((m.start(), m.end()))

         # Drop ignored tags
-        for left, right in ignored_tag_patterns:
+        for left, right in options.ignored_tag_patterns:
             for m in left.finditer(text):
                 spans.append((m.start(), m.end()))
             for m in right.finditer(text):
@@ -724,10 +765,10 @@
         text = dropSpans(spans, text)

         # Drop discarded elements
-        for tag in discardElements:
+        for tag in options.discardElements:
             text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)

-        if not self.toHTML:
+        if not options.toHTML:
             # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
             text = unescape(text)
@@ -750,7 +791,7 @@
         text = re.sub('(\[\(«) ', r'\1', text)
         text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
         text = text.replace(',,', ',').replace(',.', '.')
-        if keep_tables:
+        if options.keep_tables:
             # the following regular expressions are used to remove the wikiml characters around table structures
             # yet keep the content. The order here is important: we remove certain markup like {| and then
             # the future html attributes such as 'style'. Finally we drop the remaining '|-' that delimits cells.
@@ -758,7 +799,7 @@
             text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text)
             text = text.replace('|-', '')
             text = text.replace('|', '')
-        if Extractor.toHTML:
+        if options.toHTML:
             text = cgi.escape(text)

         return text
@@ -981,18 +1022,18 @@
             self.template_title_errs += 1
             return ''

-        redirected = redirects.get(title)
+        redirected = options.redirects.get(title)
         if redirected:
             title = redirected

         # get the template
-        if title in templateCache:
-            template = templateCache[title]
-        elif title in templates:
-            template = Template.parse(templates[title])
+        if title in options.templateCache:
+            template = options.templateCache[title]
+        elif title in options.templates:
+            template = Template.parse(options.templates[title])
             # add it to cache
-            templateCache[title] = template
-            del templates[title]
+            options.templateCache[title] = template
+            del options.templates[title]
         else:
             # The page being included could not be identified
             logging.debug('%*s<EXPAND %s %s', self.frame.depth, '', title, '')
@@ -1607,7 +1648,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
         # colon found but not in the first position - check if it
         # designates a known namespace
         prefix = normalizeNamespace(m.group(1))
-        if prefix in knownNamespaces:
+        if prefix in options.knownNamespaces:
             return prefix + ucfirst(m.group(2))
     # The title of the page being included is NOT in the main namespace and
     # lacks any other explicit designation of the namespace - therefore, it
@@ -1621,7 +1662,7 @@
     # space]], but having in the system a redirect page with an empty title
     # causes numerous problems, so we'll live happier without it.
     if templateTitle:
-        return templatePrefix + ucfirst(templateTitle)
+        return options.templatePrefix + ucfirst(templateTitle)
     else:
         return ''  # caller may log as error
@@ -1880,28 +1921,17 @@ def callParserFunction(functionName, args, extractor):

 reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
 reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)

-# These are built before spawning processes, hence they are shared.
-templates = {}
-redirects = {}
-# cache of parser templates
-# FIXME: sharing this with a Manager slows down.
-templateCache = {}

 def define_template(title, page):
     """
     Adds a template defined in the :param page:.
     @see https://en.wikipedia.org/wiki/Help:Template#Noinclude.2C_includeonly.2C_and_onlyinclude
     """
-    global templates
-    global redirects

     # title = normalizeTitle(title)

     # check for redirects
     m = re.match('#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE)
     if m:
-        redirects[title] = m.group(1)  # normalizeTitle(m.group(1))
+        options.redirects[title] = m.group(1)  # normalizeTitle(m.group(1))
         return

     text = unescape(''.join(page))
@@ -1933,9 +1963,9 @@ def define_template(title, page):
         text = reIncludeonly.sub('', text)

     if text:
-        if title in templates:
+        if title in options.templates:
             logging.warn('Redefining: %s', title)
-        templates[title] = text
+        options.templates[title] = text


 # ----------------------------------------------------------------------
@@ -2325,14 +2355,14 @@ def replaceInternalLinks(text):

 def makeInternalLink(title, label):
     colon = title.find(':')
-    if colon > 0 and title[:colon] not in acceptedNamespaces:
+    if colon > 0 and title[:colon] not in options.acceptedNamespaces:
         return ''
     if colon == 0:
         # drop also :File:
         colon2 = title.find(':', colon + 1)
-        if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
+        if colon2 > 1 and title[colon + 1:colon2] not in options.acceptedNamespaces:
             return ''
-    if Extractor.keepLinks:
+    if options.keepLinks:
         return '<a href="%s">%s</a>' % (quote(title.encode('utf-8')), label)
     else:
         return label
@@ -2410,14 +2440,14 @@ def replaceExternalLinks(text):

 def makeExternalLink(url, anchor):
     """Function applied to wikiLinks"""
-    if Extractor.keepLinks:
+    if options.keepLinks:
         return '<a href="%s">%s</a>' % (quote(url.encode('utf-8')), anchor)
     else:
         return anchor


 def makeExternalImage(url, alt=''):
-    if Extractor.keepLinks:
+    if options.keepLinks:
         return '<img src="%s" alt="%s">' % (url, alt)
     else:
         return alt
@@ -2454,7 +2484,7 @@ def compact(text):
             # if there is an opening list, close it if we see an empty line
             if len(listLevel):
                 page.append(line)
-                if Extractor.toHTML:
+                if options.toHTML:
                     for c in reversed(listLevel):
                         page.append(listClose[c])
                 listLevel = []
@@ -2467,7 +2497,7 @@
         if m:
             title = m.group(2)
             lev = len(m.group(1))  # header level
-            if Extractor.toHTML:
+            if options.toHTML:
                 page.append("<h%d>%s</h%d>" % (lev, title, lev))
             if title and title[-1] not in '!?':
                 title += '.'  # terminate sentence.
@@ -2499,7 +2529,7 @@
             for c, n in zip_longest(listLevel, line, fillvalue=''):
                 if not n or n not in '*#;:':  # shorter or different
                     if c:
-                        if Extractor.toHTML:
+                        if options.toHTML:
                             page.append(listClose[c])
                         listLevel = listLevel[:-1]
                         listCount = listCount[:-1]
@@ -2510,19 +2540,19 @@
                 if c != n and (not c or (c not in ';:' and n not in ';:')):
                     if c:
                         # close level
-                        if Extractor.toHTML:
+                        if options.toHTML:
                             page.append(listClose[c])
                         listLevel = listLevel[:-1]
                         listCount = listCount[:-1]
                     listLevel += n
                     listCount.append(0)
-                    if Extractor.toHTML:
+                    if options.toHTML:
                         page.append(listOpen[n])
                 i += 1
             n = line[i - 1]  # last list char
             line = line[i:].strip()
             if line:  # FIXME: n is '"'
-                if Extractor.keepLists:
+                if options.keepLists:
                     # emit open sections
                     items = sorted(headers.items())
                     for _, v in items:
@@ -2532,10 +2562,10 @@
                     listCount[i - 1] += 1
                     bullet = '%d. ' % listCount[i - 1] if n == '#' else '- '
                     page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line)
-                elif Extractor.toHTML:
+                elif options.toHTML:
                     page.append(listItem[n] % line)
         elif len(listLevel):
-            if Extractor.toHTML:
+            if options.toHTML:
                 for c in reversed(listLevel):
                     page.append(listClose[c])
             listLevel = []
@@ -2549,7 +2579,7 @@
         elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
             continue
         elif len(headers):
-            if Extractor.keepSections:
+            if options.keepSections:
                 items = sorted(headers.items())
                 for i, v in items:
                     page.append(v)
@@ -2653,26 +2683,25 @@ def load_templates(file, output_file=None):
     Load templates from :param file:.
     :param output_file: file where to save templates and modules.
     """
-    global templateNamespace, templatePrefix
-    templatePrefix = templateNamespace + ':'
-    global moduleNamespace, modulePrefix
-    modulePrefix = moduleNamespace + ':'
+    options.templatePrefix = options.templateNamespace + ':'
+    options.modulePrefix = options.moduleNamespace + ':'

     if output_file:
         output = codecs.open(output_file, 'wb', 'utf-8')
     for page_count, page_data in enumerate(pages_from(file)):
         id, revid, title, ns, page = page_data
-        if not output_file and (not templateNamespace or
-                                not moduleNamespace):  # do not know it yet
+        if not output_file and (not options.templateNamespace or
+                                not options.moduleNamespace):  # do not know it yet
             # reconstruct templateNamespace and moduleNamespace from the first title
             if ns in templateKeys:
                 colon = title.find(':')
                 if colon > 1:
                     if ns == '10':
-                        templateNamespace = title[:colon]
-                        templatePrefix = title[:colon + 1]
+                        options.templateNamespace = title[:colon]
+                        options.templatePrefix = title[:colon + 1]
                     elif ns == '828':
-                        moduleNamespace = title[:colon]
-                        modulePrefix = title[:colon + 1]
+                        options.moduleNamespace = title[:colon]
+                        options.modulePrefix = title[:colon + 1]
         if ns in templateKeys:
             text = ''.join(page)
             define_template(title, text)
@@ -2691,7 +2720,7 @@
     logging.info("Preprocessed %d pages", page_count)
     if output_file:
         output.close()
-        logging.info("Saved %d templates to '%s'", len(templates), output_file)
+        logging.info("Saved %d templates to '%s'", len(options.templates), output_file)


 def pages_from(input):
@@ -2768,10 +2797,6 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     :param file_compress: whether to compress files with bzip.
     :param process_count: number of extraction processes to spawn.
     """
-    global urlbase
-    global knownNamespaces
-    global templateNamespace, templatePrefix
-    global moduleNamespace, modulePrefix

     if input_file == '-':
         input = sys.stdin
@@ -2790,19 +2815,19 @@
                 # discover urlbase from the xml dump file
                 # /mediawiki/siteinfo/base
                 base = m.group(3)
-                urlbase = base[:base.rfind("/")]
+                options.urlbase = base[:base.rfind("/")]
             elif tag == 'namespace':
-                knownNamespaces.add(m.group(3))
+                options.knownNamespaces.add(m.group(3))
                 if re.search('key="10"', line):
-                    templateNamespace = m.group(3)
-                    templatePrefix = templateNamespace + ':'
+                    options.templateNamespace = m.group(3)
+                    options.templatePrefix = options.templateNamespace + ':'
                 elif re.search('key="828"', line):
-                    moduleNamespace = m.group(3)
-                    modulePrefix = moduleNamespace + ':'
+                    options.moduleNamespace = m.group(3)
+                    options.modulePrefix = options.moduleNamespace + ':'
             elif tag == '/siteinfo':
                 break

-    if Extractor.expand_templates:
+    if options.expand_templates:
         # preprocess
         template_load_start = default_timer()
         if template_file:
@@ -2822,7 +2847,7 @@
             input.close()
             input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
         template_load_elapsed = default_timer() - template_load_start
-        logging.info("Loaded %d templates in %.1fs", len(templates), template_load_elapsed)
+        logging.info("Loaded %d templates in %.1fs", len(options.templates), template_load_elapsed)

     # process pages
     logging.info("Starting page extraction from %s.", input_file)
@@ -2848,7 +2873,7 @@

     # reduce job that sorts and prints output
     reduce = Process(target=reduce_process,
-                     args=(output_queue, spool_length,
+                     args=(options, output_queue, spool_length,
                            out_file, file_size, file_compress))
     reduce.start()
@@ -2860,7 +2885,7 @@
     workers = []
     for i in range(worker_count):
         extractor = Process(target=extract_process,
-                            args=(i, jobs_queue, output_queue))
+                            args=(options, i, jobs_queue, output_queue))
         extractor.daemon = True  # only live while parent process lives
         extractor.start()
         workers.append(extractor)
@@ -2908,13 +2933,21 @@

 # Multiprocess support

-def extract_process(i, jobs_queue, output_queue):
+def extract_process(opts, i, jobs_queue, output_queue):
     """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
     :param i: process id.
     :param jobs_queue: where to get jobs.
     :param output_queue: where to queue extracted text for output.
     """
+    global options
+    options = opts
+
+    createLogger(options.quiet, options.debug)
+
     out = StringIO()  # memory buffer

     while True:
         job = jobs_queue.get()  # job is (id, title, page, page_num)
         if job:
@@ -2938,7 +2971,7 @@ def extract_process(i, jobs_queue, output_queue):

 report_period = 10000  # progress report period

-def reduce_process(output_queue, spool_length,
+def reduce_process(opts, output_queue, spool_length,
                    out_file=None, file_size=0, file_compress=True):
     """Pull finished article text, write series of files (or stdout)
     :param output_queue: text to be output.
@@ -2948,6 +2981,11 @@ def reduce_process(output_queue, spool_length,
     :param file_compress: whether to compress output.
     """
+    global options
+    options = opts
+
+    createLogger(options.quiet, options.debug)
+
     if out_file:
         nextFile = NextFile(out_file)
         output = OutputSplitter(nextFile, file_size, file_compress)
@@ -2996,9 +3034,6 @@ def reduce_process(output_queue, spool_length,
 minFileSize = 200 * 1024


 def main():
-    global urlbase, acceptedNamespaces, filter_disambig_pages, keep_tables
-    global templateCache
-    global discardElements, ignoredTags

     parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                      formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -3032,17 +3067,17 @@ def main():
                         help="use or create file containing templates")
     groupP.add_argument("--no-templates", action="store_false",
                         help="Do not expand templates")
-    groupP.add_argument("-r", "--revision", action="store_true", default=Extractor.print_revision,
+    groupP.add_argument("-r", "--revision", action="store_true", default=options.print_revision,
                         help="Include the document revision id (default=%(default)s)")
-    groupP.add_argument("--min_text_length", type=int, default=Extractor.min_text_length,
+    groupP.add_argument("--min_text_length", type=int, default=options.min_text_length,
                         help="Minimum expanded text length required to write document (default=%(default)s)")
-    groupP.add_argument("--filter_disambig_pages", action="store_true", default=filter_disambig_pages,
+    groupP.add_argument("--filter_disambig_pages", action="store_true", default=options.filter_disambig_pages,
                         help="Remove pages from output that contain disambiguation markup (default=%(default)s)")
     groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
                         help="comma separated list of tags that will be dropped, keeping their content")
     groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
                         help="comma separated list of elements that will be removed from the article text")
-    groupP.add_argument("--keep_tables", action="store_true", default=keep_tables,
+    groupP.add_argument("--keep_tables", action="store_true", default=options.keep_tables,
                         help="Preserve tables in the output article text (default=%(default)s)")
     default_process_count = max(1, cpu_count() - 1)
     parser.add_argument("--processes", type=int, default=default_process_count,
@@ -3061,19 +3096,19 @@ def main():

     args = parser.parse_args()

-    Extractor.keepLinks = args.links
-    Extractor.keepSections = args.sections
-    Extractor.keepLists = args.lists
-    Extractor.toHTML = args.html
-    Extractor.write_json = args.json
-    Extractor.print_revision = args.revision
-    Extractor.min_text_length = args.min_text_length
+    options.keepLinks = args.links
+    options.keepSections = args.sections
+    options.keepLists = args.lists
+    options.toHTML = args.html
+    options.write_json = args.json
+    options.print_revision = args.revision
+    options.min_text_length = args.min_text_length
     if args.html:
-        Extractor.keepLinks = True
+        options.keepLinks = True

-    Extractor.expand_templates = args.no_templates
-    filter_disambig_pages = args.filter_disambig_pages
-    keep_tables = args.keep_tables
+    options.expand_templates = args.no_templates
+    options.filter_disambig_pages = args.filter_disambig_pages
+    options.keep_tables = args.keep_tables

     try:
         power = 'kmg'.find(args.bytes[-1].lower()) + 1
@@ -3085,32 +3120,37 @@
         return

     if args.namespaces:
-        acceptedNamespaces = set(args.namespaces.split(','))
+        options.acceptedNamespaces = set(args.namespaces.split(','))

     # ignoredTags and discardElements have default values already supplied, if passed in the defaults are overwritten
     if args.ignored_tags:
         ignoredTags = set(args.ignored_tags.split(','))
+    else:
+        ignoredTags = [
+            'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
+            'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
+            'p', 'plaintext', 's', 'span', 'strike', 'strong',
+            'tt', 'u', 'var'
+        ]

     # 'a' tag is handled separately
     for tag in ignoredTags:
         ignoreTag(tag)

     if args.discard_elements:
-        discardElements = set(args.discard_elements.split(','))
+        options.discardElements = set(args.discard_elements.split(','))

     FORMAT = '%(levelname)s: %(message)s'
     logging.basicConfig(format=FORMAT)

-    logger = logging.getLogger()
-    if not args.quiet:
-        logger.setLevel(logging.INFO)
-    if args.debug:
-        logger.setLevel(logging.DEBUG)
+    options.quiet = args.quiet
+    options.debug = args.debug
+
+    createLogger(options.quiet, options.debug)

     input_file = args.input

-    if not Extractor.keepLinks:
+    if not options.keepLinks:
         ignoreTag('a')

     # sharing cache of parser templates is too slow:
@@ -3141,6 +3181,12 @@ def main():
     process_dump(input_file, args.templates, output_path, file_size,
                  args.compress, args.processes)


+def createLogger(quiet, debug):
+    logger = logging.getLogger()
+    if not quiet:
+        logger.setLevel(logging.INFO)
+    if debug:
+        logger.setLevel(logging.DEBUG)

 if __name__ == '__main__':
     main()