do not include title in text

This commit is contained in:
Karl Stratos 2018-03-22 12:51:47 -05:00
parent e689ef3233
commit ecc7cef402

View File

@@ -81,7 +81,7 @@ if PY2:
range = xrange # Use Python 3 equivalent
chr = unichr # Use Python 3 equivalent
text_type = unicode
class SimpleNamespace(object):
def __init__ (self, **kwargs):
self.__dict__.update(kwargs)
@@ -138,11 +138,11 @@ options = SimpleNamespace(
##
# Filter disambiguation pages
filter_disambig_pages = False,
##
# Drop tables from the article
keep_tables = False,
##
# Whether to preserve links in output
keepLinks = False,
@@ -162,7 +162,7 @@ options = SimpleNamespace(
##
# Whether to write json instead of the xml-like default output format
write_json = False,
##
# Whether to expand templates
expand_templates = True,
@@ -178,18 +178,18 @@ options = SimpleNamespace(
##
# Minimum expanded text length required to print document
min_text_length = 0,
# Shared objects holding templates, redirects and cache
templates = {},
redirects = {},
# cache of parser templates
# FIXME: sharing this with a Manager slows down.
templateCache = {},
# Elements to ignore/discard
ignored_tag_patterns = [],
discardElements = [
'gallery', 'timeline', 'noinclude', 'pre',
'table', 'tr', 'td', 'th', 'caption', 'div',
@@ -582,7 +582,7 @@ class Extractor(object):
:param out: a memory file.
"""
logging.info('%s\t%s', self.id, self.title)
# Separate header from text with a newline.
if options.toHTML:
title_str = '<h1>' + self.title + '</h1>'
@@ -629,13 +629,12 @@ class Extractor(object):
text = self.transform(text)
text = self.wiki2text(text)
text = compact(self.clean(text))
text = [title_str] + text
if sum(len(line) for line in text) < options.min_text_length:
return
self.write_output(out, text)
errs = (self.template_title_errs,
self.recursion_exceeded_1_errs,
self.recursion_exceeded_2_errs,
@@ -2991,8 +2990,8 @@ def extract_process(opts, i, jobs_queue, output_queue):
createLogger(options.quiet, options.debug)
out = StringIO() # memory buffer
while True:
job = jobs_queue.get() # job is (id, title, page, page_num)
if job:
@@ -3029,9 +3028,9 @@ def reduce_process(opts, output_queue, spool_length,
global options
options = opts
createLogger(options.quiet, options.debug)
if out_file:
nextFile = NextFile(out_file)
output = OutputSplitter(nextFile, file_size, file_compress)
@@ -3191,7 +3190,7 @@ def main():
options.quiet = args.quiet
options.debug = args.debug
createLogger(options.quiet, options.debug)
input_file = args.input