do not include title in text
This commit is contained in:
parent
e689ef3233
commit
ecc7cef402
@ -81,7 +81,7 @@ if PY2:
|
||||
range = xrange # Use Python 3 equivalent
|
||||
chr = unichr # Use Python 3 equivalent
|
||||
text_type = unicode
|
||||
|
||||
|
||||
class SimpleNamespace(object):
|
||||
def __init__ (self, **kwargs):
|
||||
self.__dict__.update(kwargs)
|
||||
@ -138,11 +138,11 @@ options = SimpleNamespace(
|
||||
##
|
||||
# Filter disambiguation pages
|
||||
filter_disambig_pages = False,
|
||||
|
||||
|
||||
##
|
||||
# Whether to keep tables in the article (False drops them)
|
||||
keep_tables = False,
|
||||
|
||||
|
||||
##
|
||||
# Whether to preserve links in output
|
||||
keepLinks = False,
|
||||
@ -162,7 +162,7 @@ options = SimpleNamespace(
|
||||
##
|
||||
# Whether to write json instead of the xml-like default output format
|
||||
write_json = False,
|
||||
|
||||
|
||||
##
|
||||
# Whether to expand templates
|
||||
expand_templates = True,
|
||||
@ -178,18 +178,18 @@ options = SimpleNamespace(
|
||||
##
|
||||
# Minimum expanded text length required to print document
|
||||
min_text_length = 0,
|
||||
|
||||
|
||||
# Shared objects holding templates, redirects and cache
|
||||
templates = {},
|
||||
redirects = {},
|
||||
# cache of parser templates
|
||||
# FIXME: sharing this with a Manager slows things down.
|
||||
templateCache = {},
|
||||
|
||||
|
||||
# Elements to ignore/discard
|
||||
|
||||
|
||||
ignored_tag_patterns = [],
|
||||
|
||||
|
||||
discardElements = [
|
||||
'gallery', 'timeline', 'noinclude', 'pre',
|
||||
'table', 'tr', 'td', 'th', 'caption', 'div',
|
||||
@ -582,7 +582,7 @@ class Extractor(object):
|
||||
:param out: a memory file.
|
||||
"""
|
||||
logging.info('%s\t%s', self.id, self.title)
|
||||
|
||||
|
||||
# Separate header from text with a newline.
|
||||
if options.toHTML:
|
||||
title_str = '<h1>' + self.title + '</h1>'
|
||||
@ -629,13 +629,12 @@ class Extractor(object):
|
||||
text = self.transform(text)
|
||||
text = self.wiki2text(text)
|
||||
text = compact(self.clean(text))
|
||||
text = [title_str] + text
|
||||
|
||||
|
||||
if sum(len(line) for line in text) < options.min_text_length:
|
||||
return
|
||||
|
||||
|
||||
self.write_output(out, text)
|
||||
|
||||
|
||||
errs = (self.template_title_errs,
|
||||
self.recursion_exceeded_1_errs,
|
||||
self.recursion_exceeded_2_errs,
|
||||
@ -2991,8 +2990,8 @@ def extract_process(opts, i, jobs_queue, output_queue):
|
||||
createLogger(options.quiet, options.debug)
|
||||
|
||||
out = StringIO() # memory buffer
|
||||
|
||||
|
||||
|
||||
|
||||
while True:
|
||||
job = jobs_queue.get() # job is (id, title, page, page_num)
|
||||
if job:
|
||||
@ -3029,9 +3028,9 @@ def reduce_process(opts, output_queue, spool_length,
|
||||
|
||||
global options
|
||||
options = opts
|
||||
|
||||
|
||||
createLogger(options.quiet, options.debug)
|
||||
|
||||
|
||||
if out_file:
|
||||
nextFile = NextFile(out_file)
|
||||
output = OutputSplitter(nextFile, file_size, file_compress)
|
||||
@ -3191,7 +3190,7 @@ def main():
|
||||
|
||||
options.quiet = args.quiet
|
||||
options.debug = args.debug
|
||||
|
||||
|
||||
createLogger(options.quiet, options.debug)
|
||||
|
||||
input_file = args.input
|
||||
|
Loading…
Reference in New Issue
Block a user