Print a friendly error message
This commit is contained in:
parent
cc04dae71c
commit
5274829e16
@ -81,7 +81,7 @@ if PY2:
|
||||
range = xrange # Use Python 3 equivalent
|
||||
chr = unichr # Use Python 3 equivalent
|
||||
text_type = unicode
|
||||
|
||||
|
||||
class SimpleNamespace(object):
|
||||
def __init__ (self, **kwargs):
|
||||
self.__dict__.update(kwargs)
|
||||
@ -138,11 +138,11 @@ options = SimpleNamespace(
|
||||
##
|
||||
# Filter disambiguation pages
|
||||
filter_disambig_pages = False,
|
||||
|
||||
|
||||
##
|
||||
# Whether to keep tables in the article
|
||||
keep_tables = False,
|
||||
|
||||
|
||||
##
|
||||
# Whether to preserve links in output
|
||||
keepLinks = False,
|
||||
@ -162,7 +162,7 @@ options = SimpleNamespace(
|
||||
##
|
||||
# Whether to write json instead of the xml-like default output format
|
||||
write_json = False,
|
||||
|
||||
|
||||
##
|
||||
# Whether to expand templates
|
||||
expand_templates = True,
|
||||
@ -178,16 +178,16 @@ options = SimpleNamespace(
|
||||
##
|
||||
# Minimum expanded text length required to print document
|
||||
min_text_length = 0,
|
||||
|
||||
|
||||
# Shared objects holding templates, redirects and cache
|
||||
templates = {},
|
||||
redirects = {},
|
||||
# cache of parser templates
|
||||
# FIXME: sharing this with a Manager slows down.
|
||||
templateCache = {},
|
||||
|
||||
|
||||
# Elements to ignore/discard
|
||||
|
||||
|
||||
ignored_tag_patterns = [],
|
||||
filter_category_include = set(),
|
||||
filter_category_exclude = set(),
|
||||
@ -599,7 +599,7 @@ class Extractor(object):
|
||||
:param out: a memory file.
|
||||
"""
|
||||
logging.info('%s\t%s', self.id, self.title)
|
||||
|
||||
|
||||
# Separate header from text with a newline.
|
||||
if options.toHTML:
|
||||
title_str = '<h1>' + self.title + '</h1>'
|
||||
@ -647,12 +647,12 @@ class Extractor(object):
|
||||
text = self.wiki2text(text)
|
||||
text = compact(self.clean(text))
|
||||
text = [title_str] + text
|
||||
|
||||
|
||||
if sum(len(line) for line in text) < options.min_text_length:
|
||||
return
|
||||
|
||||
|
||||
self.write_output(out, text)
|
||||
|
||||
|
||||
errs = (self.template_title_errs,
|
||||
self.recursion_exceeded_1_errs,
|
||||
self.recursion_exceeded_2_errs,
|
||||
@ -3016,8 +3016,8 @@ def extract_process(opts, i, jobs_queue, output_queue):
|
||||
createLogger(options.quiet, options.debug, options.log_file)
|
||||
|
||||
out = StringIO() # memory buffer
|
||||
|
||||
|
||||
|
||||
|
||||
while True:
|
||||
job = jobs_queue.get() # job is (id, title, page, page_num)
|
||||
if job:
|
||||
@ -3054,9 +3054,9 @@ def reduce_process(opts, output_queue, spool_length,
|
||||
|
||||
global options
|
||||
options = opts
|
||||
|
||||
|
||||
createLogger(options.quiet, options.debug, options.log_file)
|
||||
|
||||
|
||||
if out_file:
|
||||
nextFile = NextFile(out_file)
|
||||
output = OutputSplitter(nextFile, file_size, file_compress)
|
||||
@ -3256,6 +3256,7 @@ def main():
|
||||
filter_category = args.filter_category
|
||||
if (filter_category != None and len(filter_category)>0):
|
||||
with open(filter_category) as f:
|
||||
error_cnt = 0
|
||||
for line in f.readlines():
|
||||
try:
|
||||
line = str(line.strip())
|
||||
@ -3266,8 +3267,8 @@ def main():
|
||||
else:
|
||||
options.filter_category_include.add(line)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print(u"Category not in utf8, ignored:\t")
|
||||
error_cnt += 1
|
||||
print(u"Category not in utf8, ignored. error cnt %d:\t%s" % (error_cnt,e))
|
||||
print(line)
|
||||
print("Excluding categories:",)
|
||||
print(str(options.filter_category_exclude))
|
||||
@ -3283,6 +3284,7 @@ def createLogger(quiet, debug, log_file):
|
||||
logger.setLevel(logging.INFO)
|
||||
if debug:
|
||||
logger.setLevel(logging.DEBUG)
|
||||
print (log_file)
|
||||
if log_file:
|
||||
fileHandler = logging.FileHandler(log_file)
|
||||
logger.addHandler(fileHandler)
|
||||
|
Loading…
Reference in New Issue
Block a user