Print a friendlier error message (with a running error count) for category lines that are not valid UTF-8

This commit is contained in:
Zhiwei Chen 2017-04-28 14:57:54 -04:00
parent cc04dae71c
commit 5274829e16

View File

@ -81,7 +81,7 @@ if PY2:
range = xrange # Use Python 3 equivalent
chr = unichr # Use Python 3 equivalent
text_type = unicode
class SimpleNamespace(object):
def __init__ (self, **kwargs):
self.__dict__.update(kwargs)
@ -138,11 +138,11 @@ options = SimpleNamespace(
##
# Filter disambiguation pages
filter_disambig_pages = False,
##
# Whether to keep tables in the article (False drops them)
keep_tables = False,
##
# Whether to preserve links in output
keepLinks = False,
@ -162,7 +162,7 @@ options = SimpleNamespace(
##
# Whether to write json instead of the xml-like default output format
write_json = False,
##
# Whether to expand templates
expand_templates = True,
@ -178,16 +178,16 @@ options = SimpleNamespace(
##
# Minimum expanded text length required to print document
min_text_length = 0,
# Shared objects holding templates, redirects and cache
templates = {},
redirects = {},
# cache of parser templates
# FIXME: sharing this with a Manager slows down.
templateCache = {},
# Elements to ignore/discard
ignored_tag_patterns = [],
filter_category_include = set(),
filter_category_exclude = set(),
@ -599,7 +599,7 @@ class Extractor(object):
:param out: a memory file.
"""
logging.info('%s\t%s', self.id, self.title)
# Separate header from text with a newline.
if options.toHTML:
title_str = '<h1>' + self.title + '</h1>'
@ -647,12 +647,12 @@ class Extractor(object):
text = self.wiki2text(text)
text = compact(self.clean(text))
text = [title_str] + text
if sum(len(line) for line in text) < options.min_text_length:
return
self.write_output(out, text)
errs = (self.template_title_errs,
self.recursion_exceeded_1_errs,
self.recursion_exceeded_2_errs,
@ -3016,8 +3016,8 @@ def extract_process(opts, i, jobs_queue, output_queue):
createLogger(options.quiet, options.debug, options.log_file)
out = StringIO() # memory buffer
while True:
job = jobs_queue.get() # job is (id, title, page, page_num)
if job:
@ -3054,9 +3054,9 @@ def reduce_process(opts, output_queue, spool_length,
global options
options = opts
createLogger(options.quiet, options.debug, options.log_file)
if out_file:
nextFile = NextFile(out_file)
output = OutputSplitter(nextFile, file_size, file_compress)
@ -3256,6 +3256,7 @@ def main():
filter_category = args.filter_category
if (filter_category != None and len(filter_category)>0):
with open(filter_category) as f:
error_cnt = 0
for line in f.readlines():
try:
line = str(line.strip())
@ -3266,8 +3267,8 @@ def main():
else:
options.filter_category_include.add(line)
except Exception as e:
print(e)
print(u"Category not in utf8, ignored:\t")
error_cnt += 1
print(u"Category not in utf8, ignored. error cnt %d:\t%s" % (error_cnt,e))
print(line)
print("Excluding categories:",)
print(str(options.filter_category_exclude))
@ -3283,6 +3284,7 @@ def createLogger(quiet, debug, log_file):
logger.setLevel(logging.INFO)
if debug:
logger.setLevel(logging.DEBUG)
print (log_file)
if log_file:
fileHandler = logging.FileHandler(log_file)
logger.addHandler(fileHandler)