Handle broken pipe and keyboard interrupts

Versus Void 2017-08-15 22:06:20 +00:00
parent 2a5e6aebc0
commit 537ab9b4ef


@@ -67,8 +67,9 @@ import os.path
import re  # TODO: use regex when it becomes standard
import time
import json
import queue
from io import StringIO
from multiprocessing import Queue, Process, Value, cpu_count
from multiprocessing import Event, Queue, Process, Value, cpu_count
from timeit import default_timer
@@ -2826,6 +2827,12 @@ def pages_from(input):
title = None
page = []
def try_put_until(event, q, value):
while not event.is_set():
try:
return q.put(value, True, 1)  # block for at most 1s per attempt, then re-check the event
except queue.Full:
pass
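
The helper above is what lets the mapper and the extractors give up on a full queue once the shared event is set, instead of blocking forever in Queue.put after the consumer has died. A minimal standalone sketch of the same pattern (all names below are illustrative, not part of the patch):

import queue
from multiprocessing import Event, Process, Queue

def try_put_until(event, q, value):
    # keep retrying until the item fits or the event tells us to give up
    while not event.is_set():
        try:
            return q.put(value, True, 1)   # block at most 1s per attempt
        except queue.Full:
            pass

def consumer(q, stop):
    # read two items, then pretend the downstream side has gone away
    for _ in range(2):
        print('got', q.get())
    stop.set()

if __name__ == '__main__':
    stop = Event()
    q = Queue(1)                           # tiny queue so the producer really blocks
    c = Process(target=consumer, args=(q, stop))
    c.start()
    for i in range(100):
        try_put_until(stop, q, i)          # returns promptly once stop is set
        if stop.is_set():
            break
    c.join()
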
def process_dump(input_file, template_file, out_file, file_size, file_compress,
process_count):
@@ -2916,10 +2923,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
max_spool_length = 10000
spool_length = Value('i', 0, lock=False)
broken_pipe_event = Event()
# reduce job that sorts and prints output
reduce = Process(target=reduce_process,
args=(options, output_queue, spool_length,
out_file, file_size, file_compress))
args=(options, output_queue, spool_length, out_file,
file_size, file_compress, broken_pipe_event))
reduce.start()
# initialize jobs queue
@@ -2930,14 +2939,17 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
workers = []
for i in range(worker_count):
extractor = Process(target=extract_process,
args=(options, i, jobs_queue, output_queue))
args=(options, i, jobs_queue, output_queue, broken_pipe_event))
extractor.daemon = True # only live while parent process lives
extractor.start()
workers.append(extractor)
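
Marking the extractors as daemons means they are terminated automatically when the parent exits, so an interrupted or crashed mapper cannot leave orphaned workers behind. A tiny sketch of that behaviour (illustrative names only):

import time
from multiprocessing import Process

def worker():
    while True:        # would run forever if left on its own
        time.sleep(1)

if __name__ == '__main__':
    p = Process(target=worker)
    p.daemon = True    # only live while parent process lives
    p.start()
    time.sleep(2)      # parent exits here; the daemon worker is killed with it
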
# Mapper process
try:
page_num = 0
for page_data in pages_from(input):
if broken_pipe_event.is_set():
break
id, revid, title, ns, page = page_data
if keepPage(ns, page):
# slow down
@@ -2950,22 +2962,24 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
if delay:
logging.info('Delay %ds', delay)
job = (id, revid, title, page, page_num)
jobs_queue.put(job) # goes to any available extract_process
# TODO if pipe is closed
try_put_until(broken_pipe_event, jobs_queue, job) # goes to any available extract_process
page_num += 1
page = None # free memory
input.close()
# signal termination
for _ in workers:
jobs_queue.put(None)
except KeyboardInterrupt:
logging.warning("Exiting due to interrupt")
# wait for workers to terminate
for w in workers:
w.join()
# signal end of work to reduce process
if reduce.is_alive():
# signal end of work to reduce process
output_queue.put(None)
# wait for it to finish
# wait for reduce process to finish
reduce.join()
extract_duration = default_timer() - extract_start
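
The shutdown sequence above is a standard poison-pill pattern: one None per extractor so every worker sees a sentinel, then a single None for the reducer once all workers have joined. A minimal sketch of that pattern in isolation (names are illustrative):

from multiprocessing import Process, Queue

def worker(jobs, results):
    while True:
        job = jobs.get()
        if job is None:              # poison pill: stop cleanly
            break
        results.put(job * job)

def reducer(results):
    while True:
        item = results.get()
        if item is None:             # end-of-work signal from the parent
            break
        print(item)

if __name__ == '__main__':
    jobs, results = Queue(), Queue()
    workers = [Process(target=worker, args=(jobs, results)) for _ in range(4)]
    for w in workers:
        w.start()
    red = Process(target=reducer, args=(results,))
    red.start()
    for n in range(10):
        jobs.put(n)
    for _ in workers:                # one sentinel per worker
        jobs.put(None)
    for w in workers:
        w.join()
    results.put(None)                # then tell the reducer no more output is coming
    red.join()
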
@@ -2978,7 +2992,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
# Multiprocess support
def extract_process(opts, i, jobs_queue, output_queue):
def extract_process(opts, i, jobs_queue, output_queue, broken_pipe_event):
"""Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
:param i: process id.
:param jobs_queue: where to get jobs.
@@ -2992,8 +3006,8 @@ def extract_process(opts, i, jobs_queue, output_queue):
out = StringIO() # memory buffer
while True:
try:
while not broken_pipe_event.is_set():
job = jobs_queue.get()  # job is (id, revid, title, page, page_num)
if job:
id, revid, title, page, page_num = job
@@ -3002,22 +3016,29 @@ def extract_process(opts, i, jobs_queue, output_queue):
page = None # free memory
e.extract(out)
text = out.getvalue()
except:
except Exception:
text = ''
logging.exception('Processing page: %s %s', id, title)
output_queue.put((page_num, text))
try_put_until(broken_pipe_event, output_queue, (page_num, text))
out.truncate(0)
out.seek(0)
else:
logging.debug('Quit extractor')
break
except KeyboardInterrupt:
logging.info('Aborting worker %d', i)
output_queue.cancel_join_thread()
jobs_queue.cancel_join_thread()
if broken_pipe_event.is_set():
output_queue.cancel_join_thread()
out.close()
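
cancel_join_thread() is what keeps an aborted worker from hanging on exit: a process that has put data on a multiprocessing.Queue normally waits for its feeder thread to flush that data, and after a broken pipe or interrupt nobody will ever read it. A small sketch of the effect (illustrative; the large payload just forces the feeder thread to block):

from multiprocessing import Process, Queue

def child(q):
    q.put('x' * 10_000_000)    # big item, stays buffered because nobody reads it
    q.cancel_join_thread()     # exit without waiting for the buffer to be drained
    # without the call above this child would hang at exit, and p.join() with it

if __name__ == '__main__':
    q = Queue()
    p = Process(target=child, args=(q,))
    p.start()
    p.join()                   # returns promptly because the flush was cancelled
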
report_period = 10000 # progress report period
def reduce_process(opts, output_queue, spool_length,
out_file=None, file_size=0, file_compress=True):
def reduce_process(opts, output_queue, spool_length, out_file,
file_size, file_compress, broken_pipe_event):
"""Pull finished article text, write series of files (or stdout)
:param opts: global parameters.
:param output_queue: text to be output.
@@ -3044,9 +3065,17 @@ def reduce_process(opts, output_queue, spool_length,
# FIXME: use a heap
spool = {} # collected pages
next_page = 0 # sequence numbering of page
try:
while True:
if next_page in spool:
try:
output.write(spool.pop(next_page).encode('utf-8'))
except BrokenPipeError:
# the other side of the pipe (e.g. `head` or `grep`) has been closed;
# we can simply exit
broken_pipe_event.set()
break
next_page += 1
# tell mapper our load:
spool_length.value = len(spool)
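
The spool/next_page pair is a simple reordering buffer: extractors finish pages in arbitrary order, so the reducer parks each (page_num, text) result in a dict and only writes the entry whose number comes next. The same idea in a few lines (illustrative names):

def drain_in_order(results):
    spool = {}                     # out-of-order results, keyed by sequence number
    next_seq = 0
    for seq, text in results:
        spool[seq] = text
        while next_seq in spool:   # emit everything that is now contiguous
            print(spool.pop(next_seq))
            next_seq += 1

drain_in_order([(1, 'b'), (0, 'a'), (3, 'd'), (2, 'c')])   # prints a, b, c, d
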
@@ -3070,7 +3099,10 @@ def reduce_process(opts, output_queue, spool_length,
if len(spool) > 200:
logging.debug('Collected %d, waiting: %d, %d', len(spool),
next_page, next_page == page_num)
if output != sys.stdout:
except KeyboardInterrupt:
pass
if output != sys.stdout and not broken_pipe_event.is_set():
output.close()
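
Catching BrokenPipeError around the write is the core of the patch: when output goes to stdout and the reader (for example `head`) exits early, the reducer sets the shared event and stops instead of crashing. A standalone sketch of the same situation, using the handling suggested in the Python documentation for BrokenPipeError, runnable as `python3 sketch.py | head -n 3` (the script name and messages are made up):

import os
import sys

def produce():
    for i in range(1_000_000):
        print('line', i)

if __name__ == '__main__':
    try:
        produce()
        sys.stdout.flush()
    except BrokenPipeError:
        # the reading side of the pipe closed early (e.g. head got its 3 lines);
        # point stdout at devnull so the interpreter's final flush cannot raise again
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
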