messy 1st approach

commit 5d32701400 (parent 694cd5a7f4)
Author: Gordon Mohr
Date: 2015-06-17 18:49:26 -07:00


@@ -406,8 +406,8 @@ class Extractor(object):
         self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
         text = clean(self, text)
         footer = "\n</doc>\n"
-        if out != sys.stdout:
-            out.reserve(len(header) + len(text) + len(footer))
+        ### if out != sys.stdout:
+        ###     out.reserve(len(header) + len(text) + len(footer))
         out.write(header)
         for line in compact(text):
             out.write(line.encode('utf-8'))
@@ -1982,8 +1982,8 @@ def compact(text):
         if not line:
             continue
         # Handle section titles
-        m = section.match(line)
-        if m:
+        ### m = section.match(line)
+        if False: ### m:
             title = m.group(2)
             lev = len(m.group(1))
             if Extractor.toHTML:
@@ -2198,7 +2198,7 @@ def load_templates(file, output_file=None):
         if articles % 10000 == 0:
             logging.info("Preprocessed %d pages", articles)

-def process_dump(input_file, template_file, outdir, file_size, file_compress, threads):
+def process_dump(input, template_file, outdir, file_size, file_compress, threads):
     """
     :param input_file: name of the wikipedia dump file.
     :param template_file: optional file with template definitions.
@@ -2211,13 +2211,6 @@ def process_dump(input_file, template_file, outdir, file_size, file_compress, threads):
     global templateNamespace
     global expand_templates

-    if input_file.lower().endswith("bz2"):
-        opener = bz2.BZ2File
-    else:
-        opener = open
-
-    input = opener(input_file)
-
     # collect siteinfo
     for line in input:
         line = line.decode('utf-8')
@@ -2250,21 +2243,27 @@ def process_dump(input_file, template_file, outdir, file_size, file_compress, threads):
-    input = opener(input_file)
-
     # process pages
-    logging.info("Starting processing pages from %s.", input_file)
+    ### logging.info("Starting processing pages from %s.", input_file)
+    logging.info("Starting processing pages from %s.", 'input')

     # initialize jobs queue
     #threads = multiprocessing.cpu_count()
     logging.info("Using %d CPUs.", threads)
-    queue = Queue.Queue(maxsize=2 * threads)
+    ### queue = Queue.Queue(maxsize=2 * threads)
+    queue = multiprocessing.JoinableQueue(maxsize=10 * threads)

     lock = threading.Lock() # for protecting shared state.
-    nextFile = NextFile(lock, outdir)
+    ### nextFile = NextFile(lock, outdir)

     # start worker threads
     workers = []
     for _ in xrange(max(1, threads - 1)): # keep one for master
-        output_splitter = OutputSplitter(nextFile, file_size, file_compress)
-        extractor = ExtractorThread(queue, output_splitter)
+        ### output_splitter = OutputSplitter(nextFile, file_size, file_compress)
+        ### extractor = ExtractorThread(queue, output_splitter)
+        fname = outdir +'/'+ str(_)
+        extractor = multiprocessing.Process(target=worker_process, args=(queue, fname))
+        extractor.daemon = False # ensure worker process gets to finish
+        extractor.start()
         workers.append(extractor)

     # we collect individual lines, since str.join() is significantly faster than
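
The hunk above replaces the thread pool with one multiprocessing.Process per worker, each writing its own output file, coordinated only through a shared JoinableQueue. A minimal, self-contained sketch of that fan-out pattern (the names worker and out-%d.txt are illustrative, not from the commit):

import multiprocessing

def worker(queue, fname):
    # drain jobs until the None sentinel arrives
    out = open(fname, 'w')
    while True:
        job = queue.get()
        if job is None:
            break
        out.write('%s\n' % (job,))
        queue.task_done()        # ack each completed job
    out.close()
    queue.task_done()            # ack the sentinel only after the file is closed

if __name__ == '__main__':
    queue = multiprocessing.JoinableQueue(maxsize=10)
    workers = []
    for i in range(3):
        p = multiprocessing.Process(target=worker, args=(queue, 'out-%d.txt' % i))
        p.daemon = False         # non-daemonic, so each worker finishes its file
        p.start()
        workers.append(p)
    for job in range(20):
        queue.put(job, True)     # block while the queue is full (backpressure)
    for _ in workers:
        queue.put(None)          # one sentinel per worker
    queue.join()                 # returns once every put() has been task_done()

Because each worker owns a distinct file, no lock or OutputSplitter coordination is needed between them, which is why nextFile and the lock drop out of the diff above.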
@@ -2308,28 +2307,47 @@ def process_dump(input_file, template_file, outdir, file_size, file_compress, threads):
             colon = title.find(':')
             if (colon < 0 or title[:colon] in acceptedNamespaces) and \
                     not redirect and not title.startswith(templateNamespace):
-                queue.put(Extractor(id, title, page), True) # block if full
+                ### queue.put(Extractor(id, title, page), True) # block if full
+                item = (id, title, page)
+                ### print(id)
+                queue.put(item, True) # block if full
             id = None
             page = []

     for _ in xrange(max(1, threads - 1)):
         queue.put(None) # let each thread finish
     # wait for empty queue
     queue.join()

     input.close()

#----------------------------------------------------------------------
# Multithread version

-class ExtractorThread(threading.Thread):
+def worker_process(queue, fname):
+    output = bz2.BZ2File(fname + '.bz2', 'w')
+    while True:
+        job = queue.get()
+        if job:
+            Extractor(*job).extract(output)
+            queue.task_done() # notify of previous job done
+        else:
+            break
+    output.close()
+    queue.task_done() # notify of final job done only after file close
+
+###class ExtractorThread(threading.Thread):
+class ExtractorThread(multiprocessing.Process):
     """
     Extractor thread.
     """
     def __init__(self, queue, splitter):
         self._queue = queue
         self._splitter = splitter
-        threading.Thread.__init__(self)
-        self.setDaemon(True) # let the process die when main thread is killed
+        #### threading.Thread.__init__(self)
+        multiprocessing.Process.__init__(self)
+        self.daemon = True # let the process die when main thread is killed
         self.start()

     def run(self):
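
For reference, the converted ExtractorThread class keeps the threading shape. A minimal sketch of that subclass pattern (QueueWorker is a hypothetical name), showing the two API differences the diff touches, the base-class __init__ call and the .daemon attribute replacing setDaemon():

import multiprocessing

class QueueWorker(multiprocessing.Process):
    def __init__(self, queue):
        multiprocessing.Process.__init__(self)  # must run before touching .daemon
        self._queue = queue
        self.daemon = True   # Process exposes a .daemon attribute, not setDaemon()
        self.start()         # run() begins executing in the child process

    def run(self):
        # runs in a separate process with its own memory space
        while True:
            job = self._queue.get()
            if job is None:
                break
            print(job)

Note also the ordering in worker_process above: the sentinel's task_done() is issued only after output.close(), so the parent's queue.join() cannot return until every worker has flushed and closed its bz2 file.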
@@ -2450,8 +2468,17 @@ def main():
             logging.error('Could not create: %s', output_dir)
             return

-    process_dump(input_file, args.templates, output_dir, file_size,
+    if input_file == '-':
+        input = sys.stdin
+    elif input_file.lower().endswith("bz2"):
+        input = bz2.BZ2File(input_file)
+    else:
+        input = open(input_file)
+
+    process_dump(input, args.templates, output_dir, file_size,
                  args.compress, args.threads)
+    input.close()

if __name__ == '__main__':
    main()
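
The net effect on main(): the dump stream is now opened by the caller, so process_dump() can consume stdin, a bz2 file, or plain XML alike, and no longer needs the opener logic removed in the earlier hunk. A hypothetical helper capturing that dispatch (open_dump is not in the commit; main() does this inline):

import bz2
import sys

def open_dump(path):
    # '-' means read an already-decompressed dump from stdin,
    # e.g.  bzcat dump.xml.bz2 | WikiExtractor.py -
    if path == '-':
        return sys.stdin
    if path.lower().endswith('bz2'):
        return bz2.BZ2File(path)   # decompress on the fly
    return open(path)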