From 6d0577ef10e48959de709ca05cfa3c2b195f4087 Mon Sep 17 00:00:00 2001
From: attardi <attardi@di.unipi.it>
Date: Mon, 15 Feb 2016 01:22:38 +0100
Subject: [PATCH] See ChangeLog.

---
 ChangeLog        |  16 ++
 WikiExtractor.py | 484 +++++++++++++++++++++++------------------------
 extractPage.py   |  33 +++-
 3 files changed, 278 insertions(+), 255 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 80d0d3c..1893682 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2016-02-15  Giuseppe Attardi  <attardi@di.unipi.it>
+
+	* WikiExtractor.py (Extractor.clean): turned into a method.
+
+2016-02-14  Giuseppe Attardi  <attardi@di.unipi.it>
+
+	* WikiExtractor.py (load_templates): also save definitions in the
+	templates dictionary.
+	(pages_from): common iterator for extracting pages from the dump,
+	used for analyzing pages, templates and single articles alike.
+
+2016-02-13  Giuseppe Attardi  <attardi@di.unipi.it>
+
+	* WikiExtractor.py (reduce_process): close the output file here.
+
+	* extractPage.py (process_data): allow a range of ids.
+
 2016-02-12  Giuseppe Attardi  <attardi@di.unipi.it>
 
 	* WikiExtractor.py (reduce_process): moved here creation of OutputSplitter.
diff --git a/WikiExtractor.py b/WikiExtractor.py
index 7b8fc13..d9427ba 100755
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # =============================================================================
-#  Version: 2.48 (February 12, 2016)
+#  Version: 2.49 (February 14, 2016)
 #  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
 #
 #  Contributors:
@@ -46,6 +46,7 @@
 This version performs template expansion by preprocessing the whole dump and
 collecting template definitions.
 """
+import sys
 import argparse
 import bz2
 import codecs
@@ -54,19 +55,18 @@
 import fileinput
 import logging
 import os.path
 import re  # TODO use regex when it will be standard
-import sys
 import time
 import urllib
 from cStringIO import StringIO
 from htmlentitydefs import name2codepoint
 from itertools import izip, izip_longest
-from multiprocessing import Queue, Process, cpu_count
+from multiprocessing import Queue, Process, Array, cpu_count
 from timeit import default_timer
 
 
 # ===========================================================================
 
 # Program version
-version = '2.48'
+version = '2.49'
 
 ## PARAMS ####################################################################
 
@@ -394,7 +394,6 @@ class TemplateArg(object):
 
 substWords = 'subst:|safesubst:'
 
-
 class Extractor(object):
     """
     An extraction task on an article.
@@ -415,13 +414,20 @@ class Extractor(object):
     # Whether to output HTML instead of text
     toHTML = False
 
-    def __init__(self, id, title, page):
+    ##
+    # Whether to expand templates
+    expand_templates = True
+
+
+    def __init__(self, id, title, lines):
         """
-        :param page: a list of lines.
+        :param id: id of page.
+        :param title: title of page.
+        :param lines: a list of lines.
         """
         self.id = id
         self.title = title
-        self.page = page
+        self.text = ''.join(lines)
         self.magicWords = MagicWords()
         self.frame = []
         self.recursion_exceeded_1_errs = 0  # template recursion within expandTemplates()
@@ -433,8 +439,7 @@ class Extractor(object):
         """
         :param out: a memory file.
         """
-        logging.debug("%s\t%s", self.id, self.title)
-        text = ''.join(self.page)
+        logging.info("%s\t%s", self.id, self.title)
         url = get_url(self.id)
         header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
         # Separate header from text with a newline.
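
Each article is thus emitted as a single <doc> element between the header and footer built above; with a hypothetical id, url and title, the output of extract() has this shape:

    <doc id="12" url="http://en.wikipedia.org/wiki?curid=12" title="Anarchism">
    ...cleaned article text, one paragraph per line...
    </doc>
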
@@ -447,7 +452,7 @@ class Extractor(object):
         self.magicWords['currentday'] = time.strftime('%d')
         self.magicWords['currenthour'] = time.strftime('%H')
         self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
-        text = clean(self, text)
+        text = self.clean()
         footer = "\n</doc>\n"
         out.write(header)
         for line in compact(text):
@@ -462,6 +467,112 @@ class Extractor(object):
             logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)",
                          self.title, self.id, *errs)
 
+    def clean(self):
+        """
+        Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped.
+        @see https://www.mediawiki.org/wiki/Help:Formatting
+        """
+        text = self.text
+        self.text = ''  # save memory
+        if Extractor.expand_templates:
+            # expand templates
+            # See: http://www.mediawiki.org/wiki/Help:Templates
+            text = self.expandTemplates(text)
+        else:
+            # Drop transclusions (template, parser functions)
+            text = dropNested(text, r'{{', r'}}')
+
+        # Drop tables
+        text = dropNested(text, r'{\|', r'\|}')
+
+        # replace external links
+        text = replaceExternalLinks(text)
+
+        # replace internal links
+        text = replaceInternalLinks(text)
+
+        # drop MagicWords behavioral switches
+        text = magicWordsRE.sub('', text)
+
+        # ############### Process HTML ###############
+
+        # turn into HTML, except for the content of <syntaxhighlight>
+        res = ''
+        cur = 0
+        for m in syntaxhighlight.finditer(text):
+            end = m.end()
+            res += unescape(text[cur:m.start()]) + m.group(1)
+            cur = end
+        text = res + unescape(text[cur:])
+
+        # Handle bold/italic/quote
+        if self.toHTML:
+            text = bold_italic.sub(r'<b>\1</b>', text)
+            text = bold.sub(r'<b>\1</b>', text)
+            text = italic.sub(r'<i>\1</i>', text)
+        else:
+            text = bold_italic.sub(r'\1', text)
+            text = bold.sub(r'\1', text)
+            text = italic_quote.sub(r'"\1"', text)
+            text = italic.sub(r'"\1"', text)
+            text = quote_quote.sub(r'"\1"', text)
+        # residuals of unbalanced quotes
+        text = text.replace("'''", '').replace("''", '"')
+
+        # Collect spans
+
+        spans = []
+        # Drop HTML comments
+        for m in comment.finditer(text):
+            spans.append((m.start(), m.end()))
+
+        # Drop self-closing tags
+        for pattern in selfClosing_tag_patterns:
+            for m in pattern.finditer(text):
+                spans.append((m.start(), m.end()))
+
+        # Drop ignored tags
+        for left, right in ignored_tag_patterns:
+            for m in left.finditer(text):
+                spans.append((m.start(), m.end()))
+            for m in right.finditer(text):
+                spans.append((m.start(), m.end()))
+
+        # Bulk remove all spans
+        text = dropSpans(spans, text)
+
+        # Drop discarded elements
+        for tag in discardElements:
+            text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
+
+        if not self.toHTML:
+            # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
+            text = unescape(text)
+
+        # Expand placeholders
+        for pattern, placeholder in placeholder_tag_patterns:
+            index = 1
+            for match in pattern.finditer(text):
+                text = text.replace(match.group(), '%s_%d' % (placeholder, index))
+                index += 1
+
+        text = text.replace('<<', u'«').replace('>>', u'»')
+
+        #############################################
+
+        # Cleanup text
+        text = text.replace('\t', ' ')
+        text = spaces.sub(' ', text)
+        text = dots.sub('...', text)
+        text = re.sub(u' (,:\.\)\]»)', r'\1', text)
+        text = re.sub(u'(\[\(«) ', r'\1', text)
+        text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
+        text = text.replace(',,', ',').replace(',.', '.')
+        if escape_doc:
+            text = cgi.escape(text)
+        return text
+
 
 # ----------------------------------------------------------------------
 # Expand templates
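
A minimal sketch of the new method in use (assumes WikiExtractor.py is importable and its other module globals keep their defaults; the fragment and id are invented):

    import WikiExtractor
    from WikiExtractor import Extractor

    WikiExtractor.escape_doc = False      # normally set by main()
    Extractor.expand_templates = False    # skip the template-expansion pass
    e = Extractor('1', u'Test',
                  [u"'''Anarchism''' is a [[political philosophy]]<!--x-->{{fact}}.\n"])
    print e.clean()    # -> Anarchism is a political philosophy.

With expansion disabled, dropNested() removes the {{fact}} transclusion, the HTML comment span is dropped, the internal link is replaced by its anchor text, and the bold quotes are stripped.
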
@@ -1993,114 +2104,6 @@ tailRE = re.compile('\w+')
 
 syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL)
 
-expand_templates = True
-
-
-def clean(extractor, text):
-    """
-    Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped.
-    @see https://www.mediawiki.org/wiki/Help:Formatting
-    """
-
-    if expand_templates:
-        # expand templates
-        # See: http://www.mediawiki.org/wiki/Help:Templates
-        text = extractor.expandTemplates(text)
-    else:
-        # Drop transclusions (template, parser functions)
-        text = dropNested(text, r'{{', r'}}')
-
-    # Drop tables
-    text = dropNested(text, r'{\|', r'\|}')
-
-    # replace external links
-    text = replaceExternalLinks(text)
-
-    # replace internal links
-    text = replaceInternalLinks(text)
-
-    # drop MagicWords behavioral switches
-    text = magicWordsRE.sub('', text)
-
-    # ############### Process HTML ###############
-
-    # turn into HTML, except for the content of <syntaxhighlight>
-    res = ''
-    cur = 0
-    for m in syntaxhighlight.finditer(text):
-        end = m.end()
-        res += unescape(text[cur:m.start()]) + m.group(1)
-        cur = end
-    text = res + unescape(text[cur:])
-
-    # Handle bold/italic/quote
-    if extractor.toHTML:
-        text = bold_italic.sub(r'<b>\1</b>', text)
-        text = bold.sub(r'<b>\1</b>', text)
-        text = italic.sub(r'<i>\1</i>', text)
-    else:
-        text = bold_italic.sub(r'\1', text)
-        text = bold.sub(r'\1', text)
-        text = italic_quote.sub(r'"\1"', text)
-        text = italic.sub(r'"\1"', text)
-        text = quote_quote.sub(r'"\1"', text)
-    # residuals of unbalanced quotes
-    text = text.replace("'''", '').replace("''", '"')
-
-    # Collect spans
-
-    spans = []
-    # Drop HTML comments
-    for m in comment.finditer(text):
-        spans.append((m.start(), m.end()))
-
-    # Drop self-closing tags
-    for pattern in selfClosing_tag_patterns:
-        for m in pattern.finditer(text):
-            spans.append((m.start(), m.end()))
-
-    # Drop ignored tags
-    for left, right in ignored_tag_patterns:
-        for m in left.finditer(text):
-            spans.append((m.start(), m.end()))
-        for m in right.finditer(text):
-            spans.append((m.start(), m.end()))
-
-    # Bulk remove all spans
-    text = dropSpans(spans, text)
-
-    # Drop discarded elements
-    for tag in discardElements:
-        text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
-
-    if not extractor.toHTML:
-        # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
-        text = unescape(text)
-
-    # Expand placeholders
-    for pattern, placeholder in placeholder_tag_patterns:
-        index = 1
-        for match in pattern.finditer(text):
-            text = text.replace(match.group(), '%s_%d' % (placeholder, index))
-            index += 1
-
-    text = text.replace('<<', u'«').replace('>>', u'»')
-
-    #############################################
-
-    # Cleanup text
-    text = text.replace('\t', ' ')
-    text = spaces.sub(' ', text)
-    text = dots.sub('...', text)
-    text = re.sub(u' (,:\.\)\]»)', r'\1', text)
-    text = re.sub(u'(\[\(«) ', r'\1', text)
-    text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
-    text = text.replace(',,', ',').replace(',.', '.')
-    if escape_doc:
-        text = cgi.escape(text)
-    return text
-
-
 # skip level 1, it is page name level
 section = re.compile(r'(==+)\s*(.*?)\s*\1')
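
The heading pattern above relies on the backreference \1 to make the closing run of '=' match the opening one, and '==+' deliberately skips level-1 headings (the page title). A standalone check:

    import re
    section = re.compile(r'(==+)\s*(.*?)\s*\1')
    m = section.match('=== History ===')
    print m.group(1), m.group(2)    # === History
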
@@ -2308,13 +2311,57 @@ def load_templates(file, output_file=None):
     templatePrefix = templateNamespace + ':'
     global moduleNamespace, modulePrefix
     modulePrefix = moduleNamespace + ':'
-    articles = 0
-    page = []
-    ns = '0'
-    inText = False
     if output_file:
         output = codecs.open(output_file, 'wb', 'utf-8')
-    for line in file:
+    for page_count, page_data in enumerate(pages_from(file)):
+        id, title, ns, page = page_data
+        if not output_file and (not templateNamespace or
+                                not moduleNamespace):  # do not know it yet
+            # reconstruct templateNamespace and moduleNamespace from the first title
+            if ns in templateKeys:
+                colon = title.find(':')
+                if colon > 1:
+                    if ns == '10':
+                        templateNamespace = title[:colon]
+                        templatePrefix = title[:colon + 1]
+                    elif ns == '828':
+                        moduleNamespace = title[:colon]
+                        modulePrefix = title[:colon + 1]
+        if ns in templateKeys:
+            text = ''.join(page)
+            define_template(title, text)
+            # save templates and modules to file
+            if output_file:
+                output.write('<page>\n')
+                output.write('   <title>%s</title>\n' % title)
+                output.write('   <ns>%s</ns>\n' % ns)
+                output.write('   <id>%s</id>\n' % id)
+                output.write('   <text>')
+                for line in page:
+                    output.write(line)
+                output.write('   </text>\n')
+                output.write('</page>\n')
+        if page_count and page_count % 100000 == 0:
+            logging.info("Preprocessed %d pages", page_count)
+    if output_file:
+        output.close()
+        logging.info("Saved %d templates to '%s'", len(templates), output_file)
+
+
+def pages_from(input):
+    """
+    Scans input extracting pages.
+    :return: (id, title, namespace, page), where page is a list of lines.
+    """
+    # we collect individual lines, since str.join() is significantly faster
+    # than concatenation
+    page = []
+    id = None
+    ns = '0'
+    last_id = None
+    inText = False
+    redirect = False
+    for line in input:
         line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if inText:
@@ -2326,10 +2373,15 @@ def load_templates(file, output_file=None):
             tag = m.group(2)
             if tag == 'page':
                 page = []
+                redirect = False
+            elif tag == 'id' and not id:
+                id = m.group(3)
             elif tag == 'title':
                 title = m.group(3)
             elif tag == 'ns':
                 ns = m.group(3)
+            elif tag == 'redirect':
+                redirect = True
             elif tag == 'text':
                 inText = True
                 line = line[m.start(3):m.end(3)]
@@ -2343,37 +2395,12 @@ def load_templates(file, output_file=None):
         elif inText:
             page.append(line)
         elif tag == '/page':
-            if not output_file and not templateNamespace:  # do not know it yet
-                # reconstruct templateNamespace and moduleNamespace from the first title
-                if ns in templateKeys:
-                    colon = title.find(':')
-                    if colon > 1:
-                        if ns == '10':
-                            templateNamespace = title[:colon]
-                            templatePrefix = title[:colon + 1]
-                        elif ns == '828':
-                            moduleNamespace = title[:colon]
-                            modulePrefix = title[:colon + 1]
-            if ns in templateKeys:
-                define_template(title, page)
-                # save templates and modules to file
-                if output_file:
-                    output.write('<page>\n')
-                    output.write('   <title>%s</title>\n' % title)
-                    output.write('   <ns>%s</ns>\n' % ns)
-                    output.write('   <text>')
-                    for line in page:
-                        output.write(line)
-                    output.write('   </text>\n')
-                    output.write('</page>\n')
+            if id != last_id and not redirect:
+                yield (id, title, ns, page)
+                last_id = id
+                ns = '0'
+            id = None
             page = []
-            ns = '0'
-            articles += 1
-            if articles % 100000 == 0:
-                logging.info("Preprocessed %d pages", articles)
-    if output_file:
-        output.close()
-        logging.info("Saved %d templates to '%s'", len(templates), output_file)
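
A sketch of driving the new pages_from() iterator directly (hypothetical dump filename; assumes WikiExtractor.py is importable):

    import bz2
    from WikiExtractor import pages_from

    input = bz2.BZ2File('enwiki-pages-articles.xml.bz2')
    for id, title, ns, page in pages_from(input):
        if ns == '0':                        # articles only; templates are ns 10
            print id, title.encode('utf-8'), len(page)
    input.close()

Redirects and repeated ids are already filtered inside the generator, so each page is yielded at most once.
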
 
 
 def process_dump(input_file, template_file, out_file, file_size, file_compress,
@@ -2419,7 +2446,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
         elif tag == '/siteinfo':
             break
 
-    if expand_templates:
+    if Extractor.expand_templates:
         # preprocess
         template_load_start = default_timer()
         if template_file and os.path.exists(template_file):
@@ -2452,17 +2479,22 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     if out_file == '-':
         out_file = None
 
-    # Reduce job that sorts and prints output
-    reduce = Process(target=reduce_process,
-                     args=(output_queue, out_file, file_size, file_compress))
+
+    worker_count = max(1, process_count)
+
+    # reduce job that sorts and prints output
+    reduce = Process(target=reduce_process,
+                     args=(output_queue,
+                           out_file, file_size, file_compress))
     reduce.start()
 
     # initialize jobs queue
     jobs_queue = Queue(maxsize=maxsize)
 
     # start worker processes
-    logging.info("Using %d extract processes.", process_count)
+    logging.info("Using %d extract processes.", worker_count)
     workers = []
-    for _ in xrange(max(1, process_count)):
+    for i in xrange(worker_count):
         extractor = Process(target=extract_process,
                             args=(jobs_queue, output_queue))
         extractor.daemon = True  # only live while parent process lives
@@ -2470,59 +2502,14 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
         workers.append(extractor)
 
     # Mapper process
-
-    # we collect individual lines, since str.join() is significantly faster
-    # than concatenation
-    page = []
-    id = None
-    ns = '0'
-    last_id = None
-    ordinal = 0  # page count
-    inText = False
-    redirect = False
-    for line in input:
-        line = line.decode('utf-8')
-        if '<' not in line:  # faster than doing re.search()
-            if inText:
-                page.append(line)
-            continue
-        m = tagRE.search(line)
-        if not m:
-            continue
-        tag = m.group(2)
-        if tag == 'page':
-            page = []
-            redirect = False
-        elif tag == 'id' and not id:
-            id = m.group(3)
-        elif tag == 'title':
-            title = m.group(3)
-        elif tag == 'ns':
-            ns = m.group(3)
-        elif tag == 'redirect':
-            redirect = True
-        elif tag == 'text':
-            inText = True
-            line = line[m.start(3):m.end(3)]
-            page.append(line)
-            if m.lastindex == 4:  # open-close
-                inText = False
-        elif tag == '/text':
-            if m.group(1):
-                page.append(m.group(1))
-            inText = False
-        elif inText:
-            page.append(line)
-        elif tag == '/page':
-            if id != last_id and not redirect and ns not in templateKeys:
-                job = (id, title, page, ordinal)
-                jobs_queue.put(job)  # goes to any available extract_process
-                logging.info('%s\t%s', id, title)
-                last_id = id
-                ns = '0'
-                ordinal += 1
-            id = None
-            page = []
+    page_num = 0
+    for page_data in pages_from(input):
+        id, title, ns, page = page_data
+        if ns not in templateKeys:
+            job = (id, title, page, page_num)
+            # logging.info('Put: %s %s', id, page_num)  # DEBUG
+            jobs_queue.put(job)  # goes to any available extract_process
+            page_num += 1
 
     input.close()
@@ -2538,12 +2525,10 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     # wait for it to finish
     reduce.join()
 
-    if output != sys.stdout:
-        output.close()
     extract_duration = default_timer() - extract_start
-    extract_rate = ordinal / extract_duration
+    extract_rate = page_num / extract_duration
     logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)",
-                 process_count, ordinal, extract_duration, extract_rate)
+                 process_count, page_num, extract_duration, extract_rate)
 
 
 # ----------------------------------------------------------------------
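
In miniature, the process layout wired up above is one mapper feeding a jobs queue, worker_count extractors, and a single reduce process that restores page order. A self-contained toy version (not the patch's code; all names invented):

    from multiprocessing import Process, Queue

    def worker(jobs, results):
        while True:
            job = jobs.get()
            if job is None:                 # poison pill, like the None used above
                break
            page_num, text = job
            results.put((page_num, text.upper()))  # stands in for Extractor.extract()

    def reducer(results, expected):
        collected, next_page = {}, 0
        while next_page < expected:
            if next_page in collected:      # emit strictly in page_num order
                print collected.pop(next_page)
                next_page += 1
            else:
                page_num, text = results.get()
                collected[page_num] = text

    if __name__ == '__main__':
        jobs, results = Queue(), Queue()
        pages = ['alpha', 'beta', 'gamma', 'delta']
        workers = [Process(target=worker, args=(jobs, results)) for _ in range(2)]
        for w in workers:
            w.start()
        red = Process(target=reducer, args=(results, len(pages)))
        red.start()
        for num, text in enumerate(pages):
            jobs.put((num, text))
        for _ in workers:
            jobs.put(None)
        for w in workers:
            w.join()
        red.join()
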
""" while True: - job = jobs_queue.get() # job is (id, title, page, ordinal) + job = jobs_queue.get() # job is (id, title, page, page_num) if job: - out = StringIO() # memory buffer - Extractor(*job[:3]).extract(out) # (id, title, page) - text = out.getvalue() - output_queue.put((job[3], text)) # (ordinal, extracted_text) + id, title, page, page_num = job + # logging.info('Got: %s %s', id, page_num) # DEBUG + out = StringIO() # memory buffer + try: + Extractor(*job[:3]).extract(out) # (id, title, page) + text = out.getvalue() + except: + text = '' + logging.error('Processing page: %s %s', id, title) + # logging.info('Done: %s %s', id, page_num) # DEBUG + output_queue.put((page_num, text)) out.close() else: break -def reduce_process(output_queue, out_file=None, file_size=0, file_compress=True): +period = 100000 # progress report period +def reduce_process(output_queue, + out_file=None, file_size=0, file_compress=True): """Pull finished article text, write series of files (or stdout) :param output_queue: text to be output. :param out_file: filename where to print. @@ -2584,27 +2578,33 @@ def reduce_process(output_queue, out_file=None, file_size=0, file_compress=True) logging.warn("writing to stdout, so no output compression (use an external tool)") interval_start = default_timer() - period = 100000 # FIXME: use a heap - ordering_buffer = {} # collected pages - next_ordinal = 0 # sequence number of pages + collected_pages = {} # collected pages + next_page = 0 # sequence numbering of page while True: - if next_ordinal in ordering_buffer: - output.write(ordering_buffer.pop(next_ordinal)) - next_ordinal += 1 + if next_page in collected_pages: + output.write(collected_pages.pop(next_page)) + next_page += 1 # progress report - if next_ordinal % period == 0: + if next_page % period == 0: interval_rate = period / (default_timer() - interval_start) logging.info("Extracted %d articles (%.1f art/s)", - next_ordinal, interval_rate) + next_page, interval_rate) interval_start = default_timer() else: # mapper puts None to signal finish pair = output_queue.get() if not pair: break - ordinal, text = pair - ordering_buffer[ordinal] = text + page_num, text = pair + collected_pages[page_num] = text + # FIXME: if an extractor dies, process stalls; the other processes + # continue to produce pairs, filling up memory. 
+            if len(collected_pages) > 200:  # DEBUG
+                logging.debug('Collected %d, wait: %d, %d', len(collected_pages),
+                              next_page, next_page == page_num)
+    if output != sys.stdout:
+        output.close()
 
 
 # ----------------------------------------------------------------------
 
@@ -2615,7 +2615,7 @@ minFileSize = 200 * 1024
 
 def main():
     global urlbase, acceptedNamespaces
-    global expand_templates, templateCache, escape_doc
+    global templateCache, escape_doc
 
     parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                      formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -2669,7 +2669,7 @@ def main():
 
     if args.html:
         Extractor.keepLinks = True
 
-    expand_templates = args.no_templates
+    Extractor.expand_templates = args.no_templates
     escape_doc = args.escapedoc
 
     try:
@@ -2708,17 +2708,11 @@ def main():
             with open(args.templates) as file:
                 load_templates(file)
 
-        with open(input_file) as file:
-            page = file.read().decode('utf-8')
-            m = re.search(r'<id>(.*)</id>', page)
-            id = m.group(1) if m else 0
-            m = re.search(r'<title>(.*)</title>', page)
-            if m:
-                title = m.group(1)
-            else:
-                logging.error('Missing title element')
-                return
-            Extractor(id, title, [page]).extract(sys.stdout)
+        file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+        for page_data in pages_from(file):
+            id, title, ns, page = page_data
+            Extractor(id, title, page).extract(sys.stdout)
+        file.close()
         return
 
     output_path = args.output
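
Since the single-article branch now reads its input through fileinput with hook_compressed, a saved page may be plain, .gz or .bz2. A hypothetical invocation (file name invented):

    python WikiExtractor.py --article --no-templates saved_page.xml.bz2 > extracted.txt
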
diff --git a/extractPage.py b/extractPage.py
index 06458e1..29dfef2 100755
--- a/extractPage.py
+++ b/extractPage.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # =============================================================================
-#  Version: 2.8 (Jan 10, 2015)
+#  Version: 2.9 (Feb 13, 2016)
 #  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
 # =============================================================================
@@ -39,7 +39,7 @@ import Queue, threading, multiprocessing
 
 
 # Program version
-version = '2.8'
+version = '2.9'
 
 # ----------------------------------------------------------------------
 # READER
 
@@ -48,10 +48,11 @@ tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
 #tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)')
 #                    1     2            3
 
-def process_data(input_file, id, templates=False):
+def process_data(input_file, ids, templates=False):
     """
     :param input_file: name of the wikipedia dump file.
-    :param id: article id
+    :param ids: article ids (single id or range first-last).
+    :param templates: whether to also collect templates.
     """
 
     if input_file.lower().endswith("bz2"):
@@ -60,8 +61,16 @@ def process_data(input_file, ids, templates=False):
         opener = open
 
     input = opener(input_file)
 
+    print '<mediawiki>'
+    rang = ids.split('-')
+    first = int(rang[0])
+    if len(rang) == 1:
+        last = first
+    else:
+        last = int(rang[1])
     page = []
+    curid = 0
     for line in input:
         line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
@@ -76,11 +85,13 @@ def process_data(input_file, ids, templates=False):
                 page = []
             page.append(line)
             inArticle = False
-        elif tag == 'id':
-            curid = m.group(3)
-            if id == curid:
+        elif tag == 'id' and not curid:  # the first <id> is the page id; others follow
+            curid = int(m.group(3))
+            if first <= curid <= last:
                 page.append(line)
                 inArticle = True
+            elif curid > last and not templates:
+                break
         elif not inArticle and not templates:
             page = []
         elif tag == 'title':
@@ -95,12 +106,14 @@ def process_data(input_file, ids, templates=False):
             if page:
                 page.append(line)
                 print ''.join(page).encode('utf-8')
-            if not templates:
+            if not templates and curid == last:
                 break
+            curid = 0
             page = []
         elif page:
             page.append(line)
 
+    print '</mediawiki>'
     input.close()
 
 def main():
@@ -110,9 +123,9 @@ def main():
     parser.add_argument("input",
                         help="XML wiki dump file")
     parser.add_argument("--id", default="",
-                        help="article number")
+                        help="article number, or range first-last")
     parser.add_argument("--template", action="store_true",
-                        help="template number")
+                        help="also extract all templates")
     parser.add_argument("-v", "--version", action="version",
                         version='%(prog)s ' + version,
                         help="print program version")
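
A usage sketch of the new id-range support (dump name and ids invented); since the pages are now wrapped in a <mediawiki> element, the result is a well-formed fragment that WikiExtractor.py can consume directly:

    python extractPage.py --id 12 dump.xml.bz2 > page12.xml
    python extractPage.py --id 12-15 dump.xml.bz2 > pages.xml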