See ChangeLog.

attardi 2016-02-15 01:22:38 +01:00
parent 834cad6a35
commit 6d0577ef10
3 changed files with 278 additions and 255 deletions

ChangeLog

@@ -1,3 +1,19 @@
2016-02-15 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (Extractor.clean): turned into a method.
2016-02-14 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (load_templates): also save <id> in templates.
(pages_from): common iterator for extracting pages from the dump, used
for analyzing pages, templates, and single articles alike.
2016-02-13 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (reduce_process): close file here.
* extractPage.py (process_data): allow range of ids.
2016-02-12 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (reduce_process): moved creation of OutputSplitter here.

WikiExtractor.py

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# =============================================================================
# Version: 2.48 (February 12, 2016)
# Version: 2.49 (February 14, 2016)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# Contributors:
@@ -46,6 +46,7 @@ This version performs template expansion by preprocessing the whole dump and
collecting template definitions.
"""
import sys
import argparse
import bz2
import codecs
@@ -54,19 +55,18 @@ import fileinput
import logging
import os.path
import re # TODO use regex when it will be standard
import sys
import time
import urllib
from cStringIO import StringIO
from htmlentitydefs import name2codepoint
from itertools import izip, izip_longest
from multiprocessing import Queue, Process, cpu_count
from multiprocessing import Queue, Process, Array, cpu_count
from timeit import default_timer
# ===========================================================================
# Program version
version = '2.48'
version = '2.49'
## PARAMS ####################################################################
@@ -394,7 +394,6 @@ class TemplateArg(object):
substWords = 'subst:|safesubst:'
class Extractor(object):
"""
An extraction task on an article.
@@ -415,13 +414,20 @@ class Extractor(object):
# Whether to output HTML instead of text
toHTML = False
def __init__(self, id, title, page):
##
# Whether to expand templates
expand_templates = True
def __init__(self, id, title, lines):
"""
:param page: a list of lines.
:param id: id of page.
:param title: title of page.
:param lines: a list of lines.
"""
self.id = id
self.title = title
self.page = page
self.text = ''.join(lines)
self.magicWords = MagicWords()
self.frame = []
self.recursion_exceeded_1_errs = 0 # template recursion within expandTemplates()
@@ -433,8 +439,7 @@ class Extractor(object):
"""
:param out: a memory file.
"""
logging.debug("%s\t%s", self.id, self.title)
text = ''.join(self.page)
logging.info("%s\t%s", self.id, self.title)
url = get_url(self.id)
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
# Separate header from text with a newline.
@@ -447,7 +452,7 @@
self.magicWords['currentday'] = time.strftime('%d')
self.magicWords['currenthour'] = time.strftime('%H')
self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
text = clean(self, text)
text = self.clean()
footer = "\n</doc>\n"
out.write(header)
for line in compact(text):
@@ -462,6 +467,112 @@
logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)",
self.title, self.id, *errs)
def clean(self):
"""
Transforms wiki markup. If the command-line flag --escapedoc is set, the text is also escaped.
@see https://www.mediawiki.org/wiki/Help:Formatting
"""
text = self.text
self.text = '' # save memory
if Extractor.expand_templates:
# expand templates
# See: http://www.mediawiki.org/wiki/Help:Templates
text = self.expandTemplates(text)
else:
# Drop transclusions (template, parser functions)
text = dropNested(text, r'{{', r'}}')
# Drop tables
text = dropNested(text, r'{\|', r'\|}')
# replace external links
text = replaceExternalLinks(text)
# replace internal links
text = replaceInternalLinks(text)
# drop MagicWords behavioral switches
text = magicWordsRE.sub('', text)
# ############### Process HTML ###############
# turn into HTML, except for the content of <syntaxhighlight>
res = ''
cur = 0
for m in syntaxhighlight.finditer(text):
end = m.end()
res += unescape(text[cur:m.start()]) + m.group(1)
cur = end
text = res + unescape(text[cur:])
# Handle bold/italic/quote
if self.toHTML:
text = bold_italic.sub(r'<b>\1</b>', text)
text = bold.sub(r'<b>\1</b>', text)
text = italic.sub(r'<i>\1</i>', text)
else:
text = bold_italic.sub(r'\1', text)
text = bold.sub(r'\1', text)
text = italic_quote.sub(r'"\1"', text)
text = italic.sub(r'"\1"', text)
text = quote_quote.sub(r'"\1"', text)
# residuals of unbalanced quotes
text = text.replace("'''", '').replace("''", '"')
# Collect spans
spans = []
# Drop HTML comments
for m in comment.finditer(text):
spans.append((m.start(), m.end()))
# Drop self-closing tags
for pattern in selfClosing_tag_patterns:
for m in pattern.finditer(text):
spans.append((m.start(), m.end()))
# Drop ignored tags
for left, right in ignored_tag_patterns:
for m in left.finditer(text):
spans.append((m.start(), m.end()))
for m in right.finditer(text):
spans.append((m.start(), m.end()))
# Bulk remove all spans
text = dropSpans(spans, text)
# Drop discarded elements
for tag in discardElements:
text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
if not self.toHTML:
# Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
text = unescape(text)
# Expand placeholders
for pattern, placeholder in placeholder_tag_patterns:
index = 1
for match in pattern.finditer(text):
text = text.replace(match.group(), '%s_%d' % (placeholder, index))
index += 1
text = text.replace('<<', u'«').replace('>>', u'»')
#############################################
# Cleanup text
text = text.replace('\t', ' ')
text = spaces.sub(' ', text)
text = dots.sub('...', text)
text = re.sub(u' (,:\.\)\]»)', r'\1', text)
text = re.sub(u'(\[\(«) ', r'\1', text)
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
text = text.replace(',,', ',').replace(',.', '.')
if escape_doc:
text = cgi.escape(text)
return text
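As a usage note for the new method form: clean() now runs on state carried by the Extractor instance instead of taking the text as an argument. A minimal sketch of driving it directly, assuming WikiExtractor v2.49 is importable; the page id, title, markup, and the module globals set below (normally initialized in main() or from <siteinfo>) are illustrative:

    import sys
    import WikiExtractor as WE

    WE.urlbase = 'http://en.wikipedia.org/wiki/'  # normally parsed from <siteinfo>
    WE.escape_doc = False                         # normally set by --escapedoc
    WE.Extractor.expand_templates = False         # no template definitions loaded here
    lines = ["'''Anarchism''' is a [[political philosophy]].\n"]
    WE.Extractor('12', 'Anarchism', lines).extract(sys.stdout)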
# ----------------------------------------------------------------------
# Expand templates
@@ -1993,114 +2104,6 @@ tailRE = re.compile('\w+')
syntaxhighlight = re.compile('&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlight&gt;', re.DOTALL)
expand_templates = True
def clean(extractor, text):
"""
Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
@see https://www.mediawiki.org/wiki/Help:Formatting
"""
if expand_templates:
# expand templates
# See: http://www.mediawiki.org/wiki/Help:Templates
text = extractor.expandTemplates(text)
else:
# Drop transclusions (template, parser functions)
text = dropNested(text, r'{{', r'}}')
# Drop tables
text = dropNested(text, r'{\|', r'\|}')
# replace external links
text = replaceExternalLinks(text)
# replace internal links
text = replaceInternalLinks(text)
# drop MagicWords behavioral switches
text = magicWordsRE.sub('', text)
# ############### Process HTML ###############
# turn into HTML, except for the content of <syntaxhighlight>
res = ''
cur = 0
for m in syntaxhighlight.finditer(text):
end = m.end()
res += unescape(text[cur:m.start()]) + m.group(1)
cur = end
text = res + unescape(text[cur:])
# Handle bold/italic/quote
if extractor.toHTML:
text = bold_italic.sub(r'<b>\1</b>', text)
text = bold.sub(r'<b>\1</b>', text)
text = italic.sub(r'<i>\1</i>', text)
else:
text = bold_italic.sub(r'\1', text)
text = bold.sub(r'\1', text)
text = italic_quote.sub(r'"\1"', text)
text = italic.sub(r'"\1"', text)
text = quote_quote.sub(r'"\1"', text)
# residuals of unbalanced quotes
text = text.replace("'''", '').replace("''", '"')
# Collect spans
spans = []
# Drop HTML comments
for m in comment.finditer(text):
spans.append((m.start(), m.end()))
# Drop self-closing tags
for pattern in selfClosing_tag_patterns:
for m in pattern.finditer(text):
spans.append((m.start(), m.end()))
# Drop ignored tags
for left, right in ignored_tag_patterns:
for m in left.finditer(text):
spans.append((m.start(), m.end()))
for m in right.finditer(text):
spans.append((m.start(), m.end()))
# Bulk remove all spans
text = dropSpans(spans, text)
# Drop discarded elements
for tag in discardElements:
text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
if not extractor.toHTML:
# Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
text = unescape(text)
# Expand placeholders
for pattern, placeholder in placeholder_tag_patterns:
index = 1
for match in pattern.finditer(text):
text = text.replace(match.group(), '%s_%d' % (placeholder, index))
index += 1
text = text.replace('<<', u'«').replace('>>', u'»')
#############################################
# Cleanup text
text = text.replace('\t', ' ')
text = spaces.sub(' ', text)
text = dots.sub('...', text)
text = re.sub(u' (,:\.\)\]»)', r'\1', text)
text = re.sub(u'(\[\(«) ', r'\1', text)
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
text = text.replace(',,', ',').replace(',.', '.')
if escape_doc:
text = cgi.escape(text)
return text
# skip level 1, it is page name level
section = re.compile(r'(==+)\s*(.*?)\s*\1')
@@ -2308,13 +2311,57 @@ def load_templates(file, output_file=None):
templatePrefix = templateNamespace + ':'
global moduleNamespace, modulePrefix
modulePrefix = moduleNamespace + ':'
articles = 0
page = []
ns = '0'
inText = False
if output_file:
output = codecs.open(output_file, 'wb', 'utf-8')
for line in file:
for page_count, page_data in enumerate(pages_from(file)):
id, title, ns, page = page_data
if not output_file and (not templateNamespace or
not moduleNamespace): # do not know it yet
# reconstruct templateNamespace and moduleNamespace from the first title
if ns in templateKeys:
colon = title.find(':')
if colon > 1:
if ns == '10':
templateNamespace = title[:colon]
templatePrefix = title[:colon + 1]
elif ns == '828':
moduleNamespace = title[:colon]
modulePrefix = title[:colon + 1]
if ns in templateKeys:
text = ''.join(page)
define_template(title, text)
# save templates and modules to file
if output_file:
output.write('<page>\n')
output.write(' <title>%s</title>\n' % title)
output.write(' <ns>%s</ns>\n' % ns)
output.write(' <id>%s</id>\n' % id)
output.write(' <text>')
for line in page:
output.write(line)
output.write(' </text>\n')
output.write('</page>\n')
if page_count and page_count % 100000 == 0:
logging.info("Preprocessed %d pages", page_count)
if output_file:
output.close()
logging.info("Saved %d templates to '%s'", len(templates), output_file)
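For context, the intended round trip is: scan the dump once and cache the template definitions (now carrying their <id>), then feed the cache back on later runs instead of rescanning. A minimal sketch; file names are made up:

    import bz2
    import WikiExtractor as WE

    dump = bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2')
    WE.load_templates(dump, output_file='templates.xml')  # slow first pass
    dump.close()

    with open('templates.xml') as cache:
        WE.load_templates(cache)                          # fast on later runs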
def pages_from(input):
"""
Scans input extracting pages.
:return: (id, title, namespace, page), page is a list of lines.
"""
# we collect individual lines, since str.join() is significantly faster
# than concatenation
page = []
id = None
ns = '0'
last_id = None
inText = False
redirect = False
for line in input:
line = line.decode('utf-8')
if '<' not in line: # faster than doing re.search()
if inText:
@@ -2326,10 +2373,15 @@ def load_templates(file, output_file=None):
tag = m.group(2)
if tag == 'page':
page = []
redirect = False
elif tag == 'id' and not id:
id = m.group(3)
elif tag == 'title':
title = m.group(3)
elif tag == 'ns':
ns = m.group(3)
elif tag == 'redirect':
redirect = True
elif tag == 'text':
inText = True
line = line[m.start(3):m.end(3)]
@@ -2343,37 +2395,12 @@ def load_templates(file, output_file=None):
elif inText:
page.append(line)
elif tag == '/page':
if not output_file and not templateNamespace: # do not know it yet
# reconstruct templateNamespace and moduleNamespace from the first title
if ns in templateKeys:
colon = title.find(':')
if colon > 1:
if ns == '10':
templateNamespace = title[:colon]
templatePrefix = title[:colon + 1]
elif ns == '828':
moduleNamespace = title[:colon]
modulePrefix = title[:colon + 1]
if ns in templateKeys:
define_template(title, page)
# save templates and modules to file
if output_file:
output.write('<page>\n')
output.write(' <title>%s</title>\n' % title)
output.write(' <ns>%s</ns>\n' % ns)
output.write(' <text>')
for line in page:
output.write(line)
output.write(' </text>\n')
output.write('</page>\n')
if id != last_id and not redirect:
yield (id, title, ns, page)
last_id = id
ns = '0'
id = None
page = []
ns = '0'
articles += 1
if articles % 100000 == 0:
logging.info("Preprocessed %d pages", articles)
if output_file:
output.close()
logging.info("Saved %d templates to '%s'", len(templates), output_file)
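The contract of the new iterator, restated as a hedged sketch (dump name made up): pages_from() yields one (id, title, ns, page) tuple per non-redirect page, with page as a list of unicode lines, so callers can filter by namespace without reparsing the XML:

    import bz2
    from WikiExtractor import pages_from

    input = bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2')
    articles = 0
    for id, title, ns, page in pages_from(input):
        if ns == '0':              # main (article) namespace
            articles += 1
    input.close()
    print 'articles:', articles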
def process_dump(input_file, template_file, out_file, file_size, file_compress,
@@ -2419,7 +2446,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
elif tag == '/siteinfo':
break
if expand_templates:
if Extractor.expand_templates:
# preprocess
template_load_start = default_timer()
if template_file and os.path.exists(template_file):
@@ -2452,17 +2479,22 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
if out_file == '-':
out_file = None
# Reduce job that sorts and prints output
reduce = Process(target=reduce_process, args=(output_queue, out_file, file_size, file_compress))
worker_count = max(1, process_count)
# reduce job that sorts and prints output
reduce = Process(target=reduce_process,
args=(output_queue,
out_file, file_size, file_compress))
reduce.start()
# initialize jobs queue
jobs_queue = Queue(maxsize=maxsize)
# start worker processes
logging.info("Using %d extract processes.", process_count)
logging.info("Using %d extract processes.", worker_count)
workers = []
for _ in xrange(max(1, process_count)):
for i in xrange(worker_count):
extractor = Process(target=extract_process,
args=(jobs_queue, output_queue))
extractor.daemon = True # only live while parent process lives
@@ -2470,59 +2502,14 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
workers.append(extractor)
# Mapper process
# we collect individual lines, since str.join() is significantly faster
# than concatenation
page = []
id = None
ns = '0'
last_id = None
ordinal = 0 # page count
inText = False
redirect = False
for line in input:
line = line.decode('utf-8')
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
continue
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'page':
page = []
redirect = False
elif tag == 'id' and not id:
id = m.group(3)
elif tag == 'title':
title = m.group(3)
elif tag == 'ns':
ns = m.group(3)
elif tag == 'redirect':
redirect = True
elif tag == 'text':
inText = True
line = line[m.start(3):m.end(3)]
page.append(line)
if m.lastindex == 4: # open-close
inText = False
elif tag == '/text':
if m.group(1):
page.append(m.group(1))
inText = False
elif inText:
page.append(line)
elif tag == '/page':
if id != last_id and not redirect and ns not in templateKeys:
job = (id, title, page, ordinal)
jobs_queue.put(job) # goes to any available extract_process
logging.info('%s\t%s', id, title)
last_id = id
ns = '0'
ordinal += 1
id = None
page = []
page_num = 0
for page_data in pages_from(input):
id, title, ns, page = page_data
if ns not in templateKeys:
job = (id, title, page, page_num)
# logging.info('Put: %s %s', id, page_num) # DEBUG
jobs_queue.put(job) # goes to any available extract_process
page_num += 1
input.close()
@@ -2538,12 +2525,10 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
# wait for it to finish
reduce.join()
if output != sys.stdout:
output.close()
extract_duration = default_timer() - extract_start
extract_rate = ordinal / extract_duration
extract_rate = page_num / extract_duration
logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)",
process_count, ordinal, extract_duration, extract_rate)
process_count, page_num, extract_duration, extract_rate)
# ----------------------------------------------------------------------
@@ -2556,18 +2541,27 @@ def extract_process(jobs_queue, output_queue):
:param output_queue: where to queue extracted text for output.
"""
while True:
job = jobs_queue.get() # job is (id, title, page, ordinal)
job = jobs_queue.get() # job is (id, title, page, page_num)
if job:
out = StringIO() # memory buffer
Extractor(*job[:3]).extract(out) # (id, title, page)
text = out.getvalue()
output_queue.put((job[3], text)) # (ordinal, extracted_text)
id, title, page, page_num = job
# logging.info('Got: %s %s', id, page_num) # DEBUG
out = StringIO() # memory buffer
try:
Extractor(*job[:3]).extract(out) # (id, title, page)
text = out.getvalue()
except:
text = ''
logging.error('Processing page: %s %s', id, title)
# logging.info('Done: %s %s', id, page_num) # DEBUG
output_queue.put((page_num, text))
out.close()
else:
break
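The invariant behind that try/except, in a standalone sketch (safe_extract and the extract callable are hypothetical names, not in the diff): every job must yield exactly one (page_num, text) pair, even an empty one on failure, or the reducer would wait forever for the missing sequence number:

    import logging

    def safe_extract(job, extract, output_queue):
        # job is (id, title, page, page_num); extract is any callable that
        # returns the extracted text or raises on bad markup.
        id, title, page, page_num = job
        try:
            text = extract(id, title, page)
        except Exception:
            text = ''
            logging.error('Processing page: %s %s', id, title)
        output_queue.put((page_num, text))  # always answer, even if empty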
def reduce_process(output_queue, out_file=None, file_size=0, file_compress=True):
period = 100000 # progress report period
def reduce_process(output_queue,
out_file=None, file_size=0, file_compress=True):
"""Pull finished article text, write series of files (or stdout)
:param output_queue: text to be output.
:param out_file: filename where to print.
@@ -2584,27 +2578,33 @@ def reduce_process(output_queue, out_file=None, file_size=0, file_compress=True)
logging.warn("writing to stdout, so no output compression (use an external tool)")
interval_start = default_timer()
period = 100000
# FIXME: use a heap
ordering_buffer = {} # collected pages
next_ordinal = 0 # sequence number of pages
collected_pages = {} # collected pages
next_page = 0 # sequence numbering of page
while True:
if next_ordinal in ordering_buffer:
output.write(ordering_buffer.pop(next_ordinal))
next_ordinal += 1
if next_page in collected_pages:
output.write(collected_pages.pop(next_page))
next_page += 1
# progress report
if next_ordinal % period == 0:
if next_page % period == 0:
interval_rate = period / (default_timer() - interval_start)
logging.info("Extracted %d articles (%.1f art/s)",
next_ordinal, interval_rate)
next_page, interval_rate)
interval_start = default_timer()
else:
# mapper puts None to signal finish
pair = output_queue.get()
if not pair:
break
ordinal, text = pair
ordering_buffer[ordinal] = text
page_num, text = pair
collected_pages[page_num] = text
# FIXME: if an extractor dies, process stalls; the other processes
# continue to produce pairs, filling up memory.
if len(collected_pages) > 200: # DEBUG
logging.debug('Collected %d, wait: %d, %d', len(collected_pages),
next_page, next_page == page_num)
if output != sys.stdout:
output.close()
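The buffer-and-flush pattern above, detached from the queues as a toy example (arrival order scrambled on purpose) to show why out-of-order workers still produce in-order output:

    collected = {}
    next_page = 0
    for page_num, text in [(2, 'c'), (0, 'a'), (1, 'b')]:  # out-of-order arrivals
        collected[page_num] = text
        while next_page in collected:        # flush whatever is now contiguous
            print collected.pop(next_page),  # prints: a b c
            next_page += 1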
# ----------------------------------------------------------------------
@@ -2615,7 +2615,7 @@ minFileSize = 200 * 1024
def main():
global urlbase, acceptedNamespaces
global expand_templates, templateCache, escape_doc
global templateCache, escape_doc
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -2669,7 +2669,7 @@ def main():
if args.html:
Extractor.keepLinks = True
expand_templates = args.no_templates
Extractor.expand_templates = args.no_templates
escape_doc = args.escapedoc
try:
@@ -2708,17 +2708,11 @@ def main():
with open(args.templates) as file:
load_templates(file)
with open(input_file) as file:
page = file.read().decode('utf-8')
m = re.search(r'<id>(.*)</id>', page)
id = m.group(1) if m else 0
m = re.search(r'<title>(.*)</title>', page)
if m:
title = m.group(1)
else:
logging.error('Missing title element')
return
Extractor(id, title, [page]).extract(sys.stdout)
file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
for page_data in pages_from(file):
id, title, ns, page = page_data
Extractor(id, title, page).extract(sys.stdout)
file.close()
return
output_path = args.output
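A library-level mirror of the new single-article path, as a sketch (file name made up; the calls are those shown in the diff): pages_from() replaces the old regex scraping, and fileinput's hook_compressed transparently opens .bz2/.gz input:

    import fileinput
    import sys
    from WikiExtractor import Extractor, pages_from

    file = fileinput.FileInput('page.xml.bz2', openhook=fileinput.hook_compressed)
    for id, title, ns, page in pages_from(file):
        Extractor(id, title, page).extract(sys.stdout)
    file.close()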

extractPage.py

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 2.8 (Jan 10, 2015)
# Version: 2.9 (Feb 13, 2016)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
# =============================================================================
@@ -39,7 +39,7 @@ import Queue, threading, multiprocessing
# Program version
version = '2.8'
version = '2.9'
# ----------------------------------------------------------------------
# READER
@@ -48,10 +48,11 @@ tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
#tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)')
# 1 2 3
def process_data(input_file, id, templates=False):
def process_data(input_file, ids, templates=False):
"""
:param input_file: name of the wikipedia dump file.
:param id: article id
:param ids: article ids (single or range first-last).
:param templates: collect also templates
"""
if input_file.lower().endswith("bz2"):
@@ -60,8 +61,16 @@ def process_data(input_file, id, templates=False):
opener = open
input = opener(input_file)
print '<mediawiki>'
rang = ids.split('-')
first = int(rang[0])
if len(rang) == 1:
last = first
else:
last = int(rang[1])
page = []
curid = 0
for line in input:
line = line.decode('utf-8')
if '<' not in line: # faster than doing re.search()
@@ -76,11 +85,13 @@
page = []
page.append(line)
inArticle = False
elif tag == 'id':
curid = m.group(3)
if id == curid:
elif tag == 'id' and not curid: # other <id> are present
curid = int(m.group(3))
if first <= curid <= last:
page.append(line)
inArticle = True
elif curid > last and not templates:
break
elif not inArticle and not templates:
page = []
elif tag == 'title':
@@ -95,12 +106,14 @@
if page:
page.append(line)
print ''.join(page).encode('utf-8')
if not templates:
if not templates and curid == last:
break
curid = 0
page = []
elif page:
page.append(line)
print '</mediawiki>'
input.close()
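Usage sketch for the new range support (dump name made up): ids may be a single id or a first-last range, and process_data() prints the selected pages wrapped in <mediawiki> tags:

    from extractPage import process_data

    process_data('enwiki-latest-pages-articles.xml.bz2', '12-30')            # range
    process_data('enwiki-latest-pages-articles.xml.bz2', '12', templates=True)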
def main():
@@ -110,9 +123,9 @@
parser.add_argument("input",
help="XML wiki dump file")
parser.add_argument("--id", default="",
help="article number")
help="article number, or range first-last")
parser.add_argument("--template", action="store_true",
help="template number")
help="also extract all templates")
parser.add_argument("-v", "--version", action="version",
version='%(prog)s ' + version,
help="print program version")