Merged PR from Seth Cleveland.

attardi 2016-06-19 13:10:36 +02:00
parent f9b8e8ac02
commit 0f703c0aae
3 changed files with 127 additions and 29 deletions

ChangeLog

@@ -2,6 +2,12 @@
* WikiExtractor.py: support for Python 3 by orangain@gmail.com
2016-03-23 Seth Cleveland <scleveland@turnitin.com>
* WikiExtractor.py: added filtering options -- minimum text length, XML
namespace, and disambiguation pages. Added an option to print the
revision id with each document.
2016-03-23 Giuseppe Attardi <attardi@di.unipi.it>
* WikiExtractor.py (compact): properly emit section headers when
Extractor.keepSections is set.

README.md

@@ -33,11 +33,25 @@ The script is invoked with a Wikipedia dump file as an argument.
The output is stored in several files of similar size in a given directory.
Each file will contain several documents in this [document format](http://medialab.di.unipi.it/wiki/Document_Format).
usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--html] [-l]
                        [-ns ns1,ns2] [-s] [--templates TEMPLATES]
                        [--no-templates] [--processes PROCESSES] [-q] [--debug]
                        [-a] [-v]
                        input
usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--html] [-l] [-s]
                        [--lists] [-ns ns1,ns2] [-xns ns1,ns2]
                        [--templates TEMPLATES] [--no-templates] [--escapedoc]
                        [-r] [--min_text_length MIN_TEXT_LENGTH]
                        [--filter_disambig_pages] [--processes PROCESSES] [-q]
                        [--debug] [-a] [-v]
                        input
Wikipedia Extractor:
Extracts and cleans text from a Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" revid="" url="" title="">
...
</doc>
Template expansion requires preprocessing the whole dump first and
collecting template definitions.
positional arguments:
  input                 XML wiki dump file
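For instance, a run that keeps revision ids, restricts extraction to the main
article namespace, drops disambiguation pages, and skips very short documents
could look like this (file names are illustrative):

    python WikiExtractor.py -o extracted -r -xns 0 --filter_disambig_pages \
        --min_text_length 100 enwiki-latest-pages-articles.xml.bz2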
@@ -48,7 +62,7 @@ Each file will contain several documents in this [document format](http://media
Output:
  -o OUTPUT, --output OUTPUT
                        a directory where to store the extracted files (or '-' for dumping to
                        directory for extracted files (or '-' for dumping to
                        stdout)
  -b n[KMG], --bytes n[KMG]
                        maximum bytes per output file (default 1M)
@@ -57,21 +71,33 @@ Each file will contain several documents in this [document format](http://media
Processing:
  --html                produce HTML output, subsumes --links
  -l, --links           preserve links
  -s, --sections        preserve sections
  --lists               preserve lists
  -ns ns1,ns2, --namespaces ns1,ns2
                        accepted namespaces
                        accepted link namespaces
  -xns ns1,ns2, --xml_namespaces ns1,ns2
                        accepted page xml namespaces -- 0 for main/articles
  --templates TEMPLATES
                        use or create file containing templates
  --no-templates        Do not expand templates
  --escapedoc           escape the contents of the output
                        <doc>...</doc>
  -r, --revision        Include the document revision id (default=False)
  --min_text_length MIN_TEXT_LENGTH
                        Minimum expanded text length required to write
                        document (default=0)
  --filter_disambig_pages
                        Remove pages from output that contain disambiguation
                        markup (default=False)

Special:
  -q, --quiet           suppress reporting progress info
  --debug               print debug info
  -a, --article         analyze a file containing a single article (debug option)
  -a, --article         analyze a file containing a single article (debug
                        option)
  -v, --version         print program version
Saving templates to a file will speed up extraction the next time, assuming
template definitions have not changed.
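For example, a first run can save the template definitions and later runs can
reuse them (paths are illustrative):

    python WikiExtractor.py --templates enwiki-templates.txt -o extracted \
        enwiki-latest-pages-articles.xml.bz2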

WikiExtractor.py

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# =============================================================================
# Version: 2.56 (June 19, 2016)
# Version: 2.57 (June 19, 2016)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# Contributors:
@@ -15,6 +15,7 @@
# Wim Muskee (wimmuskee@gmail.com)
# Radics Geza (radicsge@gmail.com)
# orangain (orangain@gmail.com)
# Seth Cleveland (scleveland@turnitin.com)
#
# =============================================================================
# Copyright (c) 2011-2016. Giuseppe Attardi (attardi@di.unipi.it).
@@ -39,7 +40,7 @@ Extracts and cleans text from a Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" url="" title="">
<doc id="" revid="" url="" title="">
...
</doc>
@@ -81,7 +82,7 @@ else:
# ===========================================================================
# Program version
version = '2.56'
version = '2.57'
## PARAMS ####################################################################
@@ -106,13 +107,22 @@ templatePrefix = ''
moduleNamespace = ''
##
# Recognize only these namespaces
# Recognize only these namespaces in links
# w: Internal links to the Wikipedia
# wiktionary: Wiki dictionary
# wikt: shortcut for Wiktionary
#
acceptedNamespaces = ['w', 'wiktionary', 'wikt']
##
# Desired XML namespaces to extract -- allXMLNamespaces (the empty set) means
# everything; use the integers from
# https://en.wikipedia.org/wiki/Wikipedia:Namespace to filter pages
allXMLNamespaces = set()
acceptedXMLNamespaces = allXMLNamespaces
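# Note: the empty set doubles as an "accept everything" sentinel; keepPage
# below compares against allXMLNamespaces with `is`, so filtering only kicks
# in once acceptedXMLNamespaces has been rebound to a different set object.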
##
# Drop these elements from article text
#
@@ -128,6 +138,27 @@ discardElements = [
# This is obtained from <siteinfo>
urlbase = ''
##
# Filter disambiguation pages
filter_disambig_pages = False
filter_disambig_page_pattern = re.compile(r"{{disambig(uation)?(\|[^}]*)?}}")
##
# Page filtering logic -- remove templates, undesired XML namespaces, and
# disambiguation pages
def keepPage(ns, page):
    # remove modules and templates from output
    if ns in templateKeys:
        return False
    # filter this page based on namespace, unless we want all namespaces
    if not (acceptedXMLNamespaces is allXMLNamespaces) and ns not in acceptedXMLNamespaces:
        return False
    # remove disambig pages if desired
    if filter_disambig_pages:
        for line in page:
            if filter_disambig_page_pattern.match(line):
                return False
    return True
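A minimal sketch of how the disambiguation pattern behaves (inputs are
illustrative; note that match() anchors at the start of the line, so only
lines beginning with the template are caught):

    import re
    pattern = re.compile(r"{{disambig(uation)?(\|[^}]*)?}}")
    bool(pattern.match("{{disambiguation}}"))       # True
    bool(pattern.match("{{disambig|geodis}}"))      # True
    bool(pattern.match("See {{disambiguation}}."))  # False: not at line start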
def get_url(uid):
    return "%s?curid=%s" % (urlbase, uid)
@@ -433,14 +464,22 @@ class Extractor(object):
    # Whether to expand templates
    expand_templates = True
    ##
    # Print the Wikipedia article revision id
    print_revision = False
    def __init__(self, id, title, lines):
    ##
    # Minimum expanded text length required to print document
    min_text_length = 0
    def __init__(self, id, revid, title, lines):
        """
        :param id: id of page.
        :param title: title of page.
        :param lines: a list of lines.
        """
        self.id = id
        self.revid = revid
        self.title = title
        self.text = ''.join(lines)
        self.magicWords = MagicWords()
@@ -456,7 +495,10 @@
"""
logging.debug("%s\t%s", self.id, self.title)
url = get_url(self.id)
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
if Extractor.print_revision:
header = '<doc id="%s" revid="%s" url="%s" title="%s">\n' % (self.id, self.revid, url, self.title)
else:
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
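        # With print_revision enabled the header might read, e.g.
        # (values illustrative):
        #   <doc id="12" revid="123456" url="http://en.wikipedia.org/wiki?curid=12" title="Anarchism">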
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        self.magicWords['pagename'] = self.title
@@ -466,10 +508,12 @@
        self.magicWords['currentday'] = time.strftime('%d')
        self.magicWords['currenthour'] = time.strftime('%H')
        self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
        text = self.clean()
        text = [line.encode('utf-8') for line in compact(self.clean())]
        footer = "\n</doc>\n"
        if sum(len(line) for line in text) < Extractor.min_text_length:
            return
        out.write(header)
        for line in compact(text):
        for line in text:
            out.write(line)
            out.write('\n')
        out.write(footer)
@@ -2338,7 +2382,7 @@ def load_templates(file, output_file=None):
    if output_file:
        output = codecs.open(output_file, 'wb', 'utf-8')
    for page_count, page_data in enumerate(pages_from(file)):
        id, title, ns, page = page_data
        id, revid, title, ns, page = page_data
        if not output_file and (not templateNamespace or
                                not moduleNamespace):  # do not know it yet
            # reconstruct templateNamespace and moduleNamespace from the first title
@@ -2383,8 +2427,10 @@ def pages_from(input):
    id = None
    ns = '0'
    last_id = None
    revid = None
    inText = False
    redirect = False
    title = None
    for line in input:
        line = line.decode('utf-8')
        if '<' not in line:  # faster than doing re.search()
@@ -2398,8 +2444,10 @@
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not id:  # skip nested <id>
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'id' and id:
            revid = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'ns':
@@ -2420,10 +2468,12 @@
            page.append(line)
        elif tag == '/page':
            if id != last_id and not redirect:
                yield (id, title, ns, page)
                yield (id, revid, title, ns, page)
                last_id = id
                ns = '0'
            id = None
            revid = None
            title = None
            page = []
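The two <id> branches above reflect the MediaWiki export format, where the
first <id>, directly under <page>, is the page id and the second, inside
<revision>, is the revision id. A stripped-down page looks roughly like this
(values illustrative):

    <page>
      <title>Anarchism</title>
      <ns>0</ns>
      <id>12</id>                <!-- page id: first <id> seen -->
      <revision>
        <id>123456</id>          <!-- revision id: second <id> seen -->
        <text>...</text>
      </revision>
    </page>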
@@ -2441,6 +2491,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    global knownNamespaces
    global templateNamespace, templatePrefix
    global moduleNamespace, modulePrefix
    global allXMLNamespaces, acceptedXMLNamespaces
    if input_file == '-':
        input = sys.stdin
@@ -2533,8 +2584,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    # Mapper process
    page_num = 0
    for page_data in pages_from(input):
        id, title, ns, page = page_data
        if ns not in templateKeys:
        id, revid, title, ns, page = page_data
        if keepPage(ns, page):
            # slow down
            delay = 0
            if spool_length.value > max_spool_length:
@@ -2544,7 +2595,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
                    delay += 10
            if delay:
                logging.info('Delay %ds', delay)
            job = (id, title, page, page_num)
            job = (id, revid, title, page, page_num)
            jobs_queue.put(job)  # goes to any available extract_process
            page_num += 1
        page = None  # free memory
@@ -2583,15 +2634,16 @@ def extract_process(i, jobs_queue, output_queue):
    while True:
        job = jobs_queue.get()  # job is (id, revid, title, page, page_num)
        if job:
            id, title, page, page_num = job
            id, revid, title, page, page_num = job
            try:
                e = Extractor(*job[:3])  # (id, title, page)
                e = Extractor(*job[:4])  # (id, revid, title, page)
                page = None  # free memory
                e.extract(out)
                text = out.getvalue()
            except:
                text = ''
                logging.error('Processing page: %s %s', id, title)
            output_queue.put((page_num, text))
            out.truncate(0)
            out.seek(0)
@@ -2661,7 +2713,7 @@ minFileSize = 200 * 1024
def main():
    global urlbase, acceptedNamespaces
    global urlbase, acceptedNamespaces, acceptedXMLNamespaces, filter_disambig_pages
    global templateCache, escape_doc
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
@@ -2688,13 +2740,21 @@ def main():
groupP.add_argument("--lists", action="store_true",
help="preserve lists")
groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
help="accepted namespaces")
help="accepted link namespaces")
groupP.add_argument("-xns", "--xml_namespaces", default="", metavar="ns1,ns2",
help="accepted page xml namespaces -- 0 for main/articles")
groupP.add_argument("--templates",
help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false",
help="Do not expand templates")
groupP.add_argument("--escapedoc", action="store_true",
help="use to escape the contents of the output <doc>...</doc>")
groupP.add_argument("-r", "--revision", action="store_true", default=Extractor.print_revision,
help="Include the document revision id (default=%(default)s)")
groupP.add_argument("--min_text_length", type=int, default=Extractor.min_text_length,
help="Minimum expanded text length required to write document (default=%(default)s)")
groupP.add_argument("--filter_disambig_pages", action="store_true", default=filter_disambig_pages,
help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
default_process_count = cpu_count() - 1
parser.add_argument("--processes", type=int, default=default_process_count,
help="Number of processes to use (default %(default)s)")
@@ -2716,11 +2776,14 @@ def main():
    Extractor.keepSections = args.sections
    Extractor.keepLists = args.lists
    Extractor.toHTML = args.html
    Extractor.print_revision = args.revision
    Extractor.min_text_length = args.min_text_length
    if args.html:
        Extractor.keepLinks = True
    Extractor.expand_templates = args.no_templates
    escape_doc = args.escapedoc
    filter_disambig_pages = args.filter_disambig_pages
    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
@@ -2734,6 +2797,9 @@ def main():
    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))
    if args.xml_namespaces:
        acceptedXMLNamespaces = set([unicode(ns) for ns in args.xml_namespaces.split(',')])
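    # Sketch: with -xns 0,14 this yields acceptedXMLNamespaces == set([u'0', u'14'])
    # (unicode is the Python 2 built-in). These strings are compared against the
    # ns values pages_from reads from each page's <ns> element, so 0 keeps main
    # articles and 14 keeps category pages.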
    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)
@@ -2760,8 +2826,8 @@ def main():
        file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
        for page_data in pages_from(file):
            id, title, ns, page = page_data
            Extractor(id, title, page).extract(sys.stdout)
            id, revid, title, ns, page = page_data
            Extractor(id, revid, title, page).extract(sys.stdout)
        file.close()
        return