Upgrade to Python 3.3+.
parent 6408a430fc
commit 62bdbe6106
.gitignore (vendored): 105 lines changed
@@ -1,93 +1,52 @@
local/
tmp/

### https://raw.github.com/github/gitignore/c699a4f4684e9e294c9c550f820ca330f019b6f9/python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
# Packages
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
.tox
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Flask instance folder
instance/
/docs/_build
.idea
*.iml

# Scrapy stuff:
.scrapy
.travis-solo
G*
*.db
*.mdb

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Editor files
*.idea
# Vim
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~
@@ -1,7 +1,8 @@
# WikiExtractor
[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).

The tool is written in Python and requires Python 2.7 or Python 3.3+ but no additional library.
The tool is written in Python and requires Python 3.7; no additional library is needed.
**Warning**: problems have been reported on Windows due to poor support for `StringIO` in the Python implementation there.

For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
WikiExtractor.py: 3309 lines changed (diff too large to display)
categories.filter: 36512 lines changed (diff too large to display)
@@ -11,15 +11,15 @@
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License, version 3,
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

@@ -43,7 +43,7 @@ import gzip
import logging

# Program version
version = '1.00'
version = '3.0'

urlbase = 'http://it.wikipedia.org/'
scripts/WikiExtractor.py (new executable file): 608 lines
@@ -0,0 +1,608 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# =============================================================================
# Version: 3.0 (July 22, 2020)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# Contributors:
#   Antonio Fuschetto (fuschett@aol.com)
#   Leonardo Souza (lsouza@amtera.com.br)
#   Juan Manuel Caicedo (juan@cavorite.com)
#   Humberto Pereira (begini@gmail.com)
#   Siegfried-A. Gevatter (siegfried@gevatter.com)
#   Pedro Assis (pedroh2306@gmail.com)
#   Wim Muskee (wimmuskee@gmail.com)
#   Radics Geza (radicsge@gmail.com)
#
# =============================================================================
# Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

"""Wikipedia Extractor:
Extracts and cleans text from a Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:

    <doc id="" url="" title="">
        ...
        </doc>

This version performs template expansion by preprocessing the whole dump and
collecting template definitions.
"""

import argparse
import bz2
import codecs
import fileinput
import logging
import os.path
import re  # TODO use regex when it will be standard
import sys
from io import StringIO
from multiprocessing import Queue, Process, cpu_count
from timeit import default_timer

from wikiextractor.extract import Extractor, ignoreTag

# ===========================================================================

# Program version
version = '3.0'

##
# Defined in <siteinfo>
# We include as default Template, when loading external template file.
knownNamespaces = set(['Template'])

##
# The namespace used for template definitions
# It is the name associated with namespace key=10 in the siteinfo header.
templateNamespace = ''
templatePrefix = ''

##
# The namespace used for module definitions
# It is the name associated with namespace key=828 in the siteinfo header.
moduleNamespace = ''

# This is obtained from <siteinfo>
urlbase = ''


# ----------------------------------------------------------------------
# Modules

# Only minimal support
# FIXME: import Lua modules.

modules = {
    'convert': {
        'convert': lambda x, u, *rest: x + ' ' + u,  # no conversion
    }
}
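
# Example: the 'convert' stub above performs no unit conversion; it simply joins
# value and unit, e.g. modules['convert']['convert']('5', 'km') returns '5 km',
# so a template call like {{convert|5|km}} is rendered verbatim as "5 km".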
# ----------------------------------------------------------------------
# Expand using WikiMedia API
# import json

# def expandTemplates(text):
#     """Expand templates invoking MediaWiki API"""
#     text = urlib.urlencodew(text.encode('utf-8'))
#     base = urlbase[:urlbase.rfind('/')]
#     url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text
#     exp = json.loads(urllib.urlopen(url))
#     return exp['expandtemplates']['*']

# ------------------------------------------------------------------------------
# Output


class NextFile(object):

    """
    Synchronous generation of next available file name.
    """

    filesPerDir = 100

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    def _dirname(self):
        char1 = self.dir_index % 26
        char2 = self.dir_index // 26 % 26  # integer division, so the result stays an int under Python 3
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def _filepath(self):
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)
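
    # Taken together, next() produces paths like <path_name>/AA/wiki_00 ... AA/wiki_99,
    # then <path_name>/AB/wiki_00, and so on: each two-letter directory holds
    # filesPerDir (100) files, and the directory name advances like a two-digit
    # base-26 counter.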


class OutputSplitter(object):

    """
    File-like object that splits output to multiple files of a given max size.
    """

    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: a NextFile object from which to obtain filenames
            to use.
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
        """
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write(self, data):
        self.reserve(len(data))
        self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        if self.compress:
            # text mode, so that str data can be written directly under Python 3
            return bz2.open(filename + '.bz2', 'wt', encoding='utf-8')
        else:
            return open(filename, 'w')
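
    # Note: reserve() is called before every write, so as soon as the current
    # file would exceed max_file_size the splitter closes it and opens the next
    # name supplied by NextFile; with compress=True each chunk becomes a
    # separate .bz2 file.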


# ----------------------------------------------------------------------
# READER

tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
#                    1     2               3      4
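# For a line such as "  <title>Foo</title>", the groups are: 1 = leading text,
# 2 = tag name ('title'), 3 = element text ('Foo'), 4 = an optional closing tag
# on the same line; a present group 4 is how open-close elements are detected.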


def load_templates(file, output_file=None):
    """
    Load templates from :param file:.
    :param output_file: file where to save templates and modules.
    """
    global templateNamespace, templatePrefix
    templatePrefix = templateNamespace + ':'
    global moduleNamespace, modulePrefix
    modulePrefix = moduleNamespace + ':'
    articles = 0
    page = []
    inText = False
    if output_file:
        output = codecs.open(output_file, 'wb', 'utf-8')
    for line in file:
        line = line.decode('utf-8')
        if '<' not in line:  # faster than doing re.search()
            if inText:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4:  # open-close
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            if not output_file and not templateNamespace:  # do not know it yet
                # we reconstruct it from the first title
                colon = title.find(':')
                if colon > 1:
                    templateNamespace = title[:colon]
                    templatePrefix = title[:colon + 1]
            # FIXME: should reconstruct also moduleNamespace
            if title.startswith(templatePrefix):
                define_template(title, page)
            # save templates and modules to file
            if output_file and (title.startswith(templatePrefix) or
                                title.startswith(modulePrefix)):
                output.write('<page>\n')
                output.write('   <title>%s</title>\n' % title)
                output.write('   <ns>10</ns>\n')
                output.write('   <text>')
                for line in page:
                    output.write(line)
                output.write('   </text>\n')
                output.write('</page>\n')
            page = []
            articles += 1
            if articles % 100000 == 0:
                logging.info("Preprocessed %d pages", articles)
    if output_file:
        output.close()
        logging.info("Saved %d templates to '%s'", len(templates), output_file)


def process_dump(input_file, template_file, out_file, file_size, file_compress,
                 process_count):
    """
    :param input_file: name of the wikipedia dump file; '-' to read from stdin
    :param template_file: optional file with template definitions.
    :param out_file: directory where to store extracted data, or '-' for stdout
    :param file_size: max size of each extracted file, or None for no max (one file)
    :param file_compress: whether to compress files with bzip.
    :param process_count: number of extraction processes to spawn.
    """
    global urlbase
    global knownNamespaces
    global templateNamespace, templatePrefix
    global moduleNamespace, modulePrefix

    if input_file == '-':
        input = sys.stdin
    else:
        input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)

    # collect siteinfo
    for line in input:
        line = line.decode('utf-8')
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'base':
            # discover urlbase from the xml dump file
            # /mediawiki/siteinfo/base
            base = m.group(3)
            urlbase = base[:base.rfind("/")]
        elif tag == 'namespace':
            knownNamespaces.add(m.group(3))
            if re.search('key="10"', line):
                templateNamespace = m.group(3)
                templatePrefix = templateNamespace + ':'
            elif re.search('key="828"', line):
                moduleNamespace = m.group(3)
                modulePrefix = moduleNamespace + ':'
        elif tag == '/siteinfo':
            break
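    # At this point the <siteinfo> header has been consumed: urlbase holds the
    # wiki base URL, knownNamespaces all declared namespaces, and the namespaces
    # with key="10"/key="828" give the Template:/Module: prefixes.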

    if expand_templates:
        # preprocess
        template_load_start = default_timer()
        if template_file and os.path.exists(template_file):
            logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
            file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
            load_templates(file)
            file.close()
        else:
            if input_file == '-':
                # can't scan then reset stdin; must error w/ suggestion to specify template_file
                raise ValueError("to use templates with stdin dump, must supply explicit template-file")
            logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
            load_templates(input, template_file)
            input.close()
            input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
        template_load_elapsed = default_timer() - template_load_start
        logging.info("Loaded %d templates in %.1fs", len(templates), template_load_elapsed)

    if out_file == '-':
        output = sys.stdout
        if file_compress:
            logging.warning("writing to stdout, so no output compression (use an external tool)")
    else:
        nextFile = NextFile(out_file)
        output = OutputSplitter(nextFile, file_size, file_compress)

    # process pages
    logging.info("Starting page extraction from %s.", input_file)
    extract_start = default_timer()

    # Parallel Map/Reduce:
    # - pages to be processed are dispatched to workers
    # - a reduce process collects the results, sorts them and prints them.
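    #
    # Data flow (as implemented below):
    #
    #   main process --(id, title, page, ordinal)--> jobs_queue --> extract_process workers
    #   extract_process --(ordinal, text)--> output_queue --> reduce_process --> output
    #
    # A None item on either queue is the shutdown signal for its consumer.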

    maxsize = 10 * process_count
    # output queue
    output_queue = Queue(maxsize=maxsize)

    # Reduce job that sorts and prints output
    reduce = Process(target=reduce_process, args=(output_queue, output))
    reduce.start()

    # initialize jobs queue
    jobs_queue = Queue(maxsize=maxsize)

    # start worker processes
    logging.info("Using %d extract processes.", process_count)
    workers = []
    for _ in range(max(1, process_count)):  # range, not the Python 2 xrange
        extractor = Process(target=extract_process,
                            args=(jobs_queue, output_queue))
        extractor.daemon = True  # only live while parent process lives
        extractor.start()
        workers.append(extractor)

    # Mapper process

    # we collect individual lines, since str.join() is significantly faster
    # than concatenation
    page = []
    id = None
    last_id = None
    ordinal = 0  # page count
    inText = False
    redirect = False
    for line in input:
        line = line.decode('utf-8')
        if '<' not in line:  # faster than doing re.search()
            if inText:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'redirect':
            redirect = True
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4:  # open-close
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            colon = title.find(':')
            if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
                    not redirect and not title.startswith(templateNamespace):
                job = (id, title, page, ordinal)
                jobs_queue.put(job)  # goes to any available extract_process
                last_id = id
                ordinal += 1
            id = None
            page = []

    input.close()

    # signal termination
    for _ in workers:
        jobs_queue.put(None)
    # wait for workers to terminate
    for w in workers:
        w.join()

    # signal end of work to reduce process
    output_queue.put(None)
    # wait for it to finish
    reduce.join()

    if output != sys.stdout:
        output.close()
    extract_duration = default_timer() - extract_start
    extract_rate = ordinal / extract_duration
    logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)",
                 process_count, ordinal, extract_duration, extract_rate)


# ----------------------------------------------------------------------
# Multiprocess support


def extract_process(jobs_queue, output_queue):
    """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
    :param jobs_queue: where to get jobs.
    :param output_queue: where to queue extracted text for output.
    """
    while True:
        job = jobs_queue.get()  # job is (id, title, page, ordinal)
        if job:
            out = StringIO()  # memory buffer
            Extractor(*job[:3]).extract(out)  # (id, title, page)
            text = out.getvalue()
            output_queue.put((job[3], text))  # (ordinal, extracted_text)
            out.close()
        else:
            break


def reduce_process(output_queue, output):
    """Pull finished article text, write series of files (or stdout)
    :param output_queue: text to be output.
    :param output: file object where to print.
    """

    interval_start = default_timer()
    period = 100000
    # FIXME: use a heap
    ordering_buffer = {}  # collected pages
    next_ordinal = 0  # sequence number of pages
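    # Workers may finish pages out of order, so each (ordinal, text) pair is
    # parked in ordering_buffer and only written once every earlier ordinal has
    # been written; this keeps the output in the same order as the input dump.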
    while True:
        if next_ordinal in ordering_buffer:
            output.write(ordering_buffer.pop(next_ordinal))
            next_ordinal += 1
            # progress report
            if next_ordinal % period == 0:
                interval_rate = period / (default_timer() - interval_start)
                logging.info("Extracted %d articles (%.1f art/s)",
                             next_ordinal, interval_rate)
                interval_start = default_timer()
        else:
            # mapper puts None to signal finish
            pair = output_queue.get()
            if not pair:
                break
            ordinal, text = pair
            ordering_buffer[ordinal] = text


# ----------------------------------------------------------------------

# Minimum size of output files
minFileSize = 200 * 1024


def main():
    global urlbase, acceptedNamespaces
    global expand_templates, templateCache, escape_doc

    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument("--escapedoc", action="store_true",
                        help="use to escape the contents of the output <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument("--processes", type=int, default=default_process_count,
                        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument("-a", "--article", action="store_true",
                        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.toHTML = args.html
    if args.html:
        Extractor.keepLinks = True

    expand_templates = args.no_templates
    escape_doc = args.escapedoc

    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024 ** power
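        # e.g. the default "1M": 'm' is found at index 1 in 'kmg', so power = 2
        # and file_size = 1 * 1024 ** 2 = 1048576 bytes; the size argument is
        # expected in the n[KMG] form declared by the -b/--bytes option above.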
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as file:
                    load_templates(file)

        with open(input_file) as file:
            page = file.read()  # text mode: already str, no decode needed under Python 3
            m = re.search(r'<id>(.*)</id>', page)
            id = m.group(1) if m else 0
            m = re.search(r'<title>(.*)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            Extractor(id, title, [page]).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes)


if __name__ == '__main__':
    main()
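
# Typical invocation (illustrative dump file name), using the options defined above:
#
#   python scripts/WikiExtractor.py enwiki-latest-pages-articles.xml.bz2 \
#          -o extracted -b 1M -c --processes 4
#
# which writes bzip2-compressed files of at most about 1 MB each under
# extracted/AA, extracted/AB, and so on.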
scripts/__init__.py (new empty file)
scripts/extractPage.py (new executable file): 125 lines
@@ -0,0 +1,125 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 3.0 (July 22, 2020)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa

# =============================================================================
# Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

"""Wikipedia Page Extractor:
Extracts a single page from a Wikipedia dump file.
"""

import sys, os.path
import re, random
import argparse
import logging, traceback
import urllib
import bz2, gzip
from html.entities import name2codepoint  # Python 3 home of htmlentitydefs
import queue, threading, multiprocessing  # Queue was renamed to queue in Python 3


# Program version
version = '3.0'

# ----------------------------------------------------------------------
# READER

tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
#tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)')
#                    1     2               3

def process_data(input_file, id, templates=False):
    """
    :param input_file: name of the wikipedia dump file.
    :param id: article id
    """

    if input_file.lower().endswith("bz2"):
        opener = bz2.BZ2File
    else:
        opener = open

    input = opener(input_file, 'rb')  # binary mode: lines are decoded explicitly below

    page = []
    for line in input:
        line = line.decode('utf-8')
        if '<' not in line:  # faster than doing re.search()
            if page:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
            page.append(line)
            inArticle = False
        elif tag == 'id':
            curid = m.group(3)
            if id == curid:
                page.append(line)
                inArticle = True
            elif not inArticle and not templates:
                page = []
        elif tag == 'title':
            if templates:
                if m.group(3).startswith('Template:'):
                    page.append(line)
                else:
                    page = []
            else:
                page.append(line)
        elif tag == '/page':
            if page:
                page.append(line)
                print(''.join(page))
                if not templates:
                    break
                page = []
        elif page:
            page.append(line)

    input.close()

def main():
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="XML wiki dump file")
    parser.add_argument("--id", default="",
                        help="article number")
    parser.add_argument("--template", action="store_true",
                        help="extract Template pages as well")
    parser.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    process_data(args.input, args.id, args.template)


if __name__ == '__main__':
    main()
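
# Typical invocation (illustrative dump file name and page id): print the raw XML
# of the page whose <id> is 12, using the options defined above:
#
#   python scripts/extractPage.py --id 12 itwiki-latest-pages-articles.xml.bz2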
setup.py (new file): 25 lines
@@ -0,0 +1,25 @@
from setuptools import setup
import re

from scripts.WikiExtractor import version


def to_semantic_version(version):
    if re.match(r'^\d+\.\d+$', version):
        return version + '.0'
    return version
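
# e.g. to_semantic_version('3.0') returns '3.0.0', while a value that is already
# three-part, such as '3.0.2', is returned unchanged.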

setup(
    name='wikiextractor',
    version=to_semantic_version(version),
    description='A tool for extracting plain text from Wikipedia dumps',
    packages=[
        'wikiextractor'
    ],
    install_requires=[
    ],
    tests_require=[
        'nose>=1.0',
    ],
    test_suite='nose.collector',
)
wikiextractor/__init__.py (new empty file)
wikiextractor/clean.py (new file): 48 lines
@@ -0,0 +1,48 @@
# =============================================================================
# Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags


def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param keep_links: Set to True to keep internal and external links
    :param ignore_headers: if set to True, the output list will not contain
        headers, only text paragraphs.

    Returns a list of paragraphs (unicode strings).
    """

    if not keep_links:
        ignoreTag('a')

    extractor = Extractor(0, '', [])

    # returns a list of strings
    paragraphs = extractor.clean_text(markup,
                                      mark_headers=True,
                                      expand_templates=False,
                                      escape_doc=True)
    resetIgnoredTags()

    if ignore_headers:
        # materialize the result so a list is returned, as documented
        paragraphs = [p for p in paragraphs if not p.startswith('## ')]

    return paragraphs
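
# Example (illustrative input; the exact output depends on Extractor.clean_text):
#
#   from wikiextractor.clean import clean_markup
#
#   paragraphs = clean_markup("'''Anarchism''' is a [[political philosophy]].")
#   # -> a list with one plain-text paragraph, roughly:
#   #    ["Anarchism is a political philosophy."]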
wikiextractor/extract.py (new file): 1726 lines (diff too large to display)