Upgrade to Python 3.3+.

attardi 2020-07-22 14:12:37 +02:00
parent 6408a430fc
commit 62bdbe6106
12 changed files with 2570 additions and 39899 deletions

.gitignore

@@ -1,93 +1,52 @@
local/
tmp/
### https://raw.github.com/github/gitignore/c699a4f4684e9e294c9c550f820ca330f019b6f9/python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
# Packages
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
.tox
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# Flask instance folder
instance/
/docs/_build
.idea
*.iml
# Scrapy stuff:
.scrapy
.travis-solo
G*
*.db
*.mdb
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Editor files
*.idea
# Vim
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~

README.md

@@ -1,7 +1,8 @@
# WikiExtractor
[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).
-The tool is written in Python and requires Python 2.7 or Python 3.3+ but no additional library.
+The tool is written in Python and requires Python 3.7 but no additional library.
**Warning**: problems have been reported on Windows, due to poor support for `StringIO` in the Python implementation on that platform.
For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
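
A minimal invocation sketch (the dump file name and output directory are placeholders; only options defined by the script's own argument parser further down in this diff are used):

```python
# Hedged example: run the extractor on a dump via subprocess.
# Assumes it is run from the repository root so the wikiextractor package is importable.
import subprocess

subprocess.run([
    "python", "scripts/WikiExtractor.py",
    "enwiki-latest-pages-articles.xml.bz2",  # placeholder input dump (bz2/gz handled via fileinput)
    "-o", "extracted",                       # output directory ('-' for stdout)
    "-b", "1M",                              # max bytes per output file
    "--processes", "4",                      # number of extraction processes
], check=True)
```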

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -11,15 +11,15 @@
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License, version 3,
+# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
+# GNU Affero General Public License for more details.
#
-# You should have received a copy of the GNU General Public License
+# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
@@ -43,7 +43,7 @@ import gzip
import logging
# Program version
-version = '1.00'
+version = '3.0'
urlbase = 'http://it.wikipedia.org/'

scripts/WikiExtractor.py (new executable file, 608 lines)

@@ -0,0 +1,608 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# =============================================================================
# Version: 3.0 (July 22, 2020)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# Contributors:
# Antonio Fuschetto (fuschett@aol.com)
# Leonardo Souza (lsouza@amtera.com.br)
# Juan Manuel Caicedo (juan@cavorite.com)
# Humberto Pereira (begini@gmail.com)
# Siegfried-A. Gevatter (siegfried@gevatter.com)
# Pedro Assis (pedroh2306@gmail.com)
# Wim Muskee (wimmuskee@gmail.com)
# Radics Geza (radicsge@gmail.com)
#
# =============================================================================
# Copyright (c) 2009-2020. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
"""Wikipedia Extractor:
Extracts and cleans text from a Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" url="" title="">
...
</doc>
This version performs template expansion by preprocessing the whole dump and
collecting template definitions.
"""
import argparse
import bz2
import codecs
import fileinput
import logging
import os.path
import re # TODO use regex when it will be standard
import sys
from io import StringIO
from multiprocessing import Queue, Process, cpu_count
from timeit import default_timer
from wikiextractor.extract import Extractor, ignoreTag
# ===========================================================================
# Program version
version = '3.0'
##
# Defined in <siteinfo>
# 'Template' is included by default, for use when loading an external template file.
knownNamespaces = set(['Template'])
##
# The namespace used for template definitions
# It is the name associated with namespace key=10 in the siteinfo header.
templateNamespace = ''
templatePrefix = ''
##
# The namespace used for module definitions
# It is the name associated with namespace key=828 in the siteinfo header.
moduleNamespace = ''
# This is obtained from <siteinfo>
urlbase = ''
# ----------------------------------------------------------------------
# Modules
# Only minimal support
# FIXME: import Lua modules.
modules = {
'convert': {
'convert': lambda x, u, *rest: x + ' ' + u, # no conversion
}
}
# ----------------------------------------------------------------------
# Expand using WikiMedia API
# import json
# def expandTemplates(text):
# """Expand templates invoking MediaWiki API"""
#     text = urllib.parse.quote(text)
# base = urlbase[:urlbase.rfind('/')]
# url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text
#     exp = json.loads(urllib.request.urlopen(url).read())
# return exp['expandtemplates']['*']
# ------------------------------------------------------------------------------
# Output
class NextFile(object):
"""
Synchronous generation of next available file name.
"""
filesPerDir = 100
def __init__(self, path_name):
self.path_name = path_name
self.dir_index = -1
self.file_index = -1
def next(self):
self.file_index = (self.file_index + 1) % NextFile.filesPerDir
if self.file_index == 0:
self.dir_index += 1
dirname = self._dirname()
if not os.path.isdir(dirname):
os.makedirs(dirname)
return self._filepath()
def _dirname(self):
char1 = self.dir_index % 26
        char2 = self.dir_index // 26 % 26  # integer division (Python 3)
return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
def _filepath(self):
return '%s/wiki_%02d' % (self._dirname(), self.file_index)
class OutputSplitter(object):
"""
    File-like object that splits output into multiple files of a given max size.
"""
def __init__(self, nextFile, max_file_size=0, compress=True):
"""
:param nextFile: a NextFile object from which to obtain filenames
to use.
:param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
"""
self.nextFile = nextFile
self.compress = compress
self.max_file_size = max_file_size
self.file = self.open(self.nextFile.next())
def reserve(self, size):
if self.file.tell() + size > self.max_file_size:
self.close()
self.file = self.open(self.nextFile.next())
    def write(self, data):
        self.reserve(len(data))
        if self.compress:
            self.file.write(data.encode('utf-8'))  # BZ2File is opened in binary mode and expects bytes
        else:
            self.file.write(data)
def close(self):
self.file.close()
def open(self, filename):
if self.compress:
return bz2.BZ2File(filename + '.bz2', 'w')
else:
return open(filename, 'w')
# ----------------------------------------------------------------------
# READER
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
# groups: 1 = text before the tag, 2 = tag name, 3 = element text, 4 = trailing tag
def load_templates(file, output_file=None):
"""
Load templates from :param file:.
:param output_file: file where to save templates and modules.
"""
global templateNamespace, templatePrefix
templatePrefix = templateNamespace + ':'
global moduleNamespace, modulePrefix
modulePrefix = moduleNamespace + ':'
articles = 0
page = []
inText = False
if output_file:
output = codecs.open(output_file, 'wb', 'utf-8')
for line in file:
        line = line.decode('utf-8') if isinstance(line, bytes) else line  # hook_compressed may yield bytes
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
continue
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'page':
page = []
elif tag == 'title':
title = m.group(3)
elif tag == 'text':
inText = True
line = line[m.start(3):m.end(3)]
page.append(line)
if m.lastindex == 4: # open-close
inText = False
elif tag == '/text':
if m.group(1):
page.append(m.group(1))
inText = False
elif inText:
page.append(line)
elif tag == '/page':
if not output_file and not templateNamespace: # do not know it yet
# we reconstruct it from the first title
colon = title.find(':')
if colon > 1:
templateNamespace = title[:colon]
templatePrefix = title[:colon + 1]
# FIXME: should reconstruct also moduleNamespace
if title.startswith(templatePrefix):
define_template(title, page)
# save templates and modules to file
if output_file and (title.startswith(templatePrefix) or
title.startswith(modulePrefix)):
output.write('<page>\n')
output.write(' <title>%s</title>\n' % title)
output.write(' <ns>10</ns>\n')
output.write(' <text>')
for line in page:
output.write(line)
output.write(' </text>\n')
output.write('</page>\n')
page = []
articles += 1
if articles % 100000 == 0:
logging.info("Preprocessed %d pages", articles)
if output_file:
output.close()
logging.info("Saved %d templates to '%s'", len(templates), output_file)
def process_dump(input_file, template_file, out_file, file_size, file_compress,
process_count):
"""
:param input_file: name of the wikipedia dump file; '-' to read from stdin
:param template_file: optional file with template definitions.
:param out_file: directory where to store extracted data, or '-' for stdout
:param file_size: max size of each extracted file, or None for no max (one file)
:param file_compress: whether to compress files with bzip.
:param process_count: number of extraction processes to spawn.
"""
global urlbase
global knownNamespaces
global templateNamespace, templatePrefix
global moduleNamespace, modulePrefix
if input_file == '-':
input = sys.stdin
else:
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
# collect siteinfo
for line in input:
        line = line.decode('utf-8') if isinstance(line, bytes) else line  # stdin yields str, compressed files yield bytes
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'base':
# discover urlbase from the xml dump file
# /mediawiki/siteinfo/base
base = m.group(3)
urlbase = base[:base.rfind("/")]
elif tag == 'namespace':
knownNamespaces.add(m.group(3))
if re.search('key="10"', line):
templateNamespace = m.group(3)
templatePrefix = templateNamespace + ':'
elif re.search('key="828"', line):
moduleNamespace = m.group(3)
modulePrefix = moduleNamespace + ':'
elif tag == '/siteinfo':
break
if expand_templates:
# preprocess
template_load_start = default_timer()
if template_file and os.path.exists(template_file):
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
load_templates(file)
file.close()
else:
if input_file == '-':
# can't scan then reset stdin; must error w/ suggestion to specify template_file
raise ValueError("to use templates with stdin dump, must supply explicit template-file")
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
load_templates(input, template_file)
input.close()
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
template_load_elapsed = default_timer() - template_load_start
logging.info("Loaded %d templates in %.1fs", len(templates), template_load_elapsed)
if out_file == '-':
output = sys.stdout
if file_compress:
            logging.warning("writing to stdout, so no output compression (use an external tool)")
else:
nextFile = NextFile(out_file)
output = OutputSplitter(nextFile, file_size, file_compress)
# process pages
logging.info("Starting page extraction from %s.", input_file)
extract_start = default_timer()
# Parallel Map/Reduce:
# - pages to be processed are dispatched to workers
# - a reduce process collects the results, sort them and print them.
maxsize = 10 * process_count
# output queue
output_queue = Queue(maxsize=maxsize)
# Reduce job that sorts and prints output
reduce = Process(target=reduce_process, args=(output_queue, output))
reduce.start()
# initialize jobs queue
jobs_queue = Queue(maxsize=maxsize)
# start worker processes
logging.info("Using %d extract processes.", process_count)
workers = []
    for _ in range(max(1, process_count)):
extractor = Process(target=extract_process,
args=(jobs_queue, output_queue))
extractor.daemon = True # only live while parent process lives
extractor.start()
workers.append(extractor)
# Mapper process
# we collect individual lines, since str.join() is significantly faster
# than concatenation
page = []
id = None
last_id = None
ordinal = 0 # page count
inText = False
redirect = False
for line in input:
        line = line.decode('utf-8') if isinstance(line, bytes) else line  # hook_compressed may yield bytes
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
continue
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'page':
page = []
redirect = False
elif tag == 'id' and not id:
id = m.group(3)
elif tag == 'title':
title = m.group(3)
elif tag == 'redirect':
redirect = True
elif tag == 'text':
inText = True
line = line[m.start(3):m.end(3)]
page.append(line)
if m.lastindex == 4: # open-close
inText = False
elif tag == '/text':
if m.group(1):
page.append(m.group(1))
inText = False
elif inText:
page.append(line)
elif tag == '/page':
colon = title.find(':')
if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
not redirect and not title.startswith(templateNamespace):
job = (id, title, page, ordinal)
jobs_queue.put(job) # goes to any available extract_process
last_id = id
ordinal += 1
id = None
page = []
input.close()
# signal termination
for _ in workers:
jobs_queue.put(None)
# wait for workers to terminate
for w in workers:
w.join()
# signal end of work to reduce process
output_queue.put(None)
# wait for it to finish
reduce.join()
if output != sys.stdout:
output.close()
extract_duration = default_timer() - extract_start
extract_rate = ordinal / extract_duration
logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)",
process_count, ordinal, extract_duration, extract_rate)
# ----------------------------------------------------------------------
# Multiprocess support
def extract_process(jobs_queue, output_queue):
"""Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
:param jobs_queue: where to get jobs.
:param output_queue: where to queue extracted text for output.
"""
while True:
job = jobs_queue.get() # job is (id, title, page, ordinal)
if job:
out = StringIO() # memory buffer
Extractor(*job[:3]).extract(out) # (id, title, page)
text = out.getvalue()
output_queue.put((job[3], text)) # (ordinal, extracted_text)
out.close()
else:
break
def reduce_process(output_queue, output):
"""Pull finished article text, write series of files (or stdout)
:param output_queue: text to be output.
:param output: file object where to print.
"""
interval_start = default_timer()
period = 100000
# FIXME: use a heap
ordering_buffer = {} # collected pages
next_ordinal = 0 # sequence number of pages
while True:
if next_ordinal in ordering_buffer:
output.write(ordering_buffer.pop(next_ordinal))
next_ordinal += 1
# progress report
if next_ordinal % period == 0:
interval_rate = period / (default_timer() - interval_start)
logging.info("Extracted %d articles (%.1f art/s)",
next_ordinal, interval_rate)
interval_start = default_timer()
else:
# mapper puts None to signal finish
pair = output_queue.get()
if not pair:
break
ordinal, text = pair
ordering_buffer[ordinal] = text
# ----------------------------------------------------------------------
# Minimum size of output files
minFileSize = 200 * 1024
def main():
global urlbase, acceptedNamespaces
global expand_templates, templateCache, escape_doc
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=__doc__)
parser.add_argument("input",
help="XML wiki dump file")
groupO = parser.add_argument_group('Output')
groupO.add_argument("-o", "--output", default="text",
help="directory for extracted files (or '-' for dumping to stdout)")
groupO.add_argument("-b", "--bytes", default="1M",
help="maximum bytes per output file (default %(default)s)",
metavar="n[KMG]")
groupO.add_argument("-c", "--compress", action="store_true",
help="compress output files using bzip")
groupP = parser.add_argument_group('Processing')
groupP.add_argument("--html", action="store_true",
help="produce HTML output, subsumes --links")
groupP.add_argument("-l", "--links", action="store_true",
help="preserve links")
groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
help="accepted namespaces")
groupP.add_argument("--templates",
help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false",
help="Do not expand templates")
groupP.add_argument("--escapedoc", action="store_true",
help="use to escape the contents of the output <doc>...</doc>")
default_process_count = cpu_count() - 1
parser.add_argument("--processes", type=int, default=default_process_count,
help="Number of processes to use (default %(default)s)")
groupS = parser.add_argument_group('Special')
groupS.add_argument("-q", "--quiet", action="store_true",
help="suppress reporting progress info")
groupS.add_argument("--debug", action="store_true",
help="print debug info")
groupS.add_argument("-a", "--article", action="store_true",
help="analyze a file containing a single article (debug option)")
groupS.add_argument("-v", "--version", action="version",
version='%(prog)s ' + version,
help="print program version")
args = parser.parse_args()
Extractor.keepLinks = args.links
Extractor.toHTML = args.html
if args.html:
Extractor.keepLinks = True
expand_templates = args.no_templates
escape_doc = args.escapedoc
try:
power = 'kmg'.find(args.bytes[-1].lower()) + 1
file_size = int(args.bytes[:-1]) * 1024 ** power
if file_size < minFileSize:
raise ValueError()
except ValueError:
logging.error('Insufficient or invalid size: %s', args.bytes)
return
if args.namespaces:
acceptedNamespaces = set(args.namespaces.split(','))
FORMAT = '%(levelname)s: %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger()
if not args.quiet:
logger.setLevel(logging.INFO)
if args.debug:
logger.setLevel(logging.DEBUG)
input_file = args.input
if not Extractor.keepLinks:
ignoreTag('a')
# sharing cache of parser templates is too slow:
# manager = Manager()
# templateCache = manager.dict()
if args.article:
if args.templates:
if os.path.exists(args.templates):
with open(args.templates) as file:
load_templates(file)
with open(input_file) as file:
            page = file.read()  # text mode already returns str in Python 3
m = re.search(r'<id>(.*)</id>', page)
id = m.group(1) if m else 0
m = re.search(r'<title>(.*)</title>', page)
if m:
title = m.group(1)
else:
logging.error('Missing title element')
return
Extractor(id, title, [page]).extract(sys.stdout)
return
output_path = args.output
if output_path != '-' and not os.path.isdir(output_path):
try:
os.makedirs(output_path)
        except OSError:
logging.error('Could not create: %s', output_path)
return
process_dump(input_file, args.templates, output_path, file_size,
args.compress, args.processes)
if __name__ == '__main__':
main()
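
As a side note, a small sketch (not from the script itself) of the output layout implied by the NextFile class above, assuming the default filesPerDir of 100: directories are named AA, AB, ..., each holding files wiki_00 through wiki_99.

```python
# Illustration only: mirrors NextFile's directory/file naming scheme.
import os

def nth_output_path(base, n, files_per_dir=100):
    """Return the path NextFile would assign to the n-th output file (0-based)."""
    dir_index = n // files_per_dir
    char1 = dir_index % 26
    char2 = dir_index // 26 % 26
    dirname = '%c%c' % (ord('A') + char2, ord('A') + char1)
    return os.path.join(base, dirname, 'wiki_%02d' % (n % files_per_dir))

print(nth_output_path('text', 0))    # text/AA/wiki_00
print(nth_output_path('text', 205))  # text/AC/wiki_05
```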

scripts/__init__.py (new empty file)

scripts/extractPage.py (new executable file, 125 lines)

@@ -0,0 +1,125 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 3.0 (July 22, 2020)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
# =============================================================================
# Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
"""Wikipedia Page Extractor:
Extracts a single page from a Wikipedia dump file.
"""
import sys, os.path
import re, random
import argparse
# itertools.izip was removed in Python 3; the built-in zip is already lazy
import logging, traceback
import urllib.request
import bz2, gzip
from html.entities import name2codepoint  # Python 3 name for htmlentitydefs
import queue, threading, multiprocessing  # Python 3 name for Queue
# Program version
version = '3.0'
# ----------------------------------------------------------------------
# READER
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
#tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)')
# (groups: 1 = text before the tag, 2 = tag name, 3 = element text)
def process_data(input_file, id, templates=False):
"""
:param input_file: name of the wikipedia dump file.
:param id: article id
"""
if input_file.lower().endswith("bz2"):
opener = bz2.BZ2File
else:
opener = open
input = opener(input_file)
page = []
for line in input:
        line = line.decode('utf-8') if isinstance(line, bytes) else line  # bz2 yields bytes, plain files yield str
if '<' not in line: # faster than doing re.search()
if page:
page.append(line)
continue
m = tagRE.search(line)
if not m:
continue
tag = m.group(2)
if tag == 'page':
page = []
page.append(line)
inArticle = False
elif tag == 'id':
curid = m.group(3)
if id == curid:
page.append(line)
inArticle = True
elif not inArticle and not templates:
page = []
elif tag == 'title':
if templates:
if m.group(3).startswith('Template:'):
page.append(line)
else:
page = []
else:
page.append(line)
elif tag == '/page':
if page:
page.append(line)
                print(''.join(page))
if not templates:
break
page = []
elif page:
page.append(line)
input.close()
def main():
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=__doc__)
parser.add_argument("input",
help="XML wiki dump file")
parser.add_argument("--id", default="",
help="article number")
parser.add_argument("--template", action="store_true",
help="template number")
parser.add_argument("-v", "--version", action="version",
version='%(prog)s ' + version,
help="print program version")
args = parser.parse_args()
process_data(args.input, args.id, args.template)
if __name__ == '__main__':
main()
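
A short usage sketch for the process_data function above (the dump path and article id are placeholders); it prints the matching &lt;page&gt; element to stdout:

```python
# Hypothetical example; assumes the repository root is on sys.path.
from scripts.extractPage import process_data

process_data("enwiki-latest-pages-articles.xml.bz2", "12345")
```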

setup.py (new file, 25 lines)

@@ -0,0 +1,25 @@
from setuptools import setup
import re
from scripts.WikiExtractor import version
def to_semantic_version(version):
if re.match(r'^\d+\.\d+$', version):
return version + '.0'
return version
setup(
name='wikiextractor',
version=to_semantic_version(version),
description='A tool for extracting plain text from Wikipedia dumps',
packages=[
'wikiextractor'
],
install_requires=[
],
tests_require=[
'nose>=1.0',
],
test_suite='nose.collector',
)


wikiextractor/clean.py (new file, 48 lines)

@@ -0,0 +1,48 @@
# =============================================================================
# Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags
def clean_markup(markup, keep_links=False, ignore_headers=True):
"""
Clean Wikimarkup to produce plaintext.
    :param keep_links: Set to True to keep internal and external links.
    :param ignore_headers: if set to True, the output list will not contain
        headers, only regular paragraphs.
    Returns a list of paragraphs (plain strings).
"""
if not keep_links:
ignoreTag('a')
extractor = Extractor(0, '', [])
# returns a list of strings
paragraphs = extractor.clean_text(markup,
mark_headers=True,
expand_templates=False,
escape_doc=True)
resetIgnoredTags()
if ignore_headers:
        paragraphs = [p for p in paragraphs if not p.startswith('## ')]  # a list, not a lazy filter object
return paragraphs
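
A usage sketch for clean_markup (the sample markup string is made up; assumes the wikiextractor package is importable):

```python
# Hypothetical example: clean a snippet of wiki markup into plain paragraphs.
from wikiextractor.clean import clean_markup

sample = "'''Python''' is a [[programming language]] created by [[Guido van Rossum]]."
for paragraph in clean_markup(sample, keep_links=False):
    print(paragraph)
```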

wikiextractor/extract.py (new file, 1726 lines)

File diff suppressed because it is too large