This commit is contained in:
attardi 2016-02-04 11:09:31 +01:00
commit 49464c0210
2 changed files with 11 additions and 4 deletions

View File

@ -1,7 +1,7 @@
# wikiextractor # wikiextractor
[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/). [WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).
The tool is written in Python and requires no additional library. The tool is written in Python and requires Python 2.7 but no additional library.
**Warning**: problems have been reported on Windows due to poor support for `StringIO` in the Python implementation on Windows. **Warning**: problems have been reported on Windows due to poor support for `StringIO` in the Python implementation on Windows.
For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki). For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
@ -47,6 +47,8 @@ Each file will contain several documents in this [document format](http://media
--templates TEMPLATES --templates TEMPLATES
use or create file containing templates use or create file containing templates
--no-templates Do not expand templates --no-templates Do not expand templates
--escapedoc use to escape the contents of the output
<doc>...</doc>
Special: Special:
-q, --quiet suppress reporting progress info -q, --quiet suppress reporting progress info

View File

@ -49,6 +49,7 @@ collecting template definitions.
import argparse import argparse
import bz2 import bz2
import codecs import codecs
import cgi
import fileinput import fileinput
import logging import logging
import os.path import os.path
@ -1936,7 +1937,7 @@ expand_templates = True
def clean(extractor, text): def clean(extractor, text):
""" """
Transforms wiki markup. Transforms wiki markup. If the command-line flag --escapedoc is set, the text is also escaped.
@see https://www.mediawiki.org/wiki/Help:Formatting @see https://www.mediawiki.org/wiki/Help:Formatting
""" """
@ -2034,7 +2035,8 @@ def clean(extractor, text):
text = re.sub(u'(\[\(«) ', r'\1', text) text = re.sub(u'(\[\(«) ', r'\1', text)
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
text = text.replace(',,', ',').replace(',.', '.') text = text.replace(',,', ',').replace(',.', '.')
if escape_doc:
text = cgi.escape(text)
return text return text
@ -2530,7 +2532,7 @@ minFileSize = 200 * 1024
def main(): def main():
global urlbase, acceptedNamespaces global urlbase, acceptedNamespaces
global expand_templates, templateCache global expand_templates, templateCache, escape_doc
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
@ -2557,6 +2559,8 @@ def main():
help="use or create file containing templates") help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false", groupP.add_argument("--no-templates", action="store_false",
help="Do not expand templates") help="Do not expand templates")
groupP.add_argument("--escapedoc", action="store_true",
help="use to escape the contents of the output <doc>...</doc>")
default_process_count = cpu_count() - 1 default_process_count = cpu_count() - 1
parser.add_argument("--processes", type=int, default=default_process_count, parser.add_argument("--processes", type=int, default=default_process_count,
help="Number of processes to use (default %(default)s)") help="Number of processes to use (default %(default)s)")
@ -2580,6 +2584,7 @@ def main():
Extractor.keepLinks = True Extractor.keepLinks = True
expand_templates = args.no_templates expand_templates = args.no_templates
escape_doc = args.escapedoc
try: try:
power = 'kmg'.find(args.bytes[-1].lower()) + 1 power = 'kmg'.find(args.bytes[-1].lower()) + 1