Merge branch 'master' of https://github.com/attardi/wikiextractor
This commit is contained in:
commit
49464c0210
@ -1,7 +1,7 @@
|
|||||||
# wikiextractor
|
# wikiextractor
|
||||||
[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).
|
[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).
|
||||||
|
|
||||||
The tool is written in Python and requires no additional library.
|
The tool is written in Python and requires Python 2.7, but no additional libraries.
|
||||||
**Warning**: problems have been reported on Windows due to poor support for `StringIO` in the Python implementation on Windows.
|
**Warning**: problems have been reported on Windows due to poor support for `StringIO` in the Python implementation on Windows.
|
||||||
|
|
||||||
For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
|
For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
|
||||||
@ -47,6 +47,8 @@ Each file will contain several documents in this [document format](http://media
|
|||||||
--templates TEMPLATES
|
--templates TEMPLATES
|
||||||
use or create file containing templates
|
use or create file containing templates
|
||||||
--no-templates Do not expand templates
|
--no-templates Do not expand templates
|
||||||
|
--escapedoc use to escape the contents of the output
|
||||||
|
<doc>...</doc>
|
||||||
|
|
||||||
Special:
|
Special:
|
||||||
-q, --quiet suppress reporting progress info
|
-q, --quiet suppress reporting progress info
|
||||||
|
@ -49,6 +49,7 @@ collecting template definitions.
|
|||||||
import argparse
|
import argparse
|
||||||
import bz2
|
import bz2
|
||||||
import codecs
|
import codecs
|
||||||
|
import cgi
|
||||||
import fileinput
|
import fileinput
|
||||||
import logging
|
import logging
|
||||||
import os.path
|
import os.path
|
||||||
@ -1936,7 +1937,7 @@ expand_templates = True
|
|||||||
|
|
||||||
def clean(extractor, text):
|
def clean(extractor, text):
|
||||||
"""
|
"""
|
||||||
Transforms wiki markup.
|
Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
|
||||||
@see https://www.mediawiki.org/wiki/Help:Formatting
|
@see https://www.mediawiki.org/wiki/Help:Formatting
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -2034,7 +2035,8 @@ def clean(extractor, text):
|
|||||||
text = re.sub(u'(\[\(«) ', r'\1', text)
|
text = re.sub(u'(\[\(«) ', r'\1', text)
|
||||||
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
||||||
text = text.replace(',,', ',').replace(',.', '.')
|
text = text.replace(',,', ',').replace(',.', '.')
|
||||||
|
if escape_doc:
|
||||||
|
text = cgi.escape(text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
@ -2530,7 +2532,7 @@ minFileSize = 200 * 1024
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
global urlbase, acceptedNamespaces
|
global urlbase, acceptedNamespaces
|
||||||
global expand_templates, templateCache
|
global expand_templates, templateCache, escape_doc
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
@ -2557,6 +2559,8 @@ def main():
|
|||||||
help="use or create file containing templates")
|
help="use or create file containing templates")
|
||||||
groupP.add_argument("--no-templates", action="store_false",
|
groupP.add_argument("--no-templates", action="store_false",
|
||||||
help="Do not expand templates")
|
help="Do not expand templates")
|
||||||
|
groupP.add_argument("--escapedoc", action="store_true",
|
||||||
|
help="use to escape the contents of the output <doc>...</doc>")
|
||||||
default_process_count = cpu_count() - 1
|
default_process_count = cpu_count() - 1
|
||||||
parser.add_argument("--processes", type=int, default=default_process_count,
|
parser.add_argument("--processes", type=int, default=default_process_count,
|
||||||
help="Number of processes to use (default %(default)s)")
|
help="Number of processes to use (default %(default)s)")
|
||||||
@ -2580,6 +2584,7 @@ def main():
|
|||||||
Extractor.keepLinks = True
|
Extractor.keepLinks = True
|
||||||
|
|
||||||
expand_templates = args.no_templates
|
expand_templates = args.no_templates
|
||||||
|
escape_doc = args.escapedoc
|
||||||
|
|
||||||
try:
|
try:
|
||||||
power = 'kmg'.find(args.bytes[-1].lower()) + 1
|
power = 'kmg'.find(args.bytes[-1].lower()) + 1
|
||||||
|
Loading…
Reference in New Issue
Block a user