Merge branch 'master' of https://github.com/attardi/wikiextractor

2016-02-04 11:09:31 +01:00 · 2016-02-04 11:09:31 +01:00 · 49464c0210
commit 49464c0210
parent 3cebfdd4c0 0bb3061e79
2 changed files with 11 additions and 4 deletions
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 # wikiextractor
 [WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).

-The tool is written in Python and requires no additional library.
+The tool is written in Python and requires Python 2.7, but no additional libraries.
 **Warning**: problems have been reported on Windows due to poor support for `StringIO` in the Python implementation on Windows.

 For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
@ -47,6 +47,8 @@ Each file will contains several documents in this [document format](http://media
      --templates TEMPLATES
 			    use or create file containing templates
      --no-templates        Do not expand templates
+      --escapedoc           use to escape the contents of the output
+                            <doc>...</doc>

    Special:
      -q, --quiet           suppress reporting progress info
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@ -49,6 +49,7 @@ collecting template definitions.
 import argparse
 import bz2
 import codecs
+import cgi
 import fileinput
 import logging
 import os.path
@ -1936,7 +1937,7 @@ expand_templates = True

 def clean(extractor, text):
    """
-    Transforms wiki markup.
+    Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
    @see https://www.mediawiki.org/wiki/Help:Formatting
    """

@ -2034,7 +2035,8 @@ def clean(extractor, text):
    text = re.sub(u'(\[\(«) ', r'\1', text)
    text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
    text = text.replace(',,', ',').replace(',.', '.')
-
+    if escape_doc:
+        text = cgi.escape(text)
    return text


@ -2530,7 +2532,7 @@ minFileSize = 200 * 1024

 def main():
    global urlbase, acceptedNamespaces
-    global expand_templates, templateCache
+    global expand_templates, templateCache, escape_doc

    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
@ -2557,6 +2559,8 @@ def main():
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates", action="store_false",
                        help="Do not expand templates")
+    groupP.add_argument("--escapedoc", action="store_true",
+                        help="use to escape the contents of the output <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument("--processes", type=int, default=default_process_count,
                        help="Number of processes to use (default %(default)s)")
@ -2580,6 +2584,7 @@ def main():
        Extractor.keepLinks = True

    expand_templates = args.no_templates
+    escape_doc = args.escapedoc

    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1