See ChangeLog.

2015-04-20 06:56:29 +02:00 · 2015-04-20 06:56:29 +02:00 · cae955eb91
commit cae955eb91
parent 4c90d10860
3 changed files with 53 additions and 40 deletions
--- a/1
+++ b/1
@@ -4,6 +4,7 @@
 	(make_anchor_tag): dont use splitParameters() since we must
 	consider also single [, which do not count ib parameters.
 	(sharp_switch): use split() to split at fist = sign.
+	(main): grouped command options.

 2015-04-19  Giuseppe Attardi  <attardi@di.unipi.it>

--- a/README.md
+++ b/README.md
@@ -5,38 +5,49 @@ The tool is written in Python and requires no additional library.

 For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).

-The current beta version of WikiExtrctor.py is capable of performing template expansion to some extent.
+This is a beta version that performs template expansion by preprocessing the
+whole dump and extracting template definitions.

 ## Usage
 The script is invoked with a Wikipedia dump file as an argument.
 The output is stored in a number of files of similar size in a chosen directory.
 Each file will contains several documents in this [document format](http://medialab.di.unipi.it/wiki/Document_Format).

-This is a beta version that performs template expansion by preprocesssng the
-whole dump and extracting template definitions.
+    usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--html] [-l]
+			    [-ns ns1,ns2] [-s] [--templates TEMPLATES]
+			    [--no-templates] [--threads THREADS] [-q] [--debug]
+			    [-a] [-v]
+			    input

-    Usage:
-     WikiExtractor.py [options] xml-dump-file
+    positional arguments:
+      input                 XML wiki dump file

    optional arguments:
      -h, --help            show this help message and exit
+      --threads THREADS     Number of threads to use (default 2)
+
+    Output:
      -o OUTPUT, --output OUTPUT
-                            output directory
+			    output directory
      -b n[KMG], --bytes n[KMG]
-                            put specified bytes per output file (default is 1M)
-      -B BASE, --base BASE  base URL for the Wikipedia pages
+			    put specified bytes per output file (default is 1M)
      -c, --compress        compress output files using bzip
+
+    Processing:
+      --html                produce HTML output, subsumes --links and --sections
      -l, --links           preserve links
      -ns ns1,ns2, --namespaces ns1,ns2
-                            accepted namespaces
+			    accepted namespaces
+      -s, --sections        preserve sections
+      --templates TEMPLATES
+			    use or create file containing templates
+      --no-templates        Do not expand templates
+
+    Special:
      -q, --quiet           suppress reporting progress info
      --debug               print debug info
-      -s, --sections        preserve sections
-      -a, --article         analyze a file containing a single article
-      --templates TEMPLATES
-                            use or create file containing templates
-      --no-templates        Do not expand templates
-      --threads THREADS     Number of threads to use (default 8)
+      -a, --article         analyze a file containing a single article (debug)
+			    option
      -v, --version         print program version

 Saving templates to a file will speed up performing extraction the next time,
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # =============================================================================
-#  Version: 2.24 (Apr 19, 2015)
+#  Version: 2.25 (Apr 20, 2015)
 #  Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
 #	   Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
 #
@@ -61,7 +61,7 @@ import Queue, threading, multiprocessing
 #===========================================================================

 # Program version
-version = '2.24'
+version = '2.25'

 ### PARAMS ####################################################################

@@ -1486,7 +1486,7 @@ def clean(extractor, text):
        text = bold.sub(r'\1', text)
        text = italic_quote.sub(r'"\1"', text)
        text = italic.sub(r'"\1"', text)
-        text = quote_quote.sub(r'\1', text)
+        text = quote_quote.sub(r'"\1"', text)
    # residuals of unbalanced quotes
    text = text.replace("'''", '').replace("''", '"')

@@ -1936,37 +1936,38 @@ def main():
                                     description=__doc__)
    parser.add_argument("input",
                        help="XML wiki dump file")
-    parser.add_argument("-o", "--output", default="text",
+    groupO = parser.add_argument_group('Output')
+    groupO.add_argument("-o", "--output", default="text",
                        help="output directory")
-    parser.add_argument("-b", "--bytes", default="1M",
+    groupO.add_argument("-b", "--bytes", default="1M",
                        help="put specified bytes per output file (default is %(default)s)", metavar="n[KMG]")
-    parser.add_argument("-B", "--base",
-                        help="base URL for the Wikipedia pages")
-    parser.add_argument("-c", "--compress", action="store_true",
+    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")
-    parser.add_argument("-l", "--links", action="store_true",
+
+    groupP = parser.add_argument_group('Processing')
+    groupP.add_argument("--html", action="store_true",
+                        help="produce HTML output, subsumes --links and --sections")
+    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
-    parser.add_argument("-s", "--sections", action="store_true",
-                        help="preserve sections")
-    parser.add_argument("--html", action="store_true",
-                        help="Produce HTML output, subsumes --links and --sections")
-    parser.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
+    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")
-    parser.add_argument("-q", "--quiet", action="store_true",
-                        help="suppress reporting progress info")
-    parser.add_argument("--debug", action="store_true",
-                        help="print debug info")
-    parser.add_argument("-a", "--article", action="store_true",
-                        help="analyze a file containing a single article (debug) option")
-    # parser.add_argument("-f", "--format", choices=(PLAIN, JSON), default=PLAIN,
-    #                     help="choose output format default is %(default)s")
-    parser.add_argument("--templates",
+    groupP.add_argument("-s", "--sections", action="store_true",
+                        help="preserve sections")
+    groupP.add_argument("--templates",
                        help="use or create file containing templates")
-    parser.add_argument("--no-templates", action="store_false",
+    groupP.add_argument("--no-templates", action="store_false",
                        help="Do not expand templates")
    parser.add_argument("--threads", type=int, default=2,
                        help="Number of threads to use (default 2)")
-    parser.add_argument("-v", "--version", action="version",
+
+    groupS = parser.add_argument_group('Special')
+    groupS.add_argument("-q", "--quiet", action="store_true",
+                        help="suppress reporting progress info")
+    groupS.add_argument("--debug", action="store_true",
+                        help="print debug info")
+    groupS.add_argument("-a", "--article", action="store_true",
+                        help="analyze a file containing a single article (debug) option")
+    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")