See ChangeLog.
This commit is contained in:
parent
4c90d10860
commit
cae955eb91
@ -4,6 +4,7 @@
|
||||
(make_anchor_tag): don't use splitParameters() since we must
|
||||
consider also single [, which do not count in parameters.
|
||||
(sharp_switch): use split() to split at first = sign.
|
||||
(main): grouped command options.
|
||||
|
||||
2015-04-19 Giuseppe Attardi <attardi@di.unipi.it>
|
||||
|
||||
|
41
README.md
41
README.md
@ -5,38 +5,49 @@ The tool is written in Python and requires no additional library.
|
||||
|
||||
For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
|
||||
|
||||
The current beta version of WikiExtractor.py is capable of performing template expansion to some extent.
|
||||
This is a beta version that performs template expansion by preprocessing the
|
||||
whole dump and extracting template definitions.
|
||||
|
||||
## Usage
|
||||
The script is invoked with a Wikipedia dump file as an argument.
|
||||
The output is stored in a number of files of similar size in a chosen directory.
|
||||
Each file will contain several documents in this [document format](http://medialab.di.unipi.it/wiki/Document_Format).
|
||||
|
||||
This is a beta version that performs template expansion by preprocessing the
|
||||
whole dump and extracting template definitions.
|
||||
usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--html] [-l]
|
||||
[-ns ns1,ns2] [-s] [--templates TEMPLATES]
|
||||
[--no-templates] [--threads THREADS] [-q] [--debug]
|
||||
[-a] [-v]
|
||||
input
|
||||
|
||||
Usage:
|
||||
WikiExtractor.py [options] xml-dump-file
|
||||
positional arguments:
|
||||
input XML wiki dump file
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--threads THREADS Number of threads to use (default 2)
|
||||
|
||||
Output:
|
||||
-o OUTPUT, --output OUTPUT
|
||||
output directory
|
||||
output directory
|
||||
-b n[KMG], --bytes n[KMG]
|
||||
put specified bytes per output file (default is 1M)
|
||||
-B BASE, --base BASE base URL for the Wikipedia pages
|
||||
put specified bytes per output file (default is 1M)
|
||||
-c, --compress compress output files using bzip
|
||||
|
||||
Processing:
|
||||
--html produce HTML output, subsumes --links and --sections
|
||||
-l, --links preserve links
|
||||
-ns ns1,ns2, --namespaces ns1,ns2
|
||||
accepted namespaces
|
||||
accepted namespaces
|
||||
-s, --sections preserve sections
|
||||
--templates TEMPLATES
|
||||
use or create file containing templates
|
||||
--no-templates Do not expand templates
|
||||
|
||||
Special:
|
||||
-q, --quiet suppress reporting progress info
|
||||
--debug print debug info
|
||||
-s, --sections preserve sections
|
||||
-a, --article analyze a file containing a single article
|
||||
--templates TEMPLATES
|
||||
use or create file containing templates
|
||||
--no-templates Do not expand templates
|
||||
--threads THREADS Number of threads to use (default 8)
|
||||
-a, --article analyze a file containing a single article (debug)
|
||||
option
|
||||
-v, --version print program version
|
||||
|
||||
Saving templates to a file will speed up performing extraction the next time,
|
||||
|
@ -2,7 +2,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# =============================================================================
|
||||
# Version: 2.24 (Apr 19, 2015)
|
||||
# Version: 2.25 (Apr 20, 2015)
|
||||
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
|
||||
# Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
|
||||
#
|
||||
@ -61,7 +61,7 @@ import Queue, threading, multiprocessing
|
||||
#===========================================================================
|
||||
|
||||
# Program version
|
||||
version = '2.24'
|
||||
version = '2.25'
|
||||
|
||||
### PARAMS ####################################################################
|
||||
|
||||
@ -1486,7 +1486,7 @@ def clean(extractor, text):
|
||||
text = bold.sub(r'\1', text)
|
||||
text = italic_quote.sub(r'"\1"', text)
|
||||
text = italic.sub(r'"\1"', text)
|
||||
text = quote_quote.sub(r'\1', text)
|
||||
text = quote_quote.sub(r'"\1"', text)
|
||||
# residuals of unbalanced quotes
|
||||
text = text.replace("'''", '').replace("''", '"')
|
||||
|
||||
@ -1936,37 +1936,38 @@ def main():
|
||||
description=__doc__)
|
||||
parser.add_argument("input",
|
||||
help="XML wiki dump file")
|
||||
parser.add_argument("-o", "--output", default="text",
|
||||
groupO = parser.add_argument_group('Output')
|
||||
groupO.add_argument("-o", "--output", default="text",
|
||||
help="output directory")
|
||||
parser.add_argument("-b", "--bytes", default="1M",
|
||||
groupO.add_argument("-b", "--bytes", default="1M",
|
||||
help="put specified bytes per output file (default is %(default)s)", metavar="n[KMG]")
|
||||
parser.add_argument("-B", "--base",
|
||||
help="base URL for the Wikipedia pages")
|
||||
parser.add_argument("-c", "--compress", action="store_true",
|
||||
groupO.add_argument("-c", "--compress", action="store_true",
|
||||
help="compress output files using bzip")
|
||||
parser.add_argument("-l", "--links", action="store_true",
|
||||
|
||||
groupP = parser.add_argument_group('Processing')
|
||||
groupP.add_argument("--html", action="store_true",
|
||||
help="produce HTML output, subsumes --links and --sections")
|
||||
groupP.add_argument("-l", "--links", action="store_true",
|
||||
help="preserve links")
|
||||
parser.add_argument("-s", "--sections", action="store_true",
|
||||
help="preserve sections")
|
||||
parser.add_argument("--html", action="store_true",
|
||||
help="Produce HTML output, subsumes --links and --sections")
|
||||
parser.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
|
||||
groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
|
||||
help="accepted namespaces")
|
||||
parser.add_argument("-q", "--quiet", action="store_true",
|
||||
help="suppress reporting progress info")
|
||||
parser.add_argument("--debug", action="store_true",
|
||||
help="print debug info")
|
||||
parser.add_argument("-a", "--article", action="store_true",
|
||||
help="analyze a file containing a single article (debug) option")
|
||||
# parser.add_argument("-f", "--format", choices=(PLAIN, JSON), default=PLAIN,
|
||||
# help="choose output format default is %(default)s")
|
||||
parser.add_argument("--templates",
|
||||
groupP.add_argument("-s", "--sections", action="store_true",
|
||||
help="preserve sections")
|
||||
groupP.add_argument("--templates",
|
||||
help="use or create file containing templates")
|
||||
parser.add_argument("--no-templates", action="store_false",
|
||||
groupP.add_argument("--no-templates", action="store_false",
|
||||
help="Do not expand templates")
|
||||
parser.add_argument("--threads", type=int, default=2,
|
||||
help="Number of threads to use (default 2)")
|
||||
parser.add_argument("-v", "--version", action="version",
|
||||
|
||||
groupS = parser.add_argument_group('Special')
|
||||
groupS.add_argument("-q", "--quiet", action="store_true",
|
||||
help="suppress reporting progress info")
|
||||
groupS.add_argument("--debug", action="store_true",
|
||||
help="print debug info")
|
||||
groupS.add_argument("-a", "--article", action="store_true",
|
||||
help="analyze a file containing a single article (debug) option")
|
||||
groupS.add_argument("-v", "--version", action="version",
|
||||
version='%(prog)s ' + version,
|
||||
help="print program version")
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user