diff --git a/README.md b/README.md index 2de2fa9..b53034e 100644 --- a/README.md +++ b/README.md @@ -37,11 +37,11 @@ Each file will contains several documents in this [document format](http://media [-l] [-s] [--lists] [-ns ns1,ns2] [--templates TEMPLATES] [--no-templates] [-r] [--min_text_length MIN_TEXT_LENGTH] - [--filter_category path_of_categories_file] [--filter_disambig_pages] [-it abbr,b,big] [-de gallery,timeline,noinclude] [--keep_tables] - [--processes PROCESSES] [-q] [--debug] [-a] [-v] - [--log_file] + [--processes PROCESSES] [-q] [--debug] [-a] + [--log_file LOG_FILE] [-v] + [--filter_category FILTER_CATEGORY] input Wikipedia Extractor: @@ -68,7 +68,7 @@ Each file will contains several documents in this [document format](http://media optional arguments: -h, --help show this help message and exit --processes PROCESSES - Number of processes to use (default 1) + Number of processes to use (default 7) Output: -o OUTPUT, --output OUTPUT @@ -88,23 +88,12 @@ Each file will contains several documents in this [document format](http://media accepted namespaces in links --templates TEMPLATES use or create file containing templates - --no-templates Do not expand templates + --no-templates, --no_templates + Do not expand templates -r, --revision Include the document revision id (default=False) --min_text_length MIN_TEXT_LENGTH Minimum expanded text length required to write document (default=0) - --filter_category path_of_categories_file - Include or exclude specific categories from the dataset. Specify the categories in - file 'path_of_categories_file'. Format: - One category one line, and if the line starts with: - 1) #: Comments, ignored; - 2) ^: the categories will be in excluding-categories - 3) others: the categories will be in including-categories. - Priority: - 1) If excluding-categories is not empty, and any category of a page exists in excluding-categories, the page will be excluded; else - 2) If including-categories is not empty, and no category of a page exists in including-categories, the page will be excluded; else - 3) the page will be included - --filter_disambig_pages Remove pages from output that contain disabmiguation markup (default=False) @@ -116,15 +105,20 @@ Each file will contains several documents in this [document format](http://media from the article text --keep_tables Preserve tables in the output article text (default=False) + --filter_category FILTER_CATEGORY + specify the file that listing the Categories you want + to include or exclude. One line for one category. + starting with: 1) '#' comment, ignored; 2) '^' + exclude; Note: excluding has higher priority than + including Special: -q, --quiet suppress reporting progress info --debug print debug info -a, --article analyze a file containing a single article (debug option) + --log_file LOG_FILE path to save the log info -v, --version print program version - --log_file specify a file to save the log information. - Saving templates to a file will speed up performing extraction the next time, assuming template definitions have not changed. diff --git a/WikiExtractor.py b/WikiExtractor.py index 730b3ba..e03f52c 100755 --- a/WikiExtractor.py +++ b/WikiExtractor.py @@ -3139,7 +3139,7 @@ def main(): help="accepted namespaces in links") groupP.add_argument("--templates", help="use or create file containing templates") - groupP.add_argument("--no_templates", action="store_false", + groupP.add_argument("--no-templates", "--no_templates", action="store_false", help="Do not expand templates") groupP.add_argument("-r", "--revision", action="store_true", default=options.print_revision, help="Include the document revision id (default=%(default)s)")