diff --git a/README.md b/README.md index b4fce85..1e9c19c 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,11 @@ or locally with: (sudo) python setup.py install +The installer also installs two scripts for direct invocation: + + wikiextractor (equivalent to python -m wikiextractor.WikiExtractor) + extractPage (to extract a single page from a dump) + ## Usage ### Wikiextractor @@ -187,6 +192,25 @@ Special: -v, --version print program version ~~~ +### extractPage +Extract a single page from a Wikipedia dump file. + +~~~ +usage: extractPage [-h] [--id ID] [--template] [-v] input + +Wikipedia Page Extractor: +Extracts a single page from a Wikipedia dump file. + +positional arguments: + input XML wiki dump file + +optional arguments: + -h, --help show this help message and exit + --id ID article number + --template template number + -v, --version print program version +~~~ + ## License The code is made available under the [GNU Affero General Public License v3.0](LICENSE). diff --git a/setup.py b/setup.py index c4a3f76..f18786b 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import re -from wikiextractor.WikiExtractor import version +from wikiextractor.WikiExtractor import __version__ def get_version(version): @@ -14,7 +14,7 @@ with open("README.md", "r") as fh: setup( name='wikiextractor', - version=get_version(version), + version=get_version(__version__), author='Giuseppe Attardi', author_email='attardi@gmail.com', description='A tool for extracting plain text from Wikipedia dumps', @@ -33,7 +33,7 @@ setup( ], entry_points={ "console_scripts": [ - "wikiextractor = wikiextractor.Wikiextractor:main", + "wikiextractor = wikiextractor.WikiExtractor:main", "extractPage = wikiextractor.extractPage:main", ] }, diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index c8d1cd5..a8878dd 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -64,7 +64,7 @@ from .extract import Extractor, ignoreTag, define_template # =========================================================================== # Program version -version = '3.0' +__version__ = '3.0.1' ## # Defined in @@ -531,7 +531,7 @@ def main(): groupS.add_argument("-a", "--article", action="store_true", help="analyze a file containing a single article (debug option)") groupS.add_argument("-v", "--version", action="version", - version='%(prog)s ' + version, + version='%(prog)s ' + __version__, help="print program version") args = parser.parse_args() diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py index 5445e90..89a7081 100755 --- a/wikiextractor/extractPage.py +++ b/wikiextractor/extractPage.py @@ -28,14 +28,9 @@ Extracts a single page from a Wikipedia dump file. """ import sys, os.path -import re, random +import re import argparse -from itertools import izip -import logging, traceback -import urllib -import bz2, gzip -from htmlentitydefs import name2codepoint -import Queue, threading, multiprocessing +import bz2 # Program version @@ -63,7 +58,6 @@ def process_data(input_file, id, templates=False): page = [] for line in input: - line = line.decode('utf-8') if '<' not in line: # faster than doing re.search() if page: page.append(line) @@ -94,7 +88,7 @@ def process_data(input_file, id, templates=False): elif tag == '/page': if page: page.append(line) - print ''.join(page).encode('utf-8') + print(''.join(page)) if not templates: break page = []