Fix to script invocation.
This commit is contained in:
parent
3179a4c393
commit
8ef37c87e2
24
README.md
24
README.md
@@ -37,6 +37,11 @@ or locally with:
|
||||
|
||||
(sudo) python setup.py install
|
||||
|
||||
The installer also installs two scripts for direct invocation:
|
||||
|
||||
wikiextractor (equivalent to python -m wikiextractor.WikiExtractor)
|
||||
extractPage (to extract a single page from a dump)
|
||||
|
||||
## Usage
|
||||
|
||||
### Wikiextractor
|
||||
@@ -187,6 +192,25 @@ Special:
|
||||
-v, --version print program version
|
||||
~~~
|
||||
|
||||
### extractPage
|
||||
Extract a single page from a Wikipedia dump file.
|
||||
|
||||
~~~
|
||||
usage: extractPage [-h] [--id ID] [--template] [-v] input
|
||||
|
||||
Wikipedia Page Extractor:
|
||||
Extracts a single page from a Wikipedia dump file.
|
||||
|
||||
positional arguments:
|
||||
input XML wiki dump file
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--id ID article number
|
||||
--template template number
|
||||
-v, --version print program version
|
||||
~~~
|
||||
|
||||
## License
|
||||
The code is made available under the [GNU Affero General Public License v3.0](LICENSE).
|
||||
|
||||
|
6
setup.py
6
setup.py
@@ -1,7 +1,7 @@
|
||||
from setuptools import setup, find_packages
|
||||
import re
|
||||
|
||||
from wikiextractor.WikiExtractor import version
|
||||
from wikiextractor.WikiExtractor import __version__
|
||||
|
||||
|
||||
def get_version(version):
|
||||
@@ -14,7 +14,7 @@ with open("README.md", "r") as fh:
|
||||
|
||||
setup(
|
||||
name='wikiextractor',
|
||||
version=get_version(version),
|
||||
version=get_version(__version__),
|
||||
author='Giuseppe Attardi',
|
||||
author_email='attardi@gmail.com',
|
||||
description='A tool for extracting plain text from Wikipedia dumps',
|
||||
@@ -33,7 +33,7 @@ setup(
|
||||
],
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"wikiextractor = wikiextractor.Wikiextractor:main",
|
||||
"wikiextractor = wikiextractor.WikiExtractor:main",
|
||||
"extractPage = wikiextractor.extractPage:main",
|
||||
]
|
||||
},
|
||||
|
@@ -64,7 +64,7 @@ from .extract import Extractor, ignoreTag, define_template
|
||||
# ===========================================================================
|
||||
|
||||
# Program version
|
||||
version = '3.0'
|
||||
__version__ = '3.0.1'
|
||||
|
||||
##
|
||||
# Defined in <siteinfo>
|
||||
@@ -531,7 +531,7 @@ def main():
|
||||
groupS.add_argument("-a", "--article", action="store_true",
|
||||
help="analyze a file containing a single article (debug option)")
|
||||
groupS.add_argument("-v", "--version", action="version",
|
||||
version='%(prog)s ' + version,
|
||||
version='%(prog)s ' + __version__,
|
||||
help="print program version")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
@@ -28,14 +28,9 @@ Extracts a single page from a Wikipedia dump file.
|
||||
"""
|
||||
|
||||
import sys, os.path
|
||||
import re, random
|
||||
import re
|
||||
import argparse
|
||||
from itertools import izip
|
||||
import logging, traceback
|
||||
import urllib
|
||||
import bz2, gzip
|
||||
from htmlentitydefs import name2codepoint
|
||||
import Queue, threading, multiprocessing
|
||||
import bz2
|
||||
|
||||
|
||||
# Program version
|
||||
@@ -63,7 +58,6 @@ def process_data(input_file, id, templates=False):
|
||||
|
||||
page = []
|
||||
for line in input:
|
||||
line = line.decode('utf-8')
|
||||
if '<' not in line: # faster than doing re.search()
|
||||
if page:
|
||||
page.append(line)
|
||||
@@ -94,7 +88,7 @@ def process_data(input_file, id, templates=False):
|
||||
elif tag == '/page':
|
||||
if page:
|
||||
page.append(line)
|
||||
print ''.join(page).encode('utf-8')
|
||||
print(''.join(page))
|
||||
if not templates:
|
||||
break
|
||||
page = []
|
||||
|
Loading…
Reference in New Issue
Block a user