Fix to script invocation.

This commit is contained in:
attardi 2020-12-04 11:31:15 +01:00
parent 3179a4c393
commit 8ef37c87e2
4 changed files with 32 additions and 14 deletions

View File

@ -37,6 +37,11 @@ or locally with:
(sudo) python setup.py install
Installation also provides two command-line scripts that can be invoked directly:
wikiextractor (equivalent to python -m wikiextractor.WikiExtractor)
extractPage (to extract a single page from a dump)
## Usage
### Wikiextractor
@ -187,6 +192,25 @@ Special:
-v, --version print program version
~~~
### extractPage
Extract a single page from a Wikipedia dump file.
~~~
usage: extractPage [-h] [--id ID] [--template] [-v] input
Wikipedia Page Extractor:
Extracts a single page from a Wikipedia dump file.
positional arguments:
input XML wiki dump file
optional arguments:
-h, --help show this help message and exit
--id ID article number
--template template number
-v, --version print program version
~~~
## License
The code is made available under the [GNU Affero General Public License v3.0](LICENSE).

View File

@ -1,7 +1,7 @@
from setuptools import setup, find_packages
import re
from wikiextractor.WikiExtractor import version
from wikiextractor.WikiExtractor import __version__
def get_version(version):
@ -14,7 +14,7 @@ with open("README.md", "r") as fh:
setup(
name='wikiextractor',
version=get_version(version),
version=get_version(__version__),
author='Giuseppe Attardi',
author_email='attardi@gmail.com',
description='A tool for extracting plain text from Wikipedia dumps',
@ -33,7 +33,7 @@ setup(
],
entry_points={
"console_scripts": [
"wikiextractor = wikiextractor.Wikiextractor:main",
"wikiextractor = wikiextractor.WikiExtractor:main",
"extractPage = wikiextractor.extractPage:main",
]
},

View File

@ -64,7 +64,7 @@ from .extract import Extractor, ignoreTag, define_template
# ===========================================================================
# Program version
version = '3.0'
__version__ = '3.0.1'
##
# Defined in <siteinfo>
@ -531,7 +531,7 @@ def main():
groupS.add_argument("-a", "--article", action="store_true",
help="analyze a file containing a single article (debug option)")
groupS.add_argument("-v", "--version", action="version",
version='%(prog)s ' + version,
version='%(prog)s ' + __version__,
help="print program version")
args = parser.parse_args()

View File

@ -28,14 +28,9 @@ Extracts a single page from a Wikipedia dump file.
"""
import sys, os.path
import re, random
import re
import argparse
from itertools import izip
import logging, traceback
import urllib
import bz2, gzip
from htmlentitydefs import name2codepoint
import Queue, threading, multiprocessing
import bz2
# Program version
@ -63,7 +58,6 @@ def process_data(input_file, id, templates=False):
page = []
for line in input:
line = line.decode('utf-8')
if '<' not in line: # faster than doing re.search()
if page:
page.append(line)
@ -94,7 +88,7 @@ def process_data(input_file, id, templates=False):
elif tag == '/page':
if page:
page.append(line)
print ''.join(page).encode('utf-8')
print(''.join(page))
if not templates:
break
page = []