Fix to script invocation.
This commit is contained in:
parent
3179a4c393
commit
8ef37c87e2
24
README.md
24
README.md
@@ -37,6 +37,11 @@ or locally with:
|
||||
|
||||
(sudo) python setup.py install
|
||||
|
||||
The installer also installs two scripts for direct invocation:
|
||||
|
||||
wikiextractor (equivalent to python -m wikiextractor.WikiExtractor)
|
||||
extractPage (to extract a single page from a dump)
|
||||
|
||||
## Usage
|
||||
|
||||
### Wikiextractor
|
||||
@@ -187,6 +192,25 @@ Special:
|
||||
-v, --version print program version
|
||||
~~~
|
||||
|
||||
### extractPage
|
||||
Extract a single page from a Wikipedia dump file.
|
||||
|
||||
~~~
|
||||
usage: extractPage [-h] [--id ID] [--template] [-v] input
|
||||
|
||||
Wikipedia Page Extractor:
|
||||
Extracts a single page from a Wikipedia dump file.
|
||||
|
||||
positional arguments:
|
||||
input XML wiki dump file
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--id ID article number
|
||||
--template template number
|
||||
-v, --version print program version
|
||||
~~~
|
||||
|
||||
## License
|
||||
The code is made available under the [GNU Affero General Public License v3.0](LICENSE).
|
||||
|
||||
|
6
setup.py
6
setup.py
@@ -1,7 +1,7 @@
|
||||
from setuptools import setup, find_packages
|
||||
import re
|
||||
|
||||
from wikiextractor.WikiExtractor import version
|
||||
from wikiextractor.WikiExtractor import __version__
|
||||
|
||||
|
||||
def get_version(version):
|
||||
@@ -14,7 +14,7 @@ with open("README.md", "r") as fh:
|
||||
|
||||
setup(
|
||||
name='wikiextractor',
|
||||
version=get_version(version),
|
||||
version=get_version(__version__),
|
||||
author='Giuseppe Attardi',
|
||||
author_email='attardi@gmail.com',
|
||||
description='A tool for extracting plain text from Wikipedia dumps',
|
||||
@@ -33,7 +33,7 @@ setup(
|
||||
],
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"wikiextractor = wikiextractor.Wikiextractor:main",
|
||||
"wikiextractor = wikiextractor.WikiExtractor:main",
|
||||
"extractPage = wikiextractor.extractPage:main",
|
||||
]
|
||||
},
|
||||
|
@@ -64,7 +64,7 @@ from .extract import Extractor, ignoreTag, define_template
|
||||
# ===========================================================================
|
||||
|
||||
# Program version
|
||||
version = '3.0'
|
||||
__version__ = '3.0.1'
|
||||
|
||||
##
|
||||
# Defined in <siteinfo>
|
||||
@@ -531,7 +531,7 @@ def main():
|
||||
groupS.add_argument("-a", "--article", action="store_true",
|
||||
help="analyze a file containing a single article (debug option)")
|
||||
groupS.add_argument("-v", "--version", action="version",
|
||||
version='%(prog)s ' + version,
|
||||
version='%(prog)s ' + __version__,
|
||||
help="print program version")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
@@ -28,14 +28,9 @@ Extracts a single page from a Wikipedia dump file.
|
||||
"""
|
||||
|
||||
import sys, os.path
|
||||
import re, random
|
||||
import re
|
||||
import argparse
|
||||
from itertools import izip
|
||||
import logging, traceback
|
||||
import urllib
|
||||
import bz2, gzip
|
||||
from htmlentitydefs import name2codepoint
|
||||
import Queue, threading, multiprocessing
|
||||
import bz2
|
||||
|
||||
|
||||
# Program version
|
||||
@@ -63,7 +58,6 @@ def process_data(input_file, id, templates=False):
|
||||
|
||||
page = []
|
||||
for line in input:
|
||||
line = line.decode('utf-8')
|
||||
if '<' not in line: # faster than doing re.search()
|
||||
if page:
|
||||
page.append(line)
|
||||
@@ -94,7 +88,7 @@ def process_data(input_file, id, templates=False):
|
||||
elif tag == '/page':
|
||||
if page:
|
||||
page.append(line)
|
||||
print ''.join(page).encode('utf-8')
|
||||
print(''.join(page))
|
||||
if not templates:
|
||||
break
|
||||
page = []
|
||||
|
Loading…
Reference in New Issue
Block a user