Fully working on Python 3.

attardi 2020-12-05 11:21:46 +01:00
parent 5b4302bca0
commit 0933664d70
2 changed files with 12 additions and 16 deletions

File 1 of 2

@@ -58,12 +58,12 @@ from io import StringIO
 from multiprocessing import Queue, Process, cpu_count
 from timeit import default_timer
-from .extract import Extractor, ignoreTag, define_template
+from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces

 # ===========================================================================
 # Program version
-__version__ = '3.0.2'
+__version__ = '3.0.3'

 ##
 # Defined in <siteinfo>
@@ -136,7 +136,7 @@ class NextFile(object):
     def _dirname(self):
         char1 = self.dir_index % 26
-        char2 = self.dir_index / 26 % 26
+        char2 = int(self.dir_index / 26) % 26
         return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

     def _filepath(self):
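
Note on the `_dirname` fix: in Python 2, `/` on two ints truncated, but in Python 3 it returns a float, and a float `char2` breaks the `'%c'` formatting on the next line. A minimal sketch of the difference (not part of the commit); floor division `//` would express the same intent directly:

    dir_index = 27
    char2 = int(dir_index / 26) % 26   # the commit's fix: truncate the float back to int
    assert char2 == 1
    char2 = dir_index // 26 % 26       # equivalent in Python 3, via floor division
    assert char2 == 1
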
@@ -201,7 +201,7 @@ def load_templates(file, output_file=None):
     page = []
     inText = False
     if output_file:
-        output = open(output_file, 'wb')
+        output = open(output_file, 'w')
     for line in file:
         line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
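
Note on `open(output_file, 'w')`: Python 3 separates str from bytes, and a file opened with 'wb' accepts only bytes, so writing the decoded str lines would raise TypeError. A small illustration (hypothetical file names, not part of the commit):

    with open('/tmp/templates.bin', 'wb') as out:
        try:
            out.write('{{template}}\n')    # str into a binary-mode file
        except TypeError:
            pass                           # Python 3 rejects this

    with open('/tmp/templates.txt', 'w') as out:
        out.write('{{template}}\n')        # text mode accepts str and encodes it
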
@@ -352,7 +352,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     # start worker processes
     logging.info("Using %d extract processes.", process_count)
     workers = []
-    for _ in xrange(max(1, process_count)):
+    for _ in range(max(1, process_count)):
         extractor = Process(target=extract_process,
                             args=(jobs_queue, output_queue))
         extractor.daemon = True  # only live while parent process lives
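
Note on `xrange`: the name was removed in Python 3, where `range` is itself a lazy sequence, so the swap changes nothing about memory use. A quick sketch (not part of the commit):

    r = range(max(1, 4))            # a lazy range object, like Python 2's xrange
    assert list(r) == [0, 1, 2, 3]  # values are produced only on iteration
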

File 2 of 2

@@ -19,7 +19,7 @@
 # =============================================================================
 import re
-import cgi
+import html
 from itertools import zip_longest
 import urllib
 from html.entities import name2codepoint
@@ -63,7 +63,7 @@ acceptedNamespaces = ['w', 'wiktionary', 'wikt']
 def get_url(uid):
-    return "%s?curid=%s" % (options.urlbase, uid)
+    return "%s?curid=%s" % (urlbase, uid)

 # ======================================================================
@@ -170,7 +170,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
     if escape_doc:
-        text = cgi.escape(text)
+        text = html.escape(text)
     return text
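
Note on `cgi.escape` → `html.escape`: `cgi.escape` was deprecated in Python 3.2 and removed in 3.8. One behavioral difference to be aware of: `html.escape` also escapes quotes by default, which `cgi.escape` only did on request. Sketch (not part of the commit):

    import html

    print(html.escape('a < b & "c"'))               # a &lt; b &amp; &quot;c&quot;
    print(html.escape('a < b & "c"', quote=False))  # a &lt; b &amp; "c"  (closest to cgi.escape's default)
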
@@ -212,9 +212,7 @@ def compact(text, mark_headers=False):
             headers[lev] = title
             # drop previous headers
-            for i in headers.keys():
-                if i > lev:
-                    del headers[i]
+            headers = { k:v for k,v in headers.items() if k > lev }
             emptySection = True
             continue
         # Handle page title
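
Note on the comprehension: it also sidesteps a Python 3 pitfall in the old loop, since `dict.keys()` now returns a live view and deleting entries while iterating it raises RuntimeError (Python 2 handed back a list copy, so the loop was safe there). Minimal illustration (not part of the commit):

    headers = {1: 'H1', 2: 'H2', 3: 'H3'}
    try:
        for i in headers.keys():   # a live view in Python 3
            if i > 1:
                del headers[i]     # mutating the dict mid-iteration
    except RuntimeError:
        pass                       # "dictionary changed size during iteration"

    # Rebuilding the dict with a comprehension never mutates during iteration:
    headers = {1: 'H1', 2: 'H2', 3: 'H3'}
    headers = {k: v for k, v in headers.items() if k <= 1}
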
@@ -268,8 +266,7 @@ def compact(text, mark_headers=False):
             continue
         elif len(headers):
             if Extractor.keepSections:
-                items = headers.items()
-                items.sort()
+                items = sorted(headers.items())
                 for (i, v) in items:
                     page.append(v)
             headers.clear()
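
Note on `sorted(headers.items())`: in Python 3, `items()` returns a view object with no `.sort()` method, so the two-step copy-and-sort collapses into one call. Sketch (not part of the commit):

    headers = {2: '== History ==', 1: 'Title'}
    items = sorted(headers.items())    # views cannot be sorted in place
    assert items == [(1, 'Title'), (2, '== History ==')]
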
@@ -497,7 +494,7 @@ def makeInternalLink(title, label):
     if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
         return ''
     if Extractor.keepLinks:
-        return '<a href="%s">%s</a>' % (urllib.quote(title.encode('utf-8')), label)
+        return '<a href="%s">%s</a>' % (urllib.quote(title), label)
     else:
         return label
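
Note on the link quoting: dropping `.encode('utf-8')` fits Python 3, where the quote function accepts str and percent-encodes via UTF-8 by default. However, Python 3 moved the function to `urllib.parse`, so with only `import urllib` in scope the call as written would raise AttributeError; a Python 3 sketch (not part of the commit):

    from urllib.parse import quote

    title = 'Löwenbräu'   # quote() encodes str to UTF-8 internally
    print('<a href="%s">%s</a>' % (quote(title), 'label'))
    # <a href="L%C3%B6wenbr%C3%A4u">label</a>
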
@@ -860,14 +857,13 @@ class Extractor(object):
         header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
         # Separate header from text with a newline.
         header += self.title + '\n\n'
-        header = header.encode('utf-8')
         footer = "\n</doc>\n"
         out.write(header)
         text = self.clean_text(text)
         for line in text:
-            out.write(line.encode('utf-8'))
+            out.write(line)
             out.write('\n')
         out.write(footer)
         errs = (self.template_title_errs,
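
Note on the dropped `.encode('utf-8')` calls: they follow from writing to a text-mode stream, which accepts str and performs the encoding itself. Sketch with `io.StringIO` standing in for the output file (not part of the commit):

    from io import StringIO

    out = StringIO()            # behaves like a text-mode file
    out.write('<doc id="1" url="u" title="t">\n')
    out.write('body line\n')    # plain str, no .encode() needed
    out.write('\n</doc>\n')
    assert out.getvalue().startswith('<doc')
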