Fix to urlbase.

2020-12-16 18:10:28 +01:00 · 2020-12-16 18:10:28 +01:00 · 95ddfaa451
commit 95ddfaa451
parent 1c57c06596
3 changed files with 13 additions and 18 deletions
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -62,7 +62,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
 # ===========================================================================

 # Program version
-__version__ = '3.0.4'
+__version__ = '3.0.5'

 ##
 # Defined in <siteinfo>
@@ -80,10 +80,6 @@ templatePrefix = ''
 # It is the name associated with namespace key=828 in the siteinfo header.
 moduleNamespace = ''

-# This is obtained from <siteinfo>
-urlbase = ''
-
-
 # ----------------------------------------------------------------------
 # Modules

@@ -283,11 +279,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
    :param file_compress: whether to compress files with bzip.
    :param process_count: number of extraction processes to spawn.
    """
-    global urlbase
    global knownNamespaces
    global templateNamespace, templatePrefix
    global moduleNamespace, modulePrefix

+    urlbase = ''                # This is obtained from <siteinfo>
+
    input = decode_open(input_file)

    # collect siteinfo
@@ -414,7 +411,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
            colon = title.find(':')
            if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                    not redirect and not title.startswith(templateNamespace)):
-                job = (id, title, page, ordinal)
+                job = (id, urlbase, title, page, ordinal)
                jobs_queue.put(job)  # goes to any available extract_process
                last_id = id
                ordinal += 1
@@ -451,14 +448,15 @@ def extract_process(jobs_queue, output_queue, escape_doc):
    """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
    :param jobs_queue: where to get jobs.
    :param output_queue: where to queue extracted text for output.
+    :param escape_doc: whether to convert entities in text to HTML.
    """
    while True:
        job = jobs_queue.get()  # job is (id, urlbase, title, page, ordinal)
        if job:
            out = StringIO()  # memory buffer
-            Extractor(*job[:3]).extract(out, escape_doc)  # (id, title, page)
+            Extractor(*job[:4]).extract(out, escape_doc)  # (id, urlbase, title, page)
            text = out.getvalue()
-            output_queue.put((job[3], text))  # (ordinal, extracted_text)
+            output_queue.put((job[4], text))  # (ordinal, extracted_text)
            out.close()
        else:
            break
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -21,7 +21,7 @@
 import re
 import html
 from itertools import zip_longest
-import urllib.parse.quote as urlquote
+from urllib.parse import quote as urlquote
 from html.entities import name2codepoint
 import logging
 import time
@@ -34,9 +34,6 @@ syntaxhighlight = re.compile('&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlig

 ## PARAMS ####################################################################

-# This is obtained from <siteinfo>
-urlbase = ''
-
 ##
 # Defined in <siteinfo>
 # We include as default Template, when loading external template file.
@@ -62,7 +59,7 @@ discardElements = [
 acceptedNamespaces = ['w', 'wiktionary', 'wikt']


-def get_url(uid):
+def get_url(urlbase, uid):
    return "%s?curid=%s" % (urlbase, uid)


@@ -809,11 +806,12 @@ class Extractor():
    # Whether to output text with HTML formatting elements in <doc> files.
    HtmlFormatting = False

-    def __init__(self, id, title, page):
+    def __init__(self, id, urlbase, title, page):
        """
        :param page: a list of lines.
        """
        self.id = id
+        self.url = get_url(urlbase, id)
        self.title = title
        self.page = page
        self.magicWords = MagicWords()
@@ -850,8 +848,7 @@ class Extractor():
        logging.debug("%s\t%s", self.id, self.title)
        text = ''.join(self.page)

-        url = get_url(self.id)
-        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
+        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        footer = "\n</doc>\n"
--- a/wikiextractor/extractPage.py
+++ b/wikiextractor/extractPage.py
@@ -34,7 +34,7 @@ import bz2


 # Program version
-__version__ = '3.0.4'
+__version__ = '3.0.5'

 # ----------------------------------------------------------------------
 # READER