From 95ddfaa451e4c344f209920e56b46d2cea6bc99a Mon Sep 17 00:00:00 2001
From: attardi
Date: Wed, 16 Dec 2020 18:10:28 +0100
Subject: [PATCH] Fix to urlbase.

---
 wikiextractor/WikiExtractor.py | 16 +++++++---------
 wikiextractor/extract.py       | 13 +++++--------
 wikiextractor/extractPage.py   |  2 +-
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 01f1790..1335555 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -62,7 +62,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
 # ===========================================================================
 
 # Program version
-__version__ = '3.0.4'
+__version__ = '3.0.5'
 
 ##
 # Defined in <siteinfo>
@@ -80,10 +80,6 @@ templatePrefix = ''
 # It is the name associated with namespace key=828 in the siteinfo header.
 moduleNamespace = ''
 
-# This is obtained from <siteinfo>
-urlbase = ''
-
-
 # ----------------------------------------------------------------------
 # Modules
 
@@ -283,11 +279,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     :param file_compress: whether to compress files with bzip.
     :param process_count: number of extraction processes to spawn.
     """
-    global urlbase
     global knownNamespaces
     global templateNamespace, templatePrefix
     global moduleNamespace, modulePrefix
 
+    urlbase = ''  # This is obtained from <siteinfo>
+
     input = decode_open(input_file)
 
     # collect siteinfo
@@ -414,7 +411,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
             colon = title.find(':')
             if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                     not redirect and not title.startswith(templateNamespace)):
-                job = (id, title, page, ordinal)
+                job = (id, urlbase, title, page, ordinal)
                 jobs_queue.put(job)  # goes to any available extract_process
                 last_id = id
                 ordinal += 1
@@ -451,14 +448,15 @@ def extract_process(jobs_queue, output_queue, escape_doc):
     """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
     :param jobs_queue: where to get jobs.
     :param output_queue: where to queue extracted text for output.
+    :escape_doc: whether to convert entities in text to HTML.
     """
     while True:
         job = jobs_queue.get()  # job is (id, title, page, ordinal)
        if job:
             out = StringIO()  # memory buffer
-            Extractor(*job[:3]).extract(out, escape_doc)  # (id, title, page)
+            Extractor(*job[:4]).extract(out, escape_doc)  # (id, urlbase, title, page)
             text = out.getvalue()
-            output_queue.put((job[3], text))  # (ordinal, extracted_text)
+            output_queue.put((job[4], text))  # (ordinal, extracted_text)
             out.close()
         else:
             break
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index 26840b9..52b0eae 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -21,7 +21,7 @@
 import re
 import html
 from itertools import zip_longest
-import urllib.parse.quote as urlquote
+from urllib.parse import quote as urlquote
 from html.entities import name2codepoint
 import logging
 import time
@@ -34,9 +34,6 @@ syntaxhighlight = re.compile('&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlig
 
 ## PARAMS ####################################################################
 
-# This is obtained from <siteinfo>
-urlbase = ''
-
 ##
 # Defined in <siteinfo>
 # We include as default Template, when loading external template file.
@@ -62,7 +59,7 @@ discardElements = [
 acceptedNamespaces = ['w', 'wiktionary', 'wikt']
 
 
-def get_url(uid):
+def get_url(urlbase, uid):
     return "%s?curid=%s" % (urlbase, uid)
 
 
@@ -809,11 +806,12 @@ class Extractor():
     # Whether to output text with HTML formatting elements in files.
     HtmlFormatting = False
 
-    def __init__(self, id, title, page):
+    def __init__(self, id, urlbase, title, page):
         """
         :param page: a list of lines.
         """
         self.id = id
+        self.url = get_url(urlbase, id)
         self.title = title
         self.page = page
         self.magicWords = MagicWords()
@@ -850,8 +848,7 @@ class Extractor():
 
         logging.debug("%s\t%s", self.id, self.title)
         text = ''.join(self.page)
-        url = get_url(self.id)
-        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
+        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
         # Separate header from text with a newline.
         header += self.title + '\n\n'
         footer = "\n</doc>\n"
diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py
index 9a10d8d..1e40410 100755
--- a/wikiextractor/extractPage.py
+++ b/wikiextractor/extractPage.py
@@ -34,7 +34,7 @@
 import bz2
 
 # Program version
-__version__ = '3.0.4'
+__version__ = '3.0.5'
 
 # ----------------------------------------------------------------------
 # READER
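
For reviewers, a minimal usage sketch (not part of the patch) of the reworked interface, assuming the package is importable as wikiextractor; the base URL, page id, title and page text below are made-up stand-ins for what process_dump() actually reads from the dump's <siteinfo> and <page> elements.

from wikiextractor.extract import Extractor, get_url

# Hypothetical values standing in for the base URL taken from <siteinfo>
# and for a single <page> element of the dump.
urlbase = "https://en.wikipedia.org/wiki"
page_id = "12"
title = "Anarchism"
page = ["Some wiki markup for this page.\n"]   # Extractor expects a list of lines

# get_url() now takes the base explicitly instead of reading a module-level global.
print(get_url(urlbase, page_id))               # -> https://en.wikipedia.org/wiki?curid=12

# Same (id, urlbase, title, page) order as the job tuples built in process_dump();
# the per-page URL is computed once in __init__ and reused by extract().
extractor = Extractor(page_id, urlbase, title, page)
print(extractor.url)                           # -> https://en.wikipedia.org/wiki?curid=12

Before this change extract.py kept its own urlbase = '' that was never assigned (WikiExtractor.py only set its own module's global), so the url written into each <doc> header degraded to a bare ?curid=... value; threading urlbase through the job tuple and the Extractor constructor removes that hidden coupling between the two modules.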