From 95ddfaa451e4c344f209920e56b46d2cea6bc99a Mon Sep 17 00:00:00 2001
From: attardi
Date: Wed, 16 Dec 2020 18:10:28 +0100
Subject: [PATCH] Fix to urlbase.

---
 wikiextractor/WikiExtractor.py | 16 +++++++---------
 wikiextractor/extract.py       | 13 +++++--------
 wikiextractor/extractPage.py   |  2 +-
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 01f1790..1335555 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -62,7 +62,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
 # ===========================================================================
 
 # Program version
-__version__ = '3.0.4'
+__version__ = '3.0.5'
 
 ##
 # Defined in <siteinfo>
@@ -80,10 +80,6 @@ templatePrefix = ''
 # It is the name associated with namespace key=828 in the siteinfo header.
 moduleNamespace = ''
 
-# This is obtained from <siteinfo>
-urlbase = ''
-
-
 # ----------------------------------------------------------------------
 # Modules
 
@@ -283,11 +279,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     :param file_compress: whether to compress files with bzip.
     :param process_count: number of extraction processes to spawn.
     """
-    global urlbase
     global knownNamespaces
     global templateNamespace, templatePrefix
     global moduleNamespace, modulePrefix
 
+    urlbase = ''  # This is obtained from <siteinfo>
+
     input = decode_open(input_file)
 
     # collect siteinfo
@@ -414,7 +411,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
             colon = title.find(':')
             if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                     not redirect and not title.startswith(templateNamespace)):
-                job = (id, title, page, ordinal)
+                job = (id, urlbase, title, page, ordinal)
                 jobs_queue.put(job)  # goes to any available extract_process
                 last_id = id
                 ordinal += 1
@@ -451,14 +448,15 @@ def extract_process(jobs_queue, output_queue, escape_doc):
     """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
     :param jobs_queue: where to get jobs.
     :param output_queue: where to queue extracted text for output.
+    :escape_doc: whether to convert entities in text to HTML.
     """
     while True:
         job = jobs_queue.get()  # job is (id, title, page, ordinal)
        if job:
             out = StringIO()  # memory buffer
-            Extractor(*job[:3]).extract(out, escape_doc)  # (id, title, page)
+            Extractor(*job[:4]).extract(out, escape_doc)  # (id, urlbase, title, page)
             text = out.getvalue()
-            output_queue.put((job[3], text))  # (ordinal, extracted_text)
+            output_queue.put((job[4], text))  # (ordinal, extracted_text)
             out.close()
         else:
             break
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index 26840b9..52b0eae 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -21,7 +21,7 @@
 import re
 import html
 from itertools import zip_longest
-import urllib.parse.quote as urlquote
+from urllib.parse import quote as urlquote
 from html.entities import name2codepoint
 import logging
 import time
@@ -34,9 +34,6 @@ syntaxhighlight = re.compile('&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlig
 
 ## PARAMS ####################################################################
 
-# This is obtained from <siteinfo>
-urlbase = ''
-
 ##
 # Defined in <siteinfo>
 # We include as default Template, when loading external template file.
@@ -62,7 +59,7 @@ discardElements = [
 acceptedNamespaces = ['w', 'wiktionary', 'wikt']
 
 
-def get_url(uid):
+def get_url(urlbase, uid):
     return "%s?curid=%s" % (urlbase, uid)
 
 
@@ -809,11 +806,12 @@ class Extractor():
     # Whether to output text with HTML formatting elements in files.
     HtmlFormatting = False
 
-    def __init__(self, id, title, page):
+    def __init__(self, id, urlbase, title, page):
         """
         :param page: a list of lines.
         """
         self.id = id
+        self.url = get_url(urlbase, id)
         self.title = title
         self.page = page
         self.magicWords = MagicWords()
@@ -850,8 +848,7 @@ class Extractor():
 
         logging.debug("%s\t%s", self.id, self.title)
         text = ''.join(self.page)
-        url = get_url(self.id)
-        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
+        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
         # Separate header from text with a newline.
         header += self.title + '\n\n'
         footer = "\n</doc>\n"
diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py
index 9a10d8d..1e40410 100755
--- a/wikiextractor/extractPage.py
+++ b/wikiextractor/extractPage.py
@@ -34,7 +34,7 @@
 import bz2
 
 # Program version
-__version__ = '3.0.4'
+__version__ = '3.0.5'
 
 # ----------------------------------------------------------------------
 # READER
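
For reviewers, a minimal usage sketch (not part of the patch) of the reworked interface, assuming the package is importable as wikiextractor; the base URL, page id, title and page text below are made-up stand-ins for what process_dump() actually reads from the dump's <siteinfo> and <page> elements.

from wikiextractor.extract import Extractor, get_url

# Hypothetical values standing in for the base URL taken from <siteinfo>
# and for a single <page> element of the dump.
urlbase = "https://en.wikipedia.org/wiki"
page_id = "12"
title = "Anarchism"
page = ["Some wiki markup for this page.\n"]   # Extractor expects a list of lines

# get_url() now takes the base explicitly instead of reading a module-level global.
print(get_url(urlbase, page_id))               # -> https://en.wikipedia.org/wiki?curid=12

# Same (id, urlbase, title, page) order as the job tuples built in process_dump();
# the per-page URL is computed once in __init__ and reused by extract().
extractor = Extractor(page_id, urlbase, title, page)
print(extractor.url)                           # -> https://en.wikipedia.org/wiki?curid=12

Before this change extract.py kept its own urlbase = '' that was never assigned (WikiExtractor.py only set its own module's global), so the url written into each <doc> header degraded to a bare ?curid=... value; threading urlbase through the job tuple and the Extractor constructor removes that hidden coupling between the two modules.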