Fix to urlbase: pass the urlbase obtained from <siteinfo> through each job tuple to the Extractor, instead of relying on a module-level global.

attardi committed 2020-12-16 18:10:28 +01:00
parent 1c57c06596
commit 95ddfaa451
3 changed files with 13 additions and 18 deletions

File 1 of 3

@@ -62,7 +62,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
 # ===========================================================================

 # Program version
-__version__ = '3.0.4'
+__version__ = '3.0.5'

 ##
 # Defined in <siteinfo>
@@ -80,10 +80,6 @@ templatePrefix = ''
 # It is the name associated with namespace key=828 in the siteinfo header.
 moduleNamespace = ''
-
-# This is obtained from <siteinfo>
-urlbase = ''
-
 # ----------------------------------------------------------------------
 # Modules
@@ -283,11 +279,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     :param file_compress: whether to compress files with bzip.
     :param process_count: number of extraction processes to spawn.
     """
-    global urlbase
     global knownNamespaces
     global templateNamespace, templatePrefix
     global moduleNamespace, modulePrefix

+    urlbase = ''                # This is obtained from <siteinfo>
+
     input = decode_open(input_file)

     # collect siteinfo
@@ -414,7 +411,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
                colon = title.find(':')
                if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                        not redirect and not title.startswith(templateNamespace)):
-                   job = (id, title, page, ordinal)
+                   job = (id, urlbase, title, page, ordinal)
                    jobs_queue.put(job)  # goes to any available extract_process
                    last_id = id
                    ordinal += 1
@@ -451,14 +448,15 @@ def extract_process(jobs_queue, output_queue, escape_doc):
     """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
     :param jobs_queue: where to get jobs.
     :param output_queue: where to queue extracted text for output.
+    :escape_doc: whether to convert entities in text to HTML.
     """
     while True:
         job = jobs_queue.get()  # job is (id, title, page, ordinal)
         if job:
             out = StringIO()  # memory buffer
-            Extractor(*job[:3]).extract(out, escape_doc)  # (id, title, page)
+            Extractor(*job[:4]).extract(out, escape_doc)  # (id, urlbase, title, page)
             text = out.getvalue()
-            output_queue.put((job[3], text))  # (ordinal, extracted_text)
+            output_queue.put((job[4], text))  # (ordinal, extracted_text)
             out.close()
         else:
             break

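The hunks above replace the module-global urlbase with a value carried inside each job tuple, so extract workers receive it through the queue rather than through shared module state, which spawned worker processes would not see. Below is a minimal runnable sketch of that pattern, not the project's code: the worker stands in for extract_process and Extractor, and the None sentinel and names are illustrative.

from multiprocessing import Process, Queue

def extract_worker(jobs_queue, output_queue):
    while True:
        job = jobs_queue.get()              # (id, urlbase, title, page, ordinal)
        if job is None:                     # sentinel: no more work
            break
        id, urlbase, title, page, ordinal = job
        url = "%s?curid=%s" % (urlbase, id)  # same formula as get_url(urlbase, uid)
        output_queue.put((ordinal, '<doc id="%s" url="%s" title="%s">' % (id, url, title)))

if __name__ == '__main__':
    jobs_queue, output_queue = Queue(), Queue()
    worker = Process(target=extract_worker, args=(jobs_queue, output_queue))
    worker.start()
    jobs_queue.put((12, 'https://en.wikipedia.org/wiki/', 'Anarchism', [], 0))
    jobs_queue.put(None)
    print(output_queue.get())               # (0, '<doc id="12" url="...?curid=12" title="Anarchism">')
    worker.join()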
File 2 of 3

@@ -21,7 +21,7 @@
 import re
 import html
 from itertools import zip_longest
-import urllib.parse.quote as urlquote
+from urllib.parse import quote as urlquote
 from html.entities import name2codepoint
 import logging
 import time
@@ -34,9 +34,6 @@ syntaxhighlight = re.compile('&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlight&gt;', re.DOTALL)
 ## PARAMS ####################################################################

-# This is obtained from <siteinfo>
-urlbase = ''
-
 ##
 # Defined in <siteinfo>
 # We include as default Template, when loading external template file.
@@ -62,7 +59,7 @@ discardElements = [
 acceptedNamespaces = ['w', 'wiktionary', 'wikt']


-def get_url(uid):
+def get_url(urlbase, uid):
     return "%s?curid=%s" % (urlbase, uid)
@@ -809,11 +806,12 @@ class Extractor():
     # Whether to output text with HTML formatting elements in <doc> files.
     HtmlFormatting = False

-    def __init__(self, id, title, page):
+    def __init__(self, id, urlbase, title, page):
         """
         :param page: a list of lines.
         """
         self.id = id
+        self.url = get_url(urlbase, id)
         self.title = title
         self.page = page
         self.magicWords = MagicWords()
@@ -850,8 +848,7 @@
         logging.debug("%s\t%s", self.id, self.title)
         text = ''.join(self.page)
-        url = get_url(self.id)
-        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
+        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
         # Separate header from text with a newline.
         header += self.title + '\n\n'
         footer = "\n</doc>\n"

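Two independent fixes land in this file (the extract module imported by the script above). First, the urllib import: "import urllib.parse.quote as urlquote" raises ModuleNotFoundError at import time, because "import X as Y" only accepts module paths, while "from urllib.parse import quote" imports the function. Second, get_url() now takes urlbase explicitly and Extractor binds self.url once in __init__. A small sketch of the new construction path, using a stub class that keeps only the fields the diff touches:

from urllib.parse import quote as urlquote  # corrected form; 'import urllib.parse.quote as x' fails

def get_url(urlbase, uid):
    # urlbase arrives as an argument instead of a module global
    return "%s?curid=%s" % (urlbase, uid)

class Extractor():
    def __init__(self, id, urlbase, title, page):
        self.id = id
        self.url = get_url(urlbase, id)  # computed once; reused when the <doc> header is built
        self.title = title
        self.page = page

e = Extractor(12, 'https://en.wikipedia.org/wiki/', 'Anarchism', [])
print(e.url)                             # https://en.wikipedia.org/wiki/?curid=12
print(urlquote('San José'))              # San%20Jos%C3%A9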
File 3 of 3

@@ -34,7 +34,7 @@ import bz2

 # Program version
-__version__ = '3.0.4'
+__version__ = '3.0.5'

 # ----------------------------------------------------------------------
 # READER