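Fix urlbase: pass it explicitly through the job tuple to Extractor and get_url instead of reading a module-level global.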
This commit is contained in:
parent
1c57c06596
commit
95ddfaa451
@ -62,7 +62,7 @@ from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
|
||||
# ===========================================================================
|
||||
|
||||
# Program version
|
||||
__version__ = '3.0.4'
|
||||
__version__ = '3.0.5'
|
||||
|
||||
##
|
||||
# Defined in <siteinfo>
|
||||
@ -80,10 +80,6 @@ templatePrefix = ''
|
||||
# It is the name associated with namespace key=828 in the siteinfo header.
|
||||
moduleNamespace = ''
|
||||
|
||||
# This is obtained from <siteinfo>
|
||||
urlbase = ''
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Modules
|
||||
|
||||
@ -283,11 +279,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
:param file_compress: whether to compress files with bzip.
|
||||
:param process_count: number of extraction processes to spawn.
|
||||
"""
|
||||
global urlbase
|
||||
global knownNamespaces
|
||||
global templateNamespace, templatePrefix
|
||||
global moduleNamespace, modulePrefix
|
||||
|
||||
urlbase = '' # This is obtained from <siteinfo>
|
||||
|
||||
input = decode_open(input_file)
|
||||
|
||||
# collect siteinfo
|
||||
@ -414,7 +411,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
colon = title.find(':')
|
||||
if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
|
||||
not redirect and not title.startswith(templateNamespace)):
|
||||
job = (id, title, page, ordinal)
|
||||
job = (id, urlbase, title, page, ordinal)
|
||||
jobs_queue.put(job) # goes to any available extract_process
|
||||
last_id = id
|
||||
ordinal += 1
|
||||
@ -451,14 +448,15 @@ def extract_process(jobs_queue, output_queue, escape_doc):
|
||||
"""Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
|
||||
:param jobs_queue: where to get jobs.
|
||||
:param output_queue: where to queue extracted text for output.
|
||||
:param escape_doc: whether to convert entities in text to HTML.
|
||||
"""
|
||||
while True:
|
||||
job = jobs_queue.get() # job is (id, urlbase, title, page, ordinal)
|
||||
if job:
|
||||
out = StringIO() # memory buffer
|
||||
Extractor(*job[:3]).extract(out, escape_doc) # (id, title, page)
|
||||
Extractor(*job[:4]).extract(out, escape_doc) # (id, urlbase, title, page)
|
||||
text = out.getvalue()
|
||||
output_queue.put((job[3], text)) # (ordinal, extracted_text)
|
||||
output_queue.put((job[4], text)) # (ordinal, extracted_text)
|
||||
out.close()
|
||||
else:
|
||||
break
|
||||
|
@ -21,7 +21,7 @@
|
||||
import re
|
||||
import html
|
||||
from itertools import zip_longest
|
||||
import urllib.parse.quote as urlquote
|
||||
from urllib.parse import quote as urlquote
|
||||
from html.entities import name2codepoint
|
||||
import logging
|
||||
import time
|
||||
@ -34,9 +34,6 @@ syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlig
|
||||
|
||||
## PARAMS ####################################################################
|
||||
|
||||
# This is obtained from <siteinfo>
|
||||
urlbase = ''
|
||||
|
||||
##
|
||||
# Defined in <siteinfo>
|
||||
# We include as default Template, when loading external template file.
|
||||
@ -62,7 +59,7 @@ discardElements = [
|
||||
acceptedNamespaces = ['w', 'wiktionary', 'wikt']
|
||||
|
||||
|
||||
def get_url(uid):
|
||||
def get_url(urlbase, uid):
|
||||
return "%s?curid=%s" % (urlbase, uid)
|
||||
|
||||
|
||||
@ -809,11 +806,12 @@ class Extractor():
|
||||
# Whether to output text with HTML formatting elements in <doc> files.
|
||||
HtmlFormatting = False
|
||||
|
||||
def __init__(self, id, title, page):
|
||||
def __init__(self, id, urlbase, title, page):
|
||||
"""
|
||||
:param page: a list of lines.
|
||||
"""
|
||||
self.id = id
|
||||
self.url = get_url(urlbase, id)
|
||||
self.title = title
|
||||
self.page = page
|
||||
self.magicWords = MagicWords()
|
||||
@ -850,8 +848,7 @@ class Extractor():
|
||||
logging.debug("%s\t%s", self.id, self.title)
|
||||
text = ''.join(self.page)
|
||||
|
||||
url = get_url(self.id)
|
||||
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
|
||||
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
|
||||
# Separate header from text with a newline.
|
||||
header += self.title + '\n\n'
|
||||
footer = "\n</doc>\n"
|
||||
|
@ -34,7 +34,7 @@ import bz2
|
||||
|
||||
|
||||
# Program version
|
||||
__version__ = '3.0.4'
|
||||
__version__ = '3.0.5'
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# READER
|
||||
|
Loading…
Reference in New Issue
Block a user