Fully working on Python 3.

attardi 2020-12-05 11:21:46 +01:00
parent 5b4302bca0
commit 0933664d70
2 changed files with 12 additions and 16 deletions

File 1 of 2

@@ -58,12 +58,12 @@ from io import StringIO
 from multiprocessing import Queue, Process, cpu_count
 from timeit import default_timer
-from .extract import Extractor, ignoreTag, define_template
+from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces

 # ===========================================================================
 # Program version
-__version__ = '3.0.2'
+__version__ = '3.0.3'

 ##
 # Defined in <siteinfo>
@@ -136,7 +136,7 @@ class NextFile(object):
     def _dirname(self):
         char1 = self.dir_index % 26
-        char2 = self.dir_index / 26 % 26
+        char2 = int(self.dir_index / 26) % 26
         return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

     def _filepath(self):
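
Note on the `_dirname` fix: in Python 2, `/` on two ints truncated, but in Python 3 it returns a float, and a float `char2` breaks the `'%c'` formatting on the next line. A minimal sketch of the difference (not part of the commit); floor division `//` would express the same intent directly:

    dir_index = 27
    char2 = int(dir_index / 26) % 26   # the commit's fix: truncate the float back to int
    assert char2 == 1
    char2 = dir_index // 26 % 26       # equivalent in Python 3, via floor division
    assert char2 == 1
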
@@ -201,7 +201,7 @@ def load_templates(file, output_file=None):
     page = []
     inText = False
     if output_file:
-        output = open(output_file, 'wb')
+        output = open(output_file, 'w')
     for line in file:
         line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
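
Note on `open(output_file, 'w')`: Python 3 separates str from bytes, and a file opened with 'wb' accepts only bytes, so writing the decoded str lines would raise TypeError. A small illustration (hypothetical file names, not part of the commit):

    with open('/tmp/templates.bin', 'wb') as out:
        try:
            out.write('{{template}}\n')    # str into a binary-mode file
        except TypeError:
            pass                           # Python 3 rejects this

    with open('/tmp/templates.txt', 'w') as out:
        out.write('{{template}}\n')        # text mode accepts str and encodes it
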
@@ -352,7 +352,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     # start worker processes
     logging.info("Using %d extract processes.", process_count)
     workers = []
-    for _ in xrange(max(1, process_count)):
+    for _ in range(max(1, process_count)):
         extractor = Process(target=extract_process,
                             args=(jobs_queue, output_queue))
         extractor.daemon = True  # only live while parent process lives
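
Note on `xrange`: the name was removed in Python 3, where `range` is itself a lazy sequence, so the swap changes nothing about memory use. A quick sketch (not part of the commit):

    r = range(max(1, 4))            # a lazy range object, like Python 2's xrange
    assert list(r) == [0, 1, 2, 3]  # values are produced only on iteration
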

File 2 of 2

@@ -19,7 +19,7 @@
 # =============================================================================
 import re
-import cgi
+import html
 from itertools import zip_longest
 import urllib
 from html.entities import name2codepoint
@@ -63,7 +63,7 @@ acceptedNamespaces = ['w', 'wiktionary', 'wikt']
 def get_url(uid):
-    return "%s?curid=%s" % (options.urlbase, uid)
+    return "%s?curid=%s" % (urlbase, uid)

 # ======================================================================
@@ -170,7 +170,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
     if escape_doc:
-        text = cgi.escape(text)
+        text = html.escape(text)
     return text
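
Note on `cgi.escape` → `html.escape`: `cgi.escape` was deprecated in Python 3.2 and removed in 3.8. One behavioral difference to be aware of: `html.escape` also escapes quotes by default, which `cgi.escape` only did on request. Sketch (not part of the commit):

    import html

    print(html.escape('a < b & "c"'))               # a &lt; b &amp; &quot;c&quot;
    print(html.escape('a < b & "c"', quote=False))  # a &lt; b &amp; "c"  (closest to cgi.escape's default)
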
@@ -212,9 +212,7 @@ def compact(text, mark_headers=False):
             headers[lev] = title
             # drop previous headers
-            for i in headers.keys():
-                if i > lev:
-                    del headers[i]
+            headers = { k:v for k,v in headers.items() if k > lev }
             emptySection = True
             continue
         # Handle page title
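
Note on the comprehension: it also sidesteps a Python 3 pitfall in the old loop, since `dict.keys()` now returns a live view and deleting entries while iterating it raises RuntimeError (Python 2 handed back a list copy, so the loop was safe there). Minimal illustration (not part of the commit):

    headers = {1: 'H1', 2: 'H2', 3: 'H3'}
    try:
        for i in headers.keys():   # a live view in Python 3
            if i > 1:
                del headers[i]     # mutating the dict mid-iteration
    except RuntimeError:
        pass                       # "dictionary changed size during iteration"

    # Rebuilding the dict with a comprehension never mutates during iteration:
    headers = {1: 'H1', 2: 'H2', 3: 'H3'}
    headers = {k: v for k, v in headers.items() if k <= 1}
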
@@ -268,8 +266,7 @@ def compact(text, mark_headers=False):
             continue
         elif len(headers):
             if Extractor.keepSections:
-                items = headers.items()
-                items.sort()
+                items = sorted(headers.items())
                 for (i, v) in items:
                     page.append(v)
             headers.clear()
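
Note on `sorted(headers.items())`: in Python 3, `items()` returns a view object with no `.sort()` method, so the two-step copy-and-sort collapses into one call. Sketch (not part of the commit):

    headers = {2: '== History ==', 1: 'Title'}
    items = sorted(headers.items())    # views cannot be sorted in place
    assert items == [(1, 'Title'), (2, '== History ==')]
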
@@ -497,7 +494,7 @@ def makeInternalLink(title, label):
     if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
         return ''
     if Extractor.keepLinks:
-        return '<a href="%s">%s</a>' % (urllib.quote(title.encode('utf-8')), label)
+        return '<a href="%s">%s</a>' % (urllib.quote(title), label)
     else:
         return label
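
Note on the link quoting: dropping `.encode('utf-8')` fits Python 3, where the quote function accepts str and percent-encodes via UTF-8 by default. However, Python 3 moved the function to `urllib.parse`, so with only `import urllib` in scope the call as written would raise AttributeError; a Python 3 sketch (not part of the commit):

    from urllib.parse import quote

    title = 'Löwenbräu'   # quote() encodes str to UTF-8 internally
    print('<a href="%s">%s</a>' % (quote(title), 'label'))
    # <a href="L%C3%B6wenbr%C3%A4u">label</a>
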
@@ -860,14 +857,13 @@ class Extractor(object):
         header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
         # Separate header from text with a newline.
         header += self.title + '\n\n'
-        header = header.encode('utf-8')
         footer = "\n</doc>\n"
         out.write(header)
         text = self.clean_text(text)
         for line in text:
-            out.write(line.encode('utf-8'))
+            out.write(line)
             out.write('\n')
         out.write(footer)
         errs = (self.template_title_errs,
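
Note on the dropped `.encode('utf-8')` calls: they follow from writing to a text-mode stream, which accepts str and performs the encoding itself. Sketch with `io.StringIO` standing in for the output file (not part of the commit):

    from io import StringIO

    out = StringIO()            # behaves like a text-mode file
    out.write('<doc id="1" url="u" title="t">\n')
    out.write('body line\n')    # plain str, no .encode() needed
    out.write('\n</doc>\n')
    assert out.getvalue().startswith('<doc')
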