Fully working on Python 3.
This commit is contained in:
parent
5b4302bca0
commit
0933664d70
@ -58,12 +58,12 @@ from io import StringIO
|
|||||||
from multiprocessing import Queue, Process, cpu_count
|
from multiprocessing import Queue, Process, cpu_count
|
||||||
from timeit import default_timer
|
from timeit import default_timer
|
||||||
|
|
||||||
from .extract import Extractor, ignoreTag, define_template
|
from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
|
||||||
|
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
|
|
||||||
# Program version
|
# Program version
|
||||||
__version__ = '3.0.2'
|
__version__ = '3.0.3'
|
||||||
|
|
||||||
##
|
##
|
||||||
# Defined in <siteinfo>
|
# Defined in <siteinfo>
|
||||||
@ -136,7 +136,7 @@ class NextFile(object):
|
|||||||
|
|
||||||
def _dirname(self):
|
def _dirname(self):
|
||||||
char1 = self.dir_index % 26
|
char1 = self.dir_index % 26
|
||||||
char2 = self.dir_index / 26 % 26
|
char2 = int(self.dir_index / 26) % 26
|
||||||
return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
|
return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
|
||||||
|
|
||||||
def _filepath(self):
|
def _filepath(self):
|
||||||
@ -201,7 +201,7 @@ def load_templates(file, output_file=None):
|
|||||||
page = []
|
page = []
|
||||||
inText = False
|
inText = False
|
||||||
if output_file:
|
if output_file:
|
||||||
output = open(output_file, 'wb')
|
output = open(output_file, 'w')
|
||||||
for line in file:
|
for line in file:
|
||||||
line = line.decode('utf-8')
|
line = line.decode('utf-8')
|
||||||
if '<' not in line: # faster than doing re.search()
|
if '<' not in line: # faster than doing re.search()
|
||||||
@ -352,7 +352,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
|||||||
# start worker processes
|
# start worker processes
|
||||||
logging.info("Using %d extract processes.", process_count)
|
logging.info("Using %d extract processes.", process_count)
|
||||||
workers = []
|
workers = []
|
||||||
for _ in xrange(max(1, process_count)):
|
for _ in range(max(1, process_count)):
|
||||||
extractor = Process(target=extract_process,
|
extractor = Process(target=extract_process,
|
||||||
args=(jobs_queue, output_queue))
|
args=(jobs_queue, output_queue))
|
||||||
extractor.daemon = True # only live while parent process lives
|
extractor.daemon = True # only live while parent process lives
|
||||||
|
@ -19,7 +19,7 @@
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import cgi
|
import html
|
||||||
from itertools import zip_longest
|
from itertools import zip_longest
|
||||||
import urllib
|
import urllib
|
||||||
from html.entities import name2codepoint
|
from html.entities import name2codepoint
|
||||||
@ -63,7 +63,7 @@ acceptedNamespaces = ['w', 'wiktionary', 'wikt']
|
|||||||
|
|
||||||
|
|
||||||
def get_url(uid):
|
def get_url(uid):
|
||||||
return "%s?curid=%s" % (options.urlbase, uid)
|
return "%s?curid=%s" % (urlbase, uid)
|
||||||
|
|
||||||
|
|
||||||
# ======================================================================
|
# ======================================================================
|
||||||
@ -170,7 +170,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
|||||||
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
||||||
text = text.replace(',,', ',').replace(',.', '.')
|
text = text.replace(',,', ',').replace(',.', '.')
|
||||||
if escape_doc:
|
if escape_doc:
|
||||||
text = cgi.escape(text)
|
text = html.escape(text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
@ -212,9 +212,7 @@ def compact(text, mark_headers=False):
|
|||||||
|
|
||||||
headers[lev] = title
|
headers[lev] = title
|
||||||
# drop previous headers
|
# drop previous headers
|
||||||
for i in headers.keys():
|
headers = { k:v for k,v in headers.items() if k > lev }
|
||||||
if i > lev:
|
|
||||||
del headers[i]
|
|
||||||
emptySection = True
|
emptySection = True
|
||||||
continue
|
continue
|
||||||
# Handle page title
|
# Handle page title
|
||||||
@ -268,8 +266,7 @@ def compact(text, mark_headers=False):
|
|||||||
continue
|
continue
|
||||||
elif len(headers):
|
elif len(headers):
|
||||||
if Extractor.keepSections:
|
if Extractor.keepSections:
|
||||||
items = headers.items()
|
items = sorted(headers.items())
|
||||||
items.sort()
|
|
||||||
for (i, v) in items:
|
for (i, v) in items:
|
||||||
page.append(v)
|
page.append(v)
|
||||||
headers.clear()
|
headers.clear()
|
||||||
@ -497,7 +494,7 @@ def makeInternalLink(title, label):
|
|||||||
if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
|
if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
|
||||||
return ''
|
return ''
|
||||||
if Extractor.keepLinks:
|
if Extractor.keepLinks:
|
||||||
return '<a href="%s">%s</a>' % (urllib.quote(title.encode('utf-8')), label)
|
return '<a href="%s">%s</a>' % (urllib.quote(title), label)
|
||||||
else:
|
else:
|
||||||
return label
|
return label
|
||||||
|
|
||||||
@ -860,14 +857,13 @@ class Extractor(object):
|
|||||||
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
|
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
|
||||||
# Separate header from text with a newline.
|
# Separate header from text with a newline.
|
||||||
header += self.title + '\n\n'
|
header += self.title + '\n\n'
|
||||||
header = header.encode('utf-8')
|
|
||||||
footer = "\n</doc>\n"
|
footer = "\n</doc>\n"
|
||||||
out.write(header)
|
out.write(header)
|
||||||
|
|
||||||
text = self.clean_text(text)
|
text = self.clean_text(text)
|
||||||
|
|
||||||
for line in text:
|
for line in text:
|
||||||
out.write(line.encode('utf-8'))
|
out.write(line)
|
||||||
out.write('\n')
|
out.write('\n')
|
||||||
out.write(footer)
|
out.write(footer)
|
||||||
errs = (self.template_title_errs,
|
errs = (self.template_title_errs,
|
||||||
|
Loading…
Reference in New Issue
Block a user