diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 152cd78..27056e2 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -58,12 +58,12 @@ from io import StringIO
 from multiprocessing import Queue, Process, cpu_count
 from timeit import default_timer
 
-from .extract import Extractor, ignoreTag, define_template
+from .extract import Extractor, ignoreTag, define_template, acceptedNamespaces
 
 # ===========================================================================
 
 # Program version
-__version__ = '3.0.2'
+__version__ = '3.0.3'
 
 ##
 # Defined in <siteinfo>
@@ -136,7 +136,7 @@ class NextFile(object):
 
     def _dirname(self):
         char1 = self.dir_index % 26
-        char2 = self.dir_index / 26 % 26
+        char2 = int(self.dir_index / 26) % 26
         return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
 
     def _filepath(self):
@@ -201,7 +201,7 @@ def load_templates(file, output_file=None):
     page = []
     inText = False
     if output_file:
-        output = open(output_file, 'wb')
+        output = open(output_file, 'w')
     for line in file:
         line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
@@ -352,7 +352,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     # start worker processes
     logging.info("Using %d extract processes.", process_count)
     workers = []
-    for _ in xrange(max(1, process_count)):
+    for _ in range(max(1, process_count)):
         extractor = Process(target=extract_process,
                             args=(jobs_queue, output_queue))
         extractor.daemon = True  # only live while parent process lives
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index dcfcd83..c0d9d75 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -19,7 +19,7 @@
 # =============================================================================
 
 import re
-import cgi
+import html
 from itertools import zip_longest
 import urllib
 from html.entities import name2codepoint
@@ -63,7 +63,7 @@ acceptedNamespaces = ['w', 'wiktionary', 'wikt']
 
 
 def get_url(uid):
-    return "%s?curid=%s" % (options.urlbase, uid)
+    return "%s?curid=%s" % (urlbase, uid)
 
 
 # ======================================================================
@@ -170,7 +170,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
     if escape_doc:
-        text = cgi.escape(text)
+        text = html.escape(text)
     return text
 
 
@@ -212,9 +212,7 @@ def compact(text, mark_headers=False):
             headers[lev] = title
 
             # drop previous headers
-            for i in headers.keys():
-                if i > lev:
-                    del headers[i]
+            headers = { k:v for k,v in headers.items() if k > lev }
             emptySection = True
             continue
         # Handle page title
@@ -268,8 +266,7 @@ def compact(text, mark_headers=False):
                 continue
         elif len(headers):
             if Extractor.keepSections:
-                items = headers.items()
-                items.sort()
+                items = sorted(headers.items())
                 for (i, v) in items:
                     page.append(v)
             headers.clear()
@@ -497,7 +494,7 @@ def makeInternalLink(title, label):
         if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
             return ''
     if Extractor.keepLinks:
-        return '<a href="%s">%s</a>' % (urllib.quote(title.encode('utf-8')), label)
+        return '<a href="%s">%s</a>' % (urllib.quote(title), label)
    else:
        return label
 
@@ -860,14 +857,13 @@ class Extractor(object):
             header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
             # Separate header from text with a newline.
             header += self.title + '\n\n'
-            header = header.encode('utf-8')
             footer = "\n</doc>\n"
             out.write(header)
             text = self.clean_text(text)
             for line in text:
-                out.write(line.encode('utf-8'))
+                out.write(line)
                 out.write('\n')
             out.write(footer)
             errs = (self.template_title_errs,
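
Notes on the changes above. The _dirname hunk is a Python 3 division fix: in Python 3 the / operator is true division and returns a float, so '%c' % (ord('A') + char2) would raise a TypeError. A minimal sketch of the behavior (floor division // would be the equally correct, more idiomatic form):

    # Python 2: 53 / 26 == 2 (int); Python 3: 53 / 26 == 2.038... (float),
    # which makes '%c' % (ord('A') + char2) raise TypeError.
    dir_index = 53
    char1 = dir_index % 26              # 1
    char2 = int(dir_index / 26) % 26    # the patch's fix: truncate back to int
    assert char2 == dir_index // 26 % 26 == 2
    print('%c%c' % (ord('A') + char2, ord('A') + char1))  # -> 'CB'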
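
The cgi.escape -> html.escape substitution in clean() is the standard Python 3 replacement (cgi.escape was removed in Python 3.8), but the defaults differ slightly: html.escape also escapes quote characters unless quote=False is passed, so escaped documents gain &quot; entities under this patch. A short illustration:

    import html

    text = 'AC/DC & "Back in Black"'
    print(html.escape(text, quote=False))  # 'AC/DC &amp; "Back in Black"' -- the old cgi.escape default
    print(html.escape(text))               # also escapes " and ' -- what clean() now produces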
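
The compact() hunk that replaces the header-dropping loop exists because deleting from a dict while iterating over its keys() view raises RuntimeError in Python 3 (and the nearby items.sort() change is similar: dict views have no .sort() method, hence sorted()). Note, though, that the deleted loop removed levels deeper than the new header (i > lev), while the comprehension keeps exactly those keys, so the predicate appears inverted and also drops the just-assigned headers[lev] entry. A one-liner faithful to the deleted loop would be:

    # Keep levels at or above the new header and drop deeper ones,
    # matching the removed 'del headers[i] for i > lev' loop.
    headers = {k: v for k, v in headers.items() if k <= lev}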
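
The makeInternalLink hunk drops the .encode('utf-8') for the same text-model reason, but urllib.quote is itself a Python 2 name; in Python 3 the function lives in urllib.parse. A sketch of the Python 3 spelling of that call:

    from urllib.parse import quote

    # Python 3 equivalent of the urllib.quote(title) call in makeInternalLink.
    href = quote('Santa María')   # -> 'Santa%20Mar%C3%ADa'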
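
The remaining write-path changes follow the Python 3 text model: files are opened in text mode ('w' instead of 'wb'), write() then takes str, and the explicit .encode('utf-8') calls before out.write(...) disappear. A minimal sketch, with a hypothetical file name:

    # Text-mode I/O in Python 3: pass str straight to write();
    # writing bytes to a text-mode file raises TypeError instead.
    with open('templates.txt', 'w', encoding='utf-8') as output:
        output.write('{{Infobox person}}\n')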