From a2e078f3beb571e2ceee84c0ac42d19e220ed339 Mon Sep 17 00:00:00 2001
From: attardi
Date: Sat, 5 Dec 2020 19:27:01 +0100
Subject: [PATCH] Better handling of encoding.

---
 wikiextractor/WikiExtractor.py | 48 ++++++++++++++++++++--------------
 wikiextractor/extract.py       | 45 +++++++++++++++----------------
 wikiextractor/extractPage.py   | 10 +++----
 3 files changed, 54 insertions(+), 49 deletions(-)

diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 27056e2..b4a6ed1 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -49,7 +49,6 @@ collecting template definitions.
 
 import argparse
 import bz2
-import fileinput
 import logging
 import os.path
 import re  # TODO use regex when it will be standard
@@ -112,7 +111,7 @@ modules = {
 # Output
 
 
-class NextFile(object):
+class NextFile():
 
     """
     Synchronous generation of next available file name.
@@ -143,7 +142,7 @@
         return '%s/wiki_%02d' % (self._dirname(), self.file_index)
 
 
-class OutputSplitter(object):
+class OutputSplitter():
 
     """
     File-like object, that splits output to multiple files of a given max size.
@@ -203,7 +202,7 @@ def load_templates(file, output_file=None):
     if output_file:
         output = open(output_file, 'w')
     for line in file:
-        line = line.decode('utf-8')
+        #line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if inText:
                 page.append(line)
@@ -238,18 +237,18 @@
             # FIXME: should reconstruct also moduleNamespace
             if title.startswith(templatePrefix):
                 define_template(title, page)
+                templates += 1
             # save templates and modules to file
             if output_file and (title.startswith(templatePrefix) or
                                 title.startswith(modulePrefix)):
                 output.write('<page>\n')
-                output.write('   <title>%s</title>\n' % title)
+                output.write('   <title>%s</title>\n' % title.encode('utf-8'))
                 output.write('   <ns>10</ns>\n')
                 output.write('   <text>')
                 for line in page:
-                    output.write(line)
+                    output.write(line.encode('utf-8'))
                 output.write('   </text>\n')
                 output.write('</page>\n')
-                templates += 1
             page = []
             articles += 1
             if articles % 100000 == 0:
@@ -260,6 +259,20 @@
     return templates
 
 
+def decode_open(filename, mode='rt', encoding='utf-8'):
+    """
+    Open a file, decoding and decompressing it depending on its extension (`gz` or `bz2`).
+ """ + ext = os.path.splitext(filename)[1] + if ext == '.gz': + import gzip + return gzip.open(filename, mode) + elif ext == '.bz2': + return bz2.open(filename, mode=mode, encoding=encoding) + else: + return open(filename, mode, encoding=encoding) + + def process_dump(input_file, template_file, out_file, file_size, file_compress, process_count): """ @@ -275,14 +288,11 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, global templateNamespace, templatePrefix global moduleNamespace, modulePrefix - if input_file == '-': - input = sys.stdin - else: - input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + input = decode_open(input_file) # collect siteinfo for line in input: - line = line.decode('utf-8') + line = line #.decode('utf-8') m = tagRE.search(line) if not m: continue @@ -308,7 +318,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, template_load_start = default_timer() if template_file and os.path.exists(template_file): logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file) - file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed) + file = decode_open(template_file) templates = load_templates(file) file.close() else: @@ -318,7 +328,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file) templates = load_templates(input, template_file) input.close() - input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + input = decode_open(input_file) template_load_elapsed = default_timer() - template_load_start logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed) @@ -370,7 +380,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, inText = False redirect = False for line in input: - line = line.decode('utf-8') + #line = line.decode('utf-8') if '<' not in line: # faster than doing re.search() if inText: page.append(line) @@ -402,8 +412,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, page.append(line) elif tag == '/page': colon = title.find(':') - if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \ - not redirect and not title.startswith(templateNamespace): + if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and + not redirect and not title.startswith(templateNamespace)): job = (id, title, page, ordinal) jobs_queue.put(job) # goes to any available extract_process last_id = id @@ -539,7 +549,7 @@ def main(): args = parser.parse_args() Extractor.keepLinks = args.links - Extractor.toHTML = args.html + Extractor.HtmlFormatting = args.html if args.html: Extractor.keepLinks = True @@ -583,7 +593,7 @@ def main(): load_templates(file) with open(input_file) as file: - page = file.read().decode('utf-8') + page = file.read()#.decode('utf-8') m = re.search(r'(.*)', page) id = m.group(1) if m else 0 m = re.search(r'(.*)', page) diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py index c0d9d75..5dd2a93 100644 --- a/wikiextractor/extract.py +++ b/wikiextractor/extract.py @@ -73,6 +73,11 @@ def clean(extractor, text, expand_templates=False, escape_doc=True): """ Transforms wiki markup. 
     If the command line flag --escapedoc is set then the text is also escaped
     @see https://www.mediawiki.org/wiki/Help:Formatting
+    :param extractor: the Extractor to use.
+    :param text: the text to clean.
+    :param expand_templates: whether to perform template expansion.
+    :param escape_doc: whether to convert special characters to HTML entities.
+    @return: the cleaned text.
     """
     if expand_templates:
@@ -107,7 +112,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
         text = res + unescape(text[cur:])
 
     # Handle bold/italic/quote
-    if extractor.toHTML:
+    if extractor.HtmlFormatting:
         text = bold_italic.sub(r'<b>\1</b>', text)
         text = bold.sub(r'<b>\1</b>', text)
         text = italic.sub(r'<i>\1</i>', text)
@@ -146,7 +151,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     for tag in discardElements:
         text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
 
-    if not extractor.toHTML:
+    if not extractor.HtmlFormatting:
         # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
         text = unescape(text)
 
@@ -170,7 +175,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
     if escape_doc:
-        text = html.escape(text)
+        text = html.escape(text, quote=False)
     return text
 
 
@@ -202,7 +207,7 @@ def compact(text, mark_headers=False):
         if m:
             title = m.group(2)
             lev = len(m.group(1))
-            if Extractor.toHTML:
+            if Extractor.HtmlFormatting:
                 page.append("<h%d>%s</h%d>" % (lev, title, lev))
             if title and title[-1] not in '!?':
                 title += '.'
@@ -212,7 +217,7 @@ def compact(text, mark_headers=False):
             headers[lev] = title
 
             # drop previous headers
-            headers = { k:v for k,v in headers.items() if k > lev }
+            headers = { k:v for k,v in headers.items() if k <= lev }
             emptySection = True
             continue
         # Handle page title
@@ -228,7 +233,7 @@ def compact(text, mark_headers=False):
             continue
         # handle lists
         elif line[0] in '*#;:':
-            if Extractor.toHTML:
+            if Extractor.HtmlFormatting:
                 i = 0
                 for c, n in zip_longest(listLevel, line, fillvalue=''):
                     if not n or n not in '*#;:':
@@ -282,13 +287,6 @@ def compact(text, mark_headers=False):
     return page
 
 
-def handle_unicode(entity):
-    numeric_code = int(entity[2:-1])
-    if numeric_code >= 0x10000:
-        return ''
-    return unichr(numeric_code)
-
-
 # ----------------------------------------------------------------------
 
 def dropNested(text, openDelim, closeDelim):
@@ -503,7 +501,7 @@
 # variables
 
 
-class MagicWords(object):
+class MagicWords():
 
     """
     One copy in each Extractor.
@@ -726,11 +724,11 @@ def unescape(text):
         try:
             if text[1] == "#":  # character reference
                 if text[2] == "x":
-                    return unichr(int(code[1:], 16))
+                    return chr(int(code[1:], 16))
                 else:
-                    return unichr(int(code))
+                    return chr(int(code))
             else:  # named entity
-                return unichr(name2codepoint[code])
+                return chr(name2codepoint[code])
         except:
             return text  # leave as is
 
@@ -795,8 +793,7 @@ dots = re.compile(r'\.{4,}')
 substWords = 'subst:|safesubst:'
 
 
-class Extractor(object):
-
+class Extractor():
     """
     An extraction task on a article.
     """
@@ -809,8 +806,8 @@
     keepSections = True
 
     ##
-    # Whether to output HTML instead of text
-    toHTML = False
+    # Whether to output text with HTML formatting elements in files.
+    HtmlFormatting = False
 
     def __init__(self, id, title, page):
         """
@@ -846,7 +843,7 @@
             text = compact(text, mark_headers=mark_headers)
         return text
 
-    def extract(self, out):
+    def extract(self, out, escape_doc=True):
         """
         :param out: a memory file.
         """
@@ -860,7 +857,7 @@
 
         footer = "\n</doc>\n"
 
         out.write(header)
-        text = self.clean_text(text)
+        text = self.clean_text(text, escape_doc=escape_doc)
         for line in text:
             out.write(line)
@@ -1443,7 +1440,7 @@ def normalizeNamespace(ns):
 # https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
 
 
-class Infix:
+class Infix():
 
     """Infix operators.
     The calling sequence for the infix is:
diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py
index 89a7081..e73dcd9 100755
--- a/wikiextractor/extractPage.py
+++ b/wikiextractor/extractPage.py
@@ -34,7 +34,7 @@ import bz2
 
 # Program version
-version = '3.0'
+__version__ = '3.0.3'
 
 
 # ----------------------------------------------------------------------
 # READER
@@ -49,15 +49,13 @@ def process_data(input_file, id, templates=False):
     :param id: article id
     """
 
-    if input_file.lower().endswith("bz2"):
-        opener = bz2.BZ2File
-    else:
-        opener = open
+    opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open
 
     input = opener(input_file)
 
     page = []
     for line in input:
+        line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if page:
                 page.append(line)
@@ -103,7 +101,7 @@ def main():
                                      description=__doc__)
     parser.add_argument("input",
                         help="XML wiki dump file")
-    parser.add_argument("--id", default="",
+    parser.add_argument("--id", default="1",
                         help="article number")
     parser.add_argument("--template", action="store_true",
                         help="template number")
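
Usage note (trailing text, not part of the diff above): decode_open is the piece
that lets the readers above drop their per-line .decode('utf-8') calls, since it
returns text-mode line iterators for plain, .gz and .bz2 dumps alike. A minimal,
self-contained sketch of how a caller sees it; the dump file name is made up, and
unlike the hunk above this version also forwards `encoding` to gzip.open:

    import bz2
    import gzip
    import os.path

    def decode_open(filename, mode='rt', encoding='utf-8'):
        """Open a plain, .gz or .bz2 file and decode it to text."""
        ext = os.path.splitext(filename)[1]
        if ext == '.gz':
            # Assumption: pass encoding here too; the patch omits it for .gz.
            return gzip.open(filename, mode, encoding=encoding)
        elif ext == '.bz2':
            return bz2.open(filename, mode=mode, encoding=encoding)
        return open(filename, mode, encoding=encoding)

    # Hypothetical file name; compressed and plain dumps read identically.
    with decode_open('dump.xml.bz2') as input:
        for line in input:       # each line is already a str, not bytes
            if '<' not in line:  # same fast-path test the extractor uses
                continue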