Better handling of encoding.

attardi 2020-12-05 19:27:01 +01:00
parent 0933664d70
commit a2e078f3be
3 changed files with 54 additions and 49 deletions

View File

@@ -49,7 +49,6 @@ collecting template definitions.
 import argparse
 import bz2
-import fileinput
 import logging
 import os.path
 import re # TODO use regex when it will be standard
@@ -112,7 +111,7 @@ modules = {
 # Output
-class NextFile(object):
+class NextFile():
 """
 Synchronous generation of next available file name.
@@ -143,7 +142,7 @@ class NextFile(object):
 return '%s/wiki_%02d' % (self._dirname(), self.file_index)
-class OutputSplitter(object):
+class OutputSplitter():
 """
 File-like object, that splits output to multiple files of a given max size.
@@ -203,7 +202,7 @@ def load_templates(file, output_file=None):
 if output_file:
 output = open(output_file, 'w')
 for line in file:
-line = line.decode('utf-8')
+#line = line.decode('utf-8')
 if '<' not in line: # faster than doing re.search()
 if inText:
 page.append(line)
@@ -238,18 +237,18 @@ def load_templates(file, output_file=None):
 # FIXME: should reconstruct also moduleNamespace
 if title.startswith(templatePrefix):
 define_template(title, page)
-templates += 1
 # save templates and modules to file
 if output_file and (title.startswith(templatePrefix) or
 title.startswith(modulePrefix)):
 output.write('<page>\n')
-output.write(' <title>%s</title>\n' % title)
+output.write(' <title>%s</title>\n' % title.encode('utf-8'))
 output.write(' <ns>10</ns>\n')
 output.write(' <text>')
 for line in page:
-output.write(line)
+output.write(line.encode('utf-8'))
 output.write(' </text>\n')
 output.write('</page>\n')
+templates += 1
 page = []
 articles += 1
 if articles % 100000 == 0:
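Background for the encode/decode churn in this hunk and the ones below: in Python 3, a file opened in text mode reads and writes str, while a binary-mode file deals in bytes, so whether an explicit .encode('utf-8') or .decode('utf-8') is needed depends entirely on how the stream was opened. A small self-contained illustration, using a throwaway file name:

    import os
    import tempfile

    path = os.path.join(tempfile.gettempdir(), 'encoding_demo.txt')  # throwaway file

    with open(path, 'w', encoding='utf-8') as out:   # text mode: write str
        out.write('título\n')

    with open(path, 'rb') as raw:                    # binary mode: read bytes
        data = raw.read()

    assert data == 'título\n'.encode('utf-8')
    assert data.decode('utf-8') == 'título\n'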
@@ -260,6 +259,20 @@ def load_templates(file, output_file=None):
 return templates
+def decode_open(filename, mode='rt', encoding='utf-8'):
+"""
+Open a file, decode and decompress, depending on extension `gz`, or 'bz2`.
+"""
+ext = os.path.splitext(filename)[1]
+if ext == '.gz':
+import gzip
+return gzip.open(filename, mode)
+elif ext == '.bz2':
+return bz2.open(filename, mode=mode, encoding=encoding)
+else:
+return open(filename, mode, encoding=encoding)
 def process_dump(input_file, template_file, out_file, file_size, file_compress,
 process_count):
 """
@@ -275,14 +288,11 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 global templateNamespace, templatePrefix
 global moduleNamespace, modulePrefix
-if input_file == '-':
-input = sys.stdin
-else:
-input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+input = decode_open(input_file)
 # collect siteinfo
 for line in input:
-line = line.decode('utf-8')
+line = line #.decode('utf-8')
 m = tagRE.search(line)
 if not m:
 continue
@@ -308,7 +318,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 template_load_start = default_timer()
 if template_file and os.path.exists(template_file):
 logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
-file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
+file = decode_open(template_file)
 templates = load_templates(file)
 file.close()
 else:
@@ -318,7 +328,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
 templates = load_templates(input, template_file)
 input.close()
-input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+input = decode_open(input_file)
 template_load_elapsed = default_timer() - template_load_start
 logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed)
@@ -370,7 +380,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 inText = False
 redirect = False
 for line in input:
-line = line.decode('utf-8')
+#line = line.decode('utf-8')
 if '<' not in line: # faster than doing re.search()
 if inText:
 page.append(line)
@@ -402,8 +412,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 page.append(line)
 elif tag == '/page':
 colon = title.find(':')
-if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
-not redirect and not title.startswith(templateNamespace):
+if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
+not redirect and not title.startswith(templateNamespace)):
 job = (id, title, page, ordinal)
 jobs_queue.put(job) # goes to any available extract_process
 last_id = id
@@ -539,7 +549,7 @@ def main():
 args = parser.parse_args()
 Extractor.keepLinks = args.links
-Extractor.toHTML = args.html
+Extractor.HtmlFormatting = args.html
 if args.html:
 Extractor.keepLinks = True
@@ -583,7 +593,7 @@ def main():
 load_templates(file)
 with open(input_file) as file:
-page = file.read().decode('utf-8')
+page = file.read()#.decode('utf-8')
 m = re.search(r'<id>(.*)</id>', page)
 id = m.group(1) if m else 0
 m = re.search(r'<title>(.*)</title>', page)

View File

@@ -73,6 +73,11 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
 """
 Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
 @see https://www.mediawiki.org/wiki/Help:Formatting
+:param extractor: the Extractor t use.
+:param text: the text to clean.
+:param expand_templates: whether to perform template expansion.
+:param escape_doc: whether to convert special characters to HTML entities.
+@return: the cleaned text.
 """
 if expand_templates:
@@ -107,7 +112,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
 text = res + unescape(text[cur:])
 # Handle bold/italic/quote
-if extractor.toHTML:
+if extractor.HtmlFormatting:
 text = bold_italic.sub(r'<b>\1</b>', text)
 text = bold.sub(r'<b>\1</b>', text)
 text = italic.sub(r'<i>\1</i>', text)
@@ -146,7 +151,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
 for tag in discardElements:
 text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
-if not extractor.toHTML:
+if not extractor.HtmlFormatting:
 # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
 text = unescape(text)
@@ -170,7 +175,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
 text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
 text = text.replace(',,', ',').replace(',.', '.')
 if escape_doc:
-text = html.escape(text)
+text = html.escape(text, quote=False)
 return text
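The quote=False change just above only affects quotation marks: html.escape still converts &, < and > to entities, but leaves " and ' as they are. A quick check in plain Python, independent of the extractor:

    from html import escape

    s = 'AT&T says "x < y"'
    assert escape(s) == 'AT&amp;T says &quot;x &lt; y&quot;'     # default escapes quotes too
    assert escape(s, quote=False) == 'AT&amp;T says "x &lt; y"'  # quotes left untouched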
@@ -202,7 +207,7 @@ def compact(text, mark_headers=False):
 if m:
 title = m.group(2)
 lev = len(m.group(1))
-if Extractor.toHTML:
+if Extractor.HtmlFormatting:
 page.append("<h%d>%s</h%d>" % (lev, title, lev))
 if title and title[-1] not in '!?':
 title += '.'
@@ -212,7 +217,7 @@ def compact(text, mark_headers=False):
 headers[lev] = title
 # drop previous headers
-headers = { k:v for k,v in headers.items() if k > lev }
+headers = { k:v for k,v in headers.items() if k <= lev }
 emptySection = True
 continue
 # Handle page title
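The filter change above matters when a new heading of level lev is encountered: keeping k <= lev retains the enclosing (shallower) headings and drops the stale deeper ones, which is what the '# drop previous headers' comment describes. A small illustration of the new behaviour, with made-up section titles:

    headers = {1: 'History.', 2: 'Middle Ages.', 3: 'Sources.'}

    # A new level-2 heading arrives: the level-3 sub-heading is stale and dropped,
    # the level-1 ancestor is kept, and level 2 is overwritten.
    lev, title = 2, 'Early modern period.'
    headers[lev] = title
    headers = {k: v for k, v in headers.items() if k <= lev}
    assert headers == {1: 'History.', 2: 'Early modern period.'}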
@@ -228,7 +233,7 @@ def compact(text, mark_headers=False):
 continue
 # handle lists
 elif line[0] in '*#;:':
-if Extractor.toHTML:
+if Extractor.HtmlFormatting:
 i = 0
 for c, n in zip_longest(listLevel, line, fillvalue=''):
 if not n or n not in '*#;:':
@@ -282,13 +287,6 @@ def compact(text, mark_headers=False):
 return page
-def handle_unicode(entity):
-numeric_code = int(entity[2:-1])
-if numeric_code >= 0x10000:
-return ''
-return unichr(numeric_code)
 # ----------------------------------------------------------------------
 def dropNested(text, openDelim, closeDelim):
@@ -503,7 +501,7 @@ def makeInternalLink(title, label):
 # variables
-class MagicWords(object):
+class MagicWords():
 """
 One copy in each Extractor.
@@ -726,11 +724,11 @@ def unescape(text):
 try:
 if text[1] == "#": # character reference
 if text[2] == "x":
-return unichr(int(code[1:], 16))
+return chr(int(code[1:], 16))
 else:
-return unichr(int(code))
+return chr(int(code))
 else: # named entity
-return unichr(name2codepoint[code])
+return chr(name2codepoint[code])
 except:
 return text # leave as is
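The unichr → chr swaps above (and the removal of handle_unicode earlier) are the Python 3 spelling of the same operation: chr covers the full Unicode range, so numeric and named character references resolve directly to str. For instance:

    from html.entities import name2codepoint

    assert chr(int('233')) == 'é'                  # decimal reference, e.g. &#233;
    assert chr(int('E9', 16)) == 'é'               # hexadecimal reference, e.g. &#xE9;
    assert chr(name2codepoint['eacute']) == 'é'    # named entity, e.g. &eacute;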
@@ -795,8 +793,7 @@ dots = re.compile(r'\.{4,}')
 substWords = 'subst:|safesubst:'
-class Extractor(object):
+class Extractor():
 """
 An extraction task on a article.
 """
@@ -809,8 +806,8 @@ class Extractor(object):
 keepSections = True
 ##
-# Whether to output HTML instead of text
-toHTML = False
+# Whether to output text with HTML formatting elements in <doc> files.
+HtmlFormatting = False
 def __init__(self, id, title, page):
 """
@@ -846,7 +843,7 @@ class Extractor(object):
 text = compact(text, mark_headers=mark_headers)
 return text
-def extract(self, out):
+def extract(self, out, escape_doc=True):
 """
 :param out: a memory file.
 """
@@ -860,7 +857,7 @@ class Extractor(object):
 footer = "\n</doc>\n"
 out.write(header)
-text = self.clean_text(text)
+text = self.clean_text(text, escape_doc=escape_doc)
 for line in text:
 out.write(line)
@@ -1443,7 +1440,7 @@ def normalizeNamespace(ns):
 # https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
-class Infix:
+class Infix():
 """Infix operators.
 The calling sequence for the infix is:

View File

@@ -34,7 +34,7 @@ import bz2
 # Program version
-version = '3.0'
+__version__ = '3.0.3'
 # ----------------------------------------------------------------------
 # READER
@@ -49,15 +49,13 @@ def process_data(input_file, id, templates=False):
 :param id: article id
 """
-if input_file.lower().endswith("bz2"):
-opener = bz2.BZ2File
-else:
-opener = open
+opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open
 input = opener(input_file)
 page = []
 for line in input:
-line = line.decode('utf-8')
 if '<' not in line: # faster than doing re.search()
 if page:
 page.append(line)
@@ -103,7 +101,7 @@ def main():
 description=__doc__)
 parser.add_argument("input",
 help="XML wiki dump file")
-parser.add_argument("--id", default="",
+parser.add_argument("--id", default="1",
 help="article number")
 parser.add_argument("--template", action="store_true",
 help="template number")