Better handling of encoding.

attardi 2020-12-05 19:27:01 +01:00
parent 0933664d70
commit a2e078f3be
3 changed files with 54 additions and 49 deletions

WikiExtractor.py

@@ -49,7 +49,6 @@ collecting template definitions.
 import argparse
 import bz2
-import fileinput
 import logging
 import os.path
 import re  # TODO use regex when it will be standard
@@ -112,7 +111,7 @@ modules = {
 # Output

-class NextFile(object):
+class NextFile():
     """
     Synchronous generation of next available file name.
@@ -143,7 +142,7 @@ class NextFile(object):
         return '%s/wiki_%02d' % (self._dirname(), self.file_index)

-class OutputSplitter(object):
+class OutputSplitter():
     """
     File-like object, that splits output to multiple files of a given max size.
@@ -203,7 +202,7 @@ def load_templates(file, output_file=None):
     if output_file:
         output = open(output_file, 'w')
     for line in file:
-        line = line.decode('utf-8')
+        #line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if inText:
                 page.append(line)
@@ -238,18 +237,18 @@ def load_templates(file, output_file=None):
             # FIXME: should reconstruct also moduleNamespace
             if title.startswith(templatePrefix):
                 define_template(title, page)
+                templates += 1
             # save templates and modules to file
             if output_file and (title.startswith(templatePrefix) or
                                 title.startswith(modulePrefix)):
                 output.write('<page>\n')
-                output.write(' <title>%s</title>\n' % title)
+                output.write(' <title>%s</title>\n' % title.encode('utf-8'))
                 output.write(' <ns>10</ns>\n')
                 output.write(' <text>')
                 for line in page:
-                    output.write(line)
+                    output.write(line.encode('utf-8'))
                 output.write(' </text>\n')
                 output.write('</page>\n')
-                templates += 1
             page = []
             articles += 1
             if articles % 100000 == 0:
@@ -260,6 +259,20 @@ def load_templates(file, output_file=None):
     return templates


+def decode_open(filename, mode='rt', encoding='utf-8'):
+    """
+    Open a file, decode and decompress, depending on extension `gz` or `bz2`.
+    """
+    ext = os.path.splitext(filename)[1]
+    if ext == '.gz':
+        import gzip
+        return gzip.open(filename, mode)
+    elif ext == '.bz2':
+        return bz2.open(filename, mode=mode, encoding=encoding)
+    else:
+        return open(filename, mode, encoding=encoding)
+
+
 def process_dump(input_file, template_file, out_file, file_size, file_compress,
                  process_count):
     """
@@ -275,14 +288,11 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     global templateNamespace, templatePrefix
     global moduleNamespace, modulePrefix

-    if input_file == '-':
-        input = sys.stdin
-    else:
-        input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+    input = decode_open(input_file)

     # collect siteinfo
     for line in input:
-        line = line.decode('utf-8')
+        line = line #.decode('utf-8')
         m = tagRE.search(line)
         if not m:
             continue
@@ -308,7 +318,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     template_load_start = default_timer()
     if template_file and os.path.exists(template_file):
         logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
-        file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
+        file = decode_open(template_file)
         templates = load_templates(file)
         file.close()
     else:
@@ -318,7 +328,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
         logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
         templates = load_templates(input, template_file)
         input.close()
-        input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+        input = decode_open(input_file)
     template_load_elapsed = default_timer() - template_load_start
     logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed)
@@ -370,7 +380,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     inText = False
     redirect = False
     for line in input:
-        line = line.decode('utf-8')
+        #line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if inText:
                 page.append(line)
@@ -402,8 +412,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
             page.append(line)
         elif tag == '/page':
             colon = title.find(':')
-            if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
-                    not redirect and not title.startswith(templateNamespace):
+            if ((colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and
+                    not redirect and not title.startswith(templateNamespace)):
                 job = (id, title, page, ordinal)
                 jobs_queue.put(job)  # goes to any available extract_process
                 last_id = id
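
A subtlety in the rewrapped condition above: and binds tighter than or in Python, so the enclosing parentheses must preserve the grouping (colon < 0 or ...) and ... of the backslash-continued original. A quick illustration:

    A, B, C = True, False, False
    assert (A or B and C) is True      # parses as A or (B and C)
    assert ((A or B) and C) is False   # explicit grouping changes the result
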
@@ -539,7 +549,7 @@ def main():
     args = parser.parse_args()

     Extractor.keepLinks = args.links
-    Extractor.toHTML = args.html
+    Extractor.HtmlFormatting = args.html
     if args.html:
         Extractor.keepLinks = True
@@ -583,7 +593,7 @@ def main():
                load_templates(file)

        with open(input_file) as file:
-            page = file.read().decode('utf-8')
+            page = file.read() #.decode('utf-8')
            m = re.search(r'<id>(.*)</id>', page)
            id = m.group(1) if m else 0
            m = re.search(r'<title>(.*)</title>', page)

extract.py

@@ -73,6 +73,11 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     """
     Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
     @see https://www.mediawiki.org/wiki/Help:Formatting
+    :param extractor: the Extractor to use.
+    :param text: the text to clean.
+    :param expand_templates: whether to perform template expansion.
+    :param escape_doc: whether to convert special characters to HTML entities.
+    @return: the cleaned text.
     """
     if expand_templates:
if expand_templates:
@@ -107,7 +112,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = res + unescape(text[cur:])

     # Handle bold/italic/quote
-    if extractor.toHTML:
+    if extractor.HtmlFormatting:
         text = bold_italic.sub(r'<b>\1</b>', text)
         text = bold.sub(r'<b>\1</b>', text)
         text = italic.sub(r'<i>\1</i>', text)
@@ -146,7 +151,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     for tag in discardElements:
         text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)

-    if not extractor.toHTML:
+    if not extractor.HtmlFormatting:
         # Turn into text what is left (&amp;nbsp;) and <syntaxhighlight>
         text = unescape(text)
@@ -170,7 +175,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
     if escape_doc:
-        text = html.escape(text)
+        text = html.escape(text, quote=False)
     return text
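
For reference, quote=False only affects quote characters: the standard library still converts &, < and > to entities but leaves " and ' intact, which reads better in plain-text output:

    import html

    s = 'AT&T says "1 < 2"'
    print(html.escape(s))               # AT&amp;T says &quot;1 &lt; 2&quot;
    print(html.escape(s, quote=False))  # AT&amp;T says "1 &lt; 2"
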
@@ -202,7 +207,7 @@ def compact(text, mark_headers=False):
         if m:
             title = m.group(2)
             lev = len(m.group(1))
-            if Extractor.toHTML:
+            if Extractor.HtmlFormatting:
                 page.append("<h%d>%s</h%d>" % (lev, title, lev))
             if title and title[-1] not in '!?':
                 title += '.'
@@ -212,7 +217,7 @@ def compact(text, mark_headers=False):
             headers[lev] = title
             # drop previous headers
-            headers = { k:v for k,v in headers.items() if k > lev }
+            headers = { k:v for k,v in headers.items() if k <= lev }
             emptySection = True
             continue
         # Handle page title
@@ -228,7 +233,7 @@ def compact(text, mark_headers=False):
             continue
         # handle lists
         elif line[0] in '*#;:':
-            if Extractor.toHTML:
+            if Extractor.HtmlFormatting:
                 i = 0
                 for c, n in zip_longest(listLevel, line, fillvalue=''):
                     if not n or n not in '*#;:':
@@ -282,13 +287,6 @@ def compact(text, mark_headers=False):
     return page


-def handle_unicode(entity):
-    numeric_code = int(entity[2:-1])
-    if numeric_code >= 0x10000:
-        return ''
-    return unichr(numeric_code)
-
-
 # ----------------------------------------------------------------------

 def dropNested(text, openDelim, closeDelim):
@@ -503,7 +501,7 @@ def makeInternalLink(title, label):
 # variables

-class MagicWords(object):
+class MagicWords():
     """
     One copy in each Extractor.
@@ -726,11 +724,11 @@ def unescape(text):
         try:
             if text[1] == "#":  # character reference
                 if text[2] == "x":
-                    return unichr(int(code[1:], 16))
+                    return chr(int(code[1:], 16))
                 else:
-                    return unichr(int(code))
+                    return chr(int(code))
             else:  # named entity
-                return unichr(name2codepoint[code])
+                return chr(name2codepoint[code])
         except:
             return text  # leave as is
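
unichr existed only in Python 2; in Python 3 chr accepts the full Unicode code point range, so all three entity forms handled above decode the same way:

    from html.entities import name2codepoint

    assert chr(int('e9', 16)) == 'é'             # &#xe9; hexadecimal reference
    assert chr(int('233')) == 'é'                # &#233; decimal reference
    assert chr(name2codepoint['eacute']) == 'é'  # &eacute; named entity
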
@@ -795,8 +793,7 @@ dots = re.compile(r'\.{4,}')

 substWords = 'subst:|safesubst:'

-class Extractor(object):
+class Extractor():
     """
     An extraction task on an article.
     """
@@ -809,8 +806,8 @@ class Extractor(object):
     keepSections = True

     ##
-    # Whether to output HTML instead of text
-    toHTML = False
+    # Whether to output text with HTML formatting elements in <doc> files.
+    HtmlFormatting = False

     def __init__(self, id, title, page):
         """
@@ -846,7 +843,7 @@ class Extractor(object):
         text = compact(text, mark_headers=mark_headers)
         return text

-    def extract(self, out):
+    def extract(self, out, escape_doc=True):
         """
         :param out: a memory file.
         """
@@ -860,7 +857,7 @@ class Extractor(object):
         footer = "\n</doc>\n"
         out.write(header)
-        text = self.clean_text(text)
+        text = self.clean_text(text, escape_doc=escape_doc)
         for line in text:
             out.write(line)
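
A minimal sketch of the new escape_doc flag (not part of the commit; the id, title and page content are made up, and module globals such as the URL base are assumed to be initialized):

    import io

    e = Extractor(12, 'Example', ['Some text with &amp; and <angle> brackets.\n'])
    buf = io.StringIO()                # the "memory file" the docstring mentions
    e.extract(buf, escape_doc=False)   # keep &, < and > unescaped in the <doc> body
    print(buf.getvalue())
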
@@ -1443,7 +1440,7 @@ def normalizeNamespace(ns):
 # https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php

-class Infix:
+class Infix():
     """Infix operators.
     The calling sequence for the infix is:

extractPage.py

@@ -34,7 +34,7 @@ import bz2

 # Program version
-version = '3.0'
+__version__ = '3.0.3'

 # ----------------------------------------------------------------------
 # READER
@@ -49,15 +49,13 @@ def process_data(input_file, id, templates=False):
     :param id: article id
     """

-    if input_file.lower().endswith("bz2"):
-        opener = bz2.BZ2File
-    else:
-        opener = open
+    opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open

     input = opener(input_file)

     page = []
     for line in input:
         line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if page:
                 page.append(line)
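
Unlike the main script, this reader still opens the dump in binary mode: bz2.BZ2File yields bytes lines, hence the line.decode('utf-8') kept above. (The plain open branch yields str lines, so the unconditional decode only works for compressed input.) A small check of the bz2 case, with a hypothetical file name:

    import bz2

    with bz2.BZ2File('dump.xml.bz2') as f:
        line = f.readline()
        assert isinstance(line, bytes)   # binary mode: decoding is the caller's job
        text = line.decode('utf-8')
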
@@ -103,7 +101,7 @@ def main():
                                     description=__doc__)
     parser.add_argument("input",
                         help="XML wiki dump file")
-    parser.add_argument("--id", default="",
+    parser.add_argument("--id", default="1",
                         help="article number")
     parser.add_argument("--template", action="store_true",
                         help="template number")