Better handling of encoding.
This commit is contained in:
parent
0933664d70
commit
a2e078f3be
@ -49,7 +49,6 @@ collecting template definitions.
|
||||
|
||||
import argparse
|
||||
import bz2
|
||||
import fileinput
|
||||
import logging
|
||||
import os.path
|
||||
import re # TODO use regex when it will be standard
|
||||
@ -112,7 +111,7 @@ modules = {
|
||||
# Output
|
||||
|
||||
|
||||
class NextFile(object):
|
||||
class NextFile():
|
||||
|
||||
"""
|
||||
Synchronous generation of next available file name.
|
||||
@ -143,7 +142,7 @@ class NextFile(object):
|
||||
return '%s/wiki_%02d' % (self._dirname(), self.file_index)
|
||||
|
||||
|
||||
class OutputSplitter(object):
|
||||
class OutputSplitter():
|
||||
|
||||
"""
|
||||
File-like object, that splits output to multiple files of a given max size.
|
||||
@ -203,7 +202,7 @@ def load_templates(file, output_file=None):
|
||||
if output_file:
|
||||
output = open(output_file, 'w')
|
||||
for line in file:
|
||||
line = line.decode('utf-8')
|
||||
#line = line.decode('utf-8')
|
||||
if '<' not in line: # faster than doing re.search()
|
||||
if inText:
|
||||
page.append(line)
|
||||
@ -238,18 +237,18 @@ def load_templates(file, output_file=None):
|
||||
# FIXME: should reconstruct also moduleNamespace
|
||||
if title.startswith(templatePrefix):
|
||||
define_template(title, page)
|
||||
templates += 1
|
||||
# save templates and modules to file
|
||||
if output_file and (title.startswith(templatePrefix) or
|
||||
title.startswith(modulePrefix)):
|
||||
output.write('<page>\n')
|
||||
output.write(' <title>%s</title>\n' % title)
|
||||
output.write(' <title>%s</title>\n' % title.encode('utf-8'))
|
||||
output.write(' <ns>10</ns>\n')
|
||||
output.write(' <text>')
|
||||
for line in page:
|
||||
output.write(line)
|
||||
output.write(line.encode('utf-8'))
|
||||
output.write(' </text>\n')
|
||||
output.write('</page>\n')
|
||||
templates += 1
|
||||
page = []
|
||||
articles += 1
|
||||
if articles % 100000 == 0:
|
||||
@ -260,6 +259,20 @@ def load_templates(file, output_file=None):
|
||||
return templates
|
||||
|
||||
|
||||
def decode_open(filename, mode='rt', encoding='utf-8'):
    """
    Open a file, decompressing it according to its extension (`.gz` or `.bz2`)
    and decoding it with `encoding` when opened in text mode.

    :param filename: path of the file to open; only the final extension is
        inspected to choose the opener.
    :param mode: open mode, forwarded to the underlying opener.
    :param encoding: text encoding, applied only when the mode is a text mode.
    :return: the file object returned by `gzip.open`, `bz2.open` or `open`.
    """
    ext = os.path.splitext(filename)[1]
    if ext in ('.gz', '.bz2'):
        # The compressed openers treat a mode without 't' as binary and
        # reject an encoding argument in that case.
        text_encoding = encoding if 't' in mode else None
        if ext == '.gz':
            import gzip
            # Forward the encoding: without it gzip falls back to the locale
            # default, which is not necessarily the requested one.
            return gzip.open(filename, mode, encoding=text_encoding)
        return bz2.open(filename, mode=mode, encoding=text_encoding)
    # The builtin open() defaults to text unless 'b' is given explicitly.
    return open(filename, mode, encoding=None if 'b' in mode else encoding)
|
||||
|
||||
|
||||
def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
process_count):
|
||||
"""
|
||||
@ -275,14 +288,11 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
global templateNamespace, templatePrefix
|
||||
global moduleNamespace, modulePrefix
|
||||
|
||||
if input_file == '-':
|
||||
input = sys.stdin
|
||||
else:
|
||||
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
|
||||
input = decode_open(input_file)
|
||||
|
||||
# collect siteinfo
|
||||
for line in input:
|
||||
line = line.decode('utf-8')
|
||||
line = line #.decode('utf-8')
|
||||
m = tagRE.search(line)
|
||||
if not m:
|
||||
continue
|
||||
@ -308,7 +318,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
template_load_start = default_timer()
|
||||
if template_file and os.path.exists(template_file):
|
||||
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
|
||||
file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
|
||||
file = decode_open(template_file)
|
||||
templates = load_templates(file)
|
||||
file.close()
|
||||
else:
|
||||
@ -318,7 +328,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
|
||||
templates = load_templates(input, template_file)
|
||||
input.close()
|
||||
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
|
||||
input = decode_open(input_file)
|
||||
template_load_elapsed = default_timer() - template_load_start
|
||||
logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed)
|
||||
|
||||
@ -370,7 +380,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
inText = False
|
||||
redirect = False
|
||||
for line in input:
|
||||
line = line.decode('utf-8')
|
||||
#line = line.decode('utf-8')
|
||||
if '<' not in line: # faster than doing re.search()
|
||||
if inText:
|
||||
page.append(line)
|
||||
@ -402,8 +412,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||
page.append(line)
|
||||
elif tag == '/page':
|
||||
colon = title.find(':')
|
||||
if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
|
||||
not redirect and not title.startswith(templateNamespace):
|
||||
if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
|
||||
not redirect and not title.startswith(templateNamespace)):
|
||||
job = (id, title, page, ordinal)
|
||||
jobs_queue.put(job) # goes to any available extract_process
|
||||
last_id = id
|
||||
@ -539,7 +549,7 @@ def main():
|
||||
args = parser.parse_args()
|
||||
|
||||
Extractor.keepLinks = args.links
|
||||
Extractor.toHTML = args.html
|
||||
Extractor.HtmlFormatting = args.html
|
||||
if args.html:
|
||||
Extractor.keepLinks = True
|
||||
|
||||
@ -583,7 +593,7 @@ def main():
|
||||
load_templates(file)
|
||||
|
||||
with open(input_file) as file:
|
||||
page = file.read().decode('utf-8')
|
||||
page = file.read()#.decode('utf-8')
|
||||
m = re.search(r'<id>(.*)</id>', page)
|
||||
id = m.group(1) if m else 0
|
||||
m = re.search(r'<title>(.*)</title>', page)
|
||||
|
@ -73,6 +73,11 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
||||
"""
|
||||
Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
|
||||
@see https://www.mediawiki.org/wiki/Help:Formatting
|
||||
:param extractor: the Extractor to use.
|
||||
:param text: the text to clean.
|
||||
:param expand_templates: whether to perform template expansion.
|
||||
:param escape_doc: whether to convert special characters to HTML entities.
|
||||
@return: the cleaned text.
|
||||
"""
|
||||
|
||||
if expand_templates:
|
||||
@ -107,7 +112,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
||||
text = res + unescape(text[cur:])
|
||||
|
||||
# Handle bold/italic/quote
|
||||
if extractor.toHTML:
|
||||
if extractor.HtmlFormatting:
|
||||
text = bold_italic.sub(r'<b>\1</b>', text)
|
||||
text = bold.sub(r'<b>\1</b>', text)
|
||||
text = italic.sub(r'<i>\1</i>', text)
|
||||
@ -146,7 +151,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
||||
for tag in discardElements:
|
||||
text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
|
||||
|
||||
if not extractor.toHTML:
|
||||
if not extractor.HtmlFormatting:
|
||||
# Turn into text what is left (&nbsp;) and <syntaxhighlight>
|
||||
text = unescape(text)
|
||||
|
||||
@ -170,7 +175,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
||||
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
||||
text = text.replace(',,', ',').replace(',.', '.')
|
||||
if escape_doc:
|
||||
text = html.escape(text)
|
||||
text = html.escape(text, quote=False)
|
||||
return text
|
||||
|
||||
|
||||
@ -202,7 +207,7 @@ def compact(text, mark_headers=False):
|
||||
if m:
|
||||
title = m.group(2)
|
||||
lev = len(m.group(1))
|
||||
if Extractor.toHTML:
|
||||
if Extractor.HtmlFormatting:
|
||||
page.append("<h%d>%s</h%d>" % (lev, title, lev))
|
||||
if title and title[-1] not in '!?':
|
||||
title += '.'
|
||||
@ -212,7 +217,7 @@ def compact(text, mark_headers=False):
|
||||
|
||||
headers[lev] = title
|
||||
# drop previous headers
|
||||
headers = { k:v for k,v in headers.items() if k > lev }
|
||||
headers = { k:v for k,v in headers.items() if k <= lev }
|
||||
emptySection = True
|
||||
continue
|
||||
# Handle page title
|
||||
@ -228,7 +233,7 @@ def compact(text, mark_headers=False):
|
||||
continue
|
||||
# handle lists
|
||||
elif line[0] in '*#;:':
|
||||
if Extractor.toHTML:
|
||||
if Extractor.HtmlFormatting:
|
||||
i = 0
|
||||
for c, n in zip_longest(listLevel, line, fillvalue=''):
|
||||
if not n or n not in '*#;:':
|
||||
@ -282,13 +287,6 @@ def compact(text, mark_headers=False):
|
||||
return page
|
||||
|
||||
|
||||
def handle_unicode(entity):
    """
    Convert a decimal numeric character reference into the character it names.

    :param entity: the reference text; the first two characters and the last
        one are dropped before parsing (presumably the `&#` prefix and `;`
        suffix -- confirm against callers).
    :return: the corresponding character, or '' for code points at or above
        0x10000.
    :raises ValueError: if the stripped text is not a decimal integer.
    """
    numeric_code = int(entity[2:-1])
    if numeric_code >= 0x10000:
        return ''
    # chr() is the Python 3 replacement for the removed unichr() builtin.
    return chr(numeric_code)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
def dropNested(text, openDelim, closeDelim):
|
||||
@ -503,7 +501,7 @@ def makeInternalLink(title, label):
|
||||
# variables
|
||||
|
||||
|
||||
class MagicWords(object):
|
||||
class MagicWords():
|
||||
|
||||
"""
|
||||
One copy in each Extractor.
|
||||
@ -726,11 +724,11 @@ def unescape(text):
|
||||
try:
|
||||
if text[1] == "#": # character reference
|
||||
if text[2] == "x":
|
||||
return unichr(int(code[1:], 16))
|
||||
return chr(int(code[1:], 16))
|
||||
else:
|
||||
return unichr(int(code))
|
||||
return chr(int(code))
|
||||
else: # named entity
|
||||
return unichr(name2codepoint[code])
|
||||
return chr(name2codepoint[code])
|
||||
except:
|
||||
return text # leave as is
|
||||
|
||||
@ -795,8 +793,7 @@ dots = re.compile(r'\.{4,}')
|
||||
substWords = 'subst:|safesubst:'
|
||||
|
||||
|
||||
class Extractor(object):
|
||||
|
||||
class Extractor():
|
||||
"""
|
||||
An extraction task on an article.
|
||||
"""
|
||||
@ -809,8 +806,8 @@ class Extractor(object):
|
||||
keepSections = True
|
||||
|
||||
##
|
||||
# Whether to output HTML instead of text
|
||||
toHTML = False
|
||||
# Whether to output text with HTML formatting elements in <doc> files.
|
||||
HtmlFormatting = False
|
||||
|
||||
def __init__(self, id, title, page):
|
||||
"""
|
||||
@ -846,7 +843,7 @@ class Extractor(object):
|
||||
text = compact(text, mark_headers=mark_headers)
|
||||
return text
|
||||
|
||||
def extract(self, out):
|
||||
def extract(self, out, escape_doc=True):
|
||||
"""
|
||||
:param out: a memory file.
|
||||
"""
|
||||
@ -860,7 +857,7 @@ class Extractor(object):
|
||||
footer = "\n</doc>\n"
|
||||
out.write(header)
|
||||
|
||||
text = self.clean_text(text)
|
||||
text = self.clean_text(text, escape_doc=escape_doc)
|
||||
|
||||
for line in text:
|
||||
out.write(line)
|
||||
@ -1443,7 +1440,7 @@ def normalizeNamespace(ns):
|
||||
# https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
|
||||
|
||||
|
||||
class Infix:
|
||||
class Infix():
|
||||
|
||||
"""Infix operators.
|
||||
The calling sequence for the infix is:
|
||||
|
@ -34,7 +34,7 @@ import bz2
|
||||
|
||||
|
||||
# Program version
|
||||
version = '3.0'
|
||||
__version__ = '3.0.3'
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# READER
|
||||
@ -49,15 +49,13 @@ def process_data(input_file, id, templates=False):
|
||||
:param id: article id
|
||||
"""
|
||||
|
||||
if input_file.lower().endswith("bz2"):
|
||||
opener = bz2.BZ2File
|
||||
else:
|
||||
opener = open
|
||||
opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open
|
||||
|
||||
input = opener(input_file)
|
||||
|
||||
page = []
|
||||
for line in input:
|
||||
line = line.decode('utf-8')
|
||||
if '<' not in line: # faster than doing re.search()
|
||||
if page:
|
||||
page.append(line)
|
||||
@ -103,7 +101,7 @@ def main():
|
||||
description=__doc__)
|
||||
parser.add_argument("input",
|
||||
help="XML wiki dump file")
|
||||
parser.add_argument("--id", default="",
|
||||
parser.add_argument("--id", default="1",
|
||||
help="article number")
|
||||
parser.add_argument("--template", action="store_true",
|
||||
help="template number")
|
||||
|
Loading…
Reference in New Issue
Block a user