Better handling of encoding.
This commit is contained in:
parent
0933664d70
commit
a2e078f3be
@ -49,7 +49,6 @@ collecting template definitions.
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import bz2
|
import bz2
|
||||||
import fileinput
|
|
||||||
import logging
|
import logging
|
||||||
import os.path
|
import os.path
|
||||||
import re # TODO use regex when it will be standard
|
import re # TODO use regex when it will be standard
|
||||||
@ -112,7 +111,7 @@ modules = {
|
|||||||
# Output
|
# Output
|
||||||
|
|
||||||
|
|
||||||
class NextFile(object):
|
class NextFile():
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Synchronous generation of next available file name.
|
Synchronous generation of next available file name.
|
||||||
@ -143,7 +142,7 @@ class NextFile(object):
|
|||||||
return '%s/wiki_%02d' % (self._dirname(), self.file_index)
|
return '%s/wiki_%02d' % (self._dirname(), self.file_index)
|
||||||
|
|
||||||
|
|
||||||
class OutputSplitter(object):
|
class OutputSplitter():
|
||||||
|
|
||||||
"""
|
"""
|
||||||
File-like object that splits output into multiple files of a given max size.
|
File-like object that splits output into multiple files of a given max size.
|
||||||
@ -203,7 +202,7 @@ def load_templates(file, output_file=None):
|
|||||||
if output_file:
|
if output_file:
|
||||||
output = open(output_file, 'w')
|
output = open(output_file, 'w')
|
||||||
for line in file:
|
for line in file:
|
||||||
line = line.decode('utf-8')
|
#line = line.decode('utf-8')
|
||||||
if '<' not in line: # faster than doing re.search()
|
if '<' not in line: # faster than doing re.search()
|
||||||
if inText:
|
if inText:
|
||||||
page.append(line)
|
page.append(line)
|
||||||
@ -238,18 +237,18 @@ def load_templates(file, output_file=None):
|
|||||||
# FIXME: should also reconstruct moduleNamespace
|
# FIXME: should also reconstruct moduleNamespace
|
||||||
if title.startswith(templatePrefix):
|
if title.startswith(templatePrefix):
|
||||||
define_template(title, page)
|
define_template(title, page)
|
||||||
|
templates += 1
|
||||||
# save templates and modules to file
|
# save templates and modules to file
|
||||||
if output_file and (title.startswith(templatePrefix) or
|
if output_file and (title.startswith(templatePrefix) or
|
||||||
title.startswith(modulePrefix)):
|
title.startswith(modulePrefix)):
|
||||||
output.write('<page>\n')
|
output.write('<page>\n')
|
||||||
output.write(' <title>%s</title>\n' % title)
|
output.write(' <title>%s</title>\n' % title.encode('utf-8'))
|
||||||
output.write(' <ns>10</ns>\n')
|
output.write(' <ns>10</ns>\n')
|
||||||
output.write(' <text>')
|
output.write(' <text>')
|
||||||
for line in page:
|
for line in page:
|
||||||
output.write(line)
|
output.write(line.encode('utf-8'))
|
||||||
output.write(' </text>\n')
|
output.write(' </text>\n')
|
||||||
output.write('</page>\n')
|
output.write('</page>\n')
|
||||||
templates += 1
|
|
||||||
page = []
|
page = []
|
||||||
articles += 1
|
articles += 1
|
||||||
if articles % 100000 == 0:
|
if articles % 100000 == 0:
|
||||||
@ -260,6 +259,20 @@ def load_templates(file, output_file=None):
|
|||||||
return templates
|
return templates
|
||||||
|
|
||||||
|
|
||||||
|
def decode_open(filename, mode='rt', encoding='utf-8'):
|
||||||
|
"""
|
||||||
|
Open a file, decoding and decompressing it depending on its extension (`gz` or `bz2`).
|
||||||
|
"""
|
||||||
|
ext = os.path.splitext(filename)[1]
|
||||||
|
if ext == '.gz':
|
||||||
|
import gzip
|
||||||
|
return gzip.open(filename, mode)
|
||||||
|
elif ext == '.bz2':
|
||||||
|
return bz2.open(filename, mode=mode, encoding=encoding)
|
||||||
|
else:
|
||||||
|
return open(filename, mode, encoding=encoding)
|
||||||
|
|
||||||
|
|
||||||
def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
||||||
process_count):
|
process_count):
|
||||||
"""
|
"""
|
||||||
@ -275,14 +288,11 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
|||||||
global templateNamespace, templatePrefix
|
global templateNamespace, templatePrefix
|
||||||
global moduleNamespace, modulePrefix
|
global moduleNamespace, modulePrefix
|
||||||
|
|
||||||
if input_file == '-':
|
input = decode_open(input_file)
|
||||||
input = sys.stdin
|
|
||||||
else:
|
|
||||||
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
|
|
||||||
|
|
||||||
# collect siteinfo
|
# collect siteinfo
|
||||||
for line in input:
|
for line in input:
|
||||||
line = line.decode('utf-8')
|
line = line #.decode('utf-8')
|
||||||
m = tagRE.search(line)
|
m = tagRE.search(line)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
@ -308,7 +318,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
|||||||
template_load_start = default_timer()
|
template_load_start = default_timer()
|
||||||
if template_file and os.path.exists(template_file):
|
if template_file and os.path.exists(template_file):
|
||||||
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
|
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
|
||||||
file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
|
file = decode_open(template_file)
|
||||||
templates = load_templates(file)
|
templates = load_templates(file)
|
||||||
file.close()
|
file.close()
|
||||||
else:
|
else:
|
||||||
@ -318,7 +328,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
|||||||
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
|
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
|
||||||
templates = load_templates(input, template_file)
|
templates = load_templates(input, template_file)
|
||||||
input.close()
|
input.close()
|
||||||
input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
|
input = decode_open(input_file)
|
||||||
template_load_elapsed = default_timer() - template_load_start
|
template_load_elapsed = default_timer() - template_load_start
|
||||||
logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed)
|
logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed)
|
||||||
|
|
||||||
@ -370,7 +380,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
|||||||
inText = False
|
inText = False
|
||||||
redirect = False
|
redirect = False
|
||||||
for line in input:
|
for line in input:
|
||||||
line = line.decode('utf-8')
|
#line = line.decode('utf-8')
|
||||||
if '<' not in line: # faster than doing re.search()
|
if '<' not in line: # faster than doing re.search()
|
||||||
if inText:
|
if inText:
|
||||||
page.append(line)
|
page.append(line)
|
||||||
@ -402,8 +412,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
|
|||||||
page.append(line)
|
page.append(line)
|
||||||
elif tag == '/page':
|
elif tag == '/page':
|
||||||
colon = title.find(':')
|
colon = title.find(':')
|
||||||
if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
|
if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
|
||||||
not redirect and not title.startswith(templateNamespace):
|
not redirect and not title.startswith(templateNamespace)):
|
||||||
job = (id, title, page, ordinal)
|
job = (id, title, page, ordinal)
|
||||||
jobs_queue.put(job) # goes to any available extract_process
|
jobs_queue.put(job) # goes to any available extract_process
|
||||||
last_id = id
|
last_id = id
|
||||||
@ -539,7 +549,7 @@ def main():
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
Extractor.keepLinks = args.links
|
Extractor.keepLinks = args.links
|
||||||
Extractor.toHTML = args.html
|
Extractor.HtmlFormatting = args.html
|
||||||
if args.html:
|
if args.html:
|
||||||
Extractor.keepLinks = True
|
Extractor.keepLinks = True
|
||||||
|
|
||||||
@ -583,7 +593,7 @@ def main():
|
|||||||
load_templates(file)
|
load_templates(file)
|
||||||
|
|
||||||
with open(input_file) as file:
|
with open(input_file) as file:
|
||||||
page = file.read().decode('utf-8')
|
page = file.read()#.decode('utf-8')
|
||||||
m = re.search(r'<id>(.*)</id>', page)
|
m = re.search(r'<id>(.*)</id>', page)
|
||||||
id = m.group(1) if m else 0
|
id = m.group(1) if m else 0
|
||||||
m = re.search(r'<title>(.*)</title>', page)
|
m = re.search(r'<title>(.*)</title>', page)
|
||||||
|
@ -73,6 +73,11 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
|||||||
"""
|
"""
|
||||||
Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
|
Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
|
||||||
@see https://www.mediawiki.org/wiki/Help:Formatting
|
@see https://www.mediawiki.org/wiki/Help:Formatting
|
||||||
|
:param extractor: the Extractor to use.
|
||||||
|
:param text: the text to clean.
|
||||||
|
:param expand_templates: whether to perform template expansion.
|
||||||
|
:param escape_doc: whether to convert special characters to HTML entities.
|
||||||
|
@return: the cleaned text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if expand_templates:
|
if expand_templates:
|
||||||
@ -107,7 +112,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
|||||||
text = res + unescape(text[cur:])
|
text = res + unescape(text[cur:])
|
||||||
|
|
||||||
# Handle bold/italic/quote
|
# Handle bold/italic/quote
|
||||||
if extractor.toHTML:
|
if extractor.HtmlFormatting:
|
||||||
text = bold_italic.sub(r'<b>\1</b>', text)
|
text = bold_italic.sub(r'<b>\1</b>', text)
|
||||||
text = bold.sub(r'<b>\1</b>', text)
|
text = bold.sub(r'<b>\1</b>', text)
|
||||||
text = italic.sub(r'<i>\1</i>', text)
|
text = italic.sub(r'<i>\1</i>', text)
|
||||||
@ -146,7 +151,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
|||||||
for tag in discardElements:
|
for tag in discardElements:
|
||||||
text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
|
text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
|
||||||
|
|
||||||
if not extractor.toHTML:
|
if not extractor.HtmlFormatting:
|
||||||
# Turn into text what is left (&nbsp;) and <syntaxhighlight>
|
# Turn into text what is left (&nbsp;) and <syntaxhighlight>
|
||||||
text = unescape(text)
|
text = unescape(text)
|
||||||
|
|
||||||
@ -170,7 +175,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
|
|||||||
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
||||||
text = text.replace(',,', ',').replace(',.', '.')
|
text = text.replace(',,', ',').replace(',.', '.')
|
||||||
if escape_doc:
|
if escape_doc:
|
||||||
text = html.escape(text)
|
text = html.escape(text, quote=False)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
@ -202,7 +207,7 @@ def compact(text, mark_headers=False):
|
|||||||
if m:
|
if m:
|
||||||
title = m.group(2)
|
title = m.group(2)
|
||||||
lev = len(m.group(1))
|
lev = len(m.group(1))
|
||||||
if Extractor.toHTML:
|
if Extractor.HtmlFormatting:
|
||||||
page.append("<h%d>%s</h%d>" % (lev, title, lev))
|
page.append("<h%d>%s</h%d>" % (lev, title, lev))
|
||||||
if title and title[-1] not in '!?':
|
if title and title[-1] not in '!?':
|
||||||
title += '.'
|
title += '.'
|
||||||
@ -212,7 +217,7 @@ def compact(text, mark_headers=False):
|
|||||||
|
|
||||||
headers[lev] = title
|
headers[lev] = title
|
||||||
# drop previous headers
|
# drop previous headers
|
||||||
headers = { k:v for k,v in headers.items() if k > lev }
|
headers = { k:v for k,v in headers.items() if k <= lev }
|
||||||
emptySection = True
|
emptySection = True
|
||||||
continue
|
continue
|
||||||
# Handle page title
|
# Handle page title
|
||||||
@ -228,7 +233,7 @@ def compact(text, mark_headers=False):
|
|||||||
continue
|
continue
|
||||||
# handle lists
|
# handle lists
|
||||||
elif line[0] in '*#;:':
|
elif line[0] in '*#;:':
|
||||||
if Extractor.toHTML:
|
if Extractor.HtmlFormatting:
|
||||||
i = 0
|
i = 0
|
||||||
for c, n in zip_longest(listLevel, line, fillvalue=''):
|
for c, n in zip_longest(listLevel, line, fillvalue=''):
|
||||||
if not n or n not in '*#;:':
|
if not n or n not in '*#;:':
|
||||||
@ -282,13 +287,6 @@ def compact(text, mark_headers=False):
|
|||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def handle_unicode(entity):
|
|
||||||
numeric_code = int(entity[2:-1])
|
|
||||||
if numeric_code >= 0x10000:
|
|
||||||
return ''
|
|
||||||
return unichr(numeric_code)
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
def dropNested(text, openDelim, closeDelim):
|
def dropNested(text, openDelim, closeDelim):
|
||||||
@ -503,7 +501,7 @@ def makeInternalLink(title, label):
|
|||||||
# variables
|
# variables
|
||||||
|
|
||||||
|
|
||||||
class MagicWords(object):
|
class MagicWords():
|
||||||
|
|
||||||
"""
|
"""
|
||||||
One copy in each Extractor.
|
One copy in each Extractor.
|
||||||
@ -726,11 +724,11 @@ def unescape(text):
|
|||||||
try:
|
try:
|
||||||
if text[1] == "#": # character reference
|
if text[1] == "#": # character reference
|
||||||
if text[2] == "x":
|
if text[2] == "x":
|
||||||
return unichr(int(code[1:], 16))
|
return chr(int(code[1:], 16))
|
||||||
else:
|
else:
|
||||||
return unichr(int(code))
|
return chr(int(code))
|
||||||
else: # named entity
|
else: # named entity
|
||||||
return unichr(name2codepoint[code])
|
return chr(name2codepoint[code])
|
||||||
except:
|
except:
|
||||||
return text # leave as is
|
return text # leave as is
|
||||||
|
|
||||||
@ -795,8 +793,7 @@ dots = re.compile(r'\.{4,}')
|
|||||||
substWords = 'subst:|safesubst:'
|
substWords = 'subst:|safesubst:'
|
||||||
|
|
||||||
|
|
||||||
class Extractor(object):
|
class Extractor():
|
||||||
|
|
||||||
"""
|
"""
|
||||||
An extraction task on an article.
|
An extraction task on an article.
|
||||||
"""
|
"""
|
||||||
@ -809,8 +806,8 @@ class Extractor(object):
|
|||||||
keepSections = True
|
keepSections = True
|
||||||
|
|
||||||
##
|
##
|
||||||
# Whether to output HTML instead of text
|
# Whether to output text with HTML formatting elements in <doc> files.
|
||||||
toHTML = False
|
HtmlFormatting = False
|
||||||
|
|
||||||
def __init__(self, id, title, page):
|
def __init__(self, id, title, page):
|
||||||
"""
|
"""
|
||||||
@ -846,7 +843,7 @@ class Extractor(object):
|
|||||||
text = compact(text, mark_headers=mark_headers)
|
text = compact(text, mark_headers=mark_headers)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def extract(self, out):
|
def extract(self, out, escape_doc=True):
|
||||||
"""
|
"""
|
||||||
:param out: a memory file.
|
:param out: a memory file.
|
||||||
"""
|
"""
|
||||||
@ -860,7 +857,7 @@ class Extractor(object):
|
|||||||
footer = "\n</doc>\n"
|
footer = "\n</doc>\n"
|
||||||
out.write(header)
|
out.write(header)
|
||||||
|
|
||||||
text = self.clean_text(text)
|
text = self.clean_text(text, escape_doc=escape_doc)
|
||||||
|
|
||||||
for line in text:
|
for line in text:
|
||||||
out.write(line)
|
out.write(line)
|
||||||
@ -1443,7 +1440,7 @@ def normalizeNamespace(ns):
|
|||||||
# https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
|
# https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
|
||||||
|
|
||||||
|
|
||||||
class Infix:
|
class Infix():
|
||||||
|
|
||||||
"""Infix operators.
|
"""Infix operators.
|
||||||
The calling sequence for the infix is:
|
The calling sequence for the infix is:
|
||||||
|
@ -34,7 +34,7 @@ import bz2
|
|||||||
|
|
||||||
|
|
||||||
# Program version
|
# Program version
|
||||||
version = '3.0'
|
__version__ = '3.0.3'
|
||||||
|
|
||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
# READER
|
# READER
|
||||||
@ -49,15 +49,13 @@ def process_data(input_file, id, templates=False):
|
|||||||
:param id: article id
|
:param id: article id
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if input_file.lower().endswith("bz2"):
|
opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open
|
||||||
opener = bz2.BZ2File
|
|
||||||
else:
|
|
||||||
opener = open
|
|
||||||
|
|
||||||
input = opener(input_file)
|
input = opener(input_file)
|
||||||
|
|
||||||
page = []
|
page = []
|
||||||
for line in input:
|
for line in input:
|
||||||
|
line = line.decode('utf-8')
|
||||||
if '<' not in line: # faster than doing re.search()
|
if '<' not in line: # faster than doing re.search()
|
||||||
if page:
|
if page:
|
||||||
page.append(line)
|
page.append(line)
|
||||||
@ -103,7 +101,7 @@ def main():
|
|||||||
description=__doc__)
|
description=__doc__)
|
||||||
parser.add_argument("input",
|
parser.add_argument("input",
|
||||||
help="XML wiki dump file")
|
help="XML wiki dump file")
|
||||||
parser.add_argument("--id", default="",
|
parser.add_argument("--id", default="1",
|
||||||
help="article number")
|
help="article number")
|
||||||
parser.add_argument("--template", action="store_true",
|
parser.add_argument("--template", action="store_true",
|
||||||
help="template number")
|
help="template number")
|
||||||
|
Loading…
Reference in New Issue
Block a user