diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 27056e2..b4a6ed1 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -49,7 +49,6 @@ collecting template definitions.
import argparse
import bz2
-import fileinput
import logging
import os.path
import re # TODO use regex when it will be standard
@@ -112,7 +111,7 @@ modules = {
# Output
-class NextFile(object):
+class NextFile():
"""
Synchronous generation of next available file name.
@@ -143,7 +142,7 @@ class NextFile(object):
return '%s/wiki_%02d' % (self._dirname(), self.file_index)
-class OutputSplitter(object):
+class OutputSplitter():
"""
File-like object, that splits output to multiple files of a given max size.
@@ -203,7 +202,7 @@ def load_templates(file, output_file=None):
if output_file:
output = open(output_file, 'w')
for line in file:
- line = line.decode('utf-8')
+ #line = line.decode('utf-8')
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
@@ -238,18 +237,18 @@ def load_templates(file, output_file=None):
# FIXME: should reconstruct also moduleNamespace
if title.startswith(templatePrefix):
define_template(title, page)
+ templates += 1
# save templates and modules to file
if output_file and (title.startswith(templatePrefix) or
title.startswith(modulePrefix)):
output.write('\n')
- output.write(' %s\n' % title)
+                output.write('   <title>%s</title>\n' % title)
output.write(' 10\n')
output.write(' ')
for line in page:
- output.write(line)
+                    output.write(line)
output.write(' \n')
output.write('\n')
- templates += 1
page = []
articles += 1
if articles % 100000 == 0:
@@ -260,6 +259,20 @@ def load_templates(file, output_file=None):
return templates
+def decode_open(filename, mode='rt', encoding='utf-8'):
+ """
+    Open a file, decoding and decompressing it depending on extension: `gz` or `bz2`.
+ """
+ ext = os.path.splitext(filename)[1]
+ if ext == '.gz':
+ import gzip
+        return gzip.open(filename, mode, encoding=encoding)
+ elif ext == '.bz2':
+ return bz2.open(filename, mode=mode, encoding=encoding)
+ else:
+ return open(filename, mode, encoding=encoding)
+
+
def process_dump(input_file, template_file, out_file, file_size, file_compress,
process_count):
"""
@@ -275,14 +288,11 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
global templateNamespace, templatePrefix
global moduleNamespace, modulePrefix
- if input_file == '-':
- input = sys.stdin
- else:
- input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+ input = decode_open(input_file)
# collect siteinfo
for line in input:
- line = line.decode('utf-8')
+        # no decode needed: decode_open already yields text (str) lines
m = tagRE.search(line)
if not m:
continue
@@ -308,7 +318,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
template_load_start = default_timer()
if template_file and os.path.exists(template_file):
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", template_file)
- file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed)
+ file = decode_open(template_file)
templates = load_templates(file)
file.close()
else:
@@ -318,7 +328,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file)
templates = load_templates(input, template_file)
input.close()
- input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+ input = decode_open(input_file)
template_load_elapsed = default_timer() - template_load_start
logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed)
@@ -370,7 +380,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
inText = False
redirect = False
for line in input:
- line = line.decode('utf-8')
+ #line = line.decode('utf-8')
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
@@ -402,8 +412,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
page.append(line)
elif tag == '/page':
colon = title.find(':')
- if (colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and \
- not redirect and not title.startswith(templateNamespace):
+            if ((colon < 0 or title[:colon] in acceptedNamespaces) and id != last_id and
+                    not redirect and not title.startswith(templateNamespace)):
job = (id, title, page, ordinal)
jobs_queue.put(job) # goes to any available extract_process
last_id = id
@@ -539,7 +549,7 @@ def main():
args = parser.parse_args()
Extractor.keepLinks = args.links
- Extractor.toHTML = args.html
+ Extractor.HtmlFormatting = args.html
if args.html:
Extractor.keepLinks = True
@@ -583,7 +593,7 @@ def main():
load_templates(file)
with open(input_file) as file:
- page = file.read().decode('utf-8')
+ page = file.read()#.decode('utf-8')
m = re.search(r'(.*)', page)
id = m.group(1) if m else 0
m = re.search(r'
(.*)', page)
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index c0d9d75..5dd2a93 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -73,6 +73,11 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
"""
Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
@see https://www.mediawiki.org/wiki/Help:Formatting
+    :param extractor: the Extractor to use.
+ :param text: the text to clean.
+ :param expand_templates: whether to perform template expansion.
+ :param escape_doc: whether to convert special characters to HTML entities.
+ @return: the cleaned text.
"""
if expand_templates:
@@ -107,7 +112,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
text = res + unescape(text[cur:])
# Handle bold/italic/quote
- if extractor.toHTML:
+ if extractor.HtmlFormatting:
text = bold_italic.sub(r'\1', text)
text = bold.sub(r'\1', text)
text = italic.sub(r'\1', text)
@@ -146,7 +151,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
for tag in discardElements:
text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
- if not extractor.toHTML:
+ if not extractor.HtmlFormatting:
# Turn into text what is left ( ) and
text = unescape(text)
@@ -170,7 +175,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
text = text.replace(',,', ',').replace(',.', '.')
if escape_doc:
- text = html.escape(text)
+ text = html.escape(text, quote=False)
return text
@@ -202,7 +207,7 @@ def compact(text, mark_headers=False):
if m:
title = m.group(2)
lev = len(m.group(1))
- if Extractor.toHTML:
+ if Extractor.HtmlFormatting:
page.append("%s" % (lev, title, lev))
if title and title[-1] not in '!?':
title += '.'
@@ -212,7 +217,7 @@ def compact(text, mark_headers=False):
headers[lev] = title
# drop previous headers
- headers = { k:v for k,v in headers.items() if k > lev }
+ headers = { k:v for k,v in headers.items() if k <= lev }
emptySection = True
continue
# Handle page title
@@ -228,7 +233,7 @@ def compact(text, mark_headers=False):
continue
# handle lists
elif line[0] in '*#;:':
- if Extractor.toHTML:
+ if Extractor.HtmlFormatting:
i = 0
for c, n in zip_longest(listLevel, line, fillvalue=''):
if not n or n not in '*#;:':
@@ -282,13 +287,6 @@ def compact(text, mark_headers=False):
return page
-def handle_unicode(entity):
- numeric_code = int(entity[2:-1])
- if numeric_code >= 0x10000:
- return ''
- return unichr(numeric_code)
-
-
# ----------------------------------------------------------------------
def dropNested(text, openDelim, closeDelim):
@@ -503,7 +501,7 @@ def makeInternalLink(title, label):
# variables
-class MagicWords(object):
+class MagicWords():
"""
One copy in each Extractor.
@@ -726,11 +724,11 @@ def unescape(text):
try:
if text[1] == "#": # character reference
if text[2] == "x":
- return unichr(int(code[1:], 16))
+ return chr(int(code[1:], 16))
else:
- return unichr(int(code))
+ return chr(int(code))
else: # named entity
- return unichr(name2codepoint[code])
+ return chr(name2codepoint[code])
except:
return text # leave as is
@@ -795,8 +793,7 @@ dots = re.compile(r'\.{4,}')
substWords = 'subst:|safesubst:'
-class Extractor(object):
-
+class Extractor():
"""
An extraction task on a article.
"""
@@ -809,8 +806,8 @@ class Extractor(object):
keepSections = True
##
- # Whether to output HTML instead of text
- toHTML = False
+ # Whether to output text with HTML formatting elements in files.
+ HtmlFormatting = False
def __init__(self, id, title, page):
"""
@@ -846,7 +843,7 @@ class Extractor(object):
text = compact(text, mark_headers=mark_headers)
return text
- def extract(self, out):
+ def extract(self, out, escape_doc=True):
"""
:param out: a memory file.
"""
@@ -860,7 +857,7 @@ class Extractor(object):
footer = "\n\n"
out.write(header)
- text = self.clean_text(text)
+ text = self.clean_text(text, escape_doc=escape_doc)
for line in text:
out.write(line)
@@ -1443,7 +1440,7 @@ def normalizeNamespace(ns):
# https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
-class Infix:
+class Infix():
"""Infix operators.
The calling sequence for the infix is:
diff --git a/wikiextractor/extractPage.py b/wikiextractor/extractPage.py
index 89a7081..e73dcd9 100755
--- a/wikiextractor/extractPage.py
+++ b/wikiextractor/extractPage.py
@@ -34,7 +34,7 @@ import bz2
# Program version
-version = '3.0'
+__version__ = '3.0.3'
# ----------------------------------------------------------------------
# READER
@@ -49,15 +49,13 @@ def process_data(input_file, id, templates=False):
:param id: article id
"""
- if input_file.lower().endswith("bz2"):
- opener = bz2.BZ2File
- else:
- opener = open
+ opener = bz2.BZ2File if input_file.lower().endswith("bz2") else open
input = opener(input_file)
page = []
for line in input:
+        line = line.decode('utf-8') if isinstance(line, bytes) else line
if '<' not in line: # faster than doing re.search()
if page:
page.append(line)
@@ -103,7 +101,7 @@ def main():
description=__doc__)
parser.add_argument("input",
help="XML wiki dump file")
- parser.add_argument("--id", default="",
+ parser.add_argument("--id", default="1",
help="article number")
parser.add_argument("--template", action="store_true",
help="template number")