2015-03-22 20:45:17 +08:00
|
|
|
|
#!/usr/bin/python
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
#
|
|
|
|
|
# =============================================================================
|
|
|
|
|
# Version: 2.8 (Jan 10, 2015)
|
|
|
|
|
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
|
|
|
|
|
# Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
|
|
|
|
|
#
|
|
|
|
|
# Contributors:
|
|
|
|
|
# Leonardo Souza (lsouza@amtera.com.br)
|
|
|
|
|
# Juan Manuel Caicedo (juan@cavorite.com)
|
|
|
|
|
# Humberto Pereira (begini@gmail.com)
|
|
|
|
|
# Siegfried-A. Gevatter (siegfried@gevatter.com)
|
|
|
|
|
# Pedro Assis (pedroh2306@gmail.com)
|
|
|
|
|
# Wim Muskee (wimmuskee@gmail.com)
|
|
|
|
|
# Radics Geza (radicsge@gmail.com)
|
|
|
|
|
#
|
|
|
|
|
# =============================================================================
|
|
|
|
|
# Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it).
|
|
|
|
|
# =============================================================================
|
|
|
|
|
# This file is part of Tanl.
|
|
|
|
|
#
|
|
|
|
|
# Tanl is free software; you can redistribute it and/or modify it
|
|
|
|
|
# under the terms of the GNU General Public License, version 3,
|
|
|
|
|
# as published by the Free Software Foundation.
|
|
|
|
|
#
|
|
|
|
|
# Tanl is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
#
|
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
# =============================================================================
|
|
|
|
|
|
|
|
|
|
"""Wikipedia Extractor:
|
2015-04-12 16:21:35 +08:00
|
|
|
|
Extracts and cleans text from a Wikipedia database dump and stores output in a
|
2015-03-22 20:45:17 +08:00
|
|
|
|
number of files of similar size in a given directory.
|
2015-04-12 16:21:35 +08:00
|
|
|
|
Each file will contain several documents in the format:
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
<doc id="" url="" title="">
|
|
|
|
|
...
|
|
|
|
|
</doc>
|
|
|
|
|
|
|
|
|
|
This version performs template expansion by preprocesssng the whole dump and
|
2015-04-12 16:21:35 +08:00
|
|
|
|
collecting template definitions.
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import sys, os.path
|
|
|
|
|
import re, random
|
|
|
|
|
import argparse
|
|
|
|
|
from itertools import izip
|
|
|
|
|
import logging, traceback
|
|
|
|
|
import urllib
|
2015-04-12 16:21:35 +08:00
|
|
|
|
import bz2
|
2015-03-22 20:45:17 +08:00
|
|
|
|
import codecs
|
|
|
|
|
from htmlentitydefs import name2codepoint
|
2015-04-15 20:30:55 +08:00
|
|
|
|
import urllib
|
2015-03-22 20:45:17 +08:00
|
|
|
|
import Queue, threading, multiprocessing
|
|
|
|
|
|
2015-04-15 20:30:55 +08:00
|
|
|
|
#===========================================================================
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# Program version
|
2015-04-16 00:20:32 +08:00
|
|
|
|
version = '2.16'
|
2015-04-12 16:21:35 +08:00
|
|
|
|
|
|
|
|
|
### PARAMS ####################################################################
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
##
|
|
|
|
|
# Defined in <siteinfo>
|
|
|
|
|
# We include as default Template, when loading external template file.
|
|
|
|
|
knownNamespaces = set(['Template'])
|
|
|
|
|
|
|
|
|
|
##
|
|
|
|
|
# Whether to preserve links in output
|
|
|
|
|
#
|
|
|
|
|
keepLinks = False
|
|
|
|
|
|
|
|
|
|
##
|
|
|
|
|
# Whether to transform sections into HTML
|
|
|
|
|
#
|
|
|
|
|
keepSections = False
|
|
|
|
|
|
|
|
|
|
##
|
|
|
|
|
# The namespace used for template definitions
|
|
|
|
|
templateNamespace = 'Template'
|
|
|
|
|
|
|
|
|
|
##
|
|
|
|
|
# Recognize only these namespaces
|
|
|
|
|
# w: Internal links to the Wikipedia
|
|
|
|
|
# wiktionary: Wiki dictionary
|
|
|
|
|
# wikt: shortcut for Wiktionary
|
|
|
|
|
#
|
|
|
|
|
acceptedNamespaces = set(['w', 'wiktionary', 'wikt'])
|
|
|
|
|
|
|
|
|
|
##
|
|
|
|
|
# Drop these elements from article text
|
|
|
|
|
#
|
|
|
|
|
discardElements = set([
|
|
|
|
|
'gallery', 'timeline', 'noinclude', 'pre',
|
|
|
|
|
'table', 'tr', 'td', 'th', 'caption',
|
|
|
|
|
'form', 'input', 'select', 'option', 'textarea',
|
|
|
|
|
'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
|
2015-04-15 00:09:46 +08:00
|
|
|
|
'ref', 'references', 'img', 'imagemap', 'source', 'small'
|
2015-03-22 20:45:17 +08:00
|
|
|
|
])
|
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# This is obtained from <siteinfo>
|
|
|
|
|
urlbase = None
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
def get_url(id):
|
|
|
|
|
global urlbase
|
2015-04-12 16:21:35 +08:00
|
|
|
|
return "%s?curid=%s" % (urlbase, id)
|
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
#=========================================================================
|
|
|
|
|
#
|
|
|
|
|
# MediaWiki Markup Grammar
|
2015-04-11 18:29:31 +08:00
|
|
|
|
# https://www.mediawiki.org/wiki/Preprocessor_ABNF
|
|
|
|
|
|
|
|
|
|
# xml-char = %x9 / %xA / %xD / %x20-D7FF / %xE000-FFFD / %x10000-10FFFF
|
|
|
|
|
# sptab = SP / HTAB
|
2015-04-15 20:30:55 +08:00
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
# ; everything except ">" (%x3E)
|
|
|
|
|
# attr-char = %x9 / %xA / %xD / %x20-3D / %x3F-D7FF / %xE000-FFFD / %x10000-10FFFF
|
2015-04-15 20:30:55 +08:00
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
# literal = *xml-char
|
|
|
|
|
# title = wikitext-L3
|
|
|
|
|
# part-name = wikitext-L3
|
|
|
|
|
# part-value = wikitext-L3
|
|
|
|
|
# part = ( part-name "=" part-value ) / ( part-value )
|
|
|
|
|
# parts = [ title *( "|" part ) ]
|
|
|
|
|
# tplarg = "{{{" parts "}}}"
|
|
|
|
|
# template = "{{" parts "}}"
|
|
|
|
|
# link = "[[" wikitext-L3 "]]"
|
2015-04-15 20:30:55 +08:00
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
# comment = "<!--" literal "-->"
|
|
|
|
|
# unclosed-comment = "<!--" literal END
|
|
|
|
|
# ; the + in the line-eating-comment rule was absent between MW 1.12 and MW 1.22
|
|
|
|
|
# line-eating-comment = LF LINE-START *SP +( comment *SP ) LINE-END
|
2015-04-15 20:30:55 +08:00
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
# attr = *attr-char
|
|
|
|
|
# nowiki-element = "<nowiki" attr ( "/>" / ( ">" literal ( "</nowiki>" / END ) ) )
|
|
|
|
|
|
|
|
|
|
# wikitext-L2 = heading / wikitext-L3 / *wikitext-L2
|
2015-04-15 20:30:55 +08:00
|
|
|
|
# wikitext-L3 = literal / template / tplarg / link / comment /
|
|
|
|
|
# line-eating-comment / unclosed-comment / xmlish-element /
|
2015-04-11 18:29:31 +08:00
|
|
|
|
# *wikitext-L3
|
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
#------------------------------------------------------------------------------
|
|
|
|
|
|
2015-04-11 21:33:20 +08:00
|
|
|
|
selfClosingTags = [ 'br', 'hr', 'nobr', 'ref', 'references', 'nowiki' ]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# These tags are dropped, keeping their content.
|
|
|
|
|
# handle 'a' separately, depending on keepLinks
|
|
|
|
|
ignoredTags = [
|
2015-04-11 21:33:20 +08:00
|
|
|
|
'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em',
|
|
|
|
|
'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki',
|
2015-04-15 00:09:46 +08:00
|
|
|
|
'p', 'plaintext', 's', 'span', 'strike', 'strong',
|
2015-04-11 21:33:20 +08:00
|
|
|
|
'sub', 'sup', 'tt', 'u', 'var'
|
2015-03-22 20:45:17 +08:00
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
placeholder_tags = {'math':'formula', 'code':'codice'}
|
|
|
|
|
|
|
|
|
|
def normalizeTitle(title):
|
|
|
|
|
"""Normalize title"""
|
|
|
|
|
# remove leading/trailing whitespace and underscores
|
|
|
|
|
title = title.strip(' _')
|
|
|
|
|
# replace sequences of whitespace and underscore chars with a single space
|
|
|
|
|
title = re.sub(r'[\s_]+', ' ', title)
|
|
|
|
|
|
|
|
|
|
m = re.match(r'([^:]*):(\s*)(\S(?:.*))', title)
|
|
|
|
|
if m:
|
|
|
|
|
prefix = m.group(1)
|
|
|
|
|
if m.group(2):
|
|
|
|
|
optionalWhitespace = ' '
|
|
|
|
|
else:
|
|
|
|
|
optionalWhitespace = ''
|
|
|
|
|
rest = m.group(3)
|
|
|
|
|
|
|
|
|
|
ns = normalizeNamespace(prefix)
|
|
|
|
|
if ns in knownNamespaces:
|
|
|
|
|
# If the prefix designates a known namespace, then it might be
|
|
|
|
|
# followed by optional whitespace that should be removed to get
|
|
|
|
|
# the canonical page name
|
|
|
|
|
# (e.g., "Category: Births" should become "Category:Births").
|
|
|
|
|
title = ns + ":" + ucfirst(rest)
|
|
|
|
|
else:
|
|
|
|
|
# No namespace, just capitalize first letter.
|
|
|
|
|
# If the part before the colon is not a known namespace, then we
|
|
|
|
|
# must not remove the space after the colon (if any), e.g.,
|
|
|
|
|
# "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey".
|
|
|
|
|
# However, to get the canonical page name we must contract multiple
|
|
|
|
|
# spaces into one, because
|
|
|
|
|
# "3001: The_Final_Odyssey" != "3001: The_Final_Odyssey".
|
|
|
|
|
title = ucfirst(prefix) + ":" + optionalWhitespace + ucfirst(rest)
|
|
|
|
|
else:
|
|
|
|
|
# no namespace, just capitalize first letter
|
|
|
|
|
title = ucfirst(title)
|
|
|
|
|
return title
|
|
|
|
|
|
|
|
|
|
##
|
|
|
|
|
# Removes HTML or XML character references and entities from a text string.
|
|
|
|
|
#
|
|
|
|
|
# @param text The HTML (or XML) source text.
|
|
|
|
|
# @return The plain text, as a Unicode string, if necessary.
|
|
|
|
|
|
|
|
|
|
def unescape(text):
|
|
|
|
|
def fixup(m):
|
|
|
|
|
text = m.group(0)
|
|
|
|
|
code = m.group(1)
|
|
|
|
|
try:
|
|
|
|
|
if text[1] == "#": # character reference
|
|
|
|
|
if text[2] == "x":
|
|
|
|
|
return unichr(int(code[1:], 16))
|
|
|
|
|
else:
|
|
|
|
|
return unichr(int(code))
|
|
|
|
|
else: # named entity
|
|
|
|
|
return unichr(name2codepoint[code])
|
|
|
|
|
except:
|
|
|
|
|
return text # leave as is
|
|
|
|
|
|
|
|
|
|
return re.sub("&#?(\w+);", fixup, text)
|
|
|
|
|
|
|
|
|
|
# Match HTML comments
|
|
|
|
|
# The buggy template {{Template:T}} has a comment terminating with just "->"
|
|
|
|
|
comment = re.compile(r'<!--.*?-->', re.DOTALL)
|
|
|
|
|
|
|
|
|
|
# Match elements to ignore
|
2015-04-11 21:33:20 +08:00
|
|
|
|
discard_element_patterns = [re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag), re.DOTALL | re.IGNORECASE) for tag in discardElements]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# Match ignored tags
|
|
|
|
|
ignored_tag_patterns = []
|
|
|
|
|
def ignoreTag(tag):
|
|
|
|
|
left = re.compile(r'<%s\b[^>/]*>' % tag, re.IGNORECASE) # both <ref> and <reference>
|
|
|
|
|
right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
|
|
|
|
|
ignored_tag_patterns.append((left, right))
|
|
|
|
|
|
|
|
|
|
for tag in ignoredTags:
|
|
|
|
|
ignoreTag(tag)
|
|
|
|
|
|
|
|
|
|
# Match selfClosing HTML tags
|
2015-04-11 21:33:20 +08:00
|
|
|
|
selfClosing_tag_patterns = [
|
|
|
|
|
re.compile(r'<\s*%s\b[^/]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags
|
|
|
|
|
]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# Match HTML placeholder tags
|
2015-04-11 21:33:20 +08:00
|
|
|
|
placeholder_tag_patterns = [
|
|
|
|
|
(re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE),
|
|
|
|
|
repl) for tag, repl in placeholder_tags.items()
|
|
|
|
|
]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# Match preformatted lines
|
2015-04-15 20:30:55 +08:00
|
|
|
|
preformatted = re.compile(r'^ .*?$')
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# Match external links (space separates second optional parameter)
|
|
|
|
|
externalLink = re.compile(r'\[\w+[^ ]*? (.*?)]')
|
|
|
|
|
externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]')
|
|
|
|
|
|
|
|
|
|
# Matches bold/italic
|
2015-04-11 21:33:20 +08:00
|
|
|
|
bold_italic = re.compile(r"'''''(.*?)'''''")
|
|
|
|
|
bold = re.compile(r"'''(.*?)'''")
|
2015-03-22 20:45:17 +08:00
|
|
|
|
italic_quote = re.compile(r"''\"([^\"]*?)\"''")
|
2015-04-11 21:33:20 +08:00
|
|
|
|
italic = re.compile(r"''(.*?)''")
|
2015-03-22 20:45:17 +08:00
|
|
|
|
quote_quote = re.compile(r'""([^"]*?)""')
|
|
|
|
|
|
|
|
|
|
# Matches space
|
|
|
|
|
spaces = re.compile(r' {2,}')
|
|
|
|
|
|
|
|
|
|
# Matches dots
|
|
|
|
|
dots = re.compile(r'\.{4,}')
|
|
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
|
# Expand templates
|
|
|
|
|
|
2015-04-15 06:09:51 +08:00
|
|
|
|
maxTemplateRecursionLevels = 30
|
2015-03-22 20:45:17 +08:00
|
|
|
|
maxParameterRecursionLevels = 10
|
|
|
|
|
|
|
|
|
|
# check for template beginning
|
|
|
|
|
reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
def expandTemplates(wikitext, frame=[]):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
2015-04-15 00:09:46 +08:00
|
|
|
|
:param wikitext: the text to be expanded.
|
2015-04-15 16:47:02 +08:00
|
|
|
|
:param frame: invocation history.
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
Templates are frequently nested. Occasionally, parsing mistakes may cause
|
|
|
|
|
template insertion to enter an infinite loop, for instance when trying to
|
|
|
|
|
instantiate Template:Country
|
|
|
|
|
|
|
|
|
|
{{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}}
|
|
|
|
|
|
|
|
|
|
which is repeatedly trying to insert template 'country_', which is again
|
|
|
|
|
resolved to Template:Country. The straightforward solution of keeping
|
|
|
|
|
track of templates that were already inserted for the current article
|
|
|
|
|
would not work, because the same template may legally be used more than
|
|
|
|
|
once, with different parameters in different parts of the article.
|
|
|
|
|
Therefore, we limit the number of iterations of nested template inclusion.
|
|
|
|
|
"""
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
res = ''
|
2015-04-15 16:47:02 +08:00
|
|
|
|
if len(frame) >= maxTemplateRecursionLevels:
|
2015-04-11 09:43:32 +08:00
|
|
|
|
logging.warn('Max template recursion exceeded!')
|
2015-04-15 00:09:46 +08:00
|
|
|
|
return res
|
2015-04-11 09:43:32 +08:00
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
#logging.debug('<expandTemplates ' + str(len(frame)))
|
2015-04-15 00:09:46 +08:00
|
|
|
|
|
|
|
|
|
cur = 0
|
|
|
|
|
# look for matching {{...}}
|
|
|
|
|
for s,e in findMatchingBraces(wikitext, 2):
|
2015-04-15 16:47:02 +08:00
|
|
|
|
res += wikitext[cur:s] + expandTemplate(wikitext[s+2:e-2], frame)
|
2015-04-15 00:09:46 +08:00
|
|
|
|
cur = e
|
|
|
|
|
# leftover
|
|
|
|
|
res += wikitext[cur:]
|
2015-04-15 16:47:02 +08:00
|
|
|
|
logging.debug(' expandTemplates> ' + str(len(frame)) + ' ' + res)
|
2015-04-15 00:09:46 +08:00
|
|
|
|
return res
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
# parameter handling
|
|
|
|
|
|
|
|
|
|
def splitParameters(paramsList, sep='|'):
|
|
|
|
|
"""
|
2015-04-16 00:20:32 +08:00
|
|
|
|
:param paramList: the parts of a template or tplarg.
|
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
Split template parameters at the separator :param sep:, which defaults to
|
|
|
|
|
"|". The fuction can be used also to split also key-value pairs at the
|
|
|
|
|
separator "=".
|
|
|
|
|
|
|
|
|
|
Template parameters often contain URLs, internal links, text or even
|
|
|
|
|
template expressions, since we evaluate templates outside in.
|
|
|
|
|
This is required for cases like:
|
|
|
|
|
{{#if: {{{1}}} | {{lc:{{{1}}} | "parameter missing"}}
|
|
|
|
|
Parameters are separated by "|" symbols. However, we
|
|
|
|
|
cannot simply split the string on "|" symbols, since these
|
|
|
|
|
also appear inside templates and internal links, e.g.
|
|
|
|
|
|
|
|
|
|
{{if:|
|
|
|
|
|
|{{#if:the president|
|
|
|
|
|
|{{#if:|
|
|
|
|
|
[[Category:Hatnote templates|A{{PAGENAME}}]]
|
|
|
|
|
}}
|
|
|
|
|
}}
|
|
|
|
|
}}
|
|
|
|
|
|
|
|
|
|
We split parameters at the "|" symbols that are not inside any pair
|
|
|
|
|
{{{...}}}, {{...}}, [[...]], {|...|}.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
parameters = []
|
|
|
|
|
cur = 0
|
|
|
|
|
for s,e in findBalanced(paramsList,
|
2015-04-16 00:20:32 +08:00
|
|
|
|
['{{{', '{{', '[[', '[', '{\|'],
|
|
|
|
|
['}}}', '}}', ']]', ']', '\|}']):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
par = paramsList[cur:s].split(sep)
|
|
|
|
|
if par:
|
|
|
|
|
if parameters:
|
|
|
|
|
# portion before | belongs to previous parameter
|
|
|
|
|
parameters[-1] += par[0]
|
|
|
|
|
if len(par) > 1:
|
|
|
|
|
# rest are new parameters
|
|
|
|
|
parameters.extend(par[1:])
|
|
|
|
|
else:
|
|
|
|
|
parameters = par
|
|
|
|
|
elif not parameters:
|
|
|
|
|
parameters = [''] # create first param
|
|
|
|
|
# add span to last previous parameter
|
|
|
|
|
parameters[-1] += paramsList[s:e]
|
|
|
|
|
cur = e
|
|
|
|
|
# leftover
|
|
|
|
|
par = paramsList[cur:].split(sep)
|
|
|
|
|
if par:
|
|
|
|
|
if parameters:
|
|
|
|
|
# portion before | belongs to previous parameter
|
|
|
|
|
parameters[-1] += par[0]
|
|
|
|
|
if len(par) > 1:
|
|
|
|
|
# rest are new parameters
|
|
|
|
|
parameters.extend(par[1:])
|
|
|
|
|
else:
|
|
|
|
|
parameters = par
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
#logging.debug('splitParameters ' + sep + ' ' + paramsList + '\ndict: ' + str(parameters))
|
2015-03-22 20:45:17 +08:00
|
|
|
|
return parameters
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
def templateParams(parameters, frame):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
2015-04-09 21:24:34 +08:00
|
|
|
|
Build a dictionary with positional or name key to expanded parameters.
|
|
|
|
|
:param parameters: the parts[1:] of a template, i.e. all except the title.
|
2015-04-11 09:43:32 +08:00
|
|
|
|
:param depth: recusion depth.
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
|
|
|
|
templateParams = {}
|
|
|
|
|
|
|
|
|
|
if not parameters:
|
|
|
|
|
return templateParams
|
2015-04-15 16:47:02 +08:00
|
|
|
|
logging.debug('<templateParams: ' + str(len(frame)) + ' ' + '|'.join(parameters))
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-09 21:24:34 +08:00
|
|
|
|
# evaluate parameters, since they may contain templates, including the
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# symbol "=".
|
|
|
|
|
# {{#ifexpr: {{{1}}} = 1 }}
|
2015-04-15 16:47:02 +08:00
|
|
|
|
parameters = [expandTemplates(p, frame) for p in parameters]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# Parameters can be either named or unnamed. In the latter case, their
|
|
|
|
|
# name is defined by their ordinal position (1, 2, 3, ...).
|
|
|
|
|
|
|
|
|
|
unnamedParameterCounter = 0
|
|
|
|
|
|
|
|
|
|
# It's legal for unnamed parameters to be skipped, in which case they
|
|
|
|
|
# will get default values (if available) during actual instantiation.
|
|
|
|
|
# That is {{template_name|a||c}} means parameter 1 gets
|
|
|
|
|
# the value 'a', parameter 2 value is not defined, and parameter 3 gets
|
|
|
|
|
# the value 'c'. This case is correctly handled by function 'split',
|
|
|
|
|
# and does not require any special handling.
|
|
|
|
|
for param in parameters:
|
|
|
|
|
# Spaces before or after a parameter value are normally ignored,
|
|
|
|
|
# UNLESS the parameter contains a link (to prevent possible gluing
|
|
|
|
|
# the link to the following text after template substitution)
|
|
|
|
|
|
|
|
|
|
# Parameter values may contain "=" symbols, hence the parameter
|
|
|
|
|
# name extends up to the first such symbol.
|
|
|
|
|
|
|
|
|
|
# It is legal for a parameter to be specified several times, in
|
|
|
|
|
# which case the last assignment takes precedence. Example:
|
|
|
|
|
# "{{t|a|b|c|2=B}}" is equivalent to "{{t|a|B|c}}".
|
|
|
|
|
# Therefore, we don't check if the parameter has been assigned a
|
|
|
|
|
# value before, because anyway the last assignment should override
|
|
|
|
|
# any previous ones.
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# FIXME: Don't use DOTALL here since parameters may be tags with
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# attributes, e.g. <div class="templatequotecite">
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# Parameters may span several lines, like:
|
|
|
|
|
# {{Reflist|colwidth=30em|refs=
|
|
|
|
|
# <ref name="Goode">Title</ref>
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
# The '=' might occurr within an HTML attribute: "<ref name=value".
|
2015-04-15 00:09:46 +08:00
|
|
|
|
m = re.match(' *([^= ]*?) *=(.*)', param, re.DOTALL)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
if m:
|
|
|
|
|
# This is a named parameter. This case also handles parameter
|
|
|
|
|
# assignments like "2=xxx", where the number of an unnamed
|
|
|
|
|
# parameter ("2") is specified explicitly - this is handled
|
|
|
|
|
# transparently.
|
|
|
|
|
|
2015-04-15 20:30:55 +08:00
|
|
|
|
parameterName = m.group(1).strip()
|
2015-03-22 20:45:17 +08:00
|
|
|
|
parameterValue = m.group(2)
|
2015-04-15 20:30:55 +08:00
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
if ']]' not in parameterValue: # if the value does not contain a link, trim whitespace
|
|
|
|
|
parameterValue = parameterValue.strip()
|
|
|
|
|
templateParams[parameterName] = parameterValue
|
|
|
|
|
else:
|
|
|
|
|
# this is an unnamed parameter
|
|
|
|
|
unnamedParameterCounter += 1
|
|
|
|
|
|
|
|
|
|
if ']]' not in param: # if the value does not contain a link, trim whitespace
|
|
|
|
|
param = param.strip()
|
|
|
|
|
templateParams[str(unnamedParameterCounter)] = param
|
2015-04-15 16:47:02 +08:00
|
|
|
|
logging.debug(' templateParams> ' + str(len(frame)) + ' ' + '|'.join(templateParams.values()))
|
2015-03-22 20:45:17 +08:00
|
|
|
|
return templateParams
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
def findMatchingBraces(text, ldelim):
|
2015-04-09 21:24:34 +08:00
|
|
|
|
"""
|
2015-04-15 00:09:46 +08:00
|
|
|
|
:param ldelim: number of braces to match.
|
2015-04-09 21:24:34 +08:00
|
|
|
|
"""
|
|
|
|
|
# Parsing is done with respect to pairs of double braces {{..}} delimiting
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# a template, and pairs of triple braces {{{..}}} delimiting a tplarg.
|
|
|
|
|
# If double opening braces are followed by triple closing braces or
|
2015-04-09 21:24:34 +08:00
|
|
|
|
# conversely, this is taken as delimiting a template, with one left-over
|
|
|
|
|
# brace outside it, taken as plain text. For any pattern of braces this
|
|
|
|
|
# defines a set of templates and tplargs such that any two are either
|
|
|
|
|
# separate or nested (not overlapping).
|
|
|
|
|
|
|
|
|
|
# Unmatched double rectangular closing brackets can be in a template or
|
|
|
|
|
# tplarg, but unmatched double rectangular opening brackets
|
|
|
|
|
# cannot. Unmatched double or triple closing braces inside a pair of
|
|
|
|
|
# double rectangular brackets are treated as plain text.
|
|
|
|
|
# Other formulation: in ambiguity between template or tplarg on one hand,
|
|
|
|
|
# and a link on the other hand, the structure with the rightmost opening
|
|
|
|
|
# takes precedence, even if this is the opening of a link without any
|
|
|
|
|
# closing, so not producing an actual link.
|
|
|
|
|
|
|
|
|
|
# In the case of more than three opening braces the last three are assumed
|
|
|
|
|
# to belong to a tplarg, unless there is no matching triple of closing
|
|
|
|
|
# braces, in which case the last two opening braces are are assumed to
|
|
|
|
|
# belong to a template.
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# We must skip individual { like in: {{#ifeq: {{padleft:|1|}} | { | | }}
|
|
|
|
|
# We must resolve ambiguities like this:
|
|
|
|
|
# {{{{ }}}} -> { {{{ }}} }
|
|
|
|
|
# {{{{{ }}}}} -> {{ {{{ }}} }}
|
|
|
|
|
# {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|...}}
|
2015-04-15 09:43:02 +08:00
|
|
|
|
|
|
|
|
|
# Handle:
|
|
|
|
|
# {{{{{|safesubst:}}}#Invoke:String|replace|{{{1|{{{{{|safesubst:}}}PAGENAME}}}}}|%s+%([^%(]-%)$||plain=false}}
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
reOpen = re.compile('{' * ldelim) # inner
|
|
|
|
|
reClose = re.compile('([{]{2,})|(}{2,})') # at least 2
|
2015-03-22 20:45:17 +08:00
|
|
|
|
cur = 0
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# scan text after {*ldelim looking for matching }*ldelim
|
2015-03-22 20:45:17 +08:00
|
|
|
|
while True:
|
2015-04-15 00:09:46 +08:00
|
|
|
|
m1 = reOpen.search(text, cur)
|
|
|
|
|
if not m1:
|
2015-03-22 20:45:17 +08:00
|
|
|
|
return
|
2015-04-15 00:09:46 +08:00
|
|
|
|
npar = ldelim
|
|
|
|
|
end = m1.end()
|
|
|
|
|
while True:
|
|
|
|
|
m2 = reClose.search(text, end)
|
|
|
|
|
if not m2:
|
|
|
|
|
return # unbalanced
|
|
|
|
|
end = m2.end()
|
|
|
|
|
if m2.lastindex == 1:
|
|
|
|
|
npar += len(m2.group(1))
|
|
|
|
|
else:
|
2015-04-15 09:43:02 +08:00
|
|
|
|
close = len(m2.group(2))
|
|
|
|
|
if close < npar:
|
|
|
|
|
npar -= close
|
|
|
|
|
if npar < ldelim and close == ldelim and all([text[i]=='{' for i in range(m1.start(),m1.start()+npar)]):
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# spurious {
|
2015-04-15 09:43:02 +08:00
|
|
|
|
yield m1.start()+npar, end
|
2015-04-15 00:09:46 +08:00
|
|
|
|
cur = end
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
# resolve ambiguities
|
|
|
|
|
if ldelim == 3:
|
2015-04-15 09:43:02 +08:00
|
|
|
|
if close > 3 and all([text[i]=='{' for i in range(m1.start()+3,m1.start()+close)]):
|
|
|
|
|
yield m1.start()+close-3, end-close+3
|
2015-04-15 00:09:46 +08:00
|
|
|
|
elif text[m1.start()+3] == '{' and text[m1.start()+4] != '{':
|
|
|
|
|
# spurious {
|
|
|
|
|
yield m1.start()+1, end
|
2015-04-15 09:43:02 +08:00
|
|
|
|
else: # close >= npar
|
|
|
|
|
yield m1.start(), end-close+npar
|
2015-04-15 00:09:46 +08:00
|
|
|
|
else: # ldelim == 2
|
2015-04-15 09:43:02 +08:00
|
|
|
|
yield m1.start(), end-close+npar
|
2015-04-15 00:09:46 +08:00
|
|
|
|
cur = end
|
|
|
|
|
break
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
def findBalanced(text, openDelim, closeDelim, openPatterns=None,
|
|
|
|
|
startDelim=None):
|
|
|
|
|
"""
|
|
|
|
|
Assuming that text contains a properly balanced expression using
|
|
|
|
|
:param openDelim: as opening delimiters and
|
|
|
|
|
:param closeDelim: as closing delimiters.
|
|
|
|
|
:param openPatterns: use these regex patterns for matching open delimiters.
|
|
|
|
|
Sometimes patterns are ambiguous, hence specifying '{{{(?!{)' avoids
|
|
|
|
|
matching '{{{{' for '{{{'.
|
|
|
|
|
:param startDelim: start searching for this delimiter.
|
|
|
|
|
:return: an iterator producing pairs (start, end) of start and end
|
|
|
|
|
positions in text containing a balanced expression.
|
|
|
|
|
"""
|
|
|
|
|
if openPatterns:
|
|
|
|
|
openPat = '|'.join(openPatterns)
|
|
|
|
|
else:
|
|
|
|
|
openPat = '|'.join([re.escape(x) for x in openDelim])
|
|
|
|
|
# patter for delimiters expected after each opening delimiter
|
2015-04-15 20:30:55 +08:00
|
|
|
|
afterPat = { o:re.compile(openPat+'|'+c, re.DOTALL) for o,c in izip(openDelim, closeDelim)}
|
2015-03-22 20:45:17 +08:00
|
|
|
|
stack = []
|
|
|
|
|
start = 0
|
|
|
|
|
cur = 0
|
|
|
|
|
end = len(text)
|
|
|
|
|
startSet = False
|
|
|
|
|
if startDelim is not None:
|
|
|
|
|
if openPatterns:
|
|
|
|
|
startPat = re.compile(openPatterns[startDelim])
|
|
|
|
|
else:
|
|
|
|
|
startPat = re.compile(openDelim[startDelim])
|
|
|
|
|
else:
|
|
|
|
|
startPat = re.compile(openPat)
|
|
|
|
|
nextPat = startPat
|
|
|
|
|
while True:
|
|
|
|
|
next = nextPat.search(text, cur)
|
|
|
|
|
if not next:
|
|
|
|
|
return
|
|
|
|
|
if not startSet:
|
|
|
|
|
start = next.start()
|
|
|
|
|
startSet = True
|
|
|
|
|
delim = next.group(0)
|
|
|
|
|
if delim in openDelim:
|
|
|
|
|
stack.append(delim)
|
|
|
|
|
nextPat = afterPat[delim]
|
|
|
|
|
else:
|
|
|
|
|
opening = stack.pop()
|
|
|
|
|
# assert opening == openDelim[closeDelim.index(next.group(0))]
|
|
|
|
|
if stack:
|
|
|
|
|
nextPat = afterPat[stack[-1]]
|
|
|
|
|
else:
|
|
|
|
|
yield start, next.end()
|
|
|
|
|
nextPat = startPat
|
|
|
|
|
start = next.end()
|
|
|
|
|
startSet = False
|
|
|
|
|
cur = next.end()
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
# Modules
|
|
|
|
|
|
|
|
|
|
# Only minimal support
|
|
|
|
|
# FIXME: import Lua modules.
|
|
|
|
|
|
|
|
|
|
modules = {
|
|
|
|
|
'convert' : {
|
|
|
|
|
'convert': lambda x, u, *rest: x+' '+u, # no conversion
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
# variables
|
|
|
|
|
|
|
|
|
|
# FIXME: we just discard them.
|
|
|
|
|
magicWords = set([
|
|
|
|
|
'!',
|
|
|
|
|
'currentmonth',
|
|
|
|
|
'currentmonth1',
|
|
|
|
|
'currentmonthname',
|
|
|
|
|
'currentmonthnamegen',
|
|
|
|
|
'currentmonthabbrev',
|
|
|
|
|
'currentday',
|
|
|
|
|
'currentday2',
|
|
|
|
|
'currentdayname',
|
|
|
|
|
'currentyear',
|
|
|
|
|
'currenttime',
|
|
|
|
|
'currenthour',
|
|
|
|
|
'localmonth',
|
|
|
|
|
'localmonth1',
|
|
|
|
|
'localmonthname',
|
|
|
|
|
'localmonthnamegen',
|
|
|
|
|
'localmonthabbrev',
|
|
|
|
|
'localday',
|
|
|
|
|
'localday2',
|
|
|
|
|
'localdayname',
|
|
|
|
|
'localyear',
|
|
|
|
|
'localtime',
|
|
|
|
|
'localhour',
|
|
|
|
|
'numberofarticles',
|
|
|
|
|
'numberoffiles',
|
|
|
|
|
'numberofedits',
|
|
|
|
|
'articlepath',
|
|
|
|
|
'pageid',
|
|
|
|
|
'sitename',
|
|
|
|
|
'server',
|
|
|
|
|
'servername',
|
|
|
|
|
'scriptpath',
|
|
|
|
|
'stylepath',
|
|
|
|
|
'pagename',
|
|
|
|
|
'pagenamee',
|
|
|
|
|
'fullpagename',
|
|
|
|
|
'fullpagenamee',
|
|
|
|
|
'namespace',
|
|
|
|
|
'namespacee',
|
|
|
|
|
'namespacenumber',
|
|
|
|
|
'currentweek',
|
|
|
|
|
'currentdow',
|
|
|
|
|
'localweek',
|
|
|
|
|
'localdow',
|
|
|
|
|
'revisionid',
|
|
|
|
|
'revisionday',
|
|
|
|
|
'revisionday2',
|
|
|
|
|
'revisionmonth',
|
|
|
|
|
'revisionmonth1',
|
|
|
|
|
'revisionyear',
|
|
|
|
|
'revisiontimestamp',
|
|
|
|
|
'revisionuser',
|
|
|
|
|
'revisionsize',
|
|
|
|
|
'subpagename',
|
|
|
|
|
'subpagenamee',
|
|
|
|
|
'talkspace',
|
|
|
|
|
'talkspacee',
|
|
|
|
|
'subjectspace',
|
|
|
|
|
'subjectspacee',
|
|
|
|
|
'talkpagename',
|
|
|
|
|
'talkpagenamee',
|
|
|
|
|
'subjectpagename',
|
|
|
|
|
'subjectpagenamee',
|
|
|
|
|
'numberofusers',
|
|
|
|
|
'numberofactiveusers',
|
|
|
|
|
'numberofpages',
|
|
|
|
|
'currentversion',
|
|
|
|
|
'rootpagename',
|
|
|
|
|
'rootpagenamee',
|
|
|
|
|
'basepagename',
|
|
|
|
|
'basepagenamee',
|
|
|
|
|
'currenttimestamp',
|
|
|
|
|
'localtimestamp',
|
|
|
|
|
'directionmark',
|
|
|
|
|
'contentlanguage',
|
|
|
|
|
'numberofadmins',
|
|
|
|
|
'cascadingsources',
|
|
|
|
|
])
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
substWords = 'subst:|safesubst:'
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
def expandTemplate(body, frame):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
|
|
|
|
Expands template invocation.
|
2015-04-15 00:09:46 +08:00
|
|
|
|
:param body: the parts of a template.
|
2015-04-15 16:47:02 +08:00
|
|
|
|
:param frame: invocation history.
|
2015-04-09 21:24:34 +08:00
|
|
|
|
|
|
|
|
|
:see http://meta.wikimedia.org/wiki/Help:Expansion for an explanation of
|
|
|
|
|
the process.
|
|
|
|
|
|
|
|
|
|
See in particular: Expansion of names and values
|
|
|
|
|
http://meta.wikimedia.org/wiki/Help:Expansion#Expansion_of_names_and_values
|
|
|
|
|
|
|
|
|
|
For most parser functions all names and values are expanded, regardless of
|
|
|
|
|
what is relevant for the result. The branching functions (#if, #ifeq,
|
|
|
|
|
#iferror, #ifexist, #ifexpr, #switch) are exceptions.
|
|
|
|
|
|
|
|
|
|
All names in a template call are expanded, and the titles of the tplargs
|
|
|
|
|
in the template body, after which it is determined which values must be
|
|
|
|
|
expanded, and for which tplargs in the template body the first part
|
|
|
|
|
(default).
|
|
|
|
|
|
|
|
|
|
In the case of a tplarg, any parts beyond the first are never expanded.
|
|
|
|
|
The possible name and the value of the first part is expanded if the title
|
|
|
|
|
does not match a name in the template call.
|
|
|
|
|
|
|
|
|
|
:see code for braceSubstitution at
|
2015-03-22 20:45:17 +08:00
|
|
|
|
https://doc.wikimedia.org/mediawiki-core/master/php/html/Parser_8php_source.html#3397:
|
2015-04-09 21:24:34 +08:00
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# template = "{{" parts "}}"
|
|
|
|
|
|
2015-04-09 21:24:34 +08:00
|
|
|
|
# Templates and tplargs are decomposed in the same way, with pipes as
|
|
|
|
|
# separator, even though eventually any parts in a tplarg after the first
|
|
|
|
|
# (the parameter default) are ignored, and an equals sign in the first
|
|
|
|
|
# part is treated as plain text.
|
|
|
|
|
# Pipes inside inner templates and tplargs, or inside double rectangular
|
|
|
|
|
# brackets within the template or tplargs are not taken into account in
|
|
|
|
|
# this decomposition.
|
|
|
|
|
# The first part is called title, the other parts are simply called parts.
|
|
|
|
|
|
|
|
|
|
# If a part has one or more equals signs in it, the first equals sign
|
|
|
|
|
# determines the division into name = value. Equals signs inside inner
|
|
|
|
|
# templates and tplargs, or inside double rectangular brackets within the
|
|
|
|
|
# part are not taken into account in this decomposition. Parts without
|
|
|
|
|
# equals sign are indexed 1, 2, .., given as attribute in the <name> tag.
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
if len(frame) >= maxTemplateRecursionLevels:
|
2015-04-15 00:09:46 +08:00
|
|
|
|
logging.warn('Reached max template recursion: %d' %
|
|
|
|
|
maxTemplateRecursionLevels)
|
2015-04-15 16:47:02 +08:00
|
|
|
|
logging.debug(' INVOCATION> ' + str(len(frame)) + ' ' + body)
|
2015-04-09 21:24:34 +08:00
|
|
|
|
return ''
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
logging.debug('INVOCATION ' + str(len(frame)) + ' ' + body)
|
2015-04-11 09:43:32 +08:00
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
parts = splitParameters(body)
|
2015-04-11 09:43:32 +08:00
|
|
|
|
# title is the portion before the first |
|
2015-04-15 09:43:02 +08:00
|
|
|
|
logging.debug('TITLE ' + parts[0].strip())
|
2015-04-15 16:47:02 +08:00
|
|
|
|
title = expandTemplates(parts[0].strip(), frame)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# SUBST
|
2015-04-11 09:43:32 +08:00
|
|
|
|
if re.match(substWords, title):
|
|
|
|
|
if title.startswith('subst'):
|
2015-04-15 00:09:46 +08:00
|
|
|
|
return body
|
2015-04-11 09:43:32 +08:00
|
|
|
|
title = re.sub(substWords, '', title)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-11 09:43:32 +08:00
|
|
|
|
if title.lower() in magicWords:
|
|
|
|
|
if title == '!':
|
2015-03-22 20:45:17 +08:00
|
|
|
|
return '|'
|
|
|
|
|
return '' # FIXME: get variable value
|
|
|
|
|
|
|
|
|
|
# Parser functions
|
|
|
|
|
# The first argument is everything after the first colon.
|
2015-04-11 09:43:32 +08:00
|
|
|
|
colon = title.find(':')
|
2015-03-22 20:45:17 +08:00
|
|
|
|
if colon > 1:
|
2015-04-11 09:43:32 +08:00
|
|
|
|
funct = title[:colon]
|
|
|
|
|
parts[0] = title[colon+1:].strip() # side-effect (parts[0] not used later)
|
|
|
|
|
# arguments after first are not evaluated
|
2015-04-15 16:47:02 +08:00
|
|
|
|
ret = callParserFunction(funct, parts, frame)
|
|
|
|
|
return expandTemplates(ret, frame)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-11 09:43:32 +08:00
|
|
|
|
title = fullyQualifiedTemplateTitle(title)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
redirected = redirects.get(title)
|
|
|
|
|
if redirected:
|
|
|
|
|
title = redirected
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
if title not in templates:
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# The page being included could not be identified
|
2015-04-15 00:09:46 +08:00
|
|
|
|
return ''
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# get the template
|
|
|
|
|
template = templates[title]
|
|
|
|
|
logging.debug('TEMPLATE ' + template)
|
|
|
|
|
|
|
|
|
|
# tplarg = "{{{" parts "}}}"
|
|
|
|
|
# parts = [ title *( "|" part ) ]
|
|
|
|
|
# part = ( part-name "=" part-value ) / ( part-value )
|
|
|
|
|
# part-name = wikitext-L3
|
|
|
|
|
# part-value = wikitext-L3
|
2015-04-15 20:30:55 +08:00
|
|
|
|
# wikitext-L3 = literal / template / tplarg / link / comment /
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# line-eating-comment / unclosed-comment /
|
|
|
|
|
# xmlish-element / *wikitext-L3
|
|
|
|
|
|
|
|
|
|
# A tplarg may contain other parameters as well as templates, e.g.:
|
|
|
|
|
# {{{text|{{{quote|{{{1|{{error|Error: No text given}}}}}}}}}}}
|
|
|
|
|
# hence no simple RE like this would work:
|
|
|
|
|
# '{{{((?:(?!{{{).)*?)}}}'
|
|
|
|
|
# We must use full CF parsing.
|
|
|
|
|
|
|
|
|
|
# the parameter name itself might be computed, e.g.:
|
|
|
|
|
# {{{appointe{{#if:{{{appointer14|}}}|r|d}}14|}}}
|
|
|
|
|
|
|
|
|
|
# Because of the multiple uses of double-brace and triple-brace
|
|
|
|
|
# syntax, expressions can sometimes be ambiguous.
|
|
|
|
|
# Precedence rules specifed here:
|
|
|
|
|
# http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
|
|
|
|
|
# resolve ambiguities like this:
|
|
|
|
|
# {{{{ }}}} -> { {{{ }}} }
|
|
|
|
|
# {{{{{ }}}}} -> {{ {{{ }}} }}
|
2015-04-15 20:30:55 +08:00
|
|
|
|
#
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# :see: https://en.wikipedia.org/wiki/Help:Template#Handling_parameters
|
|
|
|
|
|
|
|
|
|
# Evaluate parameters.
|
|
|
|
|
# build a dict of name-values for the parameter values
|
2015-04-15 16:47:02 +08:00
|
|
|
|
params = templateParams(parts[1:], frame)
|
2015-04-15 00:09:46 +08:00
|
|
|
|
|
|
|
|
|
# Perform parameter substitution
|
2015-04-15 16:47:02 +08:00
|
|
|
|
instantiated = substParameters(template, params, frame)
|
|
|
|
|
#logging.debug('instantiated ' + str(len(frame)) + ' ' + template)
|
|
|
|
|
frame.append((title, params))
|
|
|
|
|
value = expandTemplates(instantiated, frame)
|
|
|
|
|
frame.pop()
|
|
|
|
|
logging.debug(' INVOCATION> ' + str(len(frame)) + ' ' + value)
|
2015-04-15 00:09:46 +08:00
|
|
|
|
return value
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
def substParameters(body, params, frame, subst_depth=0):
|
2015-04-09 21:24:34 +08:00
|
|
|
|
"""
|
2015-04-15 00:09:46 +08:00
|
|
|
|
:param body: the body of a template.
|
|
|
|
|
:param params: dict of name-values template parameters.
|
2015-04-15 16:47:02 +08:00
|
|
|
|
:param frame: invocation history.
|
2015-04-15 00:09:46 +08:00
|
|
|
|
:param subst_depth: depth of recursive parameter substitutions.
|
2015-04-09 21:24:34 +08:00
|
|
|
|
"""
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# We perform parameter substitutions recursively.
|
|
|
|
|
# We also limit the maximum number of iterations to avoid too long or
|
|
|
|
|
# even endless loops (in case of malformed input).
|
|
|
|
|
|
|
|
|
|
# :see: http://meta.wikimedia.org/wiki/Help:Expansion#Distinction_between_variables.2C_parser_functions.2C_and_templates
|
|
|
|
|
#
|
|
|
|
|
# Parameter values are assigned to parameters in two (?) passes.
|
|
|
|
|
# Therefore a parameter name in a template can depend on the value of
|
|
|
|
|
# another parameter of the same template, regardless of the order in
|
|
|
|
|
# which they are specified in the template call, for example, using
|
|
|
|
|
# Template:ppp containing "{{{{{{p}}}}}}", {{ppp|p=q|q=r}} and even
|
|
|
|
|
# {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
|
|
|
|
|
# "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
logging.debug('substParameters (%d, %d) %s' % (len(frame), subst_depth, body))
|
2015-04-15 06:09:51 +08:00
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
result = ''
|
2015-04-15 06:09:51 +08:00
|
|
|
|
if subst_depth > maxParameterRecursionLevels:
|
2015-04-15 00:09:46 +08:00
|
|
|
|
logging.warn('Reachead maximum parameter recursions: %d' %
|
|
|
|
|
maxParameterRecursionLevels)
|
|
|
|
|
return result
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
start = 0
|
|
|
|
|
# we must handle nesting, s.a.
|
|
|
|
|
# {{{1|{{PAGENAME}}}
|
|
|
|
|
# {{{italics|{{{italic|}}}
|
|
|
|
|
# {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|
|
|
|
|
|
#
|
2015-04-15 09:43:02 +08:00
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
for s,e in findMatchingBraces(body, 3):
|
|
|
|
|
# invoke substParameter on outer {{{}}}
|
|
|
|
|
result += body[start:s] + substParameter(body[s+3:e-3],
|
2015-04-15 16:47:02 +08:00
|
|
|
|
params, frame, subst_depth+1)
|
2015-04-15 00:09:46 +08:00
|
|
|
|
start = e
|
|
|
|
|
result += body[start:] # leftover
|
|
|
|
|
return result
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
def substParameter(parameter, params, frame, subst_depth):
|
2015-04-15 00:09:46 +08:00
|
|
|
|
"""
|
|
|
|
|
:param parameter: the parts of a tplarg.
|
|
|
|
|
:param params: dict of name-values of template parameters.
|
|
|
|
|
"""
|
2015-04-09 21:24:34 +08:00
|
|
|
|
# the parameter name itself might contain templates, e.g.:
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# appointe{{#if:{{{appointer14|}}}|r|d}}14|
|
|
|
|
|
|
2015-04-09 21:24:34 +08:00
|
|
|
|
# any parts in a tplarg after the first (the parameter default) are
|
|
|
|
|
# ignored, and an equals sign in the first part is treated as plain text.
|
2015-04-15 00:09:46 +08:00
|
|
|
|
#logging.debug(' subst ' + parameter + ' ' + str(params))
|
2015-04-09 21:24:34 +08:00
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
parts = splitParameters(parameter)
|
|
|
|
|
if len(parts) > 1:
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# This parameter has a default value
|
2015-04-15 16:47:02 +08:00
|
|
|
|
paramName = expandTemplates(substParameters(parts[0], params, frame, subst_depth), frame)
|
|
|
|
|
defaultValue = substParameters(parts[1], params, frame, subst_depth)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
if paramName in params:
|
|
|
|
|
return params[paramName] # use parameter value specified in template invocation
|
2015-03-22 20:45:17 +08:00
|
|
|
|
else: # use the default value
|
2015-04-15 16:47:02 +08:00
|
|
|
|
return expandTemplates(defaultValue, frame)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# parameter without a default value
|
2015-04-15 16:47:02 +08:00
|
|
|
|
parameter = substParameters(parameter, params, frame, subst_depth)
|
|
|
|
|
parameter = expandTemplates(parameter, frame)
|
2015-04-15 00:09:46 +08:00
|
|
|
|
if parameter in params:
|
|
|
|
|
return params[parameter] # use parameter value specified in template invocation
|
|
|
|
|
# Parameter not specified in template invocation and without
|
|
|
|
|
# a default value.
|
|
|
|
|
# The Wiki rules for templates
|
|
|
|
|
# (see http://meta.wikimedia.org/wiki/Help:Template)
|
|
|
|
|
# would require to keep the parameter in 3 braces, but we just drop them.
|
|
|
|
|
return ''
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# Surplus parameters - i.e., those assigned values in template
|
2015-04-09 21:24:34 +08:00
|
|
|
|
# invocation but not used in the template body - are simply ignored.
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
# parser functions utilities
|
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
def ucfirst(string):
|
|
|
|
|
""":return: a string with its first character uppercase"""
|
|
|
|
|
if string:
|
|
|
|
|
if len(string) > 1:
|
|
|
|
|
return string[0].upper() + string[1:]
|
|
|
|
|
else:
|
|
|
|
|
return string.upper()
|
|
|
|
|
else:
|
|
|
|
|
return ''
|
|
|
|
|
|
|
|
|
|
def lcfirst(string):
|
|
|
|
|
""":return: a string with its first character lowercase"""
|
|
|
|
|
if string:
|
|
|
|
|
if len(string) > 1:
|
|
|
|
|
return string[0].lower() + string[1:]
|
|
|
|
|
else:
|
|
|
|
|
return string.lower()
|
|
|
|
|
else:
|
|
|
|
|
return ''
|
|
|
|
|
|
|
|
|
|
def fullyQualifiedTemplateTitle(templateTitle):
|
|
|
|
|
"""
|
|
|
|
|
Determine the namespace of the page being included through the template
|
|
|
|
|
mechanism
|
|
|
|
|
"""
|
|
|
|
|
if templateTitle.startswith(':'):
|
|
|
|
|
# Leading colon by itself implies main namespace, so strip this colon
|
|
|
|
|
return ucfirst(templateTitle[1:])
|
|
|
|
|
else:
|
|
|
|
|
m = re.match('([^:]*)(:.*)', templateTitle)
|
|
|
|
|
if m:
|
|
|
|
|
# colon found but not in the first position - check if it
|
|
|
|
|
# designates a known namespace
|
|
|
|
|
prefix = normalizeNamespace(m.group(1))
|
|
|
|
|
if prefix in knownNamespaces:
|
|
|
|
|
return prefix + ucfirst(m.group(2))
|
|
|
|
|
# The title of the page being included is NOT in the main namespace and
|
|
|
|
|
# lacks any other explicit designation of the namespace - therefore, it
|
|
|
|
|
# is resolved to the Template namespace (that's the default for the
|
|
|
|
|
# template inclusion mechanism).
|
|
|
|
|
|
|
|
|
|
# This is a defense against pages whose title only contains UTF-8 chars
|
|
|
|
|
# that are reduced to an empty string. Right now I can think of one such
|
|
|
|
|
# case - <C2><A0> which represents the non-breaking space.
|
|
|
|
|
# In this particular case, this page is a redirect to [[Non-nreaking
|
|
|
|
|
# space]], but having in the system a redirect page with an empty title
|
|
|
|
|
# causes numerous problems, so we'll live happier without it.
|
|
|
|
|
if templateTitle:
|
|
|
|
|
return "Template:" + ucfirst(templateTitle)
|
|
|
|
|
else:
|
|
|
|
|
logging.warn("Skipping page with empty title")
|
|
|
|
|
return ''
|
|
|
|
|
|
|
|
|
|
def normalizeNamespace(ns):
|
|
|
|
|
return ucfirst(ns)
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
# see http://www.mediawiki.org/wiki/Help:Extension:ParserFunctions
|
|
|
|
|
# https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php
|
|
|
|
|
|
2015-04-15 06:09:51 +08:00
|
|
|
|
class Infix:
|
|
|
|
|
"""Infix operators.
|
|
|
|
|
The calling sequence for the infix is:
|
|
|
|
|
x |op| y
|
|
|
|
|
"""
|
|
|
|
|
def __init__(self, function):
|
|
|
|
|
self.function = function
|
|
|
|
|
def __ror__(self, other):
|
|
|
|
|
return Infix(lambda x, self=self, other=other: self.function(other, x))
|
|
|
|
|
def __or__(self, other):
|
|
|
|
|
return self.function(other)
|
|
|
|
|
def __rlshift__(self, other):
|
|
|
|
|
return Infix(lambda x, self=self, other=other: self.function(other, x))
|
|
|
|
|
def __rshift__(self, other):
|
|
|
|
|
return self.function(other)
|
|
|
|
|
def __call__(self, value1, value2):
|
|
|
|
|
return self.function(value1, value2)
|
|
|
|
|
|
|
|
|
|
ROUND = Infix(lambda x,y: round(x, y))
|
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
def sharp_expr(expr):
|
|
|
|
|
try:
|
|
|
|
|
expr = re.sub('mod', '%', expr)
|
2015-04-15 06:09:51 +08:00
|
|
|
|
expr = re.sub('\bdiv\b', '/', expr)
|
|
|
|
|
expr = re.sub('\bround\b', '|ROUND|', expr)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
return str(eval(expr))
|
|
|
|
|
except:
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
def sharp_if(testValue, valueIfTrue, valueIfFalse=None, *args):
|
|
|
|
|
if testValue.strip():
|
|
|
|
|
# The {{#if:}} function is an if-then-else construct.
|
2015-04-15 20:30:55 +08:00
|
|
|
|
# The applied condition is: "The condition string is non-empty".
|
2015-03-22 20:45:17 +08:00
|
|
|
|
valueIfTrue = valueIfTrue.strip()
|
|
|
|
|
if valueIfTrue:
|
|
|
|
|
return valueIfTrue
|
|
|
|
|
elif valueIfFalse:
|
|
|
|
|
return valueIfFalse.strip()
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
def sharp_ifeq(lvalue, rvalue, valueIfTrue, valueIfFalse=None, *args):
|
|
|
|
|
rvalue = rvalue.strip()
|
|
|
|
|
if rvalue:
|
|
|
|
|
# lvalue is always defined
|
|
|
|
|
if lvalue.strip() == rvalue:
|
|
|
|
|
# The {{#ifeq:}} function is an if-then-else construct. The
|
|
|
|
|
# applied condition is "is rvalue equal to lvalue". Note that this
|
|
|
|
|
# does only string comparison while MediaWiki implementation also
|
|
|
|
|
# supports numerical comparissons.
|
|
|
|
|
|
|
|
|
|
if valueIfTrue:
|
|
|
|
|
return valueIfTrue.strip()
|
|
|
|
|
else:
|
|
|
|
|
if valueIfFalse:
|
|
|
|
|
return valueIfFalse.strip()
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
def sharp_iferror(test, then='', Else=None, *args):
|
|
|
|
|
if re.match('<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', test):
|
|
|
|
|
return then
|
|
|
|
|
elif Else is None:
|
|
|
|
|
return test.strip()
|
|
|
|
|
else:
|
|
|
|
|
return Else.strip()
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
def sharp_switch(primary, *params):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# FIXME: we don't support numeric expressions in primary
|
|
|
|
|
|
|
|
|
|
# {{#switch: comparison string
|
|
|
|
|
# | case1 = result1
|
2015-04-15 20:30:55 +08:00
|
|
|
|
# | case2
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# | case4 = result2
|
|
|
|
|
# | #default = result3
|
|
|
|
|
# }}
|
|
|
|
|
|
|
|
|
|
primary = primary.strip()
|
|
|
|
|
found = False
|
|
|
|
|
default = None
|
|
|
|
|
rvalue = None
|
|
|
|
|
lvalue = ''
|
2015-04-15 00:09:46 +08:00
|
|
|
|
for param in params:
|
2015-04-16 00:20:32 +08:00
|
|
|
|
# handle cases like:
|
|
|
|
|
# [http://www.perseus.tufts.edu/hopper/text?doc=Perseus...]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
pair = splitParameters(param, '=')
|
|
|
|
|
lvalue = pair[0].strip()
|
|
|
|
|
rvalue = None
|
|
|
|
|
if len(pair) > 1:
|
|
|
|
|
# got "="
|
|
|
|
|
rvalue = pair[1].strip()
|
|
|
|
|
if found or lvalue == primary:
|
|
|
|
|
# Found a match, return now
|
2015-04-12 16:21:35 +08:00
|
|
|
|
return rvalue
|
|
|
|
|
elif lvalue == '#default':
|
2015-03-22 20:45:17 +08:00
|
|
|
|
default = rvalue
|
2015-04-12 16:21:35 +08:00
|
|
|
|
rvalue = None # avoid defaulting to last case
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# else wrong case, continue
|
|
|
|
|
elif lvalue == primary:
|
|
|
|
|
# If the value matches, set a flag and continue
|
|
|
|
|
found = True
|
|
|
|
|
# Default case
|
|
|
|
|
# Check if the last item had no = sign, thus specifying the default case
|
2015-04-12 16:21:35 +08:00
|
|
|
|
if rvalue is not None:
|
2015-03-22 20:45:17 +08:00
|
|
|
|
return lvalue
|
2015-04-12 16:21:35 +08:00
|
|
|
|
elif default is not None:
|
2015-03-22 20:45:17 +08:00
|
|
|
|
return default
|
|
|
|
|
return ''
|
|
|
|
|
|
2015-04-09 21:24:34 +08:00
|
|
|
|
# Extension Scribuntu
|
2015-04-15 16:47:02 +08:00
|
|
|
|
def sharp_invoke(module, function, frame):
|
|
|
|
|
functions = modules.get(module)
|
|
|
|
|
if functions:
|
|
|
|
|
funct = functions.get(function)
|
|
|
|
|
if funct:
|
|
|
|
|
# find parameters in frame whose title is the one of the original
|
|
|
|
|
# template invocation
|
|
|
|
|
templateTitle = fullyQualifiedTemplateTitle(function)
|
|
|
|
|
pair = next((x for x in frame if x[0] == templateTitle), None)
|
|
|
|
|
if pair:
|
|
|
|
|
params = pair[1]
|
|
|
|
|
# extract positional args
|
|
|
|
|
params = [params.get(str(i+1)) for i in range(len(params))]
|
|
|
|
|
return funct(*params)
|
|
|
|
|
else:
|
|
|
|
|
return funct()
|
|
|
|
|
return ''
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
parserFunctions = {
|
|
|
|
|
|
|
|
|
|
'#expr': sharp_expr,
|
|
|
|
|
|
|
|
|
|
'#if': sharp_if,
|
|
|
|
|
|
|
|
|
|
'#ifeq': sharp_ifeq,
|
|
|
|
|
|
|
|
|
|
'#iferror': sharp_iferror,
|
|
|
|
|
|
|
|
|
|
'#ifexpr': lambda *args: '', # not supported
|
|
|
|
|
|
|
|
|
|
'#ifexist': lambda *args: '', # not supported
|
|
|
|
|
|
|
|
|
|
'#rel2abs': lambda *args: '', # not supported
|
|
|
|
|
|
|
|
|
|
'#switch': sharp_switch,
|
|
|
|
|
|
|
|
|
|
'#language': lambda *args: '', # not supported
|
|
|
|
|
|
|
|
|
|
'#time': lambda *args: '', # not supported
|
|
|
|
|
|
|
|
|
|
'#timel': lambda *args: '', # not supported
|
|
|
|
|
|
|
|
|
|
'#titleparts': lambda *args: '', # not supported
|
|
|
|
|
|
|
|
|
|
# This function is used in some pages to construct links
|
|
|
|
|
# http://meta.wikimedia.org/wiki/Help:URL
|
|
|
|
|
'urlencode': lambda string, *rest: urllib.quote(string.encode('utf-8')),
|
|
|
|
|
|
|
|
|
|
'lc': lambda string: string.lower() if string else '',
|
|
|
|
|
|
|
|
|
|
'lcfirst': lambda string: lcfirst(string),
|
|
|
|
|
|
|
|
|
|
'lc': lambda string: string.upper() if string else '',
|
|
|
|
|
|
|
|
|
|
'ucfirst': lambda string: ucfirst(string),
|
|
|
|
|
|
|
|
|
|
'int': lambda string: string,
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
2015-04-15 16:47:02 +08:00
|
|
|
|
def callParserFunction(functionName, args, frame):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
|
|
|
|
Parser functions have similar syntax as templates, except that
|
|
|
|
|
the first argument is everything after the first colon.
|
2015-04-11 09:43:32 +08:00
|
|
|
|
:return: the result of the invocation, None in case of failure.
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
http://meta.wikimedia.org/wiki/Help:ParserFunctions
|
|
|
|
|
"""
|
2015-04-15 20:30:55 +08:00
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
try:
|
2015-04-15 16:47:02 +08:00
|
|
|
|
if functionName == '#invoke':
|
|
|
|
|
# special handling of frame
|
|
|
|
|
ret = sharp_invoke(args[0].strip(), args[1].strip(), frame)
|
|
|
|
|
logging.debug('parserFunction> ' + functionName + ' ' + ret)
|
|
|
|
|
return ret
|
2015-03-22 20:45:17 +08:00
|
|
|
|
if functionName in parserFunctions:
|
2015-04-15 00:09:46 +08:00
|
|
|
|
ret = parserFunctions[functionName](*args)
|
|
|
|
|
logging.debug('parserFunction> ' + functionName + ' ' + ret)
|
|
|
|
|
return ret
|
2015-03-22 20:45:17 +08:00
|
|
|
|
except:
|
2015-04-11 09:43:32 +08:00
|
|
|
|
return "" # FIXME: fix errors
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
# Expand using WikiMedia API
|
|
|
|
|
# import json
|
|
|
|
|
|
|
|
|
|
# def expandTemplates(text):
|
|
|
|
|
# """Expand templates invoking MediaWiki API"""
|
|
|
|
|
# text = urlib.urlencodew(text.encode('utf-8'))
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# base = urlbase[:urlbase.rfind('/')]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text
|
|
|
|
|
# exp = json.loads(urllib.urlopen(url))
|
|
|
|
|
# return exp['expandtemplates']['*']
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
# Extract Template definition
|
|
|
|
|
|
|
|
|
|
reNoinclude = re.compile(r'<noinclude>(?:.*?)</noinclude>', re.DOTALL)
|
|
|
|
|
reIncludeonly = re.compile(r'<includeonly>|</includeonly>', re.DOTALL)
|
|
|
|
|
|
|
|
|
|
templates = {}
|
|
|
|
|
redirects = {}
|
|
|
|
|
|
|
|
|
|
def define_template(title, page):
|
|
|
|
|
global templates
|
|
|
|
|
global redirects
|
|
|
|
|
|
|
|
|
|
#title = normalizeTitle(title)
|
|
|
|
|
|
|
|
|
|
# check for redirects
|
|
|
|
|
m = re.match('#REDIRECT.*?\[\[([^\]]*)]]', page[0])
|
|
|
|
|
if m:
|
|
|
|
|
redirects[title] = m.group(1) #normalizeTitle(m.group(1))
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
text = unescape(''.join(page))
|
|
|
|
|
|
|
|
|
|
# We're storing template text for future inclusion, therefore,
|
|
|
|
|
# remove all <noinclude> text and keep all <includeonly> text
|
|
|
|
|
# (but eliminate <includeonly> tags per se).
|
|
|
|
|
# However, if <onlyinclude> ... </onlyinclude> parts are present,
|
|
|
|
|
# then only keep them and discard the rest of the template body.
|
|
|
|
|
# This is because using <onlyinclude> on a text fragment is
|
|
|
|
|
# equivalent to enclosing it in <includeonly> tags **AND**
|
|
|
|
|
# enclosing all the rest of the template body in <noinclude> tags.
|
|
|
|
|
|
|
|
|
|
# remove comments
|
|
|
|
|
text = comment.sub('', text)
|
|
|
|
|
|
2015-04-15 09:43:02 +08:00
|
|
|
|
# eliminate <noinclude> fragments
|
|
|
|
|
text = reNoinclude.sub('', text)
|
|
|
|
|
# eliminate unterminated <noinclude> elements
|
|
|
|
|
text = re.sub(r'<noinclude\s*>.*$', '', text, flags=re.DOTALL)
|
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
onlyincludeAccumulator = ''
|
|
|
|
|
for m in re.finditer('<onlyinclude>(.*?)</onlyinclude>', text, re.DOTALL):
|
2015-04-15 09:43:02 +08:00
|
|
|
|
onlyincludeAccumulator += m.group(1)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
if onlyincludeAccumulator:
|
|
|
|
|
text = onlyincludeAccumulator
|
|
|
|
|
else:
|
|
|
|
|
text = reIncludeonly.sub('', text)
|
|
|
|
|
|
|
|
|
|
if text:
|
|
|
|
|
if title in templates:
|
|
|
|
|
logging.warn('Redefining: ' + title)
|
|
|
|
|
templates[title] = text
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def dropNested(text, openDelim, closeDelim):
|
|
|
|
|
"""
|
|
|
|
|
A matching function for nested expressions, e.g. namespaces and tables.
|
|
|
|
|
"""
|
|
|
|
|
openRE = re.compile(openDelim)
|
|
|
|
|
closeRE = re.compile(closeDelim)
|
|
|
|
|
# partition text in separate blocks { } { }
|
|
|
|
|
spans = [] # pairs (s, e) for each partition
|
|
|
|
|
nest = 0 # nesting level
|
|
|
|
|
start = openRE.search(text, 0)
|
|
|
|
|
if not start:
|
|
|
|
|
return text
|
|
|
|
|
end = closeRE.search(text, start.end())
|
|
|
|
|
next = start
|
|
|
|
|
while end:
|
|
|
|
|
next = openRE.search(text, next.end())
|
|
|
|
|
if not next: # termination
|
|
|
|
|
while nest: # close all pending
|
|
|
|
|
nest -=1
|
|
|
|
|
end0 = closeRE.search(text, end.end())
|
|
|
|
|
if end0:
|
|
|
|
|
end = end0
|
|
|
|
|
else:
|
|
|
|
|
break
|
|
|
|
|
spans.append((start.start(), end.end()))
|
|
|
|
|
break
|
|
|
|
|
while end.end() < next.start():
|
|
|
|
|
# { } {
|
|
|
|
|
if nest:
|
|
|
|
|
nest -= 1
|
|
|
|
|
# try closing more
|
|
|
|
|
last = end.end()
|
|
|
|
|
end = closeRE.search(text, end.end())
|
|
|
|
|
if not end: # unbalanced
|
|
|
|
|
if spans:
|
|
|
|
|
span = (spans[0][0], last)
|
|
|
|
|
else:
|
|
|
|
|
span = (start.start(), last)
|
|
|
|
|
spans = [span]
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
spans.append((start.start(), end.end()))
|
|
|
|
|
# advance start, find next close
|
|
|
|
|
start = next
|
|
|
|
|
end = closeRE.search(text, next.end())
|
|
|
|
|
break # { }
|
|
|
|
|
if next != start:
|
|
|
|
|
# { { }
|
|
|
|
|
nest += 1
|
|
|
|
|
# collect text outside partitions
|
|
|
|
|
return dropSpans(spans, text)
|
|
|
|
|
|
|
|
|
|
def dropSpans(spans, text):
|
|
|
|
|
"""
|
|
|
|
|
Drop from text the blocks identified in :param spans:, possibly nested.
|
|
|
|
|
"""
|
|
|
|
|
spans.sort()
|
|
|
|
|
res = ''
|
|
|
|
|
offset = 0
|
|
|
|
|
for s, e in spans:
|
|
|
|
|
if offset <= s: # handle nesting
|
|
|
|
|
if offset < s:
|
|
|
|
|
res += text[offset:s]
|
|
|
|
|
offset = e
|
|
|
|
|
res += text[offset:]
|
|
|
|
|
return res
|
|
|
|
|
|
|
|
|
|
# Match interwiki links, | separates parameters.
|
|
|
|
|
# First parameter is displayed, also trailing concatenated text included
|
2015-04-16 00:20:32 +08:00
|
|
|
|
# in display, e.g. 's' for plural).
|
2015-03-22 20:45:17 +08:00
|
|
|
|
#
|
|
|
|
|
# Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc.
|
|
|
|
|
# We first expand inner ones, than remove enclosing ones.
|
2015-04-15 00:09:46 +08:00
|
|
|
|
# Deal also with: [[Help:IPA for Catalan|[anˈdɔra]]]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
parametrizedLink = re.compile(r'\[\[[^\]]*?]]')
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# Function applied to wikiLinks
|
2015-04-15 06:09:51 +08:00
|
|
|
|
def make_anchor_tag(link, trail):
|
2015-04-16 00:20:32 +08:00
|
|
|
|
if link[1] == '[':
|
|
|
|
|
parts = splitParameters(link[2:-2])
|
|
|
|
|
else:
|
|
|
|
|
parts = splitParameters(link[1:-1])
|
|
|
|
|
link = parts[0]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
colon = link.find(':')
|
|
|
|
|
if colon > 0 and link[:colon] not in acceptedNamespaces:
|
|
|
|
|
return ''
|
2015-04-15 00:09:46 +08:00
|
|
|
|
if colon == 0:
|
|
|
|
|
# drop also :File:
|
|
|
|
|
colon2 = link.find(':', colon+1)
|
|
|
|
|
if colon2 > 1 and link[colon+1:colon2] not in acceptedNamespaces:
|
|
|
|
|
return ''
|
2015-04-16 00:20:32 +08:00
|
|
|
|
anchor = parts[-1] if len(parts) > 1 else link
|
2015-03-22 20:45:17 +08:00
|
|
|
|
anchor += trail
|
|
|
|
|
if keepLinks:
|
2015-04-15 20:30:55 +08:00
|
|
|
|
return '<a href="%s">%s</a>' % (urllib.quote(link.encode('utf-8')), anchor)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
else:
|
|
|
|
|
return anchor
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
2015-04-15 06:09:51 +08:00
|
|
|
|
# match tail after wikilink
|
|
|
|
|
tailRE = re.compile('\w*')
|
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
expand_templates = True
|
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
def clean(text):
|
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
if (expand_templates):
|
|
|
|
|
# expand templates
|
|
|
|
|
# See: http://www.mediawiki.org/wiki/Help:Templates
|
|
|
|
|
text = expandTemplates(text)
|
2015-04-15 00:09:46 +08:00
|
|
|
|
else:
|
|
|
|
|
# Drop transclusions (template, parser functions)
|
|
|
|
|
text = dropNested(text, r'{{', r'}}')
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# Drop tables
|
|
|
|
|
text = dropNested(text, r'{\|', r'\|}')
|
|
|
|
|
|
|
|
|
|
# Expand links
|
2015-04-15 00:09:46 +08:00
|
|
|
|
res = ''
|
|
|
|
|
cur = 0
|
2015-04-15 06:09:51 +08:00
|
|
|
|
# This is too slow.
|
|
|
|
|
# for m in wikiLink.finditer(text):
|
|
|
|
|
# res += text[cur:m.start()] + make_anchor_tag(m)
|
|
|
|
|
# cur = m.end()
|
|
|
|
|
# text = res + text[cur:]
|
2015-04-15 09:43:02 +08:00
|
|
|
|
# Matches also: [[Help:IPA for Spanish|[a'ðoβe]]]
|
|
|
|
|
for s,e in findBalanced(text, ['[[', '['], [']]', ']']):
|
2015-04-15 06:09:51 +08:00
|
|
|
|
m = tailRE.match(text, e)
|
|
|
|
|
if m:
|
|
|
|
|
trail = m.group(0)
|
|
|
|
|
e = m.end()
|
|
|
|
|
else:
|
|
|
|
|
trail = ''
|
|
|
|
|
res += text[cur:s] + make_anchor_tag(text[s:e], trail)
|
|
|
|
|
cur = e
|
2015-04-15 00:09:46 +08:00
|
|
|
|
text = res + text[cur:]
|
2015-04-15 06:09:51 +08:00
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# Drop all remaining ones
|
|
|
|
|
text = parametrizedLink.sub('', text)
|
|
|
|
|
|
|
|
|
|
# Handle external links
|
|
|
|
|
text = externalLink.sub(r'\1', text)
|
|
|
|
|
text = externalLinkNoAnchor.sub('', text)
|
|
|
|
|
|
|
|
|
|
# Handle bold/italic/quote
|
|
|
|
|
text = bold_italic.sub(r'\1', text)
|
|
|
|
|
text = bold.sub(r'\1', text)
|
|
|
|
|
text = italic_quote.sub(r'"\1"', text)
|
|
|
|
|
text = italic.sub(r'"\1"', text)
|
|
|
|
|
text = quote_quote.sub(r'\1', text)
|
|
|
|
|
# residuals of unbalanced quotes
|
|
|
|
|
text = text.replace("'''", '').replace("''", '"')
|
|
|
|
|
|
|
|
|
|
################ Process HTML ###############
|
|
|
|
|
|
|
|
|
|
# turn into HTML
|
|
|
|
|
text = unescape(text)
|
|
|
|
|
# do it again (&nbsp;)
|
|
|
|
|
text = unescape(text)
|
|
|
|
|
|
|
|
|
|
# Collect spans
|
|
|
|
|
|
|
|
|
|
spans = []
|
|
|
|
|
# Drop HTML comments
|
|
|
|
|
for m in comment.finditer(text):
|
|
|
|
|
spans.append((m.start(), m.end()))
|
|
|
|
|
|
|
|
|
|
# Drop self-closing tags
|
|
|
|
|
for pattern in selfClosing_tag_patterns:
|
|
|
|
|
for m in pattern.finditer(text):
|
|
|
|
|
spans.append((m.start(), m.end()))
|
|
|
|
|
|
|
|
|
|
# Drop ignored tags
|
|
|
|
|
for left, right in ignored_tag_patterns:
|
|
|
|
|
for m in left.finditer(text):
|
|
|
|
|
spans.append((m.start(), m.end()))
|
|
|
|
|
for m in right.finditer(text):
|
|
|
|
|
spans.append((m.start(), m.end()))
|
|
|
|
|
|
|
|
|
|
# Bulk remove all spans
|
|
|
|
|
text = dropSpans(spans, text)
|
|
|
|
|
|
|
|
|
|
# Drop discarded elements
|
2015-04-15 00:09:46 +08:00
|
|
|
|
spans = []
|
2015-03-22 20:45:17 +08:00
|
|
|
|
for pattern in discard_element_patterns:
|
|
|
|
|
for m in pattern.finditer(text):
|
2015-04-15 00:09:46 +08:00
|
|
|
|
spans.append((m.start(),m.end()))
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# bulk removal
|
|
|
|
|
text = dropSpans(spans, text)
|
|
|
|
|
|
|
|
|
|
# Expand placeholders
|
|
|
|
|
for pattern, placeholder in placeholder_tag_patterns:
|
|
|
|
|
index = 1
|
|
|
|
|
for match in pattern.finditer(text):
|
|
|
|
|
text = text.replace(match.group(), '%s_%d' % (placeholder, index))
|
|
|
|
|
index += 1
|
|
|
|
|
|
|
|
|
|
text = text.replace('<<', u'«').replace('>>', u'»')
|
|
|
|
|
|
|
|
|
|
#############################################
|
|
|
|
|
|
|
|
|
|
# Cleanup text
|
|
|
|
|
text = text.replace('\t', ' ')
|
|
|
|
|
text = spaces.sub(' ', text)
|
|
|
|
|
text = dots.sub('...', text)
|
|
|
|
|
text = re.sub(u' (,:\.\)\]»)', r'\1', text)
|
|
|
|
|
text = re.sub(u'(\[\(«) ', r'\1', text)
|
|
|
|
|
text = re.sub(r'\n\W+?\n', '\n', text) # lines with only punctuations
|
|
|
|
|
text = text.replace(',,', ',').replace(',.', '.')
|
|
|
|
|
return text
|
|
|
|
|
|
|
|
|
|
section = re.compile(r'(==+)\s*(.*?)\s*\1')
|
|
|
|
|
|
|
|
|
|
def compact(text):
|
|
|
|
|
"""Deal with headers, lists, empty sections, residuals of tables"""
|
|
|
|
|
page = [] # list of paragraph
|
|
|
|
|
headers = {} # Headers for unfilled sections
|
|
|
|
|
emptySection = False # empty sections are discarded
|
|
|
|
|
inList = False # whether opened <UL>
|
|
|
|
|
|
|
|
|
|
for line in text.split('\n'):
|
|
|
|
|
|
|
|
|
|
if not line:
|
|
|
|
|
continue
|
|
|
|
|
# Handle section titles
|
|
|
|
|
m = section.match(line)
|
|
|
|
|
if m:
|
|
|
|
|
title = m.group(2)
|
|
|
|
|
lev = len(m.group(1))
|
|
|
|
|
if keepSections:
|
|
|
|
|
page.append("<h%d>%s</h%d>" % (lev, title, lev))
|
|
|
|
|
if title and title[-1] not in '!?':
|
|
|
|
|
title += '.'
|
|
|
|
|
headers[lev] = title
|
|
|
|
|
# drop previous headers
|
|
|
|
|
for i in headers.keys():
|
|
|
|
|
if i > lev:
|
|
|
|
|
del headers[i]
|
|
|
|
|
emptySection = True
|
|
|
|
|
continue
|
|
|
|
|
# Handle page title
|
|
|
|
|
if line.startswith('++'):
|
|
|
|
|
title = line[2:-2]
|
|
|
|
|
if title:
|
|
|
|
|
if title[-1] not in '!?':
|
|
|
|
|
title += '.'
|
|
|
|
|
page.append(title)
|
|
|
|
|
# handle indents
|
|
|
|
|
elif line[0] == ':':
|
|
|
|
|
page.append(line[1:])
|
|
|
|
|
# handle lists
|
|
|
|
|
elif line[0] in '*#;':
|
|
|
|
|
if keepSections:
|
|
|
|
|
page.append("<li>%s</li>" % line[1:])
|
|
|
|
|
else:
|
|
|
|
|
continue
|
2015-04-15 09:43:02 +08:00
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# Drop residuals of lists
|
|
|
|
|
elif line[0] in '{|' or line[-1] in '}':
|
|
|
|
|
continue
|
|
|
|
|
# Drop irrelevant lines
|
|
|
|
|
elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
|
|
|
|
|
continue
|
|
|
|
|
elif len(headers):
|
|
|
|
|
items = headers.items()
|
|
|
|
|
items.sort()
|
|
|
|
|
for (i, v) in items:
|
|
|
|
|
page.append(v)
|
|
|
|
|
headers.clear()
|
|
|
|
|
page.append(line) # first line
|
|
|
|
|
emptySection = False
|
|
|
|
|
elif not emptySection:
|
|
|
|
|
page.append(line)
|
2015-04-15 20:30:55 +08:00
|
|
|
|
# dangerous
|
|
|
|
|
# # Drop preformatted
|
|
|
|
|
# elif line[0] == ' ':
|
|
|
|
|
# continue
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
return page
|
|
|
|
|
|
|
|
|
|
def handle_unicode(entity):
|
|
|
|
|
numeric_code = int(entity[2:-1])
|
|
|
|
|
if numeric_code >= 0x10000: return ''
|
|
|
|
|
return unichr(numeric_code)
|
|
|
|
|
|
|
|
|
|
#------------------------------------------------------------------------------
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# Output
|
|
|
|
|
|
|
|
|
|
class NextFile(object):
|
|
|
|
|
"""
|
|
|
|
|
Synchronous generation of next available file name.
|
|
|
|
|
"""
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
filesPerDir = 100
|
|
|
|
|
|
|
|
|
|
def __init__(self, lock, path_name):
|
|
|
|
|
self.lock = lock
|
|
|
|
|
self.path_name = path_name
|
|
|
|
|
self.dir_index = -1
|
2015-03-22 20:45:17 +08:00
|
|
|
|
self.file_index = -1
|
2015-04-12 16:21:35 +08:00
|
|
|
|
|
|
|
|
|
def next(self):
|
|
|
|
|
with self.lock:
|
|
|
|
|
self.file_index = (self.file_index + 1) % NextFile.filesPerDir
|
|
|
|
|
if self.file_index == 0:
|
|
|
|
|
self.dir_index += 1
|
|
|
|
|
dirname = self._dirname()
|
|
|
|
|
if not os.path.isdir(dirname):
|
|
|
|
|
os.makedirs(dirname)
|
|
|
|
|
return self._filepath()
|
|
|
|
|
|
|
|
|
|
def _dirname(self):
|
|
|
|
|
char1 = self.dir_index % 26
|
|
|
|
|
char2 = self.dir_index / 26 % 26
|
|
|
|
|
return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))
|
|
|
|
|
|
|
|
|
|
def _filepath(self):
|
|
|
|
|
return '%s/wiki_%02d' % (self._dirname(), self.file_index)
|
|
|
|
|
|
|
|
|
|
class OutputSplitter(object):
|
|
|
|
|
"""
|
|
|
|
|
File-like object, that splits output to multiple files of a given max size.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, nextFile, max_file_size=0, compress=True):
|
|
|
|
|
"""
|
|
|
|
|
:param nextfile: a NextFile object from which to obtain filenames
|
|
|
|
|
to use.
|
|
|
|
|
:param max_file_size: the maximum size of each file.
|
|
|
|
|
:para compress: whether to write data with bzip compression.
|
|
|
|
|
"""
|
|
|
|
|
self.nextFile = nextFile
|
2015-03-22 20:45:17 +08:00
|
|
|
|
self.compress = compress
|
|
|
|
|
self.max_file_size = max_file_size
|
2015-04-12 16:21:35 +08:00
|
|
|
|
self.file = self.open(self.nextFile.next())
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
def reserve(self, size):
|
2015-04-12 16:21:35 +08:00
|
|
|
|
if self.file.tell() + size > self.max_file_size:
|
|
|
|
|
self.close()
|
|
|
|
|
self.file = self.open(self.nextFile.next())
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
def write(self, data):
|
|
|
|
|
self.file.write(data)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
def close(self):
|
2015-04-12 16:21:35 +08:00
|
|
|
|
self.file.close()
|
|
|
|
|
|
|
|
|
|
def open(self, filename):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
if self.compress:
|
2015-04-12 16:21:35 +08:00
|
|
|
|
return bz2.BZ2File(filename + '.bz2', 'w')
|
2015-03-22 20:45:17 +08:00
|
|
|
|
else:
|
2015-04-12 16:21:35 +08:00
|
|
|
|
return open(filename, 'w')
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
# READER
|
|
|
|
|
|
|
|
|
|
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# 1 2 3 4
|
2015-03-22 20:45:17 +08:00
|
|
|
|
#tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>([^<]*)')
|
|
|
|
|
|
|
|
|
|
def load_templates(file, output_file=None):
|
|
|
|
|
"""
|
|
|
|
|
Load templates from :param file:.
|
|
|
|
|
:param output_file: file where to save templates.
|
|
|
|
|
"""
|
|
|
|
|
templatePrefix = templateNamespace + ':'
|
|
|
|
|
articles = 0
|
|
|
|
|
page = []
|
|
|
|
|
inText = False
|
|
|
|
|
if output_file:
|
|
|
|
|
output = codecs.open(output_file, 'wb', 'utf-8')
|
|
|
|
|
for line in file:
|
|
|
|
|
line = line.decode('utf-8')
|
|
|
|
|
if '<' not in line: # faster than doing re.search()
|
|
|
|
|
if inText:
|
|
|
|
|
page.append(line)
|
|
|
|
|
continue
|
|
|
|
|
m = tagRE.search(line)
|
|
|
|
|
if not m:
|
|
|
|
|
continue
|
|
|
|
|
tag = m.group(2)
|
|
|
|
|
if tag == 'page':
|
|
|
|
|
page = []
|
|
|
|
|
elif tag == 'title':
|
|
|
|
|
title = m.group(3)
|
|
|
|
|
elif tag == 'text':
|
|
|
|
|
inText = True
|
|
|
|
|
line = line[m.start(3):m.end(3)]
|
|
|
|
|
page.append(line)
|
|
|
|
|
if m.lastindex == 4: # open-close
|
|
|
|
|
inText = False
|
|
|
|
|
elif tag == '/text':
|
|
|
|
|
if m.group(1):
|
|
|
|
|
page.append(m.group(1))
|
|
|
|
|
inText = False
|
|
|
|
|
elif inText:
|
|
|
|
|
page.append(line)
|
|
|
|
|
elif tag == '/page':
|
|
|
|
|
if title.startswith(templatePrefix):
|
|
|
|
|
define_template(title, page)
|
|
|
|
|
if output_file:
|
|
|
|
|
output.write('<page>\n')
|
|
|
|
|
output.write(' <title>%s</title>\n' % title)
|
|
|
|
|
output.write(' <ns>10</ns>\n')
|
|
|
|
|
output.write(' <text>')
|
|
|
|
|
for line in page:
|
|
|
|
|
output.write(line)
|
|
|
|
|
output.write(' </text>\n')
|
|
|
|
|
output.write('</page>\n')
|
|
|
|
|
page = []
|
|
|
|
|
articles += 1
|
|
|
|
|
if articles % 10000 == 0:
|
|
|
|
|
logging.info("Preprocessed: %d pages" % articles)
|
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
def process_dump(input_file, template_file, outdir, file_size, file_compress, threads):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
|
|
|
|
:param input_file: name of the wikipedia dump file.
|
|
|
|
|
:param template_file: optional file with template definitions.
|
2015-04-12 16:21:35 +08:00
|
|
|
|
:param outdir: name of the directory where to store extracted files.
|
|
|
|
|
:param file_size: max size of each extracted file.
|
|
|
|
|
:param file_compress: whether to compress files with bzip.
|
2015-03-22 20:45:17 +08:00
|
|
|
|
"""
|
2015-04-12 16:21:35 +08:00
|
|
|
|
global urlbase
|
2015-03-22 20:45:17 +08:00
|
|
|
|
global knownNamespaces
|
|
|
|
|
global templateNamespace
|
2015-04-11 18:29:31 +08:00
|
|
|
|
global expand_templates
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
if input_file.lower().endswith("bz2"):
|
|
|
|
|
opener = bz2.BZ2File
|
|
|
|
|
else:
|
|
|
|
|
opener = open
|
|
|
|
|
|
|
|
|
|
input = opener(input_file)
|
|
|
|
|
|
|
|
|
|
# collect siteinfo
|
|
|
|
|
for line in input:
|
|
|
|
|
line = line.decode('utf-8')
|
|
|
|
|
m = tagRE.search(line)
|
|
|
|
|
if not m:
|
|
|
|
|
continue
|
|
|
|
|
tag = m.group(2)
|
|
|
|
|
if tag == 'base':
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# discover urlbase from the xml dump file
|
2015-03-22 20:45:17 +08:00
|
|
|
|
# /mediawiki/siteinfo/base
|
|
|
|
|
base = m.group(3)
|
2015-04-12 16:21:35 +08:00
|
|
|
|
urlbase = base[:base.rfind("/")]
|
2015-03-22 20:45:17 +08:00
|
|
|
|
elif tag == 'namespace':
|
|
|
|
|
knownNamespaces.add(m.group(3))
|
|
|
|
|
if re.search('key="10"', line):
|
|
|
|
|
templateNamespace = m.group(3)
|
|
|
|
|
elif tag == '/siteinfo':
|
|
|
|
|
break
|
|
|
|
|
|
2015-04-11 18:29:31 +08:00
|
|
|
|
if expand_templates:
|
|
|
|
|
# preprocess
|
|
|
|
|
logging.info("Preprocessing dump to collect template definitions: this may take some time.")
|
|
|
|
|
if template_file and os.path.exists(template_file):
|
|
|
|
|
input.close()
|
|
|
|
|
with open(template_file) as file:
|
|
|
|
|
load_templates(file)
|
|
|
|
|
else:
|
|
|
|
|
load_templates(input, template_file)
|
|
|
|
|
input.close()
|
|
|
|
|
input = opener(input_file)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# process pages
|
|
|
|
|
logging.info("Starting processing pages from %s." % input_file)
|
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# initialize jobs queue
|
|
|
|
|
#threads = multiprocessing.cpu_count()
|
|
|
|
|
logging.info("Using %d CPUs." % threads)
|
|
|
|
|
queue = Queue.Queue(maxsize=2 * threads)
|
|
|
|
|
lock = threading.Lock() # for protecting shared state.
|
|
|
|
|
|
|
|
|
|
nextFile = NextFile(lock, outdir)
|
|
|
|
|
|
2015-04-15 20:30:55 +08:00
|
|
|
|
# start worker threads
|
2015-04-12 16:21:35 +08:00
|
|
|
|
workers = []
|
|
|
|
|
for _ in xrange(max(1, threads - 1)): # keep one for master
|
|
|
|
|
output_splitter = OutputSplitter(nextFile, file_size, file_compress)
|
|
|
|
|
extractor = ExtractorThread(queue, output_splitter)
|
|
|
|
|
workers.append(extractor)
|
|
|
|
|
|
2015-04-15 06:09:51 +08:00
|
|
|
|
# we collect indivual lines, since str.join() is significantly faster than
|
|
|
|
|
# concatenation
|
2015-03-22 20:45:17 +08:00
|
|
|
|
page = []
|
|
|
|
|
id = None
|
|
|
|
|
inText = False
|
|
|
|
|
redirect = False
|
|
|
|
|
for line in input:
|
|
|
|
|
line = line.decode('utf-8')
|
|
|
|
|
if '<' not in line: # faster than doing re.search()
|
|
|
|
|
if inText:
|
|
|
|
|
page.append(line)
|
|
|
|
|
continue
|
|
|
|
|
m = tagRE.search(line)
|
|
|
|
|
if not m:
|
|
|
|
|
continue
|
|
|
|
|
tag = m.group(2)
|
|
|
|
|
if tag == 'page':
|
|
|
|
|
page = []
|
|
|
|
|
redirect = False
|
|
|
|
|
elif tag == 'id' and not id:
|
|
|
|
|
id = m.group(3)
|
|
|
|
|
elif tag == 'title':
|
|
|
|
|
title = m.group(3)
|
|
|
|
|
elif tag == 'redirect':
|
|
|
|
|
redirect = True
|
|
|
|
|
elif tag == 'text':
|
|
|
|
|
inText = True
|
|
|
|
|
line = line[m.start(3):m.end(3)]
|
|
|
|
|
page.append(line)
|
|
|
|
|
if m.lastindex == 4: # open-close
|
|
|
|
|
inText = False
|
|
|
|
|
elif tag == '/text':
|
|
|
|
|
if m.group(1):
|
|
|
|
|
page.append(m.group(1))
|
|
|
|
|
inText = False
|
|
|
|
|
elif inText:
|
|
|
|
|
page.append(line)
|
|
|
|
|
elif tag == '/page':
|
|
|
|
|
colon = title.find(':')
|
|
|
|
|
if (colon < 0 or title[:colon] in acceptedNamespaces) and \
|
|
|
|
|
not redirect and not title.startswith(templateNamespace):
|
2015-04-12 16:21:35 +08:00
|
|
|
|
queue.put(Extractor(id, title, page), True) # block if full
|
2015-03-22 20:45:17 +08:00
|
|
|
|
id = None
|
|
|
|
|
page = []
|
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# wait for empty queue
|
2015-04-15 20:30:55 +08:00
|
|
|
|
queue.join()
|
2015-04-12 16:21:35 +08:00
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
input.close()
|
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
#======================================================================
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
class Extractor(object):
|
|
|
|
|
"""
|
|
|
|
|
An extraction task on a article.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, id, title, page):
|
|
|
|
|
"""
|
|
|
|
|
:param page: a list of lines.
|
|
|
|
|
"""
|
|
|
|
|
self.id = id
|
|
|
|
|
self.title = title
|
|
|
|
|
self.page = page
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
def extract(self, out=sys.stdout):
|
2015-04-15 06:09:51 +08:00
|
|
|
|
logging.info("%s\t%s" % (self.id, self.title))
|
2015-04-15 00:09:46 +08:00
|
|
|
|
text = ''.join(self.page)
|
|
|
|
|
url = get_url(self.id)
|
|
|
|
|
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
|
2015-04-12 16:21:35 +08:00
|
|
|
|
# Separate header from text with a newline.
|
|
|
|
|
header += self.title + '\n\n'
|
|
|
|
|
header = header.encode('utf-8')
|
|
|
|
|
text = clean(text)
|
|
|
|
|
footer = "\n</doc>\n"
|
2015-04-15 00:09:46 +08:00
|
|
|
|
if out != sys.stdout:
|
|
|
|
|
out.reserve(len(header) + len(text) + len(footer))
|
2015-04-12 16:21:35 +08:00
|
|
|
|
out.write(header)
|
|
|
|
|
for line in compact(text):
|
|
|
|
|
out.write(line.encode('utf-8'))
|
|
|
|
|
out.write('\n')
|
|
|
|
|
out.write(footer)
|
|
|
|
|
|
2015-04-15 00:09:46 +08:00
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
|
# Multithread version
|
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
class ExtractorThread(threading.Thread):
|
2015-04-12 16:21:35 +08:00
|
|
|
|
"""
|
|
|
|
|
Extractor thread.
|
|
|
|
|
"""
|
|
|
|
|
def __init__(self, queue, splitter):
|
2015-03-22 20:45:17 +08:00
|
|
|
|
self._queue = queue
|
2015-04-12 16:21:35 +08:00
|
|
|
|
self._splitter = splitter
|
|
|
|
|
threading.Thread.__init__(self)
|
|
|
|
|
self.setDaemon(True) # let the process die when main thread is killed
|
|
|
|
|
self.start()
|
2015-04-15 20:30:55 +08:00
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
def run(self):
|
|
|
|
|
while True:
|
2015-04-12 16:21:35 +08:00
|
|
|
|
job = self._queue.get()
|
|
|
|
|
if job:
|
|
|
|
|
job.extract(self._splitter)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
self._queue.task_done()
|
2015-04-12 16:21:35 +08:00
|
|
|
|
else:
|
|
|
|
|
break
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
# Minimum size of output files
|
|
|
|
|
minFileSize = 200 * 1024
|
|
|
|
|
|
|
|
|
|
def main():
|
2015-04-12 16:21:35 +08:00
|
|
|
|
global keepLinks, keepSections, urlbase, acceptedNamespaces
|
2015-04-11 18:29:31 +08:00
|
|
|
|
global expand_templates
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
|
|
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
|
|
|
description=__doc__)
|
|
|
|
|
parser.add_argument("input",
|
|
|
|
|
help="XML wiki dump file")
|
|
|
|
|
parser.add_argument("-o", "--output", default="text",
|
|
|
|
|
help="output directory")
|
|
|
|
|
parser.add_argument("-b", "--bytes", default="1M",
|
2015-04-15 20:30:55 +08:00
|
|
|
|
help="put specified bytes per output file (default is %(default)s)", metavar="n[KMG]")
|
2015-03-22 20:45:17 +08:00
|
|
|
|
parser.add_argument("-B", "--base",
|
|
|
|
|
help="base URL for the Wikipedia pages")
|
|
|
|
|
parser.add_argument("-c", "--compress", action="store_true",
|
|
|
|
|
help="compress output files using bzip")
|
|
|
|
|
parser.add_argument("-l", "--links", action="store_true",
|
|
|
|
|
help="preserve links")
|
|
|
|
|
parser.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
|
|
|
|
|
help="accepted namespaces")
|
|
|
|
|
parser.add_argument("-q", "--quiet", action="store_true",
|
|
|
|
|
help="suppress reporting progress info")
|
2015-04-09 21:24:34 +08:00
|
|
|
|
parser.add_argument("--debug", action="store_true",
|
|
|
|
|
help="print debug info")
|
2015-03-22 20:45:17 +08:00
|
|
|
|
parser.add_argument("-s", "--sections", action="store_true",
|
|
|
|
|
help="preserve sections")
|
|
|
|
|
parser.add_argument("-a", "--article", action="store_true",
|
|
|
|
|
help="analyze a file containing a single article (debug) option")
|
|
|
|
|
# parser.add_argument("-f", "--format", choices=(PLAIN, JSON), default=PLAIN,
|
|
|
|
|
# help="choose output format default is %(default)s")
|
|
|
|
|
parser.add_argument("--templates",
|
|
|
|
|
help="use or create file containing templates")
|
2015-04-11 18:29:31 +08:00
|
|
|
|
parser.add_argument("--no-templates", action="store_false",
|
|
|
|
|
help="Do not expand templates")
|
2015-04-15 00:09:46 +08:00
|
|
|
|
parser.add_argument("--threads", type=int, default=2,
|
|
|
|
|
help="Number of threads to use (default 2)")
|
2015-03-22 20:45:17 +08:00
|
|
|
|
parser.add_argument("-v", "--version", action="version",
|
|
|
|
|
version='%(prog)s ' + version,
|
|
|
|
|
help="print program version")
|
|
|
|
|
|
|
|
|
|
args = parser.parse_args()
|
2015-04-15 20:30:55 +08:00
|
|
|
|
|
2015-03-22 20:45:17 +08:00
|
|
|
|
keepLinks = args.links
|
|
|
|
|
keepSections = args.sections
|
2015-04-11 18:29:31 +08:00
|
|
|
|
expand_templates = args.no_templates
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
if args.base:
|
2015-04-12 16:21:35 +08:00
|
|
|
|
urlbase = args.base
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
try:
|
2015-04-15 20:30:55 +08:00
|
|
|
|
power = 'kmg'.find(args.bytes[-1].lower()) + 1
|
|
|
|
|
file_size = int(args.bytes[:-1]) * 1024 ** power
|
2015-03-22 20:45:17 +08:00
|
|
|
|
if file_size < minFileSize: raise ValueError()
|
|
|
|
|
except ValueError:
|
|
|
|
|
logging.error('Insufficient or invalid size: %s' % args.bytes)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
if args.namespaces:
|
|
|
|
|
acceptedNamespaces = set(args.ns.split(','))
|
|
|
|
|
|
2015-04-09 21:24:34 +08:00
|
|
|
|
logger = logging.getLogger()
|
2015-03-22 20:45:17 +08:00
|
|
|
|
if not args.quiet:
|
2015-04-09 21:24:34 +08:00
|
|
|
|
logger.setLevel(logging.INFO)
|
|
|
|
|
if args.debug:
|
|
|
|
|
logger.setLevel(logging.DEBUG)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
input_file = args.input
|
|
|
|
|
|
|
|
|
|
if not keepLinks:
|
|
|
|
|
ignoreTag('a')
|
|
|
|
|
|
|
|
|
|
if args.article:
|
|
|
|
|
if args.templates:
|
|
|
|
|
if os.path.exists(args.templates):
|
|
|
|
|
with open(args.templates) as file:
|
|
|
|
|
load_templates(file)
|
|
|
|
|
|
|
|
|
|
with open(input_file) as file:
|
|
|
|
|
page = file.read().decode('utf-8')
|
|
|
|
|
m = re.search(r'<id>(.*)</id>', page)
|
2015-04-16 00:20:32 +08:00
|
|
|
|
id = m.group(1) if m else 0
|
2015-03-22 20:45:17 +08:00
|
|
|
|
m = re.search(r'<title>(.*)</title>', page)
|
|
|
|
|
if m:
|
|
|
|
|
title = m.group(1)
|
2015-04-15 00:09:46 +08:00
|
|
|
|
Extractor(id, title, [page]).extract()
|
2015-03-22 20:45:17 +08:00
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
output_dir = args.output
|
|
|
|
|
if not os.path.isdir(output_dir):
|
|
|
|
|
try:
|
|
|
|
|
os.makedirs(output_dir)
|
|
|
|
|
except:
|
|
|
|
|
logging.error('Could not create: %s' % output_dir)
|
|
|
|
|
return
|
|
|
|
|
|
2015-04-12 16:21:35 +08:00
|
|
|
|
process_dump(input_file, args.templates, output_dir, file_size,
|
|
|
|
|
args.compress, args.threads)
|
2015-03-22 20:45:17 +08:00
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
main()
|