See ChangeLog

2015-08-30 21:17:26 +02:00 · 2015-08-30 21:17:26 +02:00 · d5b354597f
commit d5b354597f
parent c15d93c40a
2 changed files with 29 additions and 7 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+2015-08-30  Giuseppe Attardi  <attardi@di.unipi.it>
+
+	* WikiExtractor.py (main): check presence of title element in
+	single article.
+	(load_templates): reconstruct the template namespace from the
+	title of the first template in the saved templates.
+
 2015-06-02  Giuseppe Attardi  <attardi@di.unipi.it>

 	* WikiExtractor.py (Extractor.expandTemplate): extend frame before
--- a/WikiExtractor.py
+++ b/WikiExtractor.py
@ -60,7 +60,7 @@ import Queue, threading, multiprocessing
 #===========================================================================

 # Program version
-version = '2.34'
+version = '2.35'

 ### PARAMS ####################################################################

@ -71,7 +71,8 @@ knownNamespaces = set(['Template'])

 ##
 # The namespace used for template definitions
-templateNamespace = 'Template'
+# It is the name associated with namespace key=10 in the siteinfo header.
+templateNamespace = ''

 ##
 # Recognize only these namespaces
@ -324,7 +325,7 @@ class TemplateArg(object):

        # any parts in a tplarg after the first (the parameter default) are
        # ignored, and an equals sign in the first part is treated as plain text.
-        #logging.debug('TemplateArg %s', parameter)
+        logging.debug('TemplateArg %s', parameter)

        parts = splitParts(parameter)
        self.name = Template.parse(parts[0])
@ -450,7 +451,7 @@ class Extractor(object):
            logging.warn('Max template recursion exceeded!')
            return res

-        #logging.debug('<expandTemplates ' + str(len(self.frame)))
+        logging.debug('<expandTemplates ' + str(len(self.frame)))

        cur = 0
        # look for matching {{...}}
@ -756,7 +757,7 @@ def splitParts(paramsList):
        else:
            parameters = par

-    #logging.debug('splitParts %s %s\nparams: %s', sep, paramsList, str(parameters))
+    logging.debug('splitParts %s %s\nparams: %s', sep, paramsList, str(parameters))
    return parameters

 def findMatchingBraces(text, ldelim=0):
@ -1046,7 +1047,9 @@ magicWordsRE = re.compile('|'.join(MagicWords.switches))
 # parser functions utilities

 def ucfirst(string):
-    """:return: a string with its first character uppercase"""
+    """:return: a string with just its first character uppercase
+    We can't use title() since it converts all words.
+    """
    if string:
        if len(string) > 1:
            return string[0].upper() + string[1:]
@ -1070,6 +1073,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
    Determine the namespace of the page being included through the template
    mechanism
    """
+    global templatePrefix
    if templateTitle.startswith(':'):
        # Leading colon by itself implies main namespace, so strip this colon
        return ucfirst(templateTitle[1:])
@ -1093,7 +1097,7 @@ def fullyQualifiedTemplateTitle(templateTitle):
    # space]], but having in the system a redirect page with an empty title
    # causes numerous problems, so we'll live happier without it.
    if templateTitle:
-        return "Template:" + ucfirst(templateTitle)
+        return templatePrefix + ucfirst(templateTitle)
    else:
        logging.warn("Skipping page with empty title")
        return ''
@ -2149,6 +2153,7 @@ def load_templates(file, output_file=None):
    Load templates from :param file:.
    :param output_file: file where to save templates.
    """
+    global templateNamespace, templatePrefix
    templatePrefix = templateNamespace + ':'
    articles = 0
    page = []
@ -2182,6 +2187,12 @@ def load_templates(file, output_file=None):
        elif inText:
            page.append(line)
        elif tag == '/page':
+            if not output_file and not templateNamespace: # do not know it yet
+                # we reconstruct it from the first title
+                colon = title.find(':')
+                if colon > 1:
+                    templateNamespace = title[:colon]
+                    templatePrefix = title[:colon+1]
            if title.startswith(templatePrefix):
                define_template(title, page)
                if output_file:
@ -2234,6 +2245,7 @@ def process_dump(input_file, template_file, outdir, file_size, file_compress, th
            knownNamespaces.add(m.group(3))
            if re.search('key="10"', line):
                templateNamespace = m.group(3)
+                templatePrefix = templateNamespace + ':'
        elif tag == '/siteinfo':
            break

@ -2439,6 +2451,9 @@ def main():
            m = re.search(r'<title>(.*)</title>', page)
            if m:
                title = m.group(1)
+            else:
+                logging.error('Missing title element')
+                return
            Extractor(id, title, [page]).extract()
        return