extract language and revision from cirrus search

This simple push extracts the language and the page revision. These are then added as attributes to the XML <doc> header.
2019-03-25 14:28:43 +00:00 · 2019-03-25 14:28:43 +00:00 · 1e4236de42
commit 1e4236de42
parent 2a5e6aebc0
2 changed files with 8 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@ -85,3 +85,6 @@ ENV/

 # Spyder project settings
 .spyderproject
+
+# Editor files
+*.idea
--- a/cirrus-extract.py
+++ b/cirrus-extract.py
@ -28,7 +28,7 @@ Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
 number of files of similar size in a given directory.
 Each file will contain several documents in the format:

-	<doc id="" url="" title="">
+	<doc id="" url="" title="" language="" revision="">
        ...
        </doc>

@ -124,7 +124,7 @@ class Extractor(object):
        logging.debug("%s\t%s", self.id, self.title)
        text = ''.join(self.page)
        url = get_url(self.id)
-        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
+        header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (self.id, url, self.title, self.language, self.revision)
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        header = header.encode('utf-8')
@ -169,6 +169,8 @@ def process_dump(input_file, out_file, file_size, file_compress):
        content = json.loads(input.readline())
        type = index['index']['_type']
        id = index['index']['_id']
+        language = content['language']
+        revision = content['version']
        if type == 'page' and content['namespace'] == 0:
            title = content['title']
            text = content['text']
@ -176,7 +178,7 @@ def process_dump(input_file, out_file, file_size, file_compress):
            # ^ The Penguin Dictionary
            text = re.sub(r'  \^ .*', '', text)
            url = urlbase + 'wiki?curid=' + id
-            header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title)
+            header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (id, url, title, language, revision)
            page = header + title + '\n\n' + text + '\n</doc>\n'
            output.write(page.encode('utf-8'))