From 1e4236de4237d0a89d0ad7241505d73ee7e23517 Mon Sep 17 00:00:00 2001
From: Nathan Davies <ndavies@turnitin.com>
Date: Mon, 25 Mar 2019 14:28:43 +0000
Subject: [PATCH] extract language and revision from cirrus search

This simple patch extracts the language and the page revision. These are then added as attributes of the <doc> header in the XML output.
---
 .gitignore        | 3 +++
 cirrus-extract.py | 8 +++++---
 2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
index 6ff5b4f..180558a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,6 @@ ENV/
 
 # Spyder project settings
 .spyderproject
+
+# Editor files
+*.idea
\ No newline at end of file
diff --git a/cirrus-extract.py b/cirrus-extract.py
index 6589846..895970d 100755
--- a/cirrus-extract.py
+++ b/cirrus-extract.py
@@ -28,7 +28,7 @@ Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
 number of files of similar size in a given directory.
 Each file will contain several documents in the format:
 
-	<doc id="" url="" title="">
+	<doc id="" url="" title="" language="" revision="">
         ...
         </doc>
 
@@ -124,7 +124,7 @@ class Extractor(object):
         logging.debug("%s\t%s", self.id, self.title)
         text = ''.join(self.page)
         url = get_url(self.id)
-        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
+        header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (self.id, url, self.title, self.language, self.revision)
         # Separate header from text with a newline.
         header += self.title + '\n\n'
         header = header.encode('utf-8')
@@ -169,6 +169,8 @@ def process_dump(input_file, out_file, file_size, file_compress):
         content = json.loads(input.readline())
         type = index['index']['_type']
         id = index['index']['_id']
+        language = content['language']
+        revision = content['version']
         if type == 'page' and content['namespace'] == 0:
             title = content['title']
             text = content['text']
@@ -176,7 +178,7 @@ def process_dump(input_file, out_file, file_size, file_compress):
             # ^ The Penguin Dictionary
             text = re.sub(r'  \^ .*', '', text)
             url = urlbase + 'wiki?curid=' + id
-            header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title)
+            header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (id, url, title, language, revision)
             page = header + title + '\n\n' + text + '\n</doc>\n'
             output.write(page.encode('utf-8'))