From 1e4236de4237d0a89d0ad7241505d73ee7e23517 Mon Sep 17 00:00:00 2001 From: Nathan Davies Date: Mon, 25 Mar 2019 14:28:43 +0000 Subject: [PATCH] extract language and revision from cirrus search This simple push extracts the language and the page revision. These are then added to the XML --- .gitignore | 3 +++ cirrus-extract.py | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 6ff5b4f..180558a 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,6 @@ ENV/ # Spyder project settings .spyderproject + +# Editor files +*.idea \ No newline at end of file diff --git a/cirrus-extract.py b/cirrus-extract.py index 6589846..895970d 100755 --- a/cirrus-extract.py +++ b/cirrus-extract.py @@ -28,7 +28,7 @@ Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a number of files of similar size in a given directory. Each file will contain several documents in the format: - + ... @@ -124,7 +124,7 @@ class Extractor(object): logging.debug("%s\t%s", self.id, self.title) text = ''.join(self.page) url = get_url(self.id) - header = '\n' % (self.id, url, self.title) + header = '\n' % (self.id, url, self.title, self.language, self.revision) # Separate header from text with a newline. 
header += self.title + '\n\n' header = header.encode('utf-8') @@ -169,6 +169,8 @@ def process_dump(input_file, out_file, file_size, file_compress): content = json.loads(input.readline()) type = index['index']['_type'] id = index['index']['_id'] + language = content['language'] + revision = content['version'] if type == 'page' and content['namespace'] == 0: title = content['title'] text = content['text'] @@ -176,7 +178,7 @@ def process_dump(input_file, out_file, file_size, file_compress): # ^ The Penguin Dictionary text = re.sub(r' \^ .*', '', text) url = urlbase + 'wiki?curid=' + id - header = '\n' % (id, url, title) + header = '\n' % (id, url, title, language, revision) page = header + title + '\n\n' + text + '\n\n' output.write(page.encode('utf-8'))