extract language and revision from cirrus search

This simple commit extracts the language and the page revision. These are then added to the XML
This commit is contained in:
Nathan Davies 2019-03-25 14:28:43 +00:00
parent 2a5e6aebc0
commit 1e4236de42
2 changed files with 8 additions and 3 deletions

3
.gitignore vendored
View File

@ -85,3 +85,6 @@ ENV/
# Spyder project settings # Spyder project settings
.spyderproject .spyderproject
# Editor files
*.idea

View File

@ -28,7 +28,7 @@ Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
number of files of similar size in a given directory. number of files of similar size in a given directory.
Each file will contain several documents in the format: Each file will contain several documents in the format:
<doc id="" url="" title=""> <doc id="" url="" title="" language="" revision="">
... ...
</doc> </doc>
@ -124,7 +124,7 @@ class Extractor(object):
logging.debug("%s\t%s", self.id, self.title) logging.debug("%s\t%s", self.id, self.title)
text = ''.join(self.page) text = ''.join(self.page)
url = get_url(self.id) url = get_url(self.id)
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title) header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (self.id, url, self.title, self.language, self.revision)
# Separate header from text with a newline. # Separate header from text with a newline.
header += self.title + '\n\n' header += self.title + '\n\n'
header = header.encode('utf-8') header = header.encode('utf-8')
@ -169,6 +169,8 @@ def process_dump(input_file, out_file, file_size, file_compress):
content = json.loads(input.readline()) content = json.loads(input.readline())
type = index['index']['_type'] type = index['index']['_type']
id = index['index']['_id'] id = index['index']['_id']
language = content['language']
revision = content['version']
if type == 'page' and content['namespace'] == 0: if type == 'page' and content['namespace'] == 0:
title = content['title'] title = content['title']
text = content['text'] text = content['text']
@ -176,7 +178,7 @@ def process_dump(input_file, out_file, file_size, file_compress):
# ^ The Penguin Dictionary # ^ The Penguin Dictionary
text = re.sub(r' \^ .*', '', text) text = re.sub(r' \^ .*', '', text)
url = urlbase + 'wiki?curid=' + id url = urlbase + 'wiki?curid=' + id
header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title) header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (id, url, title, language, revision)
page = header + title + '\n\n' + text + '\n</doc>\n' page = header + title + '\n\n' + text + '\n</doc>\n'
output.write(page.encode('utf-8')) output.write(page.encode('utf-8'))