Extract language and revision from Cirrus search
This simple push extracts the language and the page revision. These are then added to the XML.
This commit is contained in:
parent
2a5e6aebc0
commit
1e4236de42
3
.gitignore
vendored
3
.gitignore
vendored
@ -85,3 +85,6 @@ ENV/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
|
||||
# Editor files
|
||||
*.idea
|
@ -28,7 +28,7 @@ Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
|
||||
number of files of similar size in a given directory.
|
||||
Each file will contain several documents in the format:
|
||||
|
||||
<doc id="" url="" title="">
|
||||
<doc id="" url="" title="" language="" revision="">
|
||||
...
|
||||
</doc>
|
||||
|
||||
@ -124,7 +124,7 @@ class Extractor(object):
|
||||
logging.debug("%s\t%s", self.id, self.title)
|
||||
text = ''.join(self.page)
|
||||
url = get_url(self.id)
|
||||
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
|
||||
header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (self.id, url, self.title, self.language, self.revision)
|
||||
# Separate header from text with a newline.
|
||||
header += self.title + '\n\n'
|
||||
header = header.encode('utf-8')
|
||||
@ -169,6 +169,8 @@ def process_dump(input_file, out_file, file_size, file_compress):
|
||||
content = json.loads(input.readline())
|
||||
type = index['index']['_type']
|
||||
id = index['index']['_id']
|
||||
language = content['language']
|
||||
revision = content['version']
|
||||
if type == 'page' and content['namespace'] == 0:
|
||||
title = content['title']
|
||||
text = content['text']
|
||||
@ -176,7 +178,7 @@ def process_dump(input_file, out_file, file_size, file_compress):
|
||||
# ^ The Penguin Dictionary
|
||||
text = re.sub(r' \^ .*', '', text)
|
||||
url = urlbase + 'wiki?curid=' + id
|
||||
header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title)
|
||||
header = '<doc id="%s" url="%s" title="%s" language="%s" revision="%s">\n' % (id, url, title, language, revision)
|
||||
page = header + title + '\n\n' + text + '\n</doc>\n'
|
||||
output.write(page.encode('utf-8'))
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user