bash scripts for extraction commands

This commit is contained in:
Karl 2018-03-22 09:54:34 -05:00
parent 4ba4e9f683
commit e689ef3233
2 changed files with 32 additions and 0 deletions

16
extract_json_expanded.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/bash
# Run WikiExtractor.py over the enwiki pages-articles dump and write the
# result under /scratch/wikipedia-base/extracted/json-expanded.
#
# Usage: extract_json_expanded.sh NUM_PROCESSES
#   NUM_PROCESSES  value forwarded to WikiExtractor.py's --processes option.
#
# templates.txt is given as a relative path, so it is resolved against the
# directory the script is invoked from.
# NOTE(review): the exact meaning of each WikiExtractor flag depends on the
# installed WikiExtractor version -- confirm against its --help output.

# Fail early with a clear message: without this check an absent argument
# leaves --processes without a value and it swallows the next option.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s NUM_PROCESSES\n' "${0##*/}" >&2
  exit 2
fi

WikiExtractor.py \
  /scratch/wikipedia-base/dumps/dump02212018/enwiki-latest-pages-articles.xml.bz2 \
  --processes "$1" \
  --output /scratch/wikipedia-base/extracted/json-expanded \
  --bytes 10M \
  --compress \
  --json \
  --links \
  --sections \
  --lists \
  --templates templates.txt \
  --min_text_length 0 \
  --filter_disambig_pages \
  --keep_tables

16
extract_json_not_expanded.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/bash
# Run WikiExtractor.py over the enwiki pages-articles dump and write the
# result under /scratch/wikipedia-base/extracted/json-not-expanded.
# This variant passes --no-templates instead of a templates file.
#
# Usage: extract_json_not_expanded.sh NUM_PROCESSES
#   NUM_PROCESSES  value forwarded to WikiExtractor.py's --processes option.
#
# NOTE(review): the exact meaning of each WikiExtractor flag depends on the
# installed WikiExtractor version -- confirm against its --help output.

# Fail early with a clear message: without this check an absent argument
# leaves --processes without a value and it swallows the next option.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s NUM_PROCESSES\n' "${0##*/}" >&2
  exit 2
fi

WikiExtractor.py \
  /scratch/wikipedia-base/dumps/dump02212018/enwiki-latest-pages-articles.xml.bz2 \
  --processes "$1" \
  --output /scratch/wikipedia-base/extracted/json-not-expanded \
  --bytes 10M \
  --compress \
  --json \
  --links \
  --sections \
  --lists \
  --no-templates \
  --min_text_length 0 \
  --filter_disambig_pages \
  --keep_tables