bash scripts for extraction commands
This commit is contained in:
parent
4ba4e9f683
commit
e689ef3233
16
extract_json_expanded.sh
Executable file
16
extract_json_expanded.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
#
# Runs WikiExtractor.py over the enwiki pages-articles dump, passing a
# templates file (--templates templates.txt), and writes the result under
# /scratch/wikipedia-base/extracted/json-expanded.
#
# Usage: extract_json_expanded.sh <processes>
#   <processes>  value forwarded to WikiExtractor.py --processes
#
# Note: templates.txt is a relative path, so it is resolved against the
# directory the script is invoked from.
# See `WikiExtractor.py --help` for the meaning of the individual options.
set -euo pipefail

# Require the argument up front: an absent $1 previously expanded to nothing,
# leaving --processes without a value.
processes=${1:?usage: ${0##*/} <processes>}

WikiExtractor.py \
  /scratch/wikipedia-base/dumps/dump02212018/enwiki-latest-pages-articles.xml.bz2 \
  --processes "$processes" \
  --output /scratch/wikipedia-base/extracted/json-expanded \
  --bytes 10M \
  --compress \
  --json \
  --links \
  --sections \
  --lists \
  --templates templates.txt \
  --min_text_length 0 \
  --filter_disambig_pages \
  --keep_tables
|
16
extract_json_not_expanded.sh
Executable file
16
extract_json_not_expanded.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
#
# Runs WikiExtractor.py over the enwiki pages-articles dump with
# --no-templates, and writes the result under
# /scratch/wikipedia-base/extracted/json-not-expanded.
#
# Usage: extract_json_not_expanded.sh <processes>
#   <processes>  value forwarded to WikiExtractor.py --processes
#
# See `WikiExtractor.py --help` for the meaning of the individual options.
set -euo pipefail

# Require the argument up front: an absent $1 previously expanded to nothing,
# leaving --processes without a value.
processes=${1:?usage: ${0##*/} <processes>}

WikiExtractor.py \
  /scratch/wikipedia-base/dumps/dump02212018/enwiki-latest-pages-articles.xml.bz2 \
  --processes "$processes" \
  --output /scratch/wikipedia-base/extracted/json-not-expanded \
  --bytes 10M \
  --compress \
  --json \
  --links \
  --sections \
  --lists \
  --no-templates \
  --min_text_length 0 \
  --filter_disambig_pages \
  --keep_tables
|
Loading…
Reference in New Issue
Block a user