wikiextractor/extract.sh

30 lines
799 B
Bash
Raw Normal View History

2018-03-23 05:10:12 +08:00
#!/bin/bash
#
# NOTES
#
# - Must expand templates to avoid a large loss of content.
# - Text will not (redundantly) contain the title string.
# - Keep sections. Section title will be marked by "Section::::".
# - Keep lists. List bullets will be marked by "BULLET::::".
# - Keep tables. They're mostly garbage but can be removed later (remove "^!*").
# - Remove disambiguation pages. Right now there is no use for them.
# Usage: extract.sh INPUT PROCESSES TEMPLATES OUTPUT
#
#   INPUT      forwarded to WikiExtractor as its positional argument
#   PROCESSES  forwarded as the value of --processes
#   TEMPLATES  forwarded as the value of --templates
#   OUTPUT     forwarded as the value of --output
#
# The script's exit status is that of the extractor, or 2 when too few
# arguments are given.
if [[ $# -lt 4 ]]; then
  # Fail early with a clear message instead of handing empty option values
  # to the extractor.
  printf 'Usage: %s INPUT PROCESSES TEMPLATES OUTPUT\n' "${0##*/}" >&2
  exit 2
fi

INPUT=$1
PROCESSES=$2
TEMPLATES=$3
OUTPUT=$4

# `python -m` takes a dotted module name, not a file name, so the module is
# given without a ".py" suffix.
# Every expansion is quoted so values containing spaces or glob characters
# reach the extractor as single, unmodified arguments.
python -m wikiextractor.WikiExtractor "$INPUT" \
  --json \
  --processes "$PROCESSES" \
  --templates "$TEMPLATES" \
  --output "$OUTPUT" \
  --bytes 1M \
  --compress \
  --links \
  --sections \
  --lists \
  --keep_tables \
  --min_text_length 0 \
  --filter_disambig_pages