#!/bin/bash
#
# Runs WikiExtractor on INPUT with a fixed set of options.
#
# Usage: extract.sh INPUT PROCESSES TEMPLATES OUTPUT
#   INPUT      passed to WikiExtractor as its positional argument
#   PROCESSES  value given to --processes
#   TEMPLATES  value given to --templates
#   OUTPUT     value given to --output
#
# NOTES
#
# - Must expand templates to avoid a large loss of content.
# - Text will not (redundantly) contain the title string.
# - Keep sections. Section titles will be marked by "Section::::".
# - Keep lists. List bullets will be marked by "BULLET::::".
# - Keep tables. They're mostly garbage but can be removed later (remove "^!*").
# - Remove disambiguation pages. Right now there is no use for them.

set -euo pipefail

# Fail early with a usage message instead of letting WikiExtractor report a
# confusing argument error when a positional parameter is missing.
if (( $# < 4 )); then
  printf 'Usage: %s INPUT PROCESSES TEMPLATES OUTPUT\n' "${0##*/}" >&2
  exit 2
fi

INPUT=$1
PROCESSES=$2
TEMPLATES=$3
OUTPUT=$4

# `python -m` takes an importable module name, not a file name, so the module
# is given without a ".py" suffix. All expansions are quoted so that paths
# containing spaces or glob characters reach WikiExtractor as single arguments.
python -m wikiextractor.WikiExtractor "$INPUT" \
  --json \
  --processes "$PROCESSES" \
  --templates "$TEMPLATES" \
  --output "$OUTPUT" \
  --bytes 1M \
  --compress \
  --links \
  --sections \
  --lists \
  --keep_tables \
  --min_text_length 0 \
  --filter_disambig_pages