e835e8c004
--discard_elements - allows you to customise which elements are discarded --ignored_tags - allows you to customise which tags are ignored --keep_tables - allows the contents of the tables in the original article to be retained. This does not render HTML tables
22 lines
476 B
Python
22 lines
476 B
Python
from setuptools import setup
|
|
|
|
def readme():
    """Return the full text of ``README.md`` from the current directory.

    The file is decoded explicitly as UTF-8 so the result does not depend
    on the platform's default locale encoding.

    Raises:
        IOError/OSError: if ``README.md`` cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # io.open accepts ``encoding`` on both Python 2 and Python 3, unlike the
    # Python 2 builtin open(); imported locally to keep this block standalone.
    import io

    with io.open('README.md', encoding='utf-8') as f:
        return f.read()
|
|
|
|
# Package metadata for the wikiextractor distribution.
# NOTE(review): readme() is defined in this file but is not passed to setup()
# (e.g. as long_description) -- confirm whether that is intentional.
setup(
    name='wikiextractor',

    # Adjacent string literals are concatenated verbatim, so the separating
    # space must be written explicitly; without it the text reads
    # "Wikipediadatabase dump".
    description='A script that extracts and cleans text from a Wikipedia '
                'database dump',
    author='Giuseppe Attardi',
    author_email='attardi@di.unipi.it',
    version='2.69',

    url='https://github.com/attardi/wikiextractor',

    license="GPL 3.0",
    keywords=['text', 'nlp'],
    # Installs WikiExtractor.py as a standalone script.
    scripts=['WikiExtractor.py'],
)
|