Merge branch 'master' of https://github.com/zwChan/wikiextractor into zwChan-master

This commit is contained in:
attardi 2019-04-13 12:19:36 +02:00
commit 45c2212f64
3 changed files with 36717 additions and 13 deletions

README.md (new file, 135 lines)

@@ -0,0 +1,135 @@
# WikiExtractor
[WikiExtractor.py](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) is a Python script that extracts and cleans text from a [Wikipedia database dump](http://download.wikimedia.org/).
The tool is written in Python and requires Python 2.7 or Python 3.3+; it does not depend on any additional libraries.
For further information, see the [project Home Page](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor) or the [Wiki](https://github.com/attardi/wikiextractor/wiki).
# Wikipedia Cirrus Extractor
`cirrus-extractor.py` is a version of the script that performs extraction from a Wikipedia Cirrus dump.
Cirrus dumps contain text with templates already expanded.
Cirrus dumps are available at:
[cirrussearch](http://dumps.wikimedia.org/other/cirrussearch/).
# Details
WikiExtractor performs template expansion by preprocessing the whole dump and extracting template definitions.
In order to speed up processing:
- multiprocessing is used for dealing with articles in parallel
- a cache is kept of parsed templates (only useful for repeated extractions).
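The template cache follows the usual memoization pattern; the snippet below is a minimal sketch of that idea only (the `expand_template` function is a stand-in, not WikiExtractor's actual API):

```python
from functools import lru_cache

@lru_cache(maxsize=None)          # cache keyed by (template name, arguments)
def expand_template(name, args):
    # stand-in for the real template parsing/expansion work
    return "{{%s|%s}} expanded" % (name, "|".join(args))

expand_template("lang", ("en", "hello"))  # computed once
expand_template("lang", ("en", "hello"))  # served from the cache
```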
## Installation
The script may be invoked directly; alternatively, it can be installed by running:
(sudo) python setup.py install
## Usage
The script is invoked with a Wikipedia dump file as an argument.
The output is stored in several files of similar size in a given directory.
Each file will contain several documents in this [document format](http://medialab.di.unipi.it/wiki/Document_Format).
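For example, the following command (the dump file name is illustrative) extracts an English Wikipedia dump into JSON files of about 1 MB each under the directory `extracted`:

    python WikiExtractor.py -o extracted --json -b 1M enwiki-latest-pages-articles.xml.bz2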
usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html]
[-l] [-s] [--lists] [-ns ns1,ns2]
[--templates TEMPLATES] [--no-templates] [-r]
[--min_text_length MIN_TEXT_LENGTH]
[--filter_category path_of_categories_file]
[--filter_disambig_pages] [-it abbr,b,big]
[-de gallery,timeline,noinclude] [--keep_tables]
[--processes PROCESSES] [-q] [--debug] [-a] [-v]
[--log_file LOG_FILE]
input
Wikipedia Extractor:
Extracts and cleans text from a Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" revid="" url="" title="">
...
</doc>
If the program is invoked with the --json flag, then each file will
contain several documents formatted as JSON objects, one per line, with
the following structure:
{"id": "", "revid": "", "url":"", "title": "", "text": "..."}
Template expansion requires first preprocessing the whole dump and
collecting template definitions.
positional arguments:
input XML wiki dump file
optional arguments:
-h, --help show this help message and exit
--processes PROCESSES
Number of processes to use (default 1)
Output:
-o OUTPUT, --output OUTPUT
directory for extracted files (or '-' for dumping to
stdout)
-b n[KMG], --bytes n[KMG]
maximum bytes per output file (default 1M)
-c, --compress compress output files using bzip
--json write output in json format instead of the default one
Processing:
--html produce HTML output, subsumes --links
-l, --links preserve links
-s, --sections preserve sections
--lists preserve lists
-ns ns1,ns2, --namespaces ns1,ns2
accepted namespaces in links
--templates TEMPLATES
use or create file containing templates
--no-templates Do not expand templates
-r, --revision Include the document revision id (default=False)
--min_text_length MIN_TEXT_LENGTH
Minimum expanded text length required to write
document (default=0)
--filter_category path_of_categories_file
Include or exclude specific categories from the dataset. Specify the
categories in the file 'path_of_categories_file', one category per line.
Depending on the first character of a line:
1) '#': the line is a comment and is ignored;
2) '^': the category is added to the exclusion list;
3) anything else: the category is added to the inclusion list.
Priority:
1) if the exclusion list is not empty and any category of the page is in it, the page is excluded; else
2) if the inclusion list is not empty and no category of the page is in it, the page is excluded; else
3) the page is included.
--filter_disambig_pages
Remove pages from output that contain disambiguation
markup (default=False)
-it abbr,b,big, --ignored_tags abbr,b,big
comma separated list of tags that will be dropped,
keeping their content
-de gallery,timeline,noinclude, --discard_elements gallery,timeline,noinclude
comma separated list of elements that will be removed
from the article text
--keep_tables Preserve tables in the output article text
(default=False)
Special:
-q, --quiet suppress reporting progress info
--debug print debug info
-a, --article analyze a file containing a single article (debug
option)
-v, --version print program version
--log_file LOG_FILE   path of the file where the log is written
Saving templates to a file speeds up extraction on subsequent runs,
assuming the template definitions have not changed.
Option --no-templates significantly speeds up the extractor, avoiding the cost
of expanding [MediaWiki templates](https://www.mediawiki.org/wiki/Help:Templates).
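For instance, the first run can save the parsed template definitions to a file that later runs reuse (file names are illustrative):

    python WikiExtractor.py --templates templates.txt -o extracted enwiki-latest-pages-articles.xml.bz2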
For further information, visit [the documentation](http://attardi.github.io/wikiextractor).
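The JSON output described above is easy to consume from Python. The sketch below assumes the default output layout (numbered `wiki_NN` files inside lettered subdirectories of the output directory, here `extracted`) and uncompressed output:

```python
import json
from pathlib import Path

# Each --json output file holds one JSON object per line, with fields such as
# "id", "url", "title" and "text".
for path in sorted(Path("extracted").glob("*/wiki_*")):
    with open(path, encoding="utf-8") as f:
        for line in f:
            doc = json.loads(line)
            print(doc["id"], doc["title"], len(doc["text"]))
```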

WikiExtractor.py

@@ -189,6 +189,10 @@ options = SimpleNamespace(
# Elements to ignore/discard
ignored_tag_patterns = [],
filter_category_include = set(),
filter_category_exclude = set(),
log_file = None,
discardElements = [
'gallery', 'timeline', 'noinclude', 'pre',
@@ -209,15 +213,28 @@ templateKeys = set(['10', '828'])
filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}|__DISAMBIG__")
##
g_page_total = 0
g_page_articl_total = 0
g_page_articl_used_total = 0
# page filtering logic -- remove templates, undesired xml namespaces, and disambiguation pages
def keepPage(ns, page):
def keepPage(ns, catSet, page):
global g_page_articl_total, g_page_total, g_page_articl_used_total
g_page_total += 1
if ns != '0': # keep only pages in the main (article) namespace
return False
# remove disambig pages if desired
g_page_articl_total += 1
if options.filter_disambig_pages:
for line in page:
if filter_disambig_page_pattern.match(line):
return False
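# category-based filtering (see --filter_category): drop the page if any of its
# categories is in the exclude set, or if an include set was given and none of the
# page's categories is in it; exclusion therefore has priority over inclusion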
if len(options.filter_category_include) > 0 and len(options.filter_category_include & catSet) == 0:
logging.debug("Page has none of the included categories: %s", catSet)
return False
if len(options.filter_category_exclude) > 0 and len(options.filter_category_exclude & catSet) > 0:
logging.debug("Page has an excluded category: %s", catSet)
return False
g_page_articl_used_total += 1
return True
@@ -629,6 +646,8 @@ class Extractor(object):
text = self.transform(text)
text = self.wiki2text(text)
text = compact(self.clean(text))
# from zwChan: prepend the page title to the extracted text
text = [title_str] + text
if sum(len(line) for line in text) < options.min_text_length:
return
@@ -2717,7 +2736,8 @@ class OutputSplitter(object):
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*?>(?:([^<]*)(<.*?>)?)?')
# 1 2 3 4
keyRE = re.compile(r'key="([+-]?)(\d*)"')
keyRE = re.compile(r'key="(\d*)"')
catRE = re.compile(r'\[\[Category:([^\|]+).*\]\].*')  # capture the category name from links like [[Category:Category name|Sortkey]]
def load_templates(file, output_file=None):
"""
@@ -2730,7 +2750,7 @@ def load_templates(file, output_file=None):
if output_file:
output = codecs.open(output_file, 'wb', 'utf-8')
for page_count, page_data in enumerate(pages_from(file)):
id, revid, title, ns, page = page_data
id, revid, title, ns, catSet, page = page_data
if not output_file and (not options.templateNamespace or
not options.moduleNamespace): # do not know it yet
# reconstruct templateNamespace and moduleNamespace from the first title
@@ -2784,6 +2804,11 @@ def pages_from(input):
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
# extract categories
if line.lstrip().startswith('[[Category:'):
mCat = catRE.search(line)
if mCat:
catSet.add(mCat.group(1))
continue
m = tagRE.search(line)
if not m:
@@ -2791,6 +2816,7 @@
tag = m.group(2)
if tag == 'page':
page = []
catSet = set()
redirect = False
elif tag == 'id' and not id:
id = m.group(3)
@@ -2819,7 +2845,7 @@
page.append(line)
elif tag == '/page':
if id != last_id and not redirect:
yield (id, revid, title, ns, page)
yield (id, revid, title, ns, catSet, page)
last_id = id
ns = '0'
id = None
@@ -2939,8 +2965,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
# Mapper process
page_num = 0
for page_data in pages_from(input):
id, revid, title, ns, page = page_data
if keepPage(ns, page):
id, revid, title, ns, catSet, page = page_data
if keepPage(ns, catSet, page):
# slow down
delay = 0
if spool_length.value > max_spool_length:
@@ -2973,6 +2999,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
extract_rate = page_num / extract_duration
logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)",
process_count, page_num, extract_duration, extract_rate)
logging.info("total of page: %d, total of articl page: %d; total of used articl page: %d" % (g_page_total, g_page_articl_total,g_page_articl_used_total))
# ----------------------------------------------------------------------
@@ -2989,7 +3016,7 @@ def extract_process(opts, i, jobs_queue, output_queue):
global options
options = opts
createLogger(options.quiet, options.debug)
createLogger(options.quiet, options.debug, options.log_file)
out = StringIO() # memory buffer
@@ -3031,7 +3058,7 @@ def reduce_process(opts, output_queue, spool_length,
global options
options = opts
createLogger(options.quiet, options.debug)
createLogger(options.quiet, options.debug, options.log_file)
if out_file:
nextFile = NextFile(out_file)
@@ -3137,10 +3164,14 @@ def main():
help="print debug info")
groupS.add_argument("-a", "--article", action="store_true",
help="analyze a file containing a single article (debug option)")
groupS.add_argument("--log_file",
help="path to save the log info")
groupS.add_argument("-v", "--version", action="version",
version='%(prog)s ' + version,
help="print program version")
groupP.add_argument("--filter_category",
help="specify the file that listing the Categories you want to include or exclude. One line for"
" one category. starting with: 1) '#' comment, ignored; 2) '^' exclude; Note: excluding has higher priority than including")
args = parser.parse_args()
options.keepLinks = args.links
@@ -3192,8 +3223,8 @@ def main():
options.quiet = args.quiet
options.debug = args.debug
createLogger(options.quiet, options.debug)
options.log_file = args.log_file
createLogger(options.quiet, options.debug, options.log_file)
input_file = args.input
@@ -3212,7 +3243,7 @@ def main():
file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
for page_data in pages_from(file):
id, revid, title, ns, page = page_data
id, revid, title, ns, catSet, page = page_data
Extractor(id, revid, title, page).extract(sys.stdout)
file.close()
return
@@ -3225,15 +3256,41 @@ def main():
logging.error('Could not create: %s', output_path)
return
filter_category = args.filter_category
if filter_category:
with open(filter_category) as f:
error_cnt = 0
for line in f:
try:
line = line.strip()
if not line or line.startswith('#'):
continue
elif line.startswith('^'):
options.filter_category_exclude.add(line.lstrip('^'))
else:
options.filter_category_include.add(line)
except Exception as e:
error_cnt += 1
print(u"Category not in UTF-8, ignored. Error count %d:\t%s" % (error_cnt, e))
print(line)
logging.info("Excluding categories: %s", options.filter_category_exclude)
logging.info("Including %d categories", len(options.filter_category_include))
process_dump(input_file, args.templates, output_path, file_size,
args.compress, args.processes)
def createLogger(quiet, debug):
def createLogger(quiet, debug, log_file):
logger = logging.getLogger()
if not quiet:
logger.setLevel(logging.INFO)
if debug:
logger.setLevel(logging.DEBUG)
if log_file:
# also write log records to the given file
fileHandler = logging.FileHandler(log_file)
logger.addHandler(fileHandler)
if __name__ == '__main__':
main()

categories.filter (new file, 36512 lines)

File diff suppressed because it is too large.