Added back option --json.

attardi 2020-12-17 13:17:34 +01:00
parent 95ddfaa451
commit 6490f5361d
3 changed files with 128 additions and 126 deletions

README.md (131 changed lines)

@@ -54,103 +54,68 @@ The option `--templates` extracts the templates to a local file, which can be re
 The output is stored in several files of similar size in a given directory.
 Each file will contains several documents in this [document format](https://github.com/attardi/wikiextractor/wiki/File-Format).
-usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html]
-                        [-l] [-s] [--lists] [-ns ns1,ns2]
-                        [--templates TEMPLATES] [--no-templates] [-r]
-                        [--min_text_length MIN_TEXT_LENGTH]
-                        [--filter_category path_of_categories_file]
-                        [--filter_disambig_pages] [-it abbr,b,big]
-                        [-de gallery,timeline,noinclude] [--keep_tables]
-                        [--processes PROCESSES] [-q] [--debug] [-a] [-v]
-                        [--log_file]
-                        input
+```
+usage: wikiextractor [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html] [-l] [-ns ns1,ns2]
+                     [--templates TEMPLATES] [--no-templates] [--html-safe HTML_SAFE] [--processes PROCESSES]
+                     [-q] [--debug] [-a] [-v]
+                     input
 Wikipedia Extractor:
 Extracts and cleans text from a Wikipedia database dump and stores output in a
 number of files of similar size in a given directory.
 Each file will contain several documents in the format:
-    <doc id="" revid="" url="" title="">
+    <doc id="" url="" title="">
     ...
     </doc>
 If the program is invoked with the --json flag, then each file will
 contain several documents formatted as json ojects, one per line, with
 the following structure
-    {"id": "", "revid": "", "url":"", "title": "", "text": "..."}
+    {"id": "", "revid": "", "url": "", "title": "", "text": "..."}
-Template expansion requires preprocessing first the whole dump and
+The program performs template expansion by preprocesssng the whole dump and
 collecting template definitions.
 positional arguments:
   input                 XML wiki dump file
 optional arguments:
   -h, --help            show this help message and exit
   --processes PROCESSES
-                        Number of processes to use (default 1)
+                        Number of processes to use (default 79)
 Output:
   -o OUTPUT, --output OUTPUT
-                        directory for extracted files (or '-' for dumping to
-                        stdout)
+                        directory for extracted files (or '-' for dumping to stdout)
   -b n[KMG], --bytes n[KMG]
                         maximum bytes per output file (default 1M)
   -c, --compress        compress output files using bzip
-  --json                write output in json format instead of the default one
+  --json                write output in json format instead of the default <doc> format
 Processing:
   --html                produce HTML output, subsumes --links
   -l, --links           preserve links
-  -s, --sections        preserve sections
-  --lists               preserve lists
-  -ns ns1,ns2, --namespaces ns1,ns2
-                        accepted namespaces in links
-  --templates TEMPLATES
-                        use or create file containing templates
-  --no-templates        Do not expand templates
-  -r, --revision        Include the document revision id (default=False)
-  --min_text_length MIN_TEXT_LENGTH
-                        Minimum expanded text length required to write
-                        document (default=0)
-  --filter_category path_of_categories_file
-                        Include or exclude specific categories from the dataset. Specify the categories in
-                        file 'path_of_categories_file'. Format:
-                        One category one line, and if the line starts with:
-                        1) #: Comments, ignored;
-                        2) ^: the categories will be in excluding-categories
-                        3) others: the categories will be in including-categories.
-                        Priority:
-                        1) If excluding-categories is not empty, and any category of a page exists in excluding-categories, the page will be excluded; else
-                        2) If including-categories is not empty, and no category of a page exists in including-categories, the page will be excluded; else
-                        3) the page will be included
-  --filter_disambig_pages
-                        Remove pages from output that contain disabmiguation
-                        markup (default=False)
-  -it abbr,b,big, --ignored_tags abbr,b,big
-                        comma separated list of tags that will be dropped,
-                        keeping their content
-  -de gallery,timeline,noinclude, --discard_elements gallery,timeline,noinclude
-                        comma separated list of elements that will be removed
-                        from the article text
-  --keep_tables         Preserve tables in the output article text
-                        (default=False)
-Special:
-  -q, --quiet           suppress reporting progress info
-  --debug               print debug info
-  -a, --article         analyze a file containing a single article (debug
-                        option)
-  -v, --version         print program version
-  --log_file            specify a file to save the log information.
+  -ns ns1,ns2, --namespaces ns1,ns2
+                        accepted namespaces
+  --templates TEMPLATES
+                        use or create file containing templates
+  --no-templates        Do not expand templates
+  --html-safe HTML_SAFE
+                        use to produce HTML safe output within <doc>...</doc>
+Special:
+  -q, --quiet           suppress reporting progress info
+  --debug               print debug info
+  -a, --article         analyze a file containing a single article (debug option)
+  -v, --version         print program version
+```
 Saving templates to a file will speed up performing extraction the next time,
 assuming template definitions have not changed.
-Option --no-templates significantly speeds up the extractor, avoiding the cost
+Option `--no-templates` significantly speeds up the extractor, avoiding the cost
 of expanding [MediaWiki templates](https://www.mediawiki.org/wiki/Help:Templates).
 For further information, visit [the documentation](http://attardi.github.io/wikiextractor).
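The README above describes the `--json` output as one JSON object per line with `id`, `revid`, `url`, `title` and `text` fields. A minimal sketch of reading that output back, assuming an extraction was run along the lines of `wikiextractor --json -o extracted dump.xml.bz2` and produced the usual `AA/wiki_00`-style files (the output path and directory layout here are assumptions, not taken from this diff):

```python
import bz2
import glob
import json
import os


def iter_documents(output_dir):
    """Yield one dict per extracted document from a --json run."""
    for path in sorted(glob.glob(os.path.join(output_dir, "*", "wiki_*"))):
        # Files end in .bz2 when -c/--compress was also given.
        opener = bz2.open if path.endswith(".bz2") else open
        with opener(path, mode="rt", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)


for doc in iter_documents("extracted"):
    print(doc["id"], doc["revid"], doc["title"], len(doc["text"]))
```

The field names match the structure shown above; the file names and compression depend on the flags used for the run.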

View File

@@ -43,7 +43,13 @@ Each file will contain several documents in the format:
     ...
 </doc>
-This version performs template expansion by preprocesssng the whole dump and
+If the program is invoked with the --json flag, then each file will
+contain several documents formatted as json ojects, one per line, with
+the following structure
+    {"id": "", "revid": "", "url": "", "title": "", "text": "..."}
+The program performs template expansion by preprocesssng the whole dump and
 collecting template definitions.
 """
@@ -258,6 +264,7 @@ def load_templates(file, output_file=None):
 def decode_open(filename, mode='rt', encoding='utf-8'):
     """
     Open a file, decode and decompress, depending on extension `gz`, or 'bz2`.
+    :param filename: the file to open.
     """
     ext = os.path.splitext(filename)[1]
     if ext == '.gz':
@@ -270,7 +277,7 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
 def process_dump(input_file, template_file, out_file, file_size, file_compress,
-                 process_count, escape_doc):
+                 process_count, html_safe):
     """
     :param input_file: name of the wikipedia dump file; '-' to read from stdin
     :param template_file: optional file with template definitions.
@@ -361,7 +368,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     workers = []
     for _ in range(max(1, process_count)):
         extractor = Process(target=extract_process,
-                            args=(jobs_queue, output_queue, escape_doc))
+                            args=(jobs_queue, output_queue, html_safe))
         extractor.daemon = True  # only live while parent process lives
         extractor.start()
         workers.append(extractor)
@@ -371,13 +378,13 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     # we collect individual lines, since str.join() is significantly faster
     # than concatenation
     page = []
-    id = None
-    last_id = None
+    id = ''
+    revid = ''
+    last_id = ''
     ordinal = 0  # page count
     inText = False
     redirect = False
     for line in input:
-        #line = line.decode('utf-8')
         if '<' not in line:  # faster than doing re.search()
             if inText:
                 page.append(line)
@@ -391,6 +398,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
             redirect = False
         elif tag == 'id' and not id:
             id = m.group(3)
+        elif tag == 'id' and id:  # <revision> <id></id> </revision>
+            revid = m.group(3)
         elif tag == 'title':
             title = m.group(3)
         elif tag == 'redirect':
@@ -411,11 +420,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
             colon = title.find(':')
             if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                     not redirect and not title.startswith(templateNamespace)):
-                job = (id, urlbase, title, page, ordinal)
+                job = (id, revid, urlbase, title, page, ordinal)
                 jobs_queue.put(job)  # goes to any available extract_process
                 last_id = id
                 ordinal += 1
-            id = None
+            id = ''
+            revid = ''
             page = []
     input.close()
@@ -444,19 +454,19 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 # Multiprocess support
-def extract_process(jobs_queue, output_queue, escape_doc):
+def extract_process(jobs_queue, output_queue, html_safe):
     """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
     :param jobs_queue: where to get jobs.
     :param output_queue: where to queue extracted text for output.
-    :escape_doc: whether to convert entities in text to HTML.
+    :html_safe: whether to convert entities in text to HTML.
     """
     while True:
-        job = jobs_queue.get()  # job is (id, title, page, ordinal)
+        job = jobs_queue.get()  # job is (id, revid, urlbase, title, page, ordinal)
         if job:
             out = StringIO()  # memory buffer
-            Extractor(*job[:4]).extract(out, escape_doc)  # (id, urlbase, title, page)
+            Extractor(*job[:-1]).extract(out, html_safe)  # (id, urlbase, title, page)
             text = out.getvalue()
-            output_queue.put((job[4], text))  # (ordinal, extracted_text)
+            output_queue.put((job[-1], text))  # (ordinal, extracted_text)
             out.close()
         else:
             break
@@ -515,6 +525,8 @@ def main():
                         metavar="n[KMG]")
     groupO.add_argument("-c", "--compress", action="store_true",
                         help="compress output files using bzip")
+    groupO.add_argument("--json", action="store_true",
+                        help="write output in json format instead of the default <doc> format")
     groupP = parser.add_argument_group('Processing')
     groupP.add_argument("--html", action="store_true",
@@ -527,8 +539,8 @@ def main():
                         help="use or create file containing templates")
     groupP.add_argument("--no-templates", action="store_false",
                         help="Do not expand templates")
-    groupP.add_argument("--escape-doc", default=True,
-                        help="use to produce proper HTML in the output <doc>...</doc>")
+    groupP.add_argument("--html-safe", default=True,
+                        help="use to produce HTML safe output within <doc>...</doc>")
     default_process_count = cpu_count() - 1
     parser.add_argument("--processes", type=int, default=default_process_count,
                         help="Number of processes to use (default %(default)s)")
@@ -550,6 +562,7 @@ def main():
     Extractor.HtmlFormatting = args.html
     if args.html:
         Extractor.keepLinks = True
+    Extractor.to_json = args.json
     expand_templates = args.no_templates
@@ -590,16 +603,23 @@ def main():
                 load_templates(file)
         with open(input_file) as file:
-            page = file.read()#.decode('utf-8')
-            m = re.search(r'<id>(.*)</id>', page)
-            id = m.group(1) if m else 0
-            m = re.search(r'<title>(.*)</title>', page)
+            page = file.read()
+            ids = re.findall(r'<id>(\d*?)</id>', page)
+            id = ids[0] if ids else ''
+            revid = ids[1] if len(ids) > 1 else ''
+            m = re.search(r'<title>(.*?)</title>', page)
             if m:
                 title = m.group(1)
             else:
                 logging.error('Missing title element')
                 return
-            Extractor(id, title, [page]).extract(sys.stdout)
+            m = re.search(r'<base>(.*?)</base>', page)
+            if m:
+                base = m.group(1)
+                urlbase = base[:base.rfind("/")]
+            else:
+                urlbase = ''
+            Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
         return
     output_path = args.output
@@ -611,7 +631,7 @@ def main():
         return
     process_dump(input_file, args.templates, output_path, file_size,
-                 args.compress, args.processes, args.escape_doc)
+                 args.compress, args.processes, args.html_safe)
 if __name__ == '__main__':
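The hunks above that introduce `revid` rely on the layout of a MediaWiki export: within a `<page>` element the first `<id>` is the page id and the `<id>` nested inside `<revision>` is the revision id, which is why the streaming parser records a second `<id>` as `revid` and why `main()` switches from `re.search` to `re.findall`. A small sketch of that behaviour, using a made-up `<page>` fragment:

```python
import re

# Trimmed <page> element in MediaWiki export format: the first <id> is the
# page id, the <id> inside <revision> is the revision id.
sample = """<page>
  <title>Example</title>
  <id>12</id>
  <revision>
    <id>34567</id>
    <text>Some wikitext.</text>
  </revision>
</page>"""

ids = re.findall(r'<id>(\d*?)</id>', sample)
page_id = ids[0] if ids else ''
revid = ids[1] if len(ids) > 1 else ''
print(page_id, revid)  # prints: 12 34567
```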

View File

@@ -20,8 +20,9 @@
 import re
 import html
+import json
 from itertools import zip_longest
-from urllib.parse import quote as urlquote
+from urllib.parse import quote as urlencode
 from html.entities import name2codepoint
 import logging
 import time
@@ -66,14 +67,14 @@ def get_url(urlbase, uid):
 # ======================================================================
-def clean(extractor, text, expand_templates=False, escape_doc=True):
+def clean(extractor, text, expand_templates=False, html_safe=True):
     """
     Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
     @see https://www.mediawiki.org/wiki/Help:Formatting
     :param extractor: the Extractor t use.
     :param text: the text to clean.
     :param expand_templates: whether to perform template expansion.
-    :param escape_doc: whether to convert special characters to HTML entities.
+    :param html_safe: whether to convert reserved HTML characters to entities.
     @return: the cleaned text.
     """
@@ -171,7 +172,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = re.sub(u'(\[\(«) ', r'\1', text)
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
-    if escape_doc:
+    if html_safe:
         text = html.escape(text, quote=False)
     return text
@@ -419,7 +420,7 @@ def replaceExternalLinks(text):
 def makeExternalLink(url, anchor):
     """Function applied to wikiLinks"""
     if Extractor.keepLinks:
-        return '<a href="%s">%s</a>' % (urlquote(url.encode('utf-8')), anchor)
+        return '<a href="%s">%s</a>' % (urlencode(url), anchor)
     else:
         return anchor
@@ -489,7 +490,7 @@ def makeInternalLink(title, label):
     if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
         return ''
     if Extractor.keepLinks:
-        return '<a href="%s">%s</a>' % (urlquote(title), label)
+        return '<a href="%s">%s</a>' % (urlencode(title), label)
     else:
         return label
@@ -806,11 +807,16 @@ class Extractor():
     # Whether to output text with HTML formatting elements in <doc> files.
     HtmlFormatting = False
-    def __init__(self, id, urlbase, title, page):
+    ##
+    # Whether to produce json instead of the default <doc> output format.
+    toJson = False
+
+    def __init__(self, id, revid, urlbase, title, page):
         """
         :param page: a list of lines.
         """
         self.id = id
+        self.revid = revid
         self.url = get_url(urlbase, id)
         self.title = title
         self.page = page
@@ -822,7 +828,7 @@ class Extractor():
         self.template_title_errs = 0
     def clean_text(self, text, mark_headers=False, expand_templates=False,
-                   escape_doc=True):
+                   html_safe=True):
         """
         :param mark_headers: True to distinguish headers from paragraphs
                              e.g. "## Section 1"
@@ -836,30 +842,41 @@ class Extractor():
         self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
         text = clean(self, text, expand_templates=expand_templates,
-                     escape_doc=escape_doc)
+                     html_safe=html_safe)
         text = compact(text, mark_headers=mark_headers)
         return text
-    def extract(self, out, escape_doc=True):
+    def extract(self, out, html_safe=True):
         """
         :param out: a memory file.
+        :param html_safe: whether to escape HTML entities.
         """
         logging.debug("%s\t%s", self.id, self.title)
         text = ''.join(self.page)
-        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
-        # Separate header from text with a newline.
-        header += self.title + '\n\n'
-        footer = "\n</doc>\n"
-        out.write(header)
-        text = self.clean_text(text, escape_doc=escape_doc)
-        for line in text:
-            out.write(line)
-            out.write('\n')
-        out.write(footer)
+        text = self.clean_text(text, html_safe=html_safe)
+        if self.to_json:
+            json_data = {
+                'id': self.id,
+                'revid': self.revid,
+                'url': self.url,
+                'title': self.title,
+                'text': "\n".join(text)
+            }
+            out_str = json.dumps(json_data)
+            out.write(out_str)
+            out.write('\n')
+        else:
+            header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
+            # Separate header from text with a newline.
+            header += self.title + '\n\n'
+            footer = "\n</doc>\n"
+            out.write(header)
+            out.write('\n'.join(text))
+            out.write('\n')
+            out.write(footer)
         errs = (self.template_title_errs,
                 self.recursion_exceeded_1_errs,
                 self.recursion_exceeded_2_errs,
@@ -1612,7 +1629,7 @@ parserFunctions = {
     # This function is used in some pages to construct links
     # http://meta.wikimedia.org/wiki/Help:URL
-    'urlencode': lambda string, *rest: urlquote(string.encode('utf-8')),
+    'urlencode': lambda string, *rest: urlencode(string),
     'lc': lambda string, *rest: string.lower() if string else '',
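The renamed `html_safe` flag ends up in `clean()` above, where the cleaned text is passed through `html.escape(text, quote=False)`: `&`, `<` and `>` become entities while quotation marks are left alone. A tiny sketch with an invented sample string:

```python
import html

raw = 'AT&T bought <b>something</b> for "cheap"'
# quote=False escapes &, < and >, but leaves quotation marks untouched,
# matching the html_safe branch in clean().
print(html.escape(raw, quote=False))
# AT&amp;T bought &lt;b&gt;something&lt;/b&gt; for "cheap"
```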