diff --git a/README.md b/README.md
index 68d967f..321d008 100644
--- a/README.md
+++ b/README.md
@@ -54,103 +54,68 @@ The option `--templates` extracts the templates to a local file, which can be re
 
 The output is stored in several files of similar size in a given directory.
 Each file will contains several documents in this [document format](https://github.com/attardi/wikiextractor/wiki/File-Format).
 
-    usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html]
-                            [-l] [-s] [--lists] [-ns ns1,ns2]
-                            [--templates TEMPLATES] [--no-templates] [-r]
-                            [--min_text_length MIN_TEXT_LENGTH]
-                            [--filter_category path_of_categories_file]
-                            [--filter_disambig_pages] [-it abbr,b,big]
-                            [-de gallery,timeline,noinclude] [--keep_tables]
-                            [--processes PROCESSES] [-q] [--debug] [-a] [-v]
-                            [--log_file]
-                            input
+```
+usage: wikiextractor [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html] [-l] [-ns ns1,ns2]
+                     [--templates TEMPLATES] [--no-templates] [--html-safe HTML_SAFE] [--processes PROCESSES]
+                     [-q] [--debug] [-a] [-v]
+                     input
 
-        Wikipedia Extractor:
-        Extracts and cleans text from a Wikipedia database dump and stores output in a
-        number of files of similar size in a given directory.
-        Each file will contain several documents in the format:
+Wikipedia Extractor:
+Extracts and cleans text from a Wikipedia database dump and stores output in a
+number of files of similar size in a given directory.
+Each file will contain several documents in the format:
 
-        <doc id="" url="" title="">
-        ...
-        </doc>
+    <doc id="" url="" title="">
+    ...
+    </doc>
 
-        If the program is invoked with the --json flag, then each file will
-        contain several documents formatted as json ojects, one per line, with
-        the following structure
+If the program is invoked with the --json flag, then each file will
+contain several documents formatted as json objects, one per line, with
+the following structure:
 
-        {"id": "", "revid": "", "url":"", "title": "", "text": "..."}
+    {"id": "", "revid": "", "url": "", "title": "", "text": "..."}
 
-        Template expansion requires preprocessing first the whole dump and
-        collecting template definitions.
+The program performs template expansion by preprocessing the whole dump and
+collecting template definitions.
 
-    positional arguments:
-      input                 XML wiki dump file
+positional arguments:
+  input                 XML wiki dump file
 
-    optional arguments:
-      -h, --help            show this help message and exit
-      --processes PROCESSES
-                            Number of processes to use (default 1)
+optional arguments:
+  -h, --help            show this help message and exit
+  --processes PROCESSES
+                        Number of processes to use (default 79)
 
-    Output:
-      -o OUTPUT, --output OUTPUT
-                            directory for extracted files (or '-' for dumping to
-                            stdout)
-      -b n[KMG], --bytes n[KMG]
-                            maximum bytes per output file (default 1M)
-      -c, --compress        compress output files using bzip
-      --json                write output in json format instead of the default one
+Output:
+  -o OUTPUT, --output OUTPUT
+                        directory for extracted files (or '-' for dumping to stdout)
+  -b n[KMG], --bytes n[KMG]
+                        maximum bytes per output file (default 1M)
+  -c, --compress        compress output files using bzip
+  --json                write output in json format instead of the default format
 
-    Processing:
-      --html                produce HTML output, subsumes --links
-      -l, --links           preserve links
-      -s, --sections        preserve sections
-      --lists               preserve lists
-      -ns ns1,ns2, --namespaces ns1,ns2
-                            accepted namespaces in links
-      --templates TEMPLATES
-                            use or create file containing templates
-      --no-templates        Do not expand templates
-      -r, --revision        Include the document revision id (default=False)
-      --min_text_length MIN_TEXT_LENGTH
-                            Minimum expanded text length required to write
-                            document (default=0)
-      --filter_category path_of_categories_file
-                            Include or exclude specific categories from the dataset. Specify the categories in
-                            file 'path_of_categories_file'. Format:
-                            One category one line, and if the line starts with:
-                                1) #: Comments, ignored;
-                                2) ^: the categories will be in excluding-categories
-                                3) others: the categories will be in including-categories.
-                            Priority:
-                                1) If excluding-categories is not empty, and any category of a page exists in excluding-categories, the page will be excluded; else
-                                2) If including-categories is not empty, and no category of a page exists in including-categories, the page will be excluded; else
-                                3) the page will be included
-
-      --filter_disambig_pages
-                            Remove pages from output that contain disabmiguation
-                            markup (default=False)
-      -it abbr,b,big, --ignored_tags abbr,b,big
-                            comma separated list of tags that will be dropped,
-                            keeping their content
-      -de gallery,timeline,noinclude, --discard_elements gallery,timeline,noinclude
-                            comma separated list of elements that will be removed
-                            from the article text
-      --keep_tables         Preserve tables in the output article text
-                            (default=False)
-
-    Special:
-      -q, --quiet           suppress reporting progress info
-      --debug               print debug info
-      -a, --article         analyze a file containing a single article (debug
-                            option)
-      -v, --version         print program version
-      --log_file            specify a file to save the log information.
+Processing:
+  --html                produce HTML output, subsumes --links
+  -l, --links           preserve links
+  -ns ns1,ns2, --namespaces ns1,ns2
+                        accepted namespaces
+  --templates TEMPLATES
+                        use or create file containing templates
+  --no-templates        Do not expand templates
+  --html-safe HTML_SAFE
+                        use to produce HTML safe output within <doc>...</doc>
+
+Special:
+  -q, --quiet           suppress reporting progress info
+  --debug               print debug info
+  -a, --article         analyze a file containing a single article (debug option)
+  -v, --version         print program version
+```
 
 Saving templates to a file will speed up performing extraction the next time,
 assuming template definitions have not changed.
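For example, the JSON output (one object per line, with the `id`, `revid`, `url`, `title` and `text` fields shown above) can be consumed with the standard `json` module. This is a minimal sketch, not part of the patch; it assumes the extractor was run on a dump with `--json -o extracted`, without `--compress`, and that the output files keep the default `wiki_*` naming:

```python
import json
from pathlib import Path

# Assumption: output was produced with `--json -o extracted` and no --compress,
# so every line of every output file is one JSON document.
for path in sorted(Path("extracted").rglob("wiki_*")):
    with open(path, encoding="utf-8") as f:
        for line in f:
            doc = json.loads(line)
            print(doc["id"], doc["revid"], doc["title"], doc["url"])
            # doc["text"] holds the cleaned article text
```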
-Option --no-templates significantly speeds up the extractor, avoiding the cost
+Option `--no-templates` significantly speeds up the extractor, avoiding the cost
 of expanding [MediaWiki templates](https://www.mediawiki.org/wiki/Help:Templates).
 
 For further information, visit [the documentation](http://attardi.github.io/wikiextractor).
diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 1335555..69cdec7 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -43,7 +43,13 @@ Each file will contain several documents in the format:
     <doc id="" url="" title="">
         ...
         </doc>
 
-This version performs template expansion by preprocesssng the whole dump and
+If the program is invoked with the --json flag, then each file will
+contain several documents formatted as json objects, one per line, with
+the following structure:
+
+    {"id": "", "revid": "", "url": "", "title": "", "text": "..."}
+
+The program performs template expansion by preprocessing the whole dump and
 collecting template definitions.
 """
@@ -258,6 +264,7 @@ def load_templates(file, output_file=None):
 def decode_open(filename, mode='rt', encoding='utf-8'):
     """
     Open a file, decode and decompress, depending on extension `gz`, or 'bz2`.
+    :param filename: the file to open.
     """
     ext = os.path.splitext(filename)[1]
     if ext == '.gz':
@@ -270,7 +277,7 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
 
 
 def process_dump(input_file, template_file, out_file, file_size, file_compress,
-                 process_count, escape_doc):
+                 process_count, html_safe):
     """
     :param input_file: name of the wikipedia dump file; '-' to read from stdin
     :param template_file: optional file with template definitions.
@@ -361,7 +368,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     workers = []
     for _ in range(max(1, process_count)):
         extractor = Process(target=extract_process,
-                            args=(jobs_queue, output_queue, escape_doc))
+                            args=(jobs_queue, output_queue, html_safe))
         extractor.daemon = True  # only live while parent process lives
         extractor.start()
         workers.append(extractor)
@@ -371,13 +378,13 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
     # we collect individual lines, since str.join() is significantly faster
     # than concatenation
     page = []
-    id = None
-    last_id = None
+    id = ''
+    revid = ''
+    last_id = ''
    ordinal = 0  # page count
    inText = False
    redirect = False
    for line in input:
-        #line = line.decode('utf-8')
        if '<' not in line:  # faster than doing re.search()
            if inText:
                page.append(line)
@@ -391,6 +398,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
                redirect = False
            elif tag == 'id' and not id:
                id = m.group(3)
+            elif tag == 'id' and id:  # <revision> <id></id>
+                revid = m.group(3)
            elif tag == 'title':
                title = m.group(3)
            elif tag == 'redirect':
@@ -411,11 +420,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
            colon = title.find(':')
            if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
                    not redirect and not title.startswith(templateNamespace)):
-                job = (id, urlbase, title, page, ordinal)
+                job = (id, revid, urlbase, title, page, ordinal)
                jobs_queue.put(job)  # goes to any available extract_process
                last_id = id
                ordinal += 1
-            id = None
+            id = ''
+            revid = ''
            page = []
 
    input.close()
@@ -444,19 +454,19 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
 
 # Multiprocess support
 
-def extract_process(jobs_queue, output_queue, escape_doc):
+def extract_process(jobs_queue, output_queue, html_safe):
     """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
     :param jobs_queue: where to get jobs.
     :param output_queue: where to queue extracted text for output.
-    :escape_doc: whether to convert entities in text to HTML.
+    :html_safe: whether to convert entities in text to HTML.
     """
     while True:
-        job = jobs_queue.get()  # job is (id, title, page, ordinal)
+        job = jobs_queue.get()  # job is (id, revid, urlbase, title, page, ordinal)
         if job:
             out = StringIO()  # memory buffer
-            Extractor(*job[:4]).extract(out, escape_doc)  # (id, urlbase, title, page)
+            Extractor(*job[:-1]).extract(out, html_safe)  # (id, revid, urlbase, title, page)
             text = out.getvalue()
-            output_queue.put((job[4], text))  # (ordinal, extracted_text)
+            output_queue.put((job[-1], text))  # (ordinal, extracted_text)
             out.close()
         else:
             break
@@ -515,6 +525,8 @@ def main():
                         metavar="n[KMG]")
     groupO.add_argument("-c", "--compress", action="store_true",
                         help="compress output files using bzip")
+    groupO.add_argument("--json", action="store_true",
+                        help="write output in json format instead of the default format")
 
     groupP = parser.add_argument_group('Processing')
     groupP.add_argument("--html", action="store_true",
@@ -527,8 +539,8 @@ def main():
                         help="use or create file containing templates")
     groupP.add_argument("--no-templates", action="store_false",
                         help="Do not expand templates")
-    groupP.add_argument("--escape-doc", default=True,
-                        help="use to produce proper HTML in the output <doc>...</doc>")
+    groupP.add_argument("--html-safe", default=True,
+                        help="use to produce HTML safe output within <doc>...</doc>")
     default_process_count = cpu_count() - 1
     parser.add_argument("--processes", type=int, default=default_process_count,
                         help="Number of processes to use (default %(default)s)")
@@ -550,6 +562,7 @@ def main():
     Extractor.HtmlFormatting = args.html
     if args.html:
         Extractor.keepLinks = True
+    Extractor.to_json = args.json
 
     expand_templates = args.no_templates
 
@@ -590,16 +603,23 @@ def main():
                 load_templates(file)
 
         with open(input_file) as file:
-            page = file.read()#.decode('utf-8')
-            m = re.search(r'<id>(.*)</id>', page)
-            id = m.group(1) if m else 0
-            m = re.search(r'<title>(.*)</title>', page)
+            page = file.read()
+            ids = re.findall(r'<id>(\d*?)</id>', page)
+            id = ids[0] if ids else ''
+            revid = ids[1] if len(ids) > 1 else ''
+            m = re.search(r'<title>(.*?)</title>', page)
             if m:
                 title = m.group(1)
             else:
                 logging.error('Missing title element')
                 return
-            Extractor(id, title, [page]).extract(sys.stdout)
+            m = re.search(r'<base>(.*?)</base>', page)
+            if m:
+                base = m.group(1)
+                urlbase = base[:base.rfind("/")]
+            else:
+                urlbase = ''
+            Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
             return
 
         output_path = args.output
@@ -611,7 +631,7 @@ def main():
             return
 
     process_dump(input_file, args.templates, output_path, file_size,
-                 args.compress, args.processes, args.escape_doc)
+                 args.compress, args.processes, args.html_safe)
 
 
 if __name__ == '__main__':
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index 52b0eae..2180dc4 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -20,8 +20,9 @@
 import re
 import html
+import json
 from itertools import zip_longest
-from urllib.parse import quote as urlquote
+from urllib.parse import quote as urlencode
 from html.entities import name2codepoint
 import logging
 import time
@@ -66,14 +67,14 @@ def get_url(urlbase, uid):
 # ======================================================================
 
 
-def clean(extractor, text, expand_templates=False, escape_doc=True):
+def clean(extractor, text, expand_templates=False, html_safe=True):
     """
     Transforms wiki markup. If the command line flag --escapedoc is set
     then the text is also escaped.
     @see https://www.mediawiki.org/wiki/Help:Formatting
     :param extractor: the Extractor t use.
     :param text: the text to clean.
     :param expand_templates: whether to perform template expansion.
-    :param escape_doc: whether to convert special characters to HTML entities.
+    :param html_safe: whether to convert reserved HTML characters to entities.
     @return: the cleaned text.
     """
@@ -171,7 +172,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
     text = re.sub(u'(\[\(«) ', r'\1', text)
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
-    if escape_doc:
+    if html_safe:
         text = html.escape(text, quote=False)
     return text
@@ -419,7 +420,7 @@ def replaceExternalLinks(text):
 def makeExternalLink(url, anchor):
     """Function applied to wikiLinks"""
     if Extractor.keepLinks:
-        return '<a href="%s">%s</a>' % (urlquote(url.encode('utf-8')), anchor)
+        return '<a href="%s">%s</a>' % (urlencode(url), anchor)
     else:
         return anchor
@@ -489,7 +490,7 @@ def makeInternalLink(title, label):
     if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
         return ''
     if Extractor.keepLinks:
-        return '<a href="%s">%s</a>' % (urlquote(title), label)
+        return '<a href="%s">%s</a>' % (urlencode(title), label)
     else:
         return label
@@ -806,11 +807,16 @@ class Extractor():
     # Whether to output text with HTML formatting elements in files.
     HtmlFormatting = False
 
-    def __init__(self, id, urlbase, title, page):
+    ##
+    # Whether to produce json instead of the default output format.
+    to_json = False
+
+    def __init__(self, id, revid, urlbase, title, page):
         """
         :param page: a list of lines.
         """
         self.id = id
+        self.revid = revid
         self.url = get_url(urlbase, id)
         self.title = title
         self.page = page
@@ -822,7 +828,7 @@ class Extractor():
         self.template_title_errs = 0
 
     def clean_text(self, text, mark_headers=False, expand_templates=False,
-                   escape_doc=True):
+                   html_safe=True):
         """
         :param mark_headers: True to distinguish headers from paragraphs
             e.g. "## Section 1"
@@ -836,30 +842,41 @@ class Extractor():
         self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
 
         text = clean(self, text, expand_templates=expand_templates,
-                     escape_doc=escape_doc)
+                     html_safe=html_safe)
         text = compact(text, mark_headers=mark_headers)
         return text
 
-    def extract(self, out, escape_doc=True):
+    def extract(self, out, html_safe=True):
         """
         :param out: a memory file.
+        :param html_safe: whether to escape HTML entities.
         """
         logging.debug("%s\t%s", self.id, self.title)
         text = ''.join(self.page)
+        text = self.clean_text(text, html_safe=html_safe)
 
-        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
-        # Separate header from text with a newline.
-        header += self.title + '\n\n'
-        footer = "\n</doc>\n"
-        out.write(header)
-
-        text = self.clean_text(text, escape_doc=escape_doc)
-
-        for line in text:
-            out.write(line)
+        if self.to_json:
+            json_data = {
+                'id': self.id,
+                'revid': self.revid,
+                'url': self.url,
+                'title': self.title,
+                'text': "\n".join(text)
+            }
+            out_str = json.dumps(json_data)
+            out.write(out_str)
             out.write('\n')
-        out.write(footer)
+        else:
+            header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
+            # Separate header from text with a newline.
+            header += self.title + '\n\n'
+            footer = "\n</doc>\n"
+            out.write(header)
+            out.write('\n'.join(text))
+            out.write('\n')
+            out.write(footer)
+
         errs = (self.template_title_errs,
                 self.recursion_exceeded_1_errs,
                 self.recursion_exceeded_2_errs,
@@ -1612,7 +1629,7 @@ parserFunctions = {
 
     # This function is used in some pages to construct links
     # http://meta.wikimedia.org/wiki/Help:URL
-    'urlencode': lambda string, *rest: urlquote(string.encode('utf-8')),
+    'urlencode': lambda string, *rest: urlencode(string),
 
     'lc': lambda string, *rest: string.lower() if string else '',
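As a quick illustration of the two standard-library calls this patch leans on (not part of the patch itself): `html.escape(text, quote=False)` is what the `html_safe` path applies to the cleaned text, and `urllib.parse.quote`, imported above under the alias `urlencode`, replaces the old `urlquote(....encode('utf-8'))` calls because `quote` accepts `str` directly in Python 3:

```python
import html
from urllib.parse import quote as urlencode

# What the html_safe branch does to reserved characters
# (quote=False leaves double quotes untouched).
print(html.escape('AT&T <b>bold</b> "quoted"', quote=False))
# AT&amp;T &lt;b&gt;bold&lt;/b&gt; "quoted"

# What the urlencode alias produces for a link target;
# urllib.parse.quote encodes str input as UTF-8 itself, so the
# explicit .encode('utf-8') from the old code is no longer needed.
print(urlencode("Help:Templates and pages"))
# Help%3ATemplates%20and%20pages
```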