diff --git a/README.md b/README.md
index 68d967f..321d008 100644
--- a/README.md
+++ b/README.md
@@ -54,103 +54,68 @@ The option `--templates` extracts the templates to a local file, which can be re
The output is stored in several files of similar size in a given directory.
Each file will contains several documents in this [document format](https://github.com/attardi/wikiextractor/wiki/File-Format).
- usage: WikiExtractor.py [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html]
- [-l] [-s] [--lists] [-ns ns1,ns2]
- [--templates TEMPLATES] [--no-templates] [-r]
- [--min_text_length MIN_TEXT_LENGTH]
- [--filter_category path_of_categories_file]
- [--filter_disambig_pages] [-it abbr,b,big]
- [-de gallery,timeline,noinclude] [--keep_tables]
- [--processes PROCESSES] [-q] [--debug] [-a] [-v]
- [--log_file]
- input
+```
+usage: wikiextractor [-h] [-o OUTPUT] [-b n[KMG]] [-c] [--json] [--html] [-l] [-ns ns1,ns2]
+ [--templates TEMPLATES] [--no-templates] [--html-safe HTML_SAFE] [--processes PROCESSES]
+ [-q] [--debug] [-a] [-v]
+ input
- Wikipedia Extractor:
- Extracts and cleans text from a Wikipedia database dump and stores output in a
- number of files of similar size in a given directory.
- Each file will contain several documents in the format:
+Wikipedia Extractor:
+Extracts and cleans text from a Wikipedia database dump and stores output in a
+number of files of similar size in a given directory.
+Each file will contain several documents in the format:
- <doc id="" url="" title="">
-     ...
- </doc>
+ <doc id="" url="" title="">
+     ...
+ </doc>
- If the program is invoked with the --json flag, then each file will
- contain several documents formatted as json ojects, one per line, with
- the following structure
+If the program is invoked with the --json flag, then each file will
+contain several documents formatted as json objects, one per line, with
+the following structure
- {"id": "", "revid": "", "url":"", "title": "", "text": "..."}
+ {"id": "", "revid": "", "url": "", "title": "", "text": "..."}
- Template expansion requires preprocessing first the whole dump and
- collecting template definitions.
+The program performs template expansion by preprocessing the whole dump and
+collecting template definitions.
- positional arguments:
- input XML wiki dump file
+positional arguments:
+ input XML wiki dump file
- optional arguments:
- -h, --help show this help message and exit
- --processes PROCESSES
- Number of processes to use (default 1)
+optional arguments:
+ -h, --help show this help message and exit
+ --processes PROCESSES
+ Number of processes to use (default 79)
- Output:
- -o OUTPUT, --output OUTPUT
- directory for extracted files (or '-' for dumping to
- stdout)
- -b n[KMG], --bytes n[KMG]
- maximum bytes per output file (default 1M)
- -c, --compress compress output files using bzip
- --json write output in json format instead of the default one
+Output:
+ -o OUTPUT, --output OUTPUT
+ directory for extracted files (or '-' for dumping to stdout)
+ -b n[KMG], --bytes n[KMG]
+ maximum bytes per output file (default 1M)
+ -c, --compress compress output files using bzip
+ --json write output in json format instead of the default format
- Processing:
- --html produce HTML output, subsumes --links
- -l, --links preserve links
- -s, --sections preserve sections
- --lists preserve lists
- -ns ns1,ns2, --namespaces ns1,ns2
- accepted namespaces in links
- --templates TEMPLATES
- use or create file containing templates
- --no-templates Do not expand templates
- -r, --revision Include the document revision id (default=False)
- --min_text_length MIN_TEXT_LENGTH
- Minimum expanded text length required to write
- document (default=0)
- --filter_category path_of_categories_file
- Include or exclude specific categories from the dataset. Specify the categories in
- file 'path_of_categories_file'. Format:
- One category one line, and if the line starts with:
- 1) #: Comments, ignored;
- 2) ^: the categories will be in excluding-categories
- 3) others: the categories will be in including-categories.
- Priority:
- 1) If excluding-categories is not empty, and any category of a page exists in excluding-categories, the page will be excluded; else
- 2) If including-categories is not empty, and no category of a page exists in including-categories, the page will be excluded; else
- 3) the page will be included
-
- --filter_disambig_pages
- Remove pages from output that contain disabmiguation
- markup (default=False)
- -it abbr,b,big, --ignored_tags abbr,b,big
- comma separated list of tags that will be dropped,
- keeping their content
- -de gallery,timeline,noinclude, --discard_elements gallery,timeline,noinclude
- comma separated list of elements that will be removed
- from the article text
- --keep_tables Preserve tables in the output article text
- (default=False)
-
- Special:
- -q, --quiet suppress reporting progress info
- --debug print debug info
- -a, --article analyze a file containing a single article (debug
- option)
- -v, --version print program version
- --log_file specify a file to save the log information.
+Processing:
+ --html produce HTML output, subsumes --links
+ -l, --links preserve links
+ -ns ns1,ns2, --namespaces ns1,ns2
+ accepted namespaces
+ --templates TEMPLATES
+ use or create file containing templates
+ --no-templates Do not expand templates
+ --html-safe HTML_SAFE
+ use to produce HTML safe output within <doc>...</doc>
+Special:
+ -q, --quiet suppress reporting progress info
+ --debug print debug info
+ -a, --article analyze a file containing a single article (debug option)
+ -v, --version print program version
+```
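+
+For example, output produced with the `--json` flag can be read back with a few lines of
+Python. This is a minimal sketch, assuming uncompressed output written to a directory
+named `extracted` (the directory name and layout shown are illustrative):
+
+```python
+import json
+from pathlib import Path
+
+# each extracted file (e.g. extracted/AA/wiki_00) holds one JSON document per line
+for path in sorted(Path("extracted").glob("*/wiki_*")):
+    with path.open(encoding="utf-8") as f:
+        for line in f:
+            doc = json.loads(line)
+            print(doc["id"], doc["revid"], doc["title"])
+```
+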
Saving templates to a file will speed up performing extraction the next time,
assuming template definitions have not changed.
-Option --no-templates significantly speeds up the extractor, avoiding the cost
+Option `--no-templates` significantly speeds up the extractor, avoiding the cost
of expanding [MediaWiki templates](https://www.mediawiki.org/wiki/Help:Templates).
For further information, visit [the documentation](http://attardi.github.io/wikiextractor).
diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 1335555..69cdec7 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -43,7 +43,13 @@ Each file will contain several documents in the format:
    <doc id="" url="" title="">
        ...
        </doc>
-This version performs template expansion by preprocesssng the whole dump and
+If the program is invoked with the --json flag, then each file will
+contain several documents formatted as json objects, one per line, with
+the following structure
+
+ {"id": "", "revid": "", "url": "", "title": "", "text": "..."}
+
+The program performs template expansion by preprocessing the whole dump and
collecting template definitions.
"""
@@ -258,6 +264,7 @@ def load_templates(file, output_file=None):
def decode_open(filename, mode='rt', encoding='utf-8'):
"""
Open a file, decode and decompress, depending on extension `gz`, or 'bz2`.
+ :param filename: the file to open.
"""
ext = os.path.splitext(filename)[1]
if ext == '.gz':
@@ -270,7 +277,7 @@ def decode_open(filename, mode='rt', encoding='utf-8'):
def process_dump(input_file, template_file, out_file, file_size, file_compress,
- process_count, escape_doc):
+ process_count, html_safe):
"""
:param input_file: name of the wikipedia dump file; '-' to read from stdin
:param template_file: optional file with template definitions.
@@ -361,7 +368,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
workers = []
for _ in range(max(1, process_count)):
extractor = Process(target=extract_process,
- args=(jobs_queue, output_queue, escape_doc))
+ args=(jobs_queue, output_queue, html_safe))
extractor.daemon = True # only live while parent process lives
extractor.start()
workers.append(extractor)
@@ -371,13 +378,13 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
# we collect individual lines, since str.join() is significantly faster
# than concatenation
page = []
- id = None
- last_id = None
+ id = ''
+ revid = ''
+ last_id = ''
ordinal = 0 # page count
inText = False
redirect = False
for line in input:
- #line = line.decode('utf-8')
if '<' not in line: # faster than doing re.search()
if inText:
page.append(line)
@@ -391,6 +398,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
redirect = False
elif tag == 'id' and not id:
id = m.group(3)
+ elif tag == 'id' and id: # <revision> <id></id> </revision>
+ revid = m.group(3)
elif tag == 'title':
title = m.group(3)
elif tag == 'redirect':
@@ -411,11 +420,12 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
colon = title.find(':')
if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and
not redirect and not title.startswith(templateNamespace)):
- job = (id, urlbase, title, page, ordinal)
+ job = (id, revid, urlbase, title, page, ordinal)
jobs_queue.put(job) # goes to any available extract_process
last_id = id
ordinal += 1
- id = None
+ id = ''
+ revid = ''
page = []
input.close()
@@ -444,19 +454,19 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress,
# Multiprocess support
-def extract_process(jobs_queue, output_queue, escape_doc):
+def extract_process(jobs_queue, output_queue, html_safe):
"""Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
:param jobs_queue: where to get jobs.
:param output_queue: where to queue extracted text for output.
- :escape_doc: whether to convert entities in text to HTML.
+ :param html_safe: whether to convert reserved HTML characters to entities.
"""
while True:
- job = jobs_queue.get() # job is (id, title, page, ordinal)
+ job = jobs_queue.get() # job is (id, revid, urlbase, title, page, ordinal)
if job:
out = StringIO() # memory buffer
- Extractor(*job[:4]).extract(out, escape_doc) # (id, urlbase, title, page)
+ Extractor(*job[:-1]).extract(out, html_safe) # (id, revid, urlbase, title, page)
text = out.getvalue()
- output_queue.put((job[4], text)) # (ordinal, extracted_text)
+ output_queue.put((job[-1], text)) # (ordinal, extracted_text)
out.close()
else:
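+ # a None job is the termination signal that process_dump puts on the jobs queue once the dump is exhausted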
break
@@ -515,6 +525,8 @@ def main():
metavar="n[KMG]")
groupO.add_argument("-c", "--compress", action="store_true",
help="compress output files using bzip")
+ groupO.add_argument("--json", action="store_true",
+ help="write output in json format instead of the default format")
groupP = parser.add_argument_group('Processing')
groupP.add_argument("--html", action="store_true",
@@ -527,8 +539,8 @@ def main():
help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false",
help="Do not expand templates")
- groupP.add_argument("--escape-doc", default=True,
- help="use to produce proper HTML in the output ...")
+ groupP.add_argument("--html-safe", default=True,
+ help="use to produce HTML safe output within ...")
default_process_count = cpu_count() - 1
parser.add_argument("--processes", type=int, default=default_process_count,
help="Number of processes to use (default %(default)s)")
@@ -550,6 +562,7 @@ def main():
Extractor.HtmlFormatting = args.html
if args.html:
Extractor.keepLinks = True
+ Extractor.to_json = args.json
expand_templates = args.no_templates
@@ -590,16 +603,23 @@ def main():
load_templates(file)
with open(input_file) as file:
- page = file.read()#.decode('utf-8')
- m = re.search(r'<id>(.*)</id>', page)
- id = m.group(1) if m else 0
- m = re.search(r'<title>(.*)</title>', page)
+ page = file.read()
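+ # in the dump, the first <id> element belongs to the <page> and the second to its <revision>, hence the findall below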
+ ids = re.findall(r'<id>(\d*?)</id>', page)
+ id = ids[0] if ids else ''
+ revid = ids[1] if len(ids) > 1 else ''
+ m = re.search(r'<title>(.*?)</title>', page)
if m:
title = m.group(1)
else:
logging.error('Missing title element')
return
- Extractor(id, title, [page]).extract(sys.stdout)
+ m = re.search(r'<base>(.*?)</base>', page)
+ if m:
+ base = m.group(1)
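+ # <base> typically holds the main page url (e.g. https://en.wikipedia.org/wiki/Main_Page); drop the page name to get the url base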
+ urlbase = base[:base.rfind("/")]
+ else:
+ urlbase = ''
+ Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
return
output_path = args.output
@@ -611,7 +631,7 @@ def main():
return
process_dump(input_file, args.templates, output_path, file_size,
- args.compress, args.processes, args.escape_doc)
+ args.compress, args.processes, args.html_safe)
if __name__ == '__main__':
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index 52b0eae..2180dc4 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -20,8 +20,9 @@
import re
import html
+import json
from itertools import zip_longest
-from urllib.parse import quote as urlquote
+from urllib.parse import quote as urlencode
from html.entities import name2codepoint
import logging
import time
@@ -66,14 +67,14 @@ def get_url(urlbase, uid):
# ======================================================================
-def clean(extractor, text, expand_templates=False, escape_doc=True):
+def clean(extractor, text, expand_templates=False, html_safe=True):
"""
Transforms wiki markup. If the command line flag --escapedoc is set then the text is also escaped
@see https://www.mediawiki.org/wiki/Help:Formatting
:param extractor: the Extractor t use.
:param text: the text to clean.
:param expand_templates: whether to perform template expansion.
- :param escape_doc: whether to convert special characters to HTML entities.
+ :param html_safe: whether to convert reserved HTML characters to entities.
@return: the cleaned text.
"""
@@ -171,7 +172,7 @@ def clean(extractor, text, expand_templates=False, escape_doc=True):
text = re.sub(u'(\[\(«) ', r'\1', text)
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
text = text.replace(',,', ',').replace(',.', '.')
- if escape_doc:
+ if html_safe:
text = html.escape(text, quote=False)
return text
@@ -419,7 +420,7 @@ def replaceExternalLinks(text):
def makeExternalLink(url, anchor):
"""Function applied to wikiLinks"""
if Extractor.keepLinks:
- return '<a href="%s">%s</a>' % (urlquote(url.encode('utf-8')), anchor)
+ return '<a href="%s">%s</a>' % (urlencode(url), anchor)
else:
return anchor
@@ -489,7 +490,7 @@ def makeInternalLink(title, label):
if colon2 > 1 and title[colon + 1:colon2] not in acceptedNamespaces:
return ''
if Extractor.keepLinks:
- return '<a href="%s">%s</a>' % (urlquote(title), label)
+ return '<a href="%s">%s</a>' % (urlencode(title), label)
else:
return label
@@ -806,11 +807,16 @@ class Extractor():
# Whether to output text with HTML formatting elements in files.
HtmlFormatting = False
- def __init__(self, id, urlbase, title, page):
+ ##
+ # Whether to produce json instead of the default output format.
+ to_json = False
+
+ def __init__(self, id, revid, urlbase, title, page):
"""
:param page: a list of lines.
"""
self.id = id
+ self.revid = revid
self.url = get_url(urlbase, id)
self.title = title
self.page = page
@@ -822,7 +828,7 @@ class Extractor():
self.template_title_errs = 0
def clean_text(self, text, mark_headers=False, expand_templates=False,
- escape_doc=True):
+ html_safe=True):
"""
:param mark_headers: True to distinguish headers from paragraphs
e.g. "## Section 1"
@@ -836,30 +842,41 @@ class Extractor():
self.magicWords['currenttime'] = time.strftime('%H:%M:%S')
text = clean(self, text, expand_templates=expand_templates,
- escape_doc=escape_doc)
+ html_safe=html_safe)
text = compact(text, mark_headers=mark_headers)
return text
- def extract(self, out, escape_doc=True):
+ def extract(self, out, html_safe=True):
"""
:param out: a memory file.
+ :param html_safe: whether to convert reserved HTML characters to entities.
"""
logging.debug("%s\t%s", self.id, self.title)
text = ''.join(self.page)
+ text = self.clean_text(text, html_safe=html_safe)
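+ # clean_text() returns a list of paragraph strings (the output of compact()), joined back together below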
- header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
- # Separate header from text with a newline.
- header += self.title + '\n\n'
- footer = "\n\n"
- out.write(header)
-
- text = self.clean_text(text, escape_doc=escape_doc)
-
- for line in text:
- out.write(line)
+ if self.to_json:
+ json_data = {
+ 'id': self.id,
+ 'revid': self.revid,
+ 'url': self.url,
+ 'title': self.title,
+ 'text': "\n".join(text)
+ }
+ out_str = json.dumps(json_data)
+ out.write(out_str)
out.write('\n')
- out.write(footer)
+ else:
+ header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
+ # Separate header from text with a newline.
+ header += self.title + '\n\n'
+ footer = "\n\n"
+ out.write(header)
+ out.write('\n'.join(text))
+ out.write('\n')
+ out.write(footer)
+
errs = (self.template_title_errs,
self.recursion_exceeded_1_errs,
self.recursion_exceeded_2_errs,
@@ -1612,7 +1629,7 @@ parserFunctions = {
# This function is used in some pages to construct links
# http://meta.wikimedia.org/wiki/Help:URL
- 'urlencode': lambda string, *rest: urlquote(string.encode('utf-8')),
+ 'urlencode': lambda string, *rest: urlencode(string),
'lc': lambda string, *rest: string.lower() if string else '',