improved comment around regex
This commit is contained in:
parent
663a3dea73
commit
12fb5e587d
@ -546,7 +546,6 @@ class Extractor(object):
|
||||
logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)",
|
||||
self.title, self.id, *errs)
|
||||
|
||||
|
||||
def transform(self, wikitext):
|
||||
"""
|
||||
Transforms wiki markup.
|
||||
@ -562,7 +561,6 @@ class Extractor(object):
|
||||
res += self.transform1(wikitext[cur:])
|
||||
return res
|
||||
|
||||
|
||||
def transform1(self, text):
|
||||
"""Transform text not containing <nowiki>"""
|
||||
if Extractor.expand_templates:
|
||||
@ -628,7 +626,6 @@ class Extractor(object):
|
||||
text = res + unescape(text[cur:])
|
||||
return text
|
||||
|
||||
|
||||
def clean(self, text):
|
||||
"""
|
||||
Removes irrelevant parts from :param: text.
|
||||
@ -684,6 +681,9 @@ class Extractor(object):
|
||||
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
|
||||
text = text.replace(',,', ',').replace(',.', '.')
|
||||
if keep_tables:
|
||||
# the following regular expressions are used to remove the wikiml characters around table structures
|
||||
# yet keep the content. The order here is important so we remove certain markup like {| and then
|
||||
# the future html attributes such as 'style'. Finally we drop the remaining '|-' that delimits cells.
|
||||
text = re.sub(r'!(?:\s)?style=\"[a-z]+:(?:\d+)%;\"', r'', text)
|
||||
text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text)
|
||||
text = text.replace('|-', '')
|
||||
|
Loading…
Reference in New Issue
Block a user