diff --git a/WikiExtractor.py b/WikiExtractor.py index cc14b99..a36e932 100755 --- a/WikiExtractor.py +++ b/WikiExtractor.py @@ -546,7 +546,6 @@ class Extractor(object): logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)", self.title, self.id, *errs) - def transform(self, wikitext): """ Transforms wiki markup. @@ -562,7 +561,6 @@ class Extractor(object): res += self.transform1(wikitext[cur:]) return res - def transform1(self, text): """Transform text not containing """ if Extractor.expand_templates: @@ -628,7 +626,6 @@ class Extractor(object): text = res + unescape(text[cur:]) return text - def clean(self, text): """ Removes irrelevant parts from :param: text. @@ -684,6 +681,9 @@ class Extractor(object): text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations text = text.replace(',,', ',').replace(',.', '.') if keep_tables: + # the following regular expressions are used to remove the wikiml characters around table structures + # yet keep the content. The order here is important so we remove certain markup like {| and + # then the future html attributes such as 'style'. Finally we drop the remaining '|-' that delimits cells. text = re.sub(r'!(?:\s)?style=\"[a-z]+:(?:\d+)%;\"', r'', text) text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text) text = text.replace('|-', '')