From a291104144b716793a96faca119a530caba90b24 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 6 Apr 2021 11:04:08 +1000 Subject: [PATCH] Add defensive programming to site downloaders --- .../site_downloaders/erome.py | 5 +++-- .../site_downloaders/gallery.py | 4 ++-- .../site_downloaders/gfycat.py | 8 +++++++- .../site_downloaders/gif_delivery_network.py | 12 +++++++---- .../site_downloaders/imgur.py | 20 ++++++++++++------- .../site_downloaders/redgifs.py | 18 +++++++++++++---- 6 files changed, 47 insertions(+), 20 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index c223cd1..ae896e2 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -7,7 +7,7 @@ from typing import Optional import bs4 from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError +from bulkredditdownloader.exceptions import SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -21,8 +21,9 @@ class Erome(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: links = self._get_links(self.post.url) + if not links: - raise NotADownloadableLinkError('Erome parser could not find any links') + raise SiteDownloaderError('Erome parser could not find any links') out = [] for link in links: diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 8d7c074..829951c 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -7,7 +7,7 @@ from typing import Optional import bs4 from praw.models import Submission -from bulkredditdownloader.exceptions import ResourceNotFound +from 
bulkredditdownloader.exceptions import SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -22,7 +22,7 @@ class Gallery(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: image_urls = self._get_links(self.post.url) if not image_urls: - raise ResourceNotFound('No images found in Reddit gallery') + raise SiteDownloaderError('No images found in Reddit gallery') return [Resource(self.post, url) for url in image_urls] @staticmethod diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index 6d1c3c7..62cee25 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -7,6 +7,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission +from bulkredditdownloader.exceptions import SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -31,5 +32,10 @@ class Gfycat(GifDeliveryNetwork): soup = BeautifulSoup(response.text, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) - out = json.loads(content.contents[0]).get('video').get('contentUrl') + try: + out = json.loads(content.contents[0])['video']['contentUrl'] + except (IndexError, KeyError) as e: + raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}') + except json.JSONDecodeError as e: + raise SiteDownloaderError(f'Did not receive valid JSON data: {e}') return out diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py 
b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 2d433d5..31d5660 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -5,7 +5,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -26,7 +26,11 @@ class GifDeliveryNetwork(BaseDownloader): soup = BeautifulSoup(page.text, 'html.parser') content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'}) - if content is None or content.get('src') is None: - raise NotADownloadableLinkError('Could not read the page source') + try: + out = content['src'] + if not out: + raise KeyError + except (KeyError, TypeError): + raise SiteDownloaderError('Could not find source link') - return content.get('src') + return out diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index d72f66a..832729a 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -50,17 +50,24 @@ class Imgur(BaseDownloader): script_regex = re.compile(r'\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'') chosen_script = list(filter(lambda s: re.search(script_regex, s), scripts)) if len(chosen_script) != 1: - raise NotADownloadableLinkError(f'Could not read page source from {link}') - else: - chosen_script = chosen_script[0] + raise SiteDownloaderError(f'Could not read page source from {link}') + + chosen_script = chosen_script[0] outer_regex = re.compile(r'widgetFactory\.mergeConfig\(\'gallery\', 
({.*})\);') - image_dict = re.search(outer_regex, chosen_script).group(1) inner_regex = re.compile(r'image\s*:(.*),\s*group') - image_dict = re.search(inner_regex, image_dict).group(1) + try: + image_dict = re.search(outer_regex, chosen_script).group(1) + image_dict = re.search(inner_regex, image_dict).group(1) + except AttributeError: + raise SiteDownloaderError(f'Could not find image dictionary in page source') + + try: + image_dict = json.loads(image_dict) + except json.JSONDecodeError as e: + raise SiteDownloaderError(f'Could not parse received dict as JSON: {e}') - image_dict = json.loads(image_dict) return image_dict @staticmethod diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 8f16447..536532e 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -7,7 +7,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -22,7 +22,11 @@ class Redgifs(GifDeliveryNetwork): @staticmethod def _get_link(url: str) -> str: - redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) + try: + redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) + except AttributeError: + raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}') + url = 'https://redgifs.com/watch/' + redgif_id headers = { @@ -36,7 +40,13 @@ class Redgifs(GifDeliveryNetwork): content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) if content is None: - 
raise SiteDownloaderError('Could not read the page source') + + try: + out = json.loads(content.contents[0])['video']['contentUrl'] + except (IndexError, KeyError): + raise SiteDownloaderError('Failed to find JSON data in page') + except json.JSONDecodeError as e: + raise SiteDownloaderError(f'Received data was not valid JSON: {e}') - out = json.loads(content.contents[0])['video']['contentUrl'] return out