2021-02-11 07:09:49 +08:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# coding=utf-8
|
|
|
|
|
|
|
|
import hashlib
|
2021-03-11 11:20:39 +08:00
|
|
|
import logging
|
2021-02-11 07:09:49 +08:00
|
|
|
import re
|
2021-02-26 16:56:05 +08:00
|
|
|
import time
|
2021-04-18 19:24:11 +08:00
|
|
|
import urllib.parse
|
2021-07-27 11:39:49 +08:00
|
|
|
from typing import Callable, Optional
|
2021-02-11 07:09:49 +08:00
|
|
|
|
2021-02-26 16:56:05 +08:00
|
|
|
import _hashlib
|
|
|
|
import requests
|
2021-02-11 07:09:49 +08:00
|
|
|
from praw.models import Submission
|
|
|
|
|
2021-04-12 15:58:32 +08:00
|
|
|
from bdfr.exceptions import BulkDownloaderException
|
2021-02-26 16:56:05 +08:00
|
|
|
|
2021-03-11 11:20:39 +08:00
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
2021-02-11 07:09:49 +08:00
|
|
|
|
|
|
|
class Resource:
    """A single downloadable file associated with a submission.

    Stores the URL, the callable used to fetch it and, once ``download`` has succeeded, the raw bytes together
    with an MD5 hash object computed over them.
    """

    # A dot followed by 3-5 characters at the very end of the URL path. "/" is excluded from those characters
    # so that a dot in an earlier path segment (e.g. "/v1.2/abc") is never mistaken for a file extension.
    _EXTENSION_PATTERN = re.compile(r".*(\.[^/]{3,5})$")

    def __init__(
        self,
        source_submission: "Submission",
        url: str,
        download_function: Callable,
        extension: Optional[str] = None,
    ):
        """Create a resource; nothing is downloaded until ``download`` is called.

        Args:
            source_submission: The submission this resource was found in.
            url: Location of the resource.
            download_function: Callable taking the download-parameters dict and returning the content bytes.
            extension: File extension including the leading dot. When falsy it is derived from ``url``.
        """
        self.source_submission = source_submission
        self.content: Optional[bytes] = None
        self.url = url
        self.hash: Optional[_hashlib.HASH] = None
        self.extension = extension
        self.download_function = download_function
        if not self.extension:
            self.extension = self._determine_extension()

    @staticmethod
    def retry_download(url: str) -> Callable:
        """Return a download function that fetches ``url`` through ``Resource.http_download``."""
        return lambda global_params: Resource.http_download(url, global_params)

    def download(self, download_parameters: Optional[dict] = None):
        """Fetch the content (unless already held) and compute its hash.

        Args:
            download_parameters: Passed unchanged to the download function; an empty dict is used when omitted.

        Raises:
            BulkDownloaderException: Propagated from the download function, or raised in place of a
                ``requests.exceptions.ConnectionError`` coming out of it.
        """
        if download_parameters is None:
            download_parameters = {}
        if not self.content:
            try:
                content = self.download_function(download_parameters)
            except requests.exceptions.ConnectionError as e:
                # Chain explicitly so the original network error stays visible in tracebacks.
                raise BulkDownloaderException(f"Could not download resource: {e}") from e
            # Empty results are ignored so that a later call tries again.
            if content:
                self.content = content
        if not self.hash and self.content:
            self.create_hash()

    def create_hash(self):
        """Store an MD5 hash object of ``self.content`` in ``self.hash``."""
        self.hash = hashlib.md5(self.content)

    def _determine_extension(self) -> Optional[str]:
        """Return the extension (with leading dot) found at the end of the URL path, or None.

        Only the path component is inspected, so query strings and fragments are ignored.
        """
        stripped_url = urllib.parse.urlsplit(self.url).path
        match = self._EXTENSION_PATTERN.search(stripped_url)
        if match:
            return match.group(1)
        return None

    @staticmethod
    def http_download(url: str, download_parameters: dict) -> Optional[bytes]:
        """GET ``url`` and return the response body, retrying on connection-level failures.

        HTTP 408 and 429 are treated like connection errors and retried. The wait between attempts starts at
        60 seconds and grows by 60 seconds per failure until it reaches ``max_wait_time``.

        Args:
            url: Address to request.
            download_parameters: Optional keys: ``"headers"`` (passed to ``requests.get``) and
                ``"max_wait_time"`` (seconds, default 300).

        Raises:
            BulkDownloaderException: For any other response that is not a 2xx with a non-empty body.
            requests.exceptions.ConnectionError: (or ``ChunkedEncodingError``) once the wait budget is spent.
        """
        headers = download_parameters.get("headers")
        max_wait_time = download_parameters.get("max_wait_time", 300)
        current_wait_time = 60
        while True:
            try:
                response = requests.get(url, headers=headers)
                if 200 <= response.status_code < 300 and response.content:
                    return response.content
                if response.status_code in (408, 429):
                    # Timeout / rate limit: route into the retry handler below.
                    raise requests.exceptions.ConnectionError(f"Response code {response.status_code}")
                raise BulkDownloaderException(
                    f"Unrecoverable error requesting resource: HTTP Code {response.status_code}"
                )
            except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
                # Check the budget BEFORE sleeping: waiting once more when no retry will follow only
                # delays the failure.
                if current_wait_time >= max_wait_time:
                    logger.error(f"Max wait time exceeded for resource at url {url}")
                    raise
                logger.warning(f"Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}")
                time.sleep(current_wait_time)
                current_wait_time += 60
|