bulk-downloader-for-reddit/bdfr/downloader.py

136 lines
6.0 KiB
Python
Raw Normal View History

2021-02-11 07:10:40 +08:00
#!/usr/bin/env python3
# coding=utf-8
import hashlib
2021-04-05 12:47:39 +08:00
import logging.handlers
import os
import time
2021-02-11 07:10:40 +08:00
from datetime import datetime
2021-03-26 08:42:51 +08:00
from multiprocessing import Pool
2021-02-11 07:10:40 +08:00
from pathlib import Path
import praw
import praw.exceptions
2021-02-11 07:10:40 +08:00
import praw.models
from bdfr import exceptions as errors
2021-04-12 15:58:32 +08:00
from bdfr.configuration import Configuration
from bdfr.connector import RedditConnector
2021-04-12 15:58:32 +08:00
from bdfr.site_downloaders.download_factory import DownloadFactory
2021-02-11 07:10:40 +08:00
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
2021-03-26 08:42:51 +08:00
def _calc_hash(existing_file: Path):
    """Compute the MD5 digest of a file on disk.

    The file is read in 1 MiB chunks so that it never has to be held in memory
    in full. The path is returned together with the digest so that a caller
    mapping this function over many files (as ``scan_existing_files`` does with
    ``Pool.map``) can pair each hash with the file it came from.

    Args:
        existing_file: Path of the file to hash.

    Returns:
        A tuple ``(existing_file, file_hash)`` where ``file_hash`` is the
        hexadecimal MD5 digest of the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024
    md5_hash = hashlib.md5()
    with open(existing_file, 'rb') as file:
        # read() returns b'' at end of file, which ends the loop.
        while chunk := file.read(chunk_size):
            md5_hash.update(chunk)
    return existing_file, md5_hash.hexdigest()
2021-03-26 08:42:51 +08:00
class RedditDownloader(RedditConnector):
    """Download the resources of Reddit submissions into the download directory.

    Files are tracked by MD5 hash in ``self.master_hash_list`` (hash -> path) so
    that duplicates can be skipped (``no_dupes``) or hard-linked
    (``make_hard_links``) instead of being written again.
    """

    def __init__(self, args: Configuration):
        """Initialise the connector and optionally index files already on disk.

        Args:
            args: Run configuration. When ``args.search_existing`` is truthy the
                download directory is scanned and its hashes become the initial
                ``master_hash_list``.
        """
        super().__init__(args)
        # NOTE(review): when search_existing is falsy, master_hash_list is presumably
        # initialised by RedditConnector.__init__ -- confirm against the base class.
        if self.args.search_existing:
            self.master_hash_list = self.scan_existing_files(self.download_directory)

    def download(self):
        """Download every submission yielded by each generator in ``self.reddit_lists``."""
        for generator in self.reddit_lists:
            for submission in generator:
                self._download_submission(submission)

    def _download_submission(self, submission: praw.models.Submission):
        """Filter a single submission and write its resources to disk.

        The submission is skipped (with a log message) when it is in the
        exclusion list, belongs to a skipped subreddit, was posted by an ignored
        user, is not a ``praw.models.Submission``, fails the URL filter, has no
        matching site downloader, or its downloader module is disabled.

        For each resource, an existing destination or a resource rejected by the
        download filter is skipped with ``continue``. A download error, a
        duplicate hash handled by ``no_dupes``/``make_hard_links``, or a write
        error ends processing of the whole submission with ``return``.

        Args:
            submission: The object to download.
        """
        if submission.id in self.excluded_submission_ids:
            logger.debug(f'Object {submission.id} in exclusion list, skipping')
            return
        elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
            logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
            return
        elif (submission.author and submission.author.name in self.args.ignore_user) or \
                (submission.author is None and 'DELETED' in self.args.ignore_user):
            # A missing author is matched against the literal name 'DELETED'.
            logger.debug(
                f'Submission {submission.id} in {submission.subreddit.display_name} skipped'
                f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user')
            return
        elif not isinstance(submission, praw.models.Submission):
            logger.warning(f'{submission.id} is not a submission')
            return
        elif not self.download_filter.check_url(submission.url):
            logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}')
            return

        logger.debug(f'Attempting to download submission {submission.id}')
        try:
            downloader_class = DownloadFactory.pull_lever(submission.url)
            downloader = downloader_class(submission)
            logger.debug(f'Using {downloader_class.__name__} with url {submission.url}')
        except errors.NotADownloadableLinkError as e:
            logger.error(f'Could not download submission {submission.id}: {e}')
            return
        if downloader_class.__name__.lower() in self.args.disable_module:
            logger.debug(f'Submission {submission.id} skipped due to disabled module {downloader_class.__name__}')
            return
        try:
            content = downloader.find_resources(self.authenticator)
        except errors.SiteDownloaderError as e:
            logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}: {e}')
            return

        for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
            if destination.exists():
                logger.debug(f'File {destination} from submission {submission.id} already exists, continuing')
                continue
            elif not self.download_filter.check_resource(res):
                logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                continue
            try:
                res.download({'max_wait_time': self.args.max_wait_time})
            except errors.BulkDownloaderException as e:
                logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
                             f'with downloader {downloader_class.__name__}: {e}')
                return
            resource_hash = res.hash.hexdigest()
            destination.parent.mkdir(parents=True, exist_ok=True)
            if resource_hash in self.master_hash_list:
                if self.args.no_dupes:
                    logger.info(
                        f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere')
                    return
                elif self.args.make_hard_links:
                    # os.link(src, dst) is what Path.link_to did; Path.link_to was
                    # deprecated in Python 3.10 and removed in 3.12.
                    os.link(self.master_hash_list[resource_hash], destination)
                    logger.info(
                        f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}'
                        f' in submission {submission.id}')
                    return
            try:
                with open(destination, 'wb') as file:
                    file.write(res.content)
                logger.debug(f'Written file to {destination}')
            except OSError as e:
                logger.exception(e)
                logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')
                return
            # Stamp the file's access/modification times with the submission's creation time.
            creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
            os.utime(destination, (creation_time, creation_time))
            self.master_hash_list[resource_hash] = destination
            logger.debug(f'Hash added to master list: {resource_hash}')
        logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')

    @staticmethod
    def scan_existing_files(directory: Path) -> dict[str, Path]:
        """Hash every file under ``directory`` and return a hash -> path mapping.

        Args:
            directory: Root directory that is walked recursively.

        Returns:
            A dict mapping each file's hexadecimal MD5 digest to its path. When
            several files share a digest, the one processed last is kept.
        """
        files = []
        for dirpath, _, filenames in os.walk(directory):
            files.extend(Path(dirpath, file) for file in filenames)
        logger.info(f'Calculating hashes for {len(files)} files')
        if not files:
            # Nothing to hash: avoid spawning worker processes for no work.
            return {}

        # The context manager tears the pool down even if map() raises.
        with Pool(15) as pool:
            results = pool.map(_calc_hash, files)

        return {file_hash: path for path, file_hash in results}