bulk-downloader-for-reddit/bdfr/downloader.py

#!/usr/bin/env python3
# coding=utf-8

import hashlib
import logging.handlers
import os
import time
from datetime import datetime
from multiprocessing import Pool
from pathlib import Path

import praw
import praw.exceptions
import praw.models

from bdfr import exceptions as errors
from bdfr.configuration import Configuration
from bdfr.connector import RedditConnector
from bdfr.site_downloaders.download_factory import DownloadFactory

logger = logging.getLogger(__name__)


def _calc_hash(existing_file: Path):
    """Compute the MD5 digest of a file without loading it into memory at once.

    The file is consumed in pieces of at most 1 MiB. The function lives at
    module level and is handed to ``Pool.map`` by ``scan_existing_files``.

    Args:
        existing_file: Path of the file to hash.

    Returns:
        A ``(existing_file, hex_digest)`` tuple, where ``hex_digest`` is the
        hexadecimal MD5 digest of the file's bytes.
    """
    block_length = 1024 * 1024
    digest = hashlib.md5()
    with open(existing_file, 'rb') as stream:
        # iter() with a sentinel keeps reading until the first empty read (end of file)
        for block in iter(lambda: stream.read(block_length), b''):
            digest.update(block)
    return existing_file, digest.hexdigest()


class RedditDownloader(RedditConnector):
    """Download the resources of Reddit submissions into the download directory.

    Every downloaded resource is tracked by its hash in ``self.master_hash_list``
    (hash -> path of the file written) so that duplicates can be skipped or
    hard-linked, depending on the ``no_dupes`` / ``make_hard_links`` options.
    """

    def __init__(self, args: Configuration):
        """Set up the connector and optionally pre-hash the files already on disk.

        Args:
            args: Run configuration; ``args.search_existing`` enables the scan of
                the download directory.
        """
        super().__init__(args)
        if self.args.search_existing:
            # Seed the hash table with what is already in the download directory.
            # NOTE(review): when this option is off, master_hash_list is presumably
            # initialised by RedditConnector - confirm against the superclass.
            self.master_hash_list = self.scan_existing_files(self.download_directory)

    def download(self):
        """Download every submission yielded by every generator in ``self.reddit_lists``."""
        for generator in self.reddit_lists:
            for submission in generator:
                self._download_submission(submission)

    def _download_submission(self, submission: praw.models.Submission):
        """Filter a single submission and write its resources to disk.

        The submission is skipped (with a log entry) when it is excluded by ID,
        subreddit, author or URL filter, when it is not a ``Submission``, when no
        downloader matches its URL, or when the matching downloader is disabled.

        Errors are logged rather than raised. Note that a failed resource
        download, a duplicate handled via ``no_dupes`` / ``make_hard_links``, or a
        failed file write ends processing of the whole submission, so any
        remaining resources of that submission are not attempted.

        Args:
            submission: The object to download; expected to be a PRAW submission.
        """
        # --- Submission-level filters -------------------------------------------------
        if submission.id in self.excluded_submission_ids:
            logger.debug(f'Object {submission.id} in exclusion list, skipping')
            return
        elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
            logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
            return
        elif (submission.author and submission.author.name in self.args.ignore_user) or \
                (submission.author is None and 'DELETED' in self.args.ignore_user):
            # A missing author is matched against the literal name 'DELETED'
            logger.debug(
                f'Submission {submission.id} in {submission.subreddit.display_name} skipped'
                f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user')
            return
        elif not isinstance(submission, praw.models.Submission):
            logger.warning(f'{submission.id} is not a submission')
            return
        elif not self.download_filter.check_url(submission.url):
            logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}')
            return

        # --- Pick the site downloader for this URL ------------------------------------
        logger.debug(f'Attempting to download submission {submission.id}')
        try:
            downloader_class = DownloadFactory.pull_lever(submission.url)
            downloader = downloader_class(submission)
            logger.debug(f'Using {downloader_class.__name__} with url {submission.url}')
        except errors.NotADownloadableLinkError as e:
            logger.error(f'Could not download submission {submission.id}: {e}')
            return
        if downloader_class.__name__.lower() in self.args.disable_module:
            logger.debug(f'Submission {submission.id} skipped due to disabled module {downloader_class.__name__}')
            return

        # --- Resolve the resources the submission points at ---------------------------
        try:
            content = downloader.find_resources(self.authenticator)
        except errors.SiteDownloaderError as e:
            logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}: {e}')
            return

        # --- Fetch and store each resource --------------------------------------------
        for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
            if destination.exists():
                logger.debug(f'File {destination} from submission {submission.id} already exists, continuing')
                continue
            elif not self.download_filter.check_resource(res):
                logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                continue
            try:
                res.download({'max_wait_time': self.args.max_wait_time})
            except errors.BulkDownloaderException as e:
                logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
                             f'with downloader {downloader_class.__name__}: {e}')
                return
            resource_hash = res.hash.hexdigest()
            destination.parent.mkdir(parents=True, exist_ok=True)
            if resource_hash in self.master_hash_list:
                if self.args.no_dupes:
                    logger.info(
                        f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere')
                    return
                elif self.args.make_hard_links:
                    existing_file = self.master_hash_list[resource_hash]
                    # os.link(src, dst) creates `destination` as a hard link to the existing file.
                    # It replaces Path.link_to(), which was removed in Python 3.12.
                    os.link(existing_file, destination)
                    logger.info(
                        f'Hard link made linking {destination} to {existing_file}'
                        f' in submission {submission.id}')
                    return
            try:
                with open(destination, 'wb') as file:
                    file.write(res.content)
                logger.debug(f'Written file to {destination}')
            except OSError as e:
                logger.exception(e)
                logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')
                return
            # Stamp the file's access and modification times with the submission's creation time
            creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
            os.utime(destination, (creation_time, creation_time))
            self.master_hash_list[resource_hash] = destination
            logger.debug(f'Hash added to master list: {resource_hash}')
        logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')

    @staticmethod
    def scan_existing_files(directory: Path) -> dict[str, Path]:
        """Hash every file under ``directory`` (recursively) using a process pool.

        Args:
            directory: Root of the tree to scan.

        Returns:
            A mapping of MD5 hex digest to file path. When several files share a
            digest, only one of them is kept (the last one in walk order).
        """
        files = []
        for dirpath, _, filenames in os.walk(directory):
            files.extend(Path(dirpath, file) for file in filenames)
        logger.info(f'Calculating hashes for {len(files)} files')

        # The context manager guarantees the worker processes are cleaned up,
        # including when map() raises (e.g. a file vanished or is unreadable).
        with Pool(15) as pool:
            results = pool.map(_calc_hash, files)

        return {file_hash: file_path for file_path, file_hash in results}
Move to different program structure 2021-02-11 07:10:40 +08:00			`#!/usr/bin/env python3`
			`# coding=utf-8`

Add function to calculate all existing file hashes if wanted 2021-03-11 12:18:48 +08:00			`import hashlib`
Switch to rotating log files 2021-04-05 12:47:39 +08:00			`import logging.handlers`
Add function to calculate all existing file hashes if wanted 2021-03-11 12:18:48 +08:00			`import os`
Set file creation times to the post creation time (#391) 2021-05-17 18:49:35 +08:00			`import time`
Move to different program structure 2021-02-11 07:10:40 +08:00			`from datetime import datetime`
Calculate existing hashes in parallel 2021-03-26 08:42:51 +08:00			`from multiprocessing import Pool`
Move to different program structure 2021-02-11 07:10:40 +08:00			`from pathlib import Path`

			`import praw`
Allow subreddits and multireddits to fail individually 2021-03-11 15:18:21 +08:00			`import praw.exceptions`
Move to different program structure 2021-02-11 07:10:40 +08:00			`import praw.models`

Refactor out super class RedditConnector 2021-05-17 08:56:44 +08:00			`from bdfr import exceptions as errors`
Rename module 2021-04-12 15:58:32 +08:00			`from bdfr.configuration import Configuration`
Refactor out super class RedditConnector 2021-05-17 08:56:44 +08:00			`from bdfr.connector import RedditConnector`
Rename module 2021-04-12 15:58:32 +08:00			`from bdfr.site_downloaders.download_factory import DownloadFactory`
Move to different program structure 2021-02-11 07:10:40 +08:00
			`logger = logging.getLogger(__name__)`


Calculate existing hashes in parallel 2021-03-26 08:42:51 +08:00			`def _calc_hash(existing_file: Path):`
Rename variable 2021-05-23 10:13:44 +08:00			`chunk_size = 1024 * 1024`
Read files in chunks instead when hashing (#416) 2021-05-22 02:41:57 +08:00			`md5_hash = hashlib.md5()`
Calculate existing hashes in parallel 2021-03-26 08:42:51 +08:00			`with open(existing_file, 'rb') as file:`
Rename variable 2021-05-23 10:13:44 +08:00			`chunk = file.read(chunk_size)`
Read files in chunks instead when hashing (#416) 2021-05-22 02:41:57 +08:00			`while chunk:`
			`md5_hash.update(chunk)`
Rename variable 2021-05-23 10:13:44 +08:00			`chunk = file.read(chunk_size)`
Read files in chunks instead when hashing (#416) 2021-05-22 02:41:57 +08:00			`file_hash = md5_hash.hexdigest()`
			`return existing_file, file_hash`
Calculate existing hashes in parallel 2021-03-26 08:42:51 +08:00

Refactor out super class RedditConnector 2021-05-17 08:56:44 +08:00			`class RedditDownloader(RedditConnector):`
Abstract configuration into class 2021-03-10 19:47:57 +08:00			`def __init__(self, args: Configuration):`
Refactor out super class RedditConnector 2021-05-17 08:56:44 +08:00			`super(RedditDownloader, self).__init__(args)`
Add option to search for files pre-emptively 2021-03-12 11:24:25 +08:00			`if self.args.search_existing:`
Add ability to make hard links for duplicates 2021-03-20 10:03:53 +08:00			`self.master_hash_list = self.scan_existing_files(self.download_directory)`
Implement changes in downloader 2021-02-26 16:56:21 +08:00
Move to different program structure 2021-02-11 07:10:40 +08:00			`def download(self):`
			`for generator in self.reddit_lists:`
			`for submission in generator:`
Add a combined command for the archiver and downloader: `clone` (#433) * Simplify downloader function * Add basic scraper class * Add "scrape" command * Rename "scrape" command to "clone" * Add integration tests for clone command * Update README * Fix failing test 2021-06-06 18:29:09 +08:00			`self._download_submission(submission)`
Move to different program structure 2021-02-11 07:10:40 +08:00
			`def _download_submission(self, submission: praw.models.Submission):`
Add a combined command for the archiver and downloader: `clone` (#433) * Simplify downloader function * Add basic scraper class * Add "scrape" command * Rename "scrape" command to "clone" * Add integration tests for clone command * Update README * Fix failing test 2021-06-06 18:29:09 +08:00			`if submission.id in self.excluded_submission_ids:`
			`logger.debug(f'Object {submission.id} in exclusion list, skipping')`
			`return`
			`elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:`
			`logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')`
			`return`
Update downloader.py 2021-11-29 22:23:04 +08:00			`elif (submission.author and submission.author.name in self.args.ignore_user) or \`
			`(submission.author is None and 'DELETED' in self.args.ignore_user):`
add test. fix typos. 2021-11-01 21:28:46 +08:00			`logger.debug(`
Add integration test for downloader option 2021-11-24 08:54:29 +08:00			`f'Submission {submission.id} in {submission.subreddit.display_name} skipped'`
Update downloader.py 2021-11-29 22:23:04 +08:00			`f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user')`
add test. fix typos. 2021-11-01 21:28:46 +08:00			`return`
Add a combined command for the archiver and downloader: `clone` (#433) * Simplify downloader function * Add basic scraper class * Add "scrape" command * Rename "scrape" command to "clone" * Add integration tests for clone command * Update README * Fix failing test 2021-06-06 18:29:09 +08:00			`elif not isinstance(submission, praw.models.Submission):`
Filter non-submissions (#212) 2021-03-25 14:28:08 +08:00			`logger.warning(f'{submission.id} is not a submission')`
			`return`
Check submission URL against filter before factory 2021-06-23 12:30:39 +08:00			`elif not self.download_filter.check_url(submission.url):`
			`logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}')`
			`return`
Add a combined command for the archiver and downloader: `clone` (#433) * Simplify downloader function * Add basic scraper class * Add "scrape" command * Rename "scrape" command to "clone" * Add integration tests for clone command * Update README * Fix failing test 2021-06-06 18:29:09 +08:00
			`logger.debug(f'Attempting to download submission {submission.id}')`
Add option to search for files pre-emptively 2021-03-12 11:24:25 +08:00			`try:`
			`downloader_class = DownloadFactory.pull_lever(submission.url)`
			`downloader = downloader_class(submission)`
Add logging entry 2021-03-13 11:36:18 +08:00			`logger.debug(f'Using {downloader_class.__name__} with url {submission.url}')`
Add option to search for files pre-emptively 2021-03-12 11:24:25 +08:00			`except errors.NotADownloadableLinkError as e:`
Correct logger message 2021-04-05 12:54:26 +08:00			`logger.error(f'Could not download submission {submission.id}: {e}')`
Add option to search for files pre-emptively 2021-03-12 11:24:25 +08:00			`return`
Add ability to disable modules (#434) * Fix test name to match standard * Rename file * Add ability to disable modules * Update README * Fix missing comma * Fix more missing commas. sigh... Co-authored-by: Ali Parlakçı <parlakciali@gmail.com> 2021-06-06 18:47:56 +08:00			`if downloader_class.__name__.lower() in self.args.disable_module:`
			`logger.debug(f'Submission {submission.id} skipped due to disabled module {downloader_class.__name__}')`
			`return`
Add error catch for youtube and site downloaders 2021-03-13 10:01:30 +08:00			`try:`
			`content = downloader.find_resources(self.authenticator)`
Alter some logging messages 2021-04-04 06:38:48 +08:00			`except errors.SiteDownloaderError as e:`
			`logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}: {e}')`
Add error catch for youtube and site downloaders 2021-03-13 10:01:30 +08:00			`return`
Add option to search for files pre-emptively 2021-03-12 11:24:25 +08:00			`for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):`
			`if destination.exists():`
Update logging messages to include submission IDs 2021-05-22 09:47:48 +08:00			`logger.debug(f'File {destination} from submission {submission.id} already exists, continuing')`
Simplify method structure 2021-05-21 14:50:05 +08:00			`continue`
Make downloadfilter apply itself to Resources 2021-05-03 11:57:06 +08:00			`elif not self.download_filter.check_resource(res):`
Check submission URL against filter before factory 2021-06-23 12:30:39 +08:00			`logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')`
Simplify method structure 2021-05-21 14:50:05 +08:00			`continue`
			`try:`
Add in downloader parameters 2021-07-29 17:10:10 +08:00			`res.download({'max_wait_time': self.args.max_wait_time})`
Simplify method structure 2021-05-21 14:50:05 +08:00			`except errors.BulkDownloaderException as e:`
			`logger.error(f'Failed to download resource {res.url} in submission {submission.id} '`
			`f'with downloader {downloader_class.__name__}: {e}')`
			`return`
			`resource_hash = res.hash.hexdigest()`
			`destination.parent.mkdir(parents=True, exist_ok=True)`
			`if resource_hash in self.master_hash_list:`
			`if self.args.no_dupes:`
			`logger.info(`
			`f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere')`
Add shortcut in download for certain errors 2021-03-15 12:00:21 +08:00			`return`
Simplify method structure 2021-05-21 14:50:05 +08:00			`elif self.args.make_hard_links:`
			`self.master_hash_list[resource_hash].link_to(destination)`
			`logger.info(`
Update logging messages to include submission IDs 2021-05-22 09:47:48 +08:00			`f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}'`
			`f' in submission {submission.id}')`
Simplify method structure 2021-05-21 14:50:05 +08:00			`return`
Catch error with MacOS writing per issue #407 2021-05-23 10:17:14 +08:00			`try:`
			`with open(destination, 'wb') as file:`
			`file.write(res.content)`
			`logger.debug(f'Written file to {destination}')`
			`except OSError as e:`
			`logger.exception(e)`
Update logging message 2021-06-13 07:49:42 +08:00			`logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')`
Add missing return statement 2021-06-10 16:59:22 +08:00			`return`
Simplify method structure 2021-05-21 14:50:05 +08:00			`creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())`
			`os.utime(destination, (creation_time, creation_time))`
			`self.master_hash_list[resource_hash] = destination`
			`logger.debug(f'Hash added to master list: {resource_hash}')`
Fix indent 2021-05-31 11:42:03 +08:00			`logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')`
Add option to search for files pre-emptively 2021-03-12 11:24:25 +08:00
			`@staticmethod`
Add ability to make hard links for duplicates 2021-03-20 10:03:53 +08:00			`def scan_existing_files(directory: Path) -> dict[str, Path]:`
Add function to calculate all existing file hashes if wanted 2021-03-11 12:18:48 +08:00			`files = []`
Add option to search for files pre-emptively 2021-03-12 11:24:25 +08:00			`for (dirpath, dirnames, filenames) in os.walk(directory):`
Add function to calculate all existing file hashes if wanted 2021-03-11 12:18:48 +08:00			`files.extend([Path(dirpath, file) for file in filenames])`
			`logger.info(f'Calculating hashes for {len(files)} files')`
Calculate existing hashes in parallel 2021-03-26 08:42:51 +08:00
			`pool = Pool(15)`
			`results = pool.map(_calc_hash, files)`
			`pool.close()`

			`hash_list = {res[1]: res[0] for res in results}`
Add function to calculate all existing file hashes if wanted 2021-03-11 12:18:48 +08:00			`return hash_list`