bulk-downloader-for-reddit/bdfr/archiver.py

#!/usr/bin/env python3
# coding=utf-8

import json
import logging
import re
from time import sleep
from typing import Iterator, Union

import dict2xml
import praw.models
import prawcore
import yaml

from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry
from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry
from bdfr.configuration import Configuration
from bdfr.connector import RedditConnector
from bdfr.exceptions import ArchiverError
from bdfr.resource import Resource

# Module-level logger named after this module; the Archiver class below uses it
# for its debug, info and error messages.
logger = logging.getLogger(__name__)


class Archiver(RedditConnector):
    """Write Reddit submissions and comments to disk as JSON, XML or YAML records."""

    def __init__(self, args: Configuration):
        """Initialise the underlying connector with the given configuration."""
        super().__init__(args)

    def download(self):
        """Archive every item produced by the generators in ``self.reddit_lists``.

        Items whose author is listed in ``args.ignore_user`` (a missing author is
        matched by the literal ``"DELETED"``) and items whose ID is in
        ``self.excluded_submission_ids`` are skipped. A PRAW error raised while
        handling a single item is logged and the loop moves on to the next item.
        A PRAW error raised by the generator itself ends that generator; it is
        logged, and after a 60 second pause the next generator is processed.
        """
        for generator in self.reddit_lists:
            # Track the last item this generator yielded so the outer handler can
            # report it. It must be reset per generator: the loop variable is
            # unbound if the very first fetch fails, and would otherwise still
            # point at an item from a previous generator.
            last_item = None
            try:
                for submission in generator:
                    last_item = submission
                    try:
                        author = submission.author
                        if (author and author.name in self.args.ignore_user) or (
                            author is None and "DELETED" in self.args.ignore_user
                        ):
                            logger.debug(
                                f"Submission {submission.id} in {submission.subreddit.display_name} skipped due to"
                                f" {author.name if author else 'DELETED'} being an ignored user"
                            )
                            continue
                        if submission.id in self.excluded_submission_ids:
                            logger.debug(f"Object {submission.id} in exclusion list, skipping")
                            continue
                        logger.debug(f"Attempting to archive submission {submission.id}")
                        self.write_entry(submission)
                    except prawcore.PrawcoreException as e:
                        logger.error(f"Submission {submission.id} failed to be archived due to a PRAW exception: {e}")
            except prawcore.PrawcoreException as e:
                if last_item is None:
                    # Nothing was yielded yet, so there is no previous ID to cite.
                    logger.error(f"The first submission failed to download due to a PRAW exception: {e}")
                else:
                    logger.error(
                        f"The submission after {last_item.id} failed to download due to a PRAW exception: {e}"
                    )
                logger.debug("Waiting 60 seconds to continue")
                sleep(60)

    def get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
        """Build lazy PRAW objects for every entry in ``args.link``.

        Each entry is classified by its shape:

        * exactly six characters: treated as a submission ID;
        * exactly seven word characters: treated as a comment ID;
        * anything else: treated as a submission URL.

        Returns:
            A single-element list wrapping the list of created objects.
        """
        supplied_submissions = []
        for sub_id in self.args.link:
            if len(sub_id) == 6:
                supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
            elif re.match(r"^\w{7}$", sub_id):
                supplied_submissions.append(self.reddit_instance.comment(id=sub_id))
            else:
                supplied_submissions.append(self.reddit_instance.submission(url=sub_id))
        return [supplied_submissions]

    def get_user_data(self) -> list[Iterator]:
        """Return the parent's user listings, plus user comments when requested.

        When both ``args.user`` and ``args.all_comments`` are set, one comment
        listing per user (sorted by the configured sort function and limited by
        ``args.limit``) is appended to the parent's results.
        """
        results = super().get_user_data()
        if self.args.user and self.args.all_comments:
            sort = self.determine_sort_function()
            for user in self.args.user:
                logger.debug(f"Retrieving comments of user {user}")
                results.append(sort(self.reddit_instance.redditor(user).comments, limit=self.args.limit))
        return results

    @staticmethod
    def _pull_lever_entry_factory(praw_item: Union[praw.models.Submission, praw.models.Comment]) -> BaseArchiveEntry:
        """Wrap a PRAW item in the matching archive entry class.

        Raises:
            ArchiverError: if the item is neither a submission nor a comment.
        """
        if isinstance(praw_item, praw.models.Submission):
            return SubmissionArchiveEntry(praw_item)
        if isinstance(praw_item, praw.models.Comment):
            return CommentArchiveEntry(praw_item)
        raise ArchiverError(f"Factory failed to classify item of type {type(praw_item).__name__}")

    def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comment]):
        """Serialise one PRAW item in the format named by ``args.format``.

        When ``args.comment_context`` is set, a comment is replaced by its parent
        submission before being archived.

        Raises:
            ArchiverError: if the item cannot be classified or the format is not
                one of ``json``, ``xml`` or ``yaml``.
        """
        if self.args.comment_context and isinstance(praw_item, praw.models.Comment):
            logger.debug(f"Converting comment {praw_item.id} to submission {praw_item.submission.id}")
            praw_item = praw_item.submission
        archive_entry = self._pull_lever_entry_factory(praw_item)
        if self.args.format == "json":
            self._write_entry_json(archive_entry)
        elif self.args.format == "xml":
            self._write_entry_xml(archive_entry)
        elif self.args.format == "yaml":
            self._write_entry_yaml(archive_entry)
        else:
            raise ArchiverError(f"Unknown format {self.args.format} given")
        logger.info(f"Record for entry item {praw_item.id} written to disk")

    def _write_entry_json(self, entry: BaseArchiveEntry):
        """Dump the compiled entry as JSON and write it to a ``.json`` file."""
        resource = Resource(entry.source, "", lambda: None, ".json")
        content = json.dumps(entry.compile())
        self._write_content_to_disk(resource, content)

    def _write_entry_xml(self, entry: BaseArchiveEntry):
        """Convert the compiled entry to XML under a ``root`` element and write it to a ``.xml`` file."""
        resource = Resource(entry.source, "", lambda: None, ".xml")
        content = dict2xml.dict2xml(entry.compile(), wrap="root")
        self._write_content_to_disk(resource, content)

    def _write_entry_yaml(self, entry: BaseArchiveEntry):
        """Dump the compiled entry as YAML and write it to a ``.yaml`` file."""
        resource = Resource(entry.source, "", lambda: None, ".yaml")
        content = yaml.dump(entry.compile())
        self._write_content_to_disk(resource, content)

    def _write_content_to_disk(self, resource: Resource, content: str):
        """Write ``content`` as UTF-8 text to the path formatted for ``resource``.

        The path comes from ``self.file_name_formatter`` relative to
        ``self.download_directory``; missing parent directories are created.
        """
        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
        file_path.parent.mkdir(exist_ok=True, parents=True)
        with open(file_path, "w", encoding="utf-8") as file:
            logger.debug(
                f"Writing entry {resource.source_submission.id} to file in {resource.extension[1:].upper()}"
                f" format at {file_path}"
            )
            file.write(content)