2021-03-13 18:18:30 +08:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# coding=utf-8
|
|
|
|
|
|
|
|
import json
import logging
import re
from typing import Iterator, Union

import dict2xml
import praw.models
import yaml

from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry
from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry
from bdfr.configuration import Configuration
from bdfr.connector import RedditConnector
from bdfr.exceptions import ArchiverError
from bdfr.resource import Resource
|
2021-03-13 18:18:30 +08:00
|
|
|
|
|
|
|
# Module-level logger named after this module, per the stdlib logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2021-05-17 08:56:44 +08:00
|
|
|
class Archiver(RedditConnector):
    """Archive Reddit submissions and comments as serialized records.

    Instead of downloading linked media (as the regular downloader does),
    each submission/comment is compiled into a dict via an archive-entry
    wrapper and written to disk as JSON, XML, or YAML depending on
    ``self.args.format``.
    """

    def __init__(self, args: Configuration):
        super().__init__(args)

    def download(self):
        """Iterate every configured source and archive each submission found."""
        for generator in self.reddit_lists:
            for submission in generator:
                logger.debug(f'Attempting to archive submission {submission.id}')
                self.write_entry(submission)

    def get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
        """Resolve the user-supplied ``--link`` values into praw objects.

        A 6-character word id is treated as a submission id, a 7-character
        word id as a comment id, and anything else as a full submission URL.
        """
        supplied_submissions = []
        for sub_id in self.args.link:
            # Both id branches now use the same regex style; a bare len() check
            # would also accept 6-char strings containing non-word characters.
            if re.match(r'^\w{6}$', sub_id):
                supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
            elif re.match(r'^\w{7}$', sub_id):
                supplied_submissions.append(self.reddit_instance.comment(id=sub_id))
            else:
                supplied_submissions.append(self.reddit_instance.submission(url=sub_id))
        return [supplied_submissions]

    def get_user_data(self) -> list[Iterator]:
        """Extend the base user data with each user's comment stream.

        Comments are only added when both ``--user`` and ``--all-comments``
        were supplied; they are sorted with the connector's configured sort.
        """
        results = super().get_user_data()
        if self.args.user and self.args.all_comments:
            sort = self.determine_sort_function()
            for user in self.args.user:
                logger.debug(f'Retrieving comments of user {user}')
                results.append(sort(self.reddit_instance.redditor(user).comments, limit=self.args.limit))
        return results

    @staticmethod
    def _pull_lever_entry_factory(
            praw_item: Union[praw.models.Submission, praw.models.Comment]) -> BaseArchiveEntry:
        """Wrap a praw item in the matching archive-entry class.

        Raises:
            ArchiverError: if the item is neither a Submission nor a Comment.
        """
        if isinstance(praw_item, praw.models.Submission):
            return SubmissionArchiveEntry(praw_item)
        elif isinstance(praw_item, praw.models.Comment):
            return CommentArchiveEntry(praw_item)
        else:
            raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}')

    def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comment]):
        """Serialize one praw item to disk in the configured format.

        With ``--comment-context``, a comment is promoted to its parent
        submission so the full thread is archived instead.

        Raises:
            ArchiverError: if ``self.args.format`` is not json/xml/yaml.
        """
        if self.args.comment_context and isinstance(praw_item, praw.models.Comment):
            logger.debug(f'Converting comment {praw_item.id} to submission {praw_item.submission.id}')
            praw_item = praw_item.submission
        archive_entry = self._pull_lever_entry_factory(praw_item)
        # Dispatch table instead of an if/elif chain; KeyError means an
        # unsupported format string was supplied.
        writers = {
            'json': self._write_entry_json,
            'xml': self._write_entry_xml,
            'yaml': self._write_entry_yaml,
        }
        try:
            writer = writers[self.args.format]
        except KeyError:
            raise ArchiverError(f'Unknown format {self.args.format} given') from None
        writer(archive_entry)
        logger.info(f'Record for entry item {praw_item.id} written to disk')

    def _write_entry_json(self, entry: BaseArchiveEntry):
        """Write the entry as a .json file."""
        resource = Resource(entry.source, '', lambda: None, '.json')
        content = json.dumps(entry.compile())
        self._write_content_to_disk(resource, content)

    def _write_entry_xml(self, entry: BaseArchiveEntry):
        """Write the entry as a .xml file (wrapped in a single <root> element)."""
        resource = Resource(entry.source, '', lambda: None, '.xml')
        content = dict2xml.dict2xml(entry.compile(), wrap='root')
        self._write_content_to_disk(resource, content)

    def _write_entry_yaml(self, entry: BaseArchiveEntry):
        """Write the entry as a .yaml file."""
        resource = Resource(entry.source, '', lambda: None, '.yaml')
        content = yaml.dump(entry.compile())
        self._write_content_to_disk(resource, content)

    def _write_content_to_disk(self, resource: Resource, content: str):
        """Format the destination path, create parent folders, and write UTF-8 text."""
        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
        file_path.parent.mkdir(exist_ok=True, parents=True)
        with open(file_path, 'w', encoding="utf-8") as file:
            logger.debug(
                f'Writing entry {resource.source_submission.id} to file in {resource.extension[1:].upper()}'
                f' format at {file_path}')
            file.write(content)
|