Source code for diskette.core.loader

import json
import shutil
import tarfile
import tempfile
import requests
from pathlib import Path

from django.conf import settings
from django.template.defaultfilters import filesizeformat

from ..utils.filesystem import directory_size
from ..utils.loggers import NoOperationLogger
from ..utils import hashs
from ..utils.http import is_url

from .serializers import LoaddataSerializerAbstract
from .storages import StorageMixin



[docs]
class Loader(StorageMixin, LoaddataSerializerAbstract):
    """
    Archive loader which opens a Diskette archive and deploys the data dumps and
    storage contents it holds.

    Keyword Arguments:
        logger (object): Logger instance to report messages to. It must implement
            the usual logging message methods (error, info, etc..), see
            ``diskette.utils.loggers`` for the available ones. When no logger is
            given a ``NoOperationLogger`` instance is used instead.
    """
    # Name of the manifest file searched at the root of an extracted archive
    MANIFEST_FILENAME = "manifest.json"
    # Prefix given to the temporary directories where archives are extracted
    TEMPDIR_PREFIX = "diskette_"
    # Default filename used to store an archive downloaded from an URL
    DOWNLOAD_FILENAME = "diskette_downloaded_archive.tar.gz"

    def __init__(self, logger=None):
        # Fallback on the dummy logger when none (or a falsy value) is given
        if not logger:
            logger = NoOperationLogger()

        self.logger = logger


[docs]
    def get_dir_size(self, path):
        """
        Compute the size of files from a directory.

        This is a thin indirection over ``directory_size`` kept as a method so tests
        can mock it, since size computation is not stable.

        Arguments:
            path (pathlib.Path): Directory Path object.

        Returns:
            integer: Computed size of directory files.
        """
        computed_size = directory_size(path)

        return computed_size



[docs]
    def download_archive(self, url, destination=None):
        """
        Download archive from given URL into destination file.

        The response body is streamed and written chunk by chunk so the whole archive
        is never retained in memory. Redirect policy, timeout and chunk size are read
        from settings ``DISKETTE_DOWNLOAD_ALLOW_REDIRECT``,
        ``DISKETTE_DOWNLOAD_TIMEOUT`` and ``DISKETTE_DOWNLOAD_CHUNK``.

        Arguments:
            url (string): The archive URL to download.

        Keyword Arguments:
            destination (Path): A path where to write downloaded archive file. If not
                given, the archive file will be written as
                ``diskette_downloaded_archive.tar.gz`` into the current working
                directory.

        Raises:
            requests.exceptions.HTTPError: If the response status is an error one
                (raised from ``raise_for_status``).

        Returns:
            Path: Path to downloaded archive file.
        """
        destination = destination or Path.cwd() / self.DOWNLOAD_FILENAME
        self.logger.info(
            "Downloading archive from '{}' to '{}'".format(url, destination)
        )

        response = requests.get(
            url,
            allow_redirects=settings.DISKETTE_DOWNLOAD_ALLOW_REDIRECT,
            timeout=settings.DISKETTE_DOWNLOAD_TIMEOUT,
            stream=True
        )
        # A streamed response holds its connection until the body is fully consumed
        # or the response is closed, so always close it, even when the status check
        # or the file writing fails.
        try:
            response.raise_for_status()

            # Use the chunk way to avoid retaining the whole file in memory
            with open(destination, "wb") as fd:
                for chunk in response.iter_content(
                    chunk_size=settings.DISKETTE_DOWNLOAD_CHUNK
                ):
                    fd.write(chunk)
        finally:
            response.close()

        return destination



[docs]
    def open(self, source, download_destination=None, keep=False, checksum=None):
        """
        Extract archive files in a temporary directory.

        The archive is first downloaded when ``source`` is an URL, then its existence
        and checksum are verified and finally its content is extracted. The temporary
        directory is only created once the verifications are done so a failing
        verification does not leave an empty directory behind.

        .. Warning::
            Using this method, you are responsible to remove the temporary directory
            once you are done with it. Your code must be safe about it and remove it
            even when your code fails or you will produce a lot of remaining temporary
            directories.

        Arguments:
            source (Path or string): A Path object to the archive to open or a string
                for an URL to download the archive.

        Keyword Arguments:
            download_destination (Path): A path where to write downloaded archive file.
                If not given, the archive file will be written as
                ``diskette_downloaded_archive.tar.gz`` into the current working
                directory. This argument is useless with local archive file.
            keep (boolean): Archive won't be removed from filesystem if True, else the
                archive file is removed once extraction has been attempted, whether
                it succeeded or not.
            checksum (object): Manage if archive is checksumed or not depending value:

                * If ``None``: Checksum is done and just output to logs;
                * If ``True``: Checksum is done and just output to logs;
                * If ``False``: No checksum are done or compared;
                * Any other value is assumed to be a string for a checksum to compare.
                  Then a checksum is done on archive and compared to the given one, if
                  comparaison fails it results to a critical error.

        Returns:
            Path: The temporary directory where archive files have been extracted.
        """
        archive = source
        if is_url(source):
            archive = self.download_archive(source, destination=download_destination)

        if not archive.exists():
            self.logger.critical(
                "Given archive path does not exists: {}".format(archive)
            )

        # Perform checksum if not explicitely disabled
        if checksum is not False:
            archive_checksum = hashs.file_checksum(archive)
            self.logger.debug(
                "Archive checksum: {}".format(archive_checksum)
            )
            # Compare checksums if any
            if checksum and checksum is not True:
                if archive_checksum != checksum:
                    self.logger.critical(
                        "Checksums do not match. Your archive file is probably "
                        "corrupted."
                    )

        # The temporary directory where to extract archive content. It is created
        # after all verifications so nothing is leaked when one of them fails.
        destination_tmpdir = Path(tempfile.mkdtemp(prefix=self.TEMPDIR_PREFIX))

        try:
            # Extract everything in temporary directory
            # NOTE(review): members are extracted without any extraction filter, an
            # archive from an untrusted source could write outside of the temporary
            # directory. Consider ``filter="data"`` on Python versions supporting it.
            with tarfile.open(archive, "r:gz") as archive_fp:
                archive_fp.extractall(destination_tmpdir)
        except Exception:
            # Remove destination_tmpdir on extraction failure
            if destination_tmpdir.exists():
                shutil.rmtree(destination_tmpdir)
            # Then let the original exception go up with its traceback untouched
            raise
        finally:
            # Remove archive if not required to be keeped
            if not keep:
                archive.unlink()

        return destination_tmpdir



[docs]
    def get_manifest(self, path):
        """
        Read the manifest file from given directory, check it and return its content.

        Every problem (missing file, invalid JSON syntax, missing required field) is
        reported through the logger ``critical`` method, the exception class that may
        result from it depends on the logger in use.

        Arguments:
            path (Path): Path object to the directory where to search for the manifest
                file.

        Returns:
            dict: The manifest data where ``datas`` and ``storages`` items have been
                turned to Path objects.
        """
        manifest_file = path / self.MANIFEST_FILENAME

        if not manifest_file.exists():
            self.logger.critical(
                "Dump archive is invalid, it does not include manifest file "
                "'manifest.json'"
            )

        try:
            content = json.loads(manifest_file.read_text())
        except json.JSONDecodeError as error:
            self.logger.critical(
                "Dump archive is invalid, included manifest file has invalid JSON "
                "syntax: {}".format(str(error))
            )

        # Required fields with the message to emit when one is missing
        required_fields = (
            (
                "datas",
                "Dump archive is invalid, manifest does not include 'datas' field."
            ),
            (
                "storages",
                "Dump archive is invalid, manifest does not include 'storages' field."
            ),
        )
        for fieldname, message in required_fields:
            if fieldname not in content:
                self.logger.critical(message)

        # Turn data file and storage items to Path objects, an empty or null field
        # results to an empty list
        for fieldname, _ in required_fields:
            content[fieldname] = [
                Path(item) for item in content.get(fieldname) or []
            ]

        return content



[docs]
    def validate_datas(self):
        """
        Run the validators of every enabled data dump.

        .. Note::
            No validator is needed yet, this is only a placeholder hook.

        Returns:
            boolean: Always True because nothing is actually validated.
        """
        outcome = True

        return outcome



[docs]
    def validate_storages(self):
        """
        Run the validators of every enabled storage.

        .. Note::
            No validator is needed yet, this is only a placeholder hook.

        Returns:
            boolean: Always True because nothing is actually validated.
        """
        outcome = True

        return outcome



[docs]
    def validate(self):
        """
        Run every validator, first the data ones then the storage ones.

        Validator results are ignored and nothing is returned.
        """
        for validator in (self.validate_datas, self.validate_storages):
            validator()



[docs]
    def deploy_storages(self, archive_dir, manifest, destination):
        """
        Deploy storages directories in given destination.

        Each storage path listed in manifest is moved from the extracted archive
        directory to the same relative path under the destination directory.

        .. Note::
            When a storage path already exists it is removed just before deploying
            the storage content. This includes a previous path which is a regular
            file or a symbolic link (only the link itself is removed, never its
            target).

        Arguments:
            archive_dir (Path): Path to directory where archive has been exracted.
            manifest (dict): The manifest data, its item ``storages`` is expected to
                be a list of Path objects.
            destination (Path): Path to directory where to deploy storages.

        Returns:
            list: List of tuples for deployed storage with respectively source and
                destination paths.
        """
        deployed = []

        for dump_path in manifest["storages"]:
            storage_source = archive_dir / dump_path
            storage_destination = destination / dump_path
            storage_parent = storage_destination.parent

            # Create complete destination path structure if needed
            if not storage_parent.exists():
                self.logger.debug(
                    "Creating storage parent directory: {}".format(
                        storage_parent
                    )
                )
                # 'exist_ok' avoids a failure if the directory has been created by
                # something else between the check and the creation
                storage_parent.mkdir(parents=True, exist_ok=True)

            # Remove possible existing storage. A dangling symbolic link does not
            # 'exists' but still occupies the path so it is looked for explicitely.
            if storage_destination.exists() or storage_destination.is_symlink():
                self.logger.debug(
                    "Removing previous storage version directory: {}".format(
                        storage_destination
                    )
                )
                if (
                    storage_destination.is_dir() and
                    not storage_destination.is_symlink()
                ):
                    shutil.rmtree(storage_destination)
                else:
                    # 'rmtree' refuses symbolic links and regular files
                    storage_destination.unlink()

            # Move storage dump to destination
            self.logger.info(
                "Restoring storage directory ({}): {}".format(
                    filesizeformat(self.get_dir_size(storage_source)),
                    dump_path
                )
            )
            shutil.move(storage_source, storage_destination)

            deployed.append((storage_source, storage_destination))

        return deployed



[docs]
    def check_data_dump(self, dump, excludes):
        """
        Tell whether a data dump file has to be loaded.

        A dump is rejected when its filename is listed in exclusions or when setting
        ``DISKETTE_LOAD_MINIMAL_FILESIZE`` is enabled and the file size is not greater
        than it. Each rejection is reported with an INFO log message.

        Arguments:
            dump (Path): Path object to the dump file.
            excludes (list): Dump filenames to exclude from loading.

        Returns:
            boolean: True if dump is to be loaded, else False.
        """
        # Filename exclusion always comes first
        if dump.name in excludes:
            self.logger.info("Ignored dump '{}' by exclusion".format(dump.name))
            return False

        # A falsy minimal size disables the size check
        minimal_size = settings.DISKETTE_LOAD_MINIMAL_FILESIZE
        if not minimal_size:
            return True

        if dump.stat().st_size > minimal_size:
            return True

        template = "Ignored dump '{name}' because file is under the minimal size: {size}"
        self.logger.info(template.format(
            name=dump.name,
            size=filesizeformat(dump.stat().st_size),
        ))

        return False



[docs]
    def deploy_datas(self, archive_dir, manifest, excludes=None,
                     ignorenonexistent=False):
        """
        Load data dumps listed in manifest.

        Dumps are processed in manifest order, each one is first submitted to
        ``check_data_dump`` and only the accepted ones are given to ``call``.

        Arguments:
            archive_dir (Path): Path to directory where archive has been exracted.
            manifest (dict): The manifest data, its item ``datas`` is expected to be
                a list of Path objects relative to the archive directory.

        Keyword Arguments:
            excludes (list): List of dump filenames to exclude from loading. The
                exclusion is applied internally by this method instead of being
                passed to the ``loaddata`` command.
            ignorenonexistent (boolean): If true, fields and models that does not
                exists in current models will be ignored instead of raising an error.
                This is false on default

        Returns:
            list: List of tuples for deployed dumps with respectively dump filename
                and loaddata output.
        """
        exclusions = excludes or []
        loaded = []

        for dump in manifest["datas"]:
            dump_file = archive_dir / dump

            # Ignore dump refused by exclusion or size checking
            if not self.check_data_dump(dump_file, exclusions):
                continue

            output = self.call(dump_file, ignorenonexistent=ignorenonexistent)
            loaded.append((dump.name, output))

        return loaded



[docs]
    def deploy(self, archive, storages_destination, data_exclusions=None,
               with_data=True, with_storages=True, download_destination=None,
               keep=False, checksum=None, ignorenonexistent_data=False):
        """
        Open an archive then deploy its storages and data dumps.

        The archive is extracted in a temporary directory which is always removed
        at the end, even when manifest reading or a deployment fails. Storages are
        deployed before data dumps.

        Arguments:
            archive (Path or string): The tarball archive to open and extract dumps. It
                may be either a Path to a local archive file or a string for an URL
                to download the archive.
            storages_destination (Path): Destination where to deploy all storage
                directories.

        Keyword Arguments:
            data_exclusions (list): List of dump filenames to exclude from loading.
            with_data (boolean): Enable application datas loading.
            with_storages (boolean): Enabled media storages loading.
            download_destination (Path): A path where to write downloaded archive file.
                If not given, the archive file will be written as
                ``diskette_downloaded_archive.tar.gz`` into the current working
                directory. This argument is useless with local archive file.
            keep (boolean): Archive won't be removed from filesystem if True, else the
                archive file is removed once it have been extracted.
            checksum (object): Manage if archive is checksumed or not depending value:

                * If ``None``: Checksum is done and just output to logs;
                * If ``True``: Checksum is done and just output to logs;
                * If ``False``: No checksum are done or compared;
                * Any other value is assumed to be a string for a checksum to compare.
                  Then a checksum is done on archive and compared to the given one, if
                  comparaison fails it results to a critical error.
            ignorenonexistent_data (boolean): If true, fields and models that does not
                exists in current models will be ignored instead of raising an error.
                This is false on default

        Returns:
            dict: Statistics of deployed storages and datas. Item ``storages`` is
                only present when ``with_storages`` is true and item ``datas`` only
                when ``with_data`` is true.
        """
        opening_options = {
            "download_destination": download_destination,
            "keep": keep,
            "checksum": checksum,
        }
        extracted_dir = self.open(archive, **opening_options)

        report = {}
        try:
            content = self.get_manifest(extracted_dir)

            if with_storages:
                report["storages"] = self.deploy_storages(
                    extracted_dir, content, storages_destination
                )

            if with_data:
                report["datas"] = self.deploy_datas(
                    extracted_dir,
                    content,
                    excludes=data_exclusions,
                    ignorenonexistent=ignorenonexistent_data,
                )
        finally:
            # Extracted content is not needed anymore, whatever happened
            if extracted_dir.exists():
                shutil.rmtree(extracted_dir)

        return report