import json
import shutil
import tarfile
import tempfile
import requests
from pathlib import Path
from django.conf import settings
from django.template.defaultfilters import filesizeformat
from ..utils.filesystem import directory_size
from ..utils.loggers import NoOperationLogger
from ..utils import hashs
from ..utils.http import is_url
from .serializers import LoaddataSerializerAbstract
from .storages import StorageMixin
class Loader(StorageMixin, LoaddataSerializerAbstract):
    """
    Dump loader opens a Diskette archive to deploy its data and storage contents.

    Keyword Arguments:
        logger (object): Instance of a logger object to use. Logger object must
            implement common logging message methods (like error, info, etc..). See
            ``diskette.utils.loggers`` for available loggers. If not given, a
            ``NoOperationLogger`` instance is used.
    """
    MANIFEST_FILENAME = "manifest.json"
    TEMPDIR_PREFIX = "diskette_"
    DOWNLOAD_FILENAME = "diskette_downloaded_archive.tar.gz"

    def __init__(self, logger=None):
        self.logger = logger or NoOperationLogger()

    def get_dir_size(self, path):
        """
        Shortcut to get computed size of a directory files.

        This exists in its own method to be able to mock it from tests because
        size computation is not stable.

        Arguments:
            path (pathlib.Path): Directory Path object.

        Returns:
            integer: Computed size of directory files.
        """
        return directory_size(path)

    def download_archive(self, url, destination=None):
        """
        Download archive from given URL into destination file.

        Arguments:
            url (string): The archive URL to download.

        Keyword Arguments:
            destination (Path): A path where to write downloaded archive file. If
                not given, the archive file will be written as
                ``diskette_downloaded_archive.tar.gz`` into the current working
                directory.

        Returns:
            Path: Path to downloaded archive file.

        Raises:
            requests.HTTPError: From ``raise_for_status()`` when the server
                answers with an error status.
        """
        destination = destination or Path.cwd() / self.DOWNLOAD_FILENAME

        self.logger.info(
            "Downloading archive from '{}' to '{}'".format(url, destination)
        )

        # A streamed response keeps its connection open until it is closed, so
        # it is used as a context manager to release it on success and failure
        with requests.get(
            url,
            allow_redirects=settings.DISKETTE_DOWNLOAD_ALLOW_REDIRECT,
            timeout=settings.DISKETTE_DOWNLOAD_TIMEOUT,
            stream=True,
        ) as response:
            response.raise_for_status()

            # Use the chunk way to avoid retaining the whole file in memory
            with open(destination, "wb") as fd:
                for chunk in response.iter_content(
                    chunk_size=settings.DISKETTE_DOWNLOAD_CHUNK
                ):
                    fd.write(chunk)

        return destination

    def open(self, source, download_destination=None, keep=False, checksum=None):
        """
        Extract archive files in a temporary directory.

        .. Warning::
            Using this method, you are responsible to remove the temporary
            directory once you are done with it. Your code must be safe about it
            and remove it even when your code fails or you will produce a lot of
            remaining temporary directories.

        Arguments:
            source (Path or string): A path to a local archive file or a string
                for an URL to download the archive from.

        Keyword Arguments:
            download_destination (Path): A path where to write downloaded archive
                file. If not given, the archive file will be written as
                ``diskette_downloaded_archive.tar.gz`` into the current working
                directory. This argument is useless with local archive file.
            keep (boolean): Archive won't be removed from filesystem if True, else
                the archive file is removed once extraction has been attempted
                (whether it succeeded or failed).
            checksum (object): Manage if archive is checksumed or not depending
                value:

                * If ``None``: Checksum is done and just output to logs;
                * If ``True``: Checksum is done and just output to logs;
                * If ``False``: No checksum are done or compared;
                * Any other value is assumed to be a string for a checksum to
                  compare. Then a checksum is done on archive and compared to the
                  given one, if comparison fails it results to a critical error.

        Returns:
            Path: The temporary directory where archive files have been extracted.
        """
        if is_url(source):
            archive = self.download_archive(
                source,
                destination=download_destination,
            )
        else:
            archive = source

        # Normalize to a Path so a plain string (local path or string download
        # destination) supports the filesystem operations used below
        archive = Path(archive)

        if not archive.exists():
            self.logger.critical(
                "Given archive path does not exists: {}".format(archive)
            )

        # Perform checksum if not explicitly disabled
        if checksum is not False:
            archive_checksum = hashs.file_checksum(archive)
            self.logger.debug(
                "Archive checksum: {}".format(archive_checksum)
            )
            # Compare checksums if any
            if checksum and checksum is not True:
                if archive_checksum != checksum:
                    self.logger.critical(
                        "Checksums do not match. Your archive file is probably "
                        "corrupted."
                    )

        # The temporary directory is only created once every pre-extraction check
        # has passed, so an error raised above can not leave an orphan directory
        destination_tmpdir = Path(tempfile.mkdtemp(prefix=self.TEMPDIR_PREFIX))

        try:
            # Extract everything in temporary directory
            with tarfile.open(archive, "r:gz") as archive_fp:
                if hasattr(tarfile, "data_filter"):
                    # The 'data' extraction filter refuses members that would be
                    # written outside of destination (absolute paths, links
                    # pointing outside). It only exists on Python versions that
                    # ship extraction filters, hence the feature detection.
                    archive_fp.extractall(destination_tmpdir, filter="data")
                else:
                    archive_fp.extractall(destination_tmpdir)
        except BaseException:
            # Remove destination_tmpdir on extraction failure (including an
            # interruption) then let the original exception propagate untouched
            if destination_tmpdir.exists():
                shutil.rmtree(destination_tmpdir)
            raise
        finally:
            # Remove archive if not required to be kept
            if not keep:
                archive.unlink()

        return destination_tmpdir

    def get_manifest(self, path):
        """
        Search for manifest file in given path, validate it and return it.

        Every validation failure is reported through ``logger.critical``. Whether
        this raises an exception, and which exception class, depends on the used
        logger. For a logger that does not raise, invalid or incomplete manifest
        content is replaced with empty values.

        Arguments:
            path (Path): Path object to the directory where to search for the
                manifest file.

        Returns:
            dict: The manifest data, with ``datas`` and ``storages`` items turned
            to lists of Path objects.
        """
        manifest_path = path / self.MANIFEST_FILENAME

        if not manifest_path.exists():
            self.logger.critical(
                "Dump archive is invalid, it does not include manifest file "
                "'{}'".format(self.MANIFEST_FILENAME)
            )

        try:
            manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError as e:
            self.logger.critical(
                "Dump archive is invalid, included manifest file has invalid JSON "
                "syntax: {}".format(str(e))
            )
            # Only reached with a logger that does not raise on critical: keep
            # the name bound so the checks below do not fail on an unbound local
            manifest = {}

        if not isinstance(manifest, dict):
            self.logger.critical(
                "Dump archive is invalid, manifest content must be a JSON object."
            )
            manifest = {}

        if "datas" not in manifest:
            self.logger.critical(
                "Dump archive is invalid, manifest does not include 'datas' field."
            )

        if "storages" not in manifest:
            self.logger.critical(
                "Dump archive is invalid, manifest does not include 'storages' field."
            )

        # Turn data file and storage items to Path objects
        manifest["datas"] = [Path(v) for v in manifest.get("datas") or []]
        manifest["storages"] = [Path(v) for v in manifest.get("storages") or []]

        return manifest

    def validate_datas(self):
        """
        Call validators from all enabled data dumps.

        .. Note::
            There is currently no validator needed.

        Returns:
            boolean: Always return True since there is nothing to validate actually.
        """
        return True

    def validate_storages(self):
        """
        Call validators from all enabled storages.

        .. Note::
            There is currently no validator needed.

        Returns:
            boolean: Always return True since there is nothing to validate actually.
        """
        return True

    def validate(self):
        """
        Call all validators.
        """
        self.validate_datas()
        self.validate_storages()

    def deploy_storages(self, archive_dir, manifest, destination):
        """
        Deploy storages directories in given destination.

        .. Note::
            When a storage path already exists it is removed just before deploying
            the storage content.

        Arguments:
            archive_dir (Path): Path to directory where archive has been extracted.
            manifest (dict): The manifest data.
            destination (Path): Path to directory where to deploy storages.

        Returns:
            list: List of tuples for deployed storage with respectively source and
            destination paths.
        """
        deployed = []

        for dump_path in manifest["storages"]:
            # NOTE(review): an absolute ``dump_path`` makes pathlib discard the
            # left operand of ``/``; manifest paths are presumably always
            # relative, confirm against the dumper.
            storage_source = archive_dir / dump_path
            storage_destination = destination / dump_path

            # Check the source before touching the destination, else a manifest
            # entry missing from archive would remove the existing storage and
            # leave nothing to restore in its place
            if not storage_source.exists():
                self.logger.critical(
                    "Dump archive is invalid, storage listed in manifest does not "
                    "exist in archive: {}".format(dump_path)
                )
                continue

            # Create complete destination path structure if needed
            if not storage_destination.parent.exists():
                self.logger.debug(
                    "Creating storage parent directory: {}".format(
                        storage_destination.parent
                    )
                )
                # 'exist_ok' covers a directory created since the check above
                storage_destination.parent.mkdir(parents=True, exist_ok=True)

            # Remove possible existing storage
            if storage_destination.exists():
                self.logger.debug(
                    "Removing previous storage version directory: {}".format(
                        storage_destination
                    )
                )
                shutil.rmtree(storage_destination)

            # Move storage dump to destination
            self.logger.info(
                "Restoring storage directory ({}): {}".format(
                    filesizeformat(self.get_dir_size(storage_source)),
                    dump_path
                )
            )
            shutil.move(storage_source, storage_destination)

            deployed.append((storage_source, storage_destination))

        return deployed

    def check_data_dump(self, dump, excludes):
        """
        Check if data dump is to be loaded or not.

        When dump is not to be loaded, a INFO log message will be output.

        This check dump file against filename exclusions and minimal file size.

        Arguments:
            dump (Path): Path to the data dump file.
            excludes (list): Dump filenames to ignore.

        Returns:
            boolean: True if dump is to be loaded, else False.
        """
        if dump.name in excludes:
            self.logger.info("Ignored dump '{}' by exclusion".format(dump.name))
            return False

        # A falsy setting value disables the size check
        minimal_size = settings.DISKETTE_LOAD_MINIMAL_FILESIZE
        if minimal_size:
            # Stat the file once for both the comparison and the message
            dump_size = dump.stat().st_size
            if dump_size <= minimal_size:
                msg = (
                    "Ignored dump '{name}' because file is under the minimal size: "
                    "{size}"
                )
                self.logger.info(msg.format(
                    name=dump.name,
                    size=filesizeformat(dump_size),
                ))
                return False

        return True

    def deploy_datas(self, archive_dir, manifest, excludes=None,
                     ignorenonexistent=False):
        """
        Load data dumps listed in manifest.

        Arguments:
            archive_dir (Path): Path to directory where archive has been extracted.
            manifest (dict): The manifest data.

        Keyword Arguments:
            excludes (list): List of dump filenames to exclude from loading. Note
                that exclusions are filtered here instead of being passed to the
                ``loaddata`` command.
            ignorenonexistent (boolean): If true, fields and models that does not
                exists in current models will be ignored instead of raising an
                error. This is false on default.

        Returns:
            list: List of tuples for deployed dumps with respectively dump filename
            and loaddata output.
        """
        excludes = excludes or []
        deployed = []

        # A plain loop is used since loading a dump is a side effect
        for dump in manifest["datas"]:
            dump_path = archive_dir / dump

            if not self.check_data_dump(dump_path, excludes):
                continue

            output = self.call(dump_path, ignorenonexistent=ignorenonexistent)
            deployed.append((dump.name, output))

        return deployed

    def deploy(self, archive, storages_destination, data_exclusions=None,
               with_data=True, with_storages=True, download_destination=None,
               keep=False, checksum=None, ignorenonexistent_data=False):
        """
        Load archive and deploy its content.

        Arguments:
            archive (Path or string): The tarball archive to open and extract
                dumps. It may be either a Path to a local archive file or a string
                for an URL to download the archive.
            storages_destination (Path): Destination where to deploy all storage
                directories.

        Keyword Arguments:
            data_exclusions (list): List of dump filenames to exclude from loading.
            with_data (boolean): Enable application datas loading.
            with_storages (boolean): Enabled media storages loading.
            download_destination (Path): A path where to write downloaded archive
                file. If not given, the archive file will be written as
                ``diskette_downloaded_archive.tar.gz`` into the current working
                directory. This argument is useless with local archive file.
            keep (boolean): Archive won't be removed from filesystem if True, else
                the archive file is removed once extraction has been attempted.
            checksum (object): Manage if archive is checksumed or not depending
                value:

                * If ``None``: Checksum is done and just output to logs;
                * If ``True``: Checksum is done and just output to logs;
                * If ``False``: No checksum are done or compared;
                * Any other value is assumed to be a string for a checksum to
                  compare. Then a checksum is done on archive and compared to the
                  given one, if comparison fails it results to a critical error.
            ignorenonexistent_data (boolean): If true, fields and models that does
                not exists in current models will be ignored instead of raising an
                error. This is false on default.

        Returns:
            dict: Statistics of deployed storages and datas. Item ``storages`` is
            only set when ``with_storages`` is true and item ``datas`` is only set
            when ``with_data`` is true.
        """
        tmpdir = self.open(
            archive,
            download_destination=download_destination,
            keep=keep,
            checksum=checksum,
        )

        stats = {}

        try:
            manifest = self.get_manifest(tmpdir)

            if with_storages:
                stats["storages"] = self.deploy_storages(
                    tmpdir,
                    manifest,
                    storages_destination,
                )

            if with_data:
                stats["datas"] = self.deploy_datas(
                    tmpdir,
                    manifest,
                    excludes=data_exclusions,
                    ignorenonexistent=ignorenonexistent_data,
                )
        finally:
            # Always drop extracted content, even when deployment has failed
            if tmpdir.exists():
                shutil.rmtree(tmpdir)

        return stats