Commit fff1c1a7 authored by Julien Muchembled's avatar Julien Muchembled Committed by Xavier Thompson

[fix/opti] download: clean-up, fix, optimization

An optimization is to avoid temporary file when possible: a rename
(or hard link) is not always possible (different mount points).

Another one is to not check md5sum twice when using cache file.

Fall-back mode is ignored if an MD5 checksum is given.

In case of checksum mismatch for a cached path, remove it and
download again, mainly to cover the following cases:
- the url content changes and the user updates the checksum
- buildout killed while downloading directly to cache
  (see above optimization)
- shutil.copyfile is interrupted
parent 2c2805c9
...@@ -54,9 +54,9 @@ import os ...@@ -54,9 +54,9 @@ import os
import os.path import os.path
import re import re
import shutil import shutil
import sys
import tempfile import tempfile
import zc.buildout import zc.buildout
from .rmtree import rmtree
class ChecksumError(zc.buildout.UserError): class ChecksumError(zc.buildout.UserError):
...@@ -74,7 +74,8 @@ class Download(object): ...@@ -74,7 +74,8 @@ class Download(object):
cache: path to the download cache (excluding namespaces) cache: path to the download cache (excluding namespaces)
namespace: namespace directory to use inside the cache namespace: namespace directory to use inside the cache
offline: whether to operate in offline mode offline: whether to operate in offline mode
fallback: whether to use the cache as a fallback (try downloading first) fallback: whether to use the cache as a fallback (try downloading first),
when an MD5 checksum is not given
hash_name: whether to use a hash of the URL as cache file name hash_name: whether to use a hash of the URL as cache file name
logger: an optional logger to receive download-related log messages logger: an optional logger to receive download-related log messages
...@@ -107,7 +108,8 @@ class Download(object): ...@@ -107,7 +108,8 @@ class Download(object):
if self.download_cache is not None: if self.download_cache is not None:
return os.path.join(self.download_cache, self.namespace or '') return os.path.join(self.download_cache, self.namespace or '')
def __call__(self, url, md5sum=None, path=None): @property
def __call__(self):
"""Download a file according to the utility's configuration. """Download a file according to the utility's configuration.
url: URL to download url: URL to download
...@@ -117,19 +119,14 @@ class Download(object): ...@@ -117,19 +119,14 @@ class Download(object):
Returns the path to the downloaded file. Returns the path to the downloaded file.
""" """
if self.cache: return self.download_cached if self.cache else self.download
local_path, is_temp = self.download_cached(url, md5sum)
else:
local_path, is_temp = self.download(url, md5sum, path)
return locate_at(local_path, path), is_temp
def download_cached(self, url, md5sum=None): def download_cached(self, url, md5sum=None, path=None):
"""Download a file from a URL using the cache. """Download a file from a URL using the cache.
This method assumes that the cache has been configured. Optionally, it This method assumes that the cache has been configured.
raises a ChecksumError if a cached copy of a file has an MD5 mismatch, If a cached copy of a file has an MD5 mismatch, download
but will not remove the copy in that case. and update the cache on success.
""" """
if not os.path.exists(self.download_cache): if not os.path.exists(self.download_cache):
...@@ -146,26 +143,43 @@ class Download(object): ...@@ -146,26 +143,43 @@ class Download(object):
self.logger.debug('Searching cache at %s' % cache_dir) self.logger.debug('Searching cache at %s' % cache_dir)
if os.path.exists(cached_path): if os.path.exists(cached_path):
is_temp = False if check_md5sum(cached_path, md5sum):
if self.fallback: if md5sum or not self.fallback:
self.logger.debug('Using cache file %s', cached_path)
return locate_at(cached_path, path), False
else:
self.logger.warning(
'MD5 checksum mismatch for cached download from %r at %r',
url, cached_path)
# Don't download directly to cached_path to minimize
# the probability to alter old data if download fails.
try: try:
_, is_temp = self.download(url, md5sum, cached_path) path, is_temp = self.download(url, md5sum, path)
except ChecksumError: except ChecksumError:
raise raise
except Exception: except Exception:
pass if md5sum:
raise
if not check_md5sum(cached_path, md5sum): self.logger.debug("Fallback to cache using %s",
raise ChecksumError( cached_path, exception=1)
'MD5 checksum mismatch for cached download ' else:
'from %r at %r' % (url, cached_path)) samefile = getattr(os.path, 'samefile', None)
self.logger.debug('Using cache file %s' % cached_path) if not (samefile and samefile(path, cached_path)):
# update cache
try:
os.remove(cached_path)
except OSError as e:
if e.errno != errno.EISDIR:
raise
rmtree(cached_path)
locate_at(path, cached_path)
return path, is_temp
else: else:
self.logger.debug('Cache miss; will cache %s as %s' % self.logger.debug('Cache miss; will cache %s as %s' %
(url, cached_path)) (url, cached_path))
_, is_temp = self.download(url, md5sum, cached_path) self.download(url, md5sum, cached_path)
return cached_path, is_temp return locate_at(cached_path, path), False
def download(self, url, md5sum=None, path=None): def download(self, url, md5sum=None, path=None):
"""Download a file from a URL to a given or temporary path. """Download a file from a URL to a given or temporary path.
...@@ -196,27 +210,25 @@ class Download(object): ...@@ -196,27 +210,25 @@ class Download(object):
"Couldn't download %r in offline mode." % url) "Couldn't download %r in offline mode." % url)
self.logger.info('Downloading %s' % url) self.logger.info('Downloading %s' % url)
tmp_path = path
cleanup = True
try:
if not path:
handle, tmp_path = tempfile.mkstemp(prefix='buildout-') handle, tmp_path = tempfile.mkstemp(prefix='buildout-')
os.close(handle) os.close(handle)
try:
tmp_path, headers = urlretrieve(url, tmp_path) tmp_path, headers = urlretrieve(url, tmp_path)
if not check_md5sum(tmp_path, md5sum): if not check_md5sum(tmp_path, md5sum):
raise ChecksumError( raise ChecksumError(
'MD5 checksum mismatch downloading %r' % url) 'MD5 checksum mismatch downloading %r' % url)
except IOError: cleanup = False
e = sys.exc_info()[1] except IOError as e:
os.remove(tmp_path) raise zc.buildout.UserError("Error downloading %s: %s"
raise zc.buildout.UserError("Error downloading extends for URL " % (url, e))
"%s: %s" % (url, e)) finally:
except Exception: if cleanup and tmp_path:
os.remove(tmp_path) remove(tmp_path)
raise
if path: return tmp_path, not path
shutil.move(tmp_path, path)
return path, False
else:
return tmp_path, True
def filename(self, url): def filename(self, url):
"""Determine a file name from a URL according to the configuration. """Determine a file name from a URL according to the configuration.
......
...@@ -165,14 +165,6 @@ the file on the server to see this: ...@@ -165,14 +165,6 @@ the file on the server to see this:
>>> cat(path) >>> cat(path)
This is a foo text. This is a foo text.
If we specify an MD5 checksum for a file that is already in the cache, the
cached copy's checksum will be verified:
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch for cached download
from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
Trying to access another file at a different URL which has the same base name Trying to access another file at a different URL which has the same base name
will result in the cached copy being used: will result in the cached copy being used:
...@@ -184,6 +176,14 @@ will result in the cached copy being used: ...@@ -184,6 +176,14 @@ will result in the cached copy being used:
>>> cat(path) >>> cat(path)
This is a foo text. This is a foo text.
If we specify an MD5 checksum for a file that is already in the cache, the
cached copy's checksum will be verified and the cache will be refreshed:
>>> path, is_temp = download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
Given a target path for the download, the utility will provide a copy of the Given a target path for the download, the utility will provide a copy of the
file at that location both when first downloading the file and when using a file at that location both when first downloading the file and when using a
cached copy: cached copy:
...@@ -259,7 +259,7 @@ If the file is completely missing it should notify the user of the error: ...@@ -259,7 +259,7 @@ If the file is completely missing it should notify the user of the error:
>>> download(server_url+'bar.txt') # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS >>> download(server_url+'bar.txt') # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
Traceback (most recent call last): Traceback (most recent call last):
... ...
UserError: Error downloading extends for URL http://localhost/bar.txt: UserError: Error downloading http://localhost/bar.txt:
...404... ...404...
>>> ls(cache) >>> ls(cache)
...@@ -442,18 +442,22 @@ However, when downloading the file normally with the cache being used in ...@@ -442,18 +442,22 @@ However, when downloading the file normally with the cache being used in
fall-back mode, the file will be downloaded from the net and the cached copy fall-back mode, the file will be downloaded from the net and the cached copy
will be replaced with the new content: will be replaced with the new content:
>>> cat(download(server_url+'foo.txt')[0]) >>> path, is_temp = download(server_url+'foo.txt')
>>> cat(path)
The wrong text. The wrong text.
>>> cat(cache, 'foo.txt') >>> cat(cache, 'foo.txt')
The wrong text. The wrong text.
>>> is_temp
True
>>> remove(path)
When trying to download a resource whose checksum does not match, the cached Fall-back mode is meaningless if md5sum is given. If the checksum of the
copy will neither be used nor overwritten: cached copy matches, the resource is not downloaded:
>>> write(server_data, 'foo.txt', 'This is a foo text.') >>> write(server_data, 'foo.txt', 'This is a foo text.')
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest()) >>> path, is_temp = download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last): >>> print_(path)
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt' /download-cache/foo.txt
>>> cat(cache, 'foo.txt') >>> cat(cache, 'foo.txt')
The wrong text. The wrong text.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment