Commit 95a2a39c authored by bescoto's avatar bescoto

New hard link system should use less memory


git-svn-id: http://svn.savannah.nongnu.org/svn/rdiff-backup@468 2b77aa54-bcbc-44c9-a7ec-4f6cf2b41109
parent a91a4b6c
......@@ -18,6 +18,8 @@ that contain unreadable files and directories as long as they are
owned by that user. Bug report by Arkadiusz Miskiewicz. Hopefully
this is the last of the unreadable file bugs...
Rewrote hard link tracking system. New way should use less memory.
New in v0.13.2 (2003/09/16)
---------------------------
......
......@@ -34,137 +34,80 @@ from __future__ import generators
import cPickle
import Globals, Time, rpath, log, robust
# In all of these lists of indicies are the values. The keys in
# _inode_ ones are (inode, devloc) pairs.
_src_inode_indicies = None
_dest_inode_indicies = None
# The keys for these two are just indicies. They share values
# with the earlier dictionaries.
_src_index_indicies = None
_dest_index_indicies = None
# When a linked file is restored, its path is added to this dict,
# so it can be found when later paths being restored are linked to
# it.
_restore_index_path = None
# The keys in this dictionary are (inode, devloc) pairs. The values
# are a pair (index, remaining_links, dest_key) where index is the
# rorp index of the first such linked file, remaining_links is the
# number of files hard linked to this one we may see, and key is
# either (dest_inode, dest_devloc) or None, and represents the
# hardlink info of the existing file on the destination.
_inode_index = None
def initialize_dictionaries():
"""Set all the hard link dictionaries to empty"""
global _src_inode_indicies, _dest_inode_indicies
global _src_index_indicies, _dest_index_indicies, _restore_index_path
_src_inode_indicies = {}
_dest_inode_indicies = {}
_src_index_indicies = {}
_dest_index_indicies = {}
_restore_index_path = {}
global _inode_index
_inode_index = {}
def clear_dictionaries():
"""Delete all dictionaries"""
global _src_inode_indicies, _dest_inode_indicies
global _src_index_indicies, _dest_index_indicies, _restore_index_path
_src_inode_indicies = _dest_inode_indicies = None
_src_index_indicies = _dest_index_indicies = _restore_index_path = None
global _inode_index
_inode_index = None
def get_inode_key(rorp):
"""Return rorp's key for _inode_ dictionaries"""
return (rorp.getinode(), rorp.getdevloc())
def get_indicies(rorp, source):
"""Return a list of similarly linked indicies, using rorp's index"""
if source: dict = _src_index_indicies
else: dict = _dest_index_indicies
try: return dict[rorp.index]
except KeyError: return []
def add_rorp(rorp, source):
"""Process new rorp and update hard link dictionaries
First enter it into src_inode_indicies. If we have already
seen all the hard links, then we can delete the entry.
Everything must stay recorded in src_index_indicies though.
"""
def add_rorp(rorp, dest_rorp = None):
"""Process new rorp and update hard link dictionaries"""
if not rorp.isreg() or rorp.getnumlinks() < 2: return
rp_inode_key = get_inode_key(rorp)
if not _inode_index.has_key(rp_inode_key):
if not dest_rorp: dest_key = None
elif dest_rorp.getnumlinks() == 1: dest_key = "NA"
else: dest_key = get_inode_key(dest_rorp)
_inode_index[rp_inode_key] = (rorp.index, rorp.getnumlinks(), dest_key)
def del_rorp(rorp):
"""Remove rorp information from dictionary if seen all links"""
if not rorp.isreg() or rorp.getnumlinks() < 2: return
if source:
inode_dict, index_dict = _src_inode_indicies, _src_index_indicies
else: inode_dict, index_dict = _dest_inode_indicies, _dest_index_indicies
rp_inode_key = get_inode_key(rorp)
if inode_dict.has_key(rp_inode_key):
index_list = inode_dict[rp_inode_key]
index_list.append(rorp.index)
if len(index_list) == rorp.getnumlinks():
del inode_dict[rp_inode_key]
else: # make new entry in both src dicts
index_list = [rorp.index]
inode_dict[rp_inode_key] = index_list
index_dict[rorp.index] = index_list
def add_rorp_iter(iter, source):
"""Return new rorp iterator like iter that add_rorp's first"""
for rorp in iter:
add_rorp(rorp, source)
yield rorp
val = _inode_index.get(rp_inode_key)
if not val: return
index, remaining, dest_key = val
if remaining == 1: del _inode_index[rp_inode_key]
else: _inode_index[rp_inode_key] = (index, remaining-1, dest_key)
def rorp_eq(src_rorp, dest_rorp):
"""Compare hardlinked for equality
Two files may otherwise seem equal but be hardlinked in
different ways. This function considers them equal enough if
they have been hardlinked correctly to the previously seen
indicies.
Return false if dest_rorp is linked differently, which can happen
if dest is linked more than source, or if it is represented by a
different inode.
"""
if (not src_rorp.isreg() or not dest_rorp.isreg() or
src_rorp.getnumlinks() == dest_rorp.getnumlinks() == 1):
return 1 # Hard links don't apply
src_index_list = get_indicies(src_rorp, 1)
dest_index_list = get_indicies(dest_rorp, None)
# If a list only has one element, then it is only hardlinked
# to itself so far, so that is not a genuine difference yet.
if not src_index_list or len(src_index_list) == 1:
return not dest_index_list or len(dest_index_list) == 1
if not dest_index_list or len(dest_index_list) == 1: return None
# Both index lists exist and are non-empty
return src_index_list == dest_index_list # they are always sorted
if src_rorp.getnumlinks() < dest_rorp.getnumlinks(): return 0
src_key = get_inode_key(src_rorp)
index, remaining, dest_key = _inode_index[src_key]
if dest_key == "NA":
# Allow this to be ok for first comparison, but not any
# subsequent ones
_inode_index[src_key] = (index, remaining, None)
return 1
return dest_key == get_inode_key(dest_rorp)
def islinked(rorp):
"""True if rorp's index is already linked to something on src side"""
return len(get_indicies(rorp, 1)) >= 2
if not rorp.getnumlinks() > 1: return 0
dict_val = _inode_index.get(get_inode_key(rorp))
if not dict_val: return 0
return dict_val[0] != rorp.index # If equal, then rorp is first
def get_link_index(rorp):
"""Return first index on target side rorp is already linked to"""
return get_indicies(rorp, 1)[0]
def restore_link(index, rpath):
"""Restores a linked file by linking it
When restoring, all the hardlink data is already present, and
we can only link to something already written. In either
case, add to the _restore_index_path dict, so we know later
that the file is available for hard
linking.
Returns true if succeeded in creating rpath, false if must
restore rpath normally.
"""
if index not in _src_index_indicies: return None
for linked_index in _src_index_indicies[index]:
if linked_index in _restore_index_path:
srcpath = _restore_index_path[linked_index]
log.Log("Restoring %s by hard linking to %s" %
(rpath.path, srcpath), 6)
rpath.hardlink(srcpath)
return 1
_restore_index_path[index] = rpath.path
return None
return _inode_index[get_inode_key(rorp)][0]
def link_rp(diff_rorp, dest_rpath, dest_root = None):
"""Make dest_rpath into a link using link flag in diff_rorp"""
......@@ -172,6 +115,3 @@ def link_rp(diff_rorp, dest_rpath, dest_root = None):
dest_link_rpath = dest_root.new_index(diff_rorp.get_link_flag())
dest_rpath.hardlink(dest_link_rpath.path)
......@@ -180,10 +180,10 @@ class DestinationStruct:
def get_one_sig(cls, dest_base_rpath, index, src_rorp, dest_rorp):
"""Return a signature given source and destination rorps"""
if (Globals.preserve_hardlinks and
Hardlink.islinked(src_rorp or dest_rorp)):
if (Globals.preserve_hardlinks and src_rorp and
Hardlink.islinked(src_rorp)):
dest_sig = rpath.RORPath(index)
dest_sig.flaglinked(Hardlink.get_link_index(dest_sig))
dest_sig.flaglinked(Hardlink.get_link_index(src_rorp))
elif dest_rorp:
dest_sig = dest_rorp.getRORPath()
if dest_rorp.isreg():
......@@ -312,8 +312,8 @@ class CacheCollatedPostProcess:
will be backed up correctly.
"""
if source_rorp: Hardlink.add_rorp(source_rorp, source = 1)
if dest_rorp: Hardlink.add_rorp(dest_rorp, source = 0)
if Globals.preserve_hardlinks and source_rorp:
Hardlink.add_rorp(source_rorp, dest_rorp)
if (dest_rorp and dest_rorp.isdir() and Globals.process_uid != 0
and dest_rorp.getperms() % 01000 < 0700):
self.unreadable_dir_init(source_rorp, dest_rorp)
......@@ -355,6 +355,9 @@ class CacheCollatedPostProcess:
always false for un-changed files).
"""
if Globals.preserve_hardlinks and source_rorp:
Hardlink.del_rorp(source_rorp)
if not changed or success:
if source_rorp: self.statfileobj.add_source_file(source_rorp)
if dest_rorp: self.statfileobj.add_dest_file(dest_rorp)
......
......@@ -235,15 +235,17 @@ class MirrorStruct:
def get_diffs_from_collated(cls, collated):
"""Get diff iterator from collated"""
for mir_rorp, target_rorp in collated:
if Globals.preserve_hardlinks:
if mir_rorp: Hardlink.add_rorp(mir_rorp, source = 1)
if target_rorp: Hardlink.add_rorp(target_rorp, source = 0)
if Globals.preserve_hardlinks and mir_rorp:
Hardlink.add_rorp(mir_rorp, target_rorp)
if (not target_rorp or not mir_rorp or
not mir_rorp == target_rorp or
(Globals.preserve_hardlinks and not
Hardlink.rorp_eq(mir_rorp, target_rorp))):
yield cls.get_diff(mir_rorp, target_rorp)
diff = cls.get_diff(mir_rorp, target_rorp)
else: diff = None
if Globals.preserve_hardlinks and mir_rorp:
Hardlink.del_rorp(mir_rorp)
if diff: yield diff
def get_diff(cls, mir_rorp, target_rorp):
"""Get a diff for mir_rorp at time"""
......
......@@ -761,6 +761,7 @@ class RPath(RORPath):
def hardlink(self, linkpath):
"""Make self into a hardlink joined to linkpath"""
log.Log("Hard linking %s to %s" % (self.path, linkpath), 6)
self.conn.os.link(linkpath, self.path)
self.setdata()
......
......@@ -3,7 +3,7 @@ import os, sys, code
from rdiff_backup.log import Log
from rdiff_backup.rpath import RPath
from rdiff_backup import Globals, Hardlink, SetConnections, Main, \
selection, lazy, Time, rpath, eas_acls
selection, lazy, Time, rpath, eas_acls, rorpiter
RBBin = "../rdiff-backup"
SourceDir = "../rdiff_backup"
......@@ -175,63 +175,78 @@ def CompareRecursive(src_rp, dest_rp, compare_hardlinks = 1,
specified.
"""
if compare_hardlinks: reset_hardlink_dicts()
src_rp.setdata()
dest_rp.setdata()
Log("Comparing %s and %s, hardlinks %s, eas %s, acls %s" %
(src_rp.path, dest_rp.path, compare_hardlinks,
compare_eas, compare_acls), 3)
src_select = selection.Select(src_rp)
dest_select = selection.Select(dest_rp)
if ignore_tmp_files:
# Ignoring temp files can be useful when we want to check the
# correctness of a backup which aborted in the middle. In
# these cases it is OK to have tmp files lying around.
src_select.add_selection_func(src_select.regexp_get_sf(
".*rdiff-backup.tmp.[^/]+$", 0))
dest_select.add_selection_func(dest_select.regexp_get_sf(
".*rdiff-backup.tmp.[^/]+$", 0))
if exclude_rbdir:
src_select.parse_rbdir_exclude()
dest_select.parse_rbdir_exclude()
else:
# include rdiff-backup-data/increments
src_select.add_selection_func(src_select.glob_get_tuple_sf(
('rdiff-backup-data', 'increments'), 1))
dest_select.add_selection_func(dest_select.glob_get_tuple_sf(
('rdiff-backup-data', 'increments'), 1))
# but exclude rdiff-backup-data
src_select.add_selection_func(src_select.glob_get_tuple_sf(
('rdiff-backup-data',), 0))
dest_select.add_selection_func(dest_select.glob_get_tuple_sf(
('rdiff-backup-data',), 0))
dsiter1, dsiter2 = src_select.set_iter(), dest_select.set_iter()
def hardlink_equal(src_rorp, dest_rorp):
def get_selection_functions():
"""Return generators of files in source, dest"""
src_rp.setdata()
dest_rp.setdata()
src_select = selection.Select(src_rp)
dest_select = selection.Select(dest_rp)
if ignore_tmp_files:
# Ignoring temp files can be useful when we want to check the
# correctness of a backup which aborted in the middle. In
# these cases it is OK to have tmp files lying around.
src_select.add_selection_func(src_select.regexp_get_sf(
".*rdiff-backup.tmp.[^/]+$", 0))
dest_select.add_selection_func(dest_select.regexp_get_sf(
".*rdiff-backup.tmp.[^/]+$", 0))
if exclude_rbdir: # Exclude rdiff-backup-data directory
src_select.parse_rbdir_exclude()
dest_select.parse_rbdir_exclude()
return src_select.set_iter(), dest_select.set_iter()
def preprocess(src_rorp, dest_rorp):
"""Initially process src and dest_rorp"""
if compare_hardlinks and src_rorp:
Hardlink.add_rorp(src_rorp, dest_rorp)
def postprocess(src_rorp, dest_rorp):
"""After comparison, process src_rorp and dest_rorp"""
if compare_hardlinks and src_rorp:
Hardlink.del_rorp(src_rorp)
def equality_func(src_rorp, dest_rorp):
"""Combined eq func returns true iff two files compare same"""
if not src_rorp:
Log("Source rorp missing: " + str(dest_rorp), 3)
return 0
if not dest_rorp:
Log("Dest rorp missing: " + str(src_rorp), 3)
return 0
if not src_rorp.equal_verbose(dest_rorp,
compare_ownership = compare_ownership):
return None
if not Hardlink.rorp_eq(src_rorp, dest_rorp):
return 0
if compare_hardlinks and not Hardlink.rorp_eq(src_rorp, dest_rorp):
Log("Hardlink compare failure", 3)
Log("%s: %s" % (src_rorp.index,
Hardlink.get_indicies(src_rorp, 1)), 3)
Hardlink.get_inode_key(src_rorp)), 3)
Log("%s: %s" % (dest_rorp.index,
Hardlink.get_indicies(dest_rorp, None)), 3)
return None
Hardlink.get_inode_key(dest_rorp)), 3)
return 0
if compare_eas and not eas_acls.ea_compare_rps(src_rorp, dest_rorp):
Log("Different EAs in files %s and %s" %
(src_rorp.get_indexpath(), dest_rorp.get_indexpath()), 3)
return None
return 0
if compare_acls and not eas_acls.acl_compare_rps(src_rorp, dest_rorp):
Log("Different ACLs in files %s and %s" %
(src_rorp.get_indexpath(), dest_rorp.get_indexpath()), 3)
return None
return 0
return 1
Log("Comparing %s and %s, hardlinks %s, eas %s, acls %s" %
(src_rp.path, dest_rp.path, compare_hardlinks,
compare_eas, compare_acls), 3)
if compare_hardlinks: reset_hardlink_dicts()
src_iter, dest_iter = get_selection_functions()
for src_rorp, dest_rorp in rorpiter.Collate2Iters(src_iter, dest_iter):
preprocess(src_rorp, dest_rorp)
if not equality_func(src_rorp, dest_rorp): return 0
postprocess(src_rorp, dest_rorp)
return 1
def rbdir_equal(src_rorp, dest_rorp):
"""Like hardlink_equal, but make allowances for data directories"""
if not src_rorp.index and not dest_rorp.index: return 1
......@@ -263,30 +278,10 @@ def CompareRecursive(src_rp, dest_rp, compare_hardlinks = 1,
Hardlink.get_indicies(dest_rorp, None)), 3)
return None
if equality_func: result = lazy.Iter.equal(dsiter1, dsiter2,
1, equality_func)
elif compare_hardlinks:
dsiter1 = Hardlink.add_rorp_iter(dsiter1, 1)
dsiter2 = Hardlink.add_rorp_iter(dsiter2, None)
if exclude_rbdir:
result = lazy.Iter.equal(dsiter1, dsiter2, 1, hardlink_equal)
else: result = lazy.Iter.equal(dsiter1, dsiter2, 1, rbdir_equal)
elif not exclude_rbdir:
result = lazy.Iter.equal(dsiter1, dsiter2, 1, rbdir_equal)
else: result = lazy.Iter.equal(dsiter1, dsiter2, 1,
lambda x, y: x.equal_verbose(y, compare_ownership = compare_ownership))
for i in dsiter1: pass # make sure all files processed anyway
for i in dsiter2: pass
return result
def reset_hardlink_dicts():
"""Clear the hardlink dictionaries"""
Hardlink._src_inode_indicies = {}
Hardlink._src_index_indicies = {}
Hardlink._dest_inode_indicies = {}
Hardlink._dest_index_indicies = {}
Hardlink._restore_index_path = {}
Hardlink._inode_index = {}
def BackupRestoreSeries(source_local, dest_local, list_of_dirnames,
compare_hardlinks = 1,
......
......@@ -2,7 +2,7 @@ import os, unittest, time
from commontest import *
from rdiff_backup import Globals, Hardlink, selection, rpath
Log.setverbosity(3)
Log.setverbosity(6)
class HardlinkTest(unittest.TestCase):
"""Test cases for Hard links"""
......@@ -34,62 +34,24 @@ class HardlinkTest(unittest.TestCase):
Globals.preserve_hardlinks = 1
reset_hardlink_dicts()
for dsrp in selection.Select(self.hardlink_dir3).set_iter():
Hardlink.add_rorp(dsrp, 1)
Hardlink.add_rorp(dsrp)
assert len(Hardlink._src_inode_indicies.keys()) == 3, \
Hardlink._src_inode_indicies
assert len(Hardlink._src_index_indicies.keys()) == 3, \
Hardlink._src_index_indicies
vals1 = Hardlink._src_inode_indicies.values()
vals2 = Hardlink._src_index_indicies.values()
vals1.sort()
vals2.sort()
assert vals1 == vals2
def testBuildingDict2(self):
"""Same as testBuildingDict but test destination building"""
Globals.preserve_hardlinks = 1
reset_hardlink_dicts()
for dsrp in selection.Select(self.hardlink_dir3).set_iter():
Hardlink.add_rorp(dsrp, None)
assert len(Hardlink._dest_inode_indicies.keys()) == 3, \
Hardlink._dest_inode_indicies
assert len(Hardlink._dest_index_indicies.keys()) == 3, \
Hardlink._dest_index_indicies
vals1 = Hardlink._dest_inode_indicies.values()
vals2 = Hardlink._dest_index_indicies.values()
vals1.sort()
vals2.sort()
assert vals1 == vals2
assert len(Hardlink._inode_index.keys()) == 3, \
Hardlink._inode_index
def testCompletedDict(self):
"""See if the hardlink dictionaries are built correctly"""
reset_hardlink_dicts()
for dsrp in selection.Select(self.hardlink_dir1).set_iter():
Hardlink.add_rorp(dsrp, 1)
assert Hardlink._src_inode_indicies == {}, \
Hardlink._src_inode_indicies
hll1 = [('file1',), ('file2',), ('file3',)]
hll2 = [('file4',), ('file5',), ('file6',)]
dict = {}
for index in hll1: dict[index] = hll1
for index in hll2: dict[index] = hll2
assert Hardlink._src_index_indicies == dict
Hardlink.add_rorp(dsrp)
Hardlink.del_rorp(dsrp)
assert Hardlink._inode_index == {}, Hardlink._inode_index
reset_hardlink_dicts()
for dsrp in selection.Select(self.hardlink_dir2).set_iter():
Hardlink.add_rorp(dsrp, 1)
assert Hardlink._src_inode_indicies == {}, \
Hardlink._src_inode_indicies
hll1 = [('file1',), ('file3',), ('file4',)]
hll2 = [('file2',), ('file5',), ('file6',)]
dict = {}
for index in hll1: dict[index] = hll1
for index in hll2: dict[index] = hll2
assert Hardlink._src_index_indicies == dict
Hardlink.add_rorp(dsrp)
Hardlink.del_rorp(dsrp)
assert Hardlink._inode_index == {}, Hardlink._inode_index
def testSeries(self):
"""Test hardlink system by backing up and restoring a few dirs"""
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment