### bigarray: Support resizing in-place

In NumPy, ndarray has .resize() but actually it does a whole array
copy into newly allocated larger segment which makes e.g. appending O(n).

For BigArray, we don't have that internal constraint NumPy has - to
keep the array itself contiguously _stored_ (compare to contiguously
_presented_ in memory). So we can have O(1) resize for big arrays.

NOTE having O(1) resize, here is how O(δ) append can be done:

A                               # ZBigArray e.g. of shape   (N, 3)
n = len(A)                      # length of A's major index  =N
A.resize((n+δ,) + A.shape[1:])  # add δ new entries ; now len(A) =N+δ
A[-δ:] = <new-data>             # set data for last new δ entries

/cc @klaus
1 parent 929922fa
Showing with 134 additions and 2 deletions
@@ -142,6 +142,43 @@ class BigArray(object):
         # .base

     # ~~~ ndarray-like with different semantics

+    # resize BigArray in-place
+    #
+    # NOTE
+    #
+    # - ndarray.resize() works in O(n) time
+    #
+    #   ( on-growth numpy allocates new memory for whole array and copies data
+    #     there. This is done because numpy.ndarray has to be contiguously stored
+    #     in memory. )
+    #
+    # - BigArray.resize() works in O(1) time
+    #
+    #   ( BigArrays are only mapped to contiguous virtual address-space, and
+    #     storage is organized using separate data blocks. )
+    #
+    # NOTE even after BigArray is resized, already-established ndarray views of
+    # BigArray stay of original size.
+    def resize(self, new_shape, refcheck=True):
+        # NOTE refcheck is in args only for numpy API compatibility - as we
+        # don't move memory we don't need to check anything before resizing.
+
+        # for BigArray resizing is just changing .shape - BigFile currently
+        # works as if it is infinite storage with non-set blocks automatically
+        # reading as whole-zeros. So
+        #
+        # - if array grows, on further mapping we'll map new blocks from
+        #   ._fileh
+        #
+        # - if array shrinks, we'll not let clients map blocks past array
+        #   end.
+        #
+        #   TODO discard data from backing file on shrinks.
+        self._init0(new_shape, self.dtype, order='C')   # FIXME order hardcoded
+
+
     # ~~~ get/set item/slice connect bigfile blocks to ndarray in RAM.
     #     only basic indexing is supported - see numpy/.../arrays.indexing.rst
@@ -20,7 +20,7 @@
 from wendelin.bigfile.tests.common_zodb import dbopen, dbclose
 from wendelin.bigfile.tests.test_filezodb import kkey, cacheInfo
 from persistent import UPTODATE
 import transaction
-from numpy import dtype, uint8, all
+from numpy import dtype, uint8, all, array_equal

 def test_zbigarray(tmpdir):
     root = dbopen('%s/1.fs' % tmpdir)
@@ -124,3 +124,38 @@ def test_zbigarray(tmpdir):
     assert all(a[33+1:-2] == 0)
     assert a[-2] == 98
     assert a[-1] == 99
+
+    # resize array & append data
+    A.resize((24*1024*1024,))
+    assert A.shape == (24*1024*1024,)
+    assert A.dtype == dtype(uint8)
+
+    b = A[:]
+    assert array_equal(a, b[:16*1024*1024])
+
+    b[16*1024*1024] = 100
+    b[-1] = 255
+
+    # commit; reload & verify changes
+    transaction.commit()
+    dbclose(root)
+    del root, a, b, A
+
+    root = dbopen('%s/1.fs' % tmpdir)
+    A = root['zarray']
+
+    assert isinstance(A, ZBigArray)
+    assert A.shape == (24*1024*1024,)
+    assert A.dtype == dtype(uint8)
+
+    a = A[:]
+    assert all(a[:33] == 0)
+    assert a[33] == 33    # NOTE(review): scraped text read "assert a == 33" - reconstructed as a[33]; confirm against original commit
+    assert all(a[33+1:16*1024*1024-2] == 0)
+    assert a[16*1024*1024-2] == 98
+    assert a[16*1024*1024-1] == 99
+    assert a[16*1024*1024]   == 100
+    assert a[24*1024*1024-1] == 255
@@ -19,7 +19,7 @@
 from wendelin.bigarray import BigArray
 from wendelin.bigfile import BigFile
 from wendelin.lib.mem import memcpy
-from numpy import ndarray, dtype, int32, uint32, uint8, all, arange, multiply, array_equal
+from numpy import ndarray, dtype, int32, uint32, uint8, all, zeros, arange, multiply, array_equal
 from pytest import raises
@@ -301,3 +301,63 @@ def test_bigarray_indexing_Nd():
     # newaxis - added after at some position(s)
     for newaxis in range(3):
         # 0 - no newaxis
     """
+
+
+def test_bigarray_resize():
+    data = zeros(8*PS, dtype=uint32)
+    f  = BigFile_Data(data, PS)
+    fh = f.fileh_open()
+
+    # set first part & ensure it is set correctly
+    A = BigArray((10,3), uint32, fh)
+    A[:,:] = arange(10*3, dtype=uint32).reshape((10,3))
+    a = A[:]
+    assert array_equal(a.ravel(), arange(10*3, dtype=uint32))
+
+    # grow array
+    A.resize((11,3))
+
+    # a, as already mapped, should stay the same
+    assert array_equal(a.ravel(), arange(10*3, dtype=uint32))
+
+    # mapping it once again maps it whole with new size
+    b = A[:]
+    assert isinstance(b, ndarray)
+    assert b.shape == (11,3)
+    assert b.dtype == dtype(uint32)
+
+    # head data is the same as a
+    assert array_equal(a, b[:10,:])
+
+    # tail is zeros
+    assert array_equal(b[10,:], zeros(3, dtype=uint32))
+
+    # old mapping stays valid and changes propagate to/from it
+    assert a[0,0] == 0
+    assert b[0,0] == 0
+    a[0,0] = 1
+    assert b[0,0] == 1
+    b[0,0] = 2
+    assert a[0,0] == 2
+    a[0,0] = 0
+    assert b[0,0] == 0
+
+    assert a[ -1,-1] == 10*3-1
+    assert b[10-1,-1] == 10*3-1
+    a[ -1,-1] = 1
+    assert b[10-1,-1] == 1
+    b[10-1,-1] = 2
+    assert a[ -1,-1] == 2
+    a[ -1,-1] = 10*3-1
+    assert b[10-1,-1] == 10*3-1
+
+    # we cannot access old mapping beyond its end
+    assert raises(IndexError, 'a[10,:]')
+
+    # we can change tail
+    b[10,:] = arange(10*3, (10+1)*3)
+
+    # map it whole again and ensure we have correct data
+    c = A[:]
+    assert array_equal(c.ravel(), arange(11*3, dtype=uint32))
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!