Commit ca064f75 authored by Kirill Smelkov

bigarray: Support resizing in-place

In NumPy, ndarray has .resize(), but it actually copies the whole array into a
newly allocated larger segment, which makes e.g. appending O(n).
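
For illustration, a plain-NumPy sketch of why appending is O(n) there (not
part of this change): each append allocates a new buffer and copies all
existing data.

    import numpy as np
    a = np.arange(10, dtype=np.uint8)
    a = np.append(a, [1, 2, 3])    # allocates a new array and copies all of a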

For BigArray, we don't have that internal constraint NumPy has: keeping the
array itself contiguously _stored_ (as opposed to contiguously _presented_ in
memory). So we can have O(1) resize for big arrays.

NOTE having O(1) resize, here is how O(δ) append can be done:

    A                               # ZBigArray e.g. of shape   (N, 3)
    n = len(A)                      # length of A's major index  =N
    A.resize((n+δ,) + A.shape[1:])  # add δ new entries ; now len(A) =N+δ
    A[-δ:] = <new-data>             # set data for last new δ entries
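
With O(1) resize in place, here is a minimal sketch of the above as a helper
(the name append_rows is illustrative, not part of this commit):

    def append_rows(A, new_data):
        # new_data is expected to have shape (δ,) + A.shape[1:]
        n, delta = len(A), len(new_data)
        A.resize((n + delta,) + A.shape[1:])   # O(1) - only .shape changes
        A[-delta:] = new_data                  # O(δ) - write only the new tail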

/cc @klaus
parent 929922fa
@@ -142,6 +142,43 @@ class BigArray(object):
    # .base
    # ~~~ ndarray-like with different semantics
    # resize BigArray in-place
    #
    # NOTE
    #
    # - ndarray.resize() works in O(n) time
    #
    #   ( on growth numpy allocates new memory for the whole array and copies
    #     the data there. This is done because numpy.ndarray has to be stored
    #     contiguously in memory. )
    #
    # - BigArray.resize() works in O(1) time
    #
    #   ( BigArrays are only mapped to contiguous virtual address space, and
    #     storage is organized using separate data blocks. )
    #
    # NOTE even after a BigArray is resized, already-established ndarray views
    #      of it stay of the original size.
    def resize(self, new_shape, refcheck=True):
        # NOTE refcheck is in args only for numpy API compatibility - as we
        #      don't move memory, we don't need to check anything before resizing.
        #
        # For BigArray, resizing is just changing .shape - BigFile currently
        # works as if it is infinite storage with not-yet-set blocks
        # automatically reading as whole-zeros. So
        #
        # - if the array grows, on further mapping we'll map new blocks from
        #   ._fileh
        #
        # - if the array shrinks, we won't let clients map blocks past the
        #   array end.
        #
        #   TODO discard data from the backing file on shrink.
        self._init0(new_shape, self.dtype, order='C')   # FIXME order hardcoded
    # ~~~ get/set item/slice connect bigfile blocks to ndarray in RAM.
    #     only basic indexing is supported - see numpy/.../arrays.indexing.rst
...
@@ -20,7 +20,7 @@ from wendelin.bigfile.tests.common_zodb import dbopen, dbclose
from wendelin.bigfile.tests.test_filezodb import kkey, cacheInfo
from persistent import UPTODATE
import transaction
from numpy import dtype, uint8, all, array_equal

def test_zbigarray(tmpdir):
    root = dbopen('%s/1.fs' % tmpdir)
@@ -124,3 +124,38 @@ def test_zbigarray(tmpdir):
    assert all(a[33+1:-2] == 0)
    assert a[-2] == 98
    assert a[-1] == 99

    # resize array & append data
    A.resize((24*1024*1024,))
    assert A.shape == (24*1024*1024,)
    assert A.dtype == dtype(uint8)

    b = A[:]
    assert array_equal(a, b[:16*1024*1024])

    b[16*1024*1024] = 100
    b[-1] = 255

    # commit; reload & verify changes
    transaction.commit()
    dbclose(root)
    del root, a, b, A

    root = dbopen('%s/1.fs' % tmpdir)

    A = root['zarray']
    assert isinstance(A, ZBigArray)
    assert A.shape == (24*1024*1024,)
    assert A.dtype == dtype(uint8)

    a = A[:]
    assert all(a[:33] == 0)
    assert a[33] == 33
    assert all(a[33+1:16*1024*1024-2] == 0)
    assert a[16*1024*1024-2] == 98
    assert a[16*1024*1024-1] == 99
    assert a[16*1024*1024] == 100
    assert a[24*1024*1024-1] == 255
@@ -19,7 +19,7 @@
from wendelin.bigarray import BigArray
from wendelin.bigfile import BigFile
from wendelin.lib.mem import memcpy
from numpy import ndarray, dtype, int32, uint32, uint8, all, zeros, arange, multiply, array_equal
from pytest import raises
@@ -301,3 +301,63 @@ def test_bigarray_indexing_Nd():
    # newaxis - added after at some position(s)
    for newaxis in range(3):    # 0 - no newaxis
    """

def test_bigarray_resize():
    data = zeros(8*PS, dtype=uint32)
    f = BigFile_Data(data, PS)
    fh = f.fileh_open()

    # set first part & ensure it is set correctly
    A = BigArray((10,3), uint32, fh)
    A[:,:] = arange(10*3, dtype=uint32).reshape((10,3))

    a = A[:]
    assert array_equal(a.ravel(), arange(10*3, dtype=uint32))

    # grow array
    A.resize((11,3))

    # a, as already mapped, should stay of the same size
    assert array_equal(a.ravel(), arange(10*3, dtype=uint32))

    # mapping it once again maps it whole with new size
    b = A[:]
    assert isinstance(b, ndarray)
    assert b.shape == (11,3)
    assert b.dtype == dtype(uint32)

    # head data is the same as a
    assert array_equal(a, b[:10,:])

    # tail is zeros
    assert array_equal(b[10,:], zeros(3, dtype=uint32))

    # old mapping stays valid and changes propagate to/from it
    assert a[0,0] == 0
    assert b[0,0] == 0
    a[0,0] = 1
    assert b[0,0] == 1
    b[0,0] = 2
    assert a[0,0] == 2
    a[0,0] = 0
    assert b[0,0] == 0

    assert a[ -1,-1] == 10*3-1
    assert b[10-1,-1] == 10*3-1
    a[ -1,-1] = 1
    assert b[10-1,-1] == 1
    b[10-1,-1] = 2
    assert a[ -1,-1] == 2
    a[ -1,-1] = 10*3-1
    assert b[10-1,-1] == 10*3-1

    # we cannot access old mapping beyond its end
    assert raises(IndexError, 'a[10,:]')

    # but we can change the tail via the new mapping
    b[10,:] = arange(10*3, (10+1)*3)

    # map it whole again and ensure we have correct data
    c = A[:]
    assert array_equal(c.ravel(), arange(11*3, dtype=uint32))