Commit 425dc5d1 authored by Kirill Smelkov

bigarray: Raise IndexError for out-of-bound element access

The way BigArray.__getitem__ works for element access is that for e.g.

    A[i]

it translates the request to

    A[i:i+1]

and remembers to lower the dimensionality at scalar index

    dim_adjust = (0,)

so, in full, A[i] is computed this way:

    A[i] -> A[i:i+1](0,)

( it is done this way to unify code for scalar / slice access in
  __getitem__ - see 0c826d5c "BigArray: An ndarray-like on top of
  BigFile memory mappings" )

The code for slice access also has a shortcut - if it sees that the slice
results in an empty array (e.g. for an out-of-bound slice), it avoids
spending time on creating a file vma mapping only to create an empty view
on top of it.

In 0c826d5c that optimization, however, forgot to apply the "lower the
dimensionality" step on top of the resulting empty view, and as a result
IndexError was not raised for out-of-bound scalar access:

    A = BigArray((10,), uint8)
    In [1]: A[0]
    Out[1]: 0

    In [2]: A[1]
    Out[2]: 0

    In [3]: A[2]
    Out[3]: 0

    In [4]: A[9]
    Out[4]: 0

    In [5]: A[10]
    Out[5]: array([], dtype=uint8)

NOTE that A[10] returns empty array instead of raising IndexError.

So do not forget to apply the "reduce dimensionality" step for empty
views, and this way we get proper IndexError (because for empty view,
scalar access results in IndexError).

NOTE:

this bug was also preventing e.g.

    list(A)

from working, because list(A) internally works this way:

    l = []
    i = iter(A)
    for _ in i:
        l.append(_)

but iteration would not stop after 10 elements - past the array end, _ would
always be array([], dtype=uint8), and thus the loop never finished and
memory usage grew without bound.

/cc @Tyagov
parent 4680c0cd
......@@ -302,39 +302,42 @@ class BigArray(object):
nitems0 = (idx0_stop - idx0_start - sign(idx0_stride)) // idx0_stride + 1
#print('nitem0:\t', nitems0)
# if major row is "empty" slice, we can return right away without creating vma.
# if major row is "empty" slice, we can build view right away without creating vma.
# e.g. 10:5:1, 5:10:-1, 5:5, size+100:size+200 -> []
if nitems0 <= 0:
return ndarray((0,) + self._shape[1:], self._dtype)
view = ndarray((0,) + self._shape[1:], self._dtype)
# major slice -> in bytes
byte0_start = idx0_start * stride0
byte0_stop = idx0_stop * stride0
byte0_stride = idx0_stride * stride0
# create appropriate vma and ndarray view to it
else:
# major slice -> in bytes
byte0_start = idx0_start * stride0
byte0_stop = idx0_stop * stride0
byte0_stride = idx0_stride * stride0
# major slice -> in file pages, always increasing, inclusive
page0_min = min(byte0_start, byte0_stop+byte0_stride) // pagesize # TODO -> fileh.pagesize
page0_max = max(byte0_stop-byte0_stride, byte0_start) // pagesize # TODO -> fileh.pagesize
# major slice -> in file pages, always increasing, inclusive
page0_min = min(byte0_start, byte0_stop+byte0_stride) // pagesize # TODO -> fileh.pagesize
page0_max = max(byte0_stop-byte0_stride, byte0_start) // pagesize # TODO -> fileh.pagesize
# ~~~ mmap file part corresponding to full major slice into memory
vma0 = self._fileh.mmap(page0_min, page0_max-page0_min+1)
# ~~~ mmap file part corresponding to full major slice into memory
vma0 = self._fileh.mmap(page0_min, page0_max-page0_min+1)
# first get ndarray view with only major slice specified and rest indices being ":"
view0_shape = (nitems0,) + self._shape[1:]
view0_offset = byte0_start - page0_min * pagesize # TODO -> fileh.pagesize
view0_stridev = (byte0_stride,) + self._stridev[1:]
#print('view0_shape:\t', view0_shape, self.shape)
#print('view0_offset:\t', view0_offset)
#print('len(vma0):\t', len(vma0))
view0 = ndarray(view0_shape, self._dtype, vma0, view0_offset, view0_stridev)
# first get ndarray view with only major slice specified and rest indices being ":"
view0_shape = (nitems0,) + self._shape[1:]
view0_offset = byte0_start - page0_min * pagesize # TODO -> fileh.pagesize
view0_stridev = (byte0_stride,) + self._stridev[1:]
#print('view0_shape:\t', view0_shape, self.shape)
#print('view0_offset:\t', view0_offset)
#print('len(vma0):\t', len(vma0))
view0 = ndarray(view0_shape, self._dtype, vma0, view0_offset, view0_stridev)
# now take into account indices after major one
view = view0[(slice(None),) + tuple(idx[1:])]
# now take into account indices after major one
view = view0[(slice(None),) + tuple(idx[1:])]
#print('view0:\t', view0.shape)
#print('view:\t', view.shape)
#print('view0:\t', view0.shape)
#print('view:\t', view.shape)
#print('View:\t', view)
#print('view/d:\t', view[dim_adjust])
......
......@@ -111,6 +111,18 @@ def test_bigarray_indexing_1d():
assert array_equal(A_[[0,1,2,3,4]], [0,2,4,6,8])
raises (TypeError, 'A[[0,1,2,3,4]]')
# index out of range
# - element access -> raises IndexError
# - slice access -> empty
assert AA[10*PS-1] == (0,0)
raises(IndexError, 'A_[10*PS]')
raises(IndexError, 'A [10*PS]')
a, _ = AA[10*PS:10*PS+1]
assert isinstance(a, ndarray)
assert array_equal(a, _)
assert a.dtype == dtype(uint8)
assert a.shape == (0,)
# "empty" slices
assert A[10:5:1] .size == 0
......@@ -288,15 +300,45 @@ def test_bigarray_indexing_Nd():
A = BigArray(shape, uint32, fh) # bigarray with test data and shape
A_ = data[:multiply.reduce(shape)].reshape(shape) # ndarray ----//----
# AA[key] -> A[key], A_[key]
AA = DoubleGet(A, A_)
# now just go over combinations of various slice at each dimension, and see
# whether slicing result is the same ndarray would do.
for idx in idx_to_test(shape):
a = A [idx]
a_ = A_[idx]
a, a_ = AA[idx]
assert array_equal(a, a_)
# any part of index out of range
# - element access -> raises IndexError
# - slice access -> empty
for idxpos in range(len(shape)):
idx = [0]*len(shape)
# idx -> tuple(idx)
# ( list would mean advanced indexing - not what we want )
idxt = lambda : tuple(idx)
# valid access element access
idx[idxpos] = shape[idxpos] - 1 # 0, 0, 0, Ni-1, 0 ,0, 0
a, a_ = AA[idxt()]
assert array_equal(a, a_)
# out-of-range element access
idx[idxpos] = shape[idxpos] # 0, 0, 0, Ni , 0 ,0, 0
raises(IndexError, 'A [idxt()]')
raises(IndexError, 'A_[idxt()]')
# out-of-range slice access
idx[idxpos] = slice(shape[idxpos], # 0, 0, 0, Ni:Ni+1 , 0 ,0, 0
shape[idxpos]+1)
a, a_ = AA[idxt()]
assert array_equal(a, a_)
assert a .size == 0
assert a_.size == 0
# TODO ... -> expanded (0,1,2,negative), rejected if many
# TODO newaxis
# TODO nidx < len(shape)
......@@ -368,3 +410,15 @@ def test_bigarray_resize():
# map it whole again and ensure we have correct data
c = A[:]
assert array_equal(c.ravel(), arange(11*3, dtype=uint32))
def test_bigarray_list():
    """Check that list(BigArray) terminates and returns all 10 elements."""
    # 10-element uint8 array on top of a BigFile_Zero file handle
    # (presumably an all-zeros file - the expected data below is all 0)
    zfile = BigFile_Zero(PS)
    zfileh = zfile.fileh_open()
    arr = BigArray((10,), uint8, zfileh)

    # list() keeps indexing the array until an out-of-bound scalar access
    # raises IndexError; without that IndexError the iteration would never
    # stop. Inefficient, but it has to work.
    items = list(arr)
    expected = [0]*10
    assert isinstance(items, list)
    assert items == expected
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment