Improved test tool selection and ZBlk format handling, plus refactoring

- There are now parameters to choose how the tests are run: wendelin+pandas, wendelin+numpy, numpy in memory, or numpy with memmaps.
- Fixed the selection of the ZBlk format.
- Test moved to performance_test_server.py (performance_test_client to be added next).

Improved test tool selection and ZBlk format handling, plus refactoring
- There are now parameters to choose how the tests are run: wendelin+pandas, wendelin+numpy, numpy in memory, or numpy with memmaps.
- Fixed the selection of the ZBlk format.
- Test moved to performance_test_server.py (performance_test_client to be added next).
81b8a103 · Douglas · 88bcb9a1 · 81b8a103
Commit 81b8a103 authored Oct 14, 2015 by Douglas
Hide whitespace changes
Inline Side-by-side

Showing with 70 additions and 43 deletions

performance_test_server.py performance_test_server.py +70 -43

No files found.
--- a/performance_test.py
+++ b/performance_test.py
@@ -13,12 +13,6 @@ import cProfile
 import pstats


-def bigger_than_memory(self):
-  message_list = []
-  message_list.extend(bigger_than_memory_write(self))
-  message_list.extend(bigger_than_memory_read(self))
-  return message_list
-  
 def bigger_than_memory_write(self, out_of_core_index=False, zblk_format='0', **kwargs):
  message_list = []
  message_list.append('Write start: ' + get_process_memory_usage())
@@ -28,6 +22,7 @@ def bigger_than_memory_write(self, out_of_core_index=False, zblk_format='0', **k
                
  array_memory_size = 251*MB
  array_number_of_items = array_memory_size // array_schema.itemsize
+  message_list.append(set_zblock_format(zblk_format))
  
  big_array = create_zbigarray(memory_size=array_memory_size, dtype=array_schema)

@@ -40,18 +35,22 @@ def bigger_than_memory_write(self, out_of_core_index=False, zblk_format='0', **k
      big_index = np.arange(array_number_of_items)
      big_array = store_arrays(root, [big_array, 'big_array'])[0]
  
-  message_list.append(set_zblock_format(big_array, zblk_format))
-  check_zblock_format(big_array, zblk_format)
+  check_zblock_format(zblk_format)
  
  message_list.append('Populating array with %s rows.' % array_number_of_items)
  response = populate_array(self, big_array, big_index, array_schema)
  message_list.extend(response['messages'])
-
+  
+  for message in response['messages']:
+    if 'ZBigArray' in message:
+      return json.dumps({'messages': message_list, 'result': message})
  message_list.append('Write end: ' + get_process_memory_usage())
-  return message_list
+  return json.dumps(message_list)

-def bigger_than_memory_read(self, out_of_core_index=False, **kwargs):
+def bigger_than_memory_read(self, out_of_core_index=False, tool='wendelin.numpy'):
+  
  message_list = []
+  message_list.append('Reading with format %s' % tool)
  message_list.append('Read start ' + get_process_memory_usage())
  
  root = self.getPhysicalRoot()
@@ -67,13 +66,19 @@ def bigger_than_memory_read(self, out_of_core_index=False, **kwargs):
  number_of_items = len(root.big_array)
  item_size = root.big_array.dtype.itemsize
  message_list.append('Processing %s items with %s bytes each' % (number_of_items, item_size))
+  message_list.append(len(root.big_array._p_jar.db().storage.load(root.big_array._p_oid, '')[0])+42)
  
  big_index = root.big_index[:] if out_of_core_index else np.arange(number_of_items)
  
-  messages = process_data(root.big_array[:], big_index[:], columns)
+  messages = process_data(self, root.big_array, big_index, columns, tool)
  message_list.extend(messages)
  message_list.append('Read end: ' + get_process_memory_usage())
-  return message_list
+  
+  for message in messages:
+    for tool in ['wendelin.pandas', 'wendelin.numpy', 'numpy.memory', 'numpy.memmap']:
+      if tool in message:
+        return json.dumps({'messages': message_list, 'result': message})
+  return json.dumps(message_list)

 @contextmanager
 def timer(name, message_list):
@@ -103,8 +108,8 @@ def get_field_names_and_schema():
      np.dtype,
      ['i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8',
        'i8', 'i8', 'i8', 'f8', 'i8', 'i8', 'f8', 'f8', 'f8',
-        'a50', 'a50', 'a50', 'a50']
-        #'f8', 'f8', 'f8', 'f8']
+        # 'a50', 'a50', 'a50', 'a50']
+        'f8', 'f8', 'f8', 'f8']
    )
  })
  return field_names, array_schema
@@ -117,20 +122,17 @@ def create_zbigarray(memory_size=None, dtype=None):
  # else:
    # raise Exception('You need to provide memory_size and dtype, %s - %s' % (memory_size, dtype))
    
-def set_zblock_format(array, zblk_format):
+def set_zblock_format(zblk_format):
  '''Set the array to write with the provide zblock format.
  Formats are:
-    - 0 (low-overhead access time and high-overhead storage size)
+    - 0 (low-overhead access timed high-overhead storage size)
    - 1 (low overhead storage size and high-overhead access time)
  '''
-  if zblk_format in ('0', '1'):
-    array._fileh.ZBlk_fmt_write = zblk_format_string = 'ZBlk%s' % zblk_format
-    return 'Writting with format: %s' % zblk_format_string
-  else:
-    raise 'Unknown zblk_format provided. Choose between 0 and 1. See docs strings for more information.'
+  file_zodb.ZBlk_fmt_write = zblk_format_string = 'ZBlk%s' % zblk_format
+  return 'Writting with format: %s' % zblk_format_string

-def check_zblock_format(array, zblk_format):
-  if not array._fileh.ZBlk_fmt_write == 'ZBlk%s' % zblk_format:
+def check_zblock_format(zblk_format):
+  if not file_zodb.ZBlk_fmt_write == 'ZBlk%s' % zblk_format:
    raise 'Zblk format didnt match'
  else:
    return True
@@ -151,7 +153,7 @@ def populate_array(root, big_array, big_index, schema):
  with timer('Time to prepare data for assignment', message_list):
    data = root.stock_offset(my_offset=0, my_limit=1)
    columns = data._names
-    row = tuple([filter_item(item, normalize=False) for item in data.tuples()[0]])
+    row = tuple([filter_item(item, normalize=True) for item in data.tuples()[0]])
  
  max_memory_usage =  100*MB
  message_list.append('%s megabytes of memory can be used per chunk.' % (max_memory_usage//MB))
@@ -166,11 +168,9 @@ def populate_array(root, big_array, big_index, schema):
    for i in range(chunks):
      big_array[chunk_begin:chunksize*(i+1)] = row
      chunk_begin = chunksize + 1
-      with timer('Commit each ZBigArray chunk', message_list):
-        transaction.commit()
+      transaction.commit()
  return {'messages': message_list, 'columns': columns}

-# method to cast Zope's DateTime objects, falsy-values and strings to floats
 def filter_item(item, normalize=True):
  '''Typecast item to numeric values if it is a: DateTime or falsy to help
  pandas/numpy deal with them. If normalize is True it will typecast strings
@@ -185,26 +185,53 @@ def filter_item(item, normalize=True):
  else:
    return item

-def process_data(big_array, big_index, columns):
+def process_data(root, big_array, big_index, columns, tool):
  '''Process all data in big_array. Currently, does a complete sum of the
  quantity column both with wendein.core/numpy and pandas.
  '''
  message_list = []
  result = None
+
+  if tool == 'wendelin.pandas':
+    try:
+      with timer(tool, message_list):
+        df = pd.DataFrame.from_records(big_array[:], index=big_index)
+        result = df.quantity.sum()
+      if result:
+        message_list.append('Pandas result: %s' % result)
+    except MemoryError:
+      message_list.append('MemoryError while creating pandas.Dataframe!')
+  
+  if tool == 'wendelin.numpy':
+    with timer(tool, message_list):
+      result = big_array[:]['quantity'].sum()
+    message_list.append('NumPy result: %s' % result)
+  
+  # common setup for in memory and memmap
+  if tool == 'numpy.memory' or tool == 'numpy.memmap':
+    data = root.stock_offset(my_offset=0, my_limit=1)
+    columns = data._names
+    row = tuple([filter_item(item, normalize=True) for item in data.tuples()[0]])
+    _, schema = get_field_names_and_schema()
+    array = np.ndarray((1430394,), schema)
+    array[:] = row
  
-  # When benchmarking, only runs either pandas or numpy, one process creates
-  # a cache for the other and this gives deceiving results.
-  # TODO: add parameter to choose wether numpy or pandas should be used
-  try:
-    with timer('pandas sum', message_list):
-      df = pd.DataFrame.from_records(big_array, index=big_index)
-      result = df.quantity.sum()
-    message_list.append('Pandas result: %s' % result) if result else None
-  except MemoryError:
-    message_list.append('MemoryError while creating pandas.Dataframe!')
-  
-  with timer('numpy sum', message_list):
-    result = big_array['quantity'].sum()
-  message_list.append('NumPy result: %s' % result)
+  if tool == 'numpy.memory':
+    with timer(tool, message_list):
+      result = array['quantity'].sum()
+    message_list.append('numpy in memory result: %s' % result)
+
+  if tool == 'numpy.memmap':
+    import os.path as path
+    filename = path.join('/tmp', 'numpy.dat')
+    write_fp = np.memmap(filename, dtype=schema, mode='w+', shape=(1430394,))
+    with timer('time to write numpy memmap', message_list):
+      write_fp[:] = row
+      write_fp.flush()
+    with timer(tool, message_list):
+      read_fp = np.memmap(filename, dtype=schema, mode='r', shape=(1430394,))
+      array = np.ndarray((1430394,), schema, buffer=read_fp)
+      result = array['quantity'].sum()
+    message_list.append('numpy memmap result: %s' % result)

  return message_list
\ No newline at end of file