Commit cd432f09 authored by Douglas

performance test improvements

- added the disk size of the array object in zodb
- fixed profiling methods
- created a context manager to help time pieces of code
- timing numpy.sum() along with pandas.DataFrame.sum()
parent f70ea66c
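The timing helper added below follows the standard @contextmanager pattern: code before the yield runs on entry, the with-block body runs at the yield point, and code after the yield runs on exit. A minimal standalone sketch of the pattern (the example workload is illustrative):

    from contextlib import contextmanager
    from time import time

    @contextmanager
    def timer(name, message_list):
        start_time = time()
        yield  # the with-block body executes here
        message_list.append('[%s] finished in %s seconds' % (name, time() - start_time))

    messages = []
    with timer('example', messages):
        sum(range(10 ** 6))  # any piece of code to time
    print(messages[0])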
@@ -5,6 +5,7 @@ import pandas as pd
from numpy import float64, dtype, cumsum, sin, uint8
from DateTime import DateTime
from time import time
from contextlib import contextmanager
import sys
import psutil
import os
@@ -53,6 +54,9 @@ def bigger_than_memory_read(self, out_of_core_index=False):
columns = range(23)
columns[13] = 'quantity'
ob = root.big_index
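# storage.load() returns (pickle, serial); len(pickle) is the record's on-disk
# size in bytes (the +42 below presumably covers fixed per-record overhead)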
message_list.append(len(ob._p_jar.db().storage.load(ob._p_oid, '')[0])+42)
big_index = root.big_index[:] if out_of_core_index else np.arange(360000)
response = process_data(root.big_array[:], big_index[:], columns)
@@ -63,17 +67,25 @@ def bigger_than_memory_read(self, out_of_core_index=False):
return message_list
def bigger_than_memory_profile(self):
profile_path_template = '/tmp/profile_%s'
profiles = [bigger_than_memory_read_profile, bigger_than_memory_write_profile]
return [profile(self, profile_path_template) for profile in profiles]
return [profile(self) for profile in profiles]
def bigger_than_memory_read_profile(self):
profile_path = '/tmp/profile_read'
cProfile.runctx('bigger_than_memory_read(self)', globals(), locals(), profile_path)
return "To see the profile start a console on server and: import pstats; pstats.Stats('%s').print_stats()" % profile_path
def bigger_than_memory_read_profile(self, profile_path_template):
cProfile.runctx('bigger_than_memory_read(self)', globals(), locals(), profile_path_template % 'read')
return "To see the profile start a console on server and: import pstats; pstats.Stats('/tmp/profile/read').print_stats()"
def bigger_than_memory_write_profile(self):
profile_path = '/tmp/profile_write'
cProfile.runctx('bigger_than_memory_write(self)', globals(), locals(), profile_path)
return "To see the profile start a console on server and: import pstats; pstats.Stats('%s').print_stats()" % profile_path
def bigger_than_memory_write_profile(self, profile_path_template):
cProfile.runctx('bigger_than_memory_write(self)', globals(), locals(), profile_path_template % 'write')
return "To see the profile start a console on server and: import pstats; pstats.Stats('/tmp/profile/write').print_stats()"
@contextmanager
def timer(name, message_list):
start_time = time()
yield
elapsed_time = time() - start_time
message_list.append('[%s] finished in %s seconds' % (name, elapsed_time))
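# usage: any block wrapped in `with timer(name, message_list):` gets its
# wall-clock duration appended to message_list, e.g.
#     with timer('numpy sum', message_list):
#         big_array['quantity'].sum()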
def get_process_memory_usage():
process = psutil.Process(os.getpid())
@@ -87,8 +99,9 @@ def get_field_names_and_schema():
'formats': map(
np.dtype,
['i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8',
'i8', 'i8', 'i8', 'f8', 'i8', 'i8', 'f8', 'f8', 'f8', 'a50',
'a50', 'a50', 'a50']
'i8', 'i8', 'i8', 'f8', 'i8', 'i8', 'f8', 'f8', 'f8',
'a50', 'a50', 'a50', 'a50']
#'f8', 'f8', 'f8', 'f8']
)
})
return field_names, array_schema
@@ -105,9 +118,9 @@ def store_arrays(root, *arrays_filenames):
def populate_array(root, chunk_size, number_of_chunks, big_array, big_index, schema):
offset = 0
start_populate_wendelin = mysql_time = time()
# ZSQL Method to fetch data from stocks table, chunk by chunk, and put it in
# the ZBigArray. Implemented using DTML's sqlvar instruction, LIMIT and OFFSET.
start_populate_wendelin = mysql_time = time()
data = root.stock_offset(my_offset=0, my_limit=chunk_size)
mysql_time = time() - mysql_time
columns = data._names
@@ -138,19 +151,24 @@ def populate_array(root, chunk_size, number_of_chunks, big_array, big_index, schema):
return {'messages': message_list, 'columns': columns}
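# filter_item normalizes raw SQL cells before they enter the typed array:
# NULL/falsy values become 0, DateTime objects become float timestamps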
def filter_item(item):
if not item or isinstance(item, type(None)):
if not item or isinstance(item, type(None)): #or isinstance(item, (str, unicode)):
return 0
elif isinstance(item, DateTime):
return float(item)
return 0
else:
return item
def process_data(big_array, big_index, columns):
message_list = []
start = time()
df = pd.DataFrame.from_records(big_array, index=big_index, columns=columns)
result = None
with timer('pandas sum', message_list):
df = pd.DataFrame.from_records(big_array, index=big_index)
result = df.quantity.sum()
finish = time()
message_list.append('%s seconds to read the wendelin array with %s rows.' % (finish-start, len(big_array)))
message_list.append('Total size of out-of-core array: %s megabytes.' % (big_array.nbytes // MB))
with timer('numpy sum', message_list):
big_array['quantity'].sum()
# message_list.append('%s seconds to read the wendelin array with %s rows.' % (finish-start, len(big_array)))
# message_list.append('Total size of out-of-core array: %s megabytes.' % (big_array.nbytes // MB))
return { 'messages': message_list, 'result': result }
\ No newline at end of file
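For reference, the pandas-versus-numpy comparison the new timer blocks perform can be reproduced outside Zope with a synthetic structured array (a sketch; `demo` and its size are illustrative stand-ins for the ZBigArray):

    from time import time
    import numpy as np
    import pandas as pd

    demo = np.zeros(1000000, dtype=[('quantity', 'f8')])  # stand-in for big_array
    df = pd.DataFrame.from_records(demo)

    start = time()
    df.quantity.sum()          # pandas path: sums the 'quantity' column Series
    print('pandas sum: %s seconds' % (time() - start))

    start = time()
    demo['quantity'].sum()     # numpy path: sums the raw field without a DataFrame
    print('numpy sum: %s seconds' % (time() - start))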