Commit cd432f09 authored by Douglas

performance test improvements

- added reporting of the on-disk size of the array object in ZODB
- fixed the profiling methods
- created a context manager to help time pieces of code
- timed numpy.sum() alongside pandas.DataFrame.sum()
parent f70ea66c
@@ -5,6 +5,7 @@ import pandas as pd
 from numpy import float64, dtype, cumsum, sin, uint8
 from DateTime import DateTime
 from time import time
+from contextlib import contextmanager
 import sys
 import psutil
 import os
@@ -53,6 +54,9 @@ def bigger_than_memory_read(self, out_of_core_index=False):
     columns = range(23)
     columns[13] = 'quantity'
+    ob = root.big_index
+    message_list.append(len(ob._p_jar.db().storage.load(ob._p_oid, '')[0])+42)
     big_index = root.big_index[:] if out_of_core_index else np.arange(360000)
     response = process_data(root.big_array[:], big_index[:], columns)
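The two added lines report the on-disk footprint of the persistent index object: `ob._p_jar.db().storage.load(ob._p_oid, '')` fetches the object's raw pickle record from the ZODB storage, so the length of that pickle (plus what appears to be a fixed 42-byte record overhead) approximates the space the object occupies in the database. A minimal standalone sketch of the same measurement, assuming a plain FileStorage database; the `Account` class and file path are illustrative only:

```python
# Sketch: measure the pickled record size of a persistent object in ZODB.
from ZODB import DB, FileStorage
import persistent
import transaction

class Account(persistent.Persistent):  # hypothetical example class
    def __init__(self, balance):
        self.balance = balance

storage = FileStorage.FileStorage('/tmp/demo.fs')  # illustrative path
db = DB(storage)
conn = db.open()
root = conn.root()
root['account'] = Account(100)
transaction.commit()

ob = root['account']
# storage.load() returns (pickle_bytes, serial); the pickle length is the
# record payload size, excluding per-record storage overhead.
pickle_data, _serial = ob._p_jar.db().storage.load(ob._p_oid, '')
print('record size: %s bytes' % len(pickle_data))
```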
@@ -63,17 +67,25 @@ def bigger_than_memory_read(self, out_of_core_index=False):
     return message_list

 def bigger_than_memory_profile(self):
-    profile_path_template = '/tmp/profile_%s'
     profiles = [bigger_than_memory_read_profile, bigger_than_memory_write_profile]
-    return [profile(self, profile_path_template) for profile in profiles]
+    return [profile(self) for profile in profiles]

-def bigger_than_memory_read_profile(self, profile_path_template):
-    cProfile.runctx('bigger_than_memory_read(self)', globals(), locals(), profile_path_template % 'read')
-    return "To see the profile start a console on server and: import pstats; pstats.Stats('/tmp/profile/read').print_stats()"
+def bigger_than_memory_read_profile(self):
+    profile_path = '/tmp/profile_read'
+    cProfile.runctx('bigger_than_memory_read(self)', globals(), locals(), profile_path)
+    return "To see the profile start a console on server and: import pstats; pstats.Stats('%s').print_stats()" % profile_path

-def bigger_than_memory_write_profile(self, profile_path_template):
-    cProfile.runctx('bigger_than_memory_write(self)', globals(), locals(), profile_path_template % 'write')
-    return "To see the profile start a console on server and: import pstats; pstats.Stats('/tmp/profile/write').print_stats()"
+def bigger_than_memory_write_profile(self):
+    profile_path = '/tmp/profile_write'
+    cProfile.runctx('bigger_than_memory_write(self)', globals(), locals(), profile_path)
+    return "To see the profile start a console on server and: import pstats; pstats.Stats('%s').print_stats()" % profile_path
 
+@contextmanager
+def timer(name, message_list):
+    start_time = time()
+    yield
+    elapsed_time = time() - start_time
+    message_list.append('[%s] finished in %s seconds' % (name, elapsed_time))
+
 def get_process_memory_usage():
     process = psutil.Process(os.getpid())
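The new `timer` context manager measures the wall-clock time of whatever runs inside the `with` block and appends a formatted message to the given list, keeping the timing plumbing out of the measured code. A standalone sketch of how it is used, mirroring the calls added to `process_data` below (the `sleep` body is just a stand-in for real work):

```python
from contextlib import contextmanager
from time import time, sleep

@contextmanager
def timer(name, message_list):
    start_time = time()
    yield  # the body of the `with` block runs here
    elapsed_time = time() - start_time
    message_list.append('[%s] finished in %s seconds' % (name, elapsed_time))

messages = []
with timer('sleep demo', messages):
    sleep(0.1)  # stand-in for the work being timed
print(messages)  # e.g. ['[sleep demo] finished in 0.1003... seconds']
```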
@@ -87,8 +99,9 @@ def get_field_names_and_schema():
         'formats': map(
             np.dtype,
             ['i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8', 'i8',
-             'i8', 'i8', 'i8', 'f8', 'i8', 'i8', 'f8', 'f8', 'f8', 'a50',
-             'a50', 'a50', 'a50']
+             'i8', 'i8', 'i8', 'f8', 'i8', 'i8', 'f8', 'f8', 'f8',
+             'a50', 'a50', 'a50', 'a50']
+            #'f8', 'f8', 'f8', 'f8']
         )
     })
     return field_names, array_schema
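The schema pairs 23 field names with 23 NumPy type codes ('i8' for 64-bit integers, 'f8' for 64-bit floats, 'a50' for 50-byte strings), which together define a structured dtype for the big array. A minimal illustration of the same construction; the field names here are made up:

```python
import numpy as np

# Hypothetical three-column schema using the same type codes as the diff:
# 'i8' = 64-bit int, 'f8' = 64-bit float, 'a50' = 50-byte bytestring.
schema = np.dtype({
    'names': ['id', 'quantity', 'label'],
    'formats': [np.dtype('i8'), np.dtype('f8'), np.dtype('a50')],
})

rows = np.zeros(2, dtype=schema)   # structured array with named fields
rows[0] = (1, 3.5, b'first row')
print(rows['quantity'].sum())      # field access works like a column: 3.5
```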
@@ -105,9 +118,9 @@ def store_arrays(root, *arrays_filenames):

 def populate_array(root, chunk_size, number_of_chunks, big_array, big_index, schema):
     offset = 0
-    start_populate_wendelin = mysql_time = time()
     # ZSQL Method to fetch data from stocks table, chunk by chunk, and put it in
     # the ZBigArray. Implemented using DTML's sqlvar instruction, LIMIT and OFFSET.
+    start_populate_wendelin = mysql_time = time()
     data = root.stock_offset(my_offset=0, my_limit=chunk_size)
     mysql_time = time() - mysql_time
     columns = data._names
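The comment describes the loading strategy: rather than pulling the whole stocks table at once, a ZSQL method is called repeatedly with a LIMIT/OFFSET window, and each window of rows is written into the ZBigArray at the current offset. A hedged sketch of that loop, reusing the `stock_offset` call visible in the diff; the loop bounds and the write step are assumptions, not the commit's actual code:

```python
# Sketch of the chunk-by-chunk population pattern the comment describes.
# root.stock_offset is the ZSQL method from the diff; the rest is assumed.
offset = 0
for chunk in range(number_of_chunks):
    data = root.stock_offset(my_offset=offset, my_limit=chunk_size)
    rows = len(data)
    # Write this window of rows into the out-of-core array at `offset`.
    # The conversion from SQL result rows to the structured dtype is
    # application-specific and omitted here:
    # big_array[offset:offset + rows] = convert(data, schema)
    offset += rows
```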
@@ -138,19 +151,24 @@ def populate_array(root, chunk_size, number_of_chunks, big_array, big_index, schema):
     return {'messages': message_list, 'columns': columns}

 def filter_item(item):
-    if not item or isinstance(item, type(None)):
+    if not item or isinstance(item, type(None)): #or isinstance(item, (str, unicode)):
         return 0
     elif isinstance(item, DateTime):
-        return float(item)
+        return 0
     else:
         return item

 def process_data(big_array, big_index, columns):
     message_list = []
-    start = time()
-    df = pd.DataFrame.from_records(big_array, index=big_index, columns=columns)
-    result = df.quantity.sum()
-    finish = time()
-    message_list.append('%s seconds to read the wendelin array with %s rows.' % (finish-start, len(big_array)))
+    result = None
+    with timer('pandas sum', message_list):
+        df = pd.DataFrame.from_records(big_array, index=big_index)
+        result = df.quantity.sum()
     message_list.append('Total size of out-of-core array: %s megabytes.' % (big_array.nbytes // MB))
+    with timer('numpy sum', message_list):
+        big_array['quantity'].sum()
+    # message_list.append('%s seconds to read the wendelin array with %s rows.' % (finish-start, len(big_array)))
+    # message_list.append('Total size of out-of-core array: %s megabytes.' % (big_array.nbytes // MB))
     return { 'messages': message_list, 'result': result }
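As the last bullet of the commit message says, `process_data` now times the same aggregation twice: once through pandas (building a DataFrame from the structured records, then summing the `quantity` column) and once directly with NumPy on the structured array's `quantity` field, which skips the DataFrame construction cost entirely. A small self-contained comparison of the two paths; the array contents here are made up:

```python
import numpy as np
import pandas as pd
from time import time

# Structured array with a 'quantity' field, standing in for the big array.
schema = np.dtype({'names': ['id', 'quantity'], 'formats': ['i8', 'f8']})
big_array = np.zeros(1000000, dtype=schema)
big_array['quantity'] = np.arange(1000000, dtype='f8')

start = time()
df = pd.DataFrame.from_records(big_array)   # copies data into a DataFrame
pandas_result = df.quantity.sum()
print('pandas: %s (%.3fs)' % (pandas_result, time() - start))

start = time()
numpy_result = big_array['quantity'].sum()  # sums the field directly, no copy
print('numpy:  %s (%.3fs)' % (numpy_result, time() - start))
```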
\ No newline at end of file