R
Rodrigo Daunaravicius
I'm having trouble understanding hotshot's results. It's the first time I
use a profiler.
I'm trying to profile a loop that takes almost six hours (~21000 seconds)
to execute in production. The profiler stats, however, give me a total of
16.903 CPU seconds (that's ~17 seconds, right?). The results *are*
relevant, as they helped me to pull execution time down from the original
80 hours, but I wonder if I'm missing something that might help take the
execution time further down.
Follows a simplified version of the code and the real hotshot stats.
Thanks,
Rodrigo
--rodelrod.py
"""Personal toolbox"""
def selectDistinct(list):
"""Get distinct elements from a list"""
[zip]
class LookupTable:
"""Key-acessed table with multiple fields (dictionary with multiple
values)
"""
def __init__(self, *fields):
self.data = {}
self.fields = fields
def __setitem__(self, key, values):
self.data[key] = dict(zip(self.fields, values))
def __getitem__(self, key):
return self.data[key]
[other methods]
--cdr.py
"""From a list of cdrs (call detail records) calculate number of calls to a
given destination and above a given duration threshold
"""
import rodelrod
import hotshot
class DestinationResult:
"""Data structure to store results for each country destination"""
def __init__(self):
self.calls = 0
self.duration = 0
def addcall(self, callduration):
self.calls += 1
self.duration += callduration
[other methods]
# LookupTable {prefix: {'destination': d, 'mincallduration': m}}
destLookup = rodelrod.LookupTable('destination', 'mincallduration')
[among other stuff, populate DestLookup]
profiler = hotshot.Profile('d:/tmp/cdr.prof')
profiler.start()
# Build results (dictionary of DestinationResult() objects)
destinations = rodelrod.selectDistinct([destLookup[x]['destination'] for x
in destLookup])
destinations.sort()
results = dict(zip(destinations, [DestinationResult() for x in
range(len(destinations))]))
# - If destination ANI (cdr[idxDnis]) in destination lookup table,
# add call to respective result object.
# - Iteration must be on a reverse sorted list so that longer (more
specific)
# prefixes are evaluated before shorter (more general) prefixes
lstPrefixes = destLookup.keys()
lstPrefixes.sort()
lstPrefixes.reverse()
# cdrs is a list of tuples (destination phone number, call duration)
cdrs.sort()
cdrs.reverse()
for prefix in lstPrefixes: # len(lstPrefixes) about 80 ~ 200
destination = destLookup[prefix]['destination']
mincallduration = destLookup[prefix]['mincallduration']
addcall = results[destination].addcall #XXX should probably ditch
#this object thing altogether
lenprefix = len(prefix)
for cdr in list(cdrs): # len(cdrs) about 500k ~ 1M
cdrprefix = cdr[0][:lenprefix]
if cdrprefix > prefix: continue
#XXX try and remove these cdr from cdrs right away
# doubt that it will do any good though
elif cdrprefix == prefix:
cdrDuration = cdr[1]
if cdrDuration > mincallduration:
addcall(cdrDuration)
cdrs.remove(cdr) # each cdr must be counted only once
elif cdrprefix < prefix: break
profiler.stop()
--hotshot stats 313303 function calls in 16.903 CPU seconds
Ordered by: internal time, call count
ncalls tottime percall cumtime percall filename:lineno(function)
313042 16.836 0.000 16.836 0.000 cdr.py:34(addcall)
246 0.049 0.000 0.049 0.000 rodelrod.py:44(__getitem__)
6 0.018 0.003 0.018 0.003 cdr.py:31(__repr__)
1 0.000 0.000 0.000 0.000
rodelrod.py:21(selectDistinct)
6 0.000 0.000 0.000 0.000 cdr.py:27(__init__)
1 0.000 0.000 0.000 0.000 rodelrod.py:48(__iter__)
1 0.000 0.000 0.000 0.000 rodelrod.py:52(keys)
0 0.000 0.000 profile:0(profiler)
use a profiler.
I'm trying to profile a loop that takes almost six hours (~21000 seconds)
to execute in production. The profiler stats, however, give me a total of
16.903 CPU seconds (that's ~17 seconds, right?). The results *are*
relevant, as they helped me to pull execution time down from the original
80 hours, but I wonder if I'm missing something that might help take the
execution time further down.
Follows a simplified version of the code and the real hotshot stats.
Thanks,
Rodrigo
--rodelrod.py
"""Personal toolbox"""
def selectDistinct(list):
"""Get distinct elements from a list"""
[zip]
class LookupTable:
"""Key-acessed table with multiple fields (dictionary with multiple
values)
"""
def __init__(self, *fields):
self.data = {}
self.fields = fields
def __setitem__(self, key, values):
self.data[key] = dict(zip(self.fields, values))
def __getitem__(self, key):
return self.data[key]
[other methods]
--cdr.py
"""From a list of cdrs (call detail records) calculate number of calls to a
given destination and above a given duration threshold
"""
import rodelrod
import hotshot
class DestinationResult:
"""Data structure to store results for each country destination"""
def __init__(self):
self.calls = 0
self.duration = 0
def addcall(self, callduration):
self.calls += 1
self.duration += callduration
[other methods]
# LookupTable {prefix: {'destination': d, 'mincallduration': m}}
destLookup = rodelrod.LookupTable('destination', 'mincallduration')
[among other stuff, populate DestLookup]
profiler = hotshot.Profile('d:/tmp/cdr.prof')
profiler.start()
# Build results (dictionary of DestinationResult() objects)
destinations = rodelrod.selectDistinct([destLookup[x]['destination'] for x
in destLookup])
destinations.sort()
results = dict(zip(destinations, [DestinationResult() for x in
range(len(destinations))]))
# - If destination ANI (cdr[idxDnis]) in destination lookup table,
# add call to respective result object.
# - Iteration must be on a reverse sorted list so that longer (more
specific)
# prefixes are evaluated before shorter (more general) prefixes
lstPrefixes = destLookup.keys()
lstPrefixes.sort()
lstPrefixes.reverse()
# cdrs is a list of tuples (destination phone number, call duration)
cdrs.sort()
cdrs.reverse()
for prefix in lstPrefixes: # len(lstPrefixes) about 80 ~ 200
destination = destLookup[prefix]['destination']
mincallduration = destLookup[prefix]['mincallduration']
addcall = results[destination].addcall #XXX should probably ditch
#this object thing altogether
lenprefix = len(prefix)
for cdr in list(cdrs): # len(cdrs) about 500k ~ 1M
cdrprefix = cdr[0][:lenprefix]
if cdrprefix > prefix: continue
#XXX try and remove these cdr from cdrs right away
# doubt that it will do any good though
elif cdrprefix == prefix:
cdrDuration = cdr[1]
if cdrDuration > mincallduration:
addcall(cdrDuration)
cdrs.remove(cdr) # each cdr must be counted only once
elif cdrprefix < prefix: break
profiler.stop()
--hotshot stats 313303 function calls in 16.903 CPU seconds
Ordered by: internal time, call count
ncalls tottime percall cumtime percall filename:lineno(function)
313042 16.836 0.000 16.836 0.000 cdr.py:34(addcall)
246 0.049 0.000 0.049 0.000 rodelrod.py:44(__getitem__)
6 0.018 0.003 0.018 0.003 cdr.py:31(__repr__)
1 0.000 0.000 0.000 0.000
rodelrod.py:21(selectDistinct)
6 0.000 0.000 0.000 0.000 cdr.py:27(__init__)
1 0.000 0.000 0.000 0.000 rodelrod.py:48(__iter__)
1 0.000 0.000 0.000 0.000 rodelrod.py:52(keys)
0 0.000 0.000 profile:0(profiler)