воскресенье, 26 апреля 2015 г.

Python Profiling


Здесь рассматриваются методики профилировали в Питоне. 
Это скорее памятка самому себе, но если кому-то пригодится, буду рад!)
Так же доступна интерактивная версия Ipython Notebook.
In [2]:
#  Перед этим pip install line_profiler, memory_profiler
%load_ext line_profiler
%load_ext memory_profiler
In [6]:
from functools import wraps

def timefn(fn):
    @wraps(fn)
    def measure_time(*args, **kwargs):
        t1 = time.time()
        result = fn(*args, **kwargs)
        t2 = time.time()
        print ("@timefn:" + fn.func_name + " took " + str(t2 - t1) + " seconds")
        return result
    return measure_time
In [5]:
#@timefn
#@profile
def calculate_z_serial_purepython(maxiter, zs, cs): 
    """Calculate output list using Julia update rule"""
    
    output = [0] * len(zs)
    for i in xrange(len(zs)):
        n=0
        z = zs[i]
        c = cs[i]
        while abs(z) < 2 and n < maxiter:
            z=z*z+c
            n+=1
        output[i] = n

    return output
In [15]:
import time

x1, x2, y1, y2 = -1.8, 1.8, -1.8, 1.8
c_real, c_imag = -0.62772, -.42193

def calc_pure_python(desired_width, max_iterations): 
    """Create a list of complex coordinates (zs) and complex parameters (cs), build Julia set, and display"""
    
    x_step = (float(x2 - x1) / float(desired_width))
    y_step = (float(y1 - y2) / float(desired_width))
    x=[]
    y=[]
    ycoord = y2
    while ycoord > y1:
        y.append(ycoord)
        ycoord += y_step
    xcoord = x1
    while xcoord < x2:
        x.append(xcoord)
        xcoord += x_step
        
    ##################### SHOW PYTHON HEAP
    from guppy import hpy;
    hp = hpy()
    print "heapy after creating y and x lists of floats"
    h = hp.heap()
    print h
    ######################
    
        
    # Build a list of coordinates and the initial condition for each cell.
    # Note that our initial condition is a constant and could easily be removed; # we use it to simulate a real-world scenario with several inputs to
    # our function.
    zs=[]
    cs=[]
    
    for ycoord in y:
        for xcoord in x:
            zs.append(complex(xcoord, ycoord))
            cs.append(complex(c_real, c_imag))
            
    print "Length of x:", len(x)
    print "Total elements:", len(zs)
    start_time = time.time()
    output = calculate_z_serial_purepython(max_iterations, zs, cs)
    end_time = time.time()
    secs = end_time - start_time
    print calculate_z_serial_purepython.func_name + " took", secs, "seconds"
    # This sum is expected for a 1000^2 grid with 300 iterations. # It catches minor errors we might introduce when we're
    # working on a fixed set of inputs.
    #assert sum(output) == 33219980
    return output
In [6]:
"""
При построении с учетом отттенков серого, цвет меняющийся высоким контрастом дал бы нам представление о том, 
где стоимость функции медленно меняется, а где быстро. На данном рисунке, мы имеем линейный цветовую карту: 
черный дорого вычислять(>300), серый бесплатен (сразу 0) и белый дешевле максимума(0<x<300).
"""


from PIL import Image, ImageDraw

width = height = 1000

image = Image.new("RGBA", (width, height), (0,0,0,0))
pixels = image.load()

output = calc_pure_python(desired_width=height, max_iterations=300)
#print output

i = 0
x, y = 0, 0
for val in output:
    if i and i%height == 0:
        y += 1
        x = 0
        
    if val == 300:
        color = (0, 0, 0)
    elif val == 0:
        color = (125, 125, 125)
    else:
        color = (255, 255, 255)
        
    pixels[x, y] = color
    i += 1
    x += 1
    
image.show()
heapy after creating y and x lists of floats
Partition of a set of 2125520 objects. Total size = 96975808 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0 2000001  94 64000032  66  64000032  66 complex
     1   1228   0 16507040  17  80507072  83 list
     2  56303   3  4946792   5  85453864  88 str
     3  28739   1  2462944   3  87916808  91 tuple
     4   1701   0  1640952   2  89557760  92 dict (no owner)
     5    420   0  1240416   1  90798176  94 dict of module
     6   7150   0   915200   1  91713376  95 types.CodeType
     7   7023   0   842760   1  92556136  95 function
     8    854   0   769040   1  93325176  96 type
     9    854   0   684560   1  94009736  97 dict of type
<540 more rows. Type e.g. '_.more' to view.>

Length of x: 1000
Total elements: 1000000
calculate_z_serial_purepython took 6.69428300858 seconds

Приступим к профайлингу

Производительность

In [7]:
output = calc_pure_python(desired_width=1000, max_iterations=300)
heapy after creating y and x lists of floats
Partition of a set of 2227318 objects. Total size = 107656256 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0 2000001  90 64000032  59  64000032  59 complex
     1   1191   0 24628920  23  88628952  82 list
     2  57069   3  5003896   5  93632848  87 str
     3  29259   1  2510848   2  96143696  89 tuple
     4 103191   5  2476584   2  98620280  92 int
     5   1711   0  1656808   2 100277088  93 dict (no owner)
     6    427   0  1252360   1 101529448  94 dict of module
     7   7263   0   929664   1 102459112  95 types.CodeType
     8   7129   0   855480   1 103314592  96 function
     9    854   0   769040   1 104083632  97 type
<546 more rows. Type e.g. '_.more' to view.>

Length of x: 1000
Total elements: 1000000
calculate_z_serial_purepython took 6.71625804901 seconds
In [41]:
#  in IPython (probably because you’re using %run)
%timeit calc_pure_python(desired_width=1000, max_iterations=300)
Length of x: 1000
Total elements: 1000000
@timefn:calculate_z_serial_purepython took 6.66308379173 seconds
calculate_z_serial_purepython took 6.66349101067 seconds
Length of x: 1000
Total elements: 1000000
@timefn:calculate_z_serial_purepython took 6.68743419647 seconds
calculate_z_serial_purepython took 6.68781113625 seconds
Length of x: 1000
Total elements: 1000000
@timefn:calculate_z_serial_purepython took 6.79939007759 seconds
calculate_z_serial_purepython took 6.79975104332 seconds
Length of x: 1000
Total elements: 1000000
@timefn:calculate_z_serial_purepython took 6.83708381653 seconds
calculate_z_serial_purepython took 6.83745718002 seconds
1 loops, best of 3: 7.24 s per loop
In [43]:
#  cProfile
#  this time includes the overhead of using cProfile
%prun calc_pure_python(desired_width=1000, max_iterations=300)
Length of x: 1000
Total elements: 1000000
@timefn:calculate_z_serial_purepython took 9.6828110218 seconds
calculate_z_serial_purepython took 9.68326115608 seconds
 
In [21]:
#  line_profiler
%lprun -f calculate_z_serial_purepython calc_pure_python(desired_width=1000, max_iterations=300)
UsageError: option -l not recognized ( allowed: "rsf:m:D:T:" )
In [1]:
z = 0+0j
%timeit abs(z) < 2

n=1
maxiter = 300
%timeit n < maxiter

# Тут можно порассуждать, о правильности порядка abs(z) < 2 and n < maxiter или n < maxiter and abs(z) < 2
# мы знаем, что только 10% точек доходит до 300-го вычисления и при этом проверка n < maxiter в 2 раза дешевле
The slowest run took 66.17 times longer than the fastest. This could mean that an intermediate result is being cached 
10000000 loops, best of 3: 93.7 ns per loop
10000000 loops, best of 3: 44 ns per loop

Память

Основной закон этого мира - это "Закон сохранения энергии" Это он мешает создать нам вечный двигтаель и он же создает проблему перетягивания каната: память - скорость
Понимание использования памяти вашего кода позволяет вам задать себе два вопроса: • Можем ли мы использовать меньше памяти, переписав эту функцию, чтобы работать более эффективно? • Можем ли мы использовать больше оперативной памяти и сохранить ресурсы процессора за счет кэширования?
In [14]:
from calc_pure_python import calc_pure_python2
#  NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.
%mprun -f calc_pure_python calc_pure_python2(desired_width=10, max_iterations=300)
Length of x: 11
Total elements: 121
calculate_z_serial_purepython took 0.00995206832886 seconds
('',)
In [15]:
import os
os.path.dirname(os.getcwd())
Out[15]:
'/Users'
In [16]:
z = 0+0j
%memit abs(z) < 2

n=1
maxiter = 300
%memit n < maxiter
peak memory: 22.28 MiB, increment: 0.99 MiB
peak memory: 22.34 MiB, increment: 0.05 MiB

CherryPy server

In [8]:
def launch_memory_usage_server(port=8080): 
    import cherrypy
    import dowser
    cherrypy.tree.mount(dowser.Root())
    cherrypy.config.update({
        'environment': 'embedded',
        'server.socket_port': port
    })
    cherrypy.engine.start()
In [13]:
import time

x1, x2, y1, y2 = -1.8, 1.8, -1.8, 1.8
c_real, c_imag = -0.62772, -.42193

def calc_pure_python_cherry(desired_width, max_iterations): 
    """Create a list of complex coordinates (zs) and complex parameters (cs), build Julia set, and display"""
    
    x_step = (float(x2 - x1) / float(desired_width))
    y_step = (float(y1 - y2) / float(desired_width))
    x=[]
    y=[]
    
    ycoord = y2
    while ycoord > y1:
        y.append(ycoord)
        ycoord += y_step
    
    xcoord = x1
    while xcoord < x2:
        x.append(xcoord)
        xcoord += x_step

        
    # Build a list of coordinates and the initial condition for each cell.
    # Note that our initial condition is a constant and could easily be removed; # we use it to simulate a real-world scenario with several inputs to
    # our function.
    zs=[]
    cs=[]
    
    for ycoord in y:
        for xcoord in x:
            zs.append(complex(xcoord, ycoord))
            cs.append(complex(c_real, c_imag))

    #  тут стартуем сервер
    launch_memory_usage_server()
            
    start_time = time.time()
    output = calculate_z_serial_purepython(max_iterations, zs, cs)
    end_time = time.time()
    secs = end_time - start_time

    #  делаем так, чтобы сервер не потух
    print "now waiting..."
    while True:
        time.sleep(1)
    
    return output
calc_pure_python_cherry(desired_width=1000, max_iterations=300)
INFO:cherrypy.error:[26/Apr/2015:22:31:29] ENGINE Bus STARTING
INFO:cherrypy.error:[26/Apr/2015:22:31:29] ENGINE Monitor thread '_TimeoutMonitor' already started.
INFO:cherrypy.error:[26/Apr/2015:22:31:29] ENGINE Already serving on http://127.0.0.1:8080
INFO:cherrypy.error:[26/Apr/2015:22:31:29] ENGINE Bus STARTED
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-13-8b6b3a917280> in <module>()
     48 
     49     return output
---> 50 calc_pure_python_cherry(desired_width=1000, max_iterations=300)

<ipython-input-13-8b6b3a917280> in calc_pure_python_cherry(desired_width, max_iterations)
     38 
     39     start_time = time.time()
---> 40     output = calculate_z_serial_purepython(max_iterations, zs, cs)
     41     end_time = time.time()
     42     secs = end_time - start_time

<ipython-input-5-0ee9a7f0d062> in calculate_z_serial_purepython(maxiter, zs, cs)
     11         while abs(z) < 2 and n < maxiter:
     12             z=z*z+c
---> 13             n+=1
     14         output[i] = n
     15 

KeyboardInterrupt: 

Byte Code

Первый столбец содержит номера строк исполнения.Вторая колонка содержит символы - >>; это направления для точек перехода, в других местах в коде.Третий столбец адрес операции вместе с именем операции.Четвертый столбец содержит параметры для работы.Пятая колонна содержит аннотации, чтобы помочь выстроить байт-код с оригинальными параметрами Python.
In [16]:
import dis
dis.dis(calc_pure_python)
  9           0 LOAD_GLOBAL              0 (float)
              3 LOAD_GLOBAL              1 (x2)
              6 LOAD_GLOBAL              2 (x1)
              9 BINARY_SUBTRACT     
             10 CALL_FUNCTION            1
             13 LOAD_GLOBAL              0 (float)
             16 LOAD_FAST                0 (desired_width)
             19 CALL_FUNCTION            1
             22 BINARY_DIVIDE       
             23 STORE_FAST               2 (x_step)

 10          26 LOAD_GLOBAL              0 (float)
             29 LOAD_GLOBAL              3 (y1)
             32 LOAD_GLOBAL              4 (y2)
             35 BINARY_SUBTRACT     
             36 CALL_FUNCTION            1
             39 LOAD_GLOBAL              0 (float)
             42 LOAD_FAST                0 (desired_width)
             45 CALL_FUNCTION            1
             48 BINARY_DIVIDE       
             49 STORE_FAST               3 (y_step)

 11          52 BUILD_LIST               0
             55 STORE_FAST               4 (x)

 12          58 BUILD_LIST               0
             61 STORE_FAST               5 (y)

 13          64 LOAD_GLOBAL              4 (y2)
             67 STORE_FAST               6 (ycoord)

 14          70 SETUP_LOOP              39 (to 112)
        >>   73 LOAD_FAST                6 (ycoord)
             76 LOAD_GLOBAL              3 (y1)
             79 COMPARE_OP               4 (>)
             82 POP_JUMP_IF_FALSE      111

 15          85 LOAD_FAST                5 (y)
             88 LOAD_ATTR                5 (append)
             91 LOAD_FAST                6 (ycoord)
             94 CALL_FUNCTION            1
             97 POP_TOP             

 16          98 LOAD_FAST                6 (ycoord)
            101 LOAD_FAST                3 (y_step)
            104 INPLACE_ADD         
            105 STORE_FAST               6 (ycoord)
            108 JUMP_ABSOLUTE           73
        >>  111 POP_BLOCK           

 17     >>  112 LOAD_GLOBAL              2 (x1)
            115 STORE_FAST               7 (xcoord)

 18         118 SETUP_LOOP              39 (to 160)
        >>  121 LOAD_FAST                7 (xcoord)
            124 LOAD_GLOBAL              1 (x2)
            127 COMPARE_OP               0 (<)
            130 POP_JUMP_IF_FALSE      159

 19         133 LOAD_FAST                4 (x)
            136 LOAD_ATTR                5 (append)
            139 LOAD_FAST                7 (xcoord)
            142 CALL_FUNCTION            1
            145 POP_TOP             

 20         146 LOAD_FAST                7 (xcoord)
            149 LOAD_FAST                2 (x_step)
            152 INPLACE_ADD         
            153 STORE_FAST               7 (xcoord)
            156 JUMP_ABSOLUTE          121
        >>  159 POP_BLOCK           

 23     >>  160 LOAD_CONST               1 (-1)
            163 LOAD_CONST               2 (('hpy',))
            166 IMPORT_NAME              6 (guppy)
            169 IMPORT_FROM              7 (hpy)
            172 STORE_FAST               8 (hpy)
            175 POP_TOP             

 24         176 LOAD_FAST                8 (hpy)
            179 CALL_FUNCTION            0
            182 STORE_FAST               9 (hp)

 25         185 LOAD_CONST               3 ('heapy after creating y and x lists of floats')
            188 PRINT_ITEM          
            189 PRINT_NEWLINE       

 26         190 LOAD_FAST                9 (hp)
            193 LOAD_ATTR                8 (heap)
            196 CALL_FUNCTION            0
            199 STORE_FAST              10 (h)

 27         202 LOAD_FAST               10 (h)
            205 PRINT_ITEM          
            206 PRINT_NEWLINE       

 34         207 BUILD_LIST               0
            210 STORE_FAST              11 (zs)

 35         213 BUILD_LIST               0
            216 STORE_FAST              12 (cs)

 37         219 SETUP_LOOP              75 (to 297)
            222 LOAD_FAST                5 (y)
            225 GET_ITER            
        >>  226 FOR_ITER                67 (to 296)
            229 STORE_FAST               6 (ycoord)

 38         232 SETUP_LOOP              58 (to 293)
            235 LOAD_FAST                4 (x)
            238 GET_ITER            
        >>  239 FOR_ITER                50 (to 292)
            242 STORE_FAST               7 (xcoord)

 39         245 LOAD_FAST               11 (zs)
            248 LOAD_ATTR                5 (append)
            251 LOAD_GLOBAL              9 (complex)
            254 LOAD_FAST                7 (xcoord)
            257 LOAD_FAST                6 (ycoord)
            260 CALL_FUNCTION            2
            263 CALL_FUNCTION            1
            266 POP_TOP             

 40         267 LOAD_FAST               12 (cs)
            270 LOAD_ATTR                5 (append)
            273 LOAD_GLOBAL              9 (complex)
            276 LOAD_GLOBAL             10 (c_real)
            279 LOAD_GLOBAL             11 (c_imag)
            282 CALL_FUNCTION            2
            285 CALL_FUNCTION            1
            288 POP_TOP             
            289 JUMP_ABSOLUTE          239
        >>  292 POP_BLOCK           
        >>  293 JUMP_ABSOLUTE          226
        >>  296 POP_BLOCK           

 42     >>  297 LOAD_CONST               4 ('Length of x:')
            300 PRINT_ITEM          
            301 LOAD_GLOBAL             12 (len)
            304 LOAD_FAST                4 (x)
            307 CALL_FUNCTION            1
            310 PRINT_ITEM          
            311 PRINT_NEWLINE       

 43         312 LOAD_CONST               5 ('Total elements:')
            315 PRINT_ITEM          
            316 LOAD_GLOBAL             12 (len)
            319 LOAD_FAST               11 (zs)
            322 CALL_FUNCTION            1
            325 PRINT_ITEM          
            326 PRINT_NEWLINE       

 44         327 LOAD_GLOBAL             13 (time)
            330 LOAD_ATTR               13 (time)
            333 CALL_FUNCTION            0
            336 STORE_FAST              13 (start_time)

 45         339 LOAD_GLOBAL             14 (calculate_z_serial_purepython)
            342 LOAD_FAST                1 (max_iterations)
            345 LOAD_FAST               11 (zs)
            348 LOAD_FAST               12 (cs)
            351 CALL_FUNCTION            3
            354 STORE_FAST              14 (output)

 46         357 LOAD_GLOBAL             13 (time)
            360 LOAD_ATTR               13 (time)
            363 CALL_FUNCTION            0
            366 STORE_FAST              15 (end_time)

 47         369 LOAD_FAST               15 (end_time)
            372 LOAD_FAST               13 (start_time)
            375 BINARY_SUBTRACT     
            376 STORE_FAST              16 (secs)

 48         379 LOAD_GLOBAL             14 (calculate_z_serial_purepython)
            382 LOAD_ATTR               15 (func_name)
            385 LOAD_CONST               6 (' took')
            388 BINARY_ADD          
            389 PRINT_ITEM          
            390 LOAD_FAST               16 (secs)
            393 PRINT_ITEM          
            394 LOAD_CONST               7 ('seconds')
            397 PRINT_ITEM          
            398 PRINT_NEWLINE       

 52         399 LOAD_FAST               14 (output)
            402 RETURN_VALUE        
Простое правило гласит, что несколько строк байт-кода будет выполняться более медленно, чем меньшим количеством эквивалентных линий байт-кода, использующих встроенные функции.