Skip to content

Commit

Permalink
Merge pull request #5 from nouiz/fft2
Browse files Browse the repository at this point in the history
Theano fft experimental version
  • Loading branch information
soumith committed Jul 29, 2014
2 parents 68789f2 + d82aa91 commit 7044171
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 105 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
convnet-benchmarks
==================

**Work in progress.** We are missing many benchmarks. Don't use this to
draw conclusions about any of the software! Many packages have multiple
convolution implementations and we don't yet benchmark all of them.
Also, they will probably be updated to use the faster versions shortly,
once we have some benchmark results!

Easy benchmarking of all public open-source implementations of convnets.
A summary is provided in the section below.

Expand Down
226 changes: 121 additions & 105 deletions theano/pylearn2_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,121 +12,137 @@
from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
from theano.sandbox.cuda.basic_ops import gpu_contiguous

# Benchmark loop parameters.
steps = 4  # nb of steps in loop to average perf
ops = 2  # ops per point (one multiply + one accumulate per kernel tap)
# Convolution configurations to benchmark. Per entry:
#   ni/no: input/output channels, kw/kh: kernel width/height,
#   iw/ih: input width/height, bs: batch size, dw/dh: stride.
runs = [
    {
        'ni': 3,
        'no': 96,
        'kw': 11,
        'kh': 11,
        'iw': 128,
        'ih': 128,
        'bs': 128,
        'dw': 1,
        'dh': 1,
    },
    {
        'ni': 64,
        'no': 128,
        'kw': 9,
        'kh': 9,
        'iw': 64,
        'ih': 64,
        'bs': 128,
        'dw': 1,
        'dh': 1,
    },
    {
        'ni': 128,
        'no': 128,
        'kw': 9,
        'kh': 9,
        'iw': 32,
        'ih': 32,
        'bs': 128,
        'dw': 1,
        'dh': 1,
    },
    {
        'ni': 128,
        'no': 128,
        'kw': 7,
        'kh': 7,
        'iw': 16,
        'ih': 16,
        'bs': 128,
        'dw': 1,
        'dh': 1,
    }
]

for i in range(4):
run = runs[i]
# params for run:
ni,no,kw,kh,bs,iw,ih,dw,dh = run['ni'],run['no'],run['kw'],run['kh'],run['bs'],run['iw'],run['ih'],run['dw'],run['dh']
print ''
print 'CONFIG: input =',ni,'x',iw,'x',ih,'* ker =',ni,'x',no,'x',kw,'x',kh,'( bs =',bs,', stride =',dw,')'

conv = MLP(
batch_size=bs,
input_space=Conv2DSpace((ih,iw), num_channels=ni, axes=('b', 'c', 0, 1)),
layers=[ConvElemwise(no,(kw,kh),'ConvTest',ConvNonlinearity(),irange=0.1)]
)

inputBatch = np.random.randn(bs, ni, ih, iw)
sharedX = theano.shared(inputBatch.astype('float32'))
sharedY = theano.shared(np.random.randn(bs, no, (ih-kh)/dh+1, (iw-kw)/dw+1).astype('float32'))

X = theano.tensor.tensor4()

Y=conv.fprop(X)

fprop = theano.function([],[],givens=[(X,sharedX)],updates=[(sharedY,Y)],on_unused_input='ignore')

theano.sandbox.cuda.synchronize()
start = time.time()
for i in range(steps):
fprop()
theano.sandbox.cuda.synchronize()
tm = (time.time()-start)/steps

del fprop
del sharedX
del conv
del sharedY

print 'pylearn2.models.mlp.ConvElemwise:', (ni*no*kw*kh*(iw-kw+1)*(ih-kh+1) /dw/dh * bs * ops / tm / 1e9) , 'GFLOP/s ( tm =', tm, ')'

### pylearn2 work-around for using cuda-convnet (http://benanne.github.io/2014/04/03/faster-convolutions-in-theano.html) ###

#(channels, rows, columns, batch_size)
inputBatch = np.random.randn(ni, ih, iw, bs)
sharedX = theano.shared(inputBatch.astype('float32'))
sharedY = theano.shared(np.random.randn(no, (ih-kh)/dh+1, (iw-kw)/dw+1, bs).astype('float32'))
# (channels, rows, columns, number of filters)
sharedW = theano.shared(np.random.randn(ni, kh, kw, no).astype('float32'))

conv_op = FilterActs()
contiguous_input = gpu_contiguous(sharedX)
contiguous_filters = gpu_contiguous(sharedW)
Y = conv_op(contiguous_input, contiguous_filters)

fprop = theano.function([],[],givens=[(X,sharedX)],updates=[(sharedY,Y)],on_unused_input='ignore')

theano.sandbox.cuda.synchronize()
start = time.time()
for i in range(steps):
fprop()
theano.sandbox.cuda.synchronize()
tm = (time.time()-start)/steps

del fprop
del sharedX
del conv_op
del sharedY
del sharedW

print ' pylearn2.sandbox.cuda_convnet:', (ni*no*kw*kh*(iw-kw+1)*(ih-kh+1) /dw/dh * bs * ops / tm / 1e9) , 'GFLOP/s ( tm =', tm, ')'
run = runs[i]
# params for run:
ni, no, kw, kh, bs, iw, ih, dw, dh = run['ni'], run['no'], run['kw'], run['kh'], run['bs'], run['iw'], run['ih'], run['dw'], run['dh']
print ''
print 'CONFIG: input =', ni, 'x', iw, 'x', ih, '* ker =', ni, 'x', no, 'x', kw, 'x', kh, '( bs =', bs, ', stride =', dw, ')'

conv = MLP(
batch_size=bs,
input_space=Conv2DSpace((ih, iw), num_channels=ni, axes=('b', 'c', 0, 1)),
layers=[ConvElemwise(no, (kw, kh), 'ConvTest', ConvNonlinearity(), irange=0.1)]
)

inputBatch = np.random.randn(bs, ni, ih, iw)
sharedX = theano.shared(inputBatch.astype('float32'))
sharedY = theano.shared(np.random.randn(bs, no, (ih-kh)/dh+1, (iw-kw)/dw+1).astype('float32'))

X = theano.tensor.tensor4()

Y = conv.fprop(X)

fprop = theano.function([], [], givens=[(X, sharedX)], updates=[(sharedY, Y)], on_unused_input='ignore')

theano.sandbox.cuda.synchronize()
start = time.time()
for i in range(steps):
fprop()
theano.sandbox.cuda.synchronize()
tm = (time.time()-start)/steps

print 'pylearn2.models.mlp.ConvElemwise:', (ni*no*kw*kh*(iw-kw+1)*(ih-kh+1) / dw/dh * bs * ops / tm / 1e9), 'GFLOP/s ( tm =', tm, ')'

# Mimic Theano flag THEANO_FLAGS=optimizer_including=conv_fft_valid:conv_fft_full
mode = theano.compile.get_default_mode()
mode = mode.including('conv_fft_valid', 'conv_fft_full')
fprop = theano.function([], [], givens=[(X, sharedX)],
updates=[(sharedY, Y)],
on_unused_input='ignore', mode=mode)

theano.sandbox.cuda.synchronize()
start = time.time()
for i in range(steps):
fprop()
theano.sandbox.cuda.synchronize()
tm = (time.time()-start)/steps

del fprop
del sharedX
del conv
del sharedY

print '(fft experimental) pylearn2.models.mlp.ConvElemwise:', (ni*no*kw*kh*(iw-kw+1)*(ih-kh+1) / dw/dh * bs * ops / tm / 1e9), 'GFLOP/s ( tm =', tm, ')'

### pylearn2 work-around for using cuda-convnet (http://benanne.github.io/2014/04/03/faster-convolutions-in-theano.html) ###

#(channels, rows, columns, batch_size)
inputBatch = np.random.randn(ni, ih, iw, bs)
sharedX = theano.shared(inputBatch.astype('float32'))
sharedY = theano.shared(np.random.randn(no, (ih-kh)/dh+1, (iw-kw)/dw+1, bs).astype('float32'))
# (channels, rows, columns, number of filters)
sharedW = theano.shared(np.random.randn(ni, kh, kw, no).astype('float32'))

conv_op = FilterActs()
contiguous_input = gpu_contiguous(sharedX)
contiguous_filters = gpu_contiguous(sharedW)
Y = conv_op(contiguous_input, contiguous_filters)

fprop = theano.function([], [], givens=[(X, sharedX)], updates=[(sharedY, Y)], on_unused_input='ignore')

theano.sandbox.cuda.synchronize()
start = time.time()
for i in range(steps):
fprop()
theano.sandbox.cuda.synchronize()
tm = (time.time()-start)/steps

del fprop
del sharedX
del conv_op
del sharedY
del sharedW

print ' pylearn2.sandbox.cuda_convnet:', (ni*no*kw*kh*(iw-kw+1)*(ih-kh+1) / dw/dh * bs * ops / tm / 1e9), 'GFLOP/s ( tm =', tm, ')'

0 comments on commit 7044171

Please sign in to comment.