关于 python：theano 中的 MLP 分类器停滞在局部最小值

MLP classifier in theano settles at local minima

我使用 theano 编写了一个 MLP 分类器。使用反向传播算法的训练函数如下：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

# Excerpt of the NetworkBackend constructor body (the complete class is listed
# further down in backend.py).
# One weight matrix of shape (units_out, units_in) and one bias vector per pair
# of consecutive layers, drawn uniformly from [0, 1).
self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
self.layers=network.layers
# Previous update of every parameter (momentum state): weight-shaped entries
# first, then bias-shaped ones, in the same order as `parameters` below.
self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
# Symbolic inputs of the compiled training step.
prediction=T.dmatrix()
output=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(prediction,output).mean()
# Add an L2 penalty on every weight matrix and bias vector.
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
# NOTE(review): `prediction` is a free input matrix, not an expression built
# from self.weights/self.bias, so the cross-entropy term is constant with
# respect to the parameters. T.grad below therefore only differentiates the L2
# penalty, and each step is plain weight decay. That is consistent with the
# reported loss settling at 0.6931... = ln 2 (a uniform two-class softmax).
# Fixing it means building the prediction symbolically from the network input
# inside this graph.
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
# Apply the step to every parameter and remember it for the next momentum term.
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)

我尝试为 XOR 问题训练分类器。实现是

1
2
3
4
5

# XOR truth table, listed twice; each target row is one-hot, with the first
# column set when exactly one of the two inputs is 1.
xor_inputs = numpy.array([
    [0., 0.], [0., 1.], [1., 0.], [1., 1.],
    [0., 0.], [0., 1.], [1., 0.], [1., 1.],
])
xor_targets = numpy.array([
    [0., 1.], [1., 0.], [1., 0.], [0., 1.],
    [0., 1.], [1., 0.], [1., 0.], [0., 1.],
])

network = FeedForwardNetwork([2, 2, 2])
network.initialize()
network.train(xor_inputs, xor_targets,
              alpha=0.01, epochs=1000000000000000, momentum=0.9)

# Show the class probabilities for one positive and one negative XOR case.
print(network.predict(numpy.array([[1., 0.]])))
print(network.predict(numpy.array([[0., 0.]])))

initialize() 方法只编译后端的所有函数，即反向传播函数、用于计算预测的前向传递函数和一些其他的 theano 函数。现在，当我运行这段代码时，训练会达到局部最小值。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056

在训练开始时，损失约为 0.92。它稳步下降到上述值并停在那里。我尝试改变 alpha 值和动量。我做错了什么？

附言
整个代码在这里：
network.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

import theano
import theano.tensor as T
import numpy
from layers import *
from backend import NetworkBackend

class Network:
    """Base container for a network description.

    Holds the requested layer sizes plus the (initially empty) lists of
    layers, weights and biases.
    """

    def __init__(self, architecture):
        """Remember the layer sizes and start with empty layer/parameter lists.

        Args:
            architecture: sequence of unit counts, one entry per layer.
        """
        self.architecture = architecture
        self.layers = []
        self.weights = []
        self.bias = []

    def __str__(self):
        """Return every weight followed by its bias, one item per line."""
        pieces = []
        for weight, bias in zip(self.weights, self.bias):
            # Each entry is terminated by a newline; the original literal was
            # split across two source lines, which made it an empty string.
            pieces.append(str(weight) + '\n')
            pieces.append(str(bias) + '\n')
        return ''.join(pieces)

class FeedForwardNetwork(Network):
    """Network made of an input layer, sigmoid hidden layers and a softmax output."""

    def initialize(self):
        """Create the layer objects from ``self.architecture`` and build the backend."""
        self.layers.append(InputLayer(units=self.architecture[0]))
        # Every entry strictly between the first and the last is a hidden layer.
        for units in self.architecture[1:-1]:
            self.layers.append(SigmoidLayer(units=units))
        self.layers.append(SoftmaxLayer(units=self.architecture[-1]))
        self.backend = NetworkBackend(self)

    def predict(self, inputs):
        """Return the backend's forward-pass output for ``inputs``."""
        return self.backend.activate(inputs)

    def train(self, X, y, alpha=100, reg_lambda=0.0001, epochs=10000, momentum=0.9):
        """Repeat backend update steps, printing the cost after each one.

        The loop stops once the cost is no longer above 0.01 or ``epochs``
        has been counted down to zero.

        Args:
            X: inputs handed to ``predict``.
            y: targets handed to the backend together with the prediction.
            alpha: forwarded to ``backend.backprop``.
            reg_lambda: forwarded to ``backend.backprop``.
            epochs: maximum number of update steps.
            momentum: forwarded to ``backend.backprop``.
        """
        cost = 1
        while cost > 0.01 and epochs:
            # NOTE(review): the prediction is evaluated to a numeric array
            # here and then passed to backprop as plain data. In backend.py
            # as listed, backprop declares it as a free input matrix, so the
            # data term of the cost carries no gradient to the weights and
            # only the regularisation term is minimised. This matches the
            # reported plateau at 0.6931... = ln 2.
            prediction = self.predict(X)
            cost = self.backend.backprop(prediction, y, reg_lambda, alpha, momentum)
            print(cost)
            epochs -= 1

if __name__ == '__main__':
    # XOR truth table, listed twice; each target row is one-hot, with the
    # first column set when exactly one of the two inputs is 1.
    xor_inputs = numpy.array([
        [0., 0.], [0., 1.], [1., 0.], [1., 1.],
        [0., 0.], [0., 1.], [1., 0.], [1., 1.],
    ])
    xor_targets = numpy.array([
        [0., 1.], [1., 0.], [1., 0.], [0., 1.],
        [0., 1.], [1., 0.], [1., 0.], [0., 1.],
    ])

    network = FeedForwardNetwork([2, 2, 2])
    network.initialize()
    network.train(xor_inputs, xor_targets,
                  alpha=0.01, epochs=1000000000000000, momentum=0.9)

    # Show the class probabilities for one positive and one negative XOR case.
    print(network.predict(numpy.array([[1., 0.]])))
    print(network.predict(numpy.array([[0., 0.]])))

layers.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

import theano
import theano.tensor as T
import scipy
from backend import ComputationBackend

class Layer:
    """Base class for a layer with a fixed number of units."""

    def __init__(self, units):
        """Store the unit count and create a ComputationBackend for activations.

        Args:
            units: number of units in this layer.
        """
        self.units = units
        self.backend = ComputationBackend()

    def __str__(self):
        """Return the concrete class name followed by the unit count."""
        return "%s Units:%d" % (self.__class__.__name__, self.units)

class SigmoidLayer(Layer):
    """Layer whose forward pass applies the backend's sigmoid."""

    def forwardPass(self, inputs):
        """Return the sigmoid activation of ``inputs``."""
        return self.backend.sigmoid(inputs)

class InputLayer(Layer):
    """Layer that passes its inputs through unchanged."""

    def forwardPass(self, inputs):
        """Return ``inputs`` as given."""
        return inputs

class SoftmaxLayer(Layer):
    """Layer whose forward pass applies the backend's softmax."""

    def forwardPass(self, inputs):
        """Return the softmax activation of ``inputs``."""
        return self.backend.softmax(inputs)

backend.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

import theano
import theano.tensor as T
import numpy

class NetworkBackend:
    """Shared parameters of a network plus its compiled Theano functions.

    After construction the instance exposes:
      * ``activate(inputs)`` - forward pass through all layers;
      * ``backprop(prediction, output, reg_lambda, alpha, momentum)`` - one
        momentum update of every weight and bias; returns the cost.
    """

    def __init__(self, network):
        """Create the shared variables and compile both functions.

        Args:
            network: object providing ``architecture`` (unit count per layer)
                and ``layers`` (objects with a ``forwardPass`` method).
        """
        sizes = network.architecture
        # (units_out, units_in) for every pair of consecutive layers.
        shapes = list(zip(sizes[1:], sizes[:-1]))

        # initialize shared variables: parameters drawn uniformly from [0, 1),
        # momentum state (previous update per parameter) starting at zero.
        self.weights = [theano.shared(numpy.random.random(shape)) for shape in shapes]
        self.bias = [theano.shared(numpy.random.random(n_out)) for n_out, _ in shapes]
        self.layers = network.layers
        self.prev_rate = (
            [theano.shared(numpy.zeros(shape)) for shape in shapes]
            + [theano.shared(numpy.zeros(n_out)) for n_out, _ in shapes]
        )

        # activation for network layers: rows of `inputs` are multiplied by
        # the transposed weight matrix of each layer, then shifted by its bias.
        inputs = T.dmatrix()
        activation = self.layers[0].forwardPass(inputs)
        hidden = zip(self.layers[1:-1], self.weights, self.bias)
        for layer, weight, bias in hidden:
            activation = layer.forwardPass(T.dot(activation, weight.transpose()) + bias)
        output = self.layers[-1].forwardPass(
            T.dot(activation, self.weights[-1].transpose()) + self.bias[-1]
        )
        self.activate = theano.function([inputs], output)

        # training step
        prediction = T.dmatrix()
        target = T.dmatrix()
        reg_lambda = T.dscalar()
        alpha = T.dscalar()
        momentum = T.dscalar()
        cost = T.nnet.categorical_crossentropy(prediction, target).mean()
        # L2 penalty on every weight matrix and bias vector.
        for weight, bias in zip(self.weights, self.bias):
            cost += T.sum(weight ** 2) * reg_lambda
            cost += T.sum(bias ** 2) * reg_lambda
        parameters = self.weights + self.bias
        # NOTE(review): `prediction` is a free input matrix rather than the
        # symbolic `output` built above, so the cross-entropy term is constant
        # with respect to the parameters and T.grad only differentiates the
        # L2 penalty. Each call is therefore pure weight decay, which drives
        # the softmax towards a uniform output (loss ln 2 for two classes).
        # Connecting the cost to `output` requires backprop to take the
        # network inputs instead of a prediction, i.e. a coordinated change
        # in the caller.
        rates = [
            alpha * T.grad(cost, parameter) + momentum * previous
            for parameter, previous in zip(parameters, self.prev_rate)
        ]
        # Apply each step and remember it as the next momentum term.
        updates = (
            [(parameter, parameter - rate) for parameter, rate in zip(parameters, rates)]
            + [(previous, rate) for previous, rate in zip(self.prev_rate, rates)]
        )
        self.backprop = theano.function(
            [prediction, target, reg_lambda, alpha, momentum], cost, updates=updates
        )

class ComputationBackend:
    """Exposes the Theano activation functions under backend attribute names."""

    def __init__(self):
        """Bind the sigmoid and softmax activations as instance attributes."""
        # sigmoid activation
        self.sigmoid = T.nnet.sigmoid

        # softmax activation
        self.softmax = T.nnet.softmax