explore regularization techniques.

# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

pickle_file = 'notMNIST.pickle'

# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that comes from a source you trust.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Unpack the three splits stored in the pickled dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shape of every split as loaded.
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)
image_size = 28
num_labels = 10


def reformat(dataset, labels):
    """Flatten the images and one-hot encode the labels.

    Args:
        dataset: array that can be reshaped to rows of
            image_size * image_size values.
        labels: 1-D array of integer class ids.

    Returns:
        A (dataset, labels) tuple of float32 arrays: the flattened images
        and the one-hot label matrix with num_labels columns.
    """
    flat_images = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # Comparing a column of labels against the row [0 .. num_labels-1]
    # broadcasts to a boolean one-hot matrix,
    # e.g. 0 -> [1.0, 0.0, 0.0 ...], 2 -> [0.0, 0.0, 1.0 ...].
    class_ids = np.arange(num_labels)
    one_hot = (class_ids == labels[:, None]).astype(np.float32)
    return flat_images, one_hot
# Flatten every split and one-hot encode its labels, one split at a time.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the shapes after reformatting.
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)
Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)
Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)

定义计算准确度函数

def accuracy(predictions, labels):
    """Return the percentage of rows whose predicted class matches the label.

    Args:
        predictions: 2-D array of per-class scores, one row per sample.
        labels: 2-D array with the same row count; the arg-max of each row
            is taken as the true class.

    Returns:
        The share of matching rows, scaled to the range 0-100.
    """
    # argmax along axis 1 picks, for every row, the column with the top score.
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    num_correct = np.sum(predicted_classes == true_classes)
    return 100.0 * num_correct / predictions.shape[0]


1. 加入对权重矩阵的惩罚项到loss


在tensorflow中,tf.nn.l2_loss函数可以帮助我们进行计算。

线性逻辑回归模型

# Multinomial logistic regression trained with minibatch SGD and an L2
# penalty on the weight matrix.
batch_size = 128
# NOTE(review): the recorded output of this run reports
# regularation_param = 0.010000 rather than 0.0001 -- confirm which value
# is intended.
regularation_param = 0.0001

graph = tf.Graph()
with graph.as_default():
    # 1. Input data. The training minibatch is fed at run time through
    #    placeholders (which only reserve space in the graph); the validation
    #    and test sets are embedded as constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))

    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # 2. Variables.
    weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # 3. Training computation: mean cross-entropy plus the L2 weight penalty.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    # logits/labels are passed by keyword: TensorFlow 1.0+ rejects positional
    # arguments for this op, and keywords avoid swapping the two by accident.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels)
    loss = tf.reduce_mean(cross_entropy) + regularation_param * tf.nn.l2_loss(weights)

    # 4. Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # 5. Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)


num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Walk through the training data one minibatch at a time, wrapping
        # around at the end (assumes the data was shuffled beforehand).
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Map each placeholder node of the graph to the numpy array that is
        # fed to it for this step.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%% with regularation_param = %f " %( accuracy(test_prediction.eval(), test_labels), regularation_param))
Initialized
Minibatch loss at step 0: 46.886597
Minibatch accuracy: 11.7%
Validation accuracy: 11.4%
Minibatch loss at step 500: 0.940757
Minibatch accuracy: 77.3%
Validation accuracy: 80.3%
Minibatch loss at step 1000: 0.832657
Minibatch accuracy: 82.0%
Validation accuracy: 81.2%
Minibatch loss at step 1500: 0.707090
Minibatch accuracy: 85.2%
Validation accuracy: 81.1%
Minibatch loss at step 2000: 0.972838
Minibatch accuracy: 76.6%
Validation accuracy: 81.3%
Minibatch loss at step 2500: 0.756738
Minibatch accuracy: 80.5%
Validation accuracy: 81.0%
Minibatch loss at step 3000: 0.879621
Minibatch accuracy: 82.8%
Validation accuracy: 81.4%
Test accuracy: 87.9% with regularation_param = 0.010000

一层神经网络模型
# Neural network with one hidden ReLU layer, trained with minibatch SGD and
# an L2 penalty on both weight matrices.

batch_size = 128
hiden_layer_node_num = 1024
# NOTE(review): the recorded output of this run reports
# regularation_param = 0.000100 rather than 0.01 -- confirm which value
# is intended.
regularation_param = 0.01
graph = tf.Graph()
with graph.as_default():
    # 1. Input data: placeholders for the minibatch, constants for the
    #    validation and test sets.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))

    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # 2. Variables. Hidden layer: pixels -> hiden_layer_node_num units.
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, hiden_layer_node_num]))
    biases1 = tf.Variable(tf.zeros([hiden_layer_node_num]))

    # Output layer: maps the hidden activations, shaped
    # (batch_size, hiden_layer_node_num), to one score per class.
    weights2 = tf.Variable(tf.truncated_normal([hiden_layer_node_num, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # 3. Training computation: mean cross-entropy plus the L2 penalty on
    #    both weight matrices.
    logits = tf.matmul(tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1), weights2) + biases2
    # logits/labels are passed by keyword: TensorFlow 1.0+ rejects positional
    # arguments for this op, and keywords avoid swapping the two by accident.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels)
    loss = tf.reduce_mean(cross_entropy) + regularation_param * (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2))

    # 4. Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # 5. Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Walk through the training data one minibatch at a time, wrapping
        # around at the end (assumes the data was shuffled beforehand).
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Map each placeholder node of the graph to the numpy array that is
        # fed to it for this step.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%% with regularation_param = %f " %( accuracy(test_prediction.eval(), test_labels), regularation_param))
Initialized
Minibatch loss at step 0: 337.164185
Minibatch accuracy: 14.8%
Validation accuracy: 23.4%
Minibatch loss at step 500: 45.676537
Minibatch accuracy: 80.5%
Validation accuracy: 80.5%
Minibatch loss at step 1000: 41.467537
Minibatch accuracy: 80.5%
Validation accuracy: 81.5%
Minibatch loss at step 1500: 34.926945
Minibatch accuracy: 83.6%
Validation accuracy: 80.7%
Minibatch loss at step 2000: 28.990902
Minibatch accuracy: 80.5%
Validation accuracy: 81.4%
Minibatch loss at step 2500: 25.874664
Minibatch accuracy: 83.6%
Validation accuracy: 80.9%
Minibatch loss at step 3000: 24.691601
Minibatch accuracy: 86.7%
Validation accuracy: 81.6%
Test accuracy: 88.7% with regularation_param = 0.000100 


2. overfitting


我们来看一看过拟合的实例。

只要简单地把训练数据的范围缩小并重复训练,就能得到过拟合(overfitting)。

(1)线性逻辑回归过拟合

# Logistic regression, overfitting version: same model as before, but the
# training loop only draws minibatches from the start of the training set.
batch_size = 128
regularation_param = 0.01

graph = tf.Graph()
with graph.as_default():
    # 1. Input data. The training minibatch is fed at run time through
    #    placeholders (which only reserve space in the graph); the validation
    #    and test sets are embedded as constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))

    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # 2. Variables.
    weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # 3. Training computation: mean cross-entropy plus the L2 weight penalty.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    # logits/labels are passed by keyword: TensorFlow 1.0+ rejects positional
    # arguments for this op, and keywords avoid swapping the two by accident.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels)
    loss = tf.reduce_mean(cross_entropy) + regularation_param * tf.nn.l2_loss(weights)

    # 4. Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # 5. Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)


num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Walk through the training data one minibatch at a time, wrapping
        # around at the end (assumes the data was shuffled beforehand).
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Restrict training to a small subset: every offset beyond 10000 is
        # reset to 0, so all of those steps reuse the first minibatch.
        if offset > 10000:
            offset = 0
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Map each placeholder node of the graph to the numpy array that is
        # fed to it for this step.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%% with regularation_param = %f " %( accuracy(test_prediction.eval(), test_labels), regularation_param))
Initialized
Minibatch loss at step 0: 49.519794
Minibatch accuracy: 11.7%
Validation accuracy: 13.5%
Minibatch loss at step 500: 0.310027
Minibatch accuracy: 100.0%
Validation accuracy: 73.7%
Minibatch loss at step 1000: 0.123203
Minibatch accuracy: 100.0%
Validation accuracy: 72.7%
Minibatch loss at step 1500: 0.121531
Minibatch accuracy: 100.0%
Validation accuracy: 72.7%
Minibatch loss at step 2000: 0.122027
Minibatch accuracy: 100.0%
Validation accuracy: 73.5%
Minibatch loss at step 2500: 0.121300
Minibatch accuracy: 100.0%
Validation accuracy: 72.7%
Minibatch loss at step 3000: 0.121102
Minibatch accuracy: 100.0%
Validation accuracy: 72.5%
Test accuracy: 79.1% with regularation_param = 0.010000 

(2)一层神经网络过拟合

# One-hidden-layer neural network, overfitting version: same model as
# before, but the training loop only draws minibatches from the start of the
# training set.
batch_size = 128
hiden_layer_node_num = 1024
regularation_param = 0.01
num_steps = 3001

graph = tf.Graph()
with graph.as_default():
    # 1. Input data: placeholders for the minibatch, constants for the
    #    validation and test sets.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))

    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # 2. Variables. Hidden layer: pixels -> hiden_layer_node_num units.
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, hiden_layer_node_num]))
    biases1 = tf.Variable(tf.zeros([hiden_layer_node_num]))

    # Output layer: maps the hidden activations, shaped
    # (batch_size, hiden_layer_node_num), to one score per class.
    weights2 = tf.Variable(tf.truncated_normal([hiden_layer_node_num, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # 3. Training computation: mean cross-entropy plus the L2 penalty on
    #    both weight matrices.
    logits = tf.matmul(tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1), weights2) + biases2
    # logits/labels are passed by keyword: TensorFlow 1.0+ rejects positional
    # arguments for this op, and keywords avoid swapping the two by accident.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels)
    loss = tf.reduce_mean(cross_entropy) + regularation_param * (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2))

    # 4. Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # 5. Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)


with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Walk through the training data one minibatch at a time, wrapping
        # around at the end (assumes the data was shuffled beforehand).
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Restrict training to a small subset: every offset beyond 1000 is
        # reset to 0, so all of those steps reuse the first minibatch.
        if offset > 1000:
            offset = 0

        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Map each placeholder node of the graph to the numpy array that is
        # fed to it for this step.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%% with regularation_param = %f " %( accuracy(test_prediction.eval(), test_labels), regularation_param))


3.Dropout

先看一下dropout的文档。



First, let's look at the documentation of tf.nn.dropout.
tf.nn.dropout(x, keep_prob, noise_shape=None, seed=None, name=None)
With probability keep_prob, outputs the input element scaled up by
1 / keep_prob, otherwise outputs 0. The scaling is so that the expected
sum is unchanged.
Args:
x: A tensor.
keep_prob: A scalar Tensor with the same type as x. The probability that each element is kept.
noise_shape: A 1-D Tensor of type int32, representing the shape for randomly generated keep/drop flags.
seed: A Python integer. Used to create random seeds. See set_random_seed for behavior.
name: A name for this operation (optional).
Returns:
A Tensor of the same shape of x.
Raises:
ValueError: If keep_prob is not in (0, 1].

需要注意的是,在训练阶段可以dropout,但是在validation,testing阶段不要dropout。

否则,你得到的validation 和 test 的准确度都是随机的。

# One-hidden-layer neural network with dropout on the hidden activations,
# in addition to the L2 penalty on both weight matrices.
batch_size = 128
hiden_layer_node_num = 1024
regularation_param = 0.01
num_steps = 3001

graph = tf.Graph()
with graph.as_default():
    # 1. Input data: placeholders for the minibatch, constants for the
    #    validation and test sets.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))

    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # 2. Variables. Hidden layer: pixels -> hiden_layer_node_num units.
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, hiden_layer_node_num]))
    biases1 = tf.Variable(tf.zeros([hiden_layer_node_num]))

    # Output layer: maps the hidden activations, shaped
    # (batch_size, hiden_layer_node_num), to one score per class.
    weights2 = tf.Variable(tf.truncated_normal([hiden_layer_node_num, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # 3. Training computation. Dropout (keep probability 0.8) is applied to
    #    the hidden ReLU activations on the training path only.
    temp = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1), 0.8)
    logits = tf.matmul(temp, weights2) + biases2
    # logits/labels are passed by keyword: TensorFlow 1.0+ rejects positional
    # arguments for this op, and keywords avoid swapping the two by accident.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels)
    loss = tf.reduce_mean(cross_entropy) + regularation_param * (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2))

    # 4. Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # 5. Predictions for the training, validation, and test data.
    #    train_prediction reuses the dropout logits, so the reported minibatch
    #    accuracy is measured with dropout active; the validation and test
    #    paths below do not use dropout.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)


with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Walk through the training data one minibatch at a time, wrapping
        # around at the end (assumes the data was shuffled beforehand).
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)

        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Map each placeholder node of the graph to the numpy array that is
        # fed to it for this step.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%% with regularation_param = %f " %( accuracy(test_prediction.eval(), test_labels), regularation_param))

Initialized
Minibatch loss at step 0: 3401.901367
Minibatch accuracy: 16.4%
Validation accuracy: 18.4%
Minibatch loss at step 500: 21.367651
Minibatch accuracy: 82.0%
Validation accuracy: 84.1%
Minibatch loss at step 1000: 0.976428
Minibatch accuracy: 81.2%
Validation accuracy: 83.5%
Minibatch loss at step 1500: 0.671196
Minibatch accuracy: 86.7%
Validation accuracy: 82.8%
Minibatch loss at step 2000: 0.887370
Minibatch accuracy: 78.1%
Validation accuracy: 83.8%
Minibatch loss at step 2500: 0.701504
Minibatch accuracy: 85.9%
Validation accuracy: 83.2%
Minibatch loss at step 3000: 0.803447
Minibatch accuracy: 85.2%
Validation accuracy: 83.6%
Test accuracy: 89.9% with regularation_param = 0.010000


4.学习速率衰减


迭代的次数越多需要的学习速率越应该小,这样才会使loss的值逐渐降低。

否则会产生在一小范围内loss的值来回震荡的现象。

先看一下相关函数的说明:

tf.train.exponential_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)

decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)

相关细节为:

global_step = tf.Variable(0) # count the number of steps taken.

learning_rate = tf.train.exponential_decay(0.5, global_step, decay_steps, decay_rate)

optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

这次我们使用多层神经网络进行学习。


本文使用的数据的已知最高识别准确度为97.1%

# Settings for the deep network below. No L2 weight penalty is added to its
# loss; dropout is still applied between layers on the training path.
# Author-reported test accuracy: 96.3% (the recorded run below shows 96.1%).
batch_size =128

num_steps = 20001
def compute_logits(data, weightss, biasess, dropout_vals=None):
    """Build the logits of a fully connected ReLU network.

    Args:
        data: input tensor fed to the first layer.
        weightss: weight matrices, one per layer; the last one belongs to
            the linear output layer.
        biasess: bias vectors, aligned with weightss.
        dropout_vals: optional keep probabilities, one per layer. Each is
            applied to the input of the matching hidden layer; the last
            entry is not used. When omitted or empty, no dropout is added.

    Returns:
        The logits tensor produced by the final linear layer.
    """
    if dropout_vals:
        keep_probs = dropout_vals[:-1]
    else:
        # None marks "no dropout op" for every hidden layer.
        keep_probs = [None] * (len(weightss) - 1)

    hidden = data
    for w, b, keep_prob in zip(weightss[:-1], biasess[:-1], keep_probs):
        layer_input = hidden if keep_prob is None else tf.nn.dropout(hidden, keep_prob)
        hidden = tf.nn.relu_layer(layer_input, w, b)

    # The output layer is linear: no ReLU and no dropout.
    return tf.matmul(hidden, weightss[-1]) + biasess[-1]
    


# Five-layer fully connected network trained with an exponentially decaying
# learning rate and dropout between layers.
graph = tf.Graph()
with graph.as_default():
    # Input data: placeholders for the minibatch, constants for the
    # validation and test sets.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size*image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Learning-rate schedule: global_step counts the optimizer steps taken,
    # and the rate is 0.08 * 0.98 ^ (global_step / 200).
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(0.08, global_step, 200, 0.98)

    # Variables. Each weight matrix is drawn with stddev sqrt(2 / n), where n
    # is the number of units in the layer it feeds.
    weights1 = tf.Variable(tf.truncated_normal([784, 1024], stddev = np.sqrt(2.0/1024)))
    biases1 = tf.Variable(tf.zeros([1024]))

    weights2 = tf.Variable(tf.truncated_normal([1024, 800], stddev = np.sqrt(2.0/800)))
    biases2 = tf.Variable(tf.zeros([800]))

    weights3 = tf.Variable(tf.truncated_normal([800, 512], stddev = np.sqrt(2.0/512)))
    biases3 = tf.Variable(tf.zeros([512]))

    weights4 = tf.Variable(tf.truncated_normal([512, 256], stddev = np.sqrt(2.0/256)))
    biases4 = tf.Variable(tf.zeros([256]))

    weights5 = tf.Variable(tf.truncated_normal([256, 10], stddev = np.sqrt(2.0/10)))
    biases5 = tf.Variable(tf.zeros([10]))

    # Collect the layers once so that the training, validation and test
    # paths are guaranteed to share the same parameters.
    layer_weights = [weights1, weights2, weights3, weights4, weights5]
    layer_biases = [biases1, biases2, biases3, biases4, biases5]

    # Training path: dropout keep probabilities are given per layer.
    logits = compute_logits(tf_train_dataset, layer_weights, layer_biases,
                            dropout_vals=(1.0, 0.95, 0.95, 0.95, 1.0))

    # logits/labels are passed by keyword: TensorFlow 1.0+ rejects positional
    # arguments for this op, and keywords avoid swapping the two by accident.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))

    # Optimizer; passing global_step makes each update advance the schedule.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Predictions. The validation and test paths are built without dropout.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(compute_logits(tf_valid_dataset, layer_weights, layer_biases))
    test_prediction = tf.nn.softmax(compute_logits(tf_test_dataset, layer_weights, layer_biases))

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Walk through the training data one minibatch at a time, wrapping
        # around at the end (assumes the data was shuffled beforehand).
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)

        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Map each placeholder node of the graph to the numpy array that is
        # fed to it for this step.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%% " %( accuracy(test_prediction.eval(), test_labels)))
    

Minibatch loss at step 19500: 0.203250
Minibatch accuracy: 94.5%
Validation accuracy: 91.3%
Minibatch loss at step 20000: 0.137203
Minibatch accuracy: 95.3%
Validation accuracy: 91.2%
Test accuracy: 96.1%