Deep Learning

Table of Contents

1. Deep Learning

1.1. linear regression

import matplotlib.pyplot as plt
import numpy as np

X, Y = 0, 0
def get_training_set():
    global X,Y
    data = np.loadtxt("../extra/data.txt", delimiter=",")
    X = data[:, 0].reshape(-1, 1)
    Y = data[:, 1].reshape(-1, 1)

get_training_set()
plt.scatter(x=X[:, 0], y=Y[:, 0])
plt.show()

machine_learning_1_0.png

Figure 1: png

1.1.1. training set

print("X:",X[:10])
print()
print("Y:",Y[:10])
X: [[6.1101]
 [5.5277]
 [8.5186]
 [7.0032]
 [5.8598]
 [8.3829]
 [7.4764]
 [8.5781]
 [6.4862]
 [5.0546]]

Y: [[17.592 ]
 [ 9.1302]
 [13.662 ]
 [11.854 ]
 [ 6.8233]
 [11.886 ]
 [ 4.3483]
 [12.    ]
 [ 6.5987]
 [ 3.8166]]

1.1.2. feature & label

X is a matrix of features

Y is a matrix of labels

1.1.3. regression

Given an x that does not appear in the training set, e.g. 5.2, how do we predict its corresponding y?

1.1.4. hypothesis function

\(\hat{y}=h(x)=Wx+B\)

W ~ weights

B ~ Bias

get_training_set()
plt.scatter(x=X[:, 0], y=Y[:, 0])
W=0.2
B=8
x=np.arange(5,22.5,0.1)
plt.plot(x, np.dot(x,W)+B,label="bad", )
W=1.18
B=-3.79
plt.plot(x, np.dot(x,W)+B,label="good")
plt.legend()
plt.show()

machine_learning_10_0.png

Figure 2: png

1.1.5. cost function

We use a cost function to measure how good the hypothesis function is: the smaller the cost, the better the hypothesis function.

1.1.6. mean squared error

\(J(W,B)=\frac{1}{2m}\sum\limits_{i=1}^{m}(\hat{y^i}-y^i)^2\)
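
As a minimal sketch of this cost (y_hat and y are assumed to be NumPy arrays of the same shape; mse_cost is just an illustrative name):

import numpy as np

def mse_cost(y_hat, y):
    # J = 1/(2m) * sum((y_hat - y)^2); the extra 1/2 cancels the 2 produced by the derivative
    return np.square(y_hat - y).mean() / 2

# three samples: squared errors 0.25, 0, 1 -> mean 0.41666... -> halved 0.20833...
print(mse_cost(np.array([1.5, 2.0, 2.0]), np.array([1.0, 2.0, 3.0])))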

1.1.7. gradient descent

gradient descent

\(W \to W-\alpha\frac{\partial}{\partial{W}}J(W,B)\)

\(B \to B-\alpha\frac{\partial}{\partial{B}}J(W,B)\)

gradient_decent.png

Figure 3: gradient

1.1.8. partial derivative

\(\frac{\partial}{\partial{W}}J(W,B) = \frac{1}{2m}\sum2*(\hat{y^i}-y^i)*\frac{\partial}{\partial{W}}{\hat{y^i}} = \frac{1}{m}\sum(\hat{y^i}-y^i)*{x^i}\)

\(\frac{\partial}{\partial{B}}J(W,B) = \frac{1}{2m}\sum2*(\hat{y^i}-y^i)*\frac{\partial}{\partial{B}}{\hat{y^i}} = \frac{1}{m}\sum(\hat{y^i}-y^i)\)
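
Combining the update rule with these gradients, a single vectorized gradient-descent step could be sketched as below (gradient_step is an illustrative name; X is the [m,1] feature matrix, Y the [m,1] labels, W and B the current parameters, alpha the learning rate):

def gradient_step(X, Y, W, B, alpha):
    m = len(X)
    y_hat = np.matmul(X, W) + B           # predictions of the current hypothesis
    dw = np.matmul(X.T, y_hat - Y) / m    # dJ/dW
    db = (y_hat - Y).mean()               # dJ/dB
    return W - alpha * dw, B - alpha * db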

1.1.9. simple linear regression example

from sklearn import preprocessing
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from PIL import Image
import sys

EPOCH = 50
LEARNING_RATE = 0.1
WITH_FEATURE_SCALING=True

def mse(A, B):
    return np.square(np.subtract(A, B)).mean()/2

def get_training_set():
    global X, Y
    data = np.loadtxt("../extra/data.txt", delimiter=",")

    X = data[:, 0].reshape(-1, 1)
    Y = data[:, 1].reshape(-1, 1)
    if WITH_FEATURE_SCALING:
        X = preprocessing.scale(X)
        Y = preprocessing.scale(Y)

def cost_function(X, Y, W, B):
    m = len(X)
    J = 0.
    dw = np.zeros_like(W)
    db = np.zeros_like(B)
    y_hat = np.matmul(X, W) + B
    J = mse(y_hat, Y)
    dw = np.matmul(np.transpose(X), y_hat - Y)/m
    db = (y_hat - Y).mean()
    return J, dw, db

def gradient_decent(X, Y, W, B):
    alpha = LEARNING_RATE
    for epoch in range(EPOCH):
        cost, dw, db = cost_function(X, Y, W, B)
        if epoch % (EPOCH//10) == 0:
            print("training: #", epoch, cost ,W, B)
        W = W - alpha * dw
        B = B - alpha * db
    return W, B

X, Y = 0, 0
W, B = 0, 0

def train():
    global X, Y, W, B
    W = np.random.randn(X.shape[1], 1)
    B = np.random.randn(1, 1)
    W, B = gradient_decent(X, Y, W, B)

def predict():
    global W, B, X, Y
    plt.scatter(x=X[:, 0], y=Y[:, 0])
    plt.plot(X, np.matmul(X, W) + B, 'g')
    plt.show()

get_training_set()
train()
predict()
training: # 0 0.44911036737309545 [[1.57832978]] [[0.22798332]]
training: # 5 0.2536317389235355 [[1.27510542]] [[0.13462187]]
training: # 10 0.1854725556828555 [[1.09605446]] [[0.07949287]]
training: # 15 0.16170691799200512 [[0.99032667]] [[0.04693974]]
training: # 20 0.15342035251397768 [[0.92789546]] [[0.02771745]]
training: # 25 0.15053100578931256 [[0.89103046]] [[0.01636688]]
training: # 30 0.14952355288044825 [[0.86926204]] [[0.00966448]]
training: # 35 0.14917227577171124 [[0.85640801]] [[0.00570678]]
training: # 40 0.149049793017394 [[0.84881783]] [[0.00336979]]
training: # 45 0.14900708592167952 [[0.84433591]] [[0.00198983]]

machine_learning_21_1.png

Figure 4: png

1.1.10. gradient checking

make sure that:

\(\frac{\partial}{\partial{W}}J(W,B) \approx \lim\limits_{h \to 0}\frac{J(W+h,B)-J(W,B)}{h}\)

def check_gradient():
    global X,Y
    W = np.random.randn(X.shape[1], 1)
    B = np.random.randn(1, 1)
    cost,dw,db=cost_function(X,Y,W,B)
    W[0]+=1e-3
    cost2,_,_=cost_function(X,Y,W,B)
    print((cost2-cost)/1e-3,dw[0,0])
    W[0]-=1e-3
    B[0]+=1e-3
    cost2,_,_=cost_function(X,Y,W,B)
    print((cost2-cost)/1e-3,db)

check_gradient()
-0.8722622453554152 -0.8727622453556267
-1.5264856192187537 -1.5269856192187428

1.1.11. learning rate

  • if the learning rate is too small, gradient descent converges too slowly
  • if the learning rate is too large, gradient descent may fail to converge:
    • it bounces back and forth around the minimum
    • or the cost grows larger and larger

1.1.12. feature scaling

Scaling each feature into a fixed range, e.g. (0, 1), or standardizing it, makes it much easier to choose a suitable learning rate.
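
The examples in this document use sklearn's preprocessing.scale, which standardizes each column to zero mean and unit variance; a hand-rolled sketch of two common scaling schemes (the function names are illustrative):

def standardize(X):
    # zero mean, unit variance per column (what preprocessing.scale does)
    return (X - X.mean(axis=0)) / X.std(axis=0)

def min_max_scale(X):
    # squeeze every column into the range [0, 1]
    return (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))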

WITH_FEATURE_SCALING=False

get_training_set()
train()
predict()
training: # 0 70.15408361397526 [[-0.40822281]] [[-0.4916959]]
training: # 5 24925634169.17572 [[24500.47499024]] [[2460.57784825]]
training: # 10 9.625926543152364e+18 [[-4.81456725e+08]] [[-48367551.32807031]]
training: # 15 3.717396364137783e+27 [[9.46140039e+12]] [[9.50500295e+11]]
training: # 20 1.4356057742758617e+36 [[-1.85931762e+17]] [[-1.86788622e+16]]
training: # 25 5.544105974322745e+44 [[3.6538587e+21]] [[3.67069739e+20]]
training: # 30 2.1410551284544287e+53 [[-7.18042105e+25]] [[-7.21351179e+24]]
training: # 35 8.268451368556306e+61 [[1.41106843e+30]] [[1.41757129e+29]]
training: # 40 3.193158696643795e+70 [[-2.77297681e+34]] [[-2.78575599e+33]]
training: # 45 1.2331526192107467e+79 [[5.44934622e+38]] [[5.47445936e+37]]

machine_learning_29_1.png

Figure 5: png

WITH_FEATURE_SCALING=False
LEARNING_RATE=0.01

get_training_set()
train()
predict()
training: # 0 103.38177778309628 [[-0.85004288]] [[1.11590542]]
-125.37942448626893 -125.42012645764883
training: # 5 6.86205229028994 [[0.67856153]] [[1.22261939]]
-0.07423379726745338 -0.11493576867876723
training: # 10 6.81940301336375 [[0.68344985]] [[1.1766803]]
-0.05115933051325072 -0.09186130192399862
training: # 15 6.777519309593261 [[0.68802624]] [[1.13112685]]
-0.05033045343871834 -0.09103242485023677
training: # 20 6.736384504793291 [[0.69256148]] [[1.08598248]]
-0.04951293203792062 -0.09021490344897255
training: # 25 6.6959852083221 [[0.69705599]] [[1.04124353]]
-0.048702753118412545 -0.08940472452908702
training: # 30 6.656308268968583 [[0.70151014]] [[0.99690637]]
-0.04789985004727271 -0.08860182145685279
training: # 35 6.617340770671049 [[0.70592429]] [[0.95296738]]
-0.04710415748121477 -0.08780612889108504
training: # 40 6.579070028312648 [[0.7102988]] [[0.90942298]]
-0.04631561066581469 -0.08701758207749494
training: # 45 6.541483583591981 [[0.71463402]] [[0.86626963]]
-0.045534145431957995 -0.08623611684335007

machine_learning_30_1.png

Figure 6: png

1.2. multi-feature linear regression

1.2.1. feature & label

X_train, Y_train, X_test, Y_test=0,0,0,0
def get_training_set():
    X, Y = load_boston(True)
    X = preprocessing.scale(X)
    Y = Y.reshape(-1, 1)
    [m, features] = X.shape
    # Z = np.concatenate((X, Y), axis=1)
    # np.random.shuffle(Z)
    # X = Z[:, :features]
    # Y = Z[:, features:]
    global X_train, Y_train, X_test, Y_test
    offset = int(0.8 * m)
    X_train, Y_train = X[:offset], Y[:offset]
    X_test, Y_test = X[offset:], Y[offset:]

get_training_set()
print(X_train.shape)
print(X_train[:5])
print()
print(Y_train[:5])
(404, 13)
[[-0.41771335  0.28482986 -1.2879095  -0.27259857 -0.14421743  0.41367189
  -0.12001342  0.1402136  -0.98284286 -0.66660821 -1.45900038  0.44105193
  -1.0755623 ]
 [-0.41526932 -0.48772236 -0.59338101 -0.27259857 -0.74026221  0.19427445
   0.36716642  0.55715988 -0.8678825  -0.98732948 -0.30309415  0.44105193
  -0.49243937]
 [-0.41527165 -0.48772236 -0.59338101 -0.27259857 -0.74026221  1.28271368
  -0.26581176  0.55715988 -0.8678825  -0.98732948 -0.30309415  0.39642699
  -1.2087274 ]
 [-0.41468015 -0.48772236 -1.30687771 -0.27259857 -0.83528384  1.01630251
  -0.80988851  1.07773662 -0.75292215 -1.10611514  0.1130321   0.41616284
  -1.36151682]
 [-0.41040922 -0.48772236 -1.30687771 -0.27259857 -0.83528384  1.22857665
  -0.51117971  1.07773662 -0.75292215 -1.10611514  0.1130321   0.44105193
  -1.02650148]]

[[24. ]
 [21.6]
 [34.7]
 [33.4]
 [36.2]]
from sklearn import preprocessing
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from PIL import Image
import sys

EPOCH = 10000
LEARNING_RATE = 0.001

def MSE(A, B):
    return np.square(np.subtract(A, B)).mean()/2

def get_training_set():
    X, Y = load_boston(True)
    X = preprocessing.scale(X)
    Y = Y.reshape(-1, 1)
    [m, features] = X.shape
    # Z = np.concatenate((X, Y), axis=1)
    # np.random.shuffle(Z)
    # X = Z[:, :features]
    # Y = Z[:, features:]
    global X_train, Y_train, X_test, Y_test
    offset = int(0.8 * m)
    X_train, Y_train = X[:offset], Y[:offset]
    X_test, Y_test = X[offset:], Y[offset:]

def cost_function(X, Y, W, B):
    m = len(X)
    J = 0.
    dw = np.zeros_like(W)
    db = np.zeros_like(B)
    # for i in range(m):
    #     x = X[i, :].reshape(1, -1)
    #     y = Y[i, :].reshape(1, 1)
    #     y_hat = np.matmul(x, W) + B
    #     # print(y_hat[0], y[0], MSE(y_hat[0], y[0]))
    #     loss = MSE(y_hat, y)
    #     J += loss
    #     dw += np.matmul(np.transpose(x), y_hat - y)
    #     db += y_hat - y
    # J /= m
    # dw /= m
    # db /= m
    y_hat = np.matmul(X, W) + B
    J = MSE(y_hat, Y)
    dw = np.matmul(np.transpose(X), y_hat - Y) / m
    db = (y_hat - Y).mean(axis=0)
    return J, dw, db

def gradient_decent(X, Y, W, B):
    alpha = LEARNING_RATE
    for epoch in range(EPOCH):
        cost, dw, db = cost_function(X, Y, W, B)
        W = W - alpha * dw
        B = B - alpha * db
        if epoch % (EPOCH // 20) == 0:
            print("training: #", epoch, cost)
    return W, B

X_test, Y_test, X_train, Y_train = 0, 0, 0, 0
W, B = 0, 0

def train():
    global X_train, Y_train, W, B
    W = np.random.randn(X_train.shape[1], 1)
    B = np.random.randn(1, 1)
    W, B = gradient_decent(X_train, Y_train, W, B)

def predict():
    global W, B
    predicted = np.matmul(X_test, W) + B
    print("final mse:",MSE(predicted, Y_test))

get_training_set()
train()
predict()
print(W.shape,B.shape)
training: # 0 394.9508088516925
training: # 500 100.00785133938153
training: # 1000 48.72540414517899
training: # 1500 29.563643302467156
training: # 2000 21.405362653669222
training: # 2500 17.580513440523312
training: # 3000 15.634018864772019
training: # 3500 14.561620691509146
training: # 4000 13.919491179775438
training: # 4500 13.50050685782247
training: # 5000 13.2039753832719
training: # 5500 12.979149491604765
training: # 6000 12.799463170021674
training: # 6500 12.650373482061207
training: # 7000 12.523479650727792
training: # 7500 12.413618800743162
training: # 8000 12.317401683858627
training: # 8500 12.232456842224869
training: # 9000 12.15702919159006
training: # 9500 12.0897592813033
final mse: 11.213129305213641
(13, 1) (1, 1)

1.2.2. learning rate

1.2.2.1. learning rate slightly too large
LEARNING_RATE = 0.1
train()
predict()
training: # 0 323.6508450938955
training: # 500 11.39299545706192
training: # 1000 11.389204091166178
training: # 1500 11.389189815007983
training: # 2000 11.389189761104692
training: # 2500 11.389189760901166
training: # 3000 11.389189760900395
training: # 3500 11.389189760900395
training: # 4000 11.389189760900393
training: # 4500 11.389189760900395
training: # 5000 11.389189760900393
training: # 5500 11.389189760900395
training: # 6000 11.389189760900395
training: # 6500 11.389189760900395
training: # 7000 11.389189760900395
training: # 7500 11.389189760900395
training: # 8000 11.389189760900395
training: # 8500 11.389189760900395
training: # 9000 11.389189760900395
training: # 9500 11.389189760900395
final mse: 16.537813246137475
1.2.2.2. learning rate far too large
LEARNING_RATE = 1
train()
predict()
training: # 0 323.85035063163946
training: # 500 nan
training: # 1000 nan
training: # 1500 nan
training: # 2000 nan
training: # 2500 nan

/usr/lib/python3.6/site-packages/numpy/core/_methods.py:70: RuntimeWarning: overflow encountered in reduce
  ret = umr_sum(arr, axis, dtype, out, keepdims)
/usr/lib/python3.6/site-packages/ipykernel_launcher.py:14: RuntimeWarning: overflow encountered in square

training: # 3000 nan
training: # 3500 nan
training: # 4000 nan
training: # 4500 nan
training: # 5000 nan
training: # 5500 nan
training: # 6000 nan
training: # 6500 nan
training: # 7000 nan
training: # 7500 nan
training: # 8000 nan
training: # 8500 nan
training: # 9000 nan
training: # 9500 nan
final mse: nan
1.2.2.3. learning rate too small
LEARNING_RATE = 0.001
train()
predict()
training: # 0 368.22478200801294
training: # 500 113.60286787202149
training: # 1000 54.16900270646828
training: # 1500 32.30450821323542
training: # 2000 23.043704375375118
training: # 2500 18.67757830837426
training: # 3000 16.42877795251914
training: # 3500 15.169696609178645
training: # 4000 14.40183707813676
training: # 4500 13.891712751211747
training: # 5000 13.52521273665566
training: # 5500 13.244422131600182
training: # 6000 13.018730561961272
training: # 6500 12.83116082771574
training: # 7000 12.671739347524003
training: # 7500 12.534213486546351
training: # 8000 12.414390934896627
training: # 8500 12.30927849859642
training: # 9000 12.216621683323485
training: # 9500 12.134649211355635
final mse: 12.350629498610731

1.3. logistic regression

from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

X, Y = make_moons(n_samples=1000, noise=0.2)
Y = Y.reshape(-1, 1)

cm = ListedColormap(['#FF0000', '#0000FF'])
plt.scatter(x=X[:, 0], y=X[:, 1], c=Y[:, 0], cmap=cm)
plt.show()

machine_learning_43_0.png

Figure 7: png

1.3.1. feature & label

print("X:",X[:10])
print()
print("Y:",Y[:10])
X: [[ 0.8525511  -0.39862199]
 [ 0.1632493  -0.34467561]
 [ 1.15179796  0.77013127]
 [ 1.09955561 -0.42176977]
 [ 2.14572763 -0.19269439]
 [-0.04550188  1.2857288 ]
 [ 1.18711691  0.17124724]
 [ 1.82545617  0.84777188]
 [-0.15896536 -0.1438627 ]
 [ 1.04386354  1.06882075]]

Y: [[1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]]

1.3.2. regression

Given an x (x0, x1), predict which class it belongs to (1 or 0, blue or red).

1.3.3. hypothesis function

The linear-regression hypothesis function \(\hat{y}=h(x)=Wx+B\) does not apply directly to logistic regression, because here we need the output to be a discrete value, 0 or 1.

1.3.4. sigmoid

\(sigmoid(x)=\frac{1}{1+e^{-x}}\)

x=np.arange(-10,10,0.1)
y=1/(1+np.exp(-x))
plt.plot(x,y)
plt.show()

machine_learning_52_0.png

Figure 8: png

1.3.5. cost function

Logistic regression does not use MSE as its cost function (see: why sigmoid is paired with BCE loss).

Instead, logistic regression uses cross entropy as its cost function; intuitively, the closer two probability distributions are, the smaller their cross entropy.

1.3.6. cross entropy

\(H(p,q)=-\sum _{x}p(x)\,\log q(x).\!\)

def cross_entropy(predictions, labels):
    epsilon = 1e-12
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    N = predictions.shape[0]

    ce = 0 - np.sum(np.sum(labels * np.log(predictions))) / N
    return ce

print(cross_entropy(np.array([[0.1,0.9]]),np.array([[1,0]])))
print(cross_entropy(np.array([[0.9,0.1]]),np.array([[1,0]])))
print(cross_entropy(np.array([[0.9,0.1]]),np.array([[0,1]])))
print(cross_entropy(np.array([[0.2,0.8]]),np.array([[0,1]])))
2.3025850929940455
0.10536051565782628
2.3025850929940455
0.2231435513142097

The cost function of logistic regression is a simplified form of cross entropy:

i.e. p is \((y,1-y)\) and q is \((\hat{y},1-\hat{y})\),

so its cost function is

\(J(W,B)=-\sum_1^m({y*\log(\hat{y})+(1-y)*\log(1-\hat{y})})\)
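
For a single sample this is the same number the cross_entropy above returns when the two "classes" are \(\hat{y}\) and \(1-\hat{y}\); a quick check:

y_hat, y = 0.9, 1
print(-(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)))
# 0.1053..., matches cross_entropy(np.array([[0.9, 0.1]]), np.array([[1, 0]])) above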

x=np.arange(0.01,1,0.01)
plt.plot(x,-np.log(x),label="-log(x)")
plt.plot(x,-np.log(1-x),label="-log(1-x)")
plt.legend()
plt.show()

machine_learning_59_0.png

Figure 9: png

1.3.7. partial derivative

\(h(x)=Wx+B\)

\(\hat{y}=sigmoid(h(x))\)

\(J(W,B)=-\sum({y*\log(\hat{y})+(1-y)*\log(1-\hat{y})})\)

\(\frac{\partial}{\partial{W}}J(W,B) = -\sum({\frac{y}{\hat{y}}\frac{\partial}{\partial{W}}\hat{y}-\frac{1-y}{1-\hat{y}}\frac{\partial}{\partial{W}}\hat{y}})\)

\(\frac{\partial}{\partial{W}}{\hat{y}} = \hat{y}*(1-\hat{y})*x\)

\(\frac{\partial}{\partial{W}}J(W,B) = \frac{1}{m}{\sum{(\hat{y}-y})*x}\)

\(\frac{\partial}{\partial{B}}J(W,B) = \frac{1}{m}{\sum{(\hat{y}-y})}\)

1.3.8. logistic regression example

from sklearn import preprocessing
from sklearn.datasets import make_moons
from sklearn.datasets import make_circles
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from PIL import Image
import sys

EPOCH = 500
LEARNING_RATE = 0.01
BATCH_SIZE = 10
FEATURE_SIZE = 0
POLY_FEATURES = 3

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def cross_entropy(predictions, labels):
    epsilon = 1e-12
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    N = predictions.shape[0]
    ce = 0 - np.sum(np.sum(labels * np.log(predictions))) / N
    return ce

def draw_decision_boundary(W, B):
    xx, yy = np.meshgrid(
        np.arange(-1.5, 2.5, 0.02), np.arange(-1.5, 2.5, 0.02))
    X = np.c_[xx.ravel(), yy.ravel()]

    poly = preprocessing.PolynomialFeatures(POLY_FEATURES, include_bias=False)
    X = poly.fit_transform(X)

    Y = ((sigmoid(np.matmul(X, W) + B)) > 0.5)[:, 0]
    Y = Y.reshape(xx.shape)
    cm = ListedColormap(['#FF0000', '#0000FF'])
    plt.contour(xx, yy, Y, cmap=cm)

def get_training_set():
    X, Y = make_moons(n_samples=1000, noise=0.2)
    # X, Y = make_circles(n_samples=1000, noise=0.2, factor=0.5)
    Y = Y.reshape(1000, 1)
    poly = preprocessing.PolynomialFeatures(POLY_FEATURES, include_bias=False)
    X = poly.fit_transform(X, Y)
    global FEATURE_SIZE
    FEATURE_SIZE = X.shape[1]
    return X, Y

def cost_function(X, Y, W, B):
    m = len(X)
    assert (len(X) == len(Y))

    J = 0.

    dw = np.zeros_like(W)
    db = np.zeros_like(B)

    y_hat = sigmoid(np.matmul(X, W) + B)
    J = cross_entropy(np.concatenate((y_hat, 1 - y_hat)), np.concatenate((Y, 1 - Y)))
    dw = np.matmul(X.T, y_hat - Y) / m
    db = (y_hat - Y).mean(axis=0)
    return J, dw, db

W,B=0,0
def gradient_decent(X, Y, W, B):
    alpha = LEARNING_RATE
    for epoch in range(EPOCH):
        batch = len(X) // BATCH_SIZE
        total_loss = 0
        for X_batch, Y_batch in zip(
                np.split(X[:batch * BATCH_SIZE], batch),
                np.split(Y[:batch * BATCH_SIZE], batch)):
            cost, dw, db = cost_function(X_batch, Y_batch, W, B)
            total_loss += cost
            W = W - alpha * dw
            B = B - alpha * db

        if epoch % (EPOCH//30) == 0:
            print("training: #", epoch, total_loss / batch)

    return W, B

def train():
    global W,B
    X, Y = get_training_set()

    Z = np.concatenate((X, Y), axis=1)
    np.random.shuffle(Z)
    X = Z[:, :FEATURE_SIZE]
    Y = Z[:, FEATURE_SIZE:]

    W = np.random.randn(FEATURE_SIZE, 1)
    B = np.random.randn(1, 1)

    W, B = gradient_decent(X, Y, W, B)

def predict():
    global W,B
    X, Y = get_training_set()
    cm = ListedColormap(['#FF0000', '#0000FF'])
    # plt.plot(X[:, 0], X[:, 1], 'bo', label='Real data')
    y_hat = ((sigmoid(np.matmul(X, W) + B)) > 0.5)[:, 0]
    plt.scatter(x=X[:, 0], y=X[:, 1], c=Y[:, 0], cmap=cm)
    # plt.scatter(x=X[:, 0], y=X[:, 1], c=y_hat, cmap=cm)
    draw_decision_boundary(W, B)
    plt.show()

train()
predict()
training: # 0 0.619742973585997
training: # 16 0.17178658979627992
training: # 32 0.13487915355393248
training: # 48 0.12197846405639046
training: # 64 0.11395684458496719
training: # 80 0.10787208561892815
training: # 96 0.10291655423822696
training: # 112 0.0987579063665693
training: # 128 0.09521177753383403
training: # 144 0.09215291261899158
training: # 160 0.08948702750211028
training: # 176 0.08714053451732778
training: # 192 0.08505555055681267
training: # 208 0.08318653814486694
training: # 224 0.08149765158102548
training: # 240 0.07996058158625095
training: # 256 0.07855283335926001
training: # 272 0.07725638132916592
training: # 288 0.076056636971483
training: # 304 0.07494166614098187
training: # 320 0.07390159911921022
training: # 336 0.07292818633539222
training: # 352 0.07201446275282071
training: # 368 0.0711544927837573
training: # 384 0.0703431748029327
training: # 400 0.06957608988875362
training: # 416 0.06884938357165127
training: # 432 0.06815967240663312
training: # 448 0.06750396938582705
training: # 464 0.06687962379072875
training: # 480 0.06628427222462412
training: # 496 0.0657157983897877

machine_learning_64_1.png

Figure 10: png

1.3.9. polynomial features

In the moons example the decision boundary is clearly not a straight line. If we do the regression on only the raw features, we can never get anything but a straight line, as in the POLY\_FEATURES = 1 case below.

POLY_FEATURES=1
train()
predict()
training: # 0 0.2854766049060549
training: # 16 0.17277657607109922
training: # 32 0.15635266723112187
training: # 48 0.1493546886034027
training: # 64 0.1455593393582365
training: # 80 0.14329109540467472
training: # 96 0.14185780692665098
training: # 112 0.14091661921781962
training: # 128 0.14028055239864892
training: # 144 0.13984099949762763
training: # 160 0.1395318373157802
training: # 176 0.139311279131328
training: # 192 0.1391521054830019
training: # 208 0.13903613821495753
training: # 224 0.13895098343274292
training: # 240 0.1388880435606131
training: # 256 0.13884126717244613
training: # 272 0.13880634230124886
training: # 288 0.13878016421457948
training: # 304 0.13876047744704142
training: # 320 0.13874563097468282
training: # 336 0.13873440831324527
training: # 352 0.1387259081022462
training: # 368 0.13871945923467058
training: # 384 0.1387145599485834
training: # 400 0.13871083374087895
training: # 416 0.13870799721751376
training: # 432 0.13870583649403773
training: # 448 0.1387041897724283
training: # 464 0.13870293441266346
training: # 480 0.13870197729689518
training: # 496 0.13870124761970357

machine_learning_67_1.png

Figure 11: png

To fit a curve instead, we transform the original features: e.g. the original features [\(x1\),\(x2\),\(x3\)] become [\(x1\),\(x2\),\(x3\),\(x1^2\),\(x2^2\),\(x3^2\),\(x1*x2\),\(x1*x3\),\(x2*x3\)]. sklearn's PolynomialFeatures class performs this transformation.
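
A quick look at what this transform produces for a single 2-feature sample, with degree 3 and include_bias=False as used in get_training_set above:

from sklearn import preprocessing
import numpy as np

poly = preprocessing.PolynomialFeatures(3, include_bias=False)
print(poly.fit_transform(np.array([[2.0, 3.0]])))
# [[ 2.  3.  4.  6.  9.  8. 12. 18. 27.]]
# i.e. x1, x2, x1^2, x1*x2, x2^2, x1^3, x1^2*x2, x1*x2^2, x2^3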

POLY_FEATURES=7
train()
predict()
training: # 0 1.072493450454601
training: # 16 0.12517269994676714
training: # 32 0.10019008105575024
training: # 48 0.08845764537675999
training: # 64 0.08117824825999641
training: # 80 0.07617331085450527
training: # 96 0.0724893405480805
training: # 112 0.06964268516153896
training: # 128 0.06736243690988726
training: # 144 0.06548490220832501
training: # 160 0.06390509140298205
training: # 176 0.06255237236394463
training: # 192 0.06137734432013081
training: # 208 0.0603443289005192
training: # 224 0.05942685621245631
training: # 240 0.05860483518675161
training: # 256 0.057862715087380216
training: # 272 0.05718825356269268
training: # 288 0.05657166868407597
training: # 304 0.056005041406647346
training: # 320 0.055481885676289505
training: # 336 0.05499683340343065
training: # 352 0.05454539978629234
training: # 368 0.05412380589131122
training: # 384 0.05372884272286389
training: # 400 0.05335776581488923
training: # 416 0.053008212586936033
training: # 432 0.052678136893091254
training: # 448 0.05236575670587306
training: # 464 0.05206951194142586
training: # 480 0.05178803019125905
training: # 496 0.051520098673991474

/usr/lib/python3.6/site-packages/ipykernel_launcher.py:18: RuntimeWarning: overflow encountered in exp

machine_learning_69_2.png

Figure 12: png

1.3.10. underfitting & overfitting

In the examples above, POLY\_FEATURES = 1 underfits, POLY\_FEATURES = 7 somewhat overfits, and POLY\_FEATURES = 3 looks about right.

1.3.11. regularization

  • without regularization

\(J(W,B)=-\sum({y*\log(\hat{y})+(1-y)*\log(1-\hat{y})})\)

\(\frac{\partial}{\partial{W}}J(W,B) = \frac{1}{m}{\sum{(\hat{y}-y})*x}\)

  • with regularization

\(J(W,B)=-\sum({y*\log(\hat{y})+(1-y)*\log(1-\hat{y})}) + \frac{\alpha}{2m}\sum_i{W_i^2}\)

\(\frac{\partial}{\partial{W}}J(W,B) = \frac{1}{m}{\sum{(\hat{y}-y})*x} + \frac{\alpha}{m}W\)

\(\alpha\) controls the strength of the regularization: when \(\alpha\) is very small the model tends to overfit, and when \(\alpha\) is very large it tends to underfit.
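
In code the only change is the extra penalty term and its gradient. A sketch based on the logistic-regression cost_function above (ALPHA and cost_function_regularized are illustrative names; sigmoid and cross_entropy as defined earlier):

ALPHA = 0.01  # regularization strength

def cost_function_regularized(X, Y, W, B):
    m = len(X)
    y_hat = sigmoid(np.matmul(X, W) + B)
    J = cross_entropy(np.concatenate((y_hat, 1 - y_hat)), np.concatenate((Y, 1 - Y)))
    J += ALPHA * np.sum(np.square(W)) / (2 * m)          # + alpha/(2m) * sum(W_i^2)
    dw = np.matmul(X.T, y_hat - Y) / m + ALPHA * W / m   # + alpha/m * W
    db = (y_hat - Y).mean(axis=0)                        # the bias is not regularized
    return J, dw, db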

1.5. multi-class logistic regression

Plain logistic regression does binary classification: the output of the hypothesis function can be read as the probability that the prediction is 1, and the label has only two possible values, 0 or 1.

In a multi-class problem the label can take many values. MNIST handwritten-digit recognition is a classic multi-classification problem.

1.5.1. training set

In the mnist training set, each image is 28*28 pixels; for processing we flatten it into an array of size 784.

Each label is a value in the range 0~9, the digit the image represents.
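
A minimal sketch of that preprocessing for one hypothetical image (the all-zeros array just stands in for real pixel data):

import numpy as np

img = np.zeros((28, 28))      # one 28*28 grayscale image
x = img.reshape(1, 784)       # flattened into a single row of 784 features
label = 7                     # the digit this image represents, in 0~9
print(x.shape)                # (1, 784)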

from os import listdir
from os.path import isfile, join

def get_test_set():
    X = np.ndarray(shape=(0, 784))
    Y = np.ndarray(shape=(0, 10))
    baseDir = "/home/sunway/program/mnist/testSet"
    for i in range(10):
        currDir = (baseDir + "/" + str(i))
        files = [
            join(currDir, f) for f in listdir(currDir)
            if isfile(join(currDir, f))
        ]
        for f in files:
            X = np.concatenate((X, np.array(Image.open(f)).reshape(1, 784)))
            Y = np.concatenate((Y, one_hot(np.array([i]), 10)))

    return X, Y

X,Y=get_test_set()
print("X:",X[0])
print()
print("Y:",Y[0])
X: [  0.   0.   0.   0.   0.   0.   0.   0.   0.  11.   3.   0.   6.   8.
   0.   9.   3.   0.   9.   0.   7.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   9.   0.   0.
  14.   0.   0.   1.   3.   0.   0.   3.   1.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.  12.   0.  27.   0.   0.  10.
  15.   0.   0.   0.   0.  20.   0.   0.   0.   4.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.  28.   0.  14.   0.   0.
   0.   2.   9.   5.   0.   1.   0.   9.   7.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   7.   0.   6.   0.  13.   0.
  10. 108. 168. 224. 231. 145.  58.   0.   0.   6.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   7.   3.   6.   0.  14.  56.
 208. 237. 254. 255. 255. 247. 208.  79.   0.   2.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.  12.   0.   0.   8. 215.
 255. 255. 171. 136. 169. 231. 255. 150.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   6.   0.   5. 160. 247.
 255. 192.  32.   0.   4. 104. 248. 238.  33.   7.   0.   0.   0.   0.
   0.   1.   0.   6.   7.   0.   0.   0.   4.   4.   0.  12. 231. 255.
 178.  11.   3.   0.   7.  19. 246. 255.  55.   0.   0.   0.   0.   0.
   0.   5.   0.   0.   0.   5.  10.   2.   0.   0.   0. 145. 255. 232.
   7.   0.   0.  14.   0.  10. 255. 255.  95.  12.   0.   0.   0.   0.
   0.   0.   4.   0.   0.   4.   5.   0.   0.  14.  97. 232. 255. 138.
   0.   3.   0.   7.   0.  18. 255. 234.  48.   0.   0.   0.   0.   0.
   0.   0.   8.  11.   0.   0.   0.   1.   6.   5. 218. 249. 250. 110.
   0.   6.   0.   0.   8.  43. 250. 252.  25.   7.   0.   0.   0.   0.
   6.   0.   5.   0.   0.   6.   0.   7.   1. 170. 255. 188. 133.  38.
   0.   0.   4.   0.  10.  83. 239. 247.  30.   5.   0.   0.   0.   0.
   0.   0.   7.   0.   0.  15.   0.   0. 118. 255. 252.  54.   0.  14.
   6.   6.   4.   0.   2. 175. 255. 163.  10.   0.   0.   0.   0.   0.
   0.   0.   8.   0.   5.   3.   0.  49. 250. 248.  84.  17.   0.   0.
  31.   0.   0.   0.  66. 255. 255.  79.  10.   6.   0.   0.   0.   0.
   9.   0.   0.   0.   6.   0.  13. 168. 248. 244.   7.   0.   9.   0.
   0.   7.   3.  33. 176. 255. 186.  16.   0.   0.   0.   0.   0.   0.
   2.   3.   0.   5.   0.   3.  61. 237. 255.  68.   0.   8.   2.   0.
  12.   2.   8. 112. 255. 239. 103.   0.   0.   0.   0.   0.   0.   0.
   0.   5.   0.   0.   9.   0. 147. 254. 173.  20.   0.   0.   6.  11.
   0.   0.  76. 250. 233. 133.   0.   8.   4.   5.   0.   0.   0.   0.
   0.   0.   3.   0.  10.  18. 214. 238.  43.   0.   6.   0.  10.   0.
   0. 141. 252. 255. 242.  34.   3.   0.   7.   0.   0.   0.   0.   0.
   7.   0.   5.   3.   0.  95. 233. 205.   9.   0.  18.   0.  15.  98.
 182. 248. 255. 228.  28.   0.   0.   7.   0.  29.   0.   0.   0.   0.
   4.   0.   0.   6.   0. 129. 255. 216.  32.  28.  35.  96. 212. 241.
 252. 255. 203.  76.  31.   0.   1.  14.   0.   0.   0.   0.   0.   0.
   0.  10.   0.   0.   0.  84. 255. 255. 255. 251. 252. 255. 255. 255.
 244. 106.   0.   8.   0.   0.  16.   0.   1.   0.   0.   0.   0.   0.
   0.   1.   0.   0.   3.  21. 190. 254. 252. 248. 245. 250. 251. 159.
  65.   0.  10.   0.   9.  17.   0.   0.   0.  14.   0.   0.   0.   0.
  10.   0.   9.   4.   0.   0.  78. 213. 198. 184. 174. 106.  57.   3.
   0.   7.   0.   5.   0.   0.  14.   0.   0.   2.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]

Y: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

For mnist, W's shape is no longer [784, 1] but [784, 10], where W[:,0] is used to predict the probability that the image is a 0, W[:,1] the probability that it is a 1, and so on.

So \(XW+B\) is an [m,10] matrix, but the values of Y are 0~9; we therefore need to turn each label into a [10,1] matrix whose n-th row represents the probability that the label is n.

This label conversion is called one\_hot encoding.

1.5.2. one_hot

def one_hot(Y, C):
    one = np.eye(C)[Y.reshape(-1)]
    return one

x=np.array([[0,1,2,3,4,5,6,7,8,9]])
print(one_hot(x,10))
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]

Plain logistic regression uses sigmoid to turn a single value into =the probability that the result is 1=. For mnist, \(XW+B\) outputs a [10,1] matrix, so we need a hypothesis function that turns this matrix into 10 probabilities, which a cost\_function then compares against the one\_hot encoded label.

One way to turn the [10,1] matrix into 10 probabilities is to apply sigmoid to each entry separately, but the more common approach is the softmax function.

1.5.3. softmax

\(softmax(x)_i=\frac{e^{x_i}}{\sum_j{e^{x_j}}}\)

def softmax(X):
    exps = np.exp(X)
    return exps / np.sum(exps)

x=np.array([[1,2,3,4]])
print(softmax(x))
[[0.0320586  0.08714432 0.23688282 0.64391426]]
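
The naive version above overflows as soon as the inputs get large (np.exp(1000) is inf). The later examples therefore subtract the per-row maximum before exponentiating, which leaves the result unchanged because softmax is shift-invariant; a sketch of that stable form:

def softmax_stable(X):
    # subtracting the row maximum does not change softmax, but avoids overflow in np.exp
    shifted = X - np.max(X, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

x = np.array([[1, 2, 3, 4]])
print(softmax_stable(x))           # same result as softmax(x) above
print(softmax_stable(x + 1000))    # still finite; the naive version would return nan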

1.5.4. partial derivative

\(h=WX+B\)

\(\hat{y}=softmax(h)\)

\(J=cross\_entropy(\hat{y},y)\)

\(\frac{\partial}{\partial{W}}J(W,B) = \frac{1}{m}{\sum{(\hat{y}-y})*x}\)

\(\frac{\partial}{\partial{B}}J(W,B) = \frac{1}{m}{\sum{(\hat{y}-y})}\)

1.5.5. load_digits logistic regression example

import numpy as np
from PIL import Image
import sys
from sklearn import preprocessing
from sklearn.datasets import load_digits

EPOCH = 100
LEARNING_RATE = 0.01
BATCH_SIZE = 2
REGULARIZATION_FACTOR = 0.01

def softmax(z):
    s = np.max(z, axis=1)
    s = s[:, np.newaxis]  # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis]  # ditto
    return e_x / div

def one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

def cross_entropy(predictions, labels):
    epsilon = 1e-12
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    N = predictions.shape[0]

    ce = 0 - np.sum(np.sum(labels * np.log(predictions))) / N
    return ce

def cost_function(X, Y, W, B):
    m = len(X)
    assert (len(X) == len(Y))

    J = 0.

    dw = np.zeros_like(W)
    db = np.zeros_like(B)

    # for i in range(m):
    #     x = X[i, :].reshape(1, 64)
    #     y = Y[i, :].reshape(1, 10)
    #     f = np.matmul(x, W) + B
    #     assert (f.shape == (1, 10))
    #     y_hat = softmax(f)
    #     assert (y.shape == y_hat.shape)
    #     loss = cross_entropy(y_hat, y)
    #     J += loss

    #     dw += np.matmul(np.transpose(x), y_hat - y)
    #     db += y_hat - y

    # J /= m
    # dw /= m
    # db /= m

    f = np.matmul(X, W) + B
    y_hat = softmax(f)
    J = cross_entropy(y_hat, Y)
    dw = np.matmul(np.transpose(X), y_hat - Y) / m
    db = (y_hat - Y).mean(axis=0)

    # regularization
    J += REGULARIZATION_FACTOR * sum(sum(np.square(W))) / (2 * m)
    dw += REGULARIZATION_FACTOR * W / m
    return J, dw, db

X_train, Y_train, X_test, Y_test = 0, 0, 0, 0
W, B = 0, 0

def get_training_set():
    X, Y = load_digits(10, True)
    Y = Y.reshape(-1, 1)
    Y = one_hot(Y, 10)
    [m, features] = X.shape
    Z = np.concatenate((X, Y), axis=1)
    np.random.shuffle(Z)
    X = Z[:, :features]
    Y = Z[:, features:]
    global X_train, Y_train, X_test, Y_test
    offset = int(0.8 * m)
    X_train, Y_train = X[:offset], Y[:offset]
    X_test, Y_test = X[offset:], Y[offset:]

def gradient_decent(X, Y, W, B):
    alpha = LEARNING_RATE
    for epoch in range(EPOCH):
        batch = len(X) // BATCH_SIZE
        total_loss = 0
        for X_batch, Y_batch in zip(
                np.split(X[:batch * BATCH_SIZE], batch),
                np.split(Y[:batch * BATCH_SIZE], batch)):
            cost, dw, db = cost_function(X_batch, Y_batch, W, B)
            total_loss += cost
            W = W - alpha * dw
            B = B - alpha * db
        if epoch % (EPOCH // 30) == 0:
            print("training: #", epoch, total_loss / batch)
    return W, B

def predict():
    global X_test, Y_test, W, B

    wrong = 0
    correct = 0
    for i in range(len(X_test)):
        x = X_test[i, :].reshape(1, -1)
        y = Y_test[i, :].reshape(1, -1)
        c = np.matmul(x, W) + B
        y_hat = softmax(c)

        if np.argmax(y_hat) == np.argmax(y):
            correct += 1
        else:
            wrong += 1
    print("correct: %d, wrong: %d, accuracy: %f" % (correct, wrong, correct /
                                                    (correct + wrong)))

def train():
    global W, B, X_train, Y_train
    W = np.random.randn(64, 10)
    B = np.random.randn(1, 10)
    W, B = gradient_decent(X_train, Y_train, W, B)

get_training_set()
train()
predict()
training: # 0 7.413641925368053
training: # 3 2.061976320350589
training: # 6 1.5752044059841135
training: # 9 1.25185391564078
training: # 12 1.0334895315893526
training: # 15 0.9154572322047317
training: # 18 0.7912384177526893
training: # 21 0.6569779524881322
training: # 24 0.6154367878411028
training: # 27 0.6250007208991311
training: # 30 0.519215167487594
training: # 33 0.5119700580330538
training: # 36 0.5275788708720639
training: # 39 0.5304955801897778
training: # 42 0.44298460952993773
training: # 45 0.40653637116386976
training: # 48 0.45331106089167467
training: # 51 0.37611751513467656
training: # 54 0.37240293166520994
training: # 57 0.4553268960419496
training: # 60 0.3623002402468144
training: # 63 0.37149394065324326
training: # 66 0.4276031139481319
training: # 69 0.4003649152441614
training: # 72 0.43875428025202756
training: # 75 0.40422299306062504
training: # 78 0.4026933009211833
training: # 81 0.40849225974261905
training: # 84 0.3801959998258428
training: # 87 0.3976900455580225
training: # 90 0.4330326570439522
training: # 93 0.4622412012606176
training: # 96 0.4132252813914691
training: # 99 0.3563033761424611
correct: 339, wrong: 21, accuracy: 0.941667

1.6. artificial neural networks

An ANN can be seen as multiple layers of logistic regression cascaded together: the output of one layer's logistic regression becomes the input of the next layer's.

ann1.png

Figure 13: ann

Remove the hidden layer and you are left with basic logistic regression.

1.6.1. activation function

Each node in a hidden layer can be thought of as a logistic regression: it simply computes \(sigmoid(WX+B)\), where in an ANN the sigmoid is called the activation function.

Besides sigmoid, other commonly used activation functions include tanh and relu.

ann2.png

Figure 14: ann2
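
A quick self-contained plot comparing the three activation functions:

import numpy as np
import matplotlib.pyplot as plt

x = np.arange(-5, 5, 0.1)
plt.plot(x, 1 / (1 + np.exp(-x)), label="sigmoid")
plt.plot(x, np.tanh(x), label="tanh")
plt.plot(x, np.maximum(x, 0), label="relu")
plt.legend()
plt.show()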

1.6.2. relu

relu is the activation function most commonly recommended today.

\(relu(x)=\max(x,0)\)

def relu(X):
    return np.maximum(X, 0)

x=np.arange(-10,10,0.1)
plt.plot(x,relu(x))
plt.show()

machine_learning_97_0.png

Figure 15: png

1.6.3. forward propagation

Take mnist as an example:

  1. suppose the ann has one hidden layer, and that layer has three nodes
  2. the input layer has 784 features
  3. the output layer has 10 nodes, one for each of the digits 0~9
  4. the hidden layer uses sigmoid as its activation function
  5. the output layer uses softmax + cross\_entropy as the cost function

The W between the input layer and the hidden layer is denoted W1, with shape [784, 3]; W1[:,0] corresponds to the first node of the hidden layer.

The W between the hidden layer and the output layer is denoted W2, with shape [3, 10]; W2[:,0] corresponds to the first node of the output layer.

forward propagation

\(c^{1}=W^{1}X+B^{1}\)

\(a^{1}=sigmoid(c^{1})\)

\(c^{2}=W^{2}*a^{1}+B^{2}\)

\(\hat{y}=softmax(c^{2})\)

\(cost=cross\_entropy(\hat{y},y)\)
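
A shape-only sketch of this forward pass (the parameters are random placeholders just to show the dimensions; sigmoid and the stable softmax are written inline so the snippet runs on its own):

import numpy as np

m = 5                                         # a batch of 5 flattened images
X = np.random.rand(m, 784)
W1, B1 = np.random.randn(784, 3), np.random.randn(1, 3)
W2, B2 = np.random.randn(3, 10), np.random.randn(1, 10)

c1 = np.matmul(X, W1) + B1                    # (m, 3)
a1 = 1 / (1 + np.exp(-c1))                    # sigmoid activation of the hidden layer
c2 = np.matmul(a1, W2) + B2                   # (m, 10)
e = np.exp(c2 - c2.max(axis=1, keepdims=True))
y_hat = e / e.sum(axis=1, keepdims=True)      # softmax: one probability per digit
print(a1.shape, y_hat.shape)                  # (5, 3) (5, 10)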

1.6.4. backward propagation

We need to compute each of the following:

  • \(\frac{\partial}{\partial{W1}}J(W1,W2,B1,B2)\)
  • \(\frac{\partial}{\partial{B1}}J(W1,W2,B1,B2)\)
  • \(\frac{\partial}{\partial{W2}}J(W1,W2,B1,B2)\)
  • \(\frac{\partial}{\partial{B2}}J(W1,W2,B1,B2)\)

\(\frac{\partial}{\partial{W2}}J(W1,W2,B1,B2)=\frac{1}{m}\sum(\hat{y}-y)a1\)

\(\frac{\partial}{\partial{B2}}J(W1,W2,B1,B2)=\frac{1}{m}\sum(\hat{y}-y)\)

backward propagation

\(\frac{\partial}{\partial{W1}}J(W1,W2,B1,B2)= \frac{\partial}{\partial{a1}}J(W1,W2,B1,B2) * \frac{\partial}{\partial{W1}}a{1}\)

\(\frac{\partial}{\partial{W1}}a1=a1*(1-a1)*x\)

\(\frac{\partial}{\partial{B1}}J(W1,W2,B1,B2)= \frac{\partial}{\partial{a1}}J(W1,W2,B1,B2) * \frac{\partial}{\partial{B1}}a{1}\)

\(\frac{\partial}{\partial{B1}}a1=a1*(1-a1)\)

1.6.5. make_moon ANN example

import numpy as np
import sys
from sklearn import preprocessing
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons

HIDDEN_NODES_NUM = 30
LEARNING_RATE = 0.01
EPOCH = 2000
BATCH_SIZE = 100

def relu(X):
    return np.maximum(X, 0)

def relu_derivative(X):
    return 1. * (X > 0)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis]  # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis]  # ditto
    return e_x / div

def cross_entropy(predictions, labels):
    epsilon = 1e-12
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    N = predictions.shape[0]

    ce = 0 - np.sum(np.sum(labels * np.log(predictions)))/N
    return ce

def cost_function(X, Y, W1, B1, W2, B2):
    # X:   m   *  2
    # Y:   m   *  1
    # W1:  2   *  HIDDEN_NODES_NUM
    # B1:  1   *  HIDDEN_NODES_NUM
    # W2:  HIDDEN_NODES_NUM   *  1
    # B2:  1   *  1

    m = len(X)
    J = 0.
    dw1 = np.zeros_like(W1)
    db1 = np.zeros_like(B1)
    dw2 = np.zeros_like(W2)
    db2 = np.zeros_like(B2)
    # for i in range(m):
    #     x = X[i, :].reshape(1, 2)
    #     y = Y[i, :].reshape(1, 1)
    #     z1 = np.matmul(x, W1) + B1
    #     a1 = relu(z1)
    #     z2 = np.matmul(a1, W2) + B2
    #     y_hat = sigmoid(z2)
    #     loss = cross_entropy(
    #            np.concatenate((y_hat, 1 - y_hat)), np.concatenate((y, 1 - y)))
    #     J += loss
    #     # bp
    #     delta3 = y_hat - y
    #     dw2 += np.matmul(np.transpose(a1), delta3)
    #     db2 += delta3
    #     # 1*HIDDEN_NODES_NUM
    #     delta2 = np.matmul(delta3, np.transpose(W2))
    #     delta2 = delta2 * relu_derivative(a1)

    #     # dw1: 784 * HIDDEN_NODES_NUM
    #     dw1 += np.matmul(np.transpose(x), delta2)
    #     # db1: 1 * HIDDEN_NODES_NUM
    #     db1 += delta2
    # J /= m
    # dw1 /= m
    # db1 /= m
    # dw2 /= m
    # db2 /= m

    z1 = np.matmul(X, W1) + B1
    a1 = relu(z1)
    z2 = np.matmul(a1, W2) + B2
    y_hat = sigmoid(z2)

    J = cross_entropy(
         np.c_[y_hat,1-y_hat],np.c_[Y,1-Y])

    # backward propagation
    delta3 = y_hat - Y
    dw2 = np.matmul(np.transpose(a1), delta3) / m
    db2 = delta3.mean(axis=0)

    delta2 = np.matmul(delta3, np.transpose(W2))
    delta2 = delta2 * relu_derivative(a1)

    dw1 = np.matmul(np.transpose(X), delta2) / m
    db1 = delta2.mean(axis=0)
    return J, dw1, db1, dw2, db2

def get_training_set():
    X, Y = make_moons(n_samples=1000, noise=0.2)
    Y = Y.reshape(1000, 1)
    X = preprocessing.scale(X)
    return X, Y

def gradient_decent(X, Y, W1, B1, W2, B2):
    alpha = LEARNING_RATE
    for epoch in range(EPOCH):
        batch = len(X) // BATCH_SIZE
        total_loss = 0
        for X_batch, Y_batch in zip(
                np.split(X[:batch * BATCH_SIZE], batch),
                np.split(Y[:batch * BATCH_SIZE], batch)):
            cost, dw1, db1, dw2, db2 = cost_function(X_batch, Y_batch, W1, B1,
                                                     W2, B2)
            # print("training: #", epoch, cost)
            total_loss += cost
            W1 = W1 - alpha * dw1
            B1 = B1 - alpha * db1
            W2 = W2 - alpha * dw2
            B2 = B2 - alpha * db2
        if epoch % (EPOCH // 30) == 0:
            print("training: #", epoch, total_loss / batch)
    return W1, B1, W2, B2

def draw_decision_boundary(W1, B1, W2, B2):
    xx, yy = np.meshgrid(np.arange(-4, 4, 0.02), np.arange(-4, 4, 0.02))
    X = np.c_[xx.ravel(), yy.ravel()]

    z1 = np.dot(X, W1) + B1
    a1 = relu(z1)
    z2 = np.dot(a1, W2) + B2
    a2 = sigmoid(z2)
    y_hat = (a2 > 0.5)[:, 0]
    y_hat = y_hat.reshape(xx.shape)
    cm = ListedColormap(['#FF0000', '#0000FF'])
    plt.contour(xx, yy, y_hat, cmap=cm)

def train():
    global W1, B1, W2, B2
    X, Y = get_training_set()
    Z = np.concatenate((X, Y), axis=1)
    np.random.shuffle(Z)
    X = Z[:, :2]
    Y = Z[:, 2:]
    W1 = np.random.randn(2, HIDDEN_NODES_NUM)
    B1 = np.random.randn(1, HIDDEN_NODES_NUM)
    W2 = np.random.randn(HIDDEN_NODES_NUM, 1)
    B2 = np.random.randn(1, 1)
    W1, B1, W2, B2 = gradient_decent(X, Y, W1, B1, W2, B2)

W1, B1, W2, B2 = 0, 0, 0, 0

def predict():
    global W1, B1, W2, B2
    X, Y = get_training_set()
    cm = ListedColormap(['#FF0000', '#0000FF'])
    plt.scatter(x=X[:, 0], y=X[:, 1], c=Y[:, 0], cmap=cm)
    draw_decision_boundary(W1, B1, W2, B2)
    plt.show()

train()
predict()
training: # 0 1.1010464305437424
training: # 66 0.26612700400246736
training: # 132 0.2279739563132131
training: # 198 0.205101820126328
training: # 264 0.1884051410770355
training: # 330 0.17477153275449023
training: # 396 0.16369062967529763
training: # 462 0.1545121735063119
training: # 528 0.14688858135821387
training: # 594 0.14037311583041678
training: # 660 0.13481409437384884
training: # 726 0.1299481748662618
training: # 792 0.1256549732023355
training: # 858 0.1218073002961437
training: # 924 0.11833164864213408
training: # 990 0.11520153425399286
training: # 1056 0.11239083604018733
training: # 1122 0.10985753265561604
training: # 1188 0.10755024891442373
training: # 1254 0.10549406656678581
training: # 1320 0.10362753609096016
training: # 1386 0.10193490774925815
training: # 1452 0.10040633693099095
training: # 1518 0.09902660303283958
training: # 1584 0.09776264737376265
training: # 1650 0.09658752873463469
training: # 1716 0.09549979433826702
training: # 1782 0.09442971652084056
training: # 1848 0.09343720322848759
training: # 1914 0.09254972524996466
training: # 1980 0.09171308756241331

machine_learning_107_1.png

Figure 16: png

1.6.6. gradient checking

def check_gradient():
    X, Y = get_training_set()

    W1 = np.random.randn(2, HIDDEN_NODES_NUM)
    B1 = np.random.randn(1, HIDDEN_NODES_NUM)
    W2 = np.random.randn(HIDDEN_NODES_NUM, 1)
    B2 = np.random.randn(1, 1)

    cost,dw1,db1,dw2,db2=cost_function(X,Y,W1,B1,W2,B2)
    W1[0,0]+=1e-3
    cost2,_,_,_,_=cost_function(X,Y,W1,B1,W2,B2)
    print((cost2-cost)/1e-3,dw1[0,0])
    W1[0,0]-=1e-3

    W2[0,0]+=1e-3
    cost2,_,_,_,_=cost_function(X,Y,W1,B1,W2,B2)
    print((cost2-cost)/1e-3,dw2[0,0])
    W2[0,0]-=1e-3

check_gradient()
-0.11087431749534638 -0.11087971896090518
0.2626766888012 0.2626112969246978

1.6.7. ANN underfitting

HIDDEN_NODES_NUM = 2
train()
predict()
training: # 0 0.2648734052642972
training: # 66 0.19051075781763988
training: # 132 0.16981209881990778
training: # 198 0.1601615756297182
training: # 264 0.15477825811739088
training: # 330 0.15119051277788853
training: # 396 0.14873291511906206
training: # 462 0.1469816534584579
training: # 528 0.14558059343098362
training: # 594 0.14465389497268938
training: # 660 0.14397394615231032
training: # 726 0.14345675216822246
training: # 792 0.1429674772570023
training: # 858 0.14263718647940898
training: # 924 0.14242584431954342
training: # 990 0.14230632971907475
training: # 1056 0.142229241092
training: # 1122 0.1421798949537487
training: # 1188 0.14214490864219884
training: # 1254 0.14210683414275965
training: # 1320 0.14206771306265004
training: # 1386 0.14201645856299278
training: # 1452 0.14195097849497493
training: # 1518 0.1419079606039002
training: # 1584 0.14188212493955574
training: # 1650 0.14184885602018105
training: # 1716 0.14181514251831864
training: # 1782 0.14177638752625493
training: # 1848 0.14173627526074
training: # 1914 0.14171024414399375
training: # 1980 0.1416899151978159

machine_learning_111_1.png

Figure 17: png

1.6.8. ANN overfitting

HIDDEN_NODES_NUM = 500
train()
predict()
training: # 0 2.0567161790524358
training: # 66 0.04604867297108463
training: # 132 0.03752036254177539
training: # 198 0.034219530144220814
training: # 264 0.03230772048803781
training: # 330 0.030934205053510792
training: # 396 0.02990669936277099
training: # 462 0.029094081507601677
training: # 528 0.028367062027028496
training: # 594 0.027774583043645072
training: # 660 0.027274793841699875
training: # 726 0.026865490017476307
training: # 792 0.026495695363192913
training: # 858 0.026169749917253328
training: # 924 0.02587170841831545
training: # 990 0.025602493238891168
training: # 1056 0.02536622820636284
training: # 1122 0.025144524174879168
training: # 1188 0.02492772562338179
training: # 1254 0.02471288682522171
training: # 1320 0.024507402054118534
training: # 1386 0.024313735181501994
training: # 1452 0.024136089408418184
training: # 1518 0.02397675666435648
training: # 1584 0.0238277056608206
training: # 1650 0.02367850858998746
training: # 1716 0.023515055913084094
training: # 1782 0.02335790705150862
training: # 1848 0.02323471795470979
training: # 1914 0.023123359844330806
training: # 1980 0.023017238312910197

machine_learning_113_1.png

Figure 18: png

1.6.9. load_digits ANN example

import numpy as np
import sys
from sklearn import preprocessing
from sklearn.datasets import load_digits

HIDDEN_NODES_NUM = 30
LEARNING_RATE = 0.01
EPOCH = 100
BATCH_SIZE = 2
REGULARIZATION_FACTOR = 0.01
USE_RELU = False

def relu(X):
    return np.maximum(X, 0)

def relu_derivative(X):
    return 1. * (X > 0)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(X):
    return X * (1 - X)

def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis]  # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis]  # ditto
    return e_x / div

def one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

def cross_entropy(predictions, labels):
    epsilon = 1e-12
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    N = predictions.shape[0]

    ce = 0 - np.sum(np.sum(labels * np.log(predictions))) / N
    return ce

def cost_function(X, Y, W1, B1, W2, B2):
    m = len(X)
    assert (len(X) == len(Y))

    J = 0.

    dw1 = np.zeros_like(W1)
    db1 = np.zeros_like(B1)

    dw2 = np.zeros_like(W2)
    db2 = np.zeros_like(B2)
    # normal
    # for i in range(m):
    #     x = X[i, :].reshape(1, -1)
    #     y = Y[i, :].reshape(1, -1)
    #     z1 = np.matmul(x, W1) + B1
    #     if USE_RELU:
    #         a1 = relu(z1)
    #     else:
    #         a1 = sigmoid(z1)
    #     z2 = np.matmul(a1, W2) + B2
    #     y_hat = softmax(z2)
    #     loss = cross_entropy(y_hat, y)
    #     J += loss
    #     # bp
    #     delta3 = y_hat - y
    #     dw2 += np.matmul(np.transpose(a1), delta3)
    #     db2 += delta3
    #     delta2 = np.matmul(delta3, np.transpose(W2))
    #     if USE_RELU:
    #         delta2 = delta2 * relu_derivative(a1)
    #     else:
    #         delta2 = delta2 * sigmoid_derivative(a1)
    #     dw1 += np.matmul(np.transpose(x), delta2)
    #     db1 += delta2
    # J += REGULARIZATION_FACTOR * (
    #     np.sum(np.sum(np.square(W1))) + np.sum(np.sum(np.square(W2)))) / 2
    # dw1 += REGULARIZATION_FACTOR * W1
    # dw2 += REGULARIZATION_FACTOR * W2
    # J /= m
    # dw1 /= m
    # db1 /= m
    # dw2 /= m
    # db2 /= m

    # vectorization
    z1 = np.matmul(X, W1) + B1
    if USE_RELU:
        a1 = relu(z1)
    else:
        a1 = sigmoid(z1)
    z2 = np.matmul(a1, W2) + B2
    y_hat = softmax(z2)
    J = cross_entropy(y_hat, Y)

    delta3 = y_hat - Y
    dw2 = np.matmul(np.transpose(a1), delta3) / m
    db2 = delta3.mean(axis=0)

    delta2 = np.matmul(delta3, np.transpose(W2))
    if USE_RELU:
        delta2 = delta2 * relu_derivative(a1)
    else:
        delta2 = delta2 * sigmoid_derivative(a1)

    dw1 = np.matmul(np.transpose(X), delta2) / m
    db1 = delta2.mean(axis=0)

    # regularization
    J += REGULARIZATION_FACTOR * (
        np.sum(np.sum(np.square(W1))) + np.sum(np.sum(np.square(W2)))) / (
            2 * m)
    dw1 += REGULARIZATION_FACTOR * W1 / m
    dw2 += REGULARIZATION_FACTOR * W2 / m

    return J, dw1, db1, dw2, db2

X_train, Y_train, X_test, Y_test = 0, 0, 0, 0
W1, B1, W2, B2 = 0, 0, 0, 0

def get_training_set():
    X, Y = load_digits(10, True)
    Y = Y.reshape(-1, 1)
    Y = one_hot(Y, 10)
    [m, features] = X.shape
    Z = np.concatenate((X, Y), axis=1)
    np.random.shuffle(Z)
    X = Z[:, :features]
    Y = Z[:, features:]
    global X_train, Y_train, X_test, Y_test
    offset = int(0.8 * m)
    X_train, Y_train = X[:offset], Y[:offset]
    X_test, Y_test = X[offset:], Y[offset:]

def gradient_decent(X, Y, W1, B1, W2, B2):
    alpha = LEARNING_RATE
    for epoch in range(EPOCH):
        batch = len(X) // BATCH_SIZE
        total_loss = 0
        for X_batch, Y_batch in zip(
                np.split(X[:batch * BATCH_SIZE], batch),
                np.split(Y[:batch * BATCH_SIZE], batch)):
            cost, dw1, db1, dw2, db2 = cost_function(X_batch, Y_batch, W1, B1,
                                                     W2, B2)
            total_loss += cost
            W1 = W1 - alpha * dw1
            B1 = B1 - alpha * db1
            W2 = W2 - alpha * dw2
            B2 = B2 - alpha * db2
        if epoch % (EPOCH // 20) == 0:
            print("training: #", epoch, total_loss / batch)

    return W1, B1, W2, B2

def predict():
    global X_test, Y_test, W1, B1, W2, B2

    wrong = 0
    correct = 0
    for i in range(len(X_test)):
        x = X_test[i, :].reshape(1, -1)
        y = Y_test[i, :].reshape(1, -1)
        z1 = np.matmul(x, W1) + B1
        if USE_RELU:
            a1 = relu(z1)
        else:
            a1 = sigmoid(z1)
        z2 = np.matmul(a1, W2) + B2
        a2 = softmax(z2)

        if np.argmax(a2) == np.argmax(y):
            correct += 1
        else:
            wrong += 1
    print("correct: %d, wrong: %d, accuracy: %f" % (correct, wrong, correct /
                                                    (correct + wrong)))

def train():
    global W1, B1, W2, B2, X_train, Y_train
    W1 = np.random.randn(64, HIDDEN_NODES_NUM)
    B1 = np.random.randn(1, HIDDEN_NODES_NUM)

    W2 = np.random.randn(HIDDEN_NODES_NUM, 10)
    B2 = np.random.randn(1, 10)

    W1, B1, W2, B2 = gradient_decent(X_train, Y_train, W1, B1, W2, B2)

get_training_set()
train()
predict()
training: # 0 8.179987275868209
training: # 5 4.487616756607857
training: # 10 3.190552810756847
training: # 15 2.3609422313800867
training: # 20 1.7760653062722396
training: # 25 1.3739858353803298
training: # 30 1.0825991991172912
training: # 35 0.8802306595921261
training: # 40 0.7309049938786311
training: # 45 0.6226726072585916
training: # 50 0.5489742276092043
training: # 55 0.49740708896607166
training: # 60 0.46216408821398436
training: # 65 0.43638596801652746
training: # 70 0.4120282356053934
training: # 75 0.3927758648862652
training: # 80 0.3769661143322518
training: # 85 0.36297691382952496
training: # 90 0.3523873696978746
training: # 95 0.34356971806469333
correct: 350, wrong: 10, accuracy: 0.972222

1.7. What's Next

  • Convolutional neural networks (ConvNets)
  • Recurrent neural networks

Author: [email protected]
Date:
Last updated: 2022-01-25 Tue 16:16

Creative Commons License