import numpy as np
import scipy.io as sio
from scipy.io import loadmat
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
d=loadmat('ex5data1.mat')
d
{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Fri Nov 4 22:27:26 2011',
'__version__': '1.0',
'__globals__': [],
'X': array([[-15.93675813],
[-29.15297922],
[ 36.18954863],
[ 37.49218733],
[-48.05882945],
[ -8.94145794],
[ 15.30779289],
[-34.70626581],
[ 1.38915437],
[-44.38375985],
[ 7.01350208],
[ 22.76274892]]),
'y': array([[ 2.13431051],
[ 1.17325668],
[34.35910918],
[36.83795516],
[ 2.80896507],
[ 2.12107248],
[14.71026831],
[ 2.61418439],
[ 3.74017167],
[ 3.73169131],
[ 7.62765885],
[22.7524283 ]]),
'Xtest': array([[-33.31800399],
[-37.91216403],
[-51.20693795],
[ -6.13259585],
[ 21.26118327],
[-40.31952949],
[-14.54153167],
[ 32.55976024],
[ 13.39343255],
[ 44.20988595],
[ -1.14267768],
[-12.76686065],
[ 34.05450539],
[ 39.22350028],
[ 1.97449674],
[ 29.6217551 ],
[-23.66962971],
[ -9.01180139],
[-55.94057091],
[-35.70859752],
[ 9.51020533]]),
'ytest': array([[ 3.31688953],
[ 5.39768952],
[ 0.13042984],
[ 6.1925982 ],
[17.08848712],
[ 0.79950805],
[ 2.82479183],
[28.62123334],
[17.04639081],
[55.38437334],
[ 4.07936733],
[ 8.27039793],
[31.32355102],
[39.15906103],
[ 8.08727989],
[24.11134389],
[ 2.4773548 ],
[ 6.56606472],
[ 6.0380888 ],
[ 4.69273956],
[10.83004606]]),
'Xval': array([[-16.74653578],
[-14.57747075],
[ 34.51575866],
[-47.01007574],
[ 36.97511905],
[-40.68611002],
[ -4.47201098],
[ 26.53363489],
[-42.7976831 ],
[ 25.37409938],
[-31.10955398],
[ 27.31176864],
[ -3.26386201],
[ -1.81827649],
[-40.7196624 ],
[-50.01324365],
[-17.41177155],
[ 3.5881937 ],
[ 7.08548026],
[ 46.28236902],
[ 14.61228909]]),
'yval': array([[ 4.17020201e+00],
[ 4.06726280e+00],
[ 3.18730676e+01],
[ 1.06236562e+01],
[ 3.18360213e+01],
[ 4.95936972e+00],
[ 4.45159880e+00],
[ 2.22763185e+01],
[-4.38738274e-05],
[ 2.05038016e+01],
[ 3.85834476e+00],
[ 1.93650529e+01],
[ 4.88376281e+00],
[ 1.10971588e+01],
[ 7.46170827e+00],
[ 1.47693464e+00],
[ 2.71916388e+00],
[ 1.09269007e+01],
[ 8.34871235e+00],
[ 5.27819280e+01],
[ 1.33573396e+01]])}
The .mat file contains a training set (X, y), a cross-validation set (Xval, yval), and a test set (Xtest, ytest). Flatten each (m, 1) column vector into a 1-D array:
X, y, Xval, yval, Xtest, ytest = map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])
df = pd.DataFrame({'water_level':X, 'flow':y})
df.head()
|   | water_level | flow |
|---|---|---|
| 0 | -15.936758 | 2.134311 |
| 1 | -29.152979 | 1.173257 |
| 2 | 36.189549 | 34.359109 |
| 3 | 37.492187 | 36.837955 |
| 4 | -48.058829 | 2.808965 |
# Visualize the training data
sns.lmplot(x='water_level', y='flow', data=df, fit_reg=False)
plt.show()

X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]
def cost(theta, X, y):
    # INPUT: parameter vector theta, data X, labels y
    # OUTPUT: value of the cost function
    # number of training examples
    m = X.shape[0]
    # squared-error cost
    inner = X @ theta - y
    square_sum = inner.T @ inner
    cost = square_sum / (2 * m)
    return cost
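In matrix form, the cost implemented above is

$$J(\theta) = \frac{1}{2m}(X\theta - y)^T (X\theta - y)$$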
theta = np.ones(X.shape[1])
cost(theta, X, y)
303.9515255535976
# Regularized cost function
def regularized_cost(theta, X, y, l=1):
    m = X.shape[0]
    # penalize every parameter except theta[0]
    regularized_term = (l / (2 * m)) * np.power(theta[1:], 2).sum()
    return cost(theta, X, y) + regularized_term
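The penalty skips the bias term $\theta_0$, matching theta[1:] in the code:

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2 + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$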
def gradient(theta, X, y):
    # INPUT: parameter vector theta, data X, labels y
    # OUTPUT: gradient at the current parameters
    # number of training examples
    m = X.shape[0]
    # gradient of the squared-error cost
    grad = (X.T @ (X @ theta - y)) / m
    return grad
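Vectorized, this gradient is

$$\nabla_\theta J = \frac{1}{m}X^T(X\theta - y)$$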
gradient(theta, X, y)
array([-15.30301567, 598.16741084])
def regularized_gradient(theta, X, y, l=1):
    m = X.shape[0]
    regularized_term = theta.copy()
    regularized_term[0] = 0  # theta0 is not regularized
    regularized_term = (l / m) * regularized_term
    return gradient(theta, X, y) + regularized_term
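Every parameter except $\theta_0$ picks up an extra $\frac{\lambda}{m}\theta_j$ term:

$$\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \qquad (j \geq 1)$$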
regularized_gradient(theta, X, y)
array([-15.30301567, 598.25074417])
def linear_regression_np(X, y, l=1):
    # STEP 1: initialize the parameters
    theta = np.ones(X.shape[1])
    # STEP 2: call the optimizer to fit the parameters
    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=regularized_gradient,
                       options={'disp': True})
    return res
final_theta = linear_regression_np(X, y, l=0).get('x')
b = final_theta[0]  # intercept
k = final_theta[1]  # slope
plt.scatter(X[:,1],y,label='training data',c='r')
plt.plot(X[:,1],X[:,1]*k+b,label='prediction')
plt.legend(loc=2)
plt.show()
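The fitted straight line clearly cannot follow the non-linear pattern in the data, a first hint that this model underfits.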

training_cost, cv_cost = [], []
m = X.shape[0]
for i in range(1, m + 1):
    # fit on the first i training examples (no regularization)
    res = linear_regression_np(X[:i, :], y[:i], l=0)
    # training cost on the subset, cross-validation cost on the full validation set
    tc = regularized_cost(res.x, X[:i, :], y[:i], l=0)
    cv = regularized_cost(res.x, Xval, yval, l=0)
    # store the results in training_cost and cv_cost
    training_cost.append(tc)
    cv_cost.append(cv)
plt.plot(np.arange(1,m+1),training_cost,label='training cost')
plt.plot(np.arange(1,m+1),cv_cost,label='cv cost')
plt.legend(loc=1)
plt.show()
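Both curves level off at a high cost as more training examples are added, so collecting more data would not help: the linear model suffers from high bias (it underfits).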

# Feature mapping: expand x into polynomial features f1..f_power
def poly_features(x, power, as_ndarray=False):
    data = {'f{}'.format(i): np.power(x, i) for i in range(1, power + 1)}
    df = pd.DataFrame(data)
    return df.values if as_ndarray else df
def normalize_feature(df):
    """Z-score normalize every column of the DataFrame."""
    return df.apply(lambda column: (column - column.mean()) / column.std())
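Each column is standardized with the z-score

$$x_{\text{norm}} = \frac{x - \mu}{\sigma}$$

where $\mu$ and $\sigma$ are that column's mean and standard deviation.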
def prepare_poly_data(*args, power):
    """
    Accepts any number of raw feature arrays (X, Xval, Xtest, ...)
    and returns the prepared versions in the same order.
    """
    def prepare(x):
        # polynomial feature mapping
        df = poly_features(x, power=power)
        # normalization
        ndarr = normalize_feature(df).values
        # add the bias column
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)
    return [prepare(x) for x in args]
# Reload the raw 1-D data (X, Xval, Xtest were overwritten with bias columns above)
X, y, Xval, yval, Xtest, ytest = map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])
poly_features(X, power=3)
|   | f1 | f2 | f3 |
|---|---|---|---|
| 0 | -15.936758 | 253.980260 | -4047.621971 |
| 1 | -29.152979 | 849.896197 | -24777.006175 |
| 2 | 36.189549 | 1309.683430 | 47396.852168 |
| 3 | 37.492187 | 1405.664111 | 52701.422173 |
| 4 | -48.058829 | 2309.651088 | -110999.127750 |
| 5 | -8.941458 | 79.949670 | -714.866612 |
| 6 | 15.307793 | 234.328523 | 3587.052500 |
| 7 | -34.706266 | 1204.524887 | -41804.560890 |
| 8 | 1.389154 | 1.929750 | 2.680720 |
| 9 | -44.383760 | 1969.918139 | -87432.373590 |
| 10 | 7.013502 | 49.189211 | 344.988637 |
| 11 | 22.762749 | 518.142738 | 11794.353058 |
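Note the scale gap between f1 (tens) and f3 (up to roughly 1e5); this is why prepare_poly_data normalizes the features before adding the bias column, otherwise the optimization would be badly conditioned.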
X_poly, Xval_poly, Xtest_poly= prepare_poly_data(X, Xval, Xtest, power=8)
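As a quick sanity check, the expected shapes follow from the data dump above (12 training, 21 validation, 21 test examples, each mapped to a bias column plus 8 polynomial features); verify them in your own run:

X_poly.shape      # expected: (12, 9)
Xval_poly.shape   # expected: (21, 9)
Xtest_poly.shape  # expected: (21, 9)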
def plot_learning_curve(X, y, Xval, yval, l=0):
    # INPUT: training set X, y; cross-validation set Xval, yval; regularization parameter l
    # OUTPUT: learning curves of training cost and cross-validation cost
    # STEP 1: initialize the cost lists and get the number of examples
    training_cost, cv_cost = [], []
    m = X.shape[0]
    for i in range(1, m + 1):
        # STEP 2: fit the model on the first i training examples
        res = linear_regression_np(X[:i, :], y[:i], l=l)
        # STEP 3: compute the (unregularized) training and cross-validation costs
        tc = cost(res.x, X[:i, :], y[:i])
        cv = cost(res.x, Xval, yval)
        # STEP 4: store the results in training_cost and cv_cost
        training_cost.append(tc)
        cv_cost.append(cv)
    plt.plot(np.arange(1, m + 1), training_cost, label='training cost')
    plt.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
    plt.legend(loc=1)
plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)
plt.show()
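With l=0, the 8th-order polynomial drives the training cost to nearly zero while the cross-validation cost stays high: the model overfits (high variance), which is exactly what regularization is meant to correct.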
