线性回归算法用到的数学公式及 Python 实现
约定,m个数据,每个数据n个特征
代价函数
def cost(theta, x, y):
    """Mean-squared-error cost for linear regression.

    theta: parameter vector, shape (n,)
    x:     design matrix, shape (m, n)
    y:     target vector, shape (m,)
    Returns the scalar J = sum((x @ theta - y)^2) / (2 * m).
    """
    residual = x @ theta - y  # prediction error, shape (m,)
    # dot-product of the residual with itself == sum of squared errors
    return (residual @ residual) / (2 * x.shape[0])
正则化代价函数
def regularized_cost(theta, X, y, l=1):
    """Linear-regression cost with L2 regularization.

    theta: parameter vector, shape (n,)
    X:     design matrix, shape (m, n)
    y:     target vector, shape (m,)
    l:     regularization strength lambda (default 1)
    The bias term theta[0] is excluded from the penalty (theta[1:]).

    BUG FIX: the original wrote `l.0`, which is a Python syntax error
    (attribute access on a name); the intended value is just `l`.
    """
    m = X.shape[0]
    # (lambda / 2m) * sum(theta_j^2) for j >= 1
    regularized_term = (l / (2.0 * m)) * np.power(theta[1:], 2).sum()
    return cost(theta, X, y) + regularized_term
梯度下降法求解
# Gradient-descent solver (fragment: the enclosing `def` header and the
# variables theta, x, y, alpha, epoch are defined outside this excerpt).
# alpha: learning rate; epoch: number of iterations.
# `loss` records the cost value after each parameter update.
loss = []
for i in range(epoch):
    #theta = theta - (((x.dot(theta) - y).ravel()).dot(x)) * alpha / x.shape[0]  # variant 1
    # variant 2: batch gradient step, theta -= alpha/m * (x@theta - y) @ x
    theta = theta - ((x @ theta - y) @ x) * alpha / x.shape[0]
    loss.append(cost(theta, x, y))
return theta, loss
正则化梯度下降法求解
求梯度
其中j=0,1,2...n
def gradient(theta, x, y):
    """Gradient of the (unregularized) linear-regression cost.

    theta: parameter vector, shape (n,)
    x:     design matrix, shape (m, n)
    y:     target vector, shape (m,)
    Returns the gradient vector, shape (n,).

    Fixes vs. original:
    - corrected the shape comments: x @ theta is (m, n) @ (n,) = (m,),
      and (m,) @ (m, n) = (n,)  (the old comments had the dims swapped);
    - removed the `.T` on the residual: transposing a 1-D array is a
      no-op, as the original author's own comment noted.
    """
    m = x.shape[0]
    # (x @ theta - y) has shape (m,); (m,) @ (m, n) = (n,)
    return ((x @ theta - y) @ x) / m
求正则化梯度
其中j=0
其中j=1,2..n
def regularized_gradient(theta, x, y, lam):
    """Gradient of the L2-regularized linear-regression cost.

    Adds the penalty lam * theta / m to the plain gradient, except for
    the bias component theta[0], which is never penalized.
    """
    m = x.shape[0]
    penalty = lam * theta / m
    penalty[0] = 0  # leave the bias term unpenalized
    return gradient(theta, x, y) + penalty
添加多项式特征,比如[1, x] => [1, x, x^2, x^3, x^4, x^5]
def gen_poly_features(x, power):
    """Append polynomial feature columns: [1, x] -> [1, x, x^2, ..., x^power].

    Column 0 is the bias and column 1 the raw feature; for every degree
    d = 2..power a column x[:, 1] ** d is appended on the right.  Degrees
    start at 2 because the first power is already present as column 1.
    Returns a new array; the input is not modified.
    """
    higher_powers = [np.power(x[:, 1], d).reshape(-1, 1)
                     for d in range(2, power + 1)]
    return np.hstack([x.copy()] + higher_powers)
获取数据的均值和误差
def get_means_std(x):
    """Return (means, stds) per column of x.

    stds uses ddof=1, i.e. the sample standard deviation.
    """
    return x.mean(axis=0), x.std(axis=0, ddof=1)
对数据进行z-score标准化(正态标准化)
计算公式:z=(x-mean(x))/std(x)
def feature_normalize(x, means, stds):
    """Z-score normalize every column of x except the bias column 0.

    z = (x - mean) / std, using the supplied per-column means and stds.
    Returns a new array; the input is not modified.
    """
    normalized = x.copy()
    normalized[:, 1:] = (normalized[:, 1:] - means[1:]) / stds[1:]
    return normalized
绘制学习曲线
def plot_learning_curve(minimum, theta, x_train, y_train, x_cv, y_cv, lam):
    '''
    Plot the learning curve: training error and cross-validation error as
    functions of the number of training examples.

    Growing subsets of the training set (size i = minimum..m) are used to
    fit a parameter vector theta; the errors are then computed with theta.
    1. Train the model on a subset of the training data to obtain theta.
    2. Compute training and cross-validation errors WITHOUT regularization
       (lambda = 0) — as stated in Andrew Ng's ML course (lecture 87,
       ~3:22); the original author notes they are unsure why.
    3. The cross-validation error is always computed on the FULL CV set,
       never on a subset.

    minimum: smallest subset size to start from. With minimum=1 the first
    errors can be huge, which makes the plot hard to read.

    NOTE(review): relies on module-level `opt` (presumably scipy.optimize),
    `plt` (presumably matplotlib.pyplot), `np`, and the sibling functions
    regularized_cost / regularized_gradient — confirm they are imported or
    defined in the full file.
    '''
    m = x_train.shape[0]
    training_cost, cv_cost = [], []
    for i in range(minimum, m + 1):
        # Fit theta on the first i training examples with the supplied
        # lambda, using TNC with the analytic gradient as jac.
        res = opt.minimize(fun=regularized_cost,
                           x0=theta, args=(x_train[:i, :], y_train[:i], lam),
                           method='TNC',
                           jac=regularized_gradient,
                           options={'disp': True})
        # Errors are evaluated with lambda = 0 (no regularization).
        tc = regularized_cost(res.x, x_train[:i, :], y_train[:i], 0)
        cv = regularized_cost(res.x, x_cv, y_cv, 0)
        training_cost.append(tc)
        cv_cost.append(cv)
    plt.plot(np.arange(len(training_cost)), training_cost, label='training cost')
    plt.plot(np.arange(len(cv_cost)), cv_cost, label='cv cost')
    plt.legend()
    plt.xlabel('Number of training examples')
    plt.ylabel('Error')
    plt.title('Learning curve for linear regression')
    plt.grid(True)