Hand-Rolling Linear Regression Without the sklearn Toolkit


This note records how to implement the linear regression algorithm from scratch, based on the underlying principles covered in the previous chapters. We do not use the sklearn toolkit here; every step is implemented by hand, which gives a much deeper understanding of how the algorithm actually works. I'm working in Colab because it avoids most environment setup; personally I don't find local Jupyter as convenient. I have already written the whole thing line by line locally, and I strongly recommend writing it yourself as well — it pays off. I'll tidy up the full Colab notebook and post it in the comments later.

1. A first look at the training data

The dataset is downloaded from Kaggle.

import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('advertising.csv')

# Split into training and test data (random 80% / 20%)
train_data = data.sample(frac=0.8)
test_data = data.drop(train_data.index)

input_param_name = 'wechat'
output_param_name = 'sales'

x_train = train_data[[input_param_name]].values
y_train = train_data[[output_param_name]].values

x_test = test_data[input_param_name].values
y_test = test_data[output_param_name].values

plt.scatter(x_train, y_train, label='train data')
plt.scatter(x_test, y_test, label='test data')
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Ad data')
plt.legend()
plt.show()

(Figure: scatter plot of the train/test data)

2. Preprocessing the training data

The main step is normalization, also called standardization. First subtract the mean from the raw samples, so that each feature's otherwise scattered values become centered symmetrically around the origin. Then divide by the standard deviation, which puts every feature on a comparable scale. For example, if feature x1 takes values such as 10 and 1000 while x2 takes values such as 0.1 and 0.3, then after subtracting each feature's mean and dividing by its own standard deviation, both features end up roughly in the range -1 to 1.
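To make that concrete, here is the arithmetic for the example values above; with only two samples per feature, standardization maps them to exactly ±1:

$$\mu = \frac{10 + 1000}{2} = 505,\qquad \sigma = \sqrt{\tfrac{1}{2}\big((10-505)^2 + (1000-505)^2\big)} = 495,\qquad \frac{10-505}{495} = -1,\quad \frac{1000-505}{495} = 1$$

The same calculation for 0.1 and 0.3 also gives -1 and 1, so the large-valued feature and the small-valued one land on the same scale.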

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


def normalize(features):
    features_normalized = np.copy(features).astype(float)
    features_mean = np.mean(features, 0)
    features_deviation = np.std(features, 0)
    # Center each feature around the origin
    if features.shape[0] > 1:
        features_normalized -= features_mean
    print("type+type:", type(features_deviation))
    # Avoid division by zero for constant features
    features_deviation[features_deviation == 0] = 1
    features_normalized /= features_deviation
    return features_normalized, features_mean, features_deviation


def prepare_for_training(data, polynomial_degree=0, sinusoid_degree=0, normalize_data=0):
    num_examples = data.shape[0]
    data_processed = np.copy(data)

    features_mean = 0
    features_deviation = 0
    print("type666", type(data_processed))
    if normalize_data:
        (data_normalized,
         features_mean,
         features_deviation) = normalize(data_processed)
        data_processed = data_normalized

    # polynomial_degree / sinusoid_degree are kept in the signature for later chapters;
    # both are 0 here, so no extra polynomial or sinusoid features are generated.

    # Prepend a column of ones so that theta[0] acts as the bias term
    data_processed = np.hstack((np.ones((num_examples, 1)), data_processed))

    return data_processed, features_mean, features_deviation
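A quick sanity check of prepare_for_training on made-up toy values (just a sketch, not part of the original notebook) shows both the standardization and the prepended bias column:

toy = np.array([[10.0, 0.1],
                [1000.0, 0.3]])
processed, mean, dev = prepare_for_training(toy, normalize_data=True)
print(processed)
# [[ 1. -1. -1.]
#  [ 1.  1.  1.]]   -> both features standardized to ±1, plus a leading bias column of ones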

3. Wrapping the model in a class

class LinearRegression:
    def __init__(self, data, labels, polynomial_degree=0, sinusoid_degree=0, normalize_data=True):
        print("type111", type(data))
        (data_processed,
         features_mean,
         features_deviation) = prepare_for_training(data, polynomial_degree, sinusoid_degree, normalize_data)
        # Use the preprocessed data as the training data
        self.data = data_processed
        # Label values
        self.labels = labels
        self.features_mean = features_mean
        self.features_deviation = features_deviation
        # Polynomial degree, stored on the instance
        self.polynomial_degree = polynomial_degree
        # Sinusoid degree, stored on the instance
        self.sinusoid_degree = sinusoid_degree
        # Whether the data should be normalized, stored on the instance
        self.normalize_data = normalize_data

        # Number of features (columns) in the processed data, including the bias column
        num_features = self.data.shape[1]
        # Initialize theta as a zero column vector of shape (num_features, 1), e.g. for three features:
        # [[0.0],
        #  [0.0],
        #  [0.0]]
        self.theta = np.zeros((num_features, 1))

    # Train the model
    def train(self, alpha, num_iterations=500):
        """
        :param alpha: learning rate for gradient descent
        :param num_iterations: number of gradient descent iterations
        """
        loss_history = self.gradient_descent(alpha, num_iterations)
        return self.theta, loss_history

    def gradient_descent(self, alpha, num_iterations):
        loss_history = []
        for _ in range(num_iterations):
            self.gradient_step(alpha)
            loss_history.append(self.cal_loss(self.data, self.labels))
        return loss_history

    def gradient_step(self, alpha):
        # Batch gradient descent: the batch is the whole training set
        train_data_num = self.data.shape[0]
        # Run the hypothesis once to get predictions
        prediction = LinearRegression.hypothesis(self.data, self.theta)
        # Residual between predictions and true labels
        delta = prediction - self.labels
        theta = self.theta
        # Update the theta parameters
        theta = theta - alpha * (1 / train_data_num) * (np.dot(delta.T, self.data)).T
        self.theta = theta

    def cal_loss(self, data, labels):
        train_data_num = data.shape[0]
        delta = LinearRegression.hypothesis(data, self.theta) - labels
        # Half of the mean squared error
        loss = (1 / 2) * np.dot(delta.T, delta) / train_data_num
        # loss is a 1x1 matrix, so take the scalar value directly
        return loss[0][0]

    # The input data is normalized first, then fed into the regression model for prediction
    def predict(self, data):
        print("type444", type(data))
        print("deal before", data)
        data_processed = prepare_for_training(data,
                                              self.polynomial_degree,
                                              self.sinusoid_degree,
                                              self.normalize_data)[0]
        print("deal after", data_processed)
        predictions = LinearRegression.hypothesis(data_processed, self.theta)
        return predictions

    @staticmethod
    def hypothesis(data, theta):
        # Compute the predictions X @ theta
        predictions = np.dot(data, theta)
        return predictions
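For reference, hypothesis, cal_loss, and gradient_step implement the standard batch gradient descent formulas, with m the number of training samples, X the processed data matrix (including the bias column), and y the labels:

$$\hat{y} = X\theta, \qquad J(\theta) = \frac{1}{2m}\,(X\theta - y)^\top (X\theta - y), \qquad \theta \leftarrow \theta - \frac{\alpha}{m}\, X^\top (X\theta - y)$$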

4. Training the model

4.1 Training procedure

data = pd.read_csv('advertising.csv')

# Split into training and test data
train_data = data.sample(frac=0.8)
print("type222", type(train_data))
test_data = data.drop(train_data.index)
print("type333", type(test_data))

input_param_name = 'wechat'
output_param_name = 'sales'

x_train = train_data[[input_param_name]].values
y_train = train_data[[output_param_name]].values

x_test = test_data[input_param_name].values
y_test = test_data[output_param_name].values

# Number of gradient descent iterations
num_iterations = 500
# Learning rate
learning_rate = 0.01

linear_regression = LinearRegression(x_train, y_train)
(theta, cost_history) = linear_regression.train(learning_rate, num_iterations)
print("Initial loss:", cost_history[0])
print("Loss after training:", cost_history[-1])
type222 <class 'pandas.core.frame.DataFrame'>
type333 <class 'pandas.core.frame.DataFrame'>
type111 <class 'numpy.ndarray'>
type666 <class 'numpy.ndarray'>
type+type: <class 'numpy.ndarray'>
Initial loss: 129.4170833303712
Loss after training: 2.6448034627745933

4.2 Loss curve

plt.plot(range(num_iterations),cost_history)
plt.xlabel('iter')
plt.ylabel('loss')
plt.title('Loss value change')
plt.show()

(Figure: loss value vs. iteration)

5. Plotting the regression line from the trained model

# The input data is normalized first, then fed into the regression model for prediction
def predict(self, data):
    print("type444", type(data))
    print("deal before", data)
    data_processed = prepare_for_training(data,
                                          self.polynomial_degree,
                                          self.sinusoid_degree,
                                          self.normalize_data)[0]
    print("deal after", data_processed)
    predictions = LinearRegression.hypothesis(data_processed, self.theta)
    return predictions





predict_num = 100
x_predictions = np.linspace(x_train.min(),x_train.max(),predict_num).reshape(-1,1)
print("min:",x_train.min())
print("max:",x_train.max())
print(linear_regression.theta)
y_predictions = linear_regression.predict(x_predictions)
print(y_predictions)
plt.scatter(x_train, y_train, label='train data')
plt.scatter(x_test, y_test, label='test data')
plt.plot(x_predictions,y_predictions,'r',label='Prediction')
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Ad data')
plt.legend()
plt.show()

The result: (Figure: fitted regression line over the train/test scatter)

Each time we retrain the model, the 80% of samples used for training is drawn at random, so the final θ values can differ slightly from run to run.
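If you want the split (and therefore θ) to be reproducible, pandas' sample accepts a random_state seed; a minimal sketch (the seed value 42 is arbitrary):

# Fixing the seed makes the 80/20 split identical on every run
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)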

Here is a small point that often causes confusion. Because we normalized the sample data during training, the prediction samples must go through the same normalization before prediction. Feeding raw sample values straight into the model is meaningless, because the regression equation was fitted against the normalized training data, not the raw data. For example, suppose we change the prediction function to:


def predict(self, data):
    print("type444", type(data))
    print("deal before", data)
    # data_processed = prepare_for_training(data,
    #                                       self.polynomial_degree,
    #                                       self.sinusoid_degree,
    #                                       self.normalize_data)[0]
    # Skip normalization and just prepend a bias column of ones to the (one-dimensional) samples, e.g.
    # [[1, 10],
    #  [1, 28]]
    data = np.hstack((np.ones((data.shape[0], 1)), data))
    print("deal after", data)
    predictions = LinearRegression.hypothesis(data, self.theta)
    return predictions

Run it as follows (here we pass in only two sample points):

predict_num = 2
x_predictions = np.linspace(x_train.min(),x_train.max(),predict_num).reshape(-1,1)
print("min:",x_train.min())
print("max:",x_train.max())
print(linear_regression.theta)
x_p=np.array([[4],[10]])
y_predictions = linear_regression.predict(x_p)
print(y_predictions)
plt.scatter(x_train, y_train, label='train data')
plt.scatter(x_test, y_test, label='test data')
plt.plot(x_p,y_predictions,'r',label='Prediction')
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Ad data')
plt.legend()
plt.show()
min: 3.2
max: 1348.6
[[15.23175807]
 [ 4.8279428 ]]
[[ 4]
 [10]]
type444 <class 'numpy.ndarray'>
deal before [[ 4]
 [10]]
deal after [[ 1.  4.]
 [ 1. 10.]]
[[34.54352928]
 [63.51118608]]

(Figure: prediction line plotted from the raw, un-normalized inputs)

6. Summary

This clears up a confusion I had as a beginner. Isn't the point of training to obtain one definite fitted straight line? Then, for any sample we want to predict, we just plug in its x value and read off the predicted y. Once training is finished, the regression equation should be fixed (for a one-dimensional model, something of the form y = ax). So no matter how many prediction samples we use to draw it — say two points (predict_num = 2) — those two points determine a line, and that line should be exactly the model's regression line. Even if we swap in a different pair, or use ten samples, the ten predicted points connected together should still lie on the same line, because the slope has already been determined.

In reality it does not work that way: because the training samples were normalized, the parameters obtained by gradient descent describe a line over the normalized data, so at prediction time the prediction data must also be normalized in the same way.

To summarize: once the model is trained, i.e. gradient descent has finished, we have definite θ values, and at that point the linear regression model itself is fixed. But what does its line look like when we plot it? Suppose we draw it using 2 sample points. The two (one-dimensional) samples are normalized first,

x1_1_origin ==> x1_1_st
x1_2_origin ==> x1_2_st

then x1_1_st and x1_2_st are fed into the regression equation, giving predictions y1 and y2. The plotted coordinates are therefore (x1_1_origin, y1) and (x1_2_origin, y2), and these two points form the straight-line picture of the regression equation, like this:

min: 3.2
max: 1348.6
[[15.03555574]
 [ 4.85744712]]
type444 <class 'numpy.ndarray'>
deal before [[   3.2]
 [1348.6]]
type666 <class 'numpy.ndarray'>
type+type: <class 'numpy.ndarray'>
deal after [[ 1. -1.]
 [ 1.  1.]]
[[10.17810862]
 [19.89300286]]

(Figure: regression line drawn from 2 prediction samples)

If we increase the number of sample points used to draw the line, e.g.

x1_1_origin ==> x1_1_st
x1_2_origin ==> x1_2_st
.........
x1_n_origin ==> x1_n_st

say by taking 100 evenly spaced values between the minimum and maximum of the training set as input, the picture becomes this:

The printed output is placed at the end of the post, since it is rather long.

(Figure: regression line drawn from 100 prediction samples)

Why? Because the normalized value of a sample depends on which batch of samples it is normalized together with. Suppose that when we draw the line with 2 samples, the raw value 10 normalizes to, say, 0.5 and the model predicts 20. With 100 samples, the same raw value 10 might normalize to 0.7 instead (recall how normalization works from earlier), and the model then predicts, say, 18. The same x value of 10 now gets a different plotted y value, so the resulting line is necessarily different. That is why we normally draw the regression line using a suitably large number of evenly spaced values between the minimum and maximum of the training data.
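A minimal sketch of that effect, using made-up toy batches (not from the dataset): the same raw value 10 standardizes to different numbers depending on the batch it is normalized with.

batch_2 = np.array([[4.0], [10.0]])
batch_3 = np.array([[4.0], [10.0], [100.0]])

# Within the 2-sample batch, the value 10 standardizes to 1.0 ...
print(normalize(batch_2)[0])
# ... but within the 3-sample batch that also contains 100, the same value 10
# standardizes to roughly -0.64, so it is mapped to a different predicted y.
print(normalize(batch_3)[0])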

Printed data

min: 3.2
max: 1348.6
[[15.03555574]
 [ 4.85744712]]
type444 <class 'numpy.ndarray'>
deal before [[   3.2       ]
 [  16.78989899]
 [  30.37979798]
 [  43.96969697]
 [  57.55959596]
 [  71.14949495]
 [  84.73939394]
 [  98.32929293]
 [ 111.91919192]
 [ 125.50909091]
 [ 139.0989899 ]
 [ 152.68888889]
 [ 166.27878788]
 [ 179.86868687]
 [ 193.45858586]
 [ 207.04848485]
 [ 220.63838384]
 [ 234.22828283]
 [ 247.81818182]
 [ 261.40808081]
 [ 274.9979798 ]
 [ 288.58787879]
 [ 302.17777778]
 [ 315.76767677]
 [ 329.35757576]
 [ 342.94747475]
 [ 356.53737374]
 [ 370.12727273]
 [ 383.71717172]
 [ 397.30707071]
 [ 410.8969697 ]
 [ 424.48686869]
 [ 438.07676768]
 [ 451.66666667]
 [ 465.25656566]
 [ 478.84646465]
 [ 492.43636364]
 [ 506.02626263]
 [ 519.61616162]
 [ 533.20606061]
 [ 546.7959596 ]
 [ 560.38585859]
 [ 573.97575758]
 [ 587.56565657]
 [ 601.15555556]
 [ 614.74545455]
 [ 628.33535354]
 [ 641.92525253]
 [ 655.51515152]
 [ 669.10505051]
 [ 682.69494949]
 [ 696.28484848]
 [ 709.87474747]
 [ 723.46464646]
 [ 737.05454545]
 [ 750.64444444]
 [ 764.23434343]
 [ 777.82424242]
 [ 791.41414141]
 [ 805.0040404 ]
 [ 818.59393939]
 [ 832.18383838]
 [ 845.77373737]
 [ 859.36363636]
 [ 872.95353535]
 [ 886.54343434]
 [ 900.13333333]
 [ 913.72323232]
 [ 927.31313131]
 [ 940.9030303 ]
 [ 954.49292929]
 [ 968.08282828]
 [ 981.67272727]
 [ 995.26262626]
 [1008.85252525]
 [1022.44242424]
 [1036.03232323]
 [1049.62222222]
 [1063.21212121]
 [1076.8020202 ]
 [1090.39191919]
 [1103.98181818]
 [1117.57171717]
 [1131.16161616]
 [1144.75151515]
 [1158.34141414]
 [1171.93131313]
 [1185.52121212]
 [1199.11111111]
 [1212.7010101 ]
 [1226.29090909]
 [1239.88080808]
 [1253.47070707]
 [1267.06060606]
 [1280.65050505]
 [1294.24040404]
 [1307.83030303]
 [1321.42020202]
 [1335.01010101]
 [1348.6       ]]
type666 <class 'numpy.ndarray'>
type+type: <class 'numpy.ndarray'>
deal after [[ 1.         -1.71481604]
 [ 1.         -1.68017329]
 [ 1.         -1.64553055]
 [ 1.         -1.6108878 ]
 [ 1.         -1.57624505]
 [ 1.         -1.5416023 ]
 [ 1.         -1.50695955]
 [ 1.         -1.4723168 ]
 [ 1.         -1.43767406]
 [ 1.         -1.40303131]
 [ 1.         -1.36838856]
 [ 1.         -1.33374581]
 [ 1.         -1.29910306]
 [ 1.         -1.26446031]
 [ 1.         -1.22981757]
 [ 1.         -1.19517482]
 [ 1.         -1.16053207]
 [ 1.         -1.12588932]
 [ 1.         -1.09124657]
 [ 1.         -1.05660382]
 [ 1.         -1.02196108]
 [ 1.         -0.98731833]
 [ 1.         -0.95267558]
 [ 1.         -0.91803283]
 [ 1.         -0.88339008]
 [ 1.         -0.84874733]
 [ 1.         -0.81410459]
 [ 1.         -0.77946184]
 [ 1.         -0.74481909]
 [ 1.         -0.71017634]
 [ 1.         -0.67553359]
 [ 1.         -0.64089084]
 [ 1.         -0.6062481 ]
 [ 1.         -0.57160535]
 [ 1.         -0.5369626 ]
 [ 1.         -0.50231985]
 [ 1.         -0.4676771 ]
 [ 1.         -0.43303435]
 [ 1.         -0.39839161]
 [ 1.         -0.36374886]
 [ 1.         -0.32910611]
 [ 1.         -0.29446336]
 [ 1.         -0.25982061]
 [ 1.         -0.22517786]
 [ 1.         -0.19053512]
 [ 1.         -0.15589237]
 [ 1.         -0.12124962]
 [ 1.         -0.08660687]
 [ 1.         -0.05196412]
 [ 1.         -0.01732137]
 [ 1.          0.01732137]
 [ 1.          0.05196412]
 [ 1.          0.08660687]
 [ 1.          0.12124962]
 [ 1.          0.15589237]
 [ 1.          0.19053512]
 [ 1.          0.22517786]
 [ 1.          0.25982061]
 [ 1.          0.29446336]
 [ 1.          0.32910611]
 [ 1.          0.36374886]
 [ 1.          0.39839161]
 [ 1.          0.43303435]
 [ 1.          0.4676771 ]
 [ 1.          0.50231985]
 [ 1.          0.5369626 ]
 [ 1.          0.57160535]
 [ 1.          0.6062481 ]
 [ 1.          0.64089084]
 [ 1.          0.67553359]
 [ 1.          0.71017634]
 [ 1.          0.74481909]
 [ 1.          0.77946184]
 [ 1.          0.81410459]
 [ 1.          0.84874733]
 [ 1.          0.88339008]
 [ 1.          0.91803283]
 [ 1.          0.95267558]
 [ 1.          0.98731833]
 [ 1.          1.02196108]
 [ 1.          1.05660382]
 [ 1.          1.09124657]
 [ 1.          1.12588932]
 [ 1.          1.16053207]
 [ 1.          1.19517482]
 [ 1.          1.22981757]
 [ 1.          1.26446031]
 [ 1.          1.29910306]
 [ 1.          1.33374581]
 [ 1.          1.36838856]
 [ 1.          1.40303131]
 [ 1.          1.43767406]
 [ 1.          1.4723168 ]
 [ 1.          1.50695955]
 [ 1.          1.5416023 ]
 [ 1.          1.57624505]
 [ 1.          1.6108878 ]
 [ 1.          1.64553055]
 [ 1.          1.68017329]
 [ 1.          1.71481604]]
[[ 6.70592749]
 [ 6.87420281]
 [ 7.04247812]
 [ 7.21075344]
 [ 7.37902876]
 [ 7.54730408]
 [ 7.7155794 ]
 [ 7.88385472]
 [ 8.05213003]
 [ 8.22040535]
 [ 8.38868067]
 [ 8.55695599]
 [ 8.72523131]
 [ 8.89350662]
 [ 9.06178194]
 [ 9.23005726]
 [ 9.39833258]
 [ 9.5666079 ]
 [ 9.73488322]
 [ 9.90315853]
 [10.07143385]
 [10.23970917]
 [10.40798449]
 [10.57625981]
 [10.74453512]
 [10.91281044]
 [11.08108576]
 [11.24936108]
 [11.4176364 ]
 [11.58591172]
 [11.75418703]
 [11.92246235]
 [12.09073767]
 [12.25901299]
 [12.42728831]
 [12.59556363]
 [12.76383894]
 [12.93211426]
 [13.10038958]
 [13.2686649 ]
 [13.43694022]
 [13.60521553]
 [13.77349085]
 [13.94176617]
 [14.11004149]
 [14.27831681]
 [14.44659213]
 [14.61486744]
 [14.78314276]
 [14.95141808]
 [15.1196934 ]
 [15.28796872]
 [15.45624403]
 [15.62451935]
 [15.79279467]
 [15.96106999]
 [16.12934531]
 [16.29762063]
 [16.46589594]
 [16.63417126]
 [16.80244658]
 [16.9707219 ]
 [17.13899722]
 [17.30727253]
 [17.47554785]
 [17.64382317]
 [17.81209849]
 [17.98037381]
 [18.14864913]
 [18.31692444]
 [18.48519976]
 [18.65347508]
 [18.8217504 ]
 [18.99002572]
 [19.15830104]
 [19.32657635]
 [19.49485167]
 [19.66312699]
 [19.83140231]
 [19.99967763]
 [20.16795294]
 [20.33622826]
 [20.50450358]
 [20.6727789 ]
 [20.84105422]
 [21.00932954]
 [21.17760485]
 [21.34588017]
 [21.51415549]
 [21.68243081]
 [21.85070613]
 [22.01898144]
 [22.18725676]
 [22.35553208]
 [22.5238074 ]
 [22.69208272]
 [22.86035804]
 [23.02863335]
 [23.19690867]
 [23.36518399]]