利用 PyTorch 进行线性回归,采用 Kaggle 波士顿房价预测数据集。
导入依赖包
1 2 3 4 5 6 7
| import matplotlib.pyplot as plt import numpy as np import pandas as pd import torch import torch.nn as nn import torch.optim as optim import visdom
|
'1.12.1+cu102'
加载数据
1 2 3
# Location of the training CSV for the housing-price dataset.
path = "/workspace/disk1/datasets/scalar/train_dataset.csv"

# Load the whole table into a DataFrame.
data = pd.read_csv(path)

# Notebook-style preview of the first three rows.
data.head(3)
|
|
CRIM |
ZN |
INDUS |
CHAS |
NOX |
RM |
AGE |
DIS |
RAD |
TAX |
PIRATIO |
B |
LSTAT |
PRICE |
0 |
0.00632 |
18.0 |
2.31 |
0 |
0.538 |
6.575 |
65.2 |
4.0900 |
1 |
296 |
15.3 |
396.90 |
4.98 |
24.0 |
1 |
0.02731 |
0.0 |
7.07 |
0 |
0.469 |
6.421 |
78.9 |
4.9671 |
2 |
242 |
17.8 |
396.90 |
9.14 |
21.6 |
2 |
0.02729 |
0.0 |
7.07 |
0 |
0.469 |
7.185 |
61.1 |
4.9671 |
2 |
242 |
17.8 |
392.83 |
4.03 |
34.7 |
1 2 3 4
# Forward-fill missing values: each gap takes the last valid value above it.
# `DataFrame.ffill()` is the supported spelling; `fillna(method="pad")` is
# deprecated in recent pandas releases.
data = data.ffill()

# Print column dtypes and non-null counts to confirm no gaps remain.
data.info()
|
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455 entries, 0 to 454
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CRIM 455 non-null float64
1 ZN 455 non-null float64
2 INDUS 455 non-null float64
3 CHAS 455 non-null int64
4 NOX 455 non-null float64
5 RM 455 non-null float64
6 AGE 455 non-null float64
7 DIS 455 non-null float64
8 RAD 455 non-null int64
9 TAX 455 non-null int64
10 PIRATIO 455 non-null float64
11 B 455 non-null float64
12 LSTAT 455 non-null float64
13 PRICE 455 non-null float64
dtypes: float64(11), int64(3)
memory usage: 49.9 KB
训练数据选择
1 2 3
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Notebook-style echo of the selected device.
device
|
device(type='cuda')
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
|
# Feature columns fed to the model; list more names for a multivariate fit.
variables = [
    "LSTAT",
]

# Feature matrix: one row per sample, one column per selected feature,
# converted to float32 and moved to the training device.
X = (
    torch.from_numpy(data[variables].to_numpy().reshape(-1, len(variables)))
    .float()
    .to(device)
)

# Target prices as a column vector so they line up with the model output.
Y = (
    torch.from_numpy(data.PRICE.to_numpy().reshape(-1, 1))
    .float()
    .to(device)
)

# Notebook-style echo of both tensor shapes.
X.shape, Y.shape
|
(torch.Size([455, 1]), torch.Size([455, 1]))
创建模型
1 2 3 4 5 6 7 8
class LinearRegressionModel(nn.Module):
    """Single affine layer mapping the selected features to one output.

    Args:
        variables: Sized collection of feature names; only its length is
            used, to set the number of input features.
    """

    def __init__(self, variables):
        super().__init__()
        self.linear = nn.Linear(
            in_features=len(variables),
            out_features=1,
            bias=True,
        )

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer to ``inputs`` and return the result."""
        return self.linear(inputs)
|
1 2
# Build the regressor for the selected features and move its parameters
# to the training device.
model = LinearRegressionModel(variables).to(device)

# Notebook-style echo of the module structure.
model
|
LinearRegressionModel(
(linear): Linear(in_features=1, out_features=1, bias=True)
)
1 2 3
# Mean squared error between predictions and targets.
loss_fn = nn.MSELoss()

# Notebook-style echo of the criterion.
loss_fn
|
MSELoss()
1 2
# Plain SGD over every model parameter with a fixed learning rate.
optimizer = optim.SGD(model.parameters(), lr=0.0001)

# Notebook-style echo of the optimizer settings.
optimizer
|
SGD (
Parameter Group 0
dampening: 0
foreach: None
lr: 0.0001
maximize: False
momentum: 0
nesterov: False
weight_decay: 0
)
模型训练
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
# Number of full passes over the training set.
epochs = 250

# Connect to the Visdom dashboard used for live loss plotting.
# NOTE(review): the username and password are hard-coded in source; move
# them to environment variables or a secrets store before sharing this file.
viz = visdom.Visdom(
    server="http://localhost",
    port=8097,
    base_url="/visdom",
    username="jinzhongxu",
    password="123123",
)
win = "linear regression loss"
opts = dict(
    title="train_losses",
    xlabel="epoch",
    ylabel="loss",
    markers=True,
    legend=["loss"],
)
# Create the plot window with a placeholder point at the origin; the
# training loop appends to this same window.
viz.line([[0.0]], [0.0], win=win, opts=opts)

for epoch in range(epochs):
    losses = []
    # One optimizer step per (x, y) sample.
    for x, y in zip(X, Y):
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Report and plot the mean per-sample loss every tenth epoch.
    if epoch % 10 == 0:
        mean_loss = np.mean(losses)
        print(f"epoch:{epoch}, loss:{mean_loss}")
        viz.line([[mean_loss]], [epoch], win=win, update="append")
|
Setting up a new session...
epoch:0, loss:266.77103529293294
epoch:10, loss:197.07076109784762
epoch:20, loss:150.77508147556117
epoch:30, loss:117.78742978721678
epoch:40, loss:94.28888019471626
epoch:50, loss:77.55574789468857
epoch:60, loss:65.64433264322592
epoch:70, loss:57.16943190648613
epoch:80, loss:51.14319554189922
epoch:90, loss:46.86080275933942
epoch:100, loss:43.82023009671523
epoch:110, loss:41.66319618386697
epoch:120, loss:40.13473528863745
epoch:130, loss:39.05330128137491
epoch:140, loss:38.28926664580331
epoch:150, loss:37.75066141055531
epoch:160, loss:37.37189170430676
epoch:170, loss:37.106266385903574
epoch:180, loss:36.92063599934509
epoch:190, loss:36.79157183500532
epoch:200, loss:36.702249215782494
epoch:210, loss:36.64089728668073
epoch:220, loss:36.59914092393465
epoch:230, loss:36.57104065759833
epoch:240, loss:36.55244242344576
训练损失函数曲线:
1 2
# List the trained parameters by name (the linear layer's weight and bias).
list(model.named_parameters())
|
[('linear.weight',
Parameter containing:
tensor([[-0.9671]], device='cuda:0', requires_grad=True)),
('linear.bias',
Parameter containing:
tensor([33.6645], device='cuda:0', requires_grad=True))]
1 2 3 4 5 6 7 8 9 10
plt.figure(figsize=(10, 10))

# Scatter each selected feature against the observed price.
for var in variables:
    plt.scatter(data[var], data.PRICE)
    plt.xlabel(var)
    plt.ylabel("房价")

# Overlay the model's predictions for the training inputs; tensors are moved
# to the CPU and detached from autograd before conversion to NumPy.
plt.plot(X.cpu().numpy(), model(X).detach().cpu().numpy(), color="r")

# A single legend call: an earlier `plt.legend(variables)` was replaced by
# this one before the figure was shown, so it had no visible effect.
plt.legend(["数据点", "拟合曲线"])
plt.show()
|
参考文献
- [抖音:日月光华,PyTorch极简快速入门 简明易懂]