Linear regression with PyTorch, using the Kaggle Boston house-price dataset

Import dependencies

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import visdom
torch.__version__
'1.12.1+cu102'

Load the data

path = "/workspace/disk1/datasets/scalar/train_dataset.csv"
data = pd.read_csv(path)
data.head(3)

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PIRATIO B LSTAT PRICE
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
# Fill missing values (forward fill)
data = data.fillna(method="pad")
# Inspect the data
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455 entries, 0 to 454
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     455 non-null    float64
 1   ZN       455 non-null    float64
 2   INDUS    455 non-null    float64
 3   CHAS     455 non-null    int64  
 4   NOX      455 non-null    float64
 5   RM       455 non-null    float64
 6   AGE      455 non-null    float64
 7   DIS      455 non-null    float64
 8   RAD      455 non-null    int64  
 9   TAX      455 non-null    int64  
 10  PIRATIO  455 non-null    float64
 11  B        455 non-null    float64
 12  LSTAT    455 non-null    float64
 13  PRICE    455 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 49.9 KB
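
A note on the fill step above: fillna(method="pad") forward-fills each missing value with the last value seen. Newer pandas releases deprecate the method argument of fillna, so on a recent pandas the equivalent call is DataFrame.ffill(); a minimal sketch:

# Forward fill on newer pandas versions (same behaviour as fillna(method="pad"))
data = data.ffill()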

Select the training data

# Select the device: use the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
device(type='cuda')
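
If a GPU is picked up, you can optionally confirm which card was selected; a small sketch (only meaningful when CUDA is available):

# Print the name of the selected GPU
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))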
# X = (
#     torch.from_numpy(data.LSTAT.to_numpy().reshape(-1, 1))
#     .type(torch.FloatTensor)
#     .to(device)
# )

# Independent variable(s): the feature columns used as model input
variables = [
    "LSTAT",
]
X = (
    torch.from_numpy(data[variables].to_numpy().reshape(-1, len(variables)))
    .type(torch.FloatTensor)
    .to(device)
)
# Dependent variable: the target to predict
Y = (
    torch.from_numpy(data.PRICE.to_numpy().reshape(-1, 1))
    .type(torch.FloatTensor)
    .to(device)
)

X.shape, Y.shape
(torch.Size([455, 1]), torch.Size([455, 1]))
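
Because X is built from the variables list, extending the regression to several features only means listing more column names; the model's in_features and the rest of the pipeline adapt automatically. A hedged sketch with two additional columns from this dataset (multi_vars and X_multi are illustrative names, kept separate so the single-feature example above is untouched):

# Sketch: the same pipeline with three features
multi_vars = ["LSTAT", "RM", "PIRATIO"]
X_multi = (
    torch.from_numpy(data[multi_vars].to_numpy().reshape(-1, len(multi_vars)))
    .type(torch.FloatTensor)
    .to(device)
)
X_multi.shape  # torch.Size([455, 3])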

Create the model

class LinearRegressionModel(nn.Module):
    def __init__(self, variables):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(in_features=len(variables), out_features=1, bias=True)

    def forward(self, inputs):
        outputs = self.linear(inputs)
        return outputs
model = LinearRegressionModel(variables).to(device)
model
LinearRegressionModel(
  (linear): Linear(in_features=1, out_features=1, bias=True)
)
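
Since the model is nothing more than a single linear layer, the subclass above can also be written without defining a class; a minimal equivalent sketch (model_alt is just an illustrative name):

# Equivalent shortcut: nn.Linear is itself a valid nn.Module
model_alt = nn.Linear(in_features=len(variables), out_features=1, bias=True).to(device)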
loss_fn = nn.MSELoss()
# loss_fn = nn.L1Loss()
loss_fn
MSELoss()
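
nn.MSELoss with default settings returns the mean of the squared residuals. A tiny sanity check with made-up values:

# Manual check: MSELoss(reduction="mean") equals ((a - b) ** 2).mean()
a = torch.tensor([1.0, 2.0])
b = torch.tensor([2.0, 4.0])
loss_fn(a, b), ((a - b) ** 2).mean()  # both give tensor(2.5000)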
optimizer = optim.SGD(model.parameters(), lr=0.0001)
optimizer
SGD (
Parameter Group 0
    dampening: 0
    foreach: None
    lr: 0.0001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)
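
Plain SGD with a small learning rate converges slowly on this data. If you want to experiment, an adaptive optimizer such as Adam is a drop-in replacement; a hedged alternative (the learning rate 0.01 is only a starting point, not a tuned value):

# optimizer = optim.Adam(model.parameters(), lr=0.01)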

Train the model

# Total number of training epochs
epochs = 250
# Use the visdom visualization tool to plot how the training loss changes
viz = visdom.Visdom(
    server="http://localhost",
    port=8097,
    base_url="/visdom",
    username="jinzhongxu",
    password="123123",
)
win = "linear regression loss"
opts = dict(
    title="train_losses",
    xlabel="epoch",
    ylabel="loss",
    markers=True,
    legend=[
        "loss",
    ],
)
viz.line(
    [
        [
            0.0,
        ]
    ],
    [0.0],
    win=win,
    opts=opts,
)

for epoch in range(epochs):
    losses = []  # record every sample's loss in this epoch to compute the mean loss
    for x, y in zip(X, Y):
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        print(f"epoch:{epoch}, loss:{np.mean(losses)}")
        viz.line(
            [
                [
                    np.mean(losses),
                ]
            ],
            [epoch],
            win=win,
            update="append",
        )
Setting up a new session...


epoch:0, loss:266.77103529293294
epoch:10, loss:197.07076109784762
epoch:20, loss:150.77508147556117
epoch:30, loss:117.78742978721678
epoch:40, loss:94.28888019471626
epoch:50, loss:77.55574789468857
epoch:60, loss:65.64433264322592
epoch:70, loss:57.16943190648613
epoch:80, loss:51.14319554189922
epoch:90, loss:46.86080275933942
epoch:100, loss:43.82023009671523
epoch:110, loss:41.66319618386697
epoch:120, loss:40.13473528863745
epoch:130, loss:39.05330128137491
epoch:140, loss:38.28926664580331
epoch:150, loss:37.75066141055531
epoch:160, loss:37.37189170430676
epoch:170, loss:37.106266385903574
epoch:180, loss:36.92063599934509
epoch:190, loss:36.79157183500532
epoch:200, loss:36.702249215782494
epoch:210, loss:36.64089728668073
epoch:220, loss:36.59914092393465
epoch:230, loss:36.57104065759833
epoch:240, loss:36.55244242344576

Training loss curve:

[Figure: training loss vs. epoch, as plotted in visdom]
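
The loop above performs one parameter update per sample (455 updates per epoch). Since nn.Linear broadcasts over the batch dimension, an equivalent and much faster full-batch variant pushes all of X through in a single forward pass per epoch. A minimal sketch reusing the model, loss_fn and optimizer created above (the printed loss is the mean over the whole batch, so the numbers will differ somewhat from the per-sample averages shown earlier):

for epoch in range(epochs):
    y_pred = model(X)            # one forward pass over all 455 rows
    loss = loss_fn(y_pred, Y)    # mean squared error over the whole dataset
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"epoch:{epoch}, loss:{loss.item()}")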

# Learned parameter values
list(model.named_parameters())
[('linear.weight',
  Parameter containing:
  tensor([[-0.9671]], device='cuda:0', requires_grad=True)),
 ('linear.bias',
  Parameter containing:
  tensor([33.6645], device='cuda:0', requires_grad=True))]
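
With a learned weight of about -0.9671 and a bias of about 33.6645, the fitted line is PRICE ≈ 33.66 - 0.97 × LSTAT. A quick sketch that queries the model for a hypothetical LSTAT value of 10.0 (an illustration value, not taken from the dataset):

# Predict the price for LSTAT = 10.0; roughly 33.66 - 0.97 * 10 ≈ 24.0
with torch.no_grad():
    print(model(torch.tensor([[10.0]], device=device)))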
# Plot the fitted line against the data
plt.figure(figsize=(10, 10))
for var in variables:
    plt.scatter(data[var], data.PRICE)
    plt.xlabel(var)
plt.ylabel("house price")
plt.legend(variables)
plt.plot(X.to("cpu").numpy(), model(X).detach().to("cpu").numpy(), color="r")
plt.legend(["data points", "fitted line"])
plt.show()

[Figure: scatter of the data points with the fitted regression line]
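
Beyond eyeballing the plot, a quick goodness-of-fit number on the training data can be computed directly from the tensors; a sketch that reports the training MSE and an ordinary coefficient of determination, computed by hand with no extra library assumed:

# Training-set MSE and R² for the fitted line
with torch.no_grad():
    y_hat = model(X)
    mse = loss_fn(y_hat, Y).item()
    ss_res = torch.sum((Y - y_hat) ** 2)
    ss_tot = torch.sum((Y - Y.mean()) ** 2)
    r2 = (1 - ss_res / ss_tot).item()
print(mse, r2)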

References

  1. Douyin: 日月光华, PyTorch 极简快速入门 (a concise, easy-to-follow PyTorch quick-start course)