class myDataset(Dataset):
    """Train/validation view over a feature table and its label column.

    The rows are split by position: the first
    ``int(len(dataset) * (1 - valid_ration))`` rows form the training split
    and all remaining rows form the validation split. The two splits never
    share a row.
    """

    def __init__(self, dataset, labels, mode='train', valid_ration=0.25):
        """Select the requested split and cache it as NumPy arrays.

        Args:
            dataset: pandas object holding one sample per row; it is sliced
                with ``.iloc``.
            labels: pandas object holding one label per row of ``dataset``;
                it is sliced with ``.iloc``.
            mode: ``'train'`` for the leading rows, ``'valid'`` for the
                trailing rows.
            valid_ration: fraction of rows reserved for validation.

        Raises:
            ValueError: if ``mode`` is neither ``'train'`` nor ``'valid'``,
                or if ``dataset`` and ``labels`` have different lengths.
        """
        if mode not in ('train', 'valid'):
            raise ValueError(f"mode must be 'train' or 'valid', got {mode!r}")
        if len(dataset) != len(labels):
            raise ValueError(
                f'dataset has {len(dataset)} rows but labels has {len(labels)}'
            )

        # Which split this instance exposes.
        self.mode = mode
        # Full, unsplit inputs.
        self.data = dataset
        self.label = labels
        # Total number of rows before splitting.
        self.data_len = len(self.data)
        # Number of leading rows that belong to the training split.
        self.train_len = int(self.data_len * (1 - valid_ration))

        # Positional slicing (.iloc) is essential here: label-based .loc
        # slices include BOTH endpoints, which would place row `train_len`
        # in the training and the validation split at the same time, and
        # would mis-slice any frame whose index is not 0..n-1.
        if self.mode == 'train':
            self.train_data = np.asarray(self.data.iloc[:self.train_len])
            self.train_label = np.asarray(self.label.iloc[:self.train_len])
            self.datas = self.train_data
            self.labels = self.train_label
        else:
            self.valid_data = np.asarray(self.data.iloc[self.train_len:])
            self.valid_label = np.asarray(self.label.iloc[self.train_len:])
            self.datas = self.valid_data
            self.labels = self.valid_label

        self.datas_len = len(self.datas)
        print(f'Finished reading {self.mode} dataset. {self.datas_len} number samples found.')

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        data = self.datas[index]
        label = self.labels[index]
        return data, label

    def __len__(self):
        """Return the number of samples in the selected split."""
        return self.datas_len
self.forward(x) +# L = self.loss(z, y) +# gradient_w, gradient_b = self.gradient(x, y, z) +# self.update(gradient_w, gradient_b, lr) +# # losses.append(L) +# step += 1 +# +# if step % 100 == 0: +# print(f'Loss:{L.item()}') +# model = Network(331) +# +# x_train, x_valid, y_train, y_valid = train_test_split(train[0], train[1], test_size=.20) +# print(x_train.shape, x_valid.shape) # torch.Size([1168, 331]) torch.Size([292, 331]) + -train = [train_feature, train_label] if __name__ == '__main__': - print(type(train)) + pass diff --git a/HousePrice/src/model.py b/HousePrice/src/model.py index 99e84e3..af32bf5 100644 --- a/HousePrice/src/model.py +++ b/HousePrice/src/model.py @@ -9,7 +9,8 @@ class haonet(nn.Module): def __init__(self): super(haonet, self).__init__() self.module = nn.Sequential( - nn.Linear(331, 1) + nn.Linear(331, 128), + nn.Linear(128, 1) ) def forward(self, input): @@ -17,4 +18,5 @@ def forward(self, input): return output if __name__ == '__main__': - pass \ No newline at end of file + model = haonet() + print(model) \ No newline at end of file diff --git a/HousePrice/src/train.py b/HousePrice/src/train.py index 22b304f..9ef6319 100644 --- a/HousePrice/src/train.py +++ b/HousePrice/src/train.py @@ -7,30 +7,28 @@ from DataPreprocess import * from model import * import torch +import numpy as np import torchvision from torch import optim from torch.utils.data import DataLoader from sklearn.model_selection import train_test_split from torch.utils.data.sampler import SubsetRandomSampler -batch_size = 64 EPOCHS = 100 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -x_train, x_test, y_train, y_test = train_test_split(train[0], train[1], test_size=.20) -# print(x_train.shape) -# train_loader = DataLoader(train, batch_size=batch_size, sampler=) -# test_loader = DataLoader(test_feature, batch_size=batch_size) - -# model = haonet() -def get_model(): - model = nn.Sequential(nn.Linear(331, 1)) - return model -model = get_model() +model = 
def loss(z, y):
    """Return the mean squared error between predictions and targets.

    Args:
        z: predictions; any array or tensor exposing ``.shape`` and
            supporting ``-``, ``*`` and ``.mean()``.
        y: targets. When ``y`` has a different shape from ``z`` but the same
            number of elements (for example ``z`` is ``(N, 1)`` and ``y`` is
            ``(N,)``), it is reshaped to ``z``'s shape first.

    Returns:
        The mean of the element-wise squared error.
    """
    if tuple(y.shape) != tuple(z.shape):
        z_count = 1
        for dim in z.shape:
            z_count *= dim
        y_count = 1
        for dim in y.shape:
            y_count *= dim
        # Without this, (N, 1) - (N,) silently broadcasts to an (N, N)
        # matrix and the mean is taken over every prediction/target pair
        # instead of over matching pairs. Shapes with different element
        # counts are left alone so intentional broadcasting still works.
        if z_count == y_count:
            y = y.reshape(z.shape)

    error = z - y
    cost = (error * error).mean()
    return cost