import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model

# Toy 1-D regression data set (the scikit-learn decision-tree example data).
x = np.arange(1, 11).reshape(-1, 1)
y = np.array([5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]).ravel()

# Fit three models for comparison.  Per the original author's hand
# calculation, the fitted tree is a three-segment step function:
#   x <= 3.5        -> 5.72
#   3.5 < x <= 6.5  -> 6.75
#   x > 6.5         -> 8.91
model1 = DecisionTreeRegressor(max_depth=1)
model2 = DecisionTreeRegressor(max_depth=3, max_leaf_nodes=4, min_samples_leaf=3)
model3 = linear_model.LinearRegression()
model1.fit(x, y)
model2.fit(x, y)
model3.fit(x, y)

# Predict on a dense grid so the step structure of the trees is visible.
X_test = np.arange(0.0, 10.0, 0.01)[:, np.newaxis]
y_1 = model1.predict(X_test)
y_2 = model2.predict(X_test)
y_3 = model3.predict(X_test)

# Plot the data points and the three fitted models.
plt.figure()
plt.scatter(x, y, s=20, edgecolor="black",
            c="darkorange", label="data")
plt.plot(X_test, y_1, color="cornflowerblue",
         label="max_depth=1", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=3", linewidth=2)
plt.plot(X_test, y_3, color="red", label="linear regression", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
import numpy as np


def loadDataSet(fileName):
    """Load a whitespace-separated numeric data file.

    Each line is split on whitespace (tabs or spaces) and every field is
    converted to float.  Returns a list of row lists.
    """
    dataMat = []
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(fileName) as fr:
        for line in fr:
            # split() handles both tab- and space-separated files; for the
            # original tab-separated input it yields the same tokens.
            curLine = line.split()
            dataMat.append(list(map(float, curLine)))
    return dataMat


def binSplitDataSet(dataSet, feature, value):
    """Binary-split dataSet (an np.matrix) on one feature.

    Returns (mat0, mat1): rows with dataSet[:, feature] > value, and
    rows with dataSet[:, feature] <= value.
    """
    mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1


def regLeaf(dataSet):
    """Regression-tree leaf model: mean of the target (last) column."""
    return np.mean(dataSet[:, -1])


def regErr(dataSet):
    """Total squared error of the target column (variance * sample count)."""
    return np.var(dataSet[:, -1]) * dataSet.shape[0]


def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(0, 1)):
    """Find the best (feature, value) to binary-split dataSet on.

    Returns (featureIndex, splitValue), or (None, leafValue) when one of
    the stopping conditions is met:
      1. all target values are identical;
      2. the best split reduces the error by less than ops[0] (tolS);
      3. the best split leaves a child with fewer than ops[1] (tolN) rows.
    """
    tolS, tolN = ops  # min error reduction, min samples per child
    # Stop 1: nothing to gain if every target value is the same.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = dataSet.shape
    S = errType(dataSet)  # error before splitting
    bestS, bestIndex, bestValue = np.inf, 0, 0
    for featIndex in range(n - 1):  # every feature column (last col = target)
        # Every distinct value of this feature is a candidate threshold.
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if mat0.shape[0] < tolN or mat1.shape[0] < tolN:
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:  # keep the lowest combined error
                bestIndex, bestValue, bestS = featIndex, splitVal, newS
    # Stop 2: the improvement is too small to bother splitting.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # Stop 3: a child would be smaller than the minimum sample count.
    if mat0.shape[0] < tolN or mat1.shape[0] < tolN:
        return None, leafType(dataSet)
    return bestIndex, bestValue


def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(0, 1)):
    """Recursively build a CART regression tree from an np.matrix.

    A node is a dict {'spInd', 'spVal', 'left', 'right'}; 'left' holds the
    rows with feature > spVal, 'right' the rows with feature <= spVal.
    A leaf is the plain value produced by leafType.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:  # a stopping condition fired: val is the leaf value
        return val
    retTree = {'spInd': feat, 'spVal': val}
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree


if __name__ == "__main__":
    myDat = np.matrix(loadDataSet('train_data'))
    print(createTree(myDat))

    # Scatter-plot the training points.
    import matplotlib.pyplot as plt

    plt.plot(myDat[:, 0], myDat[:, 1], 'ro')
    plt.show()
b/numpy_ml/trees/scikit_learn_tree.py @@ -3,7 +3,7 @@ from sklearn.tree import DecisionTreeRegressor from sklearn import linear_model -# Data set +# Data set,skl官方代码给出样例 x = np.array(list(range(1, 11))).reshape(-1, 1) y = np.array([5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]).ravel() diff --git a/numpy_ml/trees/train_data b/numpy_ml/trees/train_data new file mode 100644 index 0000000..7f11985 --- /dev/null +++ b/numpy_ml/trees/train_data @@ -0,0 +1,10 @@ +1 4.50 +2 4.75 +3 4.91 +4 5.34 +5 5.80 +6 7.05 +7 7.90 +8 8.23 +9 8.70 +10 9.00 \ No newline at end of file