
Electric Vehicle Price Prediction: Data Mining and Model Comparison

Topic: Machine Learning and Data Mining
School: School of Information Engineering
Major: Computer Science and Technology (Class of 2019)
Name: 胡俊峰
Student ID: 401030919008
Supervisor: 邱桃荣
Date: June 11, 2020
3.1 Data Analysis

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import warnings
    warnings.filterwarnings('ignore')

    # Load the training set
    train_data = pd.read_excel('E:/FOut/train.xlsx')
    x_train = train_data.drop(['price', 'id'], axis=1)  # was train.drop, an undefined name
    y_train = train_data['price']

    # Continuous features
    numeric_features = ['feat1', 'feat3', 'feat5', 'feat7', 'feat8', 'feat9',
                        'feat11', 'feat12', 'feat13', 'feat14', 'feat15',
                        'feat16', 'feat17']

    # Categorical features
    categorical_features = ['feat2', 'feat4', 'feat6', 'feat10',
                            'feat18', 'feat19', 'feat20']

    # Bar charts for the categorical features
    def bar_plot(x, y, **kwargs):
        sns.barplot(x=x, y=y)
        plt.xticks(rotation=90)  # rotate category labels (source had rotation=1200, likely a typo)

    f = pd.melt(train_data, id_vars=['price'], value_vars=categorical_features)
    g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False,
                      sharey=False, height=7)  # 'size' was renamed to 'height' in newer seaborn
    g = g.map(bar_plot, 'value', 'price')
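For readers unfamiliar with pd.melt, the following minimal sketch shows the long format it produces for the FacetGrid above. The toy frame is hypothetical and is not the report's dataset:

    import pandas as pd

    toy = pd.DataFrame({'price': [10, 20],
                        'feat2': ['a', 'b'],
                        'feat4': [0, 1]})
    long = pd.melt(toy, id_vars=['price'], value_vars=['feat2', 'feat4'])
    print(long)
    # Each row pairs one feature value with its price:
    #    price variable value
    # 0     10    feat2     a
    # 1     20    feat2     b
    # 2     10    feat4     0
    # 3     20    feat4     1

This is what lets a single FacetGrid draw one bar-chart panel per feature: the 'variable' column selects the panel and the 'value' column supplies the x-axis.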
    # Continuous feature feat17; the other continuous features are handled the same way
    facet = sns.FacetGrid(train_data, hue="price", aspect=4)
    facet.map(sns.kdeplot, 'feat17', shade=True)
    facet.set(xlim=(0, train_data['feat17'].max()))
    facet.add_legend()
    plt.xlabel('feat17')
    plt.ylabel('density')

(Density plots were produced for feature 17, feature 3, and feature 5; the remaining figures are not reproduced here.)
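Since the report notes that the remaining continuous features are plotted the same way, the repeated block can be folded into a loop. A minimal sketch, assuming the numeric_features list and train_data defined above:

    for feat in numeric_features:
        facet = sns.FacetGrid(train_data, hue="price", aspect=4)
        facet.map(sns.kdeplot, feat, shade=True)
        facet.set(xlim=(0, train_data[feat].max()))
        facet.add_legend()
        plt.xlabel(feat)
        plt.ylabel('density')
        plt.show()  # one density figure per continuous feature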
    # Correlation between the features and the price
    train_corr = train_data.drop(['id'], axis=1).corr()
    print(train_corr['price'].sort_values(ascending=False), '\n')

    # Feature correlation heatmap (figure omitted in this extract)
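The heatmap code itself does not survive in this extract; a minimal sketch of one common way to draw it with seaborn, assuming the train_corr matrix computed above:

    plt.figure(figsize=(12, 10))
    # Diverging colormap centered at 0 so positive/negative correlations stand apart
    sns.heatmap(train_corr, cmap='RdBu_r', vmin=-1, vmax=1)
    plt.title('Feature correlation heatmap')
    plt.show()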
Analysis results: the bar charts for features 2, 4, 6, 18, 19, and 20 show little difference across categories, and the density plots for features 3 and 5 show almost no difference in shaded area. We therefore conclude that features 2, 3, 4, 5, 6, 18, 19, and 20 are irrelevant features.

3.2 Model Validation

    # Import declarations: kNN, logistic regression, leave-one-out, and the other models compared
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split   # train/test split
    from sklearn.model_selection import LeaveOneOut        # leave-one-out
    from sklearn.model_selection import cross_val_score    # cross-validation
    from sklearn.linear_model import LogisticRegression    # logistic regression
    from sklearn.linear_model import LinearRegression      # linear regression
    from sklearn.neighbors import KNeighborsClassifier     # kNN
    from sklearn.linear_model import Ridge                 # ridge regression
    from sklearn.tree import DecisionTreeClassifier        # decision tree
    from sklearn.ensemble import RandomForestClassifier    # random forest
    from sklearn import preprocessing
    import warnings
    warnings.filterwarnings('ignore')

Leave-one-out validation is used.
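As a quick illustration of what LeaveOneOut actually generates, here is a minimal sketch on a hypothetical 4-sample array; the toy data is not the report's dataset:

    import numpy as np
    from sklearn.model_selection import LeaveOneOut

    X_toy = np.arange(8).reshape(4, 2)  # 4 samples, 2 features
    loo = LeaveOneOut()
    for train_idx, test_idx in loo.split(X_toy):
        # Each split holds out exactly one sample as the test fold
        print("train:", train_idx, "test:", test_idx)
    # train: [1 2 3] test: [0]
    # train: [0 2 3] test: [1]
    # ... one split per sample, 4 in total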
    # Load the datasets. Because cross-validation is needed, the training and
    # test data are kept together in the training set, which has 1,700 rows
    # and 22 columns (the source is deduplicated here: it loaded the files twice)
    train_data = pd.read_excel('E:/FOut/train.xlsx')
    valid_data = pd.read_excel('E:/FOut/valid.xlsx')

    # Min-max normalization (column-wise)
    x_max = np.max(train_data)
    x_min = np.min(train_data)
    train_data1 = (train_data - x_min) / (x_max - x_min)
    xv_max = np.max(valid_data)
    xv_min = np.min(valid_data)
    valid_data1 = (valid_data - xv_min) / (xv_max - xv_min)

    # Drop the irrelevant features found in 3.1, then train and test
    x_train = train_data1.drop(['price', 'id', 'feat2', 'feat3', 'feat4', 'feat5',
                                'feat6', 'feat10', 'feat18', 'feat19', 'feat20'], axis=1)
    y_train = train_data1['price']
    x_valid = valid_data1.drop(['price', 'id', 'feat2', 'feat3', 'feat4', 'feat5',
                                'feat6', 'feat10', 'feat18', 'feat19', 'feat20'], axis=1)
    y_valid = valid_data1['price']

    # Check that the data meet expectations
    print(x_train)
    print(y_train)
    print(x_valid)
    print(y_valid)

    # Leave-one-out validation, repeated 10 times with different splits
    loo = LeaveOneOut()
    for i in range(1, 11):
        print("random_state is", i, ", and accuracy score is:")
        train_x, test_x, train_y, test_y = train_test_split(
            x_train, y_train, test_size=0.2, random_state=i)  # was random_state=0, which made every repeat identical
        # knn = KNeighborsClassifier(n_neighbors=1).fit(train_x, train_y)
        # logreg = LogisticRegression().fit(train_x, train_y)
        # lr = LinearRegression().fit(train_x, train_y)
        Ri = Ridge().fit(train_x, train_y)
        # tree = DecisionTreeClassifier(max_depth=12).fit(train_x, train_y)
        # forest = RandomForestClassifier(n_estimators=12, random_state=2).fit(train_x, train_y)
        scores = cross_val_score(Ri, x_train, y_train, cv=loo)
        print("Test set accuracy: {:.2f}".format(Ri.score(test_x, test_y)))
        print("Number of cv iterations:", len(scores))
        print("Mean accuracy: {:.2f}".format(scores.mean()))
        # Evaluate on the held-out validation set
        print("valid set accuracy: {:.2f}".format(Ri.score(x_valid, y_valid)))
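One caveat worth flagging: cross_val_score falls back on the estimator's default scorer, which for a regressor like Ridge is R², and R² is ill-defined on the single-sample test folds that LeaveOneOut produces. A minimal sketch of a more robust variant using an explicit error metric; this swaps in scoring='neg_mean_squared_error' and is not the report's original setup:

    from sklearn.linear_model import Ridge
    from sklearn.model_selection import LeaveOneOut, cross_val_score

    # Assumes x_train and y_train from above
    loo = LeaveOneOut()
    mse_scores = cross_val_score(Ridge(), x_train, y_train, cv=loo,
                                 scoring='neg_mean_squared_error')
    # The squared error on a single held-out sample is well defined
    # even though R^2 on one sample is not
    print("Mean LOO MSE: {:.4f}".format(-mse_scores.mean()))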
    # 10-fold cross-validation, repeated with different splits
    for i in range(1, 11):
        train_x, test_x, train_y, test_y = train_test_split(
            x_train, y_train, test_size=0.2, random_state=i)
        # tree = DecisionTreeClassifier(max_depth=12).fit(train_x, train_y)
        # knn = KNeighborsClassifier(n_neighbors=1).fit(train_x, train_y)
        # logreg = LogisticRegression().fit(train_x, train_y)
        # lr = LinearRegression().fit(train_x, train_y)
        # Ri = Ridge().fit(train_x, train_y)
        forest = RandomForestClassifier(n_estimators=12, random_state=2).fit(train_x, train_y)
        scores = cross_val_score(forest, x_train, y_train, cv=10)
        print(scores.mean())
        # Evaluate on the held-out validation set
        print("valid set accuracy: {:.2f}".format(forest.score(x_valid, y_valid)))

3.3 Test Results