Classification Example 2: Predicting Wine Quality with Softmax Regression

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LogisticRegressionCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import label_binarize
from sklearn import metrics

# Configure matplotlib fonts (SimHei renders CJK characters; harmless for English labels)
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
# Suppress convergence warnings
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# Read the data
path1 = "datas/winequality-red.csv"
df1 = pd.read_csv(path1, sep=";")
# Mark these rows as red wine
df1['type'] = 1
path2 = "datas/winequality-white.csv"
df2 = pd.read_csv(path2, sep=";")
# Mark these rows as white wine
df2['type'] = 2
# Concatenate the two DataFrames
df = pd.concat([df1, df2], axis=0)

# Feature column names
names = ["fixed acidity","volatile acidity","citric acid",
         "residual sugar","chlorides","free sulfur dioxide",
         "total sulfur dioxide","density","pH","sulphates",
         "alcohol", "type"]
# Target column name
quality = "quality"

# Show the first 5 rows
df.head(5)

# Handle anomalous values
new_df = df.replace('?', np.nan)
# Drop any row that contains a missing value
datas = new_df.dropna(how='any')
print("Rows before cleaning: %d; rows after cleaning: %d; rows dropped: %d" % (len(df), len(datas), len(df) - len(datas)))

# Extract features and target
X = datas[names]
Y = datas[quality]

# 1. Split the data
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=0)

print ("训练数据条数:%d;数据特征个数:%d;测试数据条数:%d" % (X_train.shape[0], X_train.shape[1], X_test.shape[0]))

# 2. Feature scaling
ss = MinMaxScaler()
X_train = ss.fit_transform(X_train)
# Inspect the range and class counts of y
Y_train.value_counts()
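
# MinMaxScaler maps each feature (column) to [0, 1] independently via
# (x - min) / (max - min), with min and max learned from the training set only
# (they are stored in ss.data_min_ / ss.data_max_ and reused by ss.transform).
# A quick sanity check on the scaled matrix above:
print(X_train.min(axis=0))  # expected: all (near) 0
print(X_train.max(axis=0))  # expected: all 1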

# 3. Build and train the model
# penalty: regularization type used against overfitting, 'l1' or 'l2'
# solver: optimization algorithm
# with penalty='l1' the solver must be 'liblinear' (coordinate descent);
# with penalty='l2' it can also be 'lbfgs' (quasi-Newton) or 'newton-cg' (a Newton variant)
# multi_class: multiclass strategy, 'ovr' or 'multinomial'; the two are equivalent on binary
# problems but behave differently on multiclass problems
# ovr: one-vs-rest; for T classes, T independent binary classifiers are trained, each separating
# one class from the remaining T-1, and the class with the highest score wins
# multinomial: softmax regression proper; a single model is trained jointly over all T classes,
# and the probability of class j is exp(theta_j^T x) / sum_i exp(theta_i^T x)
# class_weight: per-class weights (e.g., to compensate for class imbalance)
# In sklearn, softmax regression is just logistic regression with different parameters:
# binary logistic regression uses 'ovr'; for softmax regression use multi_class='multinomial'
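
# As a quick illustration of the strategy choice, a minimal sketch fitting both
# variants with plain LogisticRegression (fixed default C rather than the
# cross-validated search used below) on the training data prepared above:
from sklearn.linear_model import LogisticRegression
for strategy in ['ovr', 'multinomial']:
    clf = LogisticRegression(multi_class=strategy, solver='lbfgs', max_iter=1000)
    clf.fit(X_train, Y_train)
    print(strategy, 'train accuracy: %.3f' % clf.score(X_train, Y_train))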

lr = LogisticRegressionCV(fit_intercept=True, Cs=np.logspace(-5, 1, 100), 
                          multi_class='multinomial', penalty='l2', solver='lbfgs')
lr.fit(X_train, Y_train)

# 4. Evaluate the model
# score() returns mean accuracy for classifiers (not R^2)
acc = lr.score(X_train, Y_train)
print("Training accuracy:", acc)
# with penalty='l2' this is typically 0%; exact zeros come from l1 regularization
print("Fraction of zero coefficients: %.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
print("Coefficients:", lr.coef_)
print("Intercepts:", lr.intercept_)

# 5. Predict on the test set
# a. Scale the test data with the scaler fitted on the training data (transform only, no refit)
X_test = ss.transform(X_test)
# b. Predict
Y_predict = lr.predict(X_test)
# c. Plot the results
x_len = range(len(X_test))
plt.figure(figsize=(14,7), facecolor='w')
plt.ylim(-1,11)
plt.plot(x_len, Y_test, 'ro', markersize=8, zorder=3, label=u'Actual')
plt.plot(x_len, Y_predict, 'go', markersize=12, zorder=2, label=u'Predicted, accuracy=%.3f' % lr.score(X_test, Y_test))
plt.legend(loc='upper left')
plt.xlabel(u'Sample index', fontsize=18)
plt.ylabel(u'Wine quality', fontsize=18)
plt.title(u'Wine quality prediction', fontsize=20)
plt.show()
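
# Accuracy alone hides how the model does on the rarer quality levels; the
# metrics module imported at the top gives an optional per-class breakdown:
print(metrics.classification_report(Y_test, Y_predict))
print(metrics.confusion_matrix(Y_test, Y_predict))  # rows: actual, columns: predicted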

# Inspect the distribution of the quality label
[len(df[df.quality == i]) for i in range(11)]

df.quality.value_counts()

# Rebuild the model after dimensionality reduction with PCA
# (feature extraction and dimensionality reduction do not always improve a model)
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer

# 1. Split the data
X1_train,X1_test,Y1_train,Y1_test = train_test_split(X,Y,test_size=0.025,random_state=0)
print("Training samples: %d; number of features: %d; test samples: %d" % (X1_train.shape[0], X1_train.shape[1], X1_test.shape[0]))

# 2. Feature transformation: Normalizer rescales each sample (row) to unit norm,
# unlike MinMaxScaler, which rescales each feature (column) to a fixed range
ss2 = Normalizer()
# Fit on and transform the training data
X1_train = ss2.fit_transform(X1_train)
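
# The difference between the two scalers is easy to see on a toy array
# (self-contained, independent of the wine data):
demo = np.array([[1.0, 2.0],
                 [3.0, 4.0]])
print(Normalizer().fit_transform(demo))    # rows scaled to unit L2 norm: [[0.447 0.894] [0.6 0.8]]
print(MinMaxScaler().fit_transform(demo))  # columns mapped to [0, 1]:    [[0. 0.] [1. 1.]]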

# 3. Feature selection: keep only the 3 most relevant features
# skb = SelectKBest(chi2, k=3)
# Fit the selector and transform the training data
# X1_train = skb.fit_transform(X1_train, Y1_train)

# 4. Dimensionality reduction
# Reduce the samples to 5 dimensions
# pca = PCA(n_components=5)
# X1_train = pca.fit_transform(X1_train)
# print("Explained variance ratio:", pca.explained_variance_ratio_)

# 5. Build the model
lr2 = LogisticRegressionCV(fit_intercept=True, Cs=np.logspace(-5, 1, 100), 
                          multi_class='multinomial', penalty='l2', solver='lbfgs')
lr2.fit(X1_train, Y1_train)

# 6. Evaluate the model
acc2 = lr2.score(X1_train, Y1_train)
print("Training accuracy:", acc2)
print("Fraction of zero coefficients: %.2f%%" % (np.mean(lr2.coef_.ravel() == 0) * 100))
print("Coefficients:", lr2.coef_)
print("Intercepts:", lr2.intercept_)

# 7. Predict on the test set
# a. Apply the transformations fitted on the training data
# Scale the test data
X1_test = ss2.transform(X1_test)
# Feature selection for the test data
# X1_test = skb.transform(X1_test)
# Dimensionality reduction for the test data (transform only; refitting PCA on test data would be a leak)
# X1_test = pca.transform(X1_test)

# b. Predict
Y1_predict = lr2.predict(X1_test)

# c. Plot the results
x1_len = range(len(X1_test))
plt.figure(figsize=(14,7), facecolor='w')
plt.ylim(-1,11)
plt.plot(x1_len, Y1_test, 'ro', markersize=8, zorder=3, label=u'Actual')
plt.plot(x1_len, Y1_predict, 'go', markersize=12, zorder=2, label=u'Predicted, accuracy=%.3f' % lr2.score(X1_test, Y1_test))
plt.legend(loc='upper left')
plt.xlabel(u'Sample index', fontsize=18)
plt.ylabel(u'Wine quality', fontsize=18)
plt.title(u'Wine quality prediction (with dimensionality reduction)', fontsize=20)
plt.show()
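
# Every transform above has to be fit on training data only and then merely
# applied at predict time; sklearn's Pipeline enforces that ordering by itself.
# A hypothetical sketch bundling this section's steps into one estimator
# (step names are illustrative; it re-splits the raw X/Y from earlier):
from sklearn.pipeline import Pipeline

Xp_train, Xp_test, yp_train, yp_test = train_test_split(X, Y, test_size=0.25, random_state=0)
pipe = Pipeline([
    ('scale', Normalizer()),
    ('reduce', PCA(n_components=5)),  # SelectKBest(chi2, k=3) could be swapped in here
    ('clf', LogisticRegressionCV(Cs=np.logspace(-5, 1, 100), multi_class='multinomial',
                                 penalty='l2', solver='lbfgs')),
])
pipe.fit(Xp_train, yp_train)  # runs fit_transform on each step in order
print("Test accuracy: %.3f" % pipe.score(Xp_test, yp_test))  # transform only; no test leakage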

