Logistic回归的使用和缺失值的处理
从疝气病预测病马的死亡率
数据集:
UCI上的数据,368个样本,28个特征
测试方法:
交叉测试
实现细节:
1. 数据中因为存在缺失值,所以要进行预处理,这点待会再单独谈。
2. 数据中本来有三个标签,这里为了简单,直接将"未能存活"和"安乐死"合并为一类。
3. 代码中重复计算10次并求平均错误率。
缺失值的处理:
一般来说有这么几种方法处理缺失值:
- 人工填写缺失值
- 使用一个全局常量填充缺失值
- 忽略有缺失值的样本
- 使用属性的中心度量(均值或中位数等)填充缺失值
- 使用与给定元组同一类的所有样本的属性均值或中位数
- 使用最可能的值(需要用机器学习算法推导)

对不同的数据我们要采用不同的方法。这里考虑到我们使用的是Logistic回归,可以采用0填充,因为用0填充后,在执行
weight = weight + alpha * error * dataMatrix[randIndex]
的时候,取值为0的特征所对应的权重不会被更新,并且sigmoid(0)=0.5,它对结果也不会产生影响。
-
#coding=utf-8
"""Logistic regression trained by stochastic gradient ascent.

The script trains on ``horseColicTraining.txt``, evaluates on
``horseColicTest.txt`` and reports the error rate averaged over several
independent runs.  It runs unchanged on Python 2 and Python 3.
"""
# NOTE: the star import makes ``sum``, ``random``, ``exp``, ``clip`` etc.
# resolve to the NumPy versions everywhere below.
from numpy import *


def loadDataSet():
    """Load the two-feature toy data set from ``testSet.txt``.

    Each non-blank line holds two feature values and an integer label,
    separated by whitespace.

    Returns:
        (dataMat, labelMat): ``dataMat`` is a list of ``[1.0, x1, x2]`` rows
        (the leading 1.0 is the bias term); ``labelMat`` is a list of ints.
    """
    dataMat = []
    labelMat = []
    with open('testSet.txt') as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                continue  # tolerate blank lines (e.g. a trailing newline)
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Return the logistic function ``1 / (1 + exp(-inX))``.

    The argument is clipped to [-700, 700] before exponentiation so that
    ``exp`` cannot overflow; outside that range the result is already
    indistinguishable from 0 or 1 for classification purposes.
    """
    return 1.0 / (1.0 + exp(-clip(inX, -700, 700)))


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights by stochastic gradient ascent.

    Args:
        dataMatrix: 2-D array-like of shape (m, n), one sample per row.
        classLabels: sequence of m labels (0 or 1), indexable by row number.
        numIter: number of passes over the whole data set.

    Returns:
        A 1-D NumPy array of n weights.
    """
    dataMatrix = asarray(dataMatrix, dtype=float)
    m, n = shape(dataMatrix)
    weight = ones(n)
    for j in range(numIter):
        # Rows not yet visited in this pass.  It must be a real list so
        # that entries can be deleted (a Python 3 ``range`` cannot).
        dataIndex = list(range(m))
        for i in range(m):
            # Step size decays with progress but never reaches zero.
            alpha = 4 / (1.0 + j + i) + 0.01
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Map the position in the shrinking list back to the actual
            # row number; indexing the data with ``randIndex`` directly
            # would revisit some rows and skip others within a pass.
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex] * weight))
            error = classLabels[sampleIndex] - h
            weight = weight + alpha * error * dataMatrix[sampleIndex]
            del dataIndex[randIndex]
    return weight


def classifyVector(inX, weights):
    """Classify one sample: return 1.0 if P(label = 1) > 0.5, else 0.0."""
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    return 0.0


def _loadColicFile(fileName, numFeatures=21):
    """Read a tab-separated horse-colic file into (features, labels).

    Every non-blank line must contain ``numFeatures`` feature columns
    followed by the label column.  Labels are parsed through ``float`` so
    that both ``1`` and ``1.000000`` are accepted.
    """
    features = []
    labels = []
    with open(fileName) as fr:
        for line in fr:
            currLine = line.strip().split('\t')
            if currLine == ['']:
                continue  # blank line
            features.append([float(v) for v in currLine[:numFeatures]])
            labels.append(float(currLine[numFeatures]))
    return features, labels


def colicTest():
    """Train on the training file, evaluate on the test file.

    Prints and returns the fraction of misclassified test samples.
    """
    trainingSet, trainingLabels = _loadColicFile('horseColicTraining.txt')
    testSet, testLabels = _loadColicFile('horseColicTest.txt')
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0
    numTestVec = 0.0
    for lineArr, label in zip(testSet, testLabels):
        numTestVec += 1.0
        if int(classifyVector(array(lineArr), trainWeights)) != int(label):
            errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("the error rate of this test is: %f" % errorRate)
    return errorRate


def multiTest():
    """Run ``colicTest`` ten times and print the average error rate."""
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f"
          % (numTests, errorSum / float(numTests)))


def main():
    """Script entry point."""
    multiTest()


if __name__ == '__main__':
    main()