2019独角兽企业重金招聘Python工程师标准>>>
第18题要求在第16题 Random PLA 算法的基础上使用 Pocket 算法对数据做二元划分。Pocket算法在第2篇文章介绍过,通常用来处理有杂质的数据集,在每一次更新 Weights(权向量)之后,把当前犯错最少的Weights放在pocket中,直至达到指定迭代次数(50次),pocket中的Weights即为所求。然后用测试数据验证W(pocket)的错误率,进行2000次计算取平均。
#include <fstream>
#include <iostream>
#include <vector>
#include <ctime>
using namespace std;
#define DEMENSION 5 //数据维度
int index = 0; //当前数据条目索引
int step = 0; //当前权向量更新次数
char *file = "training_data.txt";
char *file_test = "test_data.txt";
struct record {
double input[DEMENSION]; //输入
int output; //输出
};
int sign(double x)
{
//同Q16
}
//两个向量相加,更新第一个向量
void add(double *v1, double *v2, int demension)
{
//同Q16
}
//两个向量相乘,返回内积
double multiply(double *v1, double *v2, int demension)
{
//同Q16
}
//向量与实数相乘,结果通过*result返回,不改变参与计算的向量
void multiply(double *result, double *v, double num, int demension)
{
//同Q16
}
//对 traininig set 创建一个随机排序
void setRandomOrder(vector<record> &trainingSet, vector<int> &randIndexes)
{
//同Q16
}
//读取数据
void getData(ifstream & dataFile, vector<record> &data)
{
//同Q16
}
//错误统计及Pocket向量更新
void errCountAndPocketUpdate(vector<record> &trainingSet, vector<int> &randIndexes,
double *weights, double *pocketWeights, double &trainingErrRate, int dataLength)
{
int errCount = 0;
double curTrainingErrRate = 1.0;
for(int i=0;i<dataLength;++i){
if(trainingSet[randIndexes[i]].output
!= sign(multiply(weights,trainingSet[randIndexes[i]].input,DEMENSION))){
errCount++;
}
}
curTrainingErrRate = double(errCount)/double(dataLength);
if(curTrainingErrRate < trainingErrRate){
trainingErrRate = curTrainingErrRate;
for(int j=0; j<DEMENSION; ++j){
pocketWeights[j] = weights[j];
}
}
}
void Pocket(vector<record> &trainingSet, vector<int> &randIndexes, double *weights,
double *pocketWeights, double &trainingErrRate)
{
int length = trainingSet.size();
double curInput[DEMENSION];
errCountAndPocketUpdate(trainingSet, randIndexes, weights,
pocketWeights, trainingErrRate, length);
//找到下一个错误记录的index
while( trainingSet[randIndexes[index]].output ==
sign(multiply(weights,trainingSet[randIndexes[index]].input,DEMENSION)) ){
if(index==length-1) {index = 0;}
else {index++;}
}
if(step<50){
step++;
//更新: weights = weights + curOutput * curInput
multiply( curInput, trainingSet[randIndexes[index]].input,
trainingSet[randIndexes[index]].output, DEMENSION );
add( weights, curInput, DEMENSION );
if(index==length-1) {index = 0;}
else {index++;}
Pocket(trainingSet, randIndexes, weights, pocketWeights, trainingErrRate);
}else{
return;
}
}
//统计 W(pocket) 在测试数据集上的错误率
double getTestErrRate(vector<record> &testSet, double *pocketWeights, int dataLength)
{
int errCount = 0;
for(int i=0;i<dataLength;++i){
if(testSet[i].output !=
sign(multiply(pocketWeights,testSet[i].input,DEMENSION))){
errCount++;
}
}
return double(errCount)/double(dataLength);
}
void main()
{
double totalTestErrRate = 1.0;
for(int i=0;i<2000;++i){
double weights[DEMENSION]; //当前权重向量
double pocketWeights[DEMENSION]; //当前最优权重向量
vector<record> trainingSet; //训练数据
vector<record> testSet; //测试数据
vector<int> randIndexes; //访问数据的随机索引列表
ifstream dataFile(file);
ifstream testDataFile(file_test);
double trainingErrRate = 1.0; //训练集中的错误率[0.0, 1.0]
double testErrRate = 1.0; //测试集中的错误率[0.0, 1.0]
step = 0;
index = 0;
if( dataFile.is_open() && testDataFile.is_open() ){
getData(dataFile,trainingSet);
getData(testDataFile,testSet);
setRandomOrder(trainingSet,randIndexes);
}else{
cerr<<"ERROR ---> 文件打开失败"<<endl;
exit(1);
}
for(int j=0;j<DEMENSION;++j){
weights[j] = 0.0;
pocketWeights[j] = 0.0;
}
Pocket(trainingSet, randIndexes, weights, pocketWeights, trainingErrRate);
testErrRate = getTestErrRate(testSet,pocketWeights,testSet.size());
totalTestErrRate += testErrRate;
cout<<"\n ******************** 第 "<<i+1
<<" 次计算结束 ************************* \n"<<endl;
}
cout<<"Average Perportion = "<<totalTestErrRate/2000<<endl;
}
题19要求用经过50次更新的W(50)进行验证,而不是W(pocket),由于W(50)未必是当下最优,所以平均错误率一定会升高。代码几乎没有改动,只需在调用 getTestErrRate 函数是传入W(50)的指针即可。
本题要求把 Weights 的更新次数从50增加到100,可以预计平均错误率是降低的。