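// References:
// http://sobuhu.com/ml/2012/11/11/navie-bayes-classify.html
// http://changfengmingzhi.blog.163.com/blog/static/167105288201392033733991/
//
// Email: goonyangxiaofang@163.com
// QQ: 591247876
//
// Naive Bayes
//
// Input sample
// index class attributes 1.2.3
//
// Test sample
// index class attributes 1.2.3
//
// Wiki: http://en.wikipedia.org/wiki/Naive_Bayes_classifier
// http://msdn.microsoft.com/zh-cn/library/ms174806.aspx
//
// Sample table values. The table layout was lost when this file was
// extracted; the three columns are listed below in the order they appeared.
// NOTE(review): pairing them row by row (and whether they belong to the input
// sample, the test sample, or both) is an assumption - confirm against the
// original train.txt / test.txt.
//   attributes 1.2.3: 0 0 0 | 0 0 1 | 0 1 0 | 0 1 1 | 1 0 0 | 1 0 1 | 1 1 0 | 1 1 1
//   index:            1 2 3 4 5 6 7 8
//   class:            1 0 1 0 1 0 1 1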
#include <cmath>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

// Debug helper: writes the expression text, ": ", and the expression's value
// on one line. The expansion already ends in ';'.
#define MYTRACE(t) cout << #t": " << t << endl;

// Nested containers of doubles. TypeVVV is the conditional-probability table
// used by the classifier, indexed as [class][attribute][value].
typedef std::vector<double> TypeV;
typedef std::vector<std::vector<double> > TypeVV;
typedef std::vector<std::vector<std::vector<double> > > TypeVVV;
// One sample parsed from a line of a data file.
struct Pattern
{
    long id;                   // first field of the line
    long type;                 // class label read from the file; -1 until one is read
    long new_type;             // class label assigned by the classifier; -1 until set
    std::vector<double> data;  // remaining fields of the line (attribute values)

    // Start every field in a known state so that a label that was never
    // read or assigned cannot be mistaken for class 0 or 1.
    Pattern() : id(0), type(-1), new_type(-1) {}

    // Orders patterns by the class label read from the file.
    // const-qualified so it also works on const objects and in standard algorithms.
    bool operator<(const Pattern& p) const
    {
        return type < p.type;
    }
};
bool readData(vector& train_data, const string& file, long f);
void prlongData(const vector& train_data, long f);
void prlongVVV(const TypeVVV& vvv);
void computerVVV(TypeVVV& vvv, const vector& train_data);
void testTestData(vector& test_data, const TypeVVV& vvv);
void estimateResult(const vector& test_data);
long main(long argc, char* argv[])
{
vector train_data;
if (!readData(train_data, "train.txt", 1))
{
cerr << "Read train.txt error!" << endl;
return -1;
}
// prlongData(train_data, 1);
const long D_1st = 2;
const long D_2nd = train_data.size();
const long D_3rd = 2;
TypeV v(D_3rd, 0.0);
TypeVV vv(D_2nd, v);
TypeVVV vvv(D_1st, vv);
computerVVV(vvv, train_data);
// prlongVVV(vvv);
vector test_data;
if (!readData(test_data, "test.txt", 1))
{
cerr << "Read test.txt error!" << endl;
return -2;
}
testTestData(test_data, vvv);
prlongData(test_data, 1);
prlongData(test_data, 2);
estimateResult(test_data);
return 0;
}
bool readData(vector
& train_data, const string& file, long f)
{
ifstream fin(file.c_str());
if (fin.fail())
{
return false;
}
string s;
while (getline(fin, s))
{
istringstream sin(s);
double d;
Pattern pattern;
sin >> pattern.id;
if (f == 1)
{
sin >> pattern.type;
}
while (sin >> d)
{
pattern.data.push_back(d);
}
train_data.push_back(pattern);
}
fin.close();
return true;
}
void prlongData(const vector& train_data, long f)
{
for (size_t i = 0; i < train_data.size(); ++i)
{
cout << train_data[i].id << '\t';
if (f == 1)
{
cout << train_data[i].type << '\t';
}
else if (f == 2)
{
cout << train_data[i].new_type << '\t';
}
for (size_t j = 0; j < train_data[i].data.size(); ++j)
{
cout << train_data[i].data[j] << ' ';
}
cout << endl;
}
cout << endl;
}
void prlongVVV(const TypeVVV& vvv)
{
size_t i, j, k;
for (i = 0; i < vvv.size(); ++i)
{
for (j = 0; j < vvv[i].size(); ++j)
{
for (k = 0; k < vvv[i][j].size(); ++k)
{
cout << vvv[i][j][k] << ' ';
}
cout << endl;
}
cout << endl;
}
cout << endl;
// cout << i << endl << j << endl << k << endl;
}
long computerTypeAmount(const vector
& train_data, long type)
{
long ret = 0;
for (size_t i = 0; i < train_data.size(); ++i)
{
if (train_data[i].type == type)
{
++ret;
}
}
return ret;
}
long computerAmountByValueAttributeType(const vector& train_data,
long value, long attribute, long type)
{
long ret = 0;
for (size_t i = 0; i < train_data.size(); ++i)
{
if (train_data[i].type == type)
{
if (train_data[i].data[attribute] == value)
{
++ret;
}
}
}
return ret;
}
// Fills the probability table from the training data:
//   vvv[i][j][k] = (number of class-i samples whose attribute j equals k)
//                  / (number of class-i samples)
// The table must already be sized by the caller; its dimensions decide which
// classes, attributes and values are evaluated.
//
// A class with no training samples gets probability 0.0 for every entry
// instead of the NaN that 0/0 would produce.
void computerVVV(TypeVVV& vvv, const std::vector<Pattern>& train_data)
{
    for (size_t i = 0; i < vvv.size(); ++i)
    {
        const long type = static_cast<long>(i);
        const long type_amount = computerTypeAmount(train_data, type);
        for (size_t j = 0; j < vvv[i].size(); ++j)
        {
            for (size_t k = 0; k < vvv[i][j].size(); ++k)
            {
                if (type_amount == 0)
                {
                    vvv[i][j][k] = 0.0;
                    continue;
                }
                const long matches = computerAmountByValueAttributeType(
                    train_data, static_cast<long>(k), static_cast<long>(j), type);
                vvv[i][j][k] = 1.0 * matches / type_amount;
            }
        }
    }
}
// Base-2 logarithm of n, obtained from the natural logarithm by change of
// base: log2(n) = ln(n) / ln(2).
// NOTE(review): <cmath> in C++11 and later also declares a log2 function;
// confirm this definition does not clash with it on the target toolchain.
double log2(double n)
{
    const double natural_log_of_n = std::log(n);
    const double natural_log_of_two = std::log(2.0);
    return natural_log_of_n / natural_log_of_two;
}
void testTestData(vector& test_data, const TypeVVV& vvv)
{
for (size_t t = 0; t < test_data.size(); ++t)
{
double p = 0.0;
long typ = 0;
for (size_t i = 0; i < vvv.size(); ++i)
{
double q = 0.0;
for (size_t j = 0; j < vvv[i].size(); ++j)
{
q = q + (-log2(vvv[i][j][static_cast
(test_data[t].dat
a[j])] + 0.0001));
}
// cout << q << '\t';
if (q > p)
{
p = q;
typ = i;
// cout << "test" << endl;
}
}
// cout << endl;
// cout << typ << endl;
test_data[t].new_type = typ;
}
}
void estimateResult(const vector& test_data)
{
const long D_1st = 2;
vector v(5, 0.0);
vector > vv(D_1st, v);
for (long i = 0; i < D_1st; ++i)
{
for (size_t j = 0; j < test_data.size(); ++j)
{
if (test_data[j].type == i)
{
++vv[i][0];
}
if (test_data[j].new_type == i)
{
++vv[i][1];
}
if (test_data[j].type == test_data[j].new_type && test_data[j].type ==
i)
{
}
}
++vv[i][2];
}
double total_right = 0.0;
double weighting_recall = 0.0;
double weighting_precision = 0.0;
for (long i = 0; i < D_1st; ++i)
{
cout << i << ": " << endl;
// MYTRACE(vv[i][0]);
// MYTRACE(vv[i][1]);
// MYTRACE(vv[i][2]);
cout << "Recall: " << (vv[i][3] = vv[i][2] / vv[i][0]) << endl;
cout << "Precison: " << (vv[i][4] = vv[i][2] / vv[i][1]) << endl;
cout << endl;
weighting_recall += vv[i][3] * vv[i][0] / test_data.size();
weighting_precision += vv[i][4] * vv[i][0] / test_data.size();
total_right += vv[i][3];
}
cout << "right: " << total_right / test_data.size() << endl << endl;
cout << "Weight Recall: " << weighting_recall << endl;
cout << "Weight Precision: " << weighting_precision << endl;
cout << endl;
}