声明:
按类别特征词选择算法声明
// Local DF method: scores every dictionary word against one class and returns
// the (word, score) pairs ordered for that class.
vector < pair < string , double > > LocalDFFeatureSelectionForPerclass(DICTIONARY & mymap, CONTINGENCY & contingencyTable, string classLabel);
// Driver for DF feature selection: calls the per-class local DF selection for
// each label in classLabels, with N as the overall keyword budget and address
// as the output file path.
void DFFeatureSelection(vector < string > classLabels,DICTIONARY & mymap,CONTINGENCY & contingencyTable, int N, char * address);
函数实现:
对词典中的每个词,统计其在某一个类别中出现的次数,并按词频从大到小排序
/* ********************************************************************** */
/* Per-class DF (document frequency) feature scoring                      */
/* ********************************************************************** */
/**
 * Scores every term of the dictionary against a single class.
 *
 * For each key of `mymap`, the contingency table is queried with the compound
 * key (term, classLabel) and the `.first` field of the stored entry is taken
 * as the term's score for that class. The resulting (term, score) list is
 * then stable-sorted with the project comparator `isLarger` (defined
 * elsewhere; presumably "larger score first" -- confirm against its
 * definition).
 *
 * The table is only read: a (term, class) pair that has no entry contributes
 * a score of 0 and is NOT inserted into `contingencyTable`.
 *
 * @param mymap            dictionary whose keys are the candidate terms
 * @param contingencyTable table keyed by (term, class label)
 * @param classLabel       class for which the terms are scored
 * @return one (term, score) pair per dictionary term, in sorted order
 *
 * Side effect: prints the elapsed CPU time (seconds) to stdout.
 */
vector < pair < string , double > > Preprocess::LocalDFFeatureSelectionForPerclass(DICTIONARY & mymap,CONTINGENCY & contingencyTable , string classLabel)
{
    const clock_t start = clock();

    vector < pair < string , double > > DFinfo;
    DFinfo.reserve(mymap.size()); // exactly one result per dictionary term

    for (map < string , vector < pair < int , int > > > ::iterator it = mymap.begin(); it != mymap.end(); ++it)
    {
        const pair < string , string > compoundKey = make_pair(it -> first, classLabel);

        // Use find() rather than operator[]: operator[] would default-insert
        // an entry for every (term, class) pair that is absent, silently
        // growing the caller's table during what should be a read-only pass.
        double classCount = 0.0;
        CONTINGENCY::iterator hit = contingencyTable.find(compoundKey);
        if (hit != contingencyTable.end())
        {
            classCount = static_cast < double > (hit -> second.first);
        }

        DFinfo.push_back(make_pair(it -> first, classCount));
    }

    // stable_sort keeps terms with equal scores in dictionary order.
    stable_sort(DFinfo.begin(), DFinfo.end(), isLarger);

    const clock_t finish = clock();
    const double totaltime = static_cast < double > (finish - start) / CLOCKS_PER_SEC;
    cout << " 为类别 " << classLabel << " 遴选特征词共用了 " << totaltime << endl;
    return DFinfo;
}
DF特征词选择法:
代码
/* ********************************************************************** */
/* DF feature selection                                                   */
/* ********************************************************************** */
/**
 * Selects keywords for all classes by per-class DF ranking and writes them
 * to a file, one keyword per line.
 *
 * The overall budget N is split between the classes in proportion to the
 * number of training articles each class has:
 *     quota(class) = articles(class) * N / totalTrainingArticles
 * For each class the first `quota` terms of the ranking returned by
 * LocalDFFeatureSelectionForPerclass are added to a set, so a term chosen for
 * several classes is written only once.
 *
 * A quota is never allowed to exceed the number of ranked terms, and it is 0
 * when the training corpus is empty or the computed value is negative.
 *
 * @param classLabels      labels of the classes to select keywords for
 * @param mymap            dictionary of candidate terms
 * @param contingencyTable table keyed by (term, class label)
 * @param N                total keyword budget shared by all classes
 * @param address          path of the output file
 *
 * Side effects: (over)writes the file at `address`; prints the number of
 * selected keywords and the elapsed CPU time (seconds) to stdout.
 */
void Preprocess:: DFFeatureSelection(vector < string > classLabels,DICTIONARY & mymap,CONTINGENCY & contingencyTable, int N, char * address)
{
    typedef vector < pair < string , double > > ScoredTerms;

    // Total number of articles in the training corpus.
    const int totalTraingingCorpus = endIndex - beginIndex + 1;
    set < string > finalKeywords; // union of the keywords picked for every class

    const clock_t start = clock();
    for (vector < string > ::iterator it = classLabels.begin(); it != classLabels.end(); ++it)
    {
        // Number of training articles that belong to this class.
        const int N_subClassCnt = getCategorizationNum( * it, " TrainingCorpus " );

        // Share of the budget for this class. Computed in 64 bits so the
        // product cannot overflow, and guarded against an empty corpus
        // (division by zero).
        long long quota = 0;
        if (totalTraingingCorpus > 0)
        {
            quota = static_cast < long long > (N_subClassCnt) * N / totalTraingingCorpus;
        }

        ScoredTerms DFInfo = LocalDFFeatureSelectionForPerclass(mymap, contingencyTable, * it);

        // Never read past the end of the ranking: the quota may be larger
        // than the dictionary, and a negative quota must select nothing.
        ScoredTerms::size_type take = 0;
        if (quota > 0)
        {
            take = DFInfo.size();
            if (static_cast < unsigned long long > (quota) < take)
            {
                take = static_cast < ScoredTerms::size_type > (quota);
            }
        }

        for (ScoredTerms::size_type j = 0; j < take; ++j)
        {
            finalKeywords.insert(DFInfo[j].first);
        }
    }

    ofstream outfile(address);
    for (set < string > ::iterator kw = finalKeywords.begin(); kw != finalKeywords.end(); ++kw)
    {
        outfile << * kw << '\n'; // close() below flushes once for the whole file
    }
    outfile.close();

    cout << " 最后共选择特征词 " << finalKeywords.size() << endl;
    const clock_t finish = clock();
    const double totaltime = static_cast < double > (finish - start) / CLOCKS_PER_SEC;
    cout << " 遴选特征词共有了 " << totaltime << endl;
}
主函数调用:
代码
// Call sequence from main: load the dictionary and the contingency table
// from disk, then run DF feature selection with a total budget of 2000
// keywords and write the result to keywords.dat.
// NOTE(review): the leading/trailing spaces inside the path literals look like
// a code-formatter artifact -- confirm the real paths before running.
p.LoadDictionary(mymap, " F:\\finallyliuyu\\dict.dat " );
p.LoadContingencyTable(contingenyTable, " F:\\finallyliuyu\\contingency.dat " );
p.DFFeatureSelection(labels,mymap,contingenyTable, 2000 , " F:\\finallyliuyu\\keywords.dat " );