/*C++ program to show cosine similarities*/
#include<iostream>//include the library of input output stream
#include<fstream>//Header file for reading the word in the file
#include<vector>//Header file for storing the word in the documents
#include<map>//for relating the terms,frequencies,TF&TF_IDF and cosine smilarities
#include<cmath>//for doing mathematical calculation
#include<algorithm>//for sorting
using namespace std;//allows us to use the standard library
//string Lower(string& lowercase_term);//prototype for changing words to lowercase
// Forward declaration so main() can call the function defined further down.
// The definition reads "regs.txt", fills the three containers passed by
// reference, and prints one line of statistics per term.
void tf_idf_compute(map<string,int> &frequency,vector<vector<string> > &documents, vector<string> &terms);//frequency: term -> occurrence count; documents: one token list per document; terms: vocabulary
// Program entry point: prints the column header for the report and then
// delegates all reading, computing and printing to tf_idf_compute().
int main()
{
    // Column titles, emitted as one header line.
    std::cout << "Terms\t\t" << "Term frequency\t\t\t" << "TF_IDF\t\t" << "CosineSimilarity" << "\n";

    // Containers that tf_idf_compute() fills in by reference.
    std::map<std::string,int> term_counts;               // term -> number of occurrences
    std::vector<std::vector<std::string> > corpus;       // one token list per document
    std::vector<std::string> vocabulary;                 // distinct terms

    tf_idf_compute(term_counts, corpus, vocabulary);
    return 0;
}
// Returns the number of entries in `documents` that contain `term` at least once.
static int count_documents_containing(const std::vector<std::vector<std::string> > &documents, const std::string &term)
{
    int count = 0;
    for (std::size_t i = 0; i < documents.size(); ++i)
    {
        if (std::find(documents[i].begin(), documents[i].end(), term) != documents[i].end())
        {
            ++count;
        }
    }
    return count;
}

/*
 * Reads the corpus from "regs.txt" and prints one line per term:
 *     <term> <frequency> <tf_idf> <cosine>
 *
 * Input format: whitespace-separated tokens; a standalone "#" token ends the
 * current document. A trailing document without a closing "#" is still kept,
 * and empty documents (e.g. "# #") are skipped.
 *
 * Parameters (all in-out, appended to / updated in place):
 *   frequency - term -> total number of occurrences over all documents.
 *   documents - one token list per non-empty document, in file order.
 *   terms     - vocabulary; sorted and de-duplicated on return.
 *
 * Weighting: tf_idf = (1 + log10(tf)) * log10(N / df), where N is the number
 * of documents and df is the number of documents containing that term.
 * A term with tf <= 0 or df == 0 gets tf_idf = 0 instead of -inf/NaN.
 *
 * If the file cannot be opened, "file not found" is printed and the
 * containers are left untouched.
 */
void tf_idf_compute(std::map<std::string,int> &frequency,std::vector<std::vector<std::string> > &documents,std::vector<std::string> &terms)
{
    // Input-only stream: a plain fstream opens read+write by default and
    // therefore fails on read-only files.
    std::ifstream file("regs.txt");
    if (!file)
    {
        std::cout << "file not found" << std::endl;
        return;
    }

    // Tokenise the file. Looping on the extraction itself (instead of eof())
    // also terminates cleanly on any other stream failure.
    std::vector<std::string> current_document;
    std::string token;
    while (file >> token)
    {
        if (token == "#")
        {
            if (!current_document.empty())
            {
                documents.push_back(current_document);
                current_document.clear();
            }
            continue;
        }
        current_document.push_back(token);
        terms.push_back(token);
        ++frequency[token];
    }
    if (!current_document.empty())
    {
        // Last document was not terminated by "#".
        documents.push_back(current_document);
    }

    // Build the vocabulary once, after reading, rather than after every token.
    std::sort(terms.begin(), terms.end());
    terms.erase(std::unique(terms.begin(), terms.end()), terms.end());

    const double document_count = static_cast<double>(documents.size());
    for (std::map<std::string,int>::const_iterator it = frequency.begin(); it != frequency.end(); ++it)
    {
        const int term_count = it->second;

        // Document frequency is computed per term; the division below is done
        // in floating point so the ratio N/df is not truncated.
        const int df = count_documents_containing(documents, it->first);

        float tf_idf = 0.0f;
        if (term_count > 0 && df > 0)
        {
            tf_idf = static_cast<float>((1.0 + std::log10(static_cast<double>(term_count)))
                                        * std::log10(document_count / df));
        }

        // Cosine between the one-dimensional vectors (tf_idf) and (frequency).
        // NOTE(review): for scalars this is just the sign of the product;
        // confirm whether a per-document vector comparison was intended.
        // A zero norm yields 0 rather than 0/0 = NaN.
        const float norm = std::fabs(tf_idf) * std::fabs(static_cast<float>(term_count));
        float cosine = 0.0f;
        if (norm > 0.0f)
        {
            cosine = (tf_idf * term_count) / norm;
        }

        std::cout << it->first << " " << term_count << " " << tf_idf << " " << cosine << '\n';
    }

    std::cin.get(); // keeps the console window open until a key is pressed
}
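/*
====================================================================
Input file format
The program reads a text file named "regs.txt" from its working directory.
Terms are separated by whitespace, and a standalone "#" marks the end of
each document. Example contents of "regs.txt":

Information retrieval #
Information retrieval it is a discipline #
organization and storage should provide easy access #
*/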
/*
Saturday, 30 November 2013
Tuesday, 26 November 2013

Compute Recall, Precision and F-Measure using C++
Copy and paste the data below into a .txt file:
True-Positives= 20
False-Positives= 40
False-Negatives= 60
True-Negatives= 100
Save both files in the same folder and run the program.

Subscribe to: Posts (Atom)
*/