Skip to content

Commit

Permalink
Merge pull request #68 from R-Goc/change-RNG-in-kmeans.h
Browse files Browse the repository at this point in the history
Changing the RNG in kmeans.h to platform agnostic.
  • Loading branch information
spirosmaggioros authored Jul 8, 2024
2 parents 18312c2 + 641b777 commit fb62936
Showing 1 changed file with 70 additions and 51 deletions.
121 changes: 70 additions & 51 deletions src/machine_learning/clustering/kmeans/kmeans.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,111 +15,130 @@ using json = nlohmann::json;
/**
* @ brief Class for the kmeans clustering algorithm
*/
class kmeans{
class kmeans
{
private:
/**
* @ brief distance function
* @ param a the first input vector
* @ param b the second input vector
* @ return int64_t the euclidean distance between vector a and b
*
*/
double distance(std::vector<double> &a, std::vector<double> &b){
return sqrt(pow((a[0] - b[0]) , 2) + pow((a[1] - b[1]), 2));
* @ brief distance function
* @ param a the first input vector
* @ param b the second input vector
* @ return int64_t the euclidean distance between vector a and b
*
*/
double distance(std::vector<double> &a, std::vector<double> &b)
{
return sqrt(pow((a[0] - b[0]), 2) + pow((a[1] - b[1]), 2));
}
std::vector<std::vector<double>> data;
int K;
std::vector<std::vector<double> > cluster_centers;
std::vector<std::vector<double>> cluster_centers;
std::map<std::vector<double>, int64_t> assignments;

public:
/**
* @brief Constructor for the kmeans class
* @param data: the input data(2D vector)
* @param K: the number of clusters
* @param MAX_ITER: default 500, maximum iterations till it converges
*/
kmeans(std::vector<std::vector<double>> data, int K, int64_t MAX_ITER=1500) : data(data), K(K) {


for(int i = 0; i<K; i++){
int rand_num = arc4random() % data.size() - 1;
* @brief Constructor for the kmeans class
* @param data: the input data(2D vector)
* @param K: the number of clusters
* @param MAX_ITER: default 500, maximum iterations till it converges
*/
kmeans(std::vector<std::vector<double>> data, int K, int64_t MAX_ITER = 1500) : data(data), K(K)
{

std::random_device rd;
std::mt19937_64 gen(rd());
std::uniform_real_distribution<double> distrib(std::numeric_limits<double>::min(), std::numeric_limits<double>::max());
for (int i = 0; i < K; i++)
{
double rand_num = distrib(gen);
this->cluster_centers.push_back(data[rand_num]);
}

for(int ww = 0; ww<MAX_ITER; ww++){
for(int64_t i = 0; i<data.size(); i++){
for (int ww = 0; ww < MAX_ITER; ww++)
{
for (int64_t i = 0; i < data.size(); i++)
{
assign_to_closest(data[i]);
}

std::vector<std::vector<std::vector<double> > > _clusters(K);
for(auto & x: assignments){
std::vector<std::vector<std::vector<double>>> _clusters(K);
for (auto &x : assignments)
{
_clusters[x.second].push_back(x.first);
}
std::vector<std::vector<double> > new_centroids;
for(int i = 0; i<K; i++){
std::vector<std::vector<double>> new_centroids;
for (int i = 0; i < K; i++)
{
new_centroids.push_back(get_centroid(_clusters[i]));
}

if(new_centroids == this->cluster_centers){
if (new_centroids == this->cluster_centers)
{
break;
}
else{
else
{
this->cluster_centers = new_centroids;
}
}
}

/**
* @brief Destroy the kmeans object
*/
~kmeans() {
~kmeans()
{
cluster_centers.clear();
assignments.clear();
}

/**
* @ brief assign_to_closest function
* assigns the closest centroid to point x
* @ param x: the input vector
*/
void assign_to_closest(std::vector<double> &x){
* @ brief assign_to_closest function
* assigns the closest centroid to point x
* @ param x: the input vector
*/
void assign_to_closest(std::vector<double> &x)
{
std::vector<double> id = this->cluster_centers[0];
int index = 0;
int min_dist = distance(x, id);
for(int j = 0; j<this->cluster_centers.size(); j++){
for (int j = 0; j < this->cluster_centers.size(); j++)
{
int current_dist = distance(this->cluster_centers[j], x);
if(current_dist < min_dist){
if (current_dist < min_dist)
{
min_dist = current_dist;
index = j;
id = this->cluster_centers[index];
}
}
assignments[x] = index;
}

/**
* @ brief get_centroid function
* @ param cluster: the input cluster
* @ return vector<int>: the centroid of the cluster
*/
std::vector<double> get_centroid(std::vector<std::vector<double> > cluster){
* @ brief get_centroid function
* @ param cluster: the input cluster
* @ return vector<int>: the centroid of the cluster
*/
std::vector<double> get_centroid(std::vector<std::vector<double>> cluster)
{
double sum_x = 0, sum_y = 0, n = cluster.size();
for(auto & x : cluster){
for (auto &x : cluster)
{
sum_x += x[0];
sum_y += x[1];
}
sum_x /= n;
sum_y /= n;
return {sum_x, sum_y};
}

/**
* @ brief run function
* @ return pair<vector<int64_t>, map<int64_t, int> >: the centroids and the assignments of the clustering algorithm
*
*/
std::pair<std::vector<std::vector<double> >, std::map<std::vector<double>, int64_t> > fit(){
* @ brief run function
* @ return pair<vector<int64_t>, map<int64_t, int> >: the centroids and the assignments of the clustering algorithm
*
*/
std::pair<std::vector<std::vector<double>>, std::map<std::vector<double>, int64_t>> fit()
{
return std::make_pair(cluster_centers, assignments);
}
};
Expand Down

0 comments on commit fb62936

Please sign in to comment.