Thursday, May 7, 2015

K-Means and Expectation Maximization (EM) implemented on n-Dimensional UCI dataset

Please find my full source code from the link.... K-Means and EM project with UCI sample

Some part of Expectation Maximization code is as follows:
------------------------------------------------------------ 

 /// <summary>
    /// Run K-Mean untill clusters not change and centroid converged
    /// </summary>
    public class EM
    {
        /// <summary>
        /// A set of n dimentional points
        /// </summary>
        private DataSet _dataSet;

        /// <summary>
        /// Entry point of EM algorithm
        /// </summary>
        /// <param name="dataSet"></param>
        /// <param name="K"></param>
        /// <param name="distanceMethod"></param>
        /// <param name="centroidMethod"></param>
        public static void Run(DataSet dataSet, int K, int distanceMethod = 1, int centroidMethod = 1)
        {
            var clusters = new Cluster[K];

            if (K > dataSet.DataPoints.Count)
            {
                throw new Exception("Clusters are more thn data points choosen");
            }

            ChooseRandomCentroids(dataSet.DataPoints, clusters);

            int _iteration = 1;

            while (true)
            {
                Console.WriteLine("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
                Console.WriteLine("@@@@@@@@@@@@ Iteration: {0} @@@@@@@@@@@@@@@@", _iteration++);
                Console.WriteLine("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");

                clusters.ToList().ForEach(c => c.ClusterPoints.Clear());

                // Step 1: Find Euclidean Distances
                List<double[]> table = new List<double[]>();
                foreach (var dataPoint in dataSet.DataPoints)
                {
                    List<double> row =new List<double>();

                    foreach (var cluster in clusters)
                    {

                        var theDistance = DistanceMeasure.SquaredEuclideanDistance(dataPoint, cluster.Centroid);
                        row.Add(theDistance);
                    }

                    table.Add(row.ToArray());

                }

                // Step 2: E-Step
                foreach (var row in table)
                {
                    var sum = row.Sum();
                    //calculation step
                    var row2 = (double[]) row.Clone();
                    for (int j = 0; j < row.Length; j++)
                    {
                        row2[row.Length-1-j] = row[j] / sum;
                    }
                    //copy back
                    for (int j = 0; j < row.Length; j++)
                    {
                        row[j] = row2[j];
                    }
                   
                }

                // Step 3: M-Step
                foreach (var row in table)
                {
                    for (int j = 0; j < row.Length; j++)
                    {
                        row[j] *= row[j];
                    }
                }

                for (int i = 0; i < table[0].Length; i++)
                {
                    for (int j = 0; j < dataSet.DataPoints.Count; j++)
                    {
                        var dataPoint = DataPoint.Clone(dataSet.DataPoints[j]);

                        for (int k = 0; k < dataPoint.Vector.Length; k++)
                        {
                            dataPoint.Vector[k] *= table[j][i];
                        }
                        clusters[i].ClusterPoints.Add(dataPoint);
                    }
                }

                for (int i = 0; i < clusters.Length; i++)
                {
                    var cluster = clusters[i];
                    var dimensions = cluster.ClusterPoints[0].Dimensions;
                    var newCentroid = new DataPoint { Vector = new double[dimensions] };


                    double sum1 = 0, sum2 = 0;
                    for (int j = 0; j < table.Count; j++)
                    {
                        sum1 += table[j][i];
                    }

                    for (int k = 0; k < dimensions; k++)
                    {
                        sum2 = cluster.ClusterPoints.Sum(x => x.Vector[k]);

                        newCentroid.Vector[k] = sum2 / sum1;
                    }


                    cluster.OldCentroid = cluster.Centroid;
                    cluster.Centroid = newCentroid;

                      Console.Write(cluster.ToString());

                }

                bool allClustersConverged = true;
                foreach (var cluster in clusters)
                {
                    allClustersConverged = allClustersConverged & cluster.IsConverged();
                }

                if (allClustersConverged)
                    break;

            }
        }

        /// <summary>
        /// Used to choose K (# of clusters) initial points
        /// </summary>
        /// <param name="points"></param>
        /// <param name="clusters"></param>
        private static void ChooseRandomCentroids(IList<DataPoint> points, Cluster[] clusters)
        {
            if (points.Count < 2)
            {
                throw new Exception("Points are insufficient to start the algorithm");
            }
            if (clusters != null && clusters.Length < 2)
            {
                throw new Exception("Clusters are insufficient to start the algorithm");
            }

            Random rand = new Random();
            HashSet<int> results = new HashSet<int>();
            while (results.Count < clusters.Length)
            {
                results.Add(rand.Next(1, points.Count));
            }

            //randomly choose first points and choose second which extream distance from first
            for (int j = 0; j < results.Count; j++)
            {
                clusters[j] = new Cluster {Centroid = points[j]};
            }

        }