Some part of Expectation Maximization code is as follows:
------------------------------------------------------------
/// <summary>
/// Run K-Mean untill clusters not change and centroid
converged
/// </summary>
public class EM
{
/// <summary>
/// A set of n dimentional points
/// </summary>
private DataSet _dataSet;
/// <summary>
/// Entry point of EM algorithm
/// </summary>
/// <param
name="dataSet"></param>
/// <param
name="K"></param>
/// <param
name="distanceMethod"></param>
/// <param
name="centroidMethod"></param>
public static void Run(DataSet dataSet, int K, int distanceMethod = 1, int centroidMethod = 1)
{
var clusters = new Cluster[K];
if (K > dataSet.DataPoints.Count)
{
throw new Exception("Clusters are more thn
data points choosen");
}
ChooseRandomCentroids(dataSet.DataPoints, clusters);
int _iteration = 1;
while (true)
{
Console.WriteLine("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
Console.WriteLine("@@@@@@@@@@@@
Iteration: {0}
@@@@@@@@@@@@@@@@", _iteration++);
Console.WriteLine("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
clusters.ToList().ForEach(c
=> c.ClusterPoints.Clear());
//
Step 1: Find Euclidean Distances
List<double[]> table = new List<double[]>();
foreach (var dataPoint in dataSet.DataPoints)
{
List<double> row =new List<double>();
foreach (var cluster in clusters)
{
var theDistance = DistanceMeasure.SquaredEuclideanDistance(dataPoint,
cluster.Centroid);
row.Add(theDistance);
}
table.Add(row.ToArray());
}
//
Step 2: E-Step
foreach (var row in table)
{
var sum = row.Sum();
//calculation step
var row2 = (double[]) row.Clone();
for (int j = 0; j < row.Length;
j++)
{
row2[row.Length-1-j] =
row[j] / sum;
}
//copy back
for (int j = 0; j < row.Length;
j++)
{
row[j] = row2[j];
}
}
//
Step 3: M-Step
foreach (var row in table)
{
for (int j = 0; j < row.Length;
j++)
{
row[j] *= row[j];
}
}
for (int i = 0; i < table[0].Length; i++)
{
for (int j = 0; j <
dataSet.DataPoints.Count; j++)
{
var dataPoint = DataPoint.Clone(dataSet.DataPoints[j]);
for (int k = 0; k <
dataPoint.Vector.Length; k++)
{
dataPoint.Vector[k]
*= table[j][i];
}
clusters[i].ClusterPoints.Add(dataPoint);
}
}
for (int i = 0; i < clusters.Length; i++)
{
var cluster = clusters[i];
var dimensions =
cluster.ClusterPoints[0].Dimensions;
var newCentroid = new DataPoint { Vector = new double[dimensions] };
double sum1 = 0, sum2 = 0;
for (int j = 0; j < table.Count;
j++)
{
sum1 += table[j][i];
}
for (int k = 0; k < dimensions;
k++)
{
sum2 =
cluster.ClusterPoints.Sum(x => x.Vector[k]);
newCentroid.Vector[k] =
sum2 / sum1;
}
cluster.OldCentroid =
cluster.Centroid;
cluster.Centroid =
newCentroid;
Console.Write(cluster.ToString());
}
bool allClustersConverged = true;
foreach (var cluster in clusters)
{
allClustersConverged =
allClustersConverged & cluster.IsConverged();
}
if (allClustersConverged)
break;
}
}
/// <summary>
/// Used to choose K (# of clusters) initial points
/// </summary>
/// <param
name="points"></param>
/// <param
name="clusters"></param>
private static void ChooseRandomCentroids(IList<DataPoint> points, Cluster[] clusters)
{
if (points.Count < 2)
{
throw new Exception("Points are
insufficient to start the algorithm");
}
if (clusters != null && clusters.Length < 2)
{
throw new Exception("Clusters are
insufficient to start the algorithm");
}
Random rand = new Random();
HashSet<int> results = new HashSet<int>();
while (results.Count < clusters.Length)
{
results.Add(rand.Next(1,
points.Count));
}
//randomly
choose first points and choose second which extream distance from first
for (int j = 0; j < results.Count; j++)
{
clusters[j] = new Cluster {Centroid =
points[j]};
}
}