|
|
import random
|
|
|
import math
|
|
|
import sys
|
|
|
import statistics
|
|
|
|
|
|
def mahattan_distance_one_dimension(x1,x2):
|
|
|
return abs(x1-x2)
|
|
|
|
|
|
def minkowski_distance(x1, x2, power):
|
|
|
return ((abs(x1[0] - x2[0]))**power + (abs(x1[1] - x2[1])**power)) **(1.0/power)
|
|
|
|
|
|
def chebyshev_distance(x1, x2, power=0):
|
|
|
return max(abs(x1[0]-x2[0]),abs(x1[1]-x2[1]))
|
|
|
|
|
|
one_dimensional_data_points = [5,13,4,6,15,13,32,14,6,10,12,31,12,41,13]
|
|
|
data_points = [(1,7),(1,-11),(3,17),(7,18),(-8,4),(8,12),(11,-7),(12,14),(13,71),(-16,11),(13,1),(-9,2),(-5,3),(0,12)]
|
|
|
|
|
|
def k_means_clustering(K,func,power=0):
|
|
|
print("==============================")
|
|
|
print("K: {0}".format(K))
|
|
|
|
|
|
|
|
|
C = random.sample(data_points,K)
|
|
|
|
|
|
|
|
|
iteration = 1000000
|
|
|
|
|
|
|
|
|
min_decrease_sse = 1e-5
|
|
|
|
|
|
|
|
|
previous_sse = float("inf")
|
|
|
current_iteration = 0
|
|
|
|
|
|
while(True):
|
|
|
current_iteration +=1
|
|
|
current_sse = 0.0
|
|
|
|
|
|
|
|
|
cms_dict = {}
|
|
|
for c in C:
|
|
|
cms_dict[c] = []
|
|
|
|
|
|
for x in data_points:
|
|
|
max_dist = float("inf")
|
|
|
closest = ()
|
|
|
|
|
|
for c in C:
|
|
|
d= func(x,c,power)
|
|
|
if(d <= max_dist):
|
|
|
max_dist = d
|
|
|
closest = c
|
|
|
|
|
|
|
|
|
cms_dict[closest].append(x)
|
|
|
|
|
|
|
|
|
C = []
|
|
|
for cm in cms_dict:
|
|
|
cm_total_distance = 0.0
|
|
|
new_m_x = 0.0
|
|
|
new_m_y = 0.0
|
|
|
|
|
|
|
|
|
for x in cms_dict[cm]:
|
|
|
new_m_x += x[0]
|
|
|
new_m_y += x[1]
|
|
|
new_m_x /= len(cms_dict[cm])
|
|
|
new_m_y /= len(cms_dict[cm])
|
|
|
C.append((new_m_x,new_m_y))
|
|
|
|
|
|
|
|
|
for x in cms_dict[cm]:
|
|
|
cm_total_distance += func(x,(new_m_x,new_m_y),power)**2
|
|
|
current_sse += cm_total_distance
|
|
|
|
|
|
|
|
|
if(previous_sse - current_sse <= min_decrease_sse or current_iteration > iteration):
|
|
|
print("Final SSE: {0}".format(current_sse))
|
|
|
print("Final Iteration: {0}".format(current_iteration))
|
|
|
i = 0
|
|
|
silhouetee_coefficent = 0.0
|
|
|
|
|
|
for cm in cms_dict:
|
|
|
i+=1
|
|
|
abs = []
|
|
|
if(len(cms_dict[cm]) != 1):
|
|
|
if(K > 1):
|
|
|
m = 0
|
|
|
for xi in cms_dict[cm]:
|
|
|
total_distance = 0
|
|
|
for xj in cms_dict[cm]:
|
|
|
if(xi != xj):
|
|
|
total_distance += func(xi,xj,power)
|
|
|
ai = total_distance//(len(cms_dict[cm])-1)
|
|
|
bi = None
|
|
|
for cm2 in cms_dict:
|
|
|
if( cm !=cm2 ):
|
|
|
total_distance = 0
|
|
|
for xj in cms_dict[cm2]:
|
|
|
total_distance += func(xi,xj,power)
|
|
|
average = total_distance//len(cms_dict[cm2])
|
|
|
if(bi is None):
|
|
|
bi = average
|
|
|
else:
|
|
|
if(average < bi ):
|
|
|
bi = average
|
|
|
si = float(bi - ai) / max(ai,bi)
|
|
|
silhouetee_coefficent += si
|
|
|
dict ={}
|
|
|
dict["a{0}".format(m)] = ai
|
|
|
dict["b{0}".format(m)] = bi
|
|
|
dict["s{0}".format(m)] = si
|
|
|
abs.append(dict)
|
|
|
m+=1
|
|
|
else:
|
|
|
dict ={}
|
|
|
dict["a0"] = "Undefined"
|
|
|
dict["b0"] = "Undefined"
|
|
|
dict["s0"] = "0 by defintion"
|
|
|
abs.append(dict)
|
|
|
print("Cluster {0}: {1}".format(i,cms_dict[cm]))
|
|
|
print(abs)
|
|
|
print("------------------------------------------------")
|
|
|
if( K > 1):
|
|
|
print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent/len(data_points)))
|
|
|
else:
|
|
|
print("Silhouetee Coefficent is not defined for K = 1")
|
|
|
break
|
|
|
else:
|
|
|
print("Current SSE: {0}".format(current_sse))
|
|
|
previous_sse = current_sse
|
|
|
|
|
|
|
|
|
def k_median_clustering(K):
|
|
|
print("==============================")
|
|
|
print("K: {0}".format(K))
|
|
|
|
|
|
|
|
|
C = random.sample(one_dimensional_data_points,K)
|
|
|
|
|
|
|
|
|
iteration = 1000000
|
|
|
|
|
|
|
|
|
min_decrease_se = 1e-5
|
|
|
|
|
|
|
|
|
previous_se = float("inf")
|
|
|
current_iteration = 0
|
|
|
|
|
|
while(True):
|
|
|
current_iteration +=1
|
|
|
current_se = 0.0
|
|
|
|
|
|
|
|
|
cms_dict = {}
|
|
|
for c in C:
|
|
|
cms_dict[c] = []
|
|
|
|
|
|
for x in one_dimensional_data_points:
|
|
|
max_dist = float("inf")
|
|
|
closest = None
|
|
|
|
|
|
for c in C:
|
|
|
d = mahattan_distance_one_dimension(x,c,)
|
|
|
if(d <= max_dist):
|
|
|
max_dist = d
|
|
|
closest = c
|
|
|
|
|
|
|
|
|
cms_dict[closest].append(x)
|
|
|
|
|
|
|
|
|
C = []
|
|
|
for cm in cms_dict:
|
|
|
cm_total_distance = 0.0
|
|
|
new_m_x = 0.0
|
|
|
|
|
|
|
|
|
new_m_x = statistics.median(cms_dict[cm])
|
|
|
C.append(new_m_x)
|
|
|
|
|
|
|
|
|
for x in cms_dict[cm]:
|
|
|
cm_total_distance += mahattan_distance_one_dimension(x,(new_m_x))
|
|
|
current_se += cm_total_distance
|
|
|
|
|
|
|
|
|
if(previous_se - current_se <= min_decrease_se or current_iteration > iteration):
|
|
|
print("Final SSE: {0}".format(current_se))
|
|
|
print("Final Iteration: {0}".format(current_iteration))
|
|
|
i = 0
|
|
|
silhouetee_coefficent = 0.0
|
|
|
|
|
|
for cm in cms_dict:
|
|
|
i+=1
|
|
|
abs = []
|
|
|
if(len(cms_dict[cm]) != 1):
|
|
|
if(K > 1):
|
|
|
m = 0
|
|
|
for xi in cms_dict[cm]:
|
|
|
total_distance = 0
|
|
|
for xj in cms_dict[cm]:
|
|
|
if(xi != xj):
|
|
|
total_distance += mahattan_distance_one_dimension(xi,xj)
|
|
|
ai = total_distance//(len(cms_dict[cm])-1)
|
|
|
bi = None
|
|
|
for cm2 in cms_dict:
|
|
|
if( cm !=cm2 ):
|
|
|
total_distance = 0
|
|
|
for xj in cms_dict[cm2]:
|
|
|
total_distance += mahattan_distance_one_dimension(xi,xj)
|
|
|
average = total_distance//len(cms_dict[cm2])
|
|
|
if(bi is None):
|
|
|
bi = average
|
|
|
else:
|
|
|
if(average < bi ):
|
|
|
bi = average
|
|
|
si = float(bi - ai) / max(ai,bi)
|
|
|
silhouetee_coefficent += si
|
|
|
dict ={}
|
|
|
dict["a{0}".format(m)] = ai
|
|
|
dict["b{0}".format(m)] = bi
|
|
|
dict["s{0}".format(m)] = si
|
|
|
abs.append(dict)
|
|
|
m+=1
|
|
|
else:
|
|
|
dict ={}
|
|
|
dict["a0"] = "Undefined"
|
|
|
dict["b0"] = "Undefined"
|
|
|
dict["s0"] = "0 by defintion"
|
|
|
abs.append(dict)
|
|
|
print("Cluster {0}: {1}, Median: {2}".format(i,cms_dict[cm],statistics.median(cms_dict[cm])))
|
|
|
print(abs)
|
|
|
print("------------------------------------------------")
|
|
|
if( K > 1):
|
|
|
print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent / len(data_points)))
|
|
|
else:
|
|
|
print("Silhouetee Coefficent is not defined for K = 1")
|
|
|
break
|
|
|
else:
|
|
|
print("Current SSE: {0}".format(current_se))
|
|
|
previous_se = current_se
|
|
|
|
|
|
def k_medoids_clustering(K,func,power=0):
|
|
|
print("==============================")
|
|
|
print("K: {0}".format(K))
|
|
|
|
|
|
|
|
|
C = random.sample(data_points,K)
|
|
|
|
|
|
|
|
|
iteration = 1000000
|
|
|
|
|
|
|
|
|
min_decrease_sse = 1e-5
|
|
|
|
|
|
|
|
|
previous_sse = float("inf")
|
|
|
current_iteration = 0
|
|
|
|
|
|
while(True):
|
|
|
current_iteration +=1
|
|
|
current_sse = 0.0
|
|
|
|
|
|
|
|
|
cms_dict = {}
|
|
|
for c in C:
|
|
|
cms_dict[c] = []
|
|
|
|
|
|
for x in data_points:
|
|
|
max_dist = float("inf")
|
|
|
closest = ()
|
|
|
|
|
|
for c in C:
|
|
|
d= func(x,c,power)
|
|
|
if(d <= max_dist):
|
|
|
max_dist = d
|
|
|
closest = c
|
|
|
|
|
|
|
|
|
cms_dict[closest].append(x)
|
|
|
|
|
|
|
|
|
C = []
|
|
|
for m in cms_dict:
|
|
|
max_dist = float("inf")
|
|
|
new_c = None
|
|
|
|
|
|
for x1 in cms_dict[m]:
|
|
|
cm_total_distance = 0.0
|
|
|
for x2 in cms_dict[m]:
|
|
|
cm_total_distance += func(x1, x2, power)
|
|
|
if(cm_total_distance <= max_dist):
|
|
|
max_dist = cm_total_distance
|
|
|
new_c = x1
|
|
|
C.append(new_c)
|
|
|
|
|
|
|
|
|
cm_total_distance = 0.0
|
|
|
for x in cms_dict[m]:
|
|
|
cm_total_distance += func(x,new_c,power)**2
|
|
|
current_sse += cm_total_distance
|
|
|
|
|
|
|
|
|
if(previous_sse - current_sse <= min_decrease_sse or current_iteration > iteration):
|
|
|
print("Final SSE: {0}".format(current_sse))
|
|
|
print("Final Iteration: {0}".format(current_iteration))
|
|
|
silhouetee_coefficent = 0.0
|
|
|
i = 0
|
|
|
|
|
|
for cm in cms_dict:
|
|
|
i+=1
|
|
|
abs = []
|
|
|
if(len(cms_dict[cm]) != 1):
|
|
|
if(K > 1):
|
|
|
m = 0
|
|
|
for xi in cms_dict[cm]:
|
|
|
total_distance = 0
|
|
|
for xj in cms_dict[cm]:
|
|
|
if(xi != xj):
|
|
|
total_distance += func(xi,xj,power)
|
|
|
ai = total_distance//(len(cms_dict[cm])-1)
|
|
|
bi = None
|
|
|
for cm2 in cms_dict:
|
|
|
if( cm !=cm2 ):
|
|
|
total_distance = 0
|
|
|
for xj in cms_dict[cm2]:
|
|
|
total_distance += func(xi,xj,power)
|
|
|
average = total_distance//len(cms_dict[cm2])
|
|
|
if(bi is None):
|
|
|
bi = average
|
|
|
else:
|
|
|
if(average < bi ):
|
|
|
bi = average
|
|
|
si = float(bi - ai) / max(ai,bi)
|
|
|
silhouetee_coefficent += si
|
|
|
dict ={}
|
|
|
dict["a{0}".format(m)] = ai
|
|
|
dict["b{0}".format(m)] = bi
|
|
|
dict["s{0}".format(m)] = si
|
|
|
abs.append(dict)
|
|
|
m+=1
|
|
|
else:
|
|
|
dict ={}
|
|
|
dict["a0"] = "Undefined"
|
|
|
dict["b0"] = "Undefined"
|
|
|
dict["s0"] = "0 by defintion"
|
|
|
abs.append(dict)
|
|
|
print("Cluster {0}: {1}".format(i,cms_dict[cm]))
|
|
|
print(abs)
|
|
|
print("------------------------------------------------")
|
|
|
if( K > 1):
|
|
|
print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent/len(data_points)))
|
|
|
else:
|
|
|
print("Silhouetee Coefficent is not defined for K = 1")
|
|
|
break
|
|
|
|
|
|
else:
|
|
|
print("Current SSE: {0}".format(current_sse))
|
|
|
previous_sse = current_sse
|
|
|
|
|
|
|
|
|
def main(argv):
|
|
|
algo = {
|
|
|
"euclidean": lambda k,pow: k_means_clustering(k,minkowski_distance,2),
|
|
|
"minkowski": lambda k,pow: k_means_clustering(k,minkowski_distance,pow),
|
|
|
"chebyshev": lambda k,pow,: k_means_clustering(k,chebyshev_distance),
|
|
|
"median": lambda k,pow: k_median_clustering(k),
|
|
|
"medoids": lambda k,pow: k_medoids_clustering(k,minkowski_distance,pow)
|
|
|
}
|
|
|
for k in range(1,int(argv[1])+1):
|
|
|
algo[str(argv[0])](k,float(argv[1]) if (len(argv) > 2 and argv[2].replace('.','',1).isdigit()) else 2 )
|
|
|
print("===========================================================================")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
if(len(sys.argv) == 1):
|
|
|
print("Missing Arugments")
|
|
|
else:
|
|
|
main(sys.argv[1:]) |