python - Labeling K-means cluster data points with matplotlib -
I have pulled the following data from a .csv file (databoth.csv) and performed k-means clustering on it, plotting the result with matplotlib. The data has 3 columns (country, birth rate, life expectancy).
I need the following output: the number of countries belonging to each cluster, the list of countries belonging to each cluster, and the mean life expectancy and birth rate for each cluster.
Here is my code:
import csv
import sys
import time

import numpy as np

# Marker/colour format strings used by show(), one entry per cluster index.
_CLUSTER_STYLES = ('*b', '*r', '*g')


def _nearest_centroid(data, centroids):
    """Return, for every row of ``data``, the index of the closest centroid.

    Closeness is the squared Euclidean distance; ties go to the lowest
    centroid index (``argmin`` semantics).
    """
    diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return (diffs ** 2).sum(axis=2).argmin(axis=1)


def kmeans(data, k, maxiters=10, plot_progress=None):
    """Cluster ``data`` into ``k`` groups with Lloyd's k-means algorithm.

    Args:
        data: 2-D array-like, one row per data point.
        k: number of clusters.
        maxiters: number of centroid-update iterations to run.
        plot_progress: optional callback invoked after every centroid update
            as ``plot_progress(data, labels, centroids)``.

    Returns:
        ``(centroids, labels)`` where ``centroids`` is a ``(k, n_features)``
        array and ``labels[i]`` is the index of the centroid closest to
        ``data[i]``.

    Raises:
        ValueError: if ``k`` is larger than the number of data points.
    """
    data = np.asarray(data, dtype=float)

    # Sample without replacement so two clusters never start on the same row.
    start = np.random.choice(len(data), k, replace=False)
    centroids = data[start, :]
    labels = _nearest_centroid(data, centroids)

    for _ in range(maxiters):
        # Move-centroids step. A cluster that lost all of its points keeps
        # its previous centroid instead of becoming the NaN mean of nothing.
        updated = centroids.copy()
        for index in range(k):
            members = data[labels == index]
            if len(members):
                updated[index] = members.mean(axis=0)
        centroids = updated

        if plot_progress is not None:
            plot_progress(data, labels, centroids)

        # Cluster-assignment step against the freshly moved centroids, so the
        # labels returned at the end always match the returned centroids.
        labels = _nearest_centroid(data, centroids)

    return centroids, labels


def euclidean_dist(data, centroids, clusters):
    """Append every data point to the bucket of its nearest centroid.

    Args:
        data: 2-D array-like, one row per data point.
        centroids: 2-D array-like, one row per centroid.
        clusters: container that is filled in place. Either a dict mapping
            centroid index -> list of points (missing keys are created), or
            a list holding one list per centroid.

    Returns:
        The same ``clusters`` object, with no bucket left empty.
    """
    data = np.asarray(data)
    centroids = np.asarray(centroids)
    is_mapping = isinstance(clusters, dict)

    for instance in data:
        mu_index = int(np.argmin(np.linalg.norm(centroids - instance, axis=1)))
        if is_mapping:
            clusters.setdefault(mu_index, []).append(instance)
        else:
            clusters[mu_index].append(instance)

    # If a cluster is empty, give it one randomly chosen data point so that
    # there are no empty clusters (and therefore no undefined means).
    if is_mapping:
        buckets = [clusters.setdefault(i, []) for i in range(len(centroids))]
    else:
        buckets = clusters
    for bucket in buckets:
        if not bucket:
            bucket.append(data[np.random.randint(0, len(data))])
    return clusters


def csvread(file):
    """Read a comma-separated file and return its contents as a float array.

    ``file`` is passed straight to ``numpy.genfromtxt``. Fields that cannot
    be parsed as numbers (for example a column of country names) come back
    as NaN.
    """
    return np.genfromtxt(file, delimiter=',')


def show(x, c, centroids, keep=False):
    """Draw the points of the first three clusters and the centroids.

    Args:
        x: 2-D array of points; columns 0 and 1 are plotted.
        c: array of cluster labels, one per row of ``x``.
        centroids: 2-D array of centroids; columns 0 and 1 are plotted.
        keep: when true, leave interactive mode and block on the window.
    """
    # Imported lazily so the clustering functions can be used without
    # matplotlib being installed.
    import matplotlib.pyplot as plt

    time.sleep(0.5)  # slow the animation down enough to follow it
    plt.cla()
    for index, style in enumerate(_CLUSTER_STYLES):
        members = x[c == index]
        plt.plot(members[:, 0], members[:, 1], style)
    plt.plot(centroids[:, 0], centroids[:, 1], '*m', markersize=20)
    plt.draw()
    if keep:
        plt.ioff()
        plt.show()


def main():
    """Run the demo: generate three blobs, cluster them and plot the result."""
    import matplotlib.pyplot as plt

    plt.ion()

    # NOTE(review): the CSV is loaded here but the clustering below runs on
    # synthetic data; pass the numeric columns of `data` to kmeans() to
    # cluster the real data set instead.
    data = csvread('databoth.csv')

    # Generate 3-cluster data.
    # NOTE(review): cov1 and cov3 are not symmetric, although
    # numpy.random.multivariate_normal documents a symmetric matrix - confirm.
    m1, cov1 = [9, 8], [[1.5, 2], [1, 2]]
    m2, cov2 = [5, 13], [[2.5, -1.5], [-1.5, 1.5]]
    m3, cov3 = [3, 7], [[0.25, 0.5], [-0.1, 0.5]]
    data1 = np.random.multivariate_normal(m1, cov1, 250)
    data2 = np.random.multivariate_normal(m2, cov2, 180)
    data3 = np.random.multivariate_normal(m3, cov3, 100)
    x = np.vstack((data1, data2, data3))
    np.random.shuffle(x)

    # First find the centroids using k-means ...
    centroids, c = kmeans(x, k=3, plot_progress=show)
    # ... then show the final centroids on the graph.
    show(x, c, centroids, True)


if __name__ == "__main__":
    main()
Maybe you can use annotate
: http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.annotate
More examples: http://matplotlib.org/users/annotations.html#plotting-guide-annotation
This allows you to have a text label near each point.
Or you can use colours, as in your post.
Comments
Post a Comment