python - Labeling K-means cluster data points with matplotlib -


I have pulled the following data from a .csv file (databoth.csv) and performed k-means clustering on it, plotting the result with matplotlib. The data has 3 columns (country, birth rate, life expectancy).

I need help producing this output: the number of countries belonging to each cluster, the list of countries belonging to each cluster, and the mean life expectancy and birth rate for each cluster.

Here is my code:

"""K-means clustering demo with a live matplotlib view of each iteration."""

import csv
import sys

import numpy as np

# NOTE: matplotlib is imported inside the functions that draw (the same way
# the original imported ``time`` inside ``show``), so the numeric helpers can
# be used without a plotting backend.  The original also did
# ``import pylab as plt`` right after ``import matplotlib.pyplot as plt``;
# that only rebound ``plt`` to an equivalent (and discouraged) namespace, so
# pyplot alone is used here.

# Plot styles for clusters 0, 1, 2, ...; the first three are the blue / red /
# green stars the script has always used.
_CLUSTER_STYLES = ('*b', '*r', '*g', '*c', '*y', '*k')


def _nearest_centroid(data, centroids):
    """Return, for every row of ``data``, the index of its closest centroid."""
    diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    # Squared Euclidean distance is enough for picking the minimum.
    return (diffs ** 2).sum(axis=2).argmin(axis=1)


# k-means clustering implementation
# data = set of data points
# k = number of clusters
# maxiters = maximum number of iterations executed by k-means
def kmeans(data, k, maxiters=10, plot_progress=None):
    """Cluster the rows of ``data`` into ``k`` groups.

    Args:
        data: 2-D array-like, one point per row.
        k: number of clusters; must satisfy ``1 <= k <= len(data)``.
        maxiters: number of assign/move iterations to run.
        plot_progress: optional callback invoked after every iteration as
            ``plot_progress(data, labels, centroids)``.

    Returns:
        ``(centroids, labels)`` where ``centroids`` has one row per cluster
        and ``labels[i]`` is the cluster index assigned to ``data[i]`` in the
        last assignment step.

    Raises:
        ValueError: if ``k`` is not between 1 and the number of points.
    """
    data = np.asarray(data, dtype=float)
    if not 1 <= k <= len(data):
        raise ValueError(
            "k must be between 1 and the number of data points "
            "(got k=%r for %d points)" % (k, len(data))
        )

    # Sample without replacement: drawing the same row twice would start two
    # identical centroids, one of which can never win a point.
    centroids = data[np.random.choice(len(data), k, replace=False)]
    # Defined up front so that maxiters=0 still returns valid labels.
    labels = _nearest_centroid(data, centroids)

    for _ in range(maxiters):
        # Cluster assignment step
        labels = _nearest_centroid(data, centroids)
        # Move centroids step.  An empty cluster keeps its previous centroid
        # instead of becoming NaN (the mean of zero rows).
        centroids = np.array([
            data[labels == j].mean(axis=0) if np.any(labels == j)
            else centroids[j]
            for j in range(k)
        ])
        if plot_progress is not None:
            plot_progress(data, labels, centroids)

    return centroids, labels


# Calculates the euclidean distance between each
# data point and the available cluster
# centroids.
def euclidean_dist(data, centroids, clusters):
    """Append every point of ``data`` to the cluster of its nearest centroid.

    Args:
        data: 2-D array-like, one point per row.
        centroids: sequence of centroid coordinates.
        clusters: container that is filled in place; either a dict mapping
            centroid index -> list of points (missing keys are created) or a
            list holding one list per centroid.

    Returns:
        The same ``clusters`` object.  Every centroid index ends up with at
        least one point as long as ``data`` is not empty.
    """
    data = np.asarray(data)
    is_mapping = isinstance(clusters, dict)

    for instance in data:
        distances = [np.linalg.norm(instance - mu) for mu in centroids]
        mu_index = int(np.argmin(distances))
        if is_mapping:
            clusters.setdefault(mu_index, []).append(instance)
        else:
            clusters[mu_index].append(instance)

    # If a cluster is empty, assign it one point from the
    # data set at random so as not to have empty
    # clusters and 0 means.
    if len(data):
        for index in range(len(centroids)):
            members = clusters.setdefault(index, []) if is_mapping else clusters[index]
            if not members:
                members.append(data[np.random.randint(0, len(data))])

    return clusters


# Function that reads the data from the specified file
def csvread(file):
    """Load a comma-separated file into a float array and return it.

    Args:
        file: path (or any other source ``numpy.genfromtxt`` accepts).

    Returns:
        The array produced by ``numpy.genfromtxt(file, delimiter=',')``.
    """
    # NOTE(review): with the default float dtype, fields that are not numbers
    # (e.g. a country-name column or a header row) are expected to come back
    # as NaN -- confirm against the actual layout of the data file.
    return np.genfromtxt(file, delimiter=',')


# Function to show the results on screen, one colour per cluster
def show(x, c, centroids, keep=False):
    """Redraw the current clustering on the active matplotlib axes.

    Args:
        x: 2-D array of points; columns 0 and 1 are plotted.
        c: array of cluster indices, one per row of ``x``.
        centroids: 2-D array of centroid coordinates.
        keep: when true, switch interactive mode off and block on the final
            figure; otherwise pause briefly so the frame is visible.
    """
    import matplotlib.pyplot as plt

    plt.cla()
    for j in range(len(centroids)):
        members = x[c == j]
        plt.plot(members[:, 0], members[:, 1],
                 _CLUSTER_STYLES[j % len(_CLUSTER_STYLES)])
    plt.plot(centroids[:, 0], centroids[:, 1], '*m', markersize=20)
    plt.draw()
    if keep:
        plt.ioff()
        plt.show()
    else:
        # pause() runs the GUI event loop for the half-second delay, so the
        # window actually repaints between iterations (time.sleep does not).
        plt.pause(0.5)


def main():
    """Run the demo: cluster three synthetic blobs and animate the result."""
    import matplotlib.pyplot as plt

    plt.ion()

    # The CSV is loaded as in the original script, but the demo below still
    # clusters the synthetic points, not this array.
    data = csvread('databoth.csv')

    # generate 3 cluster data
    # NOTE(review): cov1 and cov3 are not symmetric, so they are not valid
    # covariance matrices; values are kept as they were -- confirm intent.
    m1, cov1 = [9, 8], [[1.5, 2], [1, 2]]
    m2, cov2 = [5, 13], [[2.5, -1.5], [-1.5, 1.5]]
    m3, cov3 = [3, 7], [[0.25, 0.5], [-0.1, 0.5]]
    data1 = np.random.multivariate_normal(m1, cov1, 250)
    data2 = np.random.multivariate_normal(m2, cov2, 180)
    data3 = np.random.multivariate_normal(m3, cov3, 100)
    x = np.vstack((data1, data2, data3))
    np.random.shuffle(x)

    # calls to functions
    # first find the centroids using k-means
    centroids, c = kmeans(x, k=3, plot_progress=show)
    # second show the centroids on a graph
    show(x, c, centroids, True)


if __name__ == "__main__":
    main()

Maybe you can use annotate: http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.annotate

More examples: http://matplotlib.org/users/annotations.html#plotting-guide-annotation

This will allow you to have a text label near each point.

Or you can use colours, as in this post.


Comments

Popular posts from this blog

ios - Change Storyboard View using Seague -

commonjs - How to write a typescript definition file for a node module that exports a function? -

openid - Okta: Failed to get authorization code through API call -