We will illustrate unsupervised learning with implementations of mixture of gaussians (an instance of EM) and k-means. K-means approaches the problem from a geometric perspective (at each iteration, a data point is assigned to one and only one cluster, the closest one), while mixture of gaussians does so from a probabilistic perspective (a data point belongs to all clusters, with a given probability):
#em_imp.py
from numpy import *
import sys
def gen_data(sources,k,m):
save3Dplot('sources',sources)
kdata=[zeros((m/k,2)) for i in range(k)]
for i in range(k):
for j in range(2):
mu=sources[i][j]['mu']
sd=sources[i][j]['sd']
kdata[i][:,j]=random.normal(mu,sd,m/k)
saveplot('X'+str(i),kdata[i],' ')
data=kdata[0]
for i in range(1,k):
data=concatenate((data,kdata[i]))
random.shuffle(data)
return data
def saveplot(fname,data,sep):
m,n=data.shape
fp=file(fname,'w')
for i in range(m):
for j in range(n):
fp.write(str(data[i,j])+sep)
fp.write('\n')
fp.close()
def plot(name,clusters,centroids,X):
saveplot(name+'centroids',centroids,' ')
k=shape(centroids)[0]
for i in range(k):
data=X.take(where(clusters==i)[0],0)
saveplot(name+'data'+str(i),data,' ')
def dnorm(X,muvec,covmat):
m=shape(X)[0]
probs=zeros((m,1))
dt=linalg.det(covmat)
A=1./((2*pi)*sqrt(dt))
covinv=linalg.inv(covmat)
for i in range(m):
t=X[i]-muvec
temp=-.5*dot(dot(t,covinv),transpose(t))
probs[i,0]=A*exp(temp)
return probs
def save3Dplot(fname,sources):
for src in range(len(sources)):
mu1=sources[src][0]['mu']
sd1=sources[src][0]['sd']
mu2=sources[src][1]['mu']
sd2=sources[src][1]['sd']
covmat=zeros((2,2))
covmat[0,0]=sd1**2
covmat[1,1]=sd2**2
avg=array([mu1,mu2])
data=zeros((100,100))
for i in range(100):
for j in range(100):
xvec=array([i,j])/10.
data[i,j]=dnorm(xvec.reshape(1,2),avg,covmat)
saveplot(fname+str(src),data,'\n')
def kmeans(X,k):
m=shape(X)[0]
n=shape(X)[1]
oldcentroids=zeros((k,n))
e=1
#pick actual points at random to be initial centroids
centroids=X[(random.ranf((k,1))*m).astype('int')]
centroids=centroids.reshape(k,n)
clusters=zeros((m,1),'int')
#repeat until convergence
while linalg.norm(oldcentroids-centroids)>1e-15:
oldcentroids=centroids+0 #copy
#compute cluster memberships
for i in range(m):
#repeat X[i,:] k times
dpoints=kron(ones((k,1)),X[i,:])
dists=sum((dpoints-centroids)**2,1).reshape(k,1)
clusters[i,0]=argmin(dists)
#compute new cluster centroids
for i in range(k):
centroids[i,:]=mean(X.take(where(clusters==i)[0],0),0)
print 'iter: %d, norm: %s'%(e,linalg.norm(oldcentroids-centroids))
e+=1
#compute final cluster memberships
for i in range(m):
dpoints=kron(ones((k,1)),X[i,:])
dists=sum((dpoints-centroids)**2,1).reshape(k,1)
clusters[i,0]=argmin(dists)
return clusters,centroids
def MoG(X,k,kclusters,kcentroids):
m=shape(X)[0]
n=shape(X)[1]
W=zeros((m,k))
lastll=sys.maxint
ll=0
e=1
#first guess parameters (using results from kmeans)
priors=zeros((k,1))
muvecs=zeros((k,n))
covmats=[zeros((n,n)) for i in range(k)]
for i in range(k):
priors[i,0]=len(X.take(where(kclusters==i)[0],0))/float(m)
covmats[i]=cov(transpose(X.take(where(kclusters==i)[0],0)))
muvecs[i]=kcentroids[i]
#repeat until convergence
while abs(lastll-ll)>1e-5 and lastll>ll:
#e-step: compute w's (probability of membership in each k)
for i in range(k):
#posterior~likelihood*prior
W[:,i:i+1]=dnorm(X,muvecs[i,:],covmats[i])*priors[i,0]
#normalize
W=W/W.sum(1).reshape(m,1)
#m-step: update parameters based on last membership
for i in range(k):
priors[i,0]=sum(W[:,i])/float(m)
muvecs[i,:]=sum(W[:,i:i+1]*X,0).reshape(1,n)/sum(W[:,i])
for j in range(n):
for l in range(n):
weight=W[:,i:i+1]
diff1=(X[:,j]-muvecs[i,j]).reshape(m,1)
diff2=(X[:,l]-muvecs[i,l]).reshape(m,1)
covmats[i][j,l]=sum(weight*diff1*diff2)/sum(W[:,i])
#compute log-likelihood
ll1=zeros((m,1))
ll2=W.argmax(1).reshape(m,1)
for i in range(m):
ll1[i,:]=dnorm(X[i].reshape(1,n),muvecs[ll2[i],:],covmats[ll2[i]])
ll2[i,0]=priors[ll2[i]]
ll1=(1e-15>ll1).choose(ll1,1e-15)
ll2=(1e-15>ll2).choose(ll2,1e-15)
lastll=ll
ll=sum(log(ll1)+log(ll2))
print 'iter: %d, log-likelihood: %s'%(e,ll)
e+=1
#compute final membership
clusters=W.argmax(1).reshape(m,1)
return clusters,muvecs
Driven with:
#em_inter.py
from em_imp import *
import os
#build your [bivariate] gaussians
sources=[[{'mu': 7.0, 'sd': 0.7}, {'mu': 7.0, 'sd': 0.7}],
[{'mu': 3.0, 'sd': 0.7}, {'mu': 7.0, 'sd': 0.7}],
[{'mu': 5.0, 'sd': 1.0}, {'mu': 4.0, 'sd': 1.0}]]
k=len(sources) #number of gaussians
m=1500*k #number of data points
random.seed(123456) #for repeatability
#sample data from sources
X=gen_data(sources,k,m)
#save files for data plot
saveplot('X',X,' ')
print '\nkmeans:'
#cluster data using k-means
kclusters,kcentroids=kmeans(X,3)
#save files for kmeans plot
plot('k',kclusters,kcentroids,X)
print '\nem:'
#cluster data using MoG
MoGclusters,MoGcentroids=MoG(X,3,kclusters,kcentroids)
#save files for MoG plot
plot('MoG',MoGclusters,MoGcentroids,X)
print '\nkcentroids:'
print kcentroids
print '\nMoGCentroids:'
print MoGcentroids
#run gnuplot script
os.system('gnuplot plotinst')
Using the following gnuplot script:
#plotinst
set term gif
set output 'source_contours.gif'
unset surface
set contour
set view 0,0,1,1
splot 'sources0','sources1','sources2'
set output 'data_color.gif'
plot 'X0' using 2:1, 'X1' using 2:1, 'X2' using 2:1
set output 'data_nocolor.gif'
plot 'X' using 2:1
set output 'sources3d.gif'
set hidden3d
set surface
set view 60,25,1,1
splot 'sources0','sources1','sources2'
set output 'kdata_color.gif'
plot 'kdata0' using 2:1, 'kdata1' using 2:1, 'kdata2' using 2:1, 'kcentroids' using 2:1 pointtype 7 pointsize 2
set output 'MoGdata_color.gif'
plot 'MoGdata0' using 2:1, 'MoGdata1' using 2:1, 'MoGdata2' using 2:1, 'MoGcentroids' using 2:1 pointtype 7 pointsize 2
set output
set term x11
We get the following output:
kmeans:
iter: 1, norm: 1.49829818106
iter: 2, norm: 0.143890700032
iter: 3, norm: 0.0288887879026
iter: 4, norm: 0.00466759639044
iter: 5, norm: 0.00179109984505
iter: 6, norm: 0.0
em:
iter: 1, log-likelihood: -165991.331942
iter: 2, log-likelihood: -165986.870227
kcentroids:
[[ 4.96865803 3.83410557]
[ 3.03335108 6.95460482]
[ 6.9650538 6.94975746]]
MoGCentroids:
[[ 4.97013245 3.91434377]
[ 3.02119885 6.98255096]
[ 6.96649683 6.97264637]]
And also the following plots:
- Contours of the bivariate gaussians sources of data to be clustered:

- Data generated from the gaussians:

- Data generated from the gaussians (no color):

- 3-D view of the gaussians:

- Data as clustered by the k-means algorithm:

- Data as clustered by the EM-mixture of gaussians algorithm (initialized with the results of k-means):
0 comments:
Post a Comment