You can translate the content of this page by selecting a language in the select box. But I suggest that the best display would be in English.
Fall, 2022

Text Clusting
With EM Algorithm

With the provided dataset, cluster the documents into n topics by fitting to multinomial distribution model, use the EM algorithm for converging the parameters.

To Project GitHub Page

    mycluster.py : file contaning the EM algorithm.

  • In this project we are categorizing the documents into 4 clusters.
  • Each document is vectorized into a vector using bag-of-word with length V (vocabulary size), collaboratively they form the matrx T.
  • Here the initialization assumes equal probabilities for matrix Πc= 1/size(T), because EM algorithm is not gauranteed to converge to a global minimum, different initializations more suitable to the dataset is encouraged when applied to different cases.
  • After initialization, we compute the expectation for Πc that maximizes the posterior probabilities and update iteratively till convergence. In this implementation, the function sets 1000 iterations.

	#mycluster.py
	import numpy as np

	def cluster(T, K, num_iters = 700, epsilon = 1e-12):
		"""   
		:param bow:
			bag-of-word matrix of (num_doc, V), where V is the vocabulary size
		:param K:
			number of topics
		:return:
			idx of size (num_doc), idx should be 1, 2, 3 or 4
		"""   
		
		'''Initialization'''
		J=T.shape[1]
		I=T.shape[0]
		#try different initialization
		pi_c=np.full((K,1),1/K)
		posterior_Di_c=np.zeros((I,K))
		mu=np.zeros((J,K))
		for k in range(K):
			mu_prep=T+np.random.uniform(0,np.sum(np.sum(T))/I,size=(I,J))
			mu[:,k]=np.reshape(np.sum(mu_prep,axis=0)/np.sum(np.sum(mu_prep)),(J))
		mu_temp=np.copy(mu)
		for k in range(K):
			for i in range(I):
				for j in range(J):
					mu_temp[j,k]=mu[j,k]**T[i,j]
				posterior_Di_c[i,k]=np.prod(mu_temp[:,k])
		tau_ic=np.zeros((I,K))
		print('Initialization done.')
		
		print('EM Iteraiton...')
		for count in range(num_iters):
			
			'''E-Step'''    
			for k in range(K):
				for i in range(I):
					tau_ic[i,k]=pi_c[k]*posterior_Di_c[i,k]/np.dot(pi_c.T,posterior_Di_c[i,:].T)
			
			
			'''M-Step'''
			for k in range(K):
				lower_temp=0        
				for j in range(J):
					for i in range(I):
						lower_temp+=(tau_ic[i,k]*T[i,j])
				for j in range(J):
					upper_temp=0
					for i in range(I):
						upper_temp+=(tau_ic[i,k]*T[i,j])
					mu[j,k]=upper_temp/lower_temp
				pi_c[k]=np.sum(tau_ic[:,k])/I
				mu_temp=np.copy(mu)
				for i in range(I):
					for j in range(J):
						mu_temp[j,k]=mu[j,k]**T[i,j]
					posterior_Di_c[i,k]=np.prod(mu_temp[:,k])

		print('EM Iteration done.')

		print('Computing final probability density estimaiton...')
		p_Di=np.zeros((I,K))
		for k in range(K):
			for i in range(I):
				p_Di[i,k]=pi_c[k]*posterior_Di_c[i,k]
		print('Done.')
		idx=np.zeros((I))
		for i in range(I):
			idx[i]=int(np.where(p_Di[i]==p_Di[i].max())[0])+1
		#raise NotImplementedError
		return idx	

    AccMeasure.py : file contaning the evaluation function.

  • Compares results from mycluster.py to provided true cluster reuslts and return the accuracy.

	#AccMeasure.py
	import scipy.io
	import itertools
	import numpy as np
	
	
	def acc_measure(idx):
		"""
	
		:param idx:
			numpy array of (num_doc)
		:return:
			accuracy
		"""
	
		mat = scipy.io.loadmat('data.mat')
		mat = mat['X']
		Y = mat[:, -1]
	
		# rotate for different idx assignments
		best_acc = 0
		for idx_order in itertools.permutations([1, 2, 3, 4]):
	
			for ind, label in enumerate(idx_order):
				Y[(ind)*100:(ind+1)*100] = label
	
			acc = (Y == idx).sum() / Y.shape[0]
			if acc > best_acc:
				best_acc = acc
	
		return best_acc
	
	
	if __name__ == '__main__':
		acc_measure(np.array([1]*100 + [3]*100 + [2]*100 + [4]*100))

homework2.py : the main file, calls on the previous files to run EM algorithm, accuracy evaluation and show results.


	#homework2.py
	import scipy.io
	from AccMeasure import acc_measure
	from mycluster import cluster
	#from mycluster_extra import cluster_extra
	
	mat = scipy.io.loadmat('data.mat')
	mat = mat['X']
	X = mat[:, :-1]
	print(X[[0]],X.shape)
	idx = cluster(X, 4)
	
	acc = acc_measure(idx)
	
	print('accuracy %.4f' % (acc))

Results: The results fluctuates around 92 to 96 accuracy. Reducing the iteration numbers produces similar results, showing convergence before 1000 iterations. With this initialization method, 700 iterations shows confidence in convergence.


	D:\Dak's Documents\GaTech\CSE 6740 Computational Data Analysis (ML)\homework2\homework2>py homework2.py
	Initialization done.
	EM Iteraiton...
	EM Iteration done.
	Computing final probability density estimaiton...
	Done.
	[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
	1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
	1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
	1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
	1. 1. 1. 1. 3. 3. 2. 3. 3. 3. 4. 2. 4. 4. 3. 3. 3. 3. 4. 2. 3. 4. 3. 3.
	3. 4. 3. 3. 3. 2. 2. 4. 3. 3. 3. 3. 4. 3. 3. 4. 3. 3. 4. 3. 3. 3. 3. 3.
	3. 3. 1. 3. 3. 4. 3. 3. 3. 3. 3. 3. 3. 3. 3. 4. 1. 3. 3. 3. 4. 4. 4. 4.
	2. 4. 3. 3. 3. 3. 3. 3. 3. 3. 2. 3. 3. 1. 3. 3. 3. 3. 2. 3. 3. 3. 3. 3.
	3. 3. 3. 3. 2. 3. 3. 3. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
	4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
	4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
	4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.
	4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
	2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
	2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
	2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
	2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
	accuracy 0.9275