Skip to content

Commit 09f1973

Browse files
authored
Clustering using PyGAD
Data clustering using the genetic algorithm. This example uses 3 clusters with artificial (non-real) data.
1 parent ece8d80 commit 09f1973

File tree

1 file changed

+134
-0
lines changed

1 file changed

+134
-0
lines changed

example_clustering_3.py

+134
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import numpy
2+
import matplotlib.pyplot
3+
import pygad
4+
5+
def _uniform_cluster(num_samples, x1_start, x1_end, x2_start, x2_end):
    """Draw ``num_samples`` 2-D points uniformly from an axis-aligned box.

    Returns a tuple ``(x1, x2)`` of 1-D arrays holding the two coordinates.
    ``x1`` is drawn before ``x2`` so the sequence of random draws is the same
    as generating the two coordinates one after the other.
    """
    # numpy.random.random() samples from [0, 1); scale and shift it to the
    # requested [start, end) interval.
    x1 = numpy.random.random(size=num_samples) * (x1_end - x1_start) + x1_start
    x2 = numpy.random.random(size=num_samples) * (x2_end - x2_start) + x2_start
    return x1, x2


# Cluster 1: artificial samples inside the box x1 in [0, 5), x2 in [2, 6).
cluster1_num_samples = 20
cluster1_x1_start = 0
cluster1_x1_end = 5
cluster1_x2_start = 2
cluster1_x2_end = 6
cluster1_x1, cluster1_x2 = _uniform_cluster(cluster1_num_samples,
                                            cluster1_x1_start, cluster1_x1_end,
                                            cluster1_x2_start, cluster1_x2_end)

# Cluster 2: artificial samples inside the box x1 in [4, 12), x2 in [14, 18).
cluster2_num_samples = 20
cluster2_x1_start = 4
cluster2_x1_end = 12
cluster2_x2_start = 14
cluster2_x2_end = 18
cluster2_x1, cluster2_x2 = _uniform_cluster(cluster2_num_samples,
                                            cluster2_x1_start, cluster2_x1_end,
                                            cluster2_x2_start, cluster2_x2_end)

# Cluster 3: artificial samples inside the box x1 in [12, 18), x2 in [8, 11).
cluster3_num_samples = 20
cluster3_x1_start = 12
cluster3_x1_end = 18
cluster3_x2_start = 8
cluster3_x2_end = 11
cluster3_x1, cluster3_x2 = _uniform_cluster(cluster3_num_samples,
                                            cluster3_x1_start, cluster3_x1_end,
                                            cluster3_x2_start, cluster3_x2_end)

# Stack the two coordinates of each cluster into an (N, 2) matrix: one row per
# sample, one column per feature.
c1 = numpy.array([cluster1_x1, cluster1_x2]).T
c2 = numpy.array([cluster2_x1, cluster2_x2]).T
c3 = numpy.array([cluster3_x1, cluster3_x2]).T

# All samples in a single matrix; rows keep the order cluster 1, 2, 3.
data = numpy.concatenate((c1, c2, c3), axis=0)

# Plot the samples coloured by the cluster they were generated from, as the
# reference ("optimal") clustering.
matplotlib.pyplot.scatter(cluster1_x1, cluster1_x2)
matplotlib.pyplot.scatter(cluster2_x1, cluster2_x2)
matplotlib.pyplot.scatter(cluster3_x1, cluster3_x2)
matplotlib.pyplot.title("Optimal Clustering")
matplotlib.pyplot.show()
46+
47+
def euclidean_distance(X, Y):
    """
    Calculate the euclidean distance between each row of X and the sample Y.

    :X should be a matrix (or nested sequence) of size (N, f) where N is the
       number of samples and f is the number of features for each sample.
    :Y should be of size f. In other words, it is a single sample.

    Returns a vector of N elements with the distances between the N samples
    and Y.
    """

    # Convert up front so plain Python lists are accepted as well as arrays;
    # arrays pass through numpy.asarray without being copied.
    X = numpy.asarray(X)
    Y = numpy.asarray(Y)

    # Y is broadcast across the rows of X; summing over axis 1 collapses the
    # feature dimension and leaves one squared distance per sample.
    return numpy.sqrt(numpy.sum(numpy.power(X - Y, 2), axis=1))
57+
58+
def cluster_data(solution, solution_idx):
    """
    Clusters the data based on the current solution.

    The solution is read as ``num_clusters`` cluster centers laid end to end,
    each ``feature_vector_length`` genes long. Every sample of the module-level
    ``data`` matrix is assigned to the center it is closest to.

    :solution is the flat gene vector holding the concatenated cluster centers.
    :solution_idx is the index of the solution; it is not used here.

    Returns a tuple of 5 elements:
        cluster_centers: array of size (C, f) where C is the number of clusters
            and f is the number of features representing each sample.
        all_clusters_dists: array of size (C, N) where N is the number of data
            samples. It holds the distances between each cluster center and
            all the data samples.
        cluster_indices: array of N elements holding, for each sample, the
            index of the cluster with the smallest distance.
        clusters: list with C elements where each element holds the indices of
            the samples within a cluster.
        clusters_sum_dist: array with C elements where each element is the sum
            of distances between a cluster center and its own samples.
    """

    # num_clusters, feature_vector_length and data are module-level names that
    # are only read here, so no `global` declaration is required.

    # Slice the flat solution into one center per cluster.
    cluster_centers = numpy.array(
        [solution[feature_vector_length * clust_idx:feature_vector_length * (clust_idx + 1)]
         for clust_idx in range(num_clusters)])

    # Distance (euclidean) between each cluster center and all samples.
    all_clusters_dists = numpy.array(
        [euclidean_distance(data, center) for center in cluster_centers])

    # A 1D array that, for each sample, holds the index of the cluster with
    # the smallest distance. In other words, the sample's cluster index.
    cluster_indices = numpy.argmin(all_clusters_dists, axis=0)

    clusters = []
    clusters_sum_dist = []
    for clust_idx in range(num_clusters):
        members = numpy.where(cluster_indices == clust_idx)[0]
        clusters.append(members)
        # Summing over an empty selection gives 0, so a cluster with zero
        # samples needs no special case.
        clusters_sum_dist.append(numpy.sum(all_clusters_dists[clust_idx, members]))

    clusters_sum_dist = numpy.array(clusters_sum_dist)

    return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist
96+
97+
def fitness_func(solution, solution_idx):
    """
    Fitness of a solution: the reciprocal of the total distance between the
    samples and the cluster centers they are assigned to.

    A smaller total distance means tighter clusters and so a higher fitness.

    :solution is the flat gene vector holding the concatenated cluster centers.
    :solution_idx is the index of the solution; it is only forwarded to
        cluster_data().

    NOTE(review): newer PyGAD releases appear to call the fitness function with
    the GA instance as an extra first argument - confirm against the installed
    PyGAD version before changing this signature.
    """
    _, _, _, _, clusters_sum_dist = cluster_data(solution, solution_idx)

    # The tiny value 0.00000001 is added to the denominator in case the sum of
    # distances is 0, which would otherwise divide by zero.
    fitness = 1.0 / (numpy.sum(clusters_sum_dist) + 0.00000001)

    return fitness
104+
105+
# Problem dimensions. A solution holds one center per cluster, each with as
# many coordinates as a sample has features, so it has C * f genes.
num_clusters = 3
feature_vector_length = data.shape[1]
num_genes = num_clusters * feature_vector_length

# Initial gene values are drawn between 0 and 20, which presumably was chosen
# to cover the coordinate range of the artificial data generated above.
ga_instance = pygad.GA(num_generations=100,
                       sol_per_pop=10,
                       init_range_low=0,
                       init_range_high=20,
                       num_parents_mating=5,
                       keep_parents=2,
                       num_genes=num_genes,
                       fitness_func=fitness_func,
                       suppress_warnings=True)

ga_instance.run()

best_solution, best_solution_fitness, best_solution_idx = ga_instance.best_solution()
print("Best solution is {bs}".format(bs=best_solution))
print("Fitness of the best solution is {bsf}".format(bsf=best_solution_fitness))
print("Best solution found after {gen} generations".format(gen=ga_instance.best_solution_generation))

# Re-run the clustering with the best solution to recover the cluster centers
# and the samples assigned to each of them.
cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist = cluster_data(best_solution, best_solution_idx)

# Draw every cluster's samples followed by its center on the same figure.
for cluster_idx in range(num_clusters):
    cluster_x = data[clusters[cluster_idx], 0]
    cluster_y = data[clusters[cluster_idx], 1]
    matplotlib.pyplot.scatter(cluster_x, cluster_y)
    matplotlib.pyplot.scatter(cluster_centers[cluster_idx, 0], cluster_centers[cluster_idx, 1], linewidths=5)
matplotlib.pyplot.title("Clustering using PyGAD")
matplotlib.pyplot.show()

0 commit comments

Comments
 (0)