7. K-MEANS AND HIERARCHICAL CLUSTERING
Sun Nov 03 2024 13:14:38 GMT+0000 (Coordinated Universal Time)
1.
from sklearn.datasets import load_iris
import pandas as pd
# Load the Iris dataset and wrap its feature matrix in a DataFrame whose columns
# carry the feature names supplied by the dataset.
iris=load_iris()
df=pd.DataFrame(data=iris.data, columns=iris.feature_names)
print("DataFrame Head:\n",df.head())
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called as a statement of its own. Passing it to print() would emit the summary
# before the "DataInfo:" header and then print a stray "None".
print("DataInfo:")
df.info()
output
DataFrame Head:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sepal length (cm) 150 non-null float64
1 sepal width (cm) 150 non-null float64
2 petal length (cm) 150 non-null float64
3 petal width (cm) 150 non-null float64
dtypes: float64(4)
memory usage: 4.8 KB
DataInfo:
None
2.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardise only the measurement columns, selected by name. The 'cluster' column
# added further down lives in the same DataFrame, so scaling `df` as a whole would
# feed the previous cluster labels back in as a feature if this cell were re-run.
scaler=StandardScaler()
x_scaled=scaler.fit_transform(df[iris.feature_names])

# Fit K-Means with three clusters on the standardised features; random_state is
# fixed so the run is repeatable.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases - pin it explicitly if results must match across versions.
Kmeans=KMeans(n_clusters=3, random_state=42)
Kmeans.fit(x_scaled)

# Keep the per-sample assignments both as an array (used by later cells) and as a
# column next to the original measurements.
clusters_labels=Kmeans.labels_
df['cluster']=clusters_labels

# The centres are expressed in the standardised feature space the model was fit on.
cluster_centers=Kmeans.cluster_centers_
print("cluster centers:\n",cluster_centers)
output
cluster centers:
[[ 1.13597027 0.08842168 0.99615451 1.01752612]
[-1.01457897 0.85326268 -1.30498732 -1.25489349]
[-0.05021989 -0.88337647 0.34773781 0.2815273 ]]
3.
from sklearn.metrics import adjusted_rand_score,silhouette_score
# Score the K-Means result in two ways:
#   * Adjusted Rand Index - agreement between the cluster assignments and the
#     known species labels shipped with the dataset.
#   * Silhouette score - computed from the standardised features and the cluster
#     assignments alone, without reference to the species labels.
true_labels = iris.target

ari = adjusted_rand_score(
    labels_true=true_labels,
    labels_pred=clusters_labels,
)
silhouette_avg = silhouette_score(
    X=x_scaled,
    labels=clusters_labels,
)

# Report both metrics after they have been computed.
print(f"Adjusted Rand_Index(ARI):{ari}")
print(f"sithouette score : {silhouette_avg}")
output
Adjusted Rand_Index(ARI):0.6201351808870379
sithouette score : 0.45994823920518635
II.1
#Hierarchical clustering
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
# silhouette_score is used at the end of this section; import it here so the
# section does not depend on an import made by an earlier cell.
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram,linkage
from scipy.cluster.hierarchy import fcluster

# Reload the dataset into a fresh DataFrame and standardise the features before
# any extra columns are attached to it.
iris=load_iris()
data=pd.DataFrame(iris.data,columns=iris.feature_names)
scaler=StandardScaler()
scaled_data=scaler.fit_transform(data)

# Build the linkage matrix with Ward's method on the standardised features.
z=linkage(scaled_data,method="ward")

# Draw the dendrogram, labelling each leaf with the sample's species code.
plt.figure(figsize=(10,7))
plt.title("Dendogram for Iris dataset")
dendrogram(z,labels=iris.target)
plt.show()

# Cut the tree into at most three flat clusters and cross-tabulate the resulting
# cluster ids against the true species codes.
clusters=fcluster(z,3,criterion="maxclust")
data['cluster']=clusters
data['species']=iris.target
print(data.groupby(['cluster','species']).size())

# Silhouette score of the flat clustering, computed on the standardised features.
silhouette_avg=silhouette_score(scaled_data,clusters)
print(f"Silhouette score: {silhouette_avg}")
output
cluster species
1 0 49
2 0 1
1 27
2 2
3 1 23
2 48
dtype: int64
Silhouette score: 0.446689041028591



Comments