#Step 1: Find the path of the data file and copy it (if using Windows)
#Step 2: In the console below, type readClipboard()
#Step 3: Copy the returned path and paste it, in quotes, on the line below
# AUTOMATING PACKAGES NEEDED FOR ANALYSES --------------------------------------------------------------------
# Check whether the 'datasets' package is available without attaching it.
# requireNamespace() returns TRUE/FALSE and is meant for this kind of probe;
# require() would also attach the package and only warn on failure.
haspackage <- requireNamespace("datasets", quietly = TRUE)
if (!haspackage) {
  # 'datasets' ships with the standard R distribution, so this branch is
  # normally never taken; it is kept for parity with the other package checks.
  install.packages("datasets")
}
# Attach the package; library() stops with an error if it is still missing.
library(datasets)
# The iris data set comes with the 'datasets' package, so nothing is read
# from disk here.
# data01: the original data, including the Species column.
data01 <- iris
# data02: only the four numeric measurement columns (Species left out).
data02 <- data01[
  c(
    "Sepal.Length",
    "Sepal.Width",
    "Petal.Length",
    "Petal.Width"
  )
]
# Distance methods in R: the dist function ----------------------------------------------------------------
# dist() measures distances between ROWS, so the data are transposed first
# to obtain distances between the four variables (columns) instead.
variable_distance <- dist(t(data02), method = "euclidean")
variable_distance
# Output recorded from an earlier run:
##              Sepal.Length Sepal.Width Petal.Length
## Sepal.Width      36.15785
## Petal.Length     28.96619    25.77809
## Petal.Width      57.18304    25.86407     33.86473

# Without the transpose, the distances are between flowers (rows).
flower_distance <- dist(data02, method = "euclidean")
# Quick graph of every pairwise flower distance; the x axis (index) is simply
# the position of each distance within the dist object.
plot(flower_distance)
# Classical method for cluster analysis: agglomerative hierarchical clustering ----------------------
# Step 1: Euclidean distance matrix between flowers (rows of data02).
distance <- dist(data02, method = "euclidean")
# Step 2: build the cluster tree from those distances.
# NOTE(review): according to the hclust documentation, "ward.D" applied to
# unsquared distances does not implement Ward's (1963) criterion, whereas
# "ward.D2" does. The method is left as-is to keep results unchanged --
# confirm which one is intended.
hierclust <- hclust(distance, method = "ward.D")
# Dendrogram of the fitted tree.
plot(hierclust)
# Step 3: cut the tree so that exactly 3 clusters remain; 'groups' holds one
# cluster label per flower.
groups <- cutree(hierclust, k = 3)
# Cluster label of each flower plotted against its row position.
plot(groups)
# Modern clustering methods using likelihood-based models ----------------------------------------
# Check whether 'mclust' is installed without attaching it.
# requireNamespace() returns TRUE/FALSE and is meant for this kind of probe;
# require() would also attach the package and only warn on failure.
haspackage <- requireNamespace("mclust", quietly = TRUE)
if (!haspackage) {
  # Not installed yet: fetch it from the configured repository.
  install.packages("mclust")
}
# Attach the package; library() stops with an error if it is still missing.
library(mclust)
# Console output recorded when this step was originally run with require():
## Loading required package: mclust
## Package 'mclust' version 5.2
## Type 'citation("mclust")' for citing this R package in publications.
# Determine the number of classes: mclustBIC() is run on the four measurement
# columns and the resulting BIC values are plotted for visual comparison.
# NOTE: this object shares its name with the stats::BIC() function.
BIC <- mclustBIC(data02)
plot(BIC)

# Fit the mixture model with the number of components fixed at 3 (G = 3 is
# hard-coded here rather than read from the BIC object above).
three_classes <- Mclust(data02, G = 3)

# Model summary including the estimated parameters.
summary(three_classes, parameters = TRUE)
# Output recorded from an earlier run:
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 3 components:
##
##  log.likelihood   n df       BIC       ICL
##       -186.0736 150 38 -562.5514 -566.4577
##
## Clustering table:
##  1  2  3
## 50 45 55
##
## Mixing probabilities:
##         1         2         3
## 0.3333333 0.3002348 0.3664319
##
## Means:
##               [,1]     [,2]     [,3]
## Sepal.Length 5.006 5.914717 6.546545
## Sepal.Width  3.428 2.777559 2.949380
## Petal.Length 1.462 4.203528 5.481568
## Petal.Width  0.246 1.298712 1.985130
##
## Variances:
## [,,1]
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   0.13324824  0.10941877  0.019200078 0.011590068
## Sepal.Width    0.10941877  0.15500101  0.012099295 0.010013052
## Petal.Length   0.01920008  0.01209930  0.028278640 0.005820607
## Petal.Width    0.01159007  0.01001305  0.005820607 0.010691679
## [,,2]
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   0.22551946  0.07613511   0.14668558  0.04327572
## Sepal.Width    0.07613511  0.08016383   0.07368295  0.03434262
## Petal.Length   0.14668558  0.07368295   0.16588925  0.04941328
## Petal.Width    0.04327572  0.03434262   0.04941328  0.03332507
## [,,3]
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   0.42948927  0.10792653   0.33478260  0.06556364
## Sepal.Width    0.10792653  0.11608122   0.08931829  0.06148507
## Petal.Length   0.33478260  0.08931829   0.36479673  0.08741320
## Petal.Width    0.06556364  0.06148507   0.08741320  0.08679214

# Plot of the fitted model showing the classification of the flowers.
plot(three_classes, what = "classification")