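#Step 1: Find the path of the data file and copy it (if using Windows)
#Step 2: In the console below, type readClipboard()
#Step 3: Copy the printed path and paste it, in quotes, on the line below
#(Steps 1-3 only apply when loading an external data file; this script uses a data set that is already pre-loaded.)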

#AUTOMATING PACKAGES NEEDED FOR ANALYSES--------------------------------------------------------------------
# `datasets` is a base package that ships with R itself, so it cannot be
# installed with install.packages(); an install fallback here could never
# succeed. Probe for it without attaching (requireNamespace), fail with a clear
# message if the R installation is broken, then attach it explicitly.
haspackage <- requireNamespace("datasets", quietly = TRUE)
if (!haspackage) {
  stop("The base package 'datasets' is missing; please repair the R installation.",
       call. = FALSE)
}
library(datasets)

# The iris data set needs no import step: it is already available.
# data01 keeps the full, original data (measurements plus Species).
data01 <- iris
# data02 keeps only the four numeric measurement columns (Species dropped),
# which is the input used by the distance and clustering steps.
data02 <- data01[
  ,
  c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
  drop = FALSE
]

#distance methods in R: the dist function ----------------------------------------------------------------

# dist() measures distances between the ROWS of its input, so transposing
# data02 gives the Euclidean distance between the four variables.
variable_distance <- dist(
  t(data02),
  method = "euclidean"
)
variable_distance
##              Sepal.Length Sepal.Width Petal.Length
## Sepal.Width      36.15785                         
## Petal.Length     28.96619    25.77809             
## Petal.Width      57.18304    25.86407     33.86473

# Without the transpose, the same call gives the pairwise Euclidean distances
# between the individual flowers (rows of data02).
flower_distance <- dist(
  data02,
  method = "euclidean"
)
# Quick look at every flower-to-flower distance; the x-axis index is simply
# the position of each distance in the dist object.
plot(flower_distance)

#Classical method for cluster analysis: Agglomerative Hierarchical clustering ----------------------
# Step 1: pairwise Euclidean distances between flowers (rows of data02).
distance <- dist(
  data02,
  method = "euclidean"
)

# Step 2: agglomerative clustering on that distance object, then draw the
# resulting dendrogram.
# NOTE(review): per ?hclust, "ward.D" applied to unsquared distances does not
# implement Ward's (1963) criterion, whereas "ward.D2" does -- confirm which
# variant is intended before relying on these clusters.
hierclust <- hclust(
  d = distance,
  method = "ward.D"
)
plot(hierclust)

# Step 3: cut the dendrogram so that exactly 3 clusters remain, and plot the
# cluster label assigned to each flower.
groups <- cutree(
  tree = hierclust,
  k = 3
)
plot(groups)

#Modern clustering methods using likelihood-based models ----------------------------------------
# Check whether mclust is installed WITHOUT attaching it: requireNamespace()
# returns TRUE/FALSE quietly, whereas require() attaches the package as a side
# effect and only warns when it is missing.
haspackage <- requireNamespace("mclust", quietly = TRUE)
if (!haspackage) {
  # Only reached when mclust is not installed yet.
  install.packages("mclust")
}
# library() stops with an error if mclust is still unavailable (e.g. the
# installation above failed), so later steps never run without it.
library(mclust)
# Console output recorded when mclust was loaded in the original run:
## Loading required package: mclust
## Package 'mclust' version 5.2
## Type 'citation("mclust")' for citing this R package in publications.

# Decide how many classes to use: compute the BIC values with mclustBIC()
# on the four measurements and inspect them graphically.
# NOTE(review): the variable name `BIC` is the same as the stats::BIC function
# name; kept as-is because later code may refer to it.
BIC <- mclustBIC(data02)
plot(BIC)

# Fit the mixture model with the number of components fixed at three (G = 3),
# then print the fit summary including the estimated parameters.
three_classes <- Mclust(
  data02,
  G = 3
)
summary(three_classes, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VEV (ellipsoidal, equal shape) model with 3 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -186.0736 150 38 -562.5514 -566.4577
## 
## Clustering table:
##  1  2  3 
## 50 45 55 
## 
## Mixing probabilities:
##         1         2         3 
## 0.3333333 0.3002348 0.3664319 
## 
## Means:
##               [,1]     [,2]     [,3]
## Sepal.Length 5.006 5.914717 6.546545
## Sepal.Width  3.428 2.777559 2.949380
## Petal.Length 1.462 4.203528 5.481568
## Petal.Width  0.246 1.298712 1.985130
## 
## Variances:
## [,,1]
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   0.13324824  0.10941877  0.019200078 0.011590068
## Sepal.Width    0.10941877  0.15500101  0.012099295 0.010013052
## Petal.Length   0.01920008  0.01209930  0.028278640 0.005820607
## Petal.Width    0.01159007  0.01001305  0.005820607 0.010691679
## [,,2]
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   0.22551946  0.07613511   0.14668558  0.04327572
## Sepal.Width    0.07613511  0.08016383   0.07368295  0.03434262
## Petal.Length   0.14668558  0.07368295   0.16588925  0.04941328
## Petal.Width    0.04327572  0.03434262   0.04941328  0.03332507
## [,,3]
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   0.42948927  0.10792653   0.33478260  0.06556364
## Sepal.Width    0.10792653  0.11608122   0.08931829  0.06148507
## Petal.Length   0.33478260  0.08931829   0.36479673  0.08741320
## Petal.Width    0.06556364  0.06148507   0.08741320  0.08679214
# Draw the classification plot for the fitted three-class model.
plot(
  three_classes,
  what = "classification"
)