#Step 1: Find path of data file and copy path (if using windows)
#Step 2: In the console below, type readClipboard()
#Step 3: Copy and paste path to the line below in quotes

#CHANGE BETWEEN QUOTES IN THIS LINE TO REFLECT DIRECTORY OF DATA:
myDataLocation = "/Users/jonathan/Dropbox/!EPSY 905/Lectures/13 PCA, EFA, and CFA/mv16epsy905_lecture13"

#SET WORKING DIRECTORY TO LOCATION OF DATA FILE
setwd(myDataLocation)

#IMPORT DATA AND PUT INTO DATASET
data01 = read.csv(file="mv16epsy905_lecture13a.csv",header=TRUE)
data02 = read.csv(file="mv16epsy905_lecture13b.csv",header=TRUE)

#AUTOMATING PACKAGES NEEDED FOR ANALYSES--------------------------------------------------------------------
haspackage = require("lavaan")
## Loading required package: lavaan
## This is lavaan 0.5-20
## lavaan is BETA software! Please report any bugs.
if (haspackage==FALSE){
  install.packages("lavaan")
}
library(lavaan)

#Advanced Matrix Operations --------------------------------------------------------------------------------

#correlation matrix of SAT data
sat_corrmat = cor(data01)

#eigenvalues and eigenvectors of correlation matrix:
sat_eigen = eigen(x = sat_corrmat, symmetric = TRUE)

sat_eigen$values
## [1] 1.7752238 0.2247762
sat_eigen$vectors
##           [,1]       [,2]
## [1,] 0.7071068 -0.7071068
## [2,] 0.7071068  0.7071068
#demonstration of spectral decomposition
corr_rank1 = sat_eigen$values[1] * sat_eigen$vectors[,1] %*% t(sat_eigen$vectors[,1])
corr_rank1
##           [,1]      [,2]
## [1,] 0.8876119 0.8876119
## [2,] 0.8876119 0.8876119
corr_rank2 = corr_rank1 + sat_eigen$values[2] * sat_eigen$vectors[,2] %*% t(sat_eigen$vectors[,2])
corr_rank2
##           [,1]      [,2]
## [1,] 1.0000000 0.7752238
## [2,] 0.7752238 1.0000000
#Principal Components Analysis of SAT data ---------------------------------------------------------------
#PCA of Correlation matrix (scale. = TRUE)
sat_pca_corr = prcomp(x = data01, scale. = TRUE)

#show the results (compare to eigenvalues/eigenvectors)
sat_pca_corr
## Standard deviations:
## [1] 1.3323753 0.4741057
## 
## Rotation:
##             PC1        PC2
## SATV -0.7071068  0.7071068
## SATM -0.7071068 -0.7071068
#show the summary statistics
summary(sat_pca_corr)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.3324 0.4741
## Proportion of Variance 0.8876 0.1124
## Cumulative Proportion  0.8876 1.0000
#create same analysis but with covariance matrix (for visual) scale.=FALSE (covariance matrix) 
sat_pca_cov = prcomp(x = data01, scale. = FALSE)

#create augmented data matrix for plot
data01a = data01
data01a$type = "Raw"

data01b = data.frame(SATV = sat_pca_cov$x[,1], SATM = sat_pca_cov$x[,2], type="PC")
data01c = rbind(data01a, data01b)

plot(x = data01c$SATV, y = data01c$SATM, ylab = "SATM/PC2", xlab = "SATV/PC1", cex.main=1.5, frame.plot=FALSE, col=ifelse(data01c$type=="Raw", "red", "blue"))
legend(0, 400, pch=1, col=c("red", "blue"), c("Data", "PCs"), bty="o",  box.col="darkgreen", cex=1.5)

#Principal Components analysis of Gambling Data------------------------------------------------------------
#listwise removal of missing data (common in PCA -- but still a problem)
data02a = data02[which(is.na(data02$X1)==FALSE & is.na(data02$X3)==FALSE & is.na(data02$X5)==FALSE & is.na(data02$X9)==FALSE & is.na(data02$X10)==FALSE &
                       is.na(data02$X13)==FALSE & is.na(data02$X14)==FALSE & is.na(data02$X18)==FALSE & is.na(data02$X21)==FALSE & is.na(data02$X23)==FALSE),]

#analysis of covariance matrix of gambling data items
gambling_pca_cov = prcomp(x = data02a, scale. = FALSE) 
gambling_pca_cov
## Standard deviations:
##  [1] 2.0484706 1.3228618 1.0883066 0.8360793 0.7509565 0.7336463 0.6861620
##  [8] 0.6483578 0.5823026 0.4643768
## 
## Rotation:
##            PC1         PC2        PC3          PC4          PC5
## X1  -0.3126472  0.10459331  0.1662869 -0.843735151  0.364943452
## X3  -0.2262923  0.07828499  0.1775587 -0.133596441 -0.524752411
## X5  -0.3199396  0.09403404  0.2294114  0.025555994 -0.492411902
## X9  -0.2433589  0.08687880  0.1596160  0.030715826 -0.117324662
## X10 -0.2798829  0.08714575  0.2352173  0.084941435 -0.194587548
## X13 -0.3293561  0.15569572  0.1863762  0.220349288  0.326257817
## X14 -0.4493615 -0.86166945 -0.2311989  0.005504759 -0.004148522
## X18 -0.3471876  0.40458523 -0.8343757 -0.028338155 -0.100291367
## X21 -0.2720582  0.13735301  0.0617232  0.198445605  0.179410916
## X23 -0.3258344  0.09836941  0.1385856  0.415552100  0.385544166
##             PC6          PC7          PC8          PC9         PC10
## X1  -0.08957759  0.090472261  0.002045810  0.028416754  0.039583080
## X3   0.68009146  0.343237952 -0.191565009 -0.015252456  0.025623885
## X5  -0.62592643 -0.090211626 -0.395234597 -0.043942452  0.187031376
## X9  -0.06172329 -0.152582899  0.152138070  0.044974999 -0.916942010
## X10 -0.02579154 -0.093484479  0.842208707 -0.027303911  0.306918670
## X13  0.29451341 -0.504704067 -0.225373117 -0.524435934  0.101286325
## X14  0.02615629 -0.037268705 -0.001356776 -0.004497823  0.002084634
## X18 -0.01866511  0.009093169  0.052785485 -0.074203041  0.001980035
## X21  0.15471581 -0.249558686 -0.142870079  0.841805796  0.128352736
## X23 -0.14657740  0.717890018 -0.025173918 -0.071163812 -0.032749511
summary(gambling_pca_cov)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     2.0485 1.3229 1.0883 0.83608 0.75096 0.73365
## Proportion of Variance 0.4043 0.1686 0.1141 0.06736 0.05434 0.05186
## Cumulative Proportion  0.4043 0.5730 0.6871 0.75447 0.80881 0.86067
##                            PC7     PC8     PC9    PC10
## Standard deviation     0.68616 0.64836 0.58230 0.46438
## Proportion of Variance 0.04537 0.04051 0.03267 0.02078
## Cumulative Proportion  0.90604 0.94655 0.97922 1.00000
prop_var = t(summary(gambling_pca_cov)$importance[2:3,])
#creating a scree plot and a proportion of variance plot

par(mfrow = c(1,2))
plot(gambling_pca_cov, type="l", main = "Scree Plot of PCA Eigenvalues", lwd = 5)
matplot(prop_var, type="l", main = "Proportion of Variance Explained by Component", lwd = 5)
legend(x=5, y=.5, legend = c("Component Variance", "Cumulative Variance"), lty = 1:2, lwd=5, col=1:2)