# Principal Component Analysis
# Load packages
```{r}
library(dplyr)
library(ggplot2)  # needed for the ggplot() calls below
library(PCAmixdata)
```
# Load data
Reimport the heart disease dataset.
```{r}
load("data/preprocessed.RData")
```
# Overview
## Unsupervised approaches
Unlike supervised approaches, we are not trying to predict the value of a target variable. The value of unsupervised machine learning is instead to see how data separate based solely on the nature of their features: we can include all of the data at once and simply see how they sort. Unsupervised approaches are also useful for preprocessing features that feed into other machine learning algorithms, as we do below.
Principal component analysis (PCA) is a powerful linear transformation technique for exploring patterns in data, particularly among highly correlated variables. It distills the variation across many variables onto a reduced feature space, such as a two-dimensional scatterplot.
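To build intuition for what that linear transformation does, here is a tiny self-contained sketch (the simulated `x1`/`x2` variables are illustrative only, not part of the heart disease data): two correlated variables are rotated so that the first principal component captures most of their shared variance.
```{r}
# Toy example: two strongly correlated variables
set.seed(1)
x1 = rnorm(100)
x2 = x1 + rnorm(100, sd = 0.3)
toy_pca = prcomp(cbind(x1, x2), center = TRUE, scale. = TRUE)
# PC1 captures most of the shared variance; PC2 is mostly leftover noise
summary(toy_pca)
```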
## Reclass variables
dplyr makes it easy to change the classes of multiple features at once :^)
## Scale numeric variables
```{r}
vars_to_scale = c("age", "trestbps", "chol", "thalach", "oldpeak")
# Scale the numeric features, then convert scale()'s matrix output back to plain numeric vectors
h = data_original %>%
  mutate_at(vars(all_of(vars_to_scale)), scale) %>%
  mutate_at(vars(all_of(vars_to_scale)), as.numeric)
head(h)
```
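As an optional sanity check, the scaled columns should now have mean approximately 0 and standard deviation approximately 1:
```{r}
# Means should be ~0 and standard deviations ~1 after scaling
round(colMeans(h[vars_to_scale]), 3)
round(sapply(h[vars_to_scale], sd), 3)
```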
## Factorize categorical variables
```{r}
# Rename the target outcomes for readability
h$target = ifelse(h$target == 1, "yes", "no")
vars_to_fac = c("sex", "cp", "fbs", "restecg", "exang",
                "slope", "ca", "thal", "target")
h = h %>% mutate_at(vars(all_of(vars_to_fac)), as.factor)
# Confirm that every column now has the intended class
sapply(h, class)
# Combine the scaled numeric features with the original target (the column becomes h.target)
ml_num = data.frame(subset(h, select = vars_to_scale), h$target)
head(ml_num)
```
# Fit model
```{r}
# Split the data into quantitative and qualitative subsets
split = splitmix(h)
X1 = split$X.quanti
X2 = split$X.quali
res.pcamix = PCAmix(X.quanti = X1,
                    X.quali = X2,
                    rename.level = TRUE,
                    graph = TRUE)
# Components of the fitted model
names(res.pcamix)
# Eigenvalues, proportion of variance, and cumulative variance explained
res.pcamix$eig
```
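The third column of `res.pcamix$eig` reports cumulative variance explained, which helps decide how many dimensions to keep. A small optional check (assuming the usual `PCAmixdata` column order of eigenvalue, proportion, cumulative):
```{r}
# First dimension whose cumulative variance explained reaches 70%
# (the 70% threshold is an arbitrary choice for illustration)
cum_var = res.pcamix$eig[, 3]
which(cum_var >= 70)[1]
```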
## Screeplot
```{r}
# Proportion of variance explained by each dimension
barplot(res.pcamix$eig[, 2],
        ylim = c(0, 20), las = 2)
```
## ggplot coordinates
```{r}
# ?plot.PCAmix
# Convert the coordinates to a dataframe and add the original target column
pca1 = data.frame(res.pcamix$ind$coord, h$target)
ggplot(pca1, aes(x = dim.1, y = dim.2, color = h.target)) +
  geom_point() +
  theme_bw() +
  guides(color = guide_legend(title = "Has heart\ndisease?")) +
  ggtitle("PCA of heart disease") +
  xlab(paste0("Dimension 1 (", round(res.pcamix$eig[1, 2], 2), "%)")) +
  ylab(paste0("Dimension 2 (", round(res.pcamix$eig[2, 2], 2), "%)"))
```
## View factor loadings
```{r}
# Squared loadings measure each feature's contribution to the dimensions
pca2 = data.frame(res.pcamix$sqload)
pca2
# Dimension 1
ggplot(pca2, aes(x = reorder(rownames(pca2), -dim.1), y = dim.1)) +
  geom_bar(stat = "identity") +
  theme_bw() + ggtitle("Dimension 1 loadings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Dimension 2
ggplot(pca2, aes(x = reorder(rownames(pca2), -dim.2), y = dim.2)) +
  geom_bar(stat = "identity") +
  theme_bw() + ggtitle("Dimension 2 loadings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
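To rank features numerically instead of eyeballing the bars, the squared loadings can simply be sorted (optional):
```{r}
# Features ordered by their contribution to dimension 1
pca2[order(pca2$dim.1, decreasing = TRUE), "dim.1", drop = FALSE]
```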
# PCA for Machine Learning
Create a 70/30 training/test split.
```{r}
# Set seed for reproducibility
set.seed(1)
# Create a stratified random split
training_rows = caret::createDataPartition(ml_num$h.target,
                                           p = 0.70, list = FALSE)
# Partition training dataset
train_x_class = ml_num[training_rows, ]
# Partition test dataset
test_x_class = ml_num[-training_rows, ]
dim(train_x_class)
dim(test_x_class)
table(train_x_class$h.target)
table(test_x_class$h.target)
```
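Because `createDataPartition` stratifies on the outcome, the class proportions should be nearly identical across the two splits; a quick optional check:
```{r}
# Class proportions should match closely between training and test sets
round(prop.table(table(train_x_class$h.target)), 3)
round(prop.table(table(test_x_class$h.target)), 3)
```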
## Fit PCA model to the training set (numeric features only)
```{r}
# ?prcomp
# The numeric features were already centered and scaled above,
# so centering and scaling are turned off here
pca_ml = prcomp(subset(train_x_class, select = -h.target),
                retx = TRUE,
                center = FALSE, scale. = FALSE)
pca_ml
# View percentage of variance explained
summary(pca_ml)
# or compute it directly from the standard deviations
expl.var = round(pca_ml$sdev ^ 2 / sum(pca_ml$sdev ^ 2) * 100, 4)
expl.var
```
## Generate predicted values of PCs for test dataset
```{r}
predicted_values = predict(pca_ml,
                           newdata = subset(test_x_class, select = -h.target))
head(predicted_values)
```
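Since centering and scaling were turned off above, `predict` here reduces to multiplying the test data by the rotation (loadings) matrix learned on the training set. A minimal sketch verifying the equivalence:
```{r}
# predict() projects the test data onto the training-set loadings
manual_scores = as.matrix(subset(test_x_class, select = -h.target)) %*% pca_ml$rotation
all.equal(unname(manual_scores), unname(predicted_values))
```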
## Define plotting parameters
```{r}
# Assign one color to each condition
target_colors = 1:2
# Assign one shape for the training data (0 = open square)
# and another for the test data (16 = filled circle)
target_shapes = c(0, 16)
# Which PCs to plot?
target_PCs = 1:2
```
## Store the scores inside of dataframes
```{r}
# Store the training and test PC scores in dataframes
gg_train = data.frame(pca_ml$x[, target_PCs])
head(gg_train)
gg_test = data.frame(predicted_values[, target_PCs])
head(gg_test)
```
## Visualize
We can plot the training and test data on the same plot!
```{r}
ggplot(
  # Training data: open squares
  gg_train, aes(x = PC1, y = PC2,
                color = train_x_class$h.target)) +
  geom_point(shape = 0, alpha = 0.5, stroke = 1, size = 3) +
  stat_ellipse(show.legend = FALSE, lwd = 0.5) +
  labs(color = "Has heart disease?",
       caption = "Squares = training data\nCircles = test data\nEllipses are 95% confidence ellipses for training data") +
  xlab("Dimension 1") +
  ylab("Dimension 2") +
  xlim(c(-4, 4)) +
  ylim(c(-4, 4)) +
  theme_bw() +
  # Test data: filled circles (size and alpha are constants, so they belong outside aes())
  geom_point(data = gg_test,
             mapping = aes(x = PC1, y = PC2,
                           color = test_x_class$h.target),
             size = 3, alpha = 0.75) +
  theme(legend.position = "top") +
  ggtitle("Heart disease training/test data") +
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10))
```
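The payoff of projecting the test set onto the training-set PCs is that downstream models can be fit in the reduced space. Here is a hedged sketch using logistic regression (an arbitrary model choice for illustration, not part of the pipeline above):
```{r}
# Fit a logistic regression on the first two training-set PCs
pc_train = data.frame(gg_train, target = train_x_class$h.target)
pc_test = data.frame(gg_test, target = test_x_class$h.target)
fit = glm(target ~ PC1 + PC2, data = pc_train, family = binomial)
# Classify with a 0.5 threshold and compute simple test-set accuracy
pred = ifelse(predict(fit, pc_test, type = "response") > 0.5,
              levels(pc_test$target)[2], levels(pc_test$target)[1])
mean(pred == pc_test$target)
```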
## Save `ml_num` for use in 09-hclust.Rmd
```{r}
save(ml_num, file = "data/unsupervised.RData")
```