## ---- echo=FALSE--------------------------------------------------------- library(knitr) opts_chunk$set(tidy = FALSE, message = FALSE, warning = FALSE) ## ----getPackage, eval=FALSE---------------------------------------------- # # if (!requireNamespace('BiocManager', quietly = TRUE)) # install.packages('BiocManager') # BiocManager::install('PCAtools') # ## ----getPackageDevel, eval=FALSE----------------------------------------- # # devtools::install_github('kevinblighe/PCAtools') # ## ----Load, message=FALSE------------------------------------------------- library(PCAtools) ## ------------------------------------------------------------------------ library(Biobase) library(GEOquery) # load series and platform data from GEO gset <- getGEO('GSE2990', GSEMatrix =TRUE, getGPL=FALSE) x <- exprs(gset[[1]]) # remove Affymetrix control probes x <- x[-grep('^AFFX', rownames(x)),] # extract information of interest from the phenotype data (pdata) idx <- which(colnames(pData(gset[[1]])) %in% c('age:ch1', 'distant rfs:ch1', 'er:ch1', 'ggi:ch1', 'grade:ch1', 'size:ch1', 'time rfs:ch1')) metadata <- data.frame(pData(gset[[1]])[,idx], row.names = rownames(pData(gset[[1]]))) # tidy column names colnames(metadata) <- c('Age', 'Distant.RFS', 'ER', 'GGI', 'Grade', 'Size', 'Time.RFS') # prepare certain phenotypes metadata$Age <- as.numeric(gsub('^KJ', NA, metadata$Age)) metadata$Distant.RFS <- factor(metadata$Distant.RFS, levels=c(0,1)) metadata$ER <- factor(gsub('\\?', NA, metadata$ER), levels=c(0,1)) metadata$ER <- factor(ifelse(metadata$ER == 1, 'ER+', 'ER-'), levels = c('ER-', 'ER+')) metadata$GGI <- as.numeric(metadata$GGI) metadata$Grade <- factor(gsub('\\?', NA, metadata$Grade), levels=c(1,2,3)) metadata$Grade <- gsub(1, 'Grade 1', gsub(2, 'Grade 2', gsub(3, 'Grade 3', metadata$Grade))) metadata$Grade <- factor(metadata$Grade, levels = c('Grade 1', 'Grade 2', 'Grade 3')) metadata$Size <- as.numeric(metadata$Size) metadata$Time.RFS <- as.numeric(gsub('^KJX|^KJ', NA, metadata$Time.RFS)) # remove samples from the pdata that have any NA value discard <- apply(metadata, 1, function(x) any(is.na(x))) metadata <- metadata[!discard,] # filter the expression data to match the samples in our pdata x <- x[,which(colnames(x) %in% rownames(metadata))] # check that sample names match exactly between pdata and expression data all(colnames(x) == rownames(metadata)) ## ------------------------------------------------------------------------ p <- pca(x, metadata = metadata, removeVar = 0.1) ## ----ex1, fig.height = 7, fig.width = 16, fig.cap = 'Figure 1: A SCREE plot to show the proportion of explained variance by PC'---- screeplot(p) ## ----ex2, fig.height = 7, fig.width = 8, fig.cap = 'Figure 2: A bi-plot of PC1 versus PC2'---- biplot(p) ## ----ex3, fig.height = 10, fig.width = 10, fig.cap = 'Figure 3: A pairs plot, comparing PC1 - PC5 on a pairwise basis'---- pairsplot(p) ## ----ex4, fig.height = 6, fig.width = 8, fig.cap = 'Figure 4: Plot the component loadings and label genes most responsible for variation'---- plotloadings(p) ## ----ex5, fig.height = 4, fig.width = 8, fig.cap = 'Figure 5: Correlate PCs to metadata variables'---- eigencorplot(p, metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS')) ## ----ex6, fig.height = 7, fig.width = 10, fig.cap = 'Figure 6: Advanced SCREE plot illustrating number of PCs comprising at least 80% explained variation'---- library(ggplot2) screeplot(p, components = getComponents(p, 1:30), hline = 80, vline = 27) + geom_text(aes(20, 80, label = '80% explained variation', vjust = -1)) ## ----ex7, fig.height = 6, fig.width = 8, fig.cap = 'Figure 7: adding lines and a legend to a bi-plot'---- biplot(p, colby = 'ER', hline = 0, vline = 0, legendPosition = 'right') ## ----ex8, eval = FALSE, fig.height = 7, fig.width = 7, fig.cap = 'Figure 8: supplying custom colours to a bi-plot'---- # # biplot(p, # colby = 'ER', colkey = c('ER+'='forestgreen', 'ER-'='purple'), # hline = 0, vline = c(-25, 0, 25), # legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0) # ## ----ex9, fig.height = 7, fig.width = 7, fig.cap = 'Figure 9: supplying custom colours and shapes to a bi-plot, removing connectors, and modifying titles'---- biplot(p, colby = 'ER', colkey = c('ER+'='forestgreen', 'ER-'='purple'), hline = 0, vline = c(-25, 0, 25), legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0, shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8), drawConnectors = FALSE, title = 'PCA bi-plot', subtitle = 'PC1 versus PC2', caption = '27 PCs == 80%') ## ----ex10, fig.height = 6, fig.width = 8, fig.cap = 'Figure 10: removing labels, modifying line types, removing gridlines, and increasing point size in a bi-plot'---- biplot(p, lab = FALSE, colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'), hline = 0, vline = c(-25, 0, 25), vlineType = c('dotdash', 'solid', 'dashed'), gridlines.major = FALSE, gridlines.minor = FALSE, pointSize = 5, legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0, shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8), drawConnectors = FALSE, title = 'PCA bi-plot', subtitle = 'PC1 versus PC2', caption = '27 PCs == 80%') ## ----ex11, eval = FALSE, fig.height = 6, fig.width = 8, fig.cap = 'Figure 11: colouring by a continuous variable and plotting other PCs in a bi-plot'---- # # biplot(p, x = 'PC10', y = 'PC50', # lab = FALSE, # colby = 'Age', # hline = 0, vline = 0, # hlineWidth = 1.0, vlineWidth = 1.0, # gridlines.major = FALSE, gridlines.minor = TRUE, # pointSize = 5, # legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0, # shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8), # drawConnectors = FALSE, # title = 'PCA bi-plot', # subtitle = 'PC10 versus PC50', # caption = '27 PCs == 80%') # ## ----ex12, eval = FALSE, fig.height = 8, fig.width = 7, fig.cap = 'Figure 12: maximising available space in a pairs plot'---- # # pairsplot(p, # components = getComponents(p, c(1:10)), # triangle = TRUE, trianglelabSize = 12, # hline = 0, vline = 0, # pointSize = 0.4, # gridlines.major = FALSE, gridlines.minor = FALSE, # colby = 'Grade', # title = 'Pairs plot', plotaxes = FALSE, # margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm')) # ## ----ex13, eval = FALSE, fig.height = 6, fig.width = 9, fig.cap = 'Figure 13: arranging a pairs plot horizontally'---- # # pairsplot(p, # components = getComponents(p, c(4,33,11,1)), # triangle = FALSE, # hline = 0, vline = 0, # pointSize = 0.8, # gridlines.major = FALSE, gridlines.minor = FALSE, # colby = 'ER', # title = 'Pairs plot', plotaxes = TRUE, # margingaps = unit(c(0.1, 0.1, 0.1, 0.1), 'cm')) # ## ----ex14, fig.height = 6, fig.width = 9, fig.cap = 'Figure 14: modifying cut-off for labeling in a loadings plot'---- plotloadings(p, rangeRetain = 0.01, labSize = 3.0, title = 'Loadings plot', subtitle = 'PC1, PC2, PC3, PC4, PC5', caption = 'Top 1% variables', shape = 24, col = c('limegreen', 'black', 'red3'), drawConnectors = TRUE) ## ----eval=TRUE----------------------------------------------------------- library(biomaRt) mart <- useMart('ENSEMBL_MART_ENSEMBL') mart <- useDataset('hsapiens_gene_ensembl', mart) getBM(mart = mart, attributes = c('affy_hg_u133a', 'ensembl_gene_id', 'gene_biotype', 'external_gene_name'), filter = 'affy_hg_u133a', values = c('215281_x_at', '214464_at', '211122_s_at', '205225_at', '202037_s_at', '204540_at', '215176_x_at', '205044_at', '208650_s_at', '205380_at'), uniqueRows = TRUE) ## ----ex15, eval = FALSE, fig.height = 9, fig.width = 11, fig.cap = 'Figure 15: plotting absolute component loadings'---- # # plotloadings(p, # components = getComponents(p, c(4,33,11,1)), # rangeRetain = 0.1, # labSize = 3.0, # absolute = FALSE, # title = 'Loadings plot', # subtitle = 'Misc PCs', # caption = 'Top 10% variables', # shape = 23, shapeSizeRange = c(1, 16), # col = c('white', 'pink'), # drawConnectors = FALSE) # ## ----ex16, fig.height = 4, fig.width = 12, fig.cap = 'Figure 16: correlating PCs that account for at least 80% variation to clinical variables'---- eigencorplot(p, components = getComponents(p, 1:27), metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'), col = c('darkblue', 'blue2', 'black', 'red2', 'darkred'), cexCorval = 0.7, colCorval = 'white', fontCorval = 2, posLab = 'bottomleft', rotLabX = 45, posColKey = 'top', cexLabColKey = 1.5, scale = TRUE, main = 'PC1-27 clinical correlations', colFrame = 'white', plotRsquared = FALSE) ## ----ex17, eval = FALSE, fig.height = 5, fig.width = 12, fig.cap = 'Figure 17: modifying cut-offs and symbols for statistical significance in eigencorplot'---- # # eigencorplot(p, # components = getComponents(p, 1:10), # metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'), # col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'), # cexCorval = 1.2, # fontCorval = 2, # posLab = 'all', # rotLabX = 45, # scale = TRUE, # main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ clinical ~ correlates), # plotRsquared = TRUE, # corFUN = 'pearson', # corUSE = 'pairwise.complete.obs', # signifSymbols = c('****', '***', '**', '*', ''), # signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1)) # ## ----ex18, eval = FALSE, fig.height = 10, fig.width = 15, fig.cap = 'Figure 18: a merged panel of all PCAtools plots'---- # # pscree <- screeplot(p, components = getComponents(p, 1:30), # hline = 80, vline = 27, axisLabSize = 10, returnPlot = FALSE) + # geom_text(aes(20, 80, label = '80% explained variation', vjust = -1)) # # ppairs <- pairsplot(p, components = getComponents(p, c(1:3)), # triangle = TRUE, trianglelabSize = 12, # hline = 0, vline = 0, # pointSize = 0.8, gridlines.major = FALSE, gridlines.minor = FALSE, # colby = 'Grade', # title = '', titleLabSize = 16, plotaxes = FALSE, # margingaps = unit(c(0.01, 0.01, 0.01, 0.01), 'cm'), # returnPlot = FALSE) # # pbiplot <- biplot(p, lab = FALSE, # colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'), # hline = 0, vline = c(-25, 0, 25), vlineType = c('dotdash', 'solid', 'dashed'), # gridlines.major = FALSE, gridlines.minor = FALSE, # pointSize = 2, axisLabSize = 12, # legendPosition = 'left', legendLabSize = 10, legendIconSize = 3.0, # shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8), # drawConnectors = FALSE, # title = 'PCA bi-plot', subtitle = 'PC1 versus PC2', # caption = '27 PCs == 80%', # returnPlot = FALSE) # # ploadings <- plotloadings(p, rangeRetain = 0.01, labSize = 2.5, # title = 'Loadings plot', axisLabSize = 12, # subtitle = 'PC1, PC2, PC3, PC4, PC5', # caption = 'Top 1% variables', # shape = 24, shapeSizeRange = c(4, 4), # col = c('limegreen', 'black', 'red3'), # legendPosition = 'none', # drawConnectors = FALSE, # returnPlot = FALSE) # # peigencor <- eigencorplot(p, # components = getComponents(p, 1:10), # metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'), # #col = c('royalblue', '', 'gold', 'forestgreen', 'darkgreen'), # cexCorval = 0.6, # fontCorval = 2, # posLab = 'all', # rotLabX = 45, # scale = TRUE, # main = "PC clinical correlates", # cexMain = 1.5, # plotRsquared = FALSE, # corFUN = 'pearson', # corUSE = 'pairwise.complete.obs', # signifSymbols = c('****', '***', '**', '*', ''), # signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), # returnPlot = FALSE) # # library(cowplot) # library(ggplotify) # # top_row <- plot_grid(pscree, ppairs, pbiplot, # ncol = 3, # labels = c('A', 'B Pairs plot', 'C'), # label_fontfamily = 'serif', # label_fontface = 'bold', # label_size = 22, # align = 'h', # rel_widths = c(1.05, 0.9, 1.05)) # # bottom_row <- plot_grid(ploadings, # as.grob(peigencor), # ncol = 2, # labels = c('D', 'E'), # label_fontfamily = 'serif', # label_fontface = 'bold', # label_size = 22, # align = 'h', # rel_widths = c(1.5, 1.5)) # # plot_grid(top_row, bottom_row, ncol = 1, rel_heights = c(1.0, 1.0)) # ## ------------------------------------------------------------------------ sessionInfo()