## ---- echo=FALSE---------------------------------------------------------

    library(knitr)

    opts_chunk$set(tidy = FALSE, message = FALSE, warning = FALSE)


## ----getPackage, eval=FALSE----------------------------------------------
#  
#      if (!requireNamespace('BiocManager', quietly = TRUE))
#          install.packages('BiocManager')
#          BiocManager::install('PCAtools')
#  

## ----getPackageDevel, eval=FALSE-----------------------------------------
#  
#      devtools::install_github('kevinblighe/PCAtools')
#  

## ----Load, message=FALSE-------------------------------------------------

    library(PCAtools)


## ------------------------------------------------------------------------

  library(Biobase)
  library(GEOquery)

  # load series and platform data from GEO
  gset <- getGEO('GSE2990', GSEMatrix =TRUE, getGPL=FALSE)

  x <- exprs(gset[[1]])

  # remove Affymetrix control probes
  x <- x[-grep('^AFFX', rownames(x)),]

  # extract information of interest from the phenotype data (pdata)
  idx <- which(colnames(pData(gset[[1]])) %in%
    c('age:ch1', 'distant rfs:ch1', 'er:ch1',
      'ggi:ch1', 'grade:ch1', 'size:ch1',
      'time rfs:ch1'))

  metadata <- data.frame(pData(gset[[1]])[,idx],
    row.names = rownames(pData(gset[[1]])))

  # tidy column names
  colnames(metadata) <- c('Age', 'Distant.RFS', 'ER', 'GGI', 'Grade',
    'Size', 'Time.RFS')

  # prepare certain phenotypes
  metadata$Age <- as.numeric(gsub('^KJ', NA, metadata$Age))
  metadata$Distant.RFS <- factor(metadata$Distant.RFS, levels=c(0,1))
  metadata$ER <- factor(gsub('\\?', NA, metadata$ER), levels=c(0,1))
  metadata$ER <- factor(ifelse(metadata$ER == 1, 'ER+', 'ER-'), levels = c('ER-', 'ER+'))
  metadata$GGI <- as.numeric(metadata$GGI)
  metadata$Grade <- factor(gsub('\\?', NA, metadata$Grade), levels=c(1,2,3))
  metadata$Grade <- gsub(1, 'Grade 1', gsub(2, 'Grade 2', gsub(3, 'Grade 3', metadata$Grade)))
  metadata$Grade <- factor(metadata$Grade, levels = c('Grade 1', 'Grade 2', 'Grade 3'))
  metadata$Size <- as.numeric(metadata$Size)
  metadata$Time.RFS <- as.numeric(gsub('^KJX|^KJ', NA, metadata$Time.RFS))

  # remove samples from the pdata that have any NA value
  discard <- apply(metadata, 1, function(x) any(is.na(x)))

  metadata <- metadata[!discard,]

  # filter the expression data to match the samples in our pdata
  x <- x[,which(colnames(x) %in% rownames(metadata))]

  # check that sample names match exactly between pdata and expression data 
  all(colnames(x) == rownames(metadata))


## ------------------------------------------------------------------------

  p <- pca(x, metadata = metadata, removeVar = 0.1)


## ----ex1, fig.height = 7, fig.width = 16, fig.cap = 'Figure 1: A SCREE plot to show the proportion of explained variance by PC'----

  screeplot(p)


## ----ex2, fig.height = 7, fig.width = 8, fig.cap = 'Figure 2: A bi-plot of PC1 versus PC2'----

  biplot(p)


## ----ex3, fig.height = 10, fig.width = 10, fig.cap = 'Figure 3: A pairs plot, comparing PC1 - PC5 on a pairwise basis'----

  pairsplot(p)


## ----ex4, fig.height = 6, fig.width = 8, fig.cap = 'Figure 4: Plot the component loadings and label genes most responsible for variation'----

  plotloadings(p)


## ----ex5, fig.height = 4, fig.width = 8, fig.cap = 'Figure 5: Correlate PCs to metadata variables'----

  eigencorplot(p,
    metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'))


## ----ex6, fig.height = 7, fig.width = 10, fig.cap = 'Figure 6: Advanced SCREE plot illustrating number of PCs comprising at least 80% explained variation'----

  library(ggplot2)

  screeplot(p,
    components = getComponents(p, 1:30),
    hline = 80, vline = 27) +
    geom_text(aes(20, 80, label = '80% explained variation', vjust = -1))


## ----ex7, fig.height = 6, fig.width = 8, fig.cap = 'Figure 7: adding lines and a legend to a bi-plot'----

  biplot(p,
    colby = 'ER',
    hline = 0, vline = 0,
    legendPosition = 'right')


## ----ex8, eval = FALSE, fig.height = 7, fig.width = 7, fig.cap = 'Figure 8: supplying custom colours to a bi-plot'----
#  
#    biplot(p,
#      colby = 'ER', colkey = c('ER+'='forestgreen', 'ER-'='purple'),
#      hline = 0, vline = c(-25, 0, 25),
#      legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)
#  

## ----ex9, fig.height = 7, fig.width = 7, fig.cap = 'Figure 9: supplying custom colours and shapes to a bi-plot, removing connectors, and modifying titles'----

  biplot(p,
    colby = 'ER', colkey = c('ER+'='forestgreen', 'ER-'='purple'),
    hline = 0, vline = c(-25, 0, 25),
    legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0,
    shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
    drawConnectors = FALSE,
    title = 'PCA bi-plot',
    subtitle = 'PC1 versus PC2',
    caption = '27 PCs == 80%')


## ----ex10, fig.height = 6, fig.width = 8, fig.cap = 'Figure 10: removing labels, modifying line types, removing gridlines, and increasing point size in a bi-plot'----

  biplot(p,
    lab = FALSE,
    colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'),
    hline = 0, vline = c(-25, 0, 25),
    vlineType = c('dotdash', 'solid', 'dashed'),
    gridlines.major = FALSE, gridlines.minor = FALSE,
    pointSize = 5,
    legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0,
    shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
    drawConnectors = FALSE,
    title = 'PCA bi-plot',
    subtitle = 'PC1 versus PC2',
    caption = '27 PCs == 80%')


## ----ex11, eval = FALSE, fig.height = 6, fig.width = 8, fig.cap = 'Figure 11: colouring by a continuous variable and plotting other PCs in a bi-plot'----
#  
#    biplot(p, x = 'PC10', y = 'PC50',
#      lab = FALSE,
#      colby = 'Age',
#      hline = 0, vline = 0,
#      hlineWidth = 1.0, vlineWidth = 1.0,
#      gridlines.major = FALSE, gridlines.minor = TRUE,
#      pointSize = 5,
#      legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0,
#      shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
#      drawConnectors = FALSE,
#      title = 'PCA bi-plot',
#      subtitle = 'PC10 versus PC50',
#      caption = '27 PCs == 80%')
#  

## ----ex12, eval = FALSE, fig.height = 8, fig.width = 7, fig.cap = 'Figure 12: maximising available space in a pairs plot'----
#  
#    pairsplot(p,
#      components = getComponents(p, c(1:10)),
#      triangle = TRUE, trianglelabSize = 12,
#      hline = 0, vline = 0,
#      pointSize = 0.4,
#      gridlines.major = FALSE, gridlines.minor = FALSE,
#      colby = 'Grade',
#      title = 'Pairs plot', plotaxes = FALSE,
#      margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))
#  

## ----ex13, eval = FALSE, fig.height = 6, fig.width = 9, fig.cap = 'Figure 13: arranging a pairs plot horizontally'----
#  
#    pairsplot(p,
#      components = getComponents(p, c(4,33,11,1)),
#      triangle = FALSE,
#      hline = 0, vline = 0,
#      pointSize = 0.8,
#      gridlines.major = FALSE, gridlines.minor = FALSE,
#      colby = 'ER',
#      title = 'Pairs plot', plotaxes = TRUE,
#      margingaps = unit(c(0.1, 0.1, 0.1, 0.1), 'cm'))
#  

## ----ex14, fig.height = 6, fig.width = 9, fig.cap = 'Figure 14: modifying cut-off for labeling in a loadings plot'----

  plotloadings(p,
    rangeRetain = 0.01,
    labSize = 3.0,
    title = 'Loadings plot',
    subtitle = 'PC1, PC2, PC3, PC4, PC5',
    caption = 'Top 1% variables',
    shape = 24,
    col = c('limegreen', 'black', 'red3'),
    drawConnectors = TRUE)


## ----eval=TRUE-----------------------------------------------------------

  library(biomaRt)

  mart <- useMart('ENSEMBL_MART_ENSEMBL')
  mart <- useDataset('hsapiens_gene_ensembl', mart)

  getBM(mart = mart,
    attributes = c('affy_hg_u133a', 'ensembl_gene_id',
      'gene_biotype', 'external_gene_name'),
    filter = 'affy_hg_u133a',
    values = c('215281_x_at', '214464_at', '211122_s_at', '205225_at',
      '202037_s_at', '204540_at', '215176_x_at', '205044_at', '208650_s_at',
      '205380_at'),
    uniqueRows = TRUE)


## ----ex15, eval = FALSE, fig.height = 9, fig.width = 11, fig.cap = 'Figure 15: plotting absolute component loadings'----
#  
#    plotloadings(p,
#      components = getComponents(p, c(4,33,11,1)),
#      rangeRetain = 0.1,
#      labSize = 3.0,
#      absolute = FALSE,
#      title = 'Loadings plot',
#      subtitle = 'Misc PCs',
#      caption = 'Top 10% variables',
#      shape = 23, shapeSizeRange = c(1, 16),
#      col = c('white', 'pink'),
#      drawConnectors = FALSE)
#  

## ----ex16, fig.height = 4, fig.width = 12, fig.cap = 'Figure 16: correlating PCs that account for at least 80% variation to clinical variables'----

  eigencorplot(p,
    components = getComponents(p, 1:27),
    metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'),
    col = c('darkblue', 'blue2', 'black', 'red2', 'darkred'),
    cexCorval = 0.7,
    colCorval = 'white',
    fontCorval = 2,
    posLab = 'bottomleft',
    rotLabX = 45,
    posColKey = 'top',
    cexLabColKey = 1.5,
    scale = TRUE,
    main = 'PC1-27 clinical correlations',
    colFrame = 'white',
    plotRsquared = FALSE)


## ----ex17, eval = FALSE, fig.height = 5, fig.width = 12, fig.cap = 'Figure 17: modifying cut-offs and symbols for statistical significance in eigencorplot'----
#  
#    eigencorplot(p,
#      components = getComponents(p, 1:10),
#      metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'),
#      col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
#      cexCorval = 1.2,
#      fontCorval = 2,
#      posLab = 'all',
#      rotLabX = 45,
#      scale = TRUE,
#      main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ clinical ~ correlates),
#      plotRsquared = TRUE,
#      corFUN = 'pearson',
#      corUSE = 'pairwise.complete.obs',
#      signifSymbols = c('****', '***', '**', '*', ''),
#      signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))
#  

## ----ex18, eval = FALSE, fig.height = 10, fig.width = 15, fig.cap = 'Figure 18: a merged panel of all PCAtools plots'----
#  
#    pscree <- screeplot(p, components = getComponents(p, 1:30),
#      hline = 80, vline = 27, axisLabSize = 10, returnPlot = FALSE) +
#      geom_text(aes(20, 80, label = '80% explained variation', vjust = -1))
#  
#    ppairs <- pairsplot(p, components = getComponents(p, c(1:3)),
#      triangle = TRUE, trianglelabSize = 12,
#      hline = 0, vline = 0,
#      pointSize = 0.8, gridlines.major = FALSE, gridlines.minor = FALSE,
#      colby = 'Grade',
#      title = '', titleLabSize = 16, plotaxes = FALSE,
#      margingaps = unit(c(0.01, 0.01, 0.01, 0.01), 'cm'),
#      returnPlot = FALSE)
#  
#    pbiplot <- biplot(p, lab = FALSE,
#      colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'),
#      hline = 0, vline = c(-25, 0, 25), vlineType = c('dotdash', 'solid', 'dashed'),
#      gridlines.major = FALSE, gridlines.minor = FALSE,
#      pointSize = 2, axisLabSize = 12,
#      legendPosition = 'left', legendLabSize = 10, legendIconSize = 3.0,
#      shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
#      drawConnectors = FALSE,
#      title = 'PCA bi-plot', subtitle = 'PC1 versus PC2',
#        caption = '27 PCs == 80%',
#      returnPlot = FALSE)
#  
#    ploadings <- plotloadings(p, rangeRetain = 0.01, labSize = 2.5,
#      title = 'Loadings plot', axisLabSize = 12,
#      subtitle = 'PC1, PC2, PC3, PC4, PC5',
#      caption = 'Top 1% variables',
#      shape = 24, shapeSizeRange = c(4, 4),
#      col = c('limegreen', 'black', 'red3'),
#      legendPosition = 'none',
#      drawConnectors = FALSE,
#      returnPlot = FALSE)
#  
#    peigencor <- eigencorplot(p,
#      components = getComponents(p, 1:10),
#      metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'),
#      #col = c('royalblue', '', 'gold', 'forestgreen', 'darkgreen'),
#      cexCorval = 0.6,
#      fontCorval = 2,
#      posLab = 'all',
#      rotLabX = 45,
#      scale = TRUE,
#      main = "PC clinical correlates",
#      cexMain = 1.5,
#      plotRsquared = FALSE,
#      corFUN = 'pearson',
#      corUSE = 'pairwise.complete.obs',
#      signifSymbols = c('****', '***', '**', '*', ''),
#      signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
#      returnPlot = FALSE)
#  
#      library(cowplot)
#      library(ggplotify)
#  
#      top_row <- plot_grid(pscree, ppairs, pbiplot,
#        ncol = 3,
#        labels = c('A', 'B  Pairs plot', 'C'),
#        label_fontfamily = 'serif',
#        label_fontface = 'bold',
#        label_size = 22,
#        align = 'h',
#        rel_widths = c(1.05, 0.9, 1.05))
#  
#      bottom_row <- plot_grid(ploadings,
#        as.grob(peigencor),
#        ncol = 2,
#        labels = c('D', 'E'),
#        label_fontfamily = 'serif',
#        label_fontface = 'bold',
#        label_size = 22,
#        align = 'h',
#        rel_widths = c(1.5, 1.5))
#  
#      plot_grid(top_row, bottom_row, ncol = 1, rel_heights = c(1.0, 1.0))
#  

## ------------------------------------------------------------------------

sessionInfo()