This notebook begins analysis with already cleaned GC/MS data. I’ve already calculated relative peak areas (RPA) subtracted field blanks, and replaced any zeroes or NAs with 1/peak area of internal standard.
library(tidyverse)
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.0.0 [32m✔[30m [34mpurrr [30m 0.2.5
[32m✔[30m [34mtibble [30m 1.4.2 [32m✔[30m [34mdplyr [30m 0.7.6
[32m✔[30m [34mtidyr [30m 0.8.1 [32m✔[30m [34mstringr[30m 1.3.1
[32m✔[30m [34mreadr [30m 1.1.1 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mpurrr[30m::[32mmap()[30m masks [34mmclust[30m::map()[39m
library(here)
here() starts at /Users/scottericr/Documents/Tufts/Research Projects/BACE Tea/Data for drought by herbivory in tea
library(gglabeller)
library(cowplot)
Attaching package: ‘cowplot’
The following object is masked from ‘package:ggplot2’:
ggsave
library(ggrepel)
library(ropls)
library(HDoutliers)
`%!in%` <- Negate(`%in%`)
#detect if a column has at least one non-zero value. Probably other ways to do this.
not_empty <- function(x) !is.double(x) || is.double(x) && sum(x) != 0
#function to select columns that are NOT double, i.e. integer, factor, and character
not_double <- function(x) !is.double(x)
gc <- read_rds(here("cleaned_data", "GCMS.rds"))
gc_zeroes <- read_rds(here("cleaned_data", "GCMS_zeroes.rds"))
Create a “wide” version of the data frame for use in multivarite analyses
gc_wide <- gc %>%
select(sample, Treatment, Spray, Plot, Plant, Compound, RPA) %>%
spread(key = Compound, value = RPA)
I’ll use a combination of univariate scatter plots and preliminary PCA to detect outliers. ## Prep Data
p1 <- ggplot(plotdata[[1]], aes(x = shortname, y = RPA, color = Spray)) + horiz_scatter() + theme_bw()
p2 <- ggplot(plotdata[[2]], aes(x = shortname, y = RPA, color = Spray)) + horiz_scatter() + theme_bw()
p3 <- ggplot(plotdata[[3]], aes(x = shortname, y = RPA, color = Spray)) + horiz_scatter() + theme_bw()
p4 <- ggplot(plotdata[[4]], aes(x = shortname, y = RPA, color = Spray)) + horiz_scatter() + theme_bw()
A recent update to gglabeller broke this code I think. I think I need to explicitly drop unused levels from sample
for this to work.
p1.labeled <- gglabeller(p1, aes(label = plotdata[[1]]$sample))
p1 <- p1.labeled$plot
p2.labeled <- gglabeller(p2, aes(label = plotdata[[2]]$sample))
p2 <- p2.labeled$plot
p3.labeled <- gglabeller(p3, aes(label = plotdata[[3]]$sample))
p3 <- p3.labeled$plot
p4.labeled <- gglabeller(p4, aes(label = plotdata[[4]]$sample))
p4 <- p4.labeled$plot
Produced by running the chunk above interactively and copying the plot data from, e.g., p1.labeled$code
gglabeller_data <- p1$data
gglabeller_data$gglabeller_labels <- plotdata[[1]]$sample
gglabeller_data[c(1:483, 485:647, 649:659, 661:750, 752:804, 806, 808:816, 818:823, 826, 828:829, 831:1426, 1428:1536),'gglabeller_labels'] <- ''
p1 <- p1 + geom_text_repel(data = gglabeller_data,mapping = aes(label = gglabeller_labels), min.segment.length = unit(0.5, 'lines'),box.padding = unit(0.25, 'lines'),point.padding = unit(1e-06, 'lines'))
gglabeller_data <- p2$data
gglabeller_data$gglabeller_labels <- plotdata[[2]]$sample
gglabeller_data[c(1:133, 135, 137:364, 366:430, 432:478, 480:891, 893:1009, 1011:1161, 1163:1255, 1257:1296, 1298:1299, 1301:1417, 1419:1435, 1437:1457, 1459:1536),'gglabeller_labels'] <- ''
p2 <- p2 + geom_text_repel(data = gglabeller_data,mapping = aes(label = gglabeller_labels), min.segment.length = unit(0.5, 'lines'),box.padding = unit(0.25, 'lines'),point.padding = unit(1e-06, 'lines'))
gglabeller_data <- p3$data
gglabeller_data$gglabeller_labels <- plotdata[[3]]$sample
gglabeller_data[c(1:44, 46:288, 290:357, 360:473, 475:505, 507:761, 763:812, 814:821, 823:1287, 1289:1413, 1415:1424, 1426:1432, 1434:1536),'gglabeller_labels'] <- ''
p3 <- p3 + geom_text_repel(data = gglabeller_data,mapping = aes(label = gglabeller_labels), min.segment.length = unit(0.5, 'lines'),box.padding = unit(0.25, 'lines'),point.padding = unit(1e-06, 'lines'))
gglabeller_data <- p4$data
gglabeller_data$gglabeller_labels <- plotdata[[4]]$sample
gglabeller_data[c(1:41, 43:71, 73:354, 356:462, 464:494, 496:502, 504:823, 826:831, 833:1293, 1295:1417, 1419:1424, 1426:1536),'gglabeller_labels'] <- ''
p4<- p4 + geom_text_repel(data = gglabeller_data,mapping = aes(label = gglabeller_labels), min.segment.length = unit(0.5, 'lines'),box.padding = unit(0.25, 'lines'),point.padding = unit(1e-06, 'lines'))
legend <- get_legend(p1)
grid1 <- plot_grid(p1 + theme(legend.position = "none"),
p2 + theme(legend.position = "none"),
ncol = 2, nrow = 1)
grid2 <- plot_grid(grid1, legend, rel_widths = c(3, 0.4))
grid3 <- plot_grid(p3 + theme(legend.position = "none"),
p4 + theme(legend.position = "none"),
ncol = 2, nrow = 1)
grid4 <- plot_grid(grid3, legend, rel_widths = c(3, 0.4))
Looks like e6 and maybe c10 and i4 are outliers. e6 especially is showing up often as extreme values and with extreme values that are very extreme.
The data are also not normally distributed.
I also conduct a PCA with opls()
which will give me a diagnostic plot that identifies outliers.
pca_pre <- opls(select_if(gc_wide, is.double), plotL = FALSE, scale = "standard")
PCA
48 samples x 128 variables
standard scaling of predictors
plot(pca_pre, parLabVc = gc_wide$sample)
after log transformation:
gc_log <- gc_wide %>% mutate_if(is.double, log)
pca_pre_log <- opls(select_if(gc_log, is.double), plotL = FALSE, scale = "standard")
PCA
48 samples x 128 variables
standard scaling of predictors
plot(pca_pre_log, parLabVc = gc_log$sample)
Before log transformation, e6 and i4 are identified as possible outliers. After log transformation, e6, c6 and c9 are identified as outliers.
HDoutliers also identifies e6, but log transormation seems to fix it.
e6 is for sure an outlier based on univariate plots. I’ll also remove c6 and c9 and use the log transformed data from here on out.
outliers <- c("e6", "c6", "c9")
gc_log.1 <- gc_log %>%
filter(sample %!in% outliers) %>%
select_if(not_empty) #and check for empty columns
gc_zeroes.1 <- gc_zeroes %>%
filter(sample %!in% outliers) %>%
select_if(not_empty)
Save data frames with outliers removed for analysis step.
gc_log.1
gc_zeroes.1
write_rds(gc_log.1, here("cleaned_data", "GCMS_wide_loged.rds"))
write_rds(gc_zeroes.1, here("cleaned_data", "GCMS_tidy_zeroes.rds"))