### To run the code, download the supplementary table included in the ### online version of the paper, and save the file as csv file format in ### the working directory of R or R studio. ########################################################################## ### PPJ survey survey = read.csv("PPJ_Survey.csv", header = T, sep = ",") head(survey) dim(survey) # Removing extra rows survey = survey[1:129, -1] head(survey); tail(survey) Error_bar = data.frame(table(survey$SD_SE)) Error_bar$value = round(Error_bar$Freq/sum(Error_bar$Freq)*100, 1) lab1 = paste0(Error_bar$Var1, " (", Error_bar$value, "%)") P_value = data.frame(table(survey$Pvalue)) P_value$value = round(P_value$Freq/sum(P_value$Freq)*100, 1) lab2 = paste0(P_value$Var1, " (", P_value$value, "%)") Pseudo_rep = data.frame(table(survey$Pseudo.rep)) Pseudo_rep$value = round(Pseudo_rep$Freq/sum(Pseudo_rep$Freq)*100, 1) lab3 = paste0(Pseudo_rep$Var1, " (", Pseudo_rep$value, "%)") Description = data.frame(table(survey$Description)) Description$value = round(Description$Freq/sum(Description$Freq)*100, 1) lab4 = paste0(Description$Var1, " (", Description$value, "%)") library(ggpubr) p1 = ggpie(Error_bar, "value", label = lab1, fill = "Var1", color = "white", lab.adjust = 8, palette = get_palette(c("#00AFBB", "#E7B800", "#FC4E07"), 3), lab.pos = "in", lab.font = c(8, "bold", "black")) + theme(legend.position = "none") p2 = ggpie(P_value, "value", label = lab2, fill = "Var1", color = "white", lab.adjust = 8, palette = get_palette(c("#00AFBB", "#E7B800", "#FC4E07"), 3), lab.pos = "in", lab.font = c(8, "bold", "black")) + theme(legend.position = "none") p3 = ggpie(Pseudo_rep, "value", fill = "Var1", color = "white", label = lab3, lab.adjust = 4, palette = get_palette(c("#00AFBB", "#E7B800", "#FC4E07"), 4), lab.pos = "in", lab.font = c(8, "bold", "black")) + theme(legend.position = "none") p4 = ggpie(Description, "value", fill = "Var1", color = "white", label = lab4, lab.adjust = 4, palette = get_palette(c("#00AFBB", "#E7B800", "#FC4E07"), 3), lab.pos = "in", lab.font = c(8, "bold", "black")) + theme(legend.position = "none") ggarrange(p1, p2, p3, p4, labels = c("A", "B", "C", "D"), font.label = list(size = 18), label.x = 0.2, label.y = 0.9, ncol = 2, nrow = 2, align = 'v') ############################################################################################# ### Issues of technical reps and biological reps (associated with pseudo-replication) # Making up pseudo-data for demonstration purpose n = 3; ctr_mean = 8; trt_mean = 6 set.seed(20) ctr1 = rnorm(n, ctr_mean, sd = 0.5); trt1 = rnorm(n, trt_mean, sd = 0.5) ctr2 = rnorm(n, ctr_mean, sd = 0.7); trt2 = rnorm(n, trt_mean, sd = 0.7) ctr3 = rnorm(n, ctr_mean, sd = 0.5); trt3 = rnorm(n, trt_mean, sd = 0.9) data = data.frame(Exp = rep(c("Control", "Treatment"), each = 9), Measurement = c(ctr1, ctr2, ctr3, trt1, trt2, trt3), Replicate = as.factor(rep(rep(1:3, each = 3), 2))) library(ggpubr) p = ggdotplot(data, "Exp", "Measurement", fill = "Replicate", color = "Replicate", palette = 'jco', binwidth = 0.12, add = "mean_sd", add.params = list(color = "black")) + labs(x = NULL, y= NULL) + facet_wrap(.~ Replicate) ggpar(p, legend = "right", font.legend = c(20), font.tickslab = c(20), xtickslab.rt = 45, ytickslab.rt = 0, font.x = c(21), font.y = c(21)) # Defining bar_plot function to be used repeatedly in each scenario library(ggplot2) bar_plot = function(df){ ggplot(df, aes(x = Exp, y = Mean)) + geom_bar(position = position_dodge(), stat = 'identity', fill = "lightgrey", width = 0.5) + ylim(0, 9) + labs(x = NULL, y = NULL) + geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.1, position = position_dodge(0.9)) + theme_pubr(base_size = 15) } ## Scenario 1 # t.test for each combination of control and treatment t.test(ctr1, trt1)$p.value # significant diff. t.test(ctr2, trt2)$p.value # marginally significant diff. t.test(ctr3, trt3)$p.value # highly significant diff. # all comparisons are significant anyway, so you might decide # to use just one set of data, in this case, the first set for # presentation. So you will report p-value = 0.001694895, and claim # that there is statistically significant diff. between control and # treatment. data1 = subset(data, Replicate == 1) df1 = as.data.frame(cbind(tapply(data1$Measurement, data1$Exp, mean), tapply(data1$Measurement, data1$Exp, sd)/(n^0.5))) names(df1) = c("Mean", "SE") df1$Exp = rownames(df1) rownames(df1) = c() bar_plot(df1) ## Scenario 2 # Alternatively you may decide to use all the data including technical replicates # for calculation of means and standard error df2 = as.data.frame(cbind(tapply(data$Measurement, data$Exp, mean), tapply(data$Measurement, data$Exp, sd)/(9^0.5))) # now sample size is 9, not 3 names(df2) = c("Mean", "SE") df2$Exp = rownames(df2) rownames(df2) = c() bar_plot(df2) t.test(data$Measurement ~ data$Exp, data = data) # highly significant diff., # altogether diff between means is smaller than in the previous scenario, due to # increase in sample size. But is this the right approach? ## Scenario 3 # Lastly, you may want to take mean of each biological reps and use it library(dplyr) df_m = data %>% group_by(Exp, Replicate) %>% summarize(Mean = mean(Measurement)) df3 = as.data.frame(cbind(tapply(df_m$Mean, df_m$Exp, mean), tapply(df_m$Mean, df_m$Exp, sd)/(n))) names(df3) = c("Mean", "SE") df3$Exp = rownames(df3) rownames(df3) = c() bar_plot(df3) t.test(df_m$Mean ~ df_m$Exp, df_m) # P < 0.05 but not as significant as the the previous scenario ### Linear mixed model can make comparison between control and treatment, accounting for dependence among data points. library(lme4) data.nest.model = lmer(Measurement ~ Exp + (1|Replicate), data = data, REML = F) summary(data.nest.lme) # note that this does not provide p-value. # Unfortunately, p-values for mixed models aren’t as straightforward as # they are for the linear model. There are multiple approaches, and there are # different opinions about which approach is the best. # Here, I use the likelihood ratio test as a means # to obtain p-values. data.nest.null = lmer(Measurement ~ 1|Replicate, data = data, REML = F) anova(data.nest.null, data.nest.model) # If you look carefully for p-value part, then you can see difference # between control and treatment is quite significant! ########################################################################## ### p-value issues ## Comparison of two groups having two different means # a function to draw the plot for visual inspection of two groups data_gen1 = function(n, mean1, mean2, sd){ x = rnorm(n, mean1, sd) y = rnorm(n, mean2, sd) data = data.frame(value = c(x,y), Group = c(rep("Control",length(x)), rep("Treatment",length(y)))) } plot_2groups = function(x){ ggdotplot(x, x= "Group", y = "value", size = 2, fill = "Group", color = "Group", palette = 'jco', binwidth = 0.12, add = "mean_se", add.params = list(color = "lightgrey")) + labs(x = NULL, y= NULL) + stat_compare_means(method = 't.test', label.y = 9) + stat_compare_means(label.y = 8.5) } # Mean differences are the same across the data sets but standard deviations are different data1 = data_gen1(3, 3, 5, 0.5) data2 = data_gen1(3, 3, 5, 1) data3 = data_gen1(3, 3, 5, 2) # Drawing plots with P-values d1 = plot_2groups(data1); d1 = ggpar(d1, ylim = c(0, 10), legend.title = '') d2 = plot_2groups(data2); d2 = ggpar(d2, ylim = c(0, 10), legend.title = '') d3 = plot_2groups(data3); d3 = ggpar(d3, ylim = c(0, 10), legend.title = '') ggarrange(d1, d2, d3, nrow = 3, ncol = 1) # a function to generate data set and p-value of t.test data_gen = function(n, mean1, mean2, sd){ x = rnorm(n, mean1, sd) y = rnorm(n, mean2, sd) data = data.frame(value = c(x,y), Group = c(rep("Control",length(x)), rep("Treatment",length(y)))) test = t.test(x, y) return(test$p.value) } ### Simulating effects of sample size and variability among samples rep_n = 10000 # 10000 simulations par(mfrow = c(4, 2)) # Defining plot function to repeatedly use pvhist = function(df){ p = gghistogram(df, x = 'p_value', color = 'white', fill = 'steelblue', bins = 20, rug = T) + labs(x = 'P-value', y = 'Count') ggpar(p, font.tickslab = c(14), font.x = c(18), font.y = c(18), xtickslab.rt = 45) } # Two group of data having the same mean and variance data_gen(3, 3, 3, 0.5) pv = data.frame(p_value = replicate(rep_n, data_gen(3, 3, 3, 0.5))) ph = pvhist(pv); sum(pv<.05)/rep_n data_gen(6, 3, 3, 0.5) pv0 = data.frame(p_value = replicate(rep_n, data_gen(6, 3, 3, 0.5))) ph0 = pvhist(pv0); sum(pv0<.05)/rep_n # Two groups of data having different means and same small variance data_gen(3, 3, 5, 0.5) pv1 = data.frame(p_value = replicate(rep_n, data_gen(3, 3, 5, 0.5))) ph1 = pvhist(pv1); sum(pv1>.05)/rep_n data_gen(6, 3, 5, 0.5) # larger sample size pv2 = data.frame(p_value = replicate(rep_n, data_gen(6, 3, 5, 0.5))) ph2 = pvhist(pv2); sum(pv2>.05)/rep_n ## Two groups of data having larger variance data_gen(3, 3, 5, 1) pv3 = data.frame(p_value = replicate(rep_n, data_gen(3, 3, 5, 1))) ph3 = pvhist(pv3); sum(pv3>.05)/rep_n data_gen(6, 3, 5, 1) pv4 = data.frame(p_value = replicate(rep_n, data_gen(6, 3, 5, 1))) # increased sample size ph4 = pvhist(pv4); sum(pv4>.05)/rep_n data_gen(3, 3, 5, 1.5) pv5 = data.frame(p_value = replicate(rep_n, data_gen(3, 3, 5, 1.5))) ph5 = pvhist(pv5); sum(pv5>.05)/rep_n data_gen(6, 3, 5, 1.5) pv6 = data.frame(p_value = replicate(rep_n, data_gen(6, 3, 5, 1.5))) # increased sample size ph6 = pvhist(pv6); sum(pv6>.05)/rep_n ggarrange(ph, ph0, ph1, ph2, ph3, ph4, ph5, ph6, nrow = 4, ncol = 2)