|
|
|
@ -32,7 +32,7 @@ install.packages("scales")
|
|
|
|
|
# Save and load workspace |
|
|
|
|
load("R_intro.Rdata") |
|
|
|
|
save.image("R_intro.Rdata") |
|
|
|
|
# COMMENT: Don't save .Rdata when prompted upon closing R. Either explicitly specifiy a file or don't save the workspace. |
|
|
|
|
# COMMENT: Don't save .Rdata when prompted upon closing R. Either explicitly specify a file or don't save the workspace. |
|
|
|
|
# In combination with the setwd() command, you will always know which data you are working with. |
|
|
|
|
# COMMENT: I usually comment out the save.image() command so avoid accidentally overwriting an existing workspace |
|
|
|
|
|
|
|
|
@ -45,7 +45,7 @@ save.image("R_intro.Rdata")
|
|
|
|
|
|
|
|
|
|
# QUESTION: What data type are you familiar with? |
|
|
|
|
|
|
|
|
|
# For demonstration purposes, let's make up some data to look at common data and object types in R. |
|
|
|
|
# For demonstration purposes, let's make up some data to look at common data and object types in R. |
|
|
|
|
# For this we will use several R functions. Have a look at the help of these functions to find out more if you are interested. |
|
|
|
|
|
|
|
|
|
# vectors #### |
|
|
|
@ -54,7 +54,7 @@ save.image("R_intro.Rdata")
|
|
|
|
|
# They usually consist of the series of individual values. |
|
|
|
|
# Those values must have the same data type. |
|
|
|
|
|
|
|
|
|
# Here we assign (whith <-) a series of values, specifically 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, to the R object x |
|
|
|
|
# Here we assign (with <-) a series of values, specifically 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, to the R object x |
|
|
|
|
x <- c(1:5, 16:20) |
|
|
|
|
|
|
|
|
|
# Show data type: |
|
|
|
@ -99,6 +99,8 @@ as.numeric(as.factor(a))
|
|
|
|
|
# Why does as.numeric(as.factor(x)) not reproduce the original numeric vector? |
|
|
|
|
|
|
|
|
|
as.factor(x) |
|
|
|
|
levels(as.factor(x)) |
|
|
|
|
str(as.factor(x)) |
|
|
|
|
as.numeric(as.factor(x)) |
|
|
|
|
as.numeric(as.character(as.factor(x))) |
|
|
|
|
|
|
|
|
@ -108,6 +110,9 @@ as.numeric(as.character(as.factor(x)))
|
|
|
|
|
|
|
|
|
|
sampling_time <- c("2022-03-02 12:00:00", "2022-03-04 12:00:00", "2022-03-10 12:00:00") |
|
|
|
|
sampling_time_posix <- as.POSIXlt(sampling_time, format = "%Y-%m-%d %H:%M:%S", tz = "CET") |
|
|
|
|
# sampling_time <- c("02.03.2022 12:00:00.234") |
|
|
|
|
# sampling_time_posix <- as.POSIXlt(sampling_time, format = "%d.%m.%Y %H:%M:%OS", tz = "CET") |
|
|
|
|
|
|
|
|
|
# COMMENT: If not setting a time zone (tz) make sure that R takes the correct one. |
|
|
|
|
# This is especially important if measuring instruments run in a |
|
|
|
|
# different time zone than the computer for data analysis. |
|
|
|
@ -116,8 +121,9 @@ sampling_time_posix <- as.POSIXlt(sampling_time, format = "%Y-%m-%d %H:%M:%S", t
|
|
|
|
|
sampling_values <- c(1.5, 2, 1.7) |
|
|
|
|
plot(sampling_time_posix, sampling_values) |
|
|
|
|
# COMMENT: Notice that the x-axis is on a numeric scale |
|
|
|
|
as.numeric(sampling_time_posix) |
|
|
|
|
|
|
|
|
|
# Other useful functios include: |
|
|
|
|
# Other useful functions include: |
|
|
|
|
?as.Date |
|
|
|
|
?Sys.setlocale |
|
|
|
|
?axis.POSIXct |
|
|
|
@ -214,8 +220,8 @@ rownames(DF) <- c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S11")
|
|
|
|
|
|
|
|
|
|
# There are several way how you can navigate R objects and retrieve specific information from them. |
|
|
|
|
# Square brackets: |
|
|
|
|
# For 2-dimensional objects square brackets can be used to target ```[rows, columns]``` by index (row or column number) or name. |
|
|
|
|
# For 1-dimensional objects, the ```[position]``` is specified (again either by index or name). |
|
|
|
|
# For 2-dimensional objects square brackets can be used to target [rows, columns] by index (row or column number) or name. |
|
|
|
|
# For 1-dimensional objects, the [position] is specified (again either by index or name). |
|
|
|
|
# The symbol used in the substructure of an object (usually $ or @) followed by the name of the target element. |
|
|
|
|
# This only works with named elements of lists, incl. columns in data frames, and not vectors and matrices. |
|
|
|
|
|
|
|
|
@ -236,7 +242,7 @@ rownames(DF) <- c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S11")
|
|
|
|
|
# Select the values for the columns V3 and V5 of row S1 in matrix X |
|
|
|
|
|
|
|
|
|
# Select the column c, a, and x from data frame DF with and without changing the original order |
|
|
|
|
# (hint: consider using the logical operator in%) |
|
|
|
|
# (hint: consider using the logical operator %in%) |
|
|
|
|
|
|
|
|
|
# Subset data frame DF to all rows where the values of column y are larger than 3 |
|
|
|
|
|
|
|
|
@ -282,16 +288,17 @@ str(L$int)
|
|
|
|
|
rm(list = ls()) |
|
|
|
|
|
|
|
|
|
# Then, we can read the new tables: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# community composition |
|
|
|
|
ASV <- read.table( |
|
|
|
|
"asvtab.txt", # file name |
|
|
|
|
h = T, # first line contains column names |
|
|
|
|
sep = "\t", # values separated by tabstops |
|
|
|
|
sep = "\t", # values separated by tab stops |
|
|
|
|
row.names = 1 # first row contains row names, i.e. OTU names |
|
|
|
|
) |
|
|
|
|
head(ASV) |
|
|
|
|
|
|
|
|
|
# bottom water data (carbonate chemistry, nutrients) |
|
|
|
|
# metadata and contextual information (environmental parameters) |
|
|
|
|
META <- read.table( |
|
|
|
|
"metadata.txt", |
|
|
|
|
h = T, |
|
|
|
@ -364,12 +371,12 @@ quantile(META$cell_count, seq(0, 1, 0.1))
|
|
|
|
|
|
|
|
|
|
# Casting data (similar to pivot tables) |
|
|
|
|
# This function comes from the package reshape, but here are several other options to achieve the same thing in R. |
|
|
|
|
cell_counts_overview <- cast(META, "day ~ salinity", mean, value = "cell_count") |
|
|
|
|
cell_counts_overview <- cast(META, "day ~ salinity", mean, value = "cell_count", fill = 0) |
|
|
|
|
|
|
|
|
|
# TASK: Modify the command so that instead of NA, the table is filled with zero. |
|
|
|
|
# This is just an exercise. Never just replace NA with zero without considering the meaning of the NA first. |
|
|
|
|
|
|
|
|
|
# TASK: Generate a table with the mean and temperature for each combination of experimental conditions. |
|
|
|
|
# TASK: Generate a table with the mean of cell count and temperature for each combination of experimental conditions. |
|
|
|
|
|
|
|
|
|
# Community composition is often expressed in percentages |
|
|
|
|
# Converting counts to percentages |
|
|
|
@ -382,11 +389,11 @@ ASV.rel <- prop.table(ASV, 2) * 100
|
|
|
|
|
### Loops #### |
|
|
|
|
|
|
|
|
|
# To get an overview of community composition, let's calculate the percentages at all available taxonomic levels (domain to genus). |
|
|
|
|
# Summarizing data like this is a repeptitive tasks and can be coded in a loop: |
|
|
|
|
# Summarizing data like this is a repetitive tasks and can be coded in a loop: |
|
|
|
|
|
|
|
|
|
# Create an empty list |
|
|
|
|
TAX.pooled <- vector(mode = "list", length = 6) |
|
|
|
|
names(TAX.pooled) <- colnames(TAX)[1:6] # domain to genus |
|
|
|
|
names(TAX.pooled) <- colnames(TAX) # domain to genus |
|
|
|
|
|
|
|
|
|
# Fill elements of list with data frames with sequence counts per taxon, |
|
|
|
|
# one taxonomic level at a time |
|
|
|
@ -409,7 +416,7 @@ str(TAX.pooled)
|
|
|
|
|
|
|
|
|
|
### Some very basic plots #### |
|
|
|
|
|
|
|
|
|
# Before starting any kind of statical analysis, ALWAYS look at the data. |
|
|
|
|
# Before starting any kind of statistical analysis, ALWAYS look at the data. |
|
|
|
|
# The below examples are for data exploration only and |
|
|
|
|
# do not necessarily represent publication quality data visualization approaches. |
|
|
|
|
|
|
|
|
@ -424,14 +431,20 @@ plot(
|
|
|
|
|
|
|
|
|
|
# If there are more than 2 numeric variables (or those that can be coerced as such), |
|
|
|
|
# the following will create all pairwise scatter plots: |
|
|
|
|
pairs(META[, c(1, 2, 4, 6, 9)]) |
|
|
|
|
pairs(META[, c(1, 2, 4:6)]) |
|
|
|
|
|
|
|
|
|
# Boxplots #### |
|
|
|
|
boxplot( |
|
|
|
|
log(cell_count) ~ day * salinity, # formula for how to summarize the data in the boxplot |
|
|
|
|
data = META, # object that contains the data |
|
|
|
|
las = 2, # plot tick labels perpendicular to axis |
|
|
|
|
col = rep(c("yellow2", "green", "red", "darkorchid2", "dodgerblue", "black"), 5) # define colors for the boxes |
|
|
|
|
las = 2 # plot tick labels perpendicular to axis |
|
|
|
|
# col = rep(day.color, 4) # define colors for the boxes |
|
|
|
|
) |
|
|
|
|
points( |
|
|
|
|
as.numeric(interaction(META$day, META$salinity)), |
|
|
|
|
log(META$cell_count), |
|
|
|
|
bg = META$color, # background color of symbols |
|
|
|
|
pch = META$symbol # point character |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# QUESTION: What do you notice when looking at d0 in the plot? |
|
|
|
@ -440,7 +453,7 @@ boxplot(
|
|
|
|
|
# Histograms #### |
|
|
|
|
hist( |
|
|
|
|
META$cell_count, |
|
|
|
|
breaks = 20 |
|
|
|
|
breaks = 10 |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# TASK: Modify the breaks argument to change the size of the bins for each bar in the histogram. |
|
|
|
@ -510,6 +523,8 @@ write.table(
|
|
|
|
|
# E.g. to ave lists and other objects which can't easily be represented in a tabular form. |
|
|
|
|
# Use readRDS() to read such data into R again |
|
|
|
|
saveRDS(TAX.pooled, "tax_summaries.rds") |
|
|
|
|
TAX.pooled2 <- readRDS("tax_summaries.rds") |
|
|
|
|
str(TAX.pooled2) |
|
|
|
|
|
|
|
|
|
# TASK: Use a loop (or any other approach) to save the individual elements of TAX.pooled.rel |
|
|
|
|
# (with values rounded to 2 decimals) as separate tables. |
|
|
|
@ -673,6 +688,8 @@ plan(multicore, workers = 60)
|
|
|
|
|
knots <- 100 # pick 100 sequencing depth for rarefaction |
|
|
|
|
n <- 10 # randomly subsample 10 times at each sequencing depth |
|
|
|
|
|
|
|
|
|
# DO NOT RUN THIS UNLESS YOU HAVE ACCESS TO >10 CORES FOR PARALLEL COMPUTING!!! |
|
|
|
|
|
|
|
|
|
# compare speed |
|
|
|
|
system.time( |
|
|
|
|
map( |
|
|
|
|