diff --git a/R_intro_2022/README.md b/R_intro_2022/README.md index 3e9a2bd..1150e61 100644 --- a/R_intro_2022/README.md +++ b/R_intro_2022/README.md @@ -23,12 +23,12 @@ The workshop will consist of 2 sessions: The first session is focused more on th * R data and object types * Data and object type conversions * Navigating R objects -* Reading data into R + * Good coding practice ### Session 2: -* Q&A session 1 +* Reading data into R * Rearranging R objects * Loops * Rudimentary data exploration (plots) diff --git a/R_intro_2022/R_intro_script.R b/R_intro_2022/R_intro_script.R index dfea615..84fac9a 100644 --- a/R_intro_2022/R_intro_script.R +++ b/R_intro_2022/R_intro_script.R @@ -32,7 +32,7 @@ install.packages("scales") # Save and load workspace load("R_intro.Rdata") save.image("R_intro.Rdata") -# COMMENT: Don't save .Rdata when prompted upon closing R. Either explicitly specifiy a file or don't save the workspace. +# COMMENT: Don't save .Rdata when prompted upon closing R. Either explicitly specify a file or don't save the workspace. # In combination with the setwd() command, you will always know which data you are working with. # COMMENT: I usually comment out the save.image() command so avoid accidentally overwriting an existing workspace @@ -45,7 +45,7 @@ save.image("R_intro.Rdata") # QUESTION: What data type are you familiar with? -# For demonstration purposes, let's make up some data to look at common data and object types in R. +# For demonstration purposes, let's make up some data to look at common data and object types in R. # For this we will use several R functions. Have a look at the help of these functions to find out more if you are interested. # vectors #### @@ -54,7 +54,7 @@ save.image("R_intro.Rdata") # They usually consist of the series of individual values. # Those values must have the same data type. -# Here we assign (whith <-) a series of values, specifically 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, to the R object x +# Here we assign (with <-) a series of values, specifically 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, to the R object x x <- c(1:5, 16:20) # Show data type: @@ -99,6 +99,8 @@ as.numeric(as.factor(a)) # Why does as.numeric(as.factor(x)) not reproduce the original numeric vector? as.factor(x) +levels(as.factor(x)) +str(as.factor(x)) as.numeric(as.factor(x)) as.numeric(as.character(as.factor(x))) @@ -108,6 +110,9 @@ as.numeric(as.character(as.factor(x))) sampling_time <- c("2022-03-02 12:00:00", "2022-03-04 12:00:00", "2022-03-10 12:00:00") sampling_time_posix <- as.POSIXlt(sampling_time, format = "%Y-%m-%d %H:%M:%S", tz = "CET") +# sampling_time <- c("02.03.2022 12:00:00.234") +# sampling_time_posix <- as.POSIXlt(sampling_time, format = "%d.%m.%Y %H:%M:%OS", tz = "CET") + # COMMENT: If not setting a time zone (tz) make sure that R takes the correct one. # This is especially important if measuring instruments run in a # different time zone than the computer for data analysis. @@ -116,8 +121,9 @@ sampling_time_posix <- as.POSIXlt(sampling_time, format = "%Y-%m-%d %H:%M:%S", t sampling_values <- c(1.5, 2, 1.7) plot(sampling_time_posix, sampling_values) # COMMENT: Notice that the x-axis is on a numeric scale +as.numeric(sampling_time_posix) -# Other useful functios include: +# Other useful functions include: ?as.Date ?Sys.setlocale ?axis.POSIXct @@ -214,8 +220,8 @@ rownames(DF) <- c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S11") # There are several way how you can navigate R objects and retrieve specific information from them. # Square brackets: -# For 2-dimensional objects square brackets can be used to target ```[rows, columns]``` by index (row or column number) or name. -# For 1-dimensional objects, the ```[position]``` is specified (again either by index or name). +# For 2-dimensional objects square brackets can be used to target [rows, columns] by index (row or column number) or name. +# For 1-dimensional objects, the [position] is specified (again either by index or name). # The symbol used in the substructure of an object (usually $ or @) followed by the name of the target element. # This only works with named elements of lists, incl. columns in data frames, and not vectors and matrices. @@ -236,7 +242,7 @@ rownames(DF) <- c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S11") # Select the values for the columns V3 and V5 of row S1 in matrix X # Select the column c, a, and x from data frame DF with and without changing the original order -# (hint: consider using the logical operator in%) +# (hint: consider using the logical operator %in%) # Subset data frame DF to all rows where the values of column y are larger than 3 @@ -282,16 +288,17 @@ str(L$int) rm(list = ls()) # Then, we can read the new tables: - + +# community composition ASV <- read.table( "asvtab.txt", # file name h = T, # first line contains column names - sep = "\t", # values separated by tabstops + sep = "\t", # values separated by tab stops row.names = 1 # first row contains row names, i.e. OTU names ) head(ASV) -# bottom water data (carbonate chemistry, nutrients) +# metadata and contextual information (environmental parameters) META <- read.table( "metadata.txt", h = T, @@ -364,12 +371,12 @@ quantile(META$cell_count, seq(0, 1, 0.1)) # Casting data (similar to pivot tables) # This function comes from the package reshape, but here are several other options to achieve the same thing in R. -cell_counts_overview <- cast(META, "day ~ salinity", mean, value = "cell_count") +cell_counts_overview <- cast(META, "day ~ salinity", mean, value = "cell_count", fill = 0) # TASK: Modify the command so that instead of NA, the table is filled with zero. # This is just an exercise. Never just replace NA with zero without considering the meaning of the NA first. -# TASK: Generate a table with the mean and temperature for each combination of experimental conditions. +# TASK: Generate a table with the mean of cell count and temperature for each combination of experimental conditions. # Community composition is often expressed in percentages # Converting counts to percentages @@ -382,11 +389,11 @@ ASV.rel <- prop.table(ASV, 2) * 100 ### Loops #### # To get an overview of community composition, let's calculate the percentages at all available taxonomic levels (domain to genus). -# Summarizing data like this is a repeptitive tasks and can be coded in a loop: +# Summarizing data like this is a repetitive tasks and can be coded in a loop: # Create an empty list TAX.pooled <- vector(mode = "list", length = 6) -names(TAX.pooled) <- colnames(TAX)[1:6] # domain to genus +names(TAX.pooled) <- colnames(TAX) # domain to genus # Fill elements of list with data frames with sequence counts per taxon, # one taxonomic level at a time @@ -409,7 +416,7 @@ str(TAX.pooled) ### Some very basic plots #### -# Before starting any kind of statical analysis, ALWAYS look at the data. +# Before starting any kind of statistical analysis, ALWAYS look at the data. # The below examples are for data exploration only and # do not necessarily represent publication quality data visualization approaches. @@ -424,14 +431,20 @@ plot( # If there are more than 2 numeric variables (or those that can be coerced as such), # the following will create all pairwise scatter plots: -pairs(META[, c(1, 2, 4, 6, 9)]) +pairs(META[, c(1, 2, 4:6)]) # Boxplots #### boxplot( log(cell_count) ~ day * salinity, # formula for how to summarize the data in the boxplot data = META, # object that contains the data - las = 2, # plot tick labels perpendicular to axis - col = rep(c("yellow2", "green", "red", "darkorchid2", "dodgerblue", "black"), 5) # define colors for the boxes + las = 2 # plot tick labels perpendicular to axis + # col = rep(day.color, 4) # define colors for the boxes +) +points( + as.numeric(interaction(META$day, META$salinity)), + log(META$cell_count), + bg = META$color, # background color of symbols + pch = META$symbol # point character ) # QUESTION: What do you notice when looking at d0 in the plot? @@ -440,7 +453,7 @@ boxplot( # Histograms #### hist( META$cell_count, - breaks = 20 + breaks = 10 ) # TASK: Modify the breaks argument to change the size of the bins for each bar in the histogram. @@ -510,6 +523,8 @@ write.table( # E.g. to ave lists and other objects which can't easily be represented in a tabular form. # Use readRDS() to read such data into R again saveRDS(TAX.pooled, "tax_summaries.rds") +TAX.pooled2 <- readRDS("tax_summaries.rds") +str(TAX.pooled2) # TASK: Use a loop (or any other approach) to save the individual elements of TAX.pooled.rel # (with values rounded to 2 decimals) as separate tables. @@ -673,6 +688,8 @@ plan(multicore, workers = 60) knots <- 100 # pick 100 sequencing depth for rarefaction n <- 10 # randomly subsample 10 times at each sequencing depth +# DO NOT RUN THIS UNLESS YOU HAVE ACCESS TO >10 CORES FOR PARALLEL COMPUTING!!! + # compare speed system.time( map( diff --git a/R_intro_2022/R_intro_slides.pdf b/R_intro_2022/R_intro_slides.pdf index 003e72b..ab27987 100644 Binary files a/R_intro_2022/R_intro_slides.pdf and b/R_intro_2022/R_intro_slides.pdf differ