1 #---------------------------------#
2 # Overview of R #
3 #---------------------------------#
4 # The "#" symbol starts a comment
5 # After installing R from the internet, get the latest version of Bioconductor using the following
6 # commands, or load the Bioconductor (www.bioconductor.org) packages for working with
7 # RNA-Seq data. Bioconductor provides tools for the analysis and comprehension of high-
8 # throughput genomic data. Bioconductor uses the R statistical programming language, and is
9 # open source and open development. It has two releases each year, 1024 software packages,
10 # and an active user community.
11 # Go to the R console and type the following commands
12 library(edgeR) # loads edgeR; if it is not installed yet, install it as shown below
13 if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") # Bioconductor installer (replaces the retired biocLite.R script)
14 BiocManager::install("edgeR") # downloads and installs edgeR directly from the Bioconductor server
15 library(edgeR) # once installed, load it in your R session
16 # BiocManager::install() # updates all installed packages; it takes longer, so we install them one by one
17 #### Load the other packages the same way:
18 library(affy) # if it does not load, install it as below
19 BiocManager::install("affy")
20 # if a package does not load, install it as above
21 library(affycoretools) # if it does not load, install it as below
22 BiocManager::install("affycoretools")
23 library(limma) # provides plotDensities()
24 # during installation, if it asks whether to update packages, go with yes or all
25 dir() # lists the files and directories/folders you have in your current folder
26 library() # lists the libraries you have already installed
27 ls() # lists the variables you have so far; a variable is anything that can store value(s), and the type of
28 # a variable is determined by the value it stores.
29 #### Check the current working directory, which is where R reads from and writes to unless full
30 #### path names are given. Change the working directory to the one where the data is.
31 getwd() ### tells you where exactly you are right now
32 setwd("path_to_folder") # changes to the folder we want
33 # Need help? To find out what any command does, just type
34 help() # opens the help system; you can also ask about a specific topic with help(topic), e.g.
35 help("ls")
36 help("objects")
37 help("class") # or
38 ?class # works if its parent package is already loaded
39 ??class # searches the installed packages for the topic
40 # Just play with R: type any number at the prompt, optionally followed by a comment
41 1 #
42 x # gives an error (on purpose), since no variable named x has been defined yet
43 x <- 1 # creates a variable x (a numeric vector) and stores the value 1 in it;
44 # <- is the assignment operator, similar to =
45 x # now returns the variable x and its value
46 y <- 1:10 # generates the series of numbers from 1 to 10 and stores it in y
47 y
48 z <- 1,3,4 # gives an error (on purpose); the right way is to use c(), the concatenate function
49 z <- c(1,2,3,3)
50 z
51 # both x and y are also numeric vectors
52 v <- c('a', 'b', 'c') # we can store characters in v; v becomes a character vector. Let's check its
53 # class
54 class(v)
55 length(z) # returns how many items there are in a vector
56 unique(z) # returns the non-redundant values
57 rm(x) # removes a single variable (here x)
58 rm(list=ls()) # removes all variables (done after the examples above, which still need z)
59 w <- matrix(1:10, nrow=2, ncol=5, byrow=TRUE) # creates a matrix type object & fills it row-wise;
60 # try byrow=FALSE also. A matrix stores values of a single type (here integers)
61 dim(w) # tells how many rows and columns we have in w
62 d <- data.frame(1:3,c('a', 'b','c')) # creates a data.frame, which can hold different types of
63 # vectors like integer, character etc.
64 factor() # creates a factor class object, which has different levels (empty when called without data)
65 z <- c(1,2,2,3,3,5)
66 factor(z)
67 as.factor(z) # 'as' functions convert the data to the specified type
68 as.numeric(c("1", "2")) # converts the data to numeric
69 is.factor(z) # checks whether the data type is factor
70 is.numeric(z) # checks whether the data type is numeric
71 apply(w, 1, sum) # applies a function (here sum) over the rows (1) or columns (2) of a matrix
72 rep(1,3) # repeats 1 three times
73 rep(c("A","B"), 3) # repeats A, B three times alternately
74 rep(c("A","B"), each=3) # repeats A 3 times and then B 3 times
75 ls() #### Show the objects in workspace; if there are any, remove them.
76 ###################### RNA-Seq Data Analysis ###########################
77 # The data is a subset from an in-house RNA-Seq project, comparing wild type with a mutant,
78 # using 3 independent replicates of each. The data file has gene IDs, their transcript lengths and
79 # counts of reads mapping to these 100 genes for each of the six samples. The counts were obtained by aligning
80 # the reads to A. thaliana transcripts from TAIR10 using the bowtie aligner; the reads
81 # mapped to each gene/transcript were counted using the in-house Python script getCountMatrix.py.
82 # Let's input this file as CSV.
83 raw.data <- read.csv("RNASeq.csv", header=TRUE, row.names=1) # reads the CSV; column 1 of the file is used as the row names
84 ls() # raw.data should now be listed
85 class(raw.data) # Because it's a data frame, we can ask what the dimensions are:
86 dim(raw.data) # how many rows and columns we have
87 head(raw.data) # Look at the first few (6) rows of the data
88 raw.data[1:5,] # Look at the first 5 rows of the data
89 raw.data[1:5,2:7] # Look at the first 5 rows and 2nd to 7th columns of the data
90 tail(raw.data) # Look at the last few (6) rows of the data
91 raw.data$mu1 # prints the column named mu1 (NULL if it does not exist); a new column is created only by assigning to it, e.g. raw.data$new <- values
92 #### Let's take a break: save the workspace and history, then practice quitting R and restarting
93 #### your analysis
94 save.image("RNASeqDemo.RData") # saves the workspace objects to the named file
95 savehistory("RNASeqDemo.Rhistory") # saves the command history to the named file
96 q() # say no; if you say yes, R will save files with the default names ".RData" and ".Rhistory"
97 #### Start R again, change the working directory, then load the saved .RData and .Rhistory files
98 setwd("C:/Myworkshop") # example path; use the folder where the files above were saved
99 load("RNASeqDemo.RData") # restores the saved workspace objects
100 loadhistory("RNASeqDemo.Rhistory") # restores the saved command history
101 ls() # see that the objects have been loaded
102 #### Use the up and down arrows to step through the history (.Rhistory)
103 #### Libraries must be loaded again every time you start R!
104 library(edgeR)
105 library(limma)
106 library(affy)
107 library(affycoretools)
108 ##########################Data Exploration############################
109 #### Because RNA-Seq count data is so very different from continuous microarray data, it's a
110 #### good idea to do some basic exploration of the data. Let's start by looking at the range of counts
111 #### per sample:
112 summary(raw.data) # summary() is a base R function; it summarizes every column
113 summary(raw.data[,2:7]) # the same for columns 2 to 7 only (the sample count columns)
114 boxplot(raw.data[,2:7]) # from the graphics package; the boxplots look odd here because of the large
115 # variation in the data. We might transform the data
116 boxplot(log2(raw.data[,2:7])) # gives warnings, as log2(0) is -Inf
117 boxplot(log2(raw.data[,2:7]+0.01)) # small constant added to avoid 0s; let's make it colorful
118 boxplot(log2(raw.data[,2:7]+0.01), col=rep(c("green","pink"), each=3), xlab= "Samples",
119 ylab="Raw Expression in log2 scale")
120 ## boxplot() does not show means in R. We can ask R to add them (log2 of each sample's mean count)
121 points(log2(colMeans(raw.data[,2:7]+0.01)), col="blue", pch=18)
122 #### All 6 samples have some 0 counts, which is typical for RNA-Seq data. We can also examine
123 #### the overall distributions of counts to see if any of the samples are different. Because of the
124 #### extreme range in the data, converting to log2 scale helps in plotting; however, you can't take
125 #### the log of 0, so we need to add a small constant before taking the logs and plotting the
126 #### distributions. Since the smallest non-zero count value is 1, pick a constant smaller than that, like 0.01
127 ## All graphics can be saved using the menu File -> Save as
128 plotDensities(log2(raw.data[,2:7])) # gives warnings, because the zero counts become -Inf
129 plotDensities(log2(raw.data[,2:7]+0.01), legend="topright") # add the constant if you don't like to be
130 # warned
131 #### The shapes are similar, except mu3 is slightly different. It has many more low values (likely
132 #### zeros), so maybe it has a smaller number of total counts. Let's check the library size for each
133 #### sample by summing all the counts:
134 library.sizes <- colSums(raw.data[,2:7])
135 library.sizes
136 #x11() # This will open a new graphing window; without it, the previous graph will be replaced. But
137 # it slows the system.
138 barplot(library.sizes, col=rep(c("green","pink"), each=3), xlab= "Samples",
139 ylab="Total RNA/sample", width=1000000, ylim=c(0, 12000000))
140 plot(library.sizes, col=rep(c("green","red"), each=3), xlab= "Samples", ylab="Total RNA/sample")
141 heatmap(as.matrix(raw.data[1:10,2:7])) # Heat map of the counts of the first 10 genes
142 heatmap(as.matrix(raw.data[1:10,2:7]),col = cm.colors(256)) # better colors
143 heatmap(as.matrix(raw.data[1:10,2:7]), Rowv=NA, Colv=NA) # Heat map without clustering
144 heatmap(as.matrix(raw.data[1:10,2:7]), Colv=NA) # Heat map without sample clustering
145