## Introduction to R ### Simple computations # The console can act as a simple calculator 1.0 + 1.1 2 * 2 3/2 2^3 1/0 0/0 # Use ";" for performing several operations in the same line (1 + 3) * 2 - 1; 3 + 2 # Elemental mathematical functions sqrt(2); 2^0.5 exp(1) log(10) # Natural or neperian logarithm log10(10); log2(10) # Logs in base 10 and 2 sin(pi); cos(0); asin(0) tan(pi/3) sqrt(-1) # Remember to close the parenthesis -- errors below # 1 + # (1 + 3 ### Variables and assignment # Any operation that you perform in R can be stored in a variable # (or object) with the assignment operator "<-" x <- 1 # To see the value of a variable, simply type it x # A variable can be overwritten x <- 1 + 1 # Now the value of x is 2 and not 1, as before x # Capitalization matters X <- 3 x; X # See what are the variables in the workspace ls() # Remove variables rm(X) # X # Error ### Vectors # We combine numbers with the function "c" c(1, 3) c(1.5, 0, 5, -3.4) # A handy way of creating integer sequences is the operator ":" 1:5 # Storing some vectors myData <- c(1, 2) myData2 <- c(-4.12, 0, 1.1, 1, 3, 4) myData myData2 # Entrywise operations myData + 1 myData^2 # If you want to access a position of a vector, use [position] myData[1] myData2[6] # You can also change elements myData[1] <- 0 myData # Think on what you want to access... myData2[7] myData2[0] # If you want to access all the elements except a position, # use [-position] myData2[-1] myData2[-2] # Also with vectors as indexes myData2[1:2] myData2[myData] # And also myData2[-c(1, 2)] # But do not mix positive and negative indexes! # myData2[c(-1, 2)] # Error # Remove the first element myData2 <- myData2[-1] ### Some functions # Functions take arguments between parenthesis and transform them # into an output sum(myData) prod(myData) # Summary of an object summary(myData) # Length of the vector length(myData) #Seeing what is inside a function debug(f1) undebug(f1) # Mean, standard deviation, variance, covariance, correlation mean(myData) var(myData) cov(myData, myData^2) cor(myData, myData * 2) quantile(myData) # Maximum and minimum of vectors min(myData) which.min(myData) # Usually the functions have several arguments, which are set # by "argument = value" In the next case, the second argument is # a logical flag to indicate the kind of sorting sort(myData) # If nothing is specified, decreasing = FALSE is # assumed sort(myData, decreasing = TRUE) # Do not know what are the arguments of a function? Use args # and help! args(mean) ?mean # EX 4 -------------------------------------------------------------------- y = seq(100,1,by = -1) y[2]=-1 mean(y) median(y) var(y) mean(y+1) median(y+1) var(y+1) max(y) which.max(y) sort(y, decreasing = FALSE)[c(5,76)] cov(y,y) var(y) ### Matrices, data frames, and lists # A matrix is an array of vectors A <- matrix(1:4, nrow = 2, ncol = 2) A # Another matrix B <- matrix(1, nrow = 2, ncol = 2, byrow = TRUE) B # Matrix is a vector with dimension attributes dim(A) # Binding by rows or columns rbind(1:3, 4:6) cbind(1:3, 4:6) # Entrywise operations A + 1 A * B # Accessing elements A[2, 1] # Element (2, 1) A[1, ] # First row -- this is a vector A[, 2] # First column -- this is a vector # Obtain rows and columns as matrices (and not as vectors) A[1, , drop = FALSE] A[, 2, drop = FALSE] # Matrix transpose t(A) # Matrix multiplication A %*% B A %*% B[, 1] A %*% B[1, ] # Care is needed # A %*% B[1, , drop = FALSE] # Incompatible product # Compute inverses with "solve" solve(A) %*% A # A data frame is a matrix with column names # Useful when you have multiple variables myDf <- data.frame(var1 = 1:2, var2 = 3:4) myDf # You can change names names(myDf) <- c("newname1", "newname2") myDf # You can access variables by its name with the "$" operator myDf$newname1 # And create new variables also (they have to be of the same # length as the rest of variables) myDf$myNewVariable <- c(0, 1) myDf #Matrix to dataframe mydfA <- as.data.frame(A) class(mydfA) # A list is a collection of arbitrary variables myList <- list(myData = myData, A = A, myDf = myDf) # Access elements by names myList$myData myList$A myList$myDf # Reveal the structure of an object str(myList) str(myDf) # A less lengthy output names(myList) ### More on data frames # The iris dataset is already imported in R # (beware: locfit has also an iris dataset, with different names # and shorter) # The beginning of the data head(iris) # "names" gives you the variables in the data frame names(iris) # So we can access variables by "$" or as in a matrix iris$Sepal.Length[1:10] iris[1:10, 1] iris[3, 1] # Information on the dimension of the data frame dim(iris) # "str" gives the structure of any object in R str(iris) # Recall the species variable: it is a categorical variable # (or factor), not a numeric variable iris$Species[1:10] # Factors can only take certain values levels(iris$Species) # EX 5 -------------------------------------------------------------------- M = matrix(c(y[3:5], y[3:5]^2,log(y[3:5])), nrow = length(y[3:5]), ncol=length(y[3:5])) myDataFrame = data.frame(y=y[3:5], y2=y[3:5]^2, logy=log(y[3:5])) x=c(1,0,3,4) l = list(x=x, M=M) myDataFrame2 = myDataFrame^2 log(myDataFrame + myDataFrame2) # If a file contains a variable with character strings as # observations (either encapsulated by quotation marks or not), # the variable will become a factor when imported into R ### Vector-related functions # The function "seq" creates sequences of numbers equally separated seq(0, 1, by = 0.1) seq(0, 1, length.out = 5) # You can short the latter argument seq(0, 1, l = 5) # Repeat number rep(0, 5) # Reverse a vector myVec <- c(1:5, -1:3) rev(myVec) # Another way myVec[length(myVec):1] # Count repetitions in your data table(iris$Species) ### Logical conditions and subsetting # Relational operators: x < y, x > y, x <= y, x >= y, x == y, x!= y # They return TRUE or FALSE # Smaller than 0 < 1 # Greater than 1 > 1 # Greater or equal to 1 >= 1 # Remember: ">="" and not "=>"" ! # Smaller or equal to 2 <= 1 # Remember: "<="" and not "=<"" ! # Equal 1 == 1 # Tests equality. Remember: "=="" and not "="" ! # Unequal 1 != 0 # Tests iequality # TRUE is encoded as 1 and FALSE as 0 TRUE + 1 FALSE + 1 # In a vector-like fashion x <- 1:5 y <- c(0, 3, 1, 5, 2) x < y x == y x != y # Subsetting of vectors x x[x >= 2] x[x < 3] # Easy way of work with parts of the data data <- data.frame(x = c(0, 1, 3, 3, 0), y = 1:5) data # Data such that x is zero data0 <- data[data$x == 0, ] data0 # Data such that x is larger than 2 data2 <- data[data$x > 2, ] data2 # Problem -- what happened? data[x > 2, ] # AND operator "&" TRUE & TRUE TRUE & FALSE FALSE & FALSE # OR operator "|" TRUE | TRUE TRUE | FALSE FALSE | FALSE # Both operators are useful for checking for ranges of data y index1 <- (y <= 3) & (y > 0) y[index1] index2 <- (y < 2) | (y > 4) y[index2] ### Plotting functions # "plot" is the main function for plotting in R # It has a different behaviour depending on the kind of object # that it receives # How to plot some data plot(iris$Sepal.Length, iris$Sepal.Width, main = "Sepal.Length vs Sepal.Width") # Change the axis limits plot(iris$Sepal.Length, iris$Sepal.Width, xlim = c(0, 10), ylim = c(0, 10)) # How to plot a curve (a parabola) x <- seq(-1, 1, l = 50) y <- x^2 plot(x, y, main = "A red thick parabola", type = "l", col = "red", lwd = 3) # Plotting a more complicated curve between -pi and pi x <- seq(-pi, pi, l = 50) y <- (2 + sin(10 * x)) * x^2 plot(x, y, type = "l") # Kind of rough... # Remember that we are joining points for creating a curve! # More detailed plot x <- seq(-pi, pi, l = 500) y <- (2 + sin(10 * x)) * x^2 plot(x, y, type = "l") # For more options in the plot customization see ?plot ?par # "plot" is a first level plotting function. That means that # whenever is called, it creates a new plot. If we want to # add information to an existing plot, we have to use a second # level plotting function such as "points", "lines" or "abline" plot(x, y) # Create a plot lines(x, x^2, col = "red") # Add lines points(x, y + 10, col = "blue") # Add points abline(a = 5, b = 1, col = "orange", lwd = 2) # Add a straight # line y = a + b * x ### Distributions # R allows to sample [r], compute density/probability mass # functions [d], compute distribution function [p], and compute # quantiles [q] for several continuous and discrete distributions. # The format employed is [rdpq]name, where name stands for: # - norm -> Normal # - unif -> Uniform # - exp -> Exponential # - t -> Student's t # - f -> Snedecor's F # - chisq -> Chi squared # - pois -> Poisson # - binom -> Binomial # More distributions: ?Distributions # Sampling from a Normal -- 5 random points from a N(0, 1) rnorm(n = 5, mean = 0, sd = 1) # If you want to have always the same result, set the seed of # the random number generator set.seed(45678) rnorm(n = 5, mean = 0, sd = 1) # Plotting the density of a N(0, 1) -- the Gaussian bell x <- seq(-4, 4, l = 100) y <- dnorm(x = x, mean = 0, sd = 1) plot(x, y, type = "l") # Plotting the distribution function of a N(0, 1) x <- seq(-4, 4, l = 100) y <- pnorm(q = x, mean = 0, sd = 1) plot(x, y, type = "l") # Computing the 95% quantile for a N(0, 1) qnorm(p = 0.95, mean = 0, sd = 1) # All distributions have the same syntax: rname(n,...), # dname(x,...), dname(p,...) and qname(p,...), but the # parameters in ... change. Look them in ?Distributions # For example, here is que same for the uniform distribution # Sampling from a U(0, 1) set.seed(45678) runif(n = 10, min = 0, max = 1) # Plotting the density of a U(0, 1) x <- seq(-2, 2, l = 100) y <- dunif(x = x, min = 0, max = 1) plot(x, y, type = "l") # Computing the 95% quantile for a U(0, 1) qunif(p = 0.95, min = 0, max = 1) # Sampling from a Bi(10, 0.5) set.seed(45678) samp <- rbinom(n = 200, size = 10, prob = 0.5) table(samp) / 200 # Plotting the probability mass of a Bi(10, 0.5) x <- 0:10 y <- dbinom(x = x, size = 10, prob = 0.5) plot(x, y, type = "h") # Vertical bars # Plotting the distribution function of a Bi(10, 0.5) x <- 0:10 y <- pbinom(q = x, size = 10, prob = 0.5) plot(x, y, type = "h") ### Functions # A function is a way of encapsulating a block of code so it can # be reused easily. They are useful for simplifying repetitive # tasks and organize analyses # This is a silly function that takes x and y and returns its sum # Note the use of "return" to indicate what should be returned add <- function(x, y) { z <- x + y return(z) } # Calling add -- you need to run the definition of the function # first! add(x = 1, y = 2) add(1, 1) # Arguments names can be omitted # A more complex function: computes a linear model and its # posterior summary. Saves us a few keystrokes when computing a # lm and a summary lmSummary <- function(formula, data) { model <- lm(formula = formula, data = data) summary(model) } # If no return(), the function returns the value of the last # expression # Usage lmSummary(Sepal.Length ~ Petal.Width, iris) # Recall: there is no variable called model in the workspace. # The function works on its own workspace! # model # Error # Add a line to a plot addLine <- function(x, beta0, beta1) { lines(x, beta0 + beta1 * x, lwd = 2, col = 2) } # Usage plot(x, y) addLine(x, beta0 = 0.1, beta1 = 0) # The function "sapply" allows to sequentially apply a function sapply(1:10, sqrt) #i want to apply to the vector 1:10, the function sqrt sqrt(1:10) # The same # The advantage of "sapply" is that you can use with any function myFun <- function(x) c(x, x^2) sapply(1:10, myFun) # Returns a 2 x 10 matrix # "sapply" is useful for plotting non-vectorized functions sumSeries <- function(n) sum(1:n) plot(1:10, sapply(1:10, sumSeries), type = "l") # "apply" applies iteratively a function to rows (1) or columns # (2) of a matrix or data frame A <- matrix(1:10, nrow = 5, ncol = 2) A apply(A, 1, sum) # Applies the function by rows #sums all the rows apply(A, 2, sum) # By columns rowSums(A) colSums(A) # With other functions apply(A, 1, sqrt) apply(A, 2, function(x) x^2) ### Control structures # The "for" statement allows to create loops that run along a # given vector # Print 3 times a message (i varies in 1:3) for (i in 1:3) { print(i) } # Another example a <- 0 for (i in 1:3) { a <- i + a } a # Nested loops are possible A <- matrix(0, nrow = 2, ncol = 3) for (i in 1:2) { for (j in 1:3) { A[i, j] <- i + j } } # The "if" statement allows to create conditional structures of # the forms: # if (condition) { # # Something # } else { # # Something else # } # These structures are thought to be inside functions # Is the number positive? isPositive <- function(x) { if (x > 0) { print("Positive") } else { print("Not positive") } } isPositive(1) isPositive(-1) # A loop can be interrupted with the "break" statement # Stop when x is above 100 x <- 1 for (i in 1:1000) { x <- (x + 0.01) * x print(x) if (x > 100) { break } } # EX 10 ------------------------------------------------------------------- qf(p = c(0.9, 0.95, 0.99),df1 = 1,df2 = 5) #density function plot x <- seq(-2, 2, l = 100) y <- dunif(x=x, min = 0, max = 1) plot(x, y, type = "l") #distribution function plot x <- seq(-2, 2, l = 100) y <- punif(q = x, min = 0, max = 1) plot(x, y, type = "l") pois = rpois(x, lambda = 5) unif = runif(x, min=-1, max=1) mean(unif) x <- seq(-4,4) y <- dt(x = x,df = 1) plot(x, y) x <- seq(-4,4) y <- dt(x = x,df = 5) plot(x, y, type = "l") lines(x, y, col = "red") x <- seq(-4,4) y <- dt(x = x,df = 10) plot(x, y, type = "l") lines(x, y, col = "blue") x <- seq(-4,4) y <- dt(x = x,df = 50) plot(x, y, type = "l") lines(x, y, col = "green") x <- seq(-4,4) y <- dt(x = x,df = 100) plot(x, y, type = "l") lines(x, y, col = "yellow")