R Statistics Essential Training

Lakukan tugas rumah & ujian kamu dengan baik sekarang menggunakan Quizwiz!

Data Input and Import

# Data input and import ----
# Interactive walkthrough: fix(), scan() and View() all wait for the user.

# Calling and investigating built-in data
library(help = "datasets")  # lists the data sets available for practising
airmiles                    # prints the data set in the console
data(airmiles)              # loads the data into the workspace
str(airmiles)               # describes the structure of the data set
fix(airmiles)               # opens an editor so the data can be edited
# These data sets ship with R (internal); external files are imported below.

# Practising with anscombe (has rows and columns)
data(anscombe)
str(anscombe)
anscombe  # can also click the data name in the workspace to see it
rm(list = ls())

# Creating sequential data
x1 <- 0:10                    # assigns the values 0 through 10
fix("x1")                     # shows (and lets you edit) what is inside x1
x1                            # prints x1 in the console
x2 <- 10:0                    # descending sequences work too
x3 <- seq(10)                 # counts from 1 up to the given number
x4 <- seq(30, 0, by = -3)     # from 30 down to 0 in steps of 3
x5 <- c(5, 12, 13, 66, 7777)  # entering the data manually
# scan() moves the cursor to the console: type a number, press Enter, repeat;
# press Enter twice to finish and the variable is set.
x6 <- scan()
ls()  # shows what is in the workspace

# Importing external data
# Excel spreadsheets: do not read them directly, convert to csv or text first.

# txt files
# read.table() reads a file in table format and creates a data frame from it.
# header = TRUE tells R the first row holds the variable names.
# The file extension must be spelled out.
# NOTE(review): original notes say this errors when data are missing - confirm.
examplefile.txt <- read.table("~/Desktop/R/examplefile.txt", header = TRUE)

# Same as before, but sep = tells R how the columns are separated
# ("\t" for tabs, "," for commas).
examplefile.txt <- read.table(
  "~/Desktop/R/examplefile.txt",
  header = TRUE,
  sep = "\t"
)
str(examplefile.txt)   # structure of the imported data frame
View(examplefile.txt)  # shows the data in a viewer (capital V)

# csv files (strongly recommended format)
# Same idea as the txt import, but with read.csv() instead of read.table().
examplefile.csv <- read.csv("~/Desktop/R/examplefile.csv", header = TRUE)

Examining outliers

# Examining outliers ----

# Categorical outliers: a category that makes up < 10% of the sample
getwd()
# The OS data came from a local csv that is probably gone by now, so the
# categorical example only runs when the file can actually be found.
os.path <- "C:/Users/Cristian's Laptop/Desktop/OS.csv"
if (file.exists(os.path)) {
  OS <- read.csv(os.path, header = TRUE)
  print(OS)
  # Remove outliers: keep only rows with a proportion above 10%
  OS.hi <- subset(OS, Proportion > 0.1)
  print(OS.hi)
}

# Quantitative data
library(datasets)
hist(rivers)
boxplot(rivers, horizontal = TRUE)

# Remove the outliers
rivers.low <- rivers[rivers < 1210]
# New outliers can show up because the sample is different now;
# keep cleaning up the boxplot as you see fit.
boxplot(rivers.low, horizontal = TRUE)

Analyzing by subgroup

# Analyzing by subgroup ----

# Load data
data(iris)
iris
mean(iris$Petal.Width)  # overall mean

# Same statistic, but per group.
# '~' reads "as a function of": petal width as a function of species.
# FUN is the function to aggregate with, here the mean.
# Passing data = iris keeps the real column names in the output.
aggregate(Petal.Width ~ Species, data = iris, FUN = mean)

# Compare groups on several variables.
# cbind (column bind) lists the outcome variables; the result shows petal
# width and petal length by species, in the order given.
aggregate(cbind(Petal.Width, Petal.Length) ~ Species, data = iris, FUN = mean)

rm(list = ls())

Calculating correlation

# Calculating correlation ----

# Load data
data(swiss)
swiss

# Correlation matrix
cor(swiss)
round(cor(swiss), 2)  # rounded, easier to read

# Test one pair of variables at a time:
# gives r, the t-test, the confidence interval, etc.
cor.test(swiss$Fertility, swiss$Education)

# Probability values for the whole matrix need the Hmisc package.
# Install only when it is missing instead of on every run.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)

# rcorr() wants a matrix, so convert the data frame first.
# First table is the r values, second is the p values.
rcorr(as.matrix(swiss))

Creating Grouped Box Plots

# Creating grouped box plots ----

# Load up data
library(MASS)
?painters
painters
data(painters)
painters

# Boxplots of Expression (outcome) by School (group); no labels yet
boxplot(painters$Expression ~ painters$School)

# Modified version
library(RColorBrewer)
boxplot(
  painters$Expression ~ painters$School,
  col = brewer.pal(8, "Pastel2"),  # colours
  names = c(
    "Renais.", "Mannerist", "Seicento", "Venetian",
    "Lombard", "16th C.", "17th C.", "French"
  ),                               # names of the schools
  boxwex = 0.5,                    # width of box, makes it skinnier
  whisklty = 1,                    # whisker line type; 1 = solid line
  # staplelty = 0,                 # staple type; 0 = none
  # outpch = 16,                   # outlier symbol; 16 = filled circle
  outcol = brewer.pal(8, "Pastel2"),  # outlier colour
  main = "Expression Ratings of Painters by School\nFrom \"painters\" Dataset in \"MASS\" Package",
  xlab = "Painter's School",
  ylab = "Expression Ratings"
)

# Cleanup
detach("package:MASS", unload = TRUE)
detach("package:RColorBrewer", unload = TRUE)
rm(list = ls())

Robust statistics for univariate analyses

# Robust statistics for univariate analyses ----
# Robust means resistant to outliers and non-conformity.

?state.area  # numeric vector of state areas (in square miles), 50 US states
area <- state.area
area           # print the values (area is a vector, not a function)
hist(area)     # histogram, see the outliers
boxplot(area)  # boxplot, see the outliers better
boxplot.stats(area)
?boxplot.stats
# boxplot.stats() returns, in order:
#   $stats lower whisker, lower hinge, median, upper hinge, upper whisker
#   $n     sample size
#   $conf  confidence interval for the median
#   $out   outliers

# Measures of centrality
summary(area)            # median and mean differ; mean is pulled by outliers
mean(area)               # just the mean (not robust)
median(area)             # just the median (robust)
mean(area, trim = 0.05)  # drops 5% from each end, then recomputes the mean
# The trimmed mean lands closer to the median; the trim level can be changed.

# Measures of spread
sd(area)   # not robust
mad(area)  # median absolute deviation (robust)
IQR(area)
fivenum(area)

rm(list = ls())

Transforming Variables

# Transforming variables ----
# For when the data cannot be dropped and outliers cannot be removed.

library(datasets)
?islands
hist(islands, breaks = 16)  # very skewed

# z-scores
# scale() converts to z-scores (mean 0, sd 1). Values keep their relative
# distances, so the histogram and boxplot look the same as before.
islands.z <- scale(islands)
# The pre-conversion key values are stored as attributes
attr(islands.z, "scaled:center")  # original mean
attr(islands.z, "scaled:scale")   # original SD
# Convert to a plain numeric vector
islands.z <- as.numeric(islands.z)
islands.z

# Logarithmic transformations
# If the data contain 0, try log(x + 1) so the zeros are kept.
islands.ln <- log(islands)       # base e
islands.log10 <- log10(islands)  # base 10
islands.log2 <- log2(islands)    # base 2
hist(islands.ln)     # looking better
boxplot(islands.ln)  # also looking better

# Squaring: for negatively skewed variables

# Ranking: maintains order (nothing else)
islands.rank1 <- rank(islands)
hist(islands.rank1)  # should be flat unless there are ties
# Ties (or near ties) must be dealt with: same ranking as above, but with
# an explicit method for breaking ties.
islands.rank2 <- rank(islands, ties.method = "random")
hist(islands.rank2)  # looks even better

# Dichotomizing (be careful)
# ifelse(condition, value if TRUE, value if FALSE):
# islands above 1000 are set to 1, all others to 0.
continent <- ifelse(islands > 1000, 1, 0)

rm(list = ls())

Working with Base Colours

# Working with base colours ----

# Create a custom barplot to practise on
x <- c(12, 4, 21, 17, 13, 9)
barplot(x)

?colours  # see how colours are named
# R has 657 named colours, arranged alphabetically; look online for the
# colour you want (either its name or its number).
barplot(x, col = "slategray3")  # 'col' = colour
# Colours can also be picked by their position in the colours() vector
barplot(x, col = colours()[102])  # the 102nd colour in the collection

# Using RGB (red, green, blue) specifications
# No need to memorise RGB codes: col2rgb() returns them for a colour name,
# on a 0-255 scale.
col2rgb("navyblue")  # look in console: (0, 0, 128)
# rgb() defaults to a 0-1 scale, so state the 0-255 scale explicitly.
barplot(x, col = rgb(0, 0, 128, maxColorValue = 255))

# Using hex codes
# Look up the hex code of the colour you want (this one is blanchedalmond)
barplot(x, col = "#FFEBCD")

# Using multiple colours
# With fewer colours than bars they cycle, otherwise they go in order
barplot(x, col = c("red", "blue"))
barplot(x, col = c("red", "blue", "green", "yellow"))  # cycling again

# Using colour palettes (R has built-in palettes)
palette()              # tells you what the current palette is
barplot(x, col = 1:6)  # cycles through the palette
# Some other palettes to try
barplot(x, col = rainbow(6))
barplot(x, col = heat.colors(6))
barplot(x, col = terrain.colors(6))
barplot(x, col = topo.colors(6))
barplot(x, col = cm.colors(6))
palette("default")  # return to the default palette

Calculating frequencies

# Calculating frequencies ----

# Data
# rep() repeats its first argument n times, so "blue" appears 3990 times.
groups <- c(
  rep("blue", 3990),
  rep("red", 4140),
  rep("orange", 1890),
  rep("green", 3770),
  rep("purple", 855)
)

# Create frequency tables
groups.t1 <- table(groups)  # new variable in table form, reusable later
groups.t1                   # shows contents, now alphabetical

# Modify table: sort table 1 into descending order, saved as table 2
groups.t2 <- sort(groups.t1, decreasing = TRUE)
groups.t2

# Proportions and percentages
prop.table(groups.t2)                   # proportions
round(prop.table(groups.t2), 2)         # rounded to 2 decimal places
round(prop.table(groups.t2), 2) * 100   # percentages

Working with Colorbrewer

# Working with ColorBrewer ----

# External package: RColorBrewer
x <- c(12, 4, 21, 17, 13, 9)
barplot(x)

# Install the package only when it is missing (names are case sensitive!)
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)  # activate the package

display.brewer.all()  # shows all palettes
# upper group:  best for sequential gradients
# middle group: best for different categories (qualitative)
# lower group:  best for divergence

# Show one palette of interest; state how many colours you want from it
display.brewer.pal(8, "Accent")

# Store the palette in a variable as a shortcut, then reference the variable
blues <- brewer.pal(6, "Blues")
barplot(x, col = blues)

# Some more palettes
barplot(x, col = brewer.pal(6, "Greens"))
barplot(x, col = brewer.pal(6, "YlOrRd"))
barplot(x, col = brewer.pal(6, "RdGy"))
barplot(x, col = brewer.pal(6, "BrBG"))
barplot(x, col = brewer.pal(6, "Dark2"))
barplot(x, col = brewer.pal(6, "Paired"))
barplot(x, col = brewer.pal(6, "Pastel2"))
barplot(x, col = brewer.pal(6, "Set3"))

# Cleanup
palette("default")
detach("package:RColorBrewer", unload = TRUE)
rm(list = ls())

Computing composite variables

# Computing composite variables ----
# For creating variables out of multiple components.

# Component 1: one million random normal values stored in an object
rn1 <- rnorm(1000000)
hist(rn1)
summary(rn1)

# Component 2
rn2 <- rnorm(1000000)
hist(rn2)
summary(rn2)

# Average scores across the two variables.
# Possible because both have the same length: R is vectorised, so the first
# observations of rn1 and rn2 are averaged, then the second, and so on.
rn.mean <- (rn1 + rn2) / 2
hist(rn.mean)

# Multiply scores across the two variables
rn.prod <- rn1 * rn2
hist(rn.prod)

# Kurtosis comparisons (about peaks and tails; sensitive to outliers)
# psych is an add-on package, so this part is skipped when it is missing.
if (requireNamespace("psych", quietly = TRUE)) {
  print(psych::kurtosi(rn1))
  print(psych::kurtosi(rn2))
  print(psych::kurtosi(rn.mean))
  print(psych::kurtosi(rn.prod))
}

# Clear the workspace. This must be a call to rm(); writing rm = (list = ls())
# would instead create objects named rm and list.
rm(list = ls())

Splitting and merging files

# Splitting and merging files ----

# Load data
data(longley)
longley

# Split up the data
a1 <- longley[1:14, 1:6]  # first 14 rows, 6 columns (last column missing)
a2 <- longley[1:14, 6:7]  # the missing column, plus "Year" to match on
# Two data sets always need a shared column to be matched on.
b <- longley[15:16, ]     # new rows to add (e.g. a data update)

# Write each piece into a txt file.
# The session temp folder is used so the example runs on any machine;
# swap in any writable folder to keep the files.
out.dir <- tempdir()
write.table(a1, file.path(out.dir, "longley.a1.txt"), sep = "\t")
write.table(a2, file.path(out.dir, "longley.a2.txt"), sep = "\t")
write.table(b, file.path(out.dir, "longley.b.txt"), sep = "\t")
rm(list = ls())  # can reset

# Import data (the folder name was cleared with everything else)
out.dir <- tempdir()
a1t <- read.table(file.path(out.dir, "longley.a1.txt"), sep = "\t")
a2t <- read.table(file.path(out.dir, "longley.a2.txt"), sep = "\t")

# Merge a1t and a2t (merge works on columns).
# The variable to match cases on must be named; "Year" appears in both.
a.1.2 <- merge(a1t, a2t, by = "Year")
# Check the result by clicking it in the workspace or typing its name.

# Add the two extra cases to the bottom
bt <- read.table(file.path(out.dir, "longley.b.txt"), sep = "\t")
all.data <- rbind(a.1.2, bt)  # row bind

rm(list = ls())

Bar Charts of Group Means

# Bar charts of group means ----

# Load data
spray <- InsectSprays
spray

# To plot means, the group means have to be computed first.
# Mean of count as a function of spray: one mean count per spray group.
means <- aggregate(spray$count ~ spray$spray, FUN = mean)
plot(means)  # not the bar chart we are after

# means[-1] removes the first column (the letters); t() transposes, i.e.
# swaps rows and columns, so the 6 means end up in 1 row.
mean.data <- t(means[-1])
# The columns still need names: take them from the first column of means
# (all rows), converted from factor to plain text.
colnames(mean.data) <- as.character(means[, 1])
# Take a look at mean.data; it is ready to plot now.
barplot(mean.data)

Calculating descriptives

# Calculating descriptives ----

# Load dataset
library(datasets)
cars
str(cars)
data(cars)

# Calculating descriptives
summary(cars$speed)  # summary for one variable
summary(cars)        # summary for all variables

# 5-number summary: minimum, lower hinge, median, upper hinge, maximum
fivenum(cars$speed)

# Boxplot stats: Tukey 5-number summary, then sample size, then confidence
# interval for the median, then outliers
boxplot.stats(cars$speed)

# Alternative descriptives from the psych package
# (installed only when it is missing)
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
describe(cars)  # gives lots of info, all from this package

Pie Charts for Categorical Variables

# Pie charts for categorical variables ----

# Load up and make a frequency table
library(datasets)
data("chickwts")
feeds <- table(chickwts$feed)
feeds

# Pie chart with defaults
pie(feeds)

# Modified pie chart
pie(
  feeds[order(feeds, decreasing = TRUE)],
  init.angle = 90,   # starts at 12 o'clock instead of 3
  clockwise = TRUE,  # default is FALSE (counter-clockwise)
  col = c("seashell", "cadetblue2", "lightpink",
          "lightcyan", "plum1", "papayawhip"),
  main = "Pie Chart of Feeds from chickwts"  # title
)

# Multiple pie charts
pie.a <- c(22, 14, 18, 20, 14, 12)
pie.b <- c(20, 18, 16, 18, 16, 12)
pie.c <- c(12, 14, 20, 18, 14, 22)

# Change the graphical parameters for a minute.
# no.readonly = TRUE stores only the settings that can be put back later.
oldpar <- par(no.readonly = TRUE)
par(mfrow = c(1, 3),  # number of rows / columns
    cex.main = 3)     # main title 3x bigger
colors <- c("grey98", "grey90", "lightskyblue",
            "lightgreen", "grey98", "grey90")
pie(pie.a, main = "Pie A", col = colors)
pie(pie.b, main = "Pie B", col = colors)
pie(pie.c, main = "Pie C", col = colors)
par(oldpar)  # put the old graphical parameters back

# Problems with pie charts: proportions are hard to tell apart when the
# groups are similar, so they are usually avoided.

Coding Missing data

# Coding missing data ----
# Missing data (NA) makes some calculations impossible.

# Create data with a missing value
x1 <- c(1, 2, 3, NA, 5)
summary(x1)  # works fine (reports the NA separately)
mean(x1)     # does not work: the result is NA

# Find missing values (useful when you do not know where they are).
# is.na() is the condition to search for; which() gives the index positions.
which(is.na(x1))

# Ignore missing values
mean(x1, na.rm = TRUE)  # removes the NA before averaging

# Replace the missing values (option 1)
x2 <- x1            # new variable so the original data is kept
x2[is.na(x2)] <- 0  # wherever a value is NA, 0 goes into that position
x2                  # can see the replacement

# Replace the missing values (option 2)
# If a value is NA use 0, otherwise keep the value from x1.
x3 <- ifelse(is.na(x1), 0, x1)
x3  # can see the replacement

# Replacing with another number (imputation)

Syntax and Manipulating Packages

# Syntax and manipulating packages ----

# This is a comment; '#' turns the rest of the line into a comment.
# Ctrl+Shift+C adds or removes the '#' on highlighted text.
8 + 5  # Ctrl+Enter processes the code and sends it to the console
1:10   # ':' goes through all values, here 1 through 10
# Ctrl+L clears the console.
print("Hello world!")  # prints out the string inside it

x <- 1:5  # creates variable x and sets it to 1, 2, 3, 4, 5
# The assignment shows up in the workspace, not the console: R was only
# asked to set the value, not to show it. '<-' is the assignment operator
# (shortcut: Alt and '-').
x  # now shows in console
# c() takes specific values, for when a range is not wanted
y <- c(6, 500, 12, 55, 1000)
y  # y now shows in console

a <- b <- c <- 3  # can set multiple variables at once
x + y             # vectors can be combined (same size)
x * y             # asterisk for multiplication

rm(a)            # removes variable a from the workspace
exists("a")      # FALSE: typing a by itself would now stop with an error
rm(b, c)         # remove multiple
rm(list = ls())  # remove all workspace variables

library()  # shows all the packages you have
search()   # shows packages currently loaded
# Installs from the web; only the name is needed (installed, not yet active)
install.packages("ggplot2")
?install.packages                  # shows help resources for a command
library(ggplot2)                   # this package is now active
library(help = "ggplot2")          # brings up documentation for a package
browseVignettes(package = "grid")  # opens examples in your browser
update.packages()  # checks all installed packages (active or not) for updates
detach("package:ggplot2", unload = TRUE)  # de-activates (still installed)
remove.packages("psytabs")                # uninstalls the package completely

Single mean hypothesis test and CI

# Single mean hypothesis test and CI ----

# Load data
quakes
quakes[1:5, ]      # only rows 1-5, all columns (column index left blank)
mag <- quakes$mag  # save only the magnitude variable
mag[1:5]

# One-sample t-test on a quantitative variable.
# With a single variable the default compares the mean to 0 (null: mean = 0);
# the output includes the confidence interval and the sample mean.
t.test(mag)

# One-sided t-test with mu = 4 (mu is the Greek letter for the true mean).
# Null value is 4; the alternative is that the true mean is greater than 4.
t.test(mag, alternative = "greater", mu = 4)

Converting tabular data into row data

# Converting tabular data into row data ----

# Load the example dataset
# str() shows a table with three variables: Admit first, Gender second,
# Dept third.
str(UCBAdmissions)

# margin.table() gives an overall summary; here for the first variable.
# Without a variable number it just returns the entire sample size.
margin.table(UCBAdmissions, 1)

# Table with just the Dept variable from the original data set
admit.dept <- margin.table(UCBAdmissions, 3)
barplot(admit.dept)                       # barplot of the new variable
admit.dept                                # frequencies
prop.table(admit.dept)                    # proportions
round(prop.table(admit.dept), 2)          # rounded to 2 decimal places
round(prop.table(admit.dept), 2) * 100    # percentages

# Converting the table into one row per case
if (interactive()) {
  View(UCBAdmissions)  # to see what it normally looks like
}
# Step 1: turn the table into a data frame
admit1 <- as.data.frame(UCBAdmissions)
# Step 2: repeat each row as many times as its frequency
admit2 <- lapply(admit1, function(x) rep(x, admit1$Freq))
# Step 3: convert the list back into a data frame
admit3 <- as.data.frame(admit2)
# Drop the 4th column (Freq), which is no longer needed. The comma separates
# rows from columns; a blank side means "all".
admit4 <- admit3[, -4]

# All steps in one (just change the name for the next table)
admit.rows <- as.data.frame(
  lapply(
    as.data.frame.table(UCBAdmissions),
    function(x) rep(x, as.data.frame.table(UCBAdmissions)$Freq)
  )
)[, -4]
admit.rows[1:10, ]  # first 10 rows, all columns

Saving images

# Saving images ----
# Shortcut: just click "Export" (next to "Zoom") in the RStudio plot pane.

library(datasets)
feeds <- table(chickwts$feed)

# Draws the horizontal feed-frequency bar chart on the current device.
# Sets the outer and plot margins first, so each device gets the same layout.
draw_feeds_chart <- function(feeds) {
  par(oma = c(1, 1, 1, 1))  # outside margins: b, l, t, r
  par(mar = c(4, 5, 2, 1))  # plot margins
  barplot(
    feeds[order(feeds)],
    horiz = TRUE,
    las = 1,      # orientation of axis labels
    col = c("beige", "blanchedalmond", "bisque1",
            "bisque2", "bisque3", "bisque4"),
    border = NA,  # no borders on bars
    main = "Frequencies of Different Feeds\nin chickwts Dataset",
    xlab = "Number of Chicks"
  )
}

# Hard way, PNG file (run the entire block at once)
png(filename = "~/Desktop/Ex02_06a.png", width = 888, height = 571)
draw_feeds_chart(feeds)
dev.off()  # close the device (run in the same block)

# OR this one for a PDF file (run the entire block at once)
pdf("~/Desktop/R/Ex02_06b.pdf",
    width = 9,   # in inches
    height = 6)  # in inches
draw_feeds_chart(feeds)
dev.off()  # close the device (run in the same block)

# The easy way: draw on screen, then use RStudio "Export"
draw_feeds_chart(feeds)

Single categorical variables one sample chi-square test

# Single categorical variable: one-sample chi-square test ----

?HairEyeColor
str(HairEyeColor)
HairEyeColor

# Marginal frequencies for eye colour.
# The data is tabular; 2 is used because eye colour is the second variable
# shown by str().
margin.table(HairEyeColor, 2)

# Save into an object and tidy
eyes <- margin.table(HairEyeColor, 2)
round(prop.table(eyes), 2)  # proportions with 2 decimal places

# Chi-square test
# The default tests for an even distribution of proportions.
chi1 <- chisq.test(eyes)  # store results in an object
chi1                      # see results
# p < 0.05: reject H0 that eye colours are evenly distributed.

# Compare to the population eye colour distribution.
# Same test, but with explicit comparison proportions.
p <- c(0.41, 0.32, 0.15, 0.12)
chi2 <- chisq.test(eyes, p = p)
chi2  # see results
# p > 0.05: fail to reject the null; the sample does not differ
# significantly from the population proportions.

rm(list = ls())

Just copy paste into R, it'll be colour coded and easier to interpret!

Yeh boi

Creating Scatterplots

# Creating scatterplots ----

data(cars)
cars

# Basic scatterplot
plot(cars)

# Modified scatterplot
plot(
  cars,
  pch = 16,  # filled circles; see ?pch for the other marker numbers
  col = "gray",
  main = "Speed vs Stopping Distance for Cars",
  xlab = "Speed (MPH)",
  ylab = "Stopping Distance (feet)"
)

# Add a linear regression line (superimposed on the existing plot).
# lm = linear model, regressing distance on speed (Y on X).
abline(lm(cars$dist ~ cars$speed),
       col = "darkred",
       lwd = 2)  # line width

# Locally weighted scatterplot smoothing (lowess); no '~' here
lines(lowess(cars$speed, cars$dist),
      col = "blue",
      lwd = 2)

# Another variation with an external package.
# car = Companion to Applied Regression; installed only when missing.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)
scatterplot(
  cars$dist ~ cars$speed,
  pch = 16,
  col = "darkblue",
  main = "Speed vs. Stopping Distance for Cars",
  xlab = "Speed (MPH)",
  ylab = "Stopping Distance (feet)"
)
# Gives extra elements beyond the plain scatterplot.

Selecting cases

# Selecting cases ----

data(mtcars)
mtcars
mean(mtcars$qsec)

# Mean of qsec using only the observations where cyl is 8
# (notice the double '==').
mean(mtcars$qsec[mtcars$cyl == 8])

# Mean of mpg using only the observations whose hp is above the hp median
mean(mtcars$mpg[mtcars$hp > median(mtcars$hp)])

# Create a subset with only the 8-cylinder cars.
# Inside [ ]: rows where cyl is 8, columns left blank to keep them all.
cyl.8 <- mtcars[mtcars$cyl == 8, ]

# Subset with more than one condition; columns still blank.
# Nothing is stored here, the result is only printed to the console.
mtcars[mtcars$cyl == 8 & mtcars$carb >= 4, ]

Single proportion hypothesis tests and CI's

# Single proportion hypothesis tests and CIs ----

# 98 successes out of 162 trials.
# Gives the p-value, X-squared (chi-squared) and the confidence interval.
prop.test(98, 162)

# One-tailed test; use "less" to ask the question the other way round.
prop.test(98, 162, alternative = "greater", conf.level = 0.90)

Boxplots for Quantitative Variables

# Boxplots for quantitative variables ----

library(datasets)
USJudgeRatings
data(USJudgeRatings)

# State the data and the specific variable to plot
boxplot(USJudgeRatings$RTEN)

# Modifying boxplots (see the help pages for the full list of options)
boxplot(
  USJudgeRatings$RTEN,
  horizontal = TRUE,     # makes it horizontal
  las = 1,               # make all labels horizontal
  # notch = TRUE,        # CI for the median (available, but not used)
  ylim = c(0, 10),       # range for the value axis
  col = "slategray3",    # colour
  boxwex = 0.5,          # width of box as proportion of original
  whisklty = 1,          # 1 = solid whisker line
  staplelty = 0,         # removes the whisker end caps
  outpch = 16,           # symbol for outliers; 16 = filled circle
  outcol = "slategray3", # colour for outliers
  main = "Lawyers' Ratings of State Judges in the\nUS Superior Court (c. 1977)",
  xlab = "Lawyer's Ratings"  # x-label
)

# Multiple boxplots: same settings, but for all variables in the data
boxplot(
  USJudgeRatings,
  horizontal = TRUE,
  las = 1,
  ylim = c(0, 10),
  col = "slategray3",
  boxwex = 0.5,
  whisklty = 1,
  staplelty = 0,
  outpch = 16,
  outcol = "slategray3",
  main = "Lawyers' Ratings of State Judges in the\nUS Superior Court (c. 1977)",
  xlab = "Lawyer's Ratings"
)

rm(list = ls())

Histograms for Quantitative Variables

# Histograms for quantitative variables ----

library(datasets)
data(lynx)
hist(lynx)  # histograms report frequencies (order unimportant)

# Modify the histogram to show proportions (densities).
# The histogram is saved as an object (can be done with any graphic).
h <- hist(
  lynx,
  breaks = 11,  # "suggests" 11 bins
  # breaks = seq(0, 7000, by = 100),
  #   another way: give min and max, break every 100
  # breaks = c(0, 100, 300, 500, 3000, 3500, 7000),
  #   another way: specific intervals
  freq = FALSE,  # default TRUE gives raw counts; FALSE gives densities
  col = "thistle1",  # or use: col = colors()[626]
  main = "Histogram of Annual Canadian Lynx Trappings\n1821-1934",
  xlab = "Number of Lynx Trapped"
)

# Add a normal curve on top of the chart: shows what a curve with the same
# mean and SD would look like.
curve(
  dnorm(x, mean = mean(lynx), sd = sd(lynx)),  # dnorm = normal density
  col = "thistle4",  # different colour to stand out
  lwd = 2,           # line width 2
  add = TRUE         # add to the existing chart
)
# Use densities when superimposing; raw frequencies do not line up with
# the density curve.

rm(list = ls())  # clean up

Overlaying Plots

# Overlaying plots ----

library(datasets)
swiss
str(swiss)
fertility <- swiss$Fertility

# Plots (practise using Ctrl+Shift+P)

# Plot 1: histogram
h <- hist(
  fertility,
  prob = TRUE,  # flip side of freq = FALSE: densities instead of raw counts
  ylim = c(0, 0.04),
  xlim = c(30, 100),
  breaks = 11,
  col = "#E5E5E5",
  border = 0,
  main = "Fertility for 47 French-Speaking\nSwiss Provinces, c. 1888"
)

# Plot 2: normal curve (needs prob = TRUE)
curve(
  dnorm(x, mean = mean(fertility), sd = sd(fertility)),
  col = "red",
  lwd = 3,
  add = TRUE
)

# Plots 3 and 4: kernel density lines (need prob = TRUE)
lines(density(fertility), col = "blue")  # default kernel, a smooth average
lines(density(fertility, adjust = 3), col = "darkgreen")  # changes less

# Plot 5: rug (a line plot under the histogram)
rug(fertility, col = "red")

rm(list = ls())  # clean up

Bar Charts for Categorical Variables

# Bar charts for categorical variables ----

library(datasets)  # get the data; the example uses 'chickwts'
chickwts           # look at the structure: one row per observation
plot(chickwts$feed)  # '$' picks the variable to look at

# barplot() is more customisable, but it cannot use raw one-row-per-case
# data, so a summary table has to be created first.
feed <- table(chickwts$feed)  # table() turns the data into a table
feed                          # take a look
barplot(feed)                 # same as the plot() call, but can do more
# The order statement goes in [ ]. Naming feed twice looks redundant here,
# but it makes it possible to order one variable by another.
barplot(feed[order(feed, decreasing = TRUE)])

# Customising the chart
# par() sets graphical parameters; once applied they affect every later
# plot, so the current values are stored and put back at the end.
old.par <- par(no.readonly = TRUE)
par(oma = c(1, 1, 1, 1))  # outside margins (bottom, left, top, right)
par(mar = c(4, 5, 2, 1))  # plot margins, in lines; trial and error

# Further customisations. White space does not matter: the call is spread
# over several lines only because it has many arguments.
barplot(
  feed[order(feed)],
  horiz = TRUE,  # makes the bars horizontal
  las = 1,       # orientation of axis labels (1 = horizontal)
  col = c("beige", "blanchedalmond", "bisque1",
          "bisque2", "bisque3", "bisque4"),  # colours picked by preference
  border = NA,   # no borders on bars
  main = "Frequencies of Different Feeds\nin chickwts Dataset",  # \n = break
  xlab = "Number of Chicks"  # x-axis label
)

par(old.par)     # back to the previous graphical parameters
rm(list = ls())  # clean up


Set pelajaran terkait

Fundementals of nursing final exam Prep Us

View Set

ACSM EP Domain I: Fitness and Health Assessment

View Set

McKay 16 Toward a New Worldview 1540-1789

View Set

Chapter 3: Race, Ethnicity, and Immigration

View Set

REVIEW NINJA MCQ BEFORE REG, CPA REG - Business Law Section, SuperFast CPA REG, SuperFast CPA REG, CPA Exam Regulation - Individual Taxation, REG CPA

View Set

geg 101 - Solar Energy and the Greenhouse Effect

View Set

Chapter 28 North Korea and South Korea quiz

View Set