library(ggplot2)
library(dplyr)
library(scales)
library(xlsx)
library(reshape2)
library(lubridate)
library(ggthemes)
library(gridExtra)
Q3.1
# a) Load the 'diamonds' data set in R Studio.
# How many observations are in the data set?
nrow(diamonds)
## [1] 53940
# b) How many variables are in the data set?
ncol(diamonds)
## [1] 10
# c) How many ordered factors are in the set?
# str(diamonds)
# 3
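# A quick programmatic check (a sketch, not part of the original answer):
# count the columns that are ordered factors.
sum(vapply(diamonds, is.ordered, logical(1)))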
# d) What letter represents the best color for a diamond?
levels(diamonds$color)
## [1] "D" "E" "F" "G" "H" "I" "J"
# help(diamonds)
# D
Q3.2
# Create a histogram of the price of
# all the diamonds in the diamond data set.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 500) +
scale_x_continuous(labels = dollar, breaks = seq(0, 20000, 1000)) +
theme(axis.text.x = element_text(angle = 90)) +
xlab("Price") + ylab("Count")
Q3.3
# Describe the shape and center of the price distribution.
# Include summary statistics like the mean and median.
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2400 3930 5320 18800
#
# The distribution is right-skewed, with a small number of very high prices pulling the mean up, while the median remains a more robust measure of the center of the distribution.
Q3.4
# a) How many diamonds cost less than $500?
diamonds %>%
filter(price < 500) %>%
summarise(n = n())
## n
## 1 1729
# b) How many diamonds cost less than $250?
diamonds %>%
filter(price < 250) %>%
summarise(n = n())
## n
## 1 0
# c) How many diamonds cost more than $15,000?
diamonds %>%
filter(price > 15000) %>%
summarise(n = n())
## n
## 1 1656
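# Equivalently (a minimal sketch), summing a logical vector counts its TRUEs,
# so each of the answers above can be reproduced in one line.
sum(diamonds$price < 500)
sum(diamonds$price < 250)
sum(diamonds$price > 15000)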
Q3.5
# Explore the largest peak in the
# price histogram you created earlier.
# Try limiting the x-axis, altering the bin width,
# and setting different breaks on the x-axis.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) +
scale_x_continuous(labels = dollar, breaks = seq(0, 2000, 100)) +
theme(axis.text.x = element_text(angle = 90)) +
coord_cartesian(c(0,2000)) +
xlab("Price") + ylab("Count")
Q3.6
# Break out the histogram of diamond prices by cut.
# You should have five histograms in separate
# panels on your resulting plot.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) +
scale_x_continuous(labels = dollar, breaks = seq(0, 4000, 100)) +
theme(axis.text.x = element_text(angle = 90)) +
coord_cartesian(c(0,4000)) +
facet_grid(cut~.) +
xlab("Price") + ylab("Count")
Q3.7
# a) Which cut has the highest priced diamond?
# Premium
# by(diamonds$price, diamonds$cut, max)
# by(diamonds$price, diamonds$cut, min)
# by(diamonds$price, diamonds$cut, median)
diamonds %>%
group_by(cut) %>%
summarise(max_price = max(price),
min_price = min(price),
median_price = median(price))
## Source: local data frame [5 x 4]
##
## cut max_price min_price median_price
## 1 Fair 18574 337 3282
## 2 Good 18788 327 3050
## 3 Very Good 18818 336 2648
## 4 Premium 18823 326 3185
## 5 Ideal 18806 326 1810
# b) Which cut has the lowest priced diamond?
# Premium & Ideal
# c) Which cut has the lowest median price?
# Ideal
Q3.8
# In the two last exercises, we looked at
# the distribution for diamonds by cut.
# Run the code below in R Studio to generate
# the histogram as a reminder.
# ===============================================================
qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## (message repeated once per facet panel)
# ===============================================================
# In the last exercise, we looked at the summary statistics
# for diamond price by cut. If we look at the output table,
# the median and quartiles are reasonably close to each other.
# diamonds$cut: Fair
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 337 2050 3282 4359 5206 18570
# ------------------------------------------------------------------------
# diamonds$cut: Good
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 327 1145 3050 3929 5028 18790
# ------------------------------------------------------------------------
# diamonds$cut: Very Good
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 336 912 2648 3982 5373 18820
# ------------------------------------------------------------------------
# diamonds$cut: Premium
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 326 1046 3185 4584 6296 18820
# ------------------------------------------------------------------------
# diamonds$cut: Ideal
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 326 878 1810 3458 4678 18810
# This means the distributions should be somewhat similar,
# but the histograms we created don't show that.
# The 'Fair' and 'Good' diamonds appear to have
# different distributions compared to the better
# cut diamonds. They seem somewhat uniform
# on the left with long tails on the right.
# Let's look into this more.
# Look up the documentation for facet_wrap in R Studio.
# Then, scroll back up and add a parameter to facet_wrap so that
# the y-axis in the histograms is not fixed. You want the y-axis to
# be different for each histogram.
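# One possible answer (a sketch): facet_wrap accepts a 'scales' argument,
# and "free_y" lets each panel's y-axis vary while keeping a common x-axis.
qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free_y")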
Q3.9
# Create a histogram of price per carat
# and facet it by cut. You can make adjustments
# to the code from the previous exercise to get
# started.
# Adjust the bin width and transform the scale
# of the x-axis using log10.
# Submit your final code when you are ready.
# ENTER YOUR CODE BELOW THIS LINE.
# ===========================================================================
ggplot(diamonds, aes(x = price/carat)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = .05) +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_log10(expression(paste(Log[10], " of Price per Carat")),
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
facet_grid(cut~., scales = "free_y") + ylab("Count")
Q3.10
# Investigate the price of diamonds using box plots,
# numerical summaries, and one of the following categorical
# variables: cut, clarity, or color.
diamonds %>%
group_by(cut) %>%
summarise(count = n(),
avg_price = mean(price))
## Source: local data frame [5 x 3]
##
## cut count avg_price
## 1 Fair 1610 4359
## 2 Good 4906 3929
## 3 Very Good 12082 3982
## 4 Premium 13791 4584
## 5 Ideal 21551 3458
# There are many more Ideal diamonds than others, but the average price is also the lowest.
ggplot(diamonds, aes(x = clarity, y = price, color = cut)) +
geom_boxplot() +
facet_grid(color~., margins = TRUE)
# This boxplot matrix shows the distribution of price across all three categorical variables: cut, clarity, and color.
Q3.11
# a) What is the price range for the middle 50% of the diamonds with color D?
# c) What is the IQR for diamonds with the best color?
diamonds %>%
group_by(color) %>%
filter(color == "D") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
## Source: local data frame [1 x 4]
##
## color Quartile.25 Quartile.75 IQR
## 1 D 911 4214 3302
# b) What is the price range for the middle 50% of diamonds with color J?
# d) What is the IQR for the diamonds with the worst color?
diamonds %>%
group_by(color) %>%
filter(color == "J") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
## Source: local data frame [1 x 4]
##
## color Quartile.25 Quartile.75 IQR
## 1 J 1860 7695 5834
# by(diamonds$price, diamonds$color, summary)
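# A more compact sketch using stats::IQR(), computing the interquartile
# range for every color level at once instead of filtering one at a time:
diamonds %>%
group_by(color) %>%
summarise(IQR = IQR(price))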
Q3.12
# Investigate the price per carat of diamonds across
# the different colors of diamonds using boxplots.
ggplot(diamonds, aes(x = color, y = price/carat, fill = color)) +
geom_boxplot() +
coord_cartesian(ylim=c(1000, 6000)) +
scale_y_continuous(labels=dollar) +
xlab("Color") + ylab("Price per Carat")
Q3.13
# Investigate the weight of the diamonds (carat) using a frequency polygon.
# Use different bin widths to see how the frequency polygon changes.
# What carat size has a count greater than 2000? Check all that apply.
sizes <- c(0.1, 0.3, 0.8, 1.01, 1.6, 2.0, 3.0, 5.0)
summary(diamonds$carat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.200 0.400 0.700 0.798 1.040 5.010
ggplot(diamonds, aes(x=carat)) +
geom_freqpoly(binwidth=0.1, alpha = 0.75) +
scale_x_continuous(breaks=sizes, expand = c(0,0)) +
scale_y_continuous(expand=c(0,0))+
geom_vline(xintercept = c(0.1, 0.8, 1.6, 2.0, 3.0, 5.0), color = "red", linetype="dashed", alpha = 0.75) +
geom_vline(xintercept = c(0.3, 1.01), color = "forestgreen", linetype = "twodash") +
geom_hline(yintercept = 2000, color = "brown", linetype="longdash", alpha = 0.5) +
xlab("Carat Size") + ylab("Count")
# 0.3 and 1.01 in green
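# To verify the answer numerically (a sketch): bin carat into the same
# 0.1-wide bins as the frequency polygon and keep the bins with n > 2000.
diamonds %>%
mutate(carat_bin = cut(carat, breaks = seq(0, 5.1, 0.1))) %>%
group_by(carat_bin) %>%
summarise(n = n()) %>%
filter(n > 2000)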
Q3.14
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to download a data set of your choice
# and create 2-5 plots that make use of the techniques from Lesson 3.
# http://spreadsheets.google.com/pub?key=rdCufG2vozTpKw7TBGbyoWw&output=xls
hours <- tbl_df(read.xlsx("indicator_hours per week.xlsx", sheetName="Data", header=TRUE))
hours <- hours %>%
select(-NA.) %>% # Remove NA column carried over from xlsx
rename(Country = Working.hours.per.week) %>%
filter(Country != "<NA>") # Remove <NA> row carried over from xlsx
hours.long <- melt(hours, id=c("Country"), value.name="Hours", variable.name="Year")
hours.long <- tbl_df(hours.long)
hours.long <- hours.long %>%
mutate(Year = as.character(Year), # Convert to character
Year = substr(Year, 2, 5), # Drop the leading "X", keeping the 4-digit year; R prepends "X" because column names can't start with numbers.
Year = as.numeric(Year)) # Cast as numeric
yearStats <- hours.long %>%
group_by(Year) %>%
summarise(median = median(Hours, na.rm=TRUE),
mean = mean(Hours, na.rm=TRUE),
lower = min(Hours, na.rm=TRUE),
upper = max(Hours, na.rm=TRUE),
se = sd(Hours, na.rm=TRUE)/sqrt(sum(!is.na(Hours))), # Use the non-NA count so missing values don't deflate the SE
avg_upper = mean + (2.101*se), # 2.101 is a t-based multiplier for an approximate 95% CI
avg_lower = mean - (2.101*se),
quant.25 = quantile(Hours, na.rm=TRUE, 0.25),
quant.75 = quantile(Hours, na.rm=TRUE, 0.75))
yearStats <- round(yearStats, 2)
p <- ggplot(yearStats, aes(x=Year, y=median)) +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()) +
geom_linerange(yearStats, mapping=aes(x=Year, ymin=lower, ymax=upper), colour = "wheat2", alpha=1, size=5) +
geom_linerange(yearStats, mapping=aes(x=Year, ymin=quant.25, ymax=quant.75), colour = "wheat4", size=5) +
geom_line(yearStats, mapping=aes(x=Year, y=median, group=1)) +
geom_vline(xintercept = 1980, colour = "wheat4", linetype=1, size=1) +
geom_hline(yintercept=seq(26, 56, 2), color="white", linetype=1)
dottedYears <- seq(1980, 2007, 5) # Pick the years to draw dotted vertical lines on
p <- p + geom_vline(xintercept = dottedYears, color="wheat4", linetype=3, size=0.5)
p <- p + coord_cartesian(ylim = c(26,58))+
scale_y_continuous(breaks=seq(26, 60, 2)) +
scale_x_continuous(breaks=seq(1980, 2005, 5), expand=c(0,0) )
p <- p + geom_line(data = subset(hours.long, Country == "United States"), aes(x = Year, y = Hours, group = Country), color ="brown") +
annotate("segment", x=2000, xend=2002, y=35.5, yend=36, color="brown") +
annotate("text", x=2003, y=36.3, label="U.S. Hours", size=3.5, color="brown") +
annotate("segment", x=2000, xend=2001, y=33.5, yend=32) +
annotate("text", x=2002, y=31.7, label="World Medians", size=3.5)
p1 <- p + annotate("text", x=1999.9, y=56, label="Data represents hours worked per week for 52 countries ", size=3, color="gray30") +
annotate("text", x=2000, y=55, label="from 1980 to 2007. Outer lighter bands show the min/max ", size=3, color="gray30") +
annotate("text", x=2000, y=54, label="hours for each year, and inner darker bands show the IQR.", size=3, color="gray30") +
ggtitle("World's Working Hours") +
theme(plot.title=element_text(face="bold",hjust=.95,vjust=.8,color="#3C3C3C",size=20)) +
annotate("text", x=1994.6, y=57.5, label="Weekly", size=4, fontface="bold")
p1
# Zoom in a bit to the Inter-Quartile Range (IQR)
p + coord_cartesian(ylim=c(30, 38)) +
ggtitle("Inter-Quartile Range of Weekly Hours Worked Per Year Per Country") +
theme(plot.title=element_text(face="bold",color="#3C3C3C",size=12)) +
geom_text(data = hours.long[hours.long$Country == "United States",],
aes(x = Year, y = Hours, group = Country, label = round(Hours, 2)),
hjust = -.1, vjust = -1.2, size = 2, color = "brown") +
geom_text( aes(x = Year, y = median, label = median), hjust = -.1, vjust = 1.2, size = 2, color = "black")
ggplot(subset(hours.long, Country %in% c("Switzerland", "United States", "Japan", "United Kingdom", "France")), aes(x=Country, y=Hours, fill=Country)) +
geom_boxplot() +
stat_summary(fun.y = mean, geom="point", shape = 5) +
ylab("Hours Worked per Week") + xlab("") +
theme(legend.position="none") +
ggtitle("Box plots of Hours Worked per Week for 5 Countries") +
theme(plot.title=element_text(face="bold",color="#3C3C3C",size=12))
ggplot(subset(hours.long, Country %in% c("Switzerland", "United States", "Japan", "United Kingdom", "France")), aes(x = Year, y = Hours)) +
geom_line(aes(color = Country, group = Country)) +
theme(axis.text.x = element_text(angle = 45)) +
scale_y_continuous(breaks=seq(26, 56, 2)) +
scale_x_continuous(breaks=seq(1980, 2005, 5), expand=c(0,0) ) +
ylab("Hours Worked per Week") + xlab("") +
theme(plot.title=element_text(face="bold",color="#3C3C3C",size=12)) +
ggtitle("Change in Weekly Hours Worked from 1980-2007 for 5 Countries")
Q3.15
# How many birthdays are in each month?
# Which day of the year has the most birthdays?
# Do you have at least 365 friends that have birthdays on every day
# of the year?
# **********************************************************************
# You will need to do some data munging and additional research to
# complete this task. This task won't be easy, and you may encounter some
# unexpected challenges along the way. We hope you learn a lot from it though.
# You can expect to spend 30 min or more on this task depending on whether
# you use the provided data or obtain your personal data. We also encourage you
# to use the lubridate package for working with dates. Read over the documentation
# in RStudio and search for examples online if you need help.
# You'll need to export your Facebook friends' birthdays to a csv file.
# You may need to create a calendar of your Facebook friends’ birthdays
# in a program like Outlook or Gmail and then export the calendar as a
# csv file.
# Once you load the data into R Studio, you can use the strptime() function
# to extract the birth months and birth days. We recommend looking up the
# documentation for the function and finding examples online.
birthdays <- tbl_df(read.csv("birthdaysExample.csv"))
tempDates <- mdy(birthdays$dates)
birthdays <- birthdays %>%
mutate(Birthday = tempDates,
Year = year(tempDates),
Month = month(tempDates, label=TRUE, abbr=FALSE),
Day = day(tempDates),
Weekday = weekdays(tempDates, abbr=FALSE))
birthdays$Weekday <- factor(birthdays$Weekday, levels=c('Monday', 'Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'), ordered=TRUE)
# ifelse() doesn't work for creating 'optional' here: it is vectorized and cannot return NULL from its else branch, so R complains.
# Instead, 'optional' starts as NULL and becomes a ggplot layer only if "Day" is passed in.
# aes_string() takes a string as the column name for plotting -- ideal for the column names we pass in as strings.
# The argument is also pasted in for custom x- and y-axis labeling.
makePlots<- function(TimeLength){
optional<- NULL
if(TimeLength == "Day") optional<- scale_x_discrete(breaks = seq(1, 31, 1))
ggplot(birthdays, aes_string(x = TimeLength, fill = TimeLength)) +
geom_histogram(binwidth = 1, color = "black", show_guide = FALSE) +
scale_fill_brewer(palette="Paired") +
stat_bin(aes(label=..count..), vjust=-.1,
geom="text", position="identity", size=3) +
xlab(TimeLength)+
ylab(paste0("Number of Birthdays per ", TimeLength)) +
theme_tufte() +
optional
}
makePlots("Month")
makePlots("Day")
makePlots("Weekday")
Q4.1
# In this problem set, you'll continue
# to explore the diamonds data set.
# Your first task is to create a
# scatterplot of price vs. x
# using the ggplot syntax.
ggplot(diamonds, aes(x = x, y = price)) +
geom_point(alpha = 1/20) +
coord_cartesian(xlim=c(3.5, 12)) +
scale_y_continuous(breaks=seq(1000, 19000, 2000),label=dollar)
Q4.2
# What are your observations about the scatterplot of price vs x?
# The bulk of the data starts at around x = 3.3, and price rises exponentially as x approaches 9.
# There also seems to be an artificial ceiling in price around $19k.
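# A quick check of the "exponential" claim (a sketch): on a log10 price
# scale the relationship should look roughly linear.
ggplot(subset(diamonds, x > 0), aes(x = x, y = price)) +
geom_point(alpha = 1/20) +
scale_y_log10()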
Q4.3
# What is the correlation between price and x?
with(diamonds, cor.test(price, x))
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.2, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8826 0.8863
## sample estimates:
## cor
## 0.8844
# What is the correlation between price and y?
with(diamonds, cor.test(price, y))
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.1, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8633 0.8675
## sample estimates:
## cor
## 0.8654
# What is the correlation between price and z?
with(diamonds, cor.test(price, z))
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8591 0.8634
## sample estimates:
## cor
## 0.8612
Q4.4
# Create a simple scatter plot of price vs depth.
ggplot(diamonds, aes(x = depth, y = price)) + geom_point(alpha = 1/20)
Q4.5
# Change the code to make the transparency of the
# points to be 1/100 of what they are now and mark
# the x-axis every 2 units.
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha = 1/100) +
scale_x_continuous(breaks = seq(min(diamonds$depth), max(diamonds$depth), 2),
labels = seq(min(diamonds$depth), max(diamonds$depth), 2))
Q4.6
# Based on the scatterplot of depth vs. price, most diamonds are between what values of depth?
# 58-64
Q4.7
# What's the correlation of depth vs. price?
with(diamonds, cor.test(depth, price))
##
## Pearson's product-moment correlation
##
## data: depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019085 -0.002209
## sample estimates:
## cor
## -0.01065
# Based on the correlation coefficient, would you use depth to predict the price of a diamond? Why?
# No.
# The correlation is statistically distinguishable from zero (p-value = 0.0134, largely a consequence of the huge sample size),
# but its magnitude is negligible.
# Depth alone would have virtually no predictive power for price.
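# Squaring the correlation (a one-line sketch) gives the share of price
# variance that depth alone explains -- on the order of 0.01%:
with(diamonds, cor(depth, price)^2)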
Q4.8
# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.
ggplot(diamonds, aes(x = carat, y = price)) +
geom_point(alpha=1/20) +
scale_x_continuous(limits=c(0, quantile(diamonds$carat, 0.99))) +
scale_y_continuous(breaks=seq(0, 18000, 2000),
limits=c(0 , quantile(diamonds$price, 0.99)),
labels=dollar)
Q4.9
# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.
# Create a new variable for volume in the diamonds data frame.
# This will be useful in a later exercise.
# Don't make any adjustments to the plot just yet.
diamonds2 <- diamonds %>%
mutate(volume=x*y*z)
ggplot(diamonds2, aes(x = volume, y = price)) +
geom_point()
Q4.10
# What are your observations from the price vs volume scatterplot?
# There are 3 obvious outliers, with 1 massive diamond expanding the plot quite a bit.
# Prices rise exponentially with volume, making a transformation of the x-scale a good idea.
# There may also be diamonds with volumes at or near 0.
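# Counting the suspect rows (a sketch): diamonds whose recorded
# dimensions give a volume of exactly 0.
sum(diamonds2$volume == 0)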
Q4.11
# What's the correlation of price and volume?
# Exclude diamonds that have a volume of 0 or that are greater than or equal to 800.
with(subset(diamonds2, !(volume == 0 | volume >= 800) ), cor.test(price, volume))
##
## Pearson's product-moment correlation
##
## data: price and volume
## t = 559.2, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9223 0.9248
## sample estimates:
## cor
## 0.9235
Q4.12
# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot. (See the
# Instructor Notes or look up the documentation of
# geom_smooth() for more details about smoothers.)
# We encourage you to think about this next question and
# to post your thoughts in the discussion section.
# Do you think this would be a useful model to estimate
# the price of diamonds? Why or why not?
smaller <- diamonds2 %>%
filter(volume != 0,
volume < 800)
ggplot(smaller, aes( x = volume, y = price)) +
geom_point(alpha = 1/20) +
geom_smooth(method = "lm",
se = TRUE)
# This is probably not the best model, since the relationship does not appear to be linear.
# The residuals would show a heavy pattern
# instead of random scatter.
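# A minimal residual check (a sketch, not part of the original exercise):
# plotting residuals of the linear fit against volume should expose the
# pattern described above.
m <- lm(price ~ volume, data = smaller)
ggplot(smaller, aes(x = volume, y = resid(m))) +
geom_point(alpha = 1/20) +
geom_hline(yintercept = 0, linetype = "dashed")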
Q4.13
# Use functions in the dplyr package
# to create a new data frame containing
# info on diamonds by clarity.
# Name the data frame diamondsByClarity
# The data frame should contain the following
# variables in this order.
# (1) mean_price
# (2) median_price
# (3) min_price
# (4) max_price
# (5) n
# where n is the number of diamonds in each
# level of clarity.
diamondsByClarity<- diamonds %>%
group_by(clarity) %>%
summarise(mean_price = mean(price),
median_price = median(price),
min_price = min(price),
max_price = max(price),
n = n() ) %>%
arrange(clarity)
Q4.14
# We’ve created summary data frames with the mean price
# by clarity and color. You can run the code in R to
# verify what data is in the variables diamonds_mp_by_clarity
# and diamonds_mp_by_color.
# Your task is to write additional code to create two bar plots
# on one output image using the grid.arrange() function from the package
# gridExtra.
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
# ===================================================================
c1 <- ggplot(diamonds_mp_by_clarity, aes(x=clarity, y=mean_price, fill=clarity)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette="Set3") +
guides(fill = guide_legend(ncol=2, title.hjust=0.3))
c2 <- ggplot(diamonds_mp_by_color, aes(x=color, y=mean_price, fill=color)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette="Set2") +
guides(fill = guide_legend(ncol=2, title.hjust=0.4))
grid.arrange(c1, c2)
Q4.15
# What do you notice in each of the bar charts for mean price by clarity and mean price by color?
# There's a downward trend in average diamond price as clarity goes from I1 to IF,
# and an upward trend in average price as color goes from D to J.
Q4.16
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to continue the investigation you did at the
# end of Problem Set 3 or you can start fresh and choose a different
# data set from Gapminder.
# If you’re feeling adventurous or want to try some data munging see if you can
# find a data set or scrape one from the web.
# In your investigation, examine pairs of variable and create 2-5 plots that make
# use of the techniques from Lesson 4.
# ======================================================================================================
# Programme for International Student Assessment (PISA) is a triennial international survey
# which aims to evaluate education systems worldwide by testing the skills and knowledge of 15-year-old students.
# Around 510,000 students in 65 economies took part in PISA 2012 representing about 28 million 15-year-olds globally.
# Our dataset is composed of 485,490 test results from the 2012 PISA testing results.
# http://www.oecd.org/pisa/aboutpisa/
con <- url("http://beta.icm.edu.pl/PISAcontest/data/student2012.rda")
load(con)
# It's a large file, so alternatively, you can download the .rda file and load it directly.
# load("student2012.rda")
student2012 <- tbl_df(student2012)
# For each of Math, Reading, and Science there are 5 PV variables: plausible values created from tests which may have contained missing data.
# 5 different imputed sets were created for completeness.
# These are averaged together to get the estimated (mean) plausible value for each subject.
## The score variables carry attributes that dplyr's group_by cannot handle
## These can be cast as numeric to strip the attributes
## mutate_each can apply a function (as.numeric) to each column matching a certain criterion
country_data <- student2012 %>%
mutate(Math = as.numeric((PV1MATH + PV2MATH + PV3MATH + PV4MATH + PV5MATH)/5),
Reading = as.numeric((PV1READ + PV2READ + PV3READ + PV4READ + PV5READ)/5),
Science = as.numeric((PV1SCIE + PV2SCIE + PV3SCIE + PV4SCIE + PV5SCIE)/5)) %>%
select(Gender = ST04Q01,
birth_year = ST03Q02,
birth_month = ST03Q01,
PC_at_home = ST26Q04,
Country = CNT,
Math, Reading, Science)
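# The mutate_each approach mentioned above would look roughly like this
# (a sketch against the dplyr API of this era; left commented out because
# the pipeline above already casts the averages explicitly):
# student2012 <- student2012 %>%
# mutate_each(funs(as.numeric), starts_with("PV"))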
country_scores <- country_data %>%
group_by(Country, Gender) %>%
summarise(mean_math = mean(Math),
mean_reading = mean(Reading),
mean_science = mean(Science),
count = n() ) %>%
arrange(mean_math, mean_reading, mean_science)
ggplot(country_scores, aes(x = mean_reading, y = mean_math, color = Gender, size=count)) +
geom_point() +
geom_smooth() +
geom_point(data = subset(country_scores, Country %in% c("United States of America", "Mexico", "Peru", "Qatar", "China-Shanghai", "Singapore", "Korea", "Finland", "Chile")),
color="#0266c8",
show_guide = FALSE) +
geom_text(data = subset(country_scores, Country %in% c("United States of America", "Mexico", "Peru", "Qatar", "China-Shanghai", "Singapore", "Korea", "Finland", "Chile")),
aes(label = Country),
color = "black",
size = 3,
vjust = -0.5,
show_guide = FALSE) +
#geom_abline(intercept = 0, slope = 1, linetype=2, color="orange") +
ylab("Average Math Score") +
xlab("Average Reading Score") +
ggtitle("Average Reading and Math Scores per country, by gender")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# It seems to be a world-wide trend that boys are achieving higher scores in math than reading,
# and girls are in turn doing better in reading than math on average.
# We can see the US in the middle of the pack, while Asian countries are making up the majority at the top.
ggplot(country_scores, aes(x = mean_reading, y = mean_science, color = Gender, size=count)) +
geom_point() +
geom_smooth() +
geom_point(data = subset(country_scores, Country %in% c("United States of America", "Mexico", "Peru", "Qatar", "China-Shanghai", "Singapore", "Korea", "Finland", "Chile")),
color="#0266c8",
show_guide = FALSE) +
geom_text(data = subset(country_scores, Country %in% c("United States of America", "Mexico", "Peru", "Qatar", "China-Shanghai", "Singapore", "Korea", "Finland", "Chile")),
aes(label = Country),
color = "black",
size = 3,
vjust = -0.5,
show_guide = FALSE) +
#geom_abline(intercept = 0, slope = 1, linetype=2, color="orange") +
ylab("Average Science Score") +
xlab("Average Reading Score") +
ggtitle("Average Reading and Science Scores per country, by gender")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# The trend is similar for science vs reading.
country_scores_PC <- country_data %>%
group_by(Country, PC_at_home) %>%
summarise(mean_math = mean(Math),
mean_reading = mean(Reading),
mean_science = mean(Science),
count = n() ) %>%
arrange(mean_math, mean_reading, mean_science)
g1 <- ggplot(subset(country_scores_PC, !(is.na(PC_at_home))), aes(x = mean_reading, fill = PC_at_home, size=count)) +
geom_density(alpha=0.66) +
ylab("Density") +
xlab("Average Reading Score") +
ggtitle("Density Plot of Average Reading Scores with/without a PC at home")
g2 <- ggplot(subset(country_scores_PC, !(is.na(PC_at_home))), aes(x = mean_math, fill = PC_at_home, size=count)) +
geom_density(alpha=0.66) +
ylab("Density") +
xlab("Average Math Score") +
ggtitle("Density Plot of Average Math Scores with/without a PC at home")
g3 <- ggplot(subset(country_scores_PC, !(is.na(PC_at_home))), aes(x = mean_science, fill = PC_at_home, size=count)) +
geom_density(alpha=0.66) +
ylab("Density") +
xlab("Average Science Score") +
ggtitle("Density Plot of Average Science Scores with/without a PC at home")
grid.arrange(g1, g2, g3, ncol=1)
## Across all countries, students with a PC at home had a higher distribution of scores in reading, math, and science.
Q5.1
# Create a histogram of diamond prices.
# Facet the histogram by diamond color
# and use cut to color the histogram bars.
# The plot should look something like this.
# http://i.imgur.com/b5xyrOu.jpg
# Note: In the link, a color palette of type
# 'qual' was used to color the histogram using
# scale_fill_brewer(type = 'qual')
ggplot(diamonds, aes(x = price, fill = cut)) +
geom_histogram() +
facet_wrap(~ color) +
scale_fill_brewer(type = 'qual') +
scale_x_log10(labels=dollar, expression(paste(Log[10], " of Price"))) +
ylab("Count")
Q5.2
# Create a scatterplot of diamond price vs.
# table and color the points by the cut of
# the diamond.
# The plot should look something like this.
# http://i.imgur.com/rQF9jQr.jpg
# Note: In the link, a color palette of type
# 'qual' was used to color the scatterplot using
# scale_color_brewer(type = 'qual')
ggplot(diamonds, aes(x = table, y = price, color = cut)) +
geom_jitter(size = 3) +
scale_x_continuous(breaks = seq(50, 80, 2),
limits = c(50, 80)) +
scale_color_brewer(type = 'qual') +
theme_minimal()
Q5.3
## What is the typical table range for the majority of diamonds of ideal cut?
# 54 to 57
## What is the typical table range for the majority of diamonds of premium cut?
# 58 to 60
## Use the graph that you created from the previous exercise to see the answer. You do not need to run summaries.
Q5.4
# Create a scatterplot of diamond price vs.
# volume (x * y * z) and color the points by
# the clarity of diamonds. Use scale on the y-axis
# to take the log10 of price. You should also
# omit the top 1% of diamond volumes from the plot.
# Note: Volume is a very rough approximation of
# a diamond's actual volume.
# The plot should look something like this.
# http://i.imgur.com/excUpea.jpg
# Note: In the link, a color palette of type
# 'div' was used to color the scatterplot using
# scale_color_brewer(type = 'div')
diamonds <- diamonds %>%
mutate(volume = x * y * z)
ggplot(subset(diamonds, volume <= quantile(volume, 0.99) & volume > 0 ), aes(x = volume, y = price, color = clarity)) +
geom_jitter(size = 3) +
scale_y_log10(labels=dollar) +
scale_color_brewer(type = 'div') +
theme_minimal()
Q5.5
# Many interesting variables are derived from two or more others.
# For example, we might wonder how much of a person's network on
# a service like Facebook the user actively initiated. Two users
# with the same degree (or number of friends) might be very
# different if one initiated most of those connections on the
# service, while the other initiated very few. So it could be
# useful to consider this proportion of existing friendships that
# the user initiated. This might be a good predictor of how active
# a user is compared with their peers, or other traits, such as
# personality (i.e., is this person an extrovert?).
# Your task is to create a new variable called 'prop_initiated'
# in the Pseudo-Facebook data set. The variable should contain
# the proportion of friendships that the user initiated.
fb <- tbl_df(read.table("pseudo_facebook.tsv", header=TRUE))
# fb <- fb %>%
# mutate(prop_initiated = ifelse(friend_count > 0, friendships_initiated/friend_count, 0))
# Using the code below instead will generate NA's when friend_count is 0, due to division by 0,
# but future exercises asking for averages would come out differently if we didn't generate those NA's
fb <- fb %>%
mutate(prop_initiated = friendships_initiated/friend_count)
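# A quick sanity check (a sketch): the NA proportions should correspond
# one-to-one with users whose friend_count is 0.
sum(fb$friend_count == 0)
sum(is.na(fb$prop_initiated))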
Q5.6
# Create a line graph of the proportion of
# friendships initiated ('prop_initiated') vs.
# tenure and color the line segment by
# year_joined.bucket.
# Recall, we created year_joined.bucket in Lesson 5
# by first creating year_joined from the variable tenure.
# Then, we used the cut function on year_joined to create
# four bins or cohorts of users.
# (2004, 2009]
# (2009, 2011]
# (2011, 2012]
# (2012, 2014]
# The plot should look something like this.
# http://i.imgur.com/vNjPtDh.jpg
# OR this
# http://i.imgur.com/IBN1ufQ.jpg
fb <- fb %>%
mutate(year_joined = floor(2014 - tenure/365),
year_joined_bucket = cut(year_joined, breaks=c(2004, 2009, 2011, 2012, 2014)))
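# A quick look at the cohort sizes after bucketing (a sketch; useNA shows
# users whose tenure is missing or falls outside the break points):
table(fb$year_joined_bucket, useNA = "ifany")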
## Adding in 0's for the mean function instead of leaving out NA's gives a better approximation
## for the proportion of friendships initiated for this plot.
fb2 <- fb %>%
mutate(prop_initiated = ifelse(friend_count > 0, friendships_initiated/friend_count, 0))
ggplot(subset(fb2, tenure > 0), aes(x=tenure, y=prop_initiated)) +
geom_line(aes(color=year_joined_bucket), stat='summary', fun.y=mean) +
theme_minimal()
Q5.7
# Smooth the last plot you created of
# of prop_initiated vs tenure colored by
# year_joined.bucket. You can use larger
# bins for tenure or add a smoother to the plot.
ggplot(subset(fb, tenure > 0), aes(x=tenure, y=prop_initiated)) +
#geom_line(aes(color=year_joined_bucket), stat='summary', fun.y=mean) +
geom_smooth(aes(color = year_joined_bucket))
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## Warning: Removed 5 rows containing missing values (stat_smooth).
## Warning: Removed 81 rows containing missing values (stat_smooth).
## Warning: Removed 408 rows containing missing values (stat_smooth).
## Warning: Removed 1403 rows containing missing values (stat_smooth).
Q5.8
## On average, which group initiated the greatest
## proportion of its Facebook friendships? The plot with
## the smoother that you created in the last exercise can
## help you answer this question.
## People who joined after 2012.
Q5.9
## For the group with the largest proportion of
## friendships initiated, what is the group's average
## (mean) proportion of friendships initiated?
fb %>%
filter(year_joined_bucket == "(2012,2014]") %>%
summarise(avg = mean(prop_initiated, na.rm=TRUE))
## Source: local data frame [1 x 1]
##
## avg
## 1 0.6654
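# For comparison (a sketch), the zero-filled fb2 variant from the earlier
# plot gives a slightly lower mean, since users with no friends contribute
# 0 instead of being dropped as NA:
fb2 %>%
filter(year_joined_bucket == "(2012,2014]") %>%
summarise(avg = mean(prop_initiated, na.rm = TRUE))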
## Why do you think this group's proportion of friendships initiated is higher than the others?
# They have newer accounts. To get started after first creating an account,
# you have to build your online network of friends based on your existing network of friends.
# After this initial population, someone may add/acquire new friends
# at a lower rate allowing the proportion to even out slightly.
# The younger cohort may also have fundamentally different patterns of friend acquisition;
# 2005 wasn't too long ago, and Facebook was first targeted at a college-age demographic,
# who are now in their late twenties and early thirties.
# Rates might already have been lower in general for this older group,
# who never experienced opening a Facebook account in their mid-teens with the platform already ubiquitous.
Q5.10
# Create a scatter plot of the price/carat ratio
# of diamonds. The variable x should be
# assigned to cut. The points should be colored
# by diamond color, and the plot should be
# faceted by clarity.
# The plot should look something like this.
# http://i.imgur.com/YzbWkHT.jpg.
# Note: In the link, a color palette of type
# 'div' was used to color the histogram using
# scale_color_brewer(type = 'div')
ggplot(diamonds, aes(x = cut, y = price/carat, color = color)) +
geom_jitter() +
facet_wrap(~clarity) +
scale_color_brewer(type = 'div')
Q5.11
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to continue the investigation you did at the
# end of Problem Set 4 or you can start fresh and choose a different
# data set from Gapminder.
# If you're feeling adventurous or want to try some data munging see if you can
# find a data set or scrape one from the web.
# In your investigation, examine 3 or more variables and create 2-5 plots that make
# use of the techniques from Lesson 5.
## Score cut points used to assign proficiency levels to students.
prof_scores <- c(0, 358, 420, 482, 545, 607, 669, 1000)
country_data <- student2012 %>%
mutate(Math = as.numeric((PV1MATH + PV2MATH + PV3MATH + PV4MATH + PV5MATH)/5),
Reading = as.numeric((PV1READ + PV2READ + PV3READ + PV4READ + PV5READ)/5),
Science = as.numeric((PV1SCIE + PV2SCIE + PV3SCIE + PV4SCIE + PV5SCIE)/5),
math_levels = cut(Math, breaks = prof_scores,
labels = paste("Level", 1:7), ordered_result = TRUE),
reading_levels = cut(Reading, breaks = prof_scores,
labels = paste("Level", 1:7), ordered_result = TRUE),
science_levels = cut(Science, breaks = prof_scores,
labels = paste("Level", 1:7), ordered_result = TRUE)) %>%
select(Gender = ST04Q01,
birth_year = ST03Q02,
birth_month = ST03Q01,
PC_at_home = ST26Q04,
Country = CNT,
books_at_home = ST28Q01,
at_home_mother = ST11Q01,
at_home_father = ST11Q02,
mother_job = ST15Q01,
mother_schooling = ST13Q01,
father_job = ST19Q01,
father_schooling = ST17Q01,
own_room = ST26Q02,
internet = ST26Q06,
home_book_count = ST28Q01,
computer_count = ST27Q03,
first_pc_use = IC03Q01,
first_internet_use = IC04Q01,
use_internet_at_school = IC05Q01,
internet_out_of_sch_wd = IC06Q01,
internet_out_of_sch_we = IC07Q01,
internet_for_fun = IC08Q06,
internet_at_home = IC01Q04,
internet_at_school = IC02Q04,
Math, Reading, Science)
#####################################################
country_gender <- country_data %>%
group_by(Country, Gender) %>%
summarise(avg_math = mean(Math),
avg_reading = mean(Reading),
avg_science = mean(Science),
n = n() ) %>%
arrange(avg_math, avg_reading, avg_science)
# ggplot(country_data, aes(x = Math, y = Reading, color = Gender)) +
# geom_jitter(alpha = 1/20) +
# geom_smooth(aes(color = Gender))