library(ggplot2)
library(dplyr)
library(scales)
library(xlsx)
library(reshape2)
library(lubridate)
library(ggthemes)
library(gridExtra)
Q3.1
# a) Load the 'diamonds' data set in R Studio.
# How many observations are in the data set?
nrow(diamonds)
## [1] 53940
# b) How many variables are in the data set?
ncol(diamonds)
## [1] 10
# c) How many ordered factors are in the set?
# str(diamonds)
# 3
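# A quick programmatic check (a sketch, not part of the original answer):
# count the columns that are ordered factors.
sum(vapply(diamonds, is.ordered, logical(1)))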
# d) What letter represents the best color for a diamond?
levels(diamonds$color)
## [1] "D" "E" "F" "G" "H" "I" "J"
# help(diamonds)
# D
Q3.2
# Create a histogram of the price of
# all the diamonds in the diamond data set.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 500) +
scale_x_continuous(labels = dollar, breaks = seq(0, 20000, 1000)) +
theme(axis.text.x = element_text(angle = 90)) +
xlab("Price") + ylab("Count")
Q3.3
# Describe the shape and center of the price distribution.
# Include summary statistics like the mean and median.
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2400 3930 5320 18800
#
# The distribution is right-skewed, with a small number of very high prices pulling the mean up, while the median remains a more robust measure of the center of the distribution.
Q3.4
# a) How many diamonds cost less than $500?
diamonds %>%
filter(price < 500) %>%
summarise(n = n())
## n
## 1 1729
# b) How many diamonds cost less than $250?
diamonds %>%
filter(price < 250) %>%
summarise(n = n())
## n
## 1 0
# c) How many diamonds cost more than $15,000?
diamonds %>%
filter(price > 15000) %>%
summarise(n = n())
## n
## 1 1656
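# Equivalently (a minimal sketch), summing a logical vector counts its TRUEs,
# so each of the answers above can be reproduced in one line.
sum(diamonds$price < 500)
sum(diamonds$price < 250)
sum(diamonds$price > 15000)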
Q3.5
# Explore the largest peak in the
# price histogram you created earlier.
# Try limiting the x-axis, altering the bin width,
# and setting different breaks on the x-axis.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) +
scale_x_continuous(labels = dollar, breaks = seq(0, 2000, 100)) +
theme(axis.text.x = element_text(angle = 90)) +
coord_cartesian(c(0,2000)) +
xlab("Price") + ylab("Count")
Q3.6
# Break out the histogram of diamond prices by cut.
# You should have five histograms in separate
# panels on your resulting plot.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) +
scale_x_continuous(labels = dollar, breaks = seq(0, 4000, 100)) +
theme(axis.text.x = element_text(angle = 90)) +
coord_cartesian(c(0,4000)) +
facet_grid(cut~.) +
xlab("Price") + ylab("Count")
Q3.7
# a) Which cut has the highest priced diamond?
# Premium
# by(diamonds$price, diamonds$cut, max)
# by(diamonds$price, diamonds$cut, min)
# by(diamonds$price, diamonds$cut, median)
diamonds %>%
group_by(cut) %>%
summarise(max_price = max(price),
min_price = min(price),
median_price = median(price))
## Source: local data frame [5 x 4]
##
## cut max_price min_price median_price
## 1 Fair 18574 337 3282
## 2 Good 18788 327 3050
## 3 Very Good 18818 336 2648
## 4 Premium 18823 326 3185
## 5 Ideal 18806 326 1810
# b) Which cut has the lowest priced diamond?
# Premium & Ideal
# c) Which cut has the lowest median price?
# Ideal
Q3.8
# In the two last exercises, we looked at
# the distribution for diamonds by cut.
# Run the code below in R Studio to generate
# the histogram as a reminder.
# ===============================================================
qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## (message repeated once per facet panel)
# ===============================================================
# In the last exercise, we looked at the summary statistics
# for diamond price by cut. If we look at the output table,
# the median and quartiles are reasonably close to each other.
# diamonds$cut: Fair
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 337 2050 3282 4359 5206 18570
# ------------------------------------------------------------------------
# diamonds$cut: Good
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 327 1145 3050 3929 5028 18790
# ------------------------------------------------------------------------
# diamonds$cut: Very Good
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 336 912 2648 3982 5373 18820
# ------------------------------------------------------------------------
# diamonds$cut: Premium
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 326 1046 3185 4584 6296 18820
# ------------------------------------------------------------------------
# diamonds$cut: Ideal
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 326 878 1810 3458 4678 18810
# This means the distributions should be somewhat similar,
# but the histograms we created don't show that.
# The 'Fair' and 'Good' diamonds appear to have
# different distributions compared to the better
# cut diamonds. They seem somewhat uniform
# on the left with long tails on the right.
# Let's look into this more.
# Look up the documentation for facet_wrap in R Studio.
# Then, scroll back up and add a parameter to facet_wrap so that
# the y-axis in the histograms is not fixed. You want the y-axis to
# be different for each histogram.
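# One possible answer (a sketch): facet_wrap accepts a 'scales' argument,
# and "free_y" lets each panel's y-axis vary while keeping a common x-axis.
qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free_y")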
Q3.9
# Create a histogram of price per carat
# and facet it by cut. You can make adjustments
# to the code from the previous exercise to get
# started.
# Adjust the bin width and transform the scale
# of the x-axis using log10.
# Submit your final code when you are ready.
# ENTER YOUR CODE BELOW THIS LINE.
# ===========================================================================
ggplot(diamonds, aes(x = price/carat)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = .05) +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_log10(expression(paste(Log[10], " of Price per Carat")),
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
facet_grid(cut~., scales = "free_y") + ylab("Count")
Q3.10
# Investigate the price of diamonds using box plots,
# numerical summaries, and one of the following categorical
# variables: cut, clarity, or color.
diamonds %>%
group_by(cut) %>%
summarise(count = n(),
avg_price = mean(price))
## Source: local data frame [5 x 3]
##
## cut count avg_price
## 1 Fair 1610 4359
## 2 Good 4906 3929
## 3 Very Good 12082 3982
## 4 Premium 13791 4584
## 5 Ideal 21551 3458
# There are many more Ideal diamonds than others, but the average price is also the lowest.
ggplot(diamonds, aes(x = clarity, y = price, color = cut)) +
geom_boxplot() +
facet_grid(color~., margins = TRUE)
# This boxplot matrix shows the distribution of price across all three categorical variables: cut, clarity, and color.
Q3.11
# a) What is the price range for the middle 50% of the diamonds with color D?
# c) What is the IQR for diamonds with the best color?
diamonds %>%
group_by(color) %>%
filter(color == "D") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
## Source: local data frame [1 x 4]
##
## color Quartile.25 Quartile.75 IQR
## 1 D 911 4214 3302
# b) What is the price range for the middle 50% of diamonds with color J?
# d) What is the IQR for the diamonds with the worst color?
diamonds %>%
group_by(color) %>%
filter(color == "J") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
## Source: local data frame [1 x 4]
##
## color Quartile.25 Quartile.75 IQR
## 1 J 1860 7695 5834
# by(diamonds$price, diamonds$color, summary)
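# A more compact sketch using stats::IQR(), computing the interquartile
# range for every color level at once instead of filtering one at a time:
diamonds %>%
group_by(color) %>%
summarise(IQR = IQR(price))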
Q3.12
# Investigate the price per carat of diamonds across
# the different colors of diamonds using boxplots.
ggplot(diamonds, aes(x = color, y = price/carat, fill = color)) +
geom_boxplot() +
coord_cartesian(ylim=c(1000, 6000)) +
scale_y_continuous(labels=dollar) +
xlab("Color") + ylab("Price per Carat")
Q3.13
# Investigate the weight of the diamonds (carat) using a frequency polygon.
# Use different bin widths to see how the frequency polygon changes.
# What carat size has a count greater than 2000? Check all that apply.
sizes <- c(0.1, 0.3, 0.8, 1.01, 1.6, 2.0, 3.0, 5.0)
summary(diamonds$carat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.200 0.400 0.700 0.798 1.040 5.010
ggplot(diamonds, aes(x=carat)) +
geom_freqpoly(binwidth=0.1, alpha = 0.75) +
scale_x_continuous(breaks=sizes, expand = c(0,0)) +
scale_y_continuous(expand=c(0,0))+
geom_vline(xintercept = c(0.1, 0.8, 1.6, 2.0, 3.0, 5.0), color = "red", linetype="dashed", alpha = 0.75) +
geom_vline(xintercept = c(0.3, 1.01), color = "forestgreen", linetype = "twodash") +
geom_hline(yintercept = 2000, color = "brown", linetype="longdash", alpha = 0.5) +
xlab("Carat Size") + ylab("Count")
# 0.3 and 1.01 in green
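# To verify the answer numerically (a sketch): bin carat into the same
# 0.1-wide bins as the frequency polygon and keep the bins with n > 2000.
diamonds %>%
mutate(carat_bin = cut(carat, breaks = seq(0, 5.1, 0.1))) %>%
group_by(carat_bin) %>%
summarise(n = n()) %>%
filter(n > 2000)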
Q3.14
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to download a data set of your choice
# and create 2-5 plots that make use of the techniques from Lesson 3.
# http://spreadsheets.google.com/pub?key=rdCufG2vozTpKw7TBGbyoWw&output=xls
hours <- tbl_df(read.xlsx("indicator_hours per week.xlsx", sheetName="Data", header=TRUE))
hours <- hours %>%
select(-NA.) %>% # Remove NA column carried over from xlsx
rename(Country = Working.hours.per.week) %>%
filter(Country != "<NA>") # Remove <NA> row carried over from xlsx
hours.long <- melt(hours, id=c("Country"), value.name="Hours", variable.name="Year")
hours.long <- tbl_df(hours.long)
hours.long <- hours.long %>%
mutate(Year = as.character(Year), # Convert to character
Year = substr(Year, 2, 5), # Drop the leading "X", keeping the 4-digit year; R prepends "X" because column names can't start with numbers.
Year = as.numeric(Year)) # Cast as numeric
yearStats <- hours.long %>%
group_by(Year) %>%
summarise(median = median(Hours, na.rm=TRUE),
mean = mean(Hours, na.rm=TRUE),
lower = min(Hours, na.rm=TRUE),
upper = max(Hours, na.rm=TRUE),
se = sd(Hours, na.rm=TRUE)/sqrt(sum(!is.na(Hours))), # Use the non-NA count so missing values don't deflate the SE
avg_upper = mean + (2.101*se), # 2.101 is a t-based multiplier for an approximate 95% CI
avg_lower = mean - (2.101*se),
quant.25 = quantile(Hours, na.rm=TRUE, 0.25),
quant.75 = quantile(Hours, na.rm=TRUE, 0.75))
yearStats <- round(yearStats, 2)
p <- ggplot(yearStats, aes(x=Year, y=median)) +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()) +
geom_linerange(yearStats, mapping=aes(x=Year, ymin=lower, ymax=upper), colour = "wheat2", alpha=1, size=5) +
geom_linerange(yearStats, mapping=aes(x=Year, ymin=quant.25, ymax=quant.75), colour = "wheat4", size=5) +
geom_line(yearStats, mapping=aes(x=Year, y=median, group=1)) +
geom_vline(xintercept = 1980, colour = "wheat4", linetype=1, size=1) +
geom_hline(yintercept=seq(26, 56, 2), color="white", linetype=1)
dottedYears <- seq(1980, 2007, 5) # Pick the years to draw dotted vertical lines on
p <- p + geom_vline(xintercept = dottedYears, color="wheat4", linetype=3, size=0.5)
p <- p + coord_cartesian(ylim = c(26,58))+
scale_y_continuous(breaks=seq(26, 60, 2)) +
scale_x_continuous(breaks=seq(1980, 2005, 5), expand=c(0,0) )
p <- p + geom_line(data = subset(hours.long, Country == "United States"), aes(x = Year, y = Hours, group = Country), color ="brown") +
annotate("segment", x=2000, xend=2002, y=35.5, yend=36, color="brown") +
annotate("text", x=2003, y=36.3, label="U.S. Hours", size=3.5, color="brown") +
annotate("segment", x=2000, xend=2001, y=33.5, yend=32) +
annotate("text", x=2002, y=31.7, label="World Medians", size=3.5)
p1 <- p + annotate("text", x=1999.9, y=56, label="Data represents hours worked per week for 52 countries ", size=3, color="gray30") +
annotate("text", x=2000, y=55, label="from 1980 to 2007. Outer lighter bands show the min/max ", size=3, color="gray30") +
annotate("text", x=2000, y=54, label="hours for each year, and inner darker bands show the IQR.", size=3, color="gray30") +
ggtitle("World's Working Hours") +
theme(plot.title=element_text(face="bold",hjust=.95,vjust=.8,color="#3C3C3C",size=20)) +
annotate("text", x=1994.6, y=57.5, label="Weekly", size=4, fontface="bold")
p1
# Zoom in a bit to the Inter-Quartile Range (IQR)
p + coord_cartesian(ylim=c(30, 38)) +
ggtitle("Inter-Quartile Range of Weekly Hours Worked Per Year Per Country") +
theme(plot.title=element_text(face="bold",color="#3C3C3C",size=12)) +
geom_text(data = hours.long[hours.long$Country == "United States",],
aes(x = Year, y = Hours, group = Country, label = round(Hours, 2)),
hjust = -.1, vjust = -1.2, size = 2, color = "brown") +
geom_text( aes(x = Year, y = median, label = median), hjust = -.1, vjust = 1.2, size = 2, color = "black")
ggplot(subset(hours.long, Country %in% c("Switzerland", "United States", "Japan", "United Kingdom", "France")), aes(x=Country, y=Hours, fill=Country)) +
geom_boxplot() +
stat_summary(fun.y = mean, geom="point", shape = 5) +
ylab("Hours Worked per Week") + xlab("") +
theme(legend.position="none") +
ggtitle("Box plots of Hours Worked per Week for 5 Countries") +
theme(plot.title=element_text(face="bold",color="#3C3C3C",size=12))
ggplot(subset(hours.long, Country %in% c("Switzerland", "United States", "Japan", "United Kingdom", "France")), aes(x = Year, y = Hours)) +
geom_line(aes(color = Country, group = Country)) +
theme(axis.text.x = element_text(angle = 45)) +
scale_y_continuous(breaks=seq(26, 56, 2)) +
scale_x_continuous(breaks=seq(1980, 2005, 5), expand=c(0,0) ) +
ylab("Hours Worked per Week") + xlab("") +
theme(plot.title=element_text(face="bold",color="#3C3C3C",size=12)) +
ggtitle("Change in Weekly Hours Worked from 1980-2007 for 5 Countries")
Q3.15
# How many birthdays are in each month?
# Which day of the year has the most birthdays?
# Do you have at least 365 friends that have birthdays on every day
# of the year?
# **********************************************************************
# You will need to do some data munging and additional research to
# complete this task. This task won't be easy, and you may encounter some
# unexpected challenges along the way. We hope you learn a lot from it though.
# You can expect to spend 30 min or more on this task depending on whether
# you use the provided data or obtain your personal data. We also encourage you
# to use the lubridate package for working with dates. Read over the documentation
# in RStudio and search for examples online if you need help.
# You'll need to export your Facebook friends' birthdays to a csv file.
# You may need to create a calendar of your Facebook friends’ birthdays
# in a program like Outlook or Gmail and then export the calendar as a
# csv file.
# Once you load the data into R Studio, you can use the strptime() function
# to extract the birth months and birth days. We recommend looking up the
# documentation for the function and finding examples online.
birthdays <- tbl_df(read.csv("birthdaysExample.csv"))
tempDates <- mdy(birthdays$dates)
birthdays <- birthdays %>%
mutate(Birthday = tempDates,
Year = year(tempDates),
Month = month(tempDates, label=TRUE, abbr=FALSE),
Day = day(tempDates),
Weekday = weekdays(tempDates, abbr=FALSE))
birthdays$Weekday <- factor(birthdays$Weekday, levels=c('Monday', 'Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'), ordered=TRUE)
# ifelse() doesn't work for creating 'optional' here: it is vectorized and cannot return NULL from its else branch, so R complains.
# Instead, 'optional' starts as NULL and becomes a ggplot layer only if "Day" is passed in.
# aes_string() takes a string as the column name for plotting -- ideal for the column names we pass in as strings.
# The argument is also pasted in for custom x- and y-axis labeling.
makePlots<- function(TimeLength){
optional<- NULL
if(TimeLength == "Day") optional<- scale_x_discrete(breaks = seq(1, 31, 1))
ggplot(birthdays, aes_string(x = TimeLength, fill = TimeLength)) +
geom_histogram(binwidth = 1, color = "black", show_guide = FALSE) +
scale_fill_brewer(palette="Paired") +
stat_bin(aes(label=..count..), vjust=-.1,
geom="text", position="identity", size=3) +
xlab(TimeLength)+
ylab(paste0("Number of Birthdays per ", TimeLength)) +
theme_tufte() +
optional
}
makePlots("Month")
makePlots("Day")
makePlots("Weekday")
Q4.1
# In this problem set, you'll continue
# to explore the diamonds data set.
# Your first task is to create a
# scatterplot of price vs. x
# using the ggplot syntax.
ggplot(diamonds, aes(x = x, y = price)) +
geom_point(alpha = 1/20) +
coord_cartesian(xlim=c(3.5, 12)) +
scale_y_continuous(breaks=seq(1000, 19000, 2000),label=dollar)
Q4.2
# What are your observations about the scatterplot of price vs x?
# The bulk of the data starts at around x = 3.3, and price rises exponentially as x approaches 9.
# There also seems to be an artificial ceiling in price around $19k.
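# A quick check of the "exponential" claim (a sketch): on a log10 price
# scale the relationship should look roughly linear.
ggplot(subset(diamonds, x > 0), aes(x = x, y = price)) +
geom_point(alpha = 1/20) +
scale_y_log10()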
Q4.3
# What is the correlation between price and x?
with(diamonds, cor.test(price, x))
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.2, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8826 0.8863
## sample estimates:
## cor
## 0.8844
# What is the correlation between price and y?
with(diamonds, cor.test(price, y))
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.1, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8633 0.8675
## sample estimates:
## cor
## 0.8654
# What is the correlation between price and z?
with(diamonds, cor.test(price, z))
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8591 0.8634
## sample estimates:
## cor
## 0.8612
Q4.4
# Create a simple scatter plot of price vs depth.
ggplot(diamonds, aes(x = depth, y = price)) + geom_point(alpha = 1/20)
Q4.5
# Change the code to make the transparency of the
# points to be 1/100 of what they are now and mark
# the x-axis every 2 units.
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha = 1/100) +
scale_x_continuous(breaks = seq(min(diamonds$depth), max(diamonds$depth), 2),
labels = seq(min(diamonds$depth), max(diamonds$depth), 2))
Q4.6
# Based on the scatterplot of depth vs. price, most diamonds are between what values of depth?
# 58-64
Q4.7
# What's the correlation of depth vs. price?
with(diamonds, cor.test(depth, price))
##
## Pearson's product-moment correlation
##
## data: depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019085 -0.002209
## sample estimates:
## cor
## -0.01065
# Based on the correlation coefficient, would you use depth to predict the price of a diamond? Why?
# No.
# The correlation is statistically distinguishable from zero (p-value = 0.0134, largely a consequence of the huge sample size),
# but its magnitude is negligible.
# Depth alone would have virtually no predictive power for price.
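# Squaring the correlation (a one-line sketch) gives the share of price
# variance that depth alone explains -- on the order of 0.01%:
with(diamonds, cor(depth, price)^2)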
Q4.8
# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.
ggplot(diamonds, aes(x = carat, y = price)) +
geom_point(alpha=1/20) +
scale_x_continuous(limits=c(0, quantile(diamonds$carat, 0.99))) +
scale_y_continuous(breaks=seq(0, 18000, 2000),
limits=c(0 , quantile(diamonds$price, 0.99)),
labels=dollar)
Q4.9
# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.
# Create a new variable for volume in the diamonds data frame.
# This will be useful in a later exercise.
# Don't make any adjustments to the plot just yet.
diamonds2 <- diamonds %>%
mutate(volume=x*y*z)
ggplot(diamonds2, aes(x = volume, y = price)) +
geom_point()
Q4.10
# What are your observations from the price vs volume scatterplot?
# There are 3 obvious outliers, with 1 massive diamond expanding the plot quite a bit.
# Prices rise exponentially with volume, making a transformation of the x-scale a good idea.
# There may also be diamonds with volumes at or near 0.
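# Counting the suspect rows (a sketch): diamonds whose recorded
# dimensions give a volume of exactly 0.
sum(diamonds2$volume == 0)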
Q4.11
# What's the correlation of price and volume?
# Exclude diamonds that have a volume of 0 or that are greater than or equal to 800.
with(subset(diamonds2, !(volume == 0 | volume >= 800) ), cor.test(price, volume))
##
## Pearson's product-moment correlation
##
## data: price and volume
## t = 559.2, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9223 0.9248
## sample estimates:
## cor
## 0.9235
Q4.12
# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot. (See the
# Instructor Notes or look up the documentation of
# geom_smooth() for more details about smoothers.)
# We encourage you to think about this next question and
# to post your thoughts in the discussion section.
# Do you think this would be a useful model to estimate
# the price of diamonds? Why or why not?
smaller <- diamonds2 %>%
filter(volume != 0,
volume < 800)
ggplot(smaller, aes( x = volume, y = price)) +
geom_point(alpha = 1/20) +
geom_smooth(method = "lm",
se = TRUE)
# This is probably not the best model, since the relationship does not appear to be linear.
# The residuals would show a heavy pattern
# instead of random scatter.
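# A minimal residual check (a sketch, not part of the original exercise):
# plotting residuals of the linear fit against volume should expose the
# pattern described above.
m <- lm(price ~ volume, data = smaller)
ggplot(smaller, aes(x = volume, y = resid(m))) +
geom_point(alpha = 1/20) +
geom_hline(yintercept = 0, linetype = "dashed")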
Q4.13
# Use functions in the dplyr package
# to create a new data frame containing
# info on diamonds by clarity.
# Name the data frame diamondsByClarity
# The data frame should contain the following
# variables in this order.
# (1) mean_price
# (2) median_price
# (3) min_price
# (4) max_price
# (5) n
# where n is the number of diamonds in each
# level of clarity.
diamondsByClarity<- diamonds %>%
group_by(clarity) %>%
summarise(mean_price = mean(price),
median_price = median(price),
min_price = min(price),
max_price = max(price),
n = n() ) %>%
arrange(clarity)
Q4.14
# We’ve created summary data frames with the mean price
# by clarity and color. You can run the code in R to
# verify what data is in the variables diamonds_mp_by_clarity
# and diamonds_mp_by_color.
# Your task is to write additional code to create two bar plots
# on one output image using the grid.arrange() function from the package
# gridExtra.
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
# ===================================================================
c1 <- ggplot(diamonds_mp_by_clarity, aes(x=clarity, y=mean_price, fill=clarity)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette="Set3") +
guides(fill = guide_legend(ncol=2, title.hjust=0.3))
c2 <- ggplot(diamonds_mp_by_color, aes(x=color, y=mean_price, fill=color)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette="Set2") +
guides(fill = guide_legend(ncol=2, title.hjust=0.4))
grid.arrange(c1, c2)
Q4.15
# What do you notice in each of the bar charts for mean price by clarity and mean price by color?
# There's a downward trend in average diamond price as clarity goes from I1 to IF,
# and an upward trend in average price as color goes from D to J.
Q4.16
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to continue the investigation you did at the
# end of Problem Set 3 or you can start fresh and choose a different
# data set from Gapminder.
# If you’re feeling adventurous or want to try some data munging see if you can
# find a data set or scrape one from the web.
# In your investigation, examine pairs of variable and create 2-5 plots that make
# use of the techniques from Lesson 4.
# ======================================================================================================
# Programme for International Student Assessment (PISA) is a triennial international survey
# which aims to evaluate education systems worldwide by testing the skills and knowledge of 15-year-old students.
# Around 510,000 students in 65 economies took part in PISA 2012 representing about 28 million 15-year-olds globally.
# Our dataset is composed of 485,490 test results from the 2012 PISA testing results.
# http://www.oecd.org/pisa/aboutpisa/
con <- url("http://beta.icm.edu.pl/PISAcontest/data/student2012.rda")
load(con)
# It's a large file, so alternatively, you can download the .rda file and load it directly.
# load("student2012.rda")
student2012 <- tbl_df(student2012)
# For each of Math, Reading, and Science there are 5 PV variables: plausible values created from tests which may have contained missing data.
# 5 different imputed sets were created for completeness.
# These are averaged together to get the estimated (mean) plausible value for each subject.
## The score variables carry attributes that dplyr's group_by cannot handle
## These can be cast as numeric to strip the attributes
## mutate_each can apply a function (as.numeric) to each column matching a certain criterion
country_data <- student2012 %>%
mutate(Math = as.numeric((PV1MATH + PV2MATH + PV3MATH + PV4MATH + PV5MATH)/5),
Reading = as.numeric((PV1READ + PV2READ + PV3READ + PV4READ + PV5READ)/5),
Science = as.numeric((PV1SCIE + PV2SCIE + PV3SCIE + PV4SCIE + PV5SCIE)/5)) %>%
select(Gender = ST04Q01,
birth_year = ST03Q02,
birth_month = ST03Q01,
PC_at_home = ST26Q04,
Country = CNT,
Math, Reading, Science)
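# The mutate_each approach mentioned above would look roughly like this
# (a sketch against the dplyr API of this era; left commented out because
# the pipeline above already casts the averages explicitly):
# student2012 <- student2012 %>%
# mutate_each(funs(as.numeric), starts_with("PV"))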
country_scores <- country_data %>%
group_by(Country, Gender) %>%
summarise(mean_math = mean(Math),
mean_reading = mean(Reading),
mean_science = mean(Science),
count = n() ) %>%
arrange(mean_math, mean_reading, mean_science)
ggplot(country_scores, aes(x = mean_reading, y = mean_math, color = Gender, size=count)) +
geom_point() +
geom_smooth() +
geom_point(data = subset(country_scores, Country %in% c("United States of America", "Mexico", "Peru", "Qatar", "China-Shanghai", "Singapore", "Korea", "Finland", "Chile")),
color="#0266c8",
show_guide = FALSE) +
geom_text(data = subset(country_scores, Country %in% c("United States of America", "Mexico", "Peru", "Qatar", "China-Shanghai", "Singapore", "Korea", "Finland", "Chile")),
aes(label = Country),
color = "black",
size = 3,
vjust = -0.5,
show_guide = FALSE) +
#geom_abline(intercept = 0, slope = 1, linetype=2, color="orange") +
ylab("Average Math Score") +
xlab("Average Reading Score") +
ggtitle("Average Reading and Math Scores per country, by gender")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# It seems to be a world-wide trend that boys are achieving higher scores in math than reading,
# and girls are in turn doing better in reading than math on average.
# We can see the US in the middle of the pack, while Asian countries are making up the majority at the top.
ggplot(country_scores, aes(x = mean_reading, y = mean_science, color = Gender, size=count)) +
geom_point() +
geom_smooth() +
geom_point(data = subset(country_scores, Country %in% c("United States of America", "Mexico", "Peru", "Qatar", "China-Shanghai", "Singapore", "Korea", "Finland", "Chile")),
color="#0266c8",
show_guide = FALSE) +
geom_text(data = subset(country_scores, Country %in% c("United States of America", "Mexico", "Peru", "Qatar", "China-Shanghai", "Singapore", "Korea", "Finland", "Chile")),
aes(label = Country),
color = "black",
size = 3,
vjust = -0.5,
show_guide = FALSE) +
#geom_abline(intercept = 0, slope = 1, linetype=2, color="orange") +
ylab("Average Science Score") +
xlab("Average Reading Score") +
ggtitle("Average Reading and Science Scores per country, by gender")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# The trend is similar for science vs reading.
country_scores_PC <- country_data %>%
group_by(Country, PC_at_home) %>%
summarise(mean_math = mean(Math),
mean_reading = mean(Reading),
mean_science = mean(Science),
count = n() ) %>%
arrange(mean_math, mean_reading, mean_science)
g1 <- ggplot(subset(country_scores_PC, !(is.na(PC_at_home))), aes(x = mean_reading, fill = PC_at_home, size=count)) +
geom_density(alpha=0.66) +
ylab("Density") +
xlab("Average Reading Score") +
ggtitle("Density Plot of Average Reading Scores with/without a PC at home")
g2 <- ggplot(subset(country_scores_PC, !(is.na(PC_at_home))), aes(x = mean_math, fill = PC_at_home, size=count)) +
geom_density(alpha=0.66) +
ylab("Density") +
xlab("Average Math Score") +
ggtitle("Density Plot of Average Math Scores with/without a PC at home")
g3 <- ggplot(subset(country_scores_PC, !(is.na(PC_at_home))), aes(x = mean_science, fill = PC_at_home, size=count)) +
geom_density(alpha=0.66) +
ylab("Density") +
xlab("Average Science Score") +
ggtitle("Density Plot of Average Science Scores with/without a PC at home")
grid.arrange(g1, g2, g3, ncol=1)
## Across all countries, students with a PC at home had a higher distribution of scores in reading, math, and science.
Q5.1
# Create a histogram of diamond prices.
# Facet the histogram by diamond color
# and use cut to color the histogram bars.
# The plot should look something like this.
# http://i.imgur.com/b5xyrOu.jpg
# Note: In the link, a color palette of type
# 'qual' was used to color the histogram using
# scale_fill_brewer(type = 'qual')
ggplot(diamonds, aes(x = price, fill = cut)) +
geom_histogram() +
facet_wrap(~ color) +
scale_fill_brewer(type = 'qual') +
scale_x_log10(labels=dollar, expression(paste(Log[10], " of Price"))) +
ylab("Count")
Q5.2
# Create a scatterplot of diamond price vs.
# table and color the points by the cut of
# the diamond.
# The plot should look something like this.
# http://i.imgur.com/rQF9jQr.jpg
# Note: In the link, a color palette of type
# 'qual' was used to color the scatterplot using
# scale_color_brewer(type = 'qual')
ggplot(diamonds, aes(x = table, y = price, color = cut)) +
geom_jitter(size = 3) +
scale_x_continuous(breaks = seq(50, 80, 2),
limits = c(50, 80)) +
scale_color_brewer(type = 'qual') +
theme_minimal()
Q5.3
## What is the typical table range for the majority of diamonds of ideal cut?
# 54 to 57
## What is the typical table range for the majority of diamonds of premium cut?
# 58 to 60
## Use the graph that you created from the previous exercise to see the answer. You do not need to run summaries.
Q5.4
# Create a scatterplot of diamond price vs.
# volume (x * y * z) and color the points by
# the clarity of diamonds. Use scale on the y-axis
# to take the log10 of price. You should also
# omit the top 1% of diamond volumes from the plot.
# Note: Volume is a very rough approximation of
# a diamond's actual volume.
# The plot should look something like this.
# http://i.imgur.com/excUpea.jpg
# Note: In the link, a color palette of type
# 'div' was used to color the scatterplot using
# scale_color_brewer(type = 'div')
diamonds <- diamonds %>%
mutate(volume = x * y * z)
ggplot(subset(diamonds, volume <= quantile(volume, 0.99) & volume > 0 ), aes(x = volume, y = price, color = clarity)) +
geom_jitter(size = 3) +
scale_y_log10(labels=dollar) +
scale_color_brewer(type = 'div') +
theme_minimal()
Q5.5
# Many interesting variables are derived from two or more others.
# For example, we might wonder how much of a person's network on
# a service like Facebook the user actively initiated. Two users
# with the same degree (or number of friends) might be very
# different if one initiated most of those connections on the
# service, while the other initiated very few. So it could be
# useful to consider this proportion of existing friendships that
# the user initiated. This might be a good predictor of how active
# a user is compared with their peers, or other traits, such as
# personality (i.e., is this person an extrovert?).
# Your task is to create a new variable called 'prop_initiated'
# in the Pseudo-Facebook data set. The variable should contain
# the proportion of friendships that the user initiated.
fb <- tbl_df(read.table("pseudo_facebook.tsv", header=TRUE))
# fb <- fb %>%
# mutate(prop_initiated = ifelse(friend_count > 0, friendships_initiated/friend_count, 0))
# Using the code below instead will generate NA's when friend_count is 0, due to division by 0,
# but future exercises asking for averages would come out differently if we didn't generate those NA's
fb <- fb %>%
mutate(prop_initiated = friendships_initiated/friend_count)
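# A quick sanity check (a sketch): the NA proportions should correspond
# one-to-one with users whose friend_count is 0.
sum(fb$friend_count == 0)
sum(is.na(fb$prop_initiated))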
Q5.6
# Create a line graph of the proportion of
# friendships initiated ('prop_initiated') vs.
# tenure and color the line segment by
# year_joined.bucket.
# Recall, we created year_joined.bucket in Lesson 5
# by first creating year_joined from the variable tenure.
# Then, we used the cut function on year_joined to create
# four bins or cohorts of users.
# (2004, 2009]
# (2009, 2011]
# (2011, 2012]
# (2012, 2014]
# The plot should look something like this.
# http://i.imgur.com/vNjPtDh.jpg
# OR this
# http://i.imgur.com/IBN1ufQ.jpg
fb <- fb %>%
mutate(year_joined = floor(2014 - tenure/365),
year_joined_bucket = cut(year_joined, breaks=c(2004, 2009, 2011, 2012, 2014)))
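# A quick look at the cohort sizes after bucketing (a sketch; useNA shows
# users whose tenure is missing or falls outside the break points):
table(fb$year_joined_bucket, useNA = "ifany")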
## Adding in 0's for the mean function instead of leaving out NA's gives a better approximation
## for the proportion of friendships initiated for this plot.
fb2 <- fb %>%
mutate(prop_initiated = ifelse(friend_count > 0, friendships_initiated/friend_count, 0))
ggplot(subset(fb2, tenure > 0), aes(x=tenure, y=prop_initiated)) +
geom_line(aes(color=year_joined_bucket), stat='summary', fun.y=mean) +
theme_minimal()
Q5.7
# Smooth the last plot you created of
# of prop_initiated vs tenure colored by
# year_joined.bucket. You can use larger
# bins for tenure or add a smoother to the plot.
ggplot(subset(fb, tenure > 0), aes(x=tenure, y=prop_initiated)) +
#geom_line(aes(color=year_joined_bucket), stat='summary', fun.y=mean) +
geom_smooth(aes(color = year_joined_bucket))
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## Warning: Removed 5 rows containing missing values (stat_smooth).
## Warning: Removed 81 rows containing missing values (stat_smooth).
## Warning: Removed 408 rows containing missing values (stat_smooth).
## Warning: Removed 1403 rows containing missing values (stat_smooth).
Q5.8
## On average, which group initiated the greatest
## proportion of its Facebook friendships? The plot with
## the smoother that you created in the last exercise can
## help you answer this question.
## People who joined after 2012.
Q5.9
## For the group with the largest proportion of
## friendships initiated, what is the group's average
## (mean) proportion of friendships initiated?
fb %>%
filter(year_joined_bucket == "(2012,2014]") %>%
summarise(avg = mean(prop_initiated, na.rm=TRUE))
## Source: local data frame [1 x 1]
##
## avg
## 1 0.6654
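# For comparison (a sketch), the zero-filled fb2 variant from the earlier
# plot gives a slightly lower mean, since users with no friends contribute
# 0 instead of being dropped as NA:
fb2 %>%
filter(year_joined_bucket == "(2012,2014]") %>%
summarise(avg = mean(prop_initiated, na.rm = TRUE))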
## Why do you think this group's proportion of friendships initiated is higher than the others?
# They have newer accounts. To get started after first creating an account,
# you have to build your online network of friends based on your existing network of friends.
# After this initial population, someone may add/acquire new friends
# at a lower rate allowing the proportion to even out slightly.
# The younger cohort may also have fundamentally different patterns of friend acquisition;
# 2005 wasn't too long ago, and Facebook was first targeted at a college-age demographic,
# who are now in their late twenties and early thirties.
# Rates might already have been lower in general for this older group,
# who never experienced opening a Facebook account in their mid-teens with the platform already ubiquitous.
Q5.10
# Create a scatter plot of the price/carat ratio
# of diamonds. The variable x should be
# assigned to cut. The points should be colored
# by diamond color, and the plot should be
# faceted by clarity.
# The plot should look something like this.
# http://i.imgur.com/YzbWkHT.jpg.
# Note: In the link, a color palette of type
# 'div' was used to color the histogram using
# scale_color_brewer(type = 'div')
ggplot(diamonds, aes(x = cut, y = price/carat, color = color)) +
geom_jitter() +
facet_wrap(~clarity) +
scale_color_brewer(type = 'div')
Q5.11
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to continue the investigation you did at the
# end of Problem Set 4 or you can start fresh and choose a different
# data set from Gapminder.
# If you're feeling adventurous or want to try some data munging see if you can
# find a data set or scrape one from the web.
# In your investigation, examine 3 or more variables and create 2-5 plots that make
# use of the techniques from Lesson 5.
## Score cut points used to assign proficiency levels to students.
prof_scores <- c(0, 358, 420, 482, 545, 607, 669, 1000)
country_data <- student2012 %>%
mutate(Math = as.numeric((PV1MATH + PV2MATH + PV3MATH + PV4MATH + PV5MATH)/5),
Reading = as.numeric((PV1READ + PV2READ + PV3READ + PV4READ + PV5READ)/5),
Science = as.numeric((PV1SCIE + PV2SCIE + PV3SCIE + PV4SCIE + PV5SCIE)/5),
math_levels = cut(Math, breaks = prof_scores,
labels = paste("Level", 1:7), ordered_result = TRUE),
reading_levels = cut(Reading, breaks = prof_scores,
labels = paste("Level", 1:7), ordered_result = TRUE),
science_levels = cut(Science, breaks = prof_scores,
labels = paste("Level", 1:7), ordered_result = TRUE)) %>%
select(Gender = ST04Q01,
birth_year = ST03Q02,
birth_month = ST03Q01,
PC_at_home = ST26Q04,
Country = CNT,
books_at_home = ST28Q01,
at_home_mother = ST11Q01,
at_home_father = ST11Q02,
mother_job = ST15Q01,
mother_schooling = ST13Q01,
father_job = ST19Q01,
father_schooling = ST17Q01,
own_room = ST26Q02,
internet = ST26Q06,
home_book_count = ST28Q01,
computer_count = ST27Q03,
first_pc_use = IC03Q01,
first_internet_use = IC04Q01,
use_internet_at_school = IC05Q01,
internet_out_of_sch_wd = IC06Q01,
internet_out_of_sch_we = IC07Q01,
internet_for_fun = IC08Q06,
internet_at_home = IC01Q04,
internet_at_school = IC02Q04,
Math, Reading, Science)
#####################################################
country_gender <- country_data %>%
group_by(Country, Gender) %>%
summarise(avg_math = mean(Math),
avg_reading = mean(Reading),
avg_science = mean(Science),
n = n() ) %>%
arrange(avg_math, avg_reading, avg_science)
# ggplot(country_data, aes(x = Math, y = Reading, color = Gender)) +
# geom_jitter(alpha = 1/20) +
# geom_smooth(aes(color = Gender))