Preparations

Before doing anything with the data and making fun graphs and plots, we need to clean our data.

Loading Packages

library(data.table)
library(ggplot2)
library(ggside)
library(ggthemes)
library(plotly)
library(htmlwidgets)
library(RColorBrewer)

Importing Used Cars Dataset

The original dataset has 8 columns, but we won’t need Vin, City, and Model because they are too specific. We will drop these three columns to save resources.

used_cars <- fread(input = "true_car_listings.csv", drop = c("City", "Vin", "Model"))
#used_cars <- fread(input = "true_car_listings.csv")
str(used_cars)

Classes 'data.table' and 'data.frame':  852061 obs. of  5 variables:
 $ Price  : int  10194 6991 44995 6950 6194 8995 9995 10995 13194 13995 ...
 $ Year   : int  2011 2007 2006 2009 2000 2001 1999 2013 2015 2015 ...
 $ Mileage: int  63451 123972 16900 85354 128961 103636 217585 50761 35321 47315 ...
 $ State  : chr  "AK" "AK" "AK" "AK" ...
 $ Make   : chr  "Dodge" "Dodge" "Dodge" "Dodge" ...
 - attr(*, ".internal.selfref")=<externalptr>

Data Cleaning

Changing Year to factor and removing Year 2018 because of possibly incomplete data (2018 not done yet)

used_cars <- used_cars[Year != 2018,]
used_cars$Year <- factor(used_cars$Year, ordered = T)
summary(used_cars$Year)

  1997   1998   1999   2000   2001   2002   2003   2004   2005   2006 
   595    774   1252   1931   2583   3797   5648   8114  11000  15072 
  2007   2008   2009   2010   2011   2012   2013   2014   2015   2016 
 21168  24709  19060  27536  39764  49758  74696 162428 157513 132134 
  2017 
 91607

Fix bad state names

wrong_state_names <- c("Az", "Ca", "Fl", "ga", "Ga", "Md", "Oh", "Va")

state_fix <- function(state_name) {
    if (state_name %in% wrong_state_names) {
        return(toupper(state_name))
    } else 
        (return(state_name))
}

used_cars[, State := ifelse(State %in% wrong_state_names, toupper(State), State)]

# full_idx <- c()
# for (bad_state in wrong_state_names) {
#     tmp_idx <- which(used_cars$State == "AZ")
#     full_idx <- c(full_idx, tmp_idx)
# }
# 
# for (row_num in full_idx) {
#     if (used_cars[row_num, "State"] %in% wrong_state_names) {
#         used_cars[row_num, "State"] <- toupper(used_cars[row_num, "State"])
#     }
# }

used_cars$State <- factor(used_cars$State)
summary(used_cars$State)

   AK    AL    AR    AZ    CA    CO    CT    DC    DE    FL    GA 
 1663 13284  6950 22449 80353 22973 10900    25  2416 73121 39597 
   HI    IA    ID    IL    IN    KS    KY    LA    MA    MD    ME 
 2995  4829  3499 38629 16461  7891 13685  8523 17977 17648  1898 
   MI    MN    MO    MS    MT    NC    ND    NE    NH    NJ    NM 
10312 10988 16158  6236  1981 37494  1092  4921  5690 27821  4724 
   NV    NY    OH    OK    OR    PA    RI    SC    SD    TN    TX 
 6883 27909 23301  9582 10349 28082  1857  9782  1267 19160 94559 
   UT    VA    VT    WA    WI    WV    WY 
10360 35245  1122 23139 11250  1499   610

unique(used_cars$State)

 [1] AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME
[23] MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX
[45] UT VA VT WA WI WV WY
51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS ... WY

str(used_cars)

Classes 'data.table' and 'data.frame':  851139 obs. of  5 variables:
 $ Price  : int  10194 6991 44995 6950 6194 8995 9995 10995 13194 13995 ...
 $ Year   : Ord.factor w/ 21 levels "1997"<"1998"<..: 15 11 10 13 4 5 3 17 19 19 ...
 $ Mileage: int  63451 123972 16900 85354 128961 103636 217585 50761 35321 47315 ...
 $ State  : Factor w/ 51 levels "AK","AL","AR",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Make   : chr  "Dodge" "Dodge" "Dodge" "Dodge" ...
 - attr(*, ".internal.selfref")=<externalptr>

Importing US States GPS Coordinates Dataset

states_coord <- fread("us_states_coord.csv")

Categorizing each make by country

make_country <- fread("manufacturers.csv")

for (i in 1:nrow(make_country)) {
    this_make <- make_country[i,Make]
    this_country <- make_country[i,Country]
    
    used_cars[Make == this_make, Country := this_country]
}

Putting each state’s favorite make on the map

setkey(states_coord, state_code)
setkey(used_cars, Make)

# Find each state's favorite make 
for (state in unique(used_cars$State)) {
    sorted <- used_cars[State==state, .N, by=list(Make)]
    this_fav_make <- sorted[order(-N)]$Make[1]
    states_coord[.(state), fav_make := this_fav_make]
}

# Find each state's favorite car manufacturer country 
for (state in unique(used_cars$State)) {
    sorted2 <- used_cars[State==state, .N, by=list(Country)]
    this_fav_country <- sorted2[order(-N)]$Country[1]
    states_coord[.(state), fav_country := this_fav_country]
}

# Find each state's number of listings
for (state in unique(used_cars$State)) {
    num_car_by_state <- used_cars[,.N, State]
    this_state_n <- num_car_by_state[State==state, N]
    states_coord[.(state), num_listings := this_state_n]
}

# Find each state's mean listing price
for (state in unique(used_cars$State)) {
    mean_price_by_state <- used_cars[,round(mean(Price),2), State]
    this_mean <- mean_price_by_state[State==state, V1]
    states_coord[.(state), mean_price := this_mean]
}

states_coord$fav_make <- factor(states_coord$fav_make)
# Got help from https://www.analytics-tuts.com/color-usa-map-based-on-a-categorical-variable-using-plotly-in-r/
states_coord$fav_make_val <- factor(as.numeric(states_coord$fav_make))

states_coord$hover_text <- with(states_coord, 
                                paste("<b>State:</b>", state_name, '<br>', 
                                      "<b>Number of Listings:</b>", num_listings,"<br>",
                                      "<b>Mean Price ($):</b>", mean_price, "<br>",
                                      "<b>Favorite Make:</b>", fav_make))

# specify some map projection/options
g <- list(
    scope = 'usa',
    projection = list(type = 'albers usa'),
    showocean = TRUE,
    oceancolor = toRGB("grey"),
    showlakes = TRUE,
    lakecolor = toRGB("LightBlue"),
    showrivers = TRUE,
    rivercolor = toRGB("LightBlue")
)

# Setting up colors for fav makes
nfactor <- length(levels(states_coord$fav_make_val))
#colr <- brewer.pal(n = nfactor,name = "RdYlGn")
#colr <- c("#8B4500", "#FFB90F", "#6495ED", "#458B74", "#838B8B", "#7A378B")
colr <- c("#8B6914", "#FFEC8B", "#63B8FF", "#76EEC6", "#C1CDCD", "#AB82FF")

names(colr) <- levels(states_coord$fav_make)

colrS <- function(n){
    CUTS <- seq(0,1,length.out=n+1)
    print(CUTS)
    rep(CUTS,ifelse(CUTS %in% 0:1,1,2))
}

colorScale <- data.frame(z=colrS(nfactor), 
                         col=rep(colr,each=2),
                         stringsAsFactors=FALSE)

[1] 0.0000000 0.1666667 0.3333333 0.5000000 0.6666667 0.8333333
[7] 1.0000000

p <- plot_ly(data = states_coord, 
             type = "choropleth",
             locations = ~state_code, 
             locationmode = "USA-states", 
             z = ~fav_make_val, 
             text = ~hover_text,
             colorscale = colorScale, 
             colorbar = list(tickvals=1:nfactor, 
                             ticktext=names(colr),
                             title = "Favorate Make") ) %>%
layout(title = '2018 US Used Cars -- Favorite Make by State 
       <br>(Hover for more details about listings)',
       geo = g) 

ggplotly(p)

saveWidget(p, 'map_fav_make.html')

# Setting up map for mean price
fig <- plot_geo(states_coord, locationmode = 'USA-states')
fig <- fig %>% add_trace(
    z = ~mean_price, color = ~mean_price, text = ~hover_text, locations = ~state_code, colors = "YlOrBr"
)

# fig <- fig %>% add_trace(
#     z = ~total.exports, text = ~hover_text, locations = ~state_code,
#     color = ~fav_make, colors = 'Purples'
#   )

fig <- fig %>% colorbar(title = "Mean Price ($)")
fig <- fig %>% layout(
    title = '2018 US Used Cars -- Mean Listing Price by State
    <br>(Hover for more details about listings)',
    geo = g 
)

ggplotly(fig)

saveWidget(fig, 'map_mean_price.html')

set.seed(233)
PA_cars <- used_cars[State=="PA"]
#sample_index <- sample(nrow(PA_cars),1000)
PA_sample_index <- sample(nrow(PA_cars),300)
PA_sample <- PA_cars[PA_sample_index]

AL_cars <- used_cars[State=="AL"]
AL_sample_index <- sample(nrow(AL_cars),300)
AL_sample <- AL_cars[AL_sample_index]

sample_df <- rbind(PA_sample,AL_sample)

summary(sample_df)

     Price            Year        Mileage           State    
 Min.   : 1595   2015   :139   Min.   :     7   AL     :300  
 1st Qu.:14000   2014   :111   1st Qu.: 21627   PA     :300  
 Median :19954   2016   : 86   Median : 38160   AK     :  0  
 Mean   :22223   2017   : 55   Mean   : 51222   AR     :  0  
 3rd Qu.:28243   2013   : 45   3rd Qu.: 71634   AZ     :  0  
 Max.   :94000   2012   : 42   Max.   :275714   CA     :  0  
                 (Other):122                    (Other):  0  
     Make             Country         
 Length:600         Length:600        
 Class :character   Class :character  
 Mode  :character   Mode  :character

Map showing favorite make in each state
Map showing the average price of each state
interactive plot showing the mean price of car manufactured in each year, grouped by states
relationship between mileage, year, and Price. (interactive scatterplot)
distribution of country for different price range (stacked proportional bar chart)
histogram of price by country

###Some Testing

library(plotly) 

p0 <- ggplot(sample_df) +
    geom_histogram(aes(x=Price, y = ..density.., fill=State), bins = 29, alpha=0.6) +
    geom_density(aes(x=Price, color=State)) + 
    #ylim(0, 5) +
    xlab("Price ($)") +
    labs(title="Distribution of Prices of PA vs AL", fill="State")

ggsave("price_dist_by_state.png")


pn <- ggplot(sample_df, aes(x=Year, y=Mileage)) +
    geom_point(aes(size=Price, fill=State)) +
    theme_light() +
    labs(title = "Price",   y = "Mileage")



ggplotly(p0) %>% 
    layout(plot_bgcolor='#e5ecf6',   
           xaxis = list(   
               title='Price ($)', 
               zerolinecolor = '#ffff',   
               zerolinewidth = 2,   
               gridcolor = 'ffff'),   
           yaxis = list(   
               title='Density', 
               zerolinecolor = '#ffff',   
               zerolinewidth = 2,   
               gridcolor = 'ffff'),
           title = 'Histogram and Density Plot of Prices by State')

p1 <- ggplot(PA_sample, aes(x=Year, y=Mileage)) +
    #geom_tile(aes(fill = Price)) +
    geom_point(aes(size=Price)) +
    scale_fill_distiller(palette = "YlGnBu", direction = 1) +
    theme_light() +
    labs(title = "Price",   y = "Mileage") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p1

#ggplotly(p1)




p4 <- ggplot(sample_df, aes(x=Year, y=Mileage)) +
    #geom_tile(aes(fill = Price)) +
    geom_point(aes(size=Price, color=State)) +
    #scale_fill_distiller(palette = "YlGnBu", direction = 1) +
    #theme_light() +
    labs(title = "Price ($)",   y = "Mileage (Mi)") +
    ggtitle("Price by Year and Mileage, PA vs AL") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p4

ggsave("price_by_year_&_mileage.png")

mean_prices_by_states <- c()

states <- rep(unique(used_cars$State), times=length(1997:2017))
years <- rep(1997:2017, each=length(unique(used_cars$State)))
mean_price <- rep(0,length(years))
mean_prices_by_year_n_state <- data.frame(states, years,mean_price)


for (i in 1:nrow(mean_prices_by_year_n_state)) {
    this_state <- mean_prices_by_year_n_state[i,1]
    this_year <- mean_prices_by_year_n_state[i,2]
    this_mean_price <- mean(used_cars$Price[used_cars$Year==this_year & used_cars$State==this_state])
    mean_prices_by_year_n_state[i,3] <- this_mean_price
}

used_cars[,State, Year]

        Year State
     1: 1999    AR
     2: 1999    IN
     3: 1999    AR
     4: 1999    FL
     5: 1999    GA
    ---           
851135: 2003    VA
851136: 2003    VA
851137: 2003    WA
851138: 2003    WA
851139: 2003    WA

all_data <- data.table()
for (i in 1997:2017) {
    states_mean <- used_cars[Year==i, mean(Price), State]
    setnames(states_mean, 'V1', 'mean_price')
    with_year <- data.table(rep(i, times=nrow(states_mean)), states_mean)
    setnames(with_year, 'V1', 'years')
    all_data <- rbind(all_data, with_year)
}

p10 <- ggplot(mean_prices_by_year_n_state, aes(x=years, y=mean_price, color=states)) +
    geom_point() +
    #    geom_smooth() +
    ggtitle("Mean Price by Year and State") +
    xlab("Year") +
    ylab("Price ($)")
ggplotly(p10)

p11 <- ggplot(all_data, aes(x=years, y=mean_price, color=State)) +
    geom_point() +
    #    geom_smooth() +
    ggtitle("Mean Price by Year and State") +
    xlab("Year") +
    ylab("Price ($)")
p11

ggplotly(p11)

saveWidget(ggplotly(p11), 'mean_price_by_year_and_state.html')

make by state

south -> american

used_cars[,PriceRange := ifelse(Price>=100000, "Super Car", 
                                ifelse(Price>=40000, "Premium", 
                                       ifelse(Price>=20000, "Intermediate", "Economy")))]
used_cars$PriceRange <- factor(used_cars$PriceRange, 
                               levels = c("Economy","Intermediate","Premium","Super Car"),
                               ordered = T)
#used_cars$Country <- factor()

library(ggridges)

country_mean <- used_cars[,mean(Price), by=list(Country)]
setnames(country_mean, 'V1', 'mean_price')

country_mean <- country_mean[order(-mean_price)]
country_mean$Country <- factor(country_mean$Country, 
                               levels = country_mean[,Country], ordered = T)

ggplot(country_mean, aes(x=Country, y=mean_price)) +
    geom_bar(stat="identity", alpha=0.9, aes(fill=Country)) +
    ggtitle("Mean Price of Cars by Country") +
    ylab("Mean Price ($)")

ggsave("mean_price.png")

ggplot(used_cars[Price<=100000], aes(x=Price,fill=Country)) +
    geom_histogram(bins = 20) +
    ggtitle("Distribution of Prices by Country")

ggsave("dist_price_country.png")
    
ggplot(used_cars[Price<=100000], aes(x=Price,y=Country,fill = stat(x))) +
    geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01)+
    scale_fill_viridis_c(name = "Temp. [F]", option = "C")

ggplot(used_cars[Price>100000], aes(x=Price,fill=Country)) +
    geom_histogram()

country_summary <- used_cars[,.N, by=list(Country, PriceRange)]
country_summary$Country <- factor(country_summary$Country, 
                               levels = country_mean[,Country], ordered = T)

ggplot(country_summary, aes(x=PriceRange, y=N, fill=Country)) +
    geom_bar(position="fill", stat="identity", alpha=0.95) +
    scale_fill_brewer(type = "seq",
                      palette = "Set2",
                      direction = 1,
                      aesthetics = "fill"
    ) +
    ylab("Proportion") +
    ggtitle("Proportion of Country in Each Price Range")

ggsave("price_range_proportion.png")