Here are all the code I used to generate the plots. It is a bit messy though, as it contains some code used for testing purposes as well!
Before doing anything with the data and making fun graphs and plots, we need to clean our data.
The original dataset has 8 columns, but we won’t need Vin, City, and Model because they are too specific. We will drop these three columns to save resources.
used_cars <- fread(input = "true_car_listings.csv", drop = c("City", "Vin", "Model"))
#used_cars <- fread(input = "true_car_listings.csv")
str(used_cars)
Classes 'data.table' and 'data.frame': 852061 obs. of 5 variables:
$ Price : int 10194 6991 44995 6950 6194 8995 9995 10995 13194 13995 ...
$ Year : int 2011 2007 2006 2009 2000 2001 1999 2013 2015 2015 ...
$ Mileage: int 63451 123972 16900 85354 128961 103636 217585 50761 35321 47315 ...
$ State : chr "AK" "AK" "AK" "AK" ...
$ Make : chr "Dodge" "Dodge" "Dodge" "Dodge" ...
- attr(*, ".internal.selfref")=<externalptr>
Changing Year to factor and removing Year 2018 because of possibly incomplete data (2018 not done yet)
used_cars <- used_cars[Year != 2018,]
used_cars$Year <- factor(used_cars$Year, ordered = T)
summary(used_cars$Year)
1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
595 774 1252 1931 2583 3797 5648 8114 11000 15072
2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
21168 24709 19060 27536 39764 49758 74696 162428 157513 132134
2017
91607
Fix bad state names
wrong_state_names <- c("Az", "Ca", "Fl", "ga", "Ga", "Md", "Oh", "Va")
state_fix <- function(state_name) {
if (state_name %in% wrong_state_names) {
return(toupper(state_name))
} else
(return(state_name))
}
used_cars[, State := ifelse(State %in% wrong_state_names, toupper(State), State)]
# full_idx <- c()
# for (bad_state in wrong_state_names) {
# tmp_idx <- which(used_cars$State == "AZ")
# full_idx <- c(full_idx, tmp_idx)
# }
#
# for (row_num in full_idx) {
# if (used_cars[row_num, "State"] %in% wrong_state_names) {
# used_cars[row_num, "State"] <- toupper(used_cars[row_num, "State"])
# }
# }
used_cars$State <- factor(used_cars$State)
summary(used_cars$State)
AK AL AR AZ CA CO CT DC DE FL GA
1663 13284 6950 22449 80353 22973 10900 25 2416 73121 39597
HI IA ID IL IN KS KY LA MA MD ME
2995 4829 3499 38629 16461 7891 13685 8523 17977 17648 1898
MI MN MO MS MT NC ND NE NH NJ NM
10312 10988 16158 6236 1981 37494 1092 4921 5690 27821 4724
NV NY OH OK OR PA RI SC SD TN TX
6883 27909 23301 9582 10349 28082 1857 9782 1267 19160 94559
UT VA VT WA WI WV WY
10360 35245 1122 23139 11250 1499 610
unique(used_cars$State)
[1] AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME
[23] MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX
[45] UT VA VT WA WI WV WY
51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS ... WY
str(used_cars)
Classes 'data.table' and 'data.frame': 851139 obs. of 5 variables:
$ Price : int 10194 6991 44995 6950 6194 8995 9995 10995 13194 13995 ...
$ Year : Ord.factor w/ 21 levels "1997"<"1998"<..: 15 11 10 13 4 5 3 17 19 19 ...
$ Mileage: int 63451 123972 16900 85354 128961 103636 217585 50761 35321 47315 ...
$ State : Factor w/ 51 levels "AK","AL","AR",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Make : chr "Dodge" "Dodge" "Dodge" "Dodge" ...
- attr(*, ".internal.selfref")=<externalptr>
states_coord <- fread("us_states_coord.csv")
setkey(states_coord, state_code)
setkey(used_cars, Make)
# Find each state's favorite make
for (state in unique(used_cars$State)) {
sorted <- used_cars[State==state, .N, by=list(Make)]
this_fav_make <- sorted[order(-N)]$Make[1]
states_coord[.(state), fav_make := this_fav_make]
}
# Find each state's favorite car manufacturer country
for (state in unique(used_cars$State)) {
sorted2 <- used_cars[State==state, .N, by=list(Country)]
this_fav_country <- sorted2[order(-N)]$Country[1]
states_coord[.(state), fav_country := this_fav_country]
}
# Find each state's number of listings
for (state in unique(used_cars$State)) {
num_car_by_state <- used_cars[,.N, State]
this_state_n <- num_car_by_state[State==state, N]
states_coord[.(state), num_listings := this_state_n]
}
# Find each state's mean listing price
for (state in unique(used_cars$State)) {
mean_price_by_state <- used_cars[,round(mean(Price),2), State]
this_mean <- mean_price_by_state[State==state, V1]
states_coord[.(state), mean_price := this_mean]
}
states_coord$fav_make <- factor(states_coord$fav_make)
# Got help from https://www.analytics-tuts.com/color-usa-map-based-on-a-categorical-variable-using-plotly-in-r/
states_coord$fav_make_val <- factor(as.numeric(states_coord$fav_make))
states_coord$hover_text <- with(states_coord,
paste("<b>State:</b>", state_name, '<br>',
"<b>Number of Listings:</b>", num_listings,"<br>",
"<b>Mean Price ($):</b>", mean_price, "<br>",
"<b>Favorite Make:</b>", fav_make))
# specify some map projection/options
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showocean = TRUE,
oceancolor = toRGB("grey"),
showlakes = TRUE,
lakecolor = toRGB("LightBlue"),
showrivers = TRUE,
rivercolor = toRGB("LightBlue")
)
# Setting up colors for fav makes
nfactor <- length(levels(states_coord$fav_make_val))
#colr <- brewer.pal(n = nfactor,name = "RdYlGn")
#colr <- c("#8B4500", "#FFB90F", "#6495ED", "#458B74", "#838B8B", "#7A378B")
colr <- c("#8B6914", "#FFEC8B", "#63B8FF", "#76EEC6", "#C1CDCD", "#AB82FF")
names(colr) <- levels(states_coord$fav_make)
colrS <- function(n){
CUTS <- seq(0,1,length.out=n+1)
print(CUTS)
rep(CUTS,ifelse(CUTS %in% 0:1,1,2))
}
colorScale <- data.frame(z=colrS(nfactor),
col=rep(colr,each=2),
stringsAsFactors=FALSE)
[1] 0.0000000 0.1666667 0.3333333 0.5000000 0.6666667 0.8333333
[7] 1.0000000
p <- plot_ly(data = states_coord,
type = "choropleth",
locations = ~state_code,
locationmode = "USA-states",
z = ~fav_make_val,
text = ~hover_text,
colorscale = colorScale,
colorbar = list(tickvals=1:nfactor,
ticktext=names(colr),
title = "Favorate Make") ) %>%
layout(title = '2018 US Used Cars -- Favorite Make by State
<br>(Hover for more details about listings)',
geo = g)
ggplotly(p)
saveWidget(p, 'map_fav_make.html')
# Setting up map for mean price
fig <- plot_geo(states_coord, locationmode = 'USA-states')
fig <- fig %>% add_trace(
z = ~mean_price, color = ~mean_price, text = ~hover_text, locations = ~state_code, colors = "YlOrBr"
)
# fig <- fig %>% add_trace(
# z = ~total.exports, text = ~hover_text, locations = ~state_code,
# color = ~fav_make, colors = 'Purples'
# )
fig <- fig %>% colorbar(title = "Mean Price ($)")
fig <- fig %>% layout(
title = '2018 US Used Cars -- Mean Listing Price by State
<br>(Hover for more details about listings)',
geo = g
)
ggplotly(fig)
saveWidget(fig, 'map_mean_price.html')
set.seed(233)
PA_cars <- used_cars[State=="PA"]
#sample_index <- sample(nrow(PA_cars),1000)
PA_sample_index <- sample(nrow(PA_cars),300)
PA_sample <- PA_cars[PA_sample_index]
AL_cars <- used_cars[State=="AL"]
AL_sample_index <- sample(nrow(AL_cars),300)
AL_sample <- AL_cars[AL_sample_index]
sample_df <- rbind(PA_sample,AL_sample)
summary(sample_df)
Price Year Mileage State
Min. : 1595 2015 :139 Min. : 7 AL :300
1st Qu.:14000 2014 :111 1st Qu.: 21627 PA :300
Median :19954 2016 : 86 Median : 38160 AK : 0
Mean :22223 2017 : 55 Mean : 51222 AR : 0
3rd Qu.:28243 2013 : 45 3rd Qu.: 71634 AZ : 0
Max. :94000 2012 : 42 Max. :275714 CA : 0
(Other):122 (Other): 0
Make Country
Length:600 Length:600
Class :character Class :character
Mode :character Mode :character
Map showing favorite make in each state
Map showing the average price of each state
interactive plot showing the mean price of car manufactured in each year, grouped by states
relationship between mileage, year, and Price. (interactive scatterplot)
distribution of country for different price range (stacked proportional bar chart)
histogram of price by country
###Some Testing
library(plotly)
p0 <- ggplot(sample_df) +
geom_histogram(aes(x=Price, y = ..density.., fill=State), bins = 29, alpha=0.6) +
geom_density(aes(x=Price, color=State)) +
#ylim(0, 5) +
xlab("Price ($)") +
labs(title="Distribution of Prices of PA vs AL", fill="State")
ggsave("price_dist_by_state.png")
pn <- ggplot(sample_df, aes(x=Year, y=Mileage)) +
geom_point(aes(size=Price, fill=State)) +
theme_light() +
labs(title = "Price", y = "Mileage")
ggplotly(p0) %>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
title='Price ($)',
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
title='Density',
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
title = 'Histogram and Density Plot of Prices by State')
p1 <- ggplot(PA_sample, aes(x=Year, y=Mileage)) +
#geom_tile(aes(fill = Price)) +
geom_point(aes(size=Price)) +
scale_fill_distiller(palette = "YlGnBu", direction = 1) +
theme_light() +
labs(title = "Price", y = "Mileage") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p1
#ggplotly(p1)
p4 <- ggplot(sample_df, aes(x=Year, y=Mileage)) +
#geom_tile(aes(fill = Price)) +
geom_point(aes(size=Price, color=State)) +
#scale_fill_distiller(palette = "YlGnBu", direction = 1) +
#theme_light() +
labs(title = "Price ($)", y = "Mileage (Mi)") +
ggtitle("Price by Year and Mileage, PA vs AL") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p4
ggsave("price_by_year_&_mileage.png")
mean_prices_by_states <- c()
states <- rep(unique(used_cars$State), times=length(1997:2017))
years <- rep(1997:2017, each=length(unique(used_cars$State)))
mean_price <- rep(0,length(years))
mean_prices_by_year_n_state <- data.frame(states, years,mean_price)
for (i in 1:nrow(mean_prices_by_year_n_state)) {
this_state <- mean_prices_by_year_n_state[i,1]
this_year <- mean_prices_by_year_n_state[i,2]
this_mean_price <- mean(used_cars$Price[used_cars$Year==this_year & used_cars$State==this_state])
mean_prices_by_year_n_state[i,3] <- this_mean_price
}
used_cars[,State, Year]
Year State
1: 1999 AR
2: 1999 IN
3: 1999 AR
4: 1999 FL
5: 1999 GA
---
851135: 2003 VA
851136: 2003 VA
851137: 2003 WA
851138: 2003 WA
851139: 2003 WA
all_data <- data.table()
for (i in 1997:2017) {
states_mean <- used_cars[Year==i, mean(Price), State]
setnames(states_mean, 'V1', 'mean_price')
with_year <- data.table(rep(i, times=nrow(states_mean)), states_mean)
setnames(with_year, 'V1', 'years')
all_data <- rbind(all_data, with_year)
}
p10 <- ggplot(mean_prices_by_year_n_state, aes(x=years, y=mean_price, color=states)) +
geom_point() +
# geom_smooth() +
ggtitle("Mean Price by Year and State") +
xlab("Year") +
ylab("Price ($)")
ggplotly(p10)
p11 <- ggplot(all_data, aes(x=years, y=mean_price, color=State)) +
geom_point() +
# geom_smooth() +
ggtitle("Mean Price by Year and State") +
xlab("Year") +
ylab("Price ($)")
p11
ggplotly(p11)
saveWidget(ggplotly(p11), 'mean_price_by_year_and_state.html')
make by state
south -> american
used_cars[,PriceRange := ifelse(Price>=100000, "Super Car",
ifelse(Price>=40000, "Premium",
ifelse(Price>=20000, "Intermediate", "Economy")))]
used_cars$PriceRange <- factor(used_cars$PriceRange,
levels = c("Economy","Intermediate","Premium","Super Car"),
ordered = T)
#used_cars$Country <- factor()
library(ggridges)
country_mean <- used_cars[,mean(Price), by=list(Country)]
setnames(country_mean, 'V1', 'mean_price')
country_mean <- country_mean[order(-mean_price)]
country_mean$Country <- factor(country_mean$Country,
levels = country_mean[,Country], ordered = T)
ggplot(country_mean, aes(x=Country, y=mean_price)) +
geom_bar(stat="identity", alpha=0.9, aes(fill=Country)) +
ggtitle("Mean Price of Cars by Country") +
ylab("Mean Price ($)")
ggsave("mean_price.png")
ggplot(used_cars[Price<=100000], aes(x=Price,fill=Country)) +
geom_histogram(bins = 20) +
ggtitle("Distribution of Prices by Country")
ggsave("dist_price_country.png")
ggplot(used_cars[Price<=100000], aes(x=Price,y=Country,fill = stat(x))) +
geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01)+
scale_fill_viridis_c(name = "Temp. [F]", option = "C")
ggplot(used_cars[Price>100000], aes(x=Price,fill=Country)) +
geom_histogram()
country_summary <- used_cars[,.N, by=list(Country, PriceRange)]
country_summary$Country <- factor(country_summary$Country,
levels = country_mean[,Country], ordered = T)
ggplot(country_summary, aes(x=PriceRange, y=N, fill=Country)) +
geom_bar(position="fill", stat="identity", alpha=0.95) +
scale_fill_brewer(type = "seq",
palette = "Set2",
direction = 1,
aesthetics = "fill"
) +
ylab("Proportion") +
ggtitle("Proportion of Country in Each Price Range")
ggsave("price_range_proportion.png")