I used a collection of R packages designed for data science, in particular the following:
# Install the tidyverse only if it is not already available, so that
# re-running the script does not trigger a fresh download every time
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", dependencies = TRUE)
}
# We load the package using library
Of course, I used more than the seven, but I will always mention them at the points where I needed them.
# Read the raw listings export from disk
airbnb <- "data/listings.csv" %>%
  read_csv()
You can glimpse the data (using the pipe operator %>%), or look at its head and tail:
# Print a compact overview of every column of the raw data
glimpse(airbnb)
With the help of the kableExtra package, we can easily and nicely visualize our data. With the subsetting operator [] we can select specific rows and columns of the data.
# Render the first seven rows and the first 24 columns as a styled,
# scrollable HTML table
airbnb[1:7, 1:24] %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    # spelled out: `F` is an ordinary variable that can be reassigned
    full_width = FALSE,
    font_size = 10
  ) %>%
  scroll_box(width = "910px", height = "400px")
Having seen all variables in the kable table, we can now subset the data to a data frame that contains only the variables of interest.
airbnb_data <- airbnb %>% select(id, last_scraped, description, host_id, host_is_superhost,
host_has_profile_pic, host_identity_verified, neighbourhood,
neighbourhood_cleansed, latitude, longitude, property_type, room_type,
accommodates, bathrooms, bedrooms, beds, price, security_deposit,
cleaning_fee, guests_included, extra_people, minimum_nights,
number_of_reviews, first_review, last_review, review_scores_rating,
review_scores_accuracy, review_scores_cleanliness, review_scores_checkin,
review_scores_communication, review_scores_location, review_scores_value,
instant_bookable, cancellation_policy, calculated_host_listings_count,
Now the cleaning process begins.
# Necessary condition: the host must be verified and the listing must have
# at least one review
airbnb_clean <- airbnb_data %>%
  filter(
    host_identity_verified == "TRUE",
    number_of_reviews > 0
  )
# Cleaning of variables containing prices.
# Each money column has every `$` and `,` removed and is then converted to a
# number; the result is stored in a new `*_dkk` column. Columns are referenced
# through the data mask (not `airbnb_clean$...`) so the computation always
# uses the rows that are flowing through the pipeline.
airbnb_clean <- airbnb_clean %>%
  mutate(
    cleaning_fee_dkk = as.numeric(gsub("[\\$,]", "", cleaning_fee)),
    price_dkk = as.numeric(gsub("[\\$,]", "", price)),
    extra_people_dkk = as.numeric(gsub("[\\$,]", "", extra_people)),
    security_deposit_dkk = as.numeric(gsub("[\\$,]", "", security_deposit))
  ) %>%
  # keep only listings with a positive price below 17500
  filter(price_dkk > 0 & price_dkk < 17500)
# first dummies
airbnb_clean <- airbnb_clean %>% mutate(
superhost = case_when(
host_is_superhost == "TRUE" ~ 1,
host_is_superhost == "FALSE" ~ 0
instant = case_when(
instant_bookable == "TRUE" ~ 1,
instant_bookable == "FALSE" ~ 0
# Clean property type
airbnb_clean$property_type <- airbnb_clean$property_type %>%
str_replace(., " \\& ", "_") %>%
str_replace(., "\\/", "_") %>%
# Two new variables:
#   listing_duration - days elapsed between first_review and last_scraped
#   price_person     - price_dkk divided by accommodates
airbnb_clean <- airbnb_clean %>%
  mutate(
    listing_duration = as.numeric(
      # `units` is spelled out in full instead of relying on partial
      # argument matching of `unit`
      difftime(last_scraped, first_review, units = "days")
    ),
    price_person = price_dkk / accommodates
  )
# Deleting Shared Rooms and creating two dummies and an index
airbnb_clean <- airbnb_clean %>% mutate(
rtype = case_when(
room_type == "Shared room" ~ NA_character_,
room_type == "Entire home/apt" ~ "Entire Apartment",
room_type == "Private room" ~ "Private"
home = case_when(
room_type == "Shared room" ~ NA_real_, # drop Shared Rooms
room_type == "Entire home/apt" ~ 1,
room_type == "Private room" ~ 0
strict_cancel = case_when(
cancellation_policy == "strict_14_with_grace_period" ~ 1,
cancellation_policy == "moderate" ~ 0,
cancellation_policy == "flexible" ~ 0
index = ((airbnb_clean$review_scores_accuracy + airbnb_clean$review_scores_cleanliness
+ airbnb_clean$review_scores_checkin +airbnb_clean$review_scores_communication
+ airbnb_clean$review_scores_location + airbnb_clean$review_scores_value)/6)) %>%
Scraping of metro data using the rvest and purrr packages:
# Wikipedia page that lists the Copenhagen Metro stations
metroURL <- "https://en.wikipedia.org/wiki/List_of_Copenhagen_Metro_stations"
# Collect the href of every link found inside a table header cell, drop
# missing values and prefix the Wikipedia host to build full URLs
# (only the first 22 entries are needed)
metro_scrap <- metroURL %>%
  read_html() %>%
  html_nodes(xpath = "//th") %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  na.omit() %>%
  paste0("https://en.wikipedia.org", .)
# Using purrr instead of for-loops
# Extract the geo locations and names of the metro stations
names <- map_df(metro_scrap[1:22], ~ tibble(names = read_html(.) %>% html_nodes("#firstHeading") %>% html_text())) %>%
geo <- map_df(metro_scrap[1:22], ~ tibble(coor = read_html(.) %>% html_nodes(".geo") %>% html_text())) %>%
# A function as a workaround of untidy geo dataframe
Numextract_coord <- function(string){
as.data.frame(as.numeric(unlist(regmatches(string, gregexpr("[[:digit:]]+\\.*[[:digit:]]*", string)))))
# Extract every number from the scraped coordinate strings
df <- Numextract_coord(geo$coor)
# Odd rows are kept as latitude, even rows as longitude
lat <- dplyr::filter(df, row_number() %% 2 == 1)
lng <- dplyr::filter(df, row_number() %% 2 == 0)
# Bind station names and coordinates together and name the three columns
metro_df <- setNames(cbind(names, lat, lng), c("metro", "lat", "long"))
Visualisations with plotly
# # using plotly
f <- list(
family = "Viridis 20",
size = 18,
color = "#440154FF"
x <- list(
title = "Copenhagen Neigbourhoods",
titlefont = f
y <- list(
title = "Number of Listings",
titlefont = f
room_neigh <- airbnb_clean %>%
filter(!is.na(neighbourhood)) %>%
group_by(neighbourhood) %>%
# Bar chart built from room_neigh: neighbourhood on the x axis, n on the
# y axis, bars coloured by room_type with three colours from the viridis
# "D" palette. The axis definitions come from the lists `x` and `y`.
# NOTE(review): relies on room_neigh having the columns neighbourhood,
# room_type and n - confirm against the step that builds room_neigh.
room_neigh %>%
plot_ly(type = "bar",
x = ~neighbourhood,
y = ~n,
color = ~room_type,
colors = viridis_pal(option = "D")(3)) %>%
layout(title = "AirBnB listings sorted after type and neighbourhood",
xaxis = x, yaxis = y)
# only 17 shared rooms listed in the data
# Entire home = 6318
# private room = 1199
Histograms and density plot of night price distributions.
price_apart <- airbnb_clean %>%
filter(price_dkk<2100 & room_type == "Entire home/apt") %>%
ggplot(aes(x = price_dkk)) +
stat_bin(breaks = seq(0,2100,50), fill="#69b3a2", color="#e9ecef", alpha = 0.9) +
ggtitle("Night price distribution of Airbnb appartements") +
theme_ipsum() +
theme_ipsum_rc(grid_col = "gray90") +
theme(plot.title = element_text(size = 12)) +
labs(x = "Price in DKK (Danish krone)",
# Histogram of nightly prices for private rooms priced below 2100 DKK,
# using 50 DKK wide bins between 0 and 2100
price_privat <- ggplot(
  data = filter(airbnb_clean, price_dkk < 2100 & room_type == "Private room"),
  mapping = aes(x = price_dkk)
) +
  stat_bin(
    breaks = seq(0, 2100, 50),
    fill = "#74add1",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  ggtitle("Night price distribution of Airbnb private rooms") +
  theme_ipsum() +
  theme_ipsum_rc(grid_col = "gray90") +
  theme(plot.title = element_text(size = 12)) +
  labs(
    x = "Price in DKK (Danish krone)",
    caption = "1 Euro = 7.4 DKK "
  )
# Density of the log nightly price, one curve per accommodation type (rtype).
# price_dkk is referenced through the data mask instead of
# `airbnb_clean$price_dkk`, and the fill transparency is set as a fixed value
# outside aes() so that 0.5 is actually applied rather than being mapped as a
# one-level aesthetic that adds its own legend entry.
density_plot <- ggplot(
  airbnb_clean,
  aes(x = log(price_dkk), color = rtype)
) +
  geom_density(aes(fill = rtype), alpha = 0.5) +
  labs(
    x = "Nightly Rental Price (Log)",
    y = "Density",
    title = "Price Density by Accommodation Type"
  ) +
  theme_ipsum() +
  theme_ipsum_rc(grid_col = "gray90") +
  theme(
    plot.title = element_text(size = 12),
    legend.title = element_blank()
  )
Calculate the distance between each listing and the nearest metro station as well as the distance between each listing and the city center. I wrote an AirBnBCopenhagen package in order to simplify this, especially the distance function.
# Build the metro station table from the Wikipedia station list
metroURL <- "https://en.wikipedia.org/wiki/List_of_Copenhagen_Metro_stations"
metro_df <- metroURL %>%
  geo_metro()
airbnb_clean$distance <- AirBnBCopenhagen::distance(airbnb_clean$longitude,
# Scatter plot of log nightly price against the `distance` column, with a
# fitted linear trend line
plot_distance <- ggplot(
  data = airbnb_clean,
  mapping = aes(x = distance, y = log(price_dkk))
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_ipsum() +
  theme_ipsum_rc(grid_col = "gray90") +
  theme(
    plot.title = element_text(size = 12),
    legend.title = element_blank()
  ) +
  labs(
    x = "Distance to the next Metro station (in meters)",
    y = "Nightly Rental Price (Log)",
    title = "Price vs. Distance to the next metro"
  )
# Distance to city center: Nyhavn (longitude 12.590659, latitude 55.679687)
# Reference point used as the city centre
nyhavn_lng <- 12.590659
nyhavn_lat <- 55.679687
# Store the result of AirBnBCopenhagen::distance() in the dist_centrum column.
# NOTE(review): only the listing longitude is passed together with the Nyhavn
# coordinates; a listing-latitude argument looks to be missing - confirm
# against the signature of AirBnBCopenhagen::distance().
airbnb_clean$dist_centrum <- AirBnBCopenhagen::distance(airbnb_clean$longitude,
nyhavn_lng, nyhavn_lat)
# Scatter plot of log nightly price against the `dist_centrum` column, with
# a fitted linear trend line
plot_distance_c <- ggplot(
  data = airbnb_clean,
  mapping = aes(x = dist_centrum, y = log(price_dkk))
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_ipsum() +
  theme_ipsum_rc(grid_col = "gray90") +
  theme(
    plot.title = element_text(size = 12),
    legend.title = element_blank()
  ) +
  labs(
    x = "Distance to Nyhavn, City Center",
    y = "Nightly Rental Price (Log)",
    title = "Price vs. Distance to the City Center"
  )
Descriptive Analysis with stargazer
# Variables for the descriptive table; rows with any missing value are
# dropped. The selection lists 13 columns, in the same order as the 13 labels
# assigned to cph_data right afterwards: home, cleaning_fee_dkk and
# security_deposit_dkk were missing, which made the label assignment fail on
# a 10-column data frame.
cph_data <- airbnb_clean[, c(
  "price_dkk", "index", "strict_cancel",
  "home", "cleaning_fee_dkk",
  "listing_duration", "instant",
  "security_deposit_dkk",
  "accommodates", "minimum_nights",
  "bathrooms", "distance",
  "dist_centrum"
)] %>%
  na.omit()
# Replace the technical column names with human-readable labels for the
# summary table (13 labels, assigned by position)
names(cph_data) <- c(
  "Price per night (DKK)", "Review Index",
  "Strict Cancellation", "Full apartment/house",
  "Cleaning Fee (DKK)", "Duration of Listing (days)",
  "Instant Bookable", "Security Deposit (DKK)",
  "No. Accommodates", "Minimum Nights",
  "No. Bathrooms", "Distance Metro",
  "Distance Center"
)
# HTML summary-statistics table of cph_data, restricted to the 13 labelled
# columns listed below (selected by their display names).
stargazer(as.data.frame(cph_data)[, c("Price per night (DKK)",
"Review Index",
"Strict Cancellation",
"Full apartment/house",
"Cleaning Fee (DKK)",
"Duration of Listing (days)",
"Instant Bookable",
"Security Deposit (DKK)",
"No. Accommodates",
"Minimum Nights",
"No. Bathrooms",
"Distance Metro",
"Distance Center")],
type = "html",
digits = 2,
# statistics reported per variable
summary.stat = c("mean","sd","median","min", "max"),
font.size = "small",
column.sep.width = "10pt")
Now we turn to displaying them on an interactive map using the leaflet package.
# leaflet package for R
# Base leaflet map centred on Copenhagen's longitude and latitude
m <- setView(
  leaflet(),
  lng = 12.568337,
  lat = 55.676098,
  zoom = 12
)
# For the pop up
# Builds the HTML text of the Nyhavn popup: name, place and a Wikipedia link,
# joined with <br/> line breaks.
nyhavn <- paste(sep = "<br/>",
#paste0("<img src='https://en.wikipedia.org/wiki/Nyhavn#/media/File:Nyhavn_MichaD.jpg", "' />"),
paste0("<b>Name: </b>", "Nyhavn"),
paste0("<b>Place: </b>", "City Center, Copenhagen"),
# NOTE(review): the closing part of the link is passed as an argument named
# `...`; presumably a plain unnamed argument was intended - confirm.
paste0("<a href='https://en.wikipedia.org/wiki/Nyhavn",
... = "'>Link</a>"))
map_nyhavn <- m %>%
addProviderTiles("Esri.WorldImagery", group = "Background 1") %>%
addTiles(options = providerTileOptions(noWrap = TRUE), group = "Background 2") %>%
addCircles(data=metro_df, lng = ~long, lat = ~lat,popup = ~metro,
fillColor="red", stroke = TRUE, fillOpacity = 0.8 ,
radius = 80, group = "Metro Stations") %>%
addCircleMarkers(data = airbnb_clean, lng = ~ longitude, lat = ~ latitude,
radius = 1 , color =" black",
fillColor = "#ffa500", stroke = TRUE, fillOpacity = 2,
group="AirBnB Listings",
clusterOptions = markerClusterOptions()) %>%
addLayersControl(overlayGroups = c("Metro Stations","AirBnB Listings") ,
baseGroups = c("Background 1","Background 2"),
options = layersControlOptions(collapsed = FALSE))%>%
# Add the Nyhavn marker with its popup; the coordinates reuse the
# nyhavn_lat / nyhavn_lng constants instead of repeating the literal values
map_nyhavn %>%
  addMarkers(lat = nyhavn_lat, lng = nyhavn_lng, popup = nyhavn)
The Neighbourhoods GEOJSON file which is also downloaded from InsideAirBnB.com provides geocoordinates of neighbourhoods of Copenhagen.
# Read the neighbourhood polygons as a spatial ("sp") object
nb_geo <- geojson_read(
  "data/neighbourhoods.geojson",
  what = "sp"
)
# One row per neighbourhood: nest its listings, then derive the median
# longitude/latitude and the number of listings from each nested table
borough_data <- airbnb_clean %>%
  group_by(neighbourhood_cleansed) %>%
  nest() %>%
  mutate(
    center_lon = map_dbl(data, function(listings) median(listings$longitude)),
    center_lat = map_dbl(data, function(listings) median(listings$latitude)),
    number_listings = map_int(data, function(listings) nrow(listings))
  )
map_poly <- m %>%
group="Background 1") %>%
addTiles(options = providerTileOptions(noWrap = TRUE),
group="Background 2") %>%
addCircleMarkers(data = airbnb_clean, lng = ~ longitude,
lat = ~ latitude, radius=1 , color="black",
fillColor="#ffa500", stroke = TRUE,
fillOpacity = 2, group="AirBnB Listings",
clusterOptions = markerClusterOptions()) %>%
addPolygons(data = nb_geo, color = "#444444", weight = 2,
opacity = 1, group = "Polygon") %>%
addLayersControl(overlayGroups = c("AirBnB Listings", "Polygon", "neighbourhood"),
baseGroups = c("Background 1","Background 2"),
options = layersControlOptions(collapsed = FALSE)) %>%
addLabelOnlyMarkers(data = borough_data,
lng = ~center_lon, lat = ~center_lat,
label = ~neighbourhood_cleansed,
labelOptions = labelOptions(noHide = TRUE,
direction = 'top',
textOnly = TRUE,
opacity = 1,
group = "neighbourhood")) %>%
Regression Analysis
# Variables used in the correlation matrix and the regression models; rows
# with any missing value are dropped. home, cleaning_fee_dkk and superhost
# are included because the models fitted on cph_data reference them and
# would otherwise fail with "object not found".
cph_data <- airbnb_clean[, c(
  "price_dkk", "index", "strict_cancel",
  "home", "cleaning_fee_dkk",
  "listing_duration", "instant", "superhost",
  "accommodates", "minimum_nights",
  "bathrooms", "distance", "dist_centrum"
)] %>%
  na.omit()
# Correlation matrix of all regression variables, shown with two decimals
res <- cph_data %>%
  cor()
round(res, digits = 2)
# Linear regression with the dependent variable in level form, followed by
# the grid of its diagnostic plots
mod_no_log <- lm(
  price_dkk ~ distance + dist_centrum,
  data = airbnb_clean
)
plot_mod_no_log <- plot_grid(plot_model(mod_no_log, type = "diag"))
# Thus we take the log of the price as the dependent variable.
# Each model is followed by the grid of its diagnostic plots.

# Model 1: distance variables only
mod1 <- lm(
  price_dkk %>% log() ~ distance + dist_centrum,
  data = cph_data
)
plot_mod1 <- plot_grid(plot_model(mod1, type = "diag"))

# Model 2: distance plus property characteristics
mod2 <- lm(
  price_dkk %>% log() ~ distance + dist_centrum + home +
    accommodates + bathrooms,
  data = cph_data
)
plot_mod2 <- plot_grid(plot_model(mod2, type = "diag"))

# Model 3: distance plus booking rules
mod3 <- lm(
  price_dkk %>% log() ~ distance + dist_centrum +
    strict_cancel + instant + minimum_nights + cleaning_fee_dkk,
  data = cph_data
)
plot_mod3 <- plot_grid(plot_model(mod3, type = "diag"))

# Model 4: distance plus reputation
mod4 <- lm(
  price_dkk %>% log() ~ distance + dist_centrum +
    index + superhost +
    listing_duration,
  data = cph_data
)
plot_mod4 <- plot_grid(plot_model(mod4, type = "diag"))

# Model 5: all regressors together
mod5 <- lm(
  price_dkk %>% log() ~ distance + dist_centrum +
    home + accommodates + bathrooms + strict_cancel + instant +
    minimum_nights + cleaning_fee_dkk +
    index + superhost + listing_duration,
  data = cph_data
)
plot_mod5 <- plot_grid(plot_model(mod5, type = "diag"))
# Regression diagnostics
# Robust SE
# Coefficient tests for each model using a heteroscedasticity-consistent
# covariance matrix from vcovHC(). The argument is written with its full name
# `vcov.` instead of relying on partial matching of `vcov`.
mod1_rob <- coeftest(mod1, vcov. = vcovHC(mod1))
mod2_rob <- coeftest(mod2, vcov. = vcovHC(mod2))
mod3_rob <- coeftest(mod3, vcov. = vcovHC(mod3))
mod4_rob <- coeftest(mod4, vcov. = vcovHC(mod4))
mod5_rob <- coeftest(mod5, vcov. = vcovHC(mod5))
## Multicollinearity
# no problems
## Breusch–Pagan test
# H0: homoscedasticity
# Heteroscedasticity is present in every model
# Final stargazer output
# Regression table for the five models, rendered as HTML.
# covariate.labels are applied by position, in the order in which the
# regressors first appear across the models, so there must be one label per
# regressor (12). The labels for accommodates and superhost were missing,
# which shifted every label after "Apartment (Dummy)" onto the wrong row.
stargazer(mod1, mod2, mod3, mod4, mod5,
  type = "html",
  title = "Linear Regression Model",
  style = "ajs",
  summary = NULL,
  out.header = FALSE,
  column.labels = c("Distance", "Property", "Rules", "Reputation", "Full model"),
  covariate.labels = c(
    "Distance Metro",
    "Distance Centre (Proxy)",
    "Apartment (Dummy)",
    "Accommodates",
    "Number of Bathrooms",
    "Strict Cancel",
    "Instant Booking",
    "Minimum nights",
    "Cleaning Fee",
    "Review Index",
    "Superhost (Dummy)",
    "Listings duration"
  ),
  dep.var.caption = "Dep. Var: Log Price per night in DKK",
  star.cutoffs = c(0.05, 0.01, 0.001),
  dep.var.labels.include = TRUE
)
# With robust SE: the same table built from the coeftest results.
# covariate.labels are applied by position, so one label per regressor (12)
# is required; the labels for accommodates and superhost were missing, which
# shifted every label after "Apartment (Dummy)" onto the wrong row.
stargazer(mod1_rob, mod2_rob, mod3_rob, mod4_rob, mod5_rob,
  type = "html",
  title = "Linear Regression Model (Robust SE)",
  style = "ajs",
  summary = NULL,
  out.header = FALSE,
  column.labels = c("Distance", "Property", "Rules", "Reputation", "Full model"),
  column.sep.width = "5pt",
  covariate.labels = c( # Covariate Labels
    "Distance Metro",
    "Distance Centre (Proxy)",
    "Apartment (Dummy)",
    "Accommodates",
    "Number of Bathrooms",
    "Strict Cancel",
    "Instant Booking",
    "Minimum nights",
    "Cleaning Fee",
    "Review Index",
    "Superhost (Dummy)",
    "Listings duration"
  ),
  dep.var.caption = "Dep. Var", # Caption (Top) of dependent variable
  star.cutoffs = c(0.05, 0.01, 0.001),
  dep.var.labels = c("Log Price per night in DKK", "")
)
This shows the version of R I am using as well as all of the packages.
# library(help = "AirBnBCopenhagen")
