-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
377 additions
and
0 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
airbnb<-read.csv('airbnb-listings.csv',sep = ';') | ||
options(repr.plot.height=4,repr.plot.width=6,repr.plot.res = 300) | ||
library(dplyr) | ||
head(airbnb) | ||
df_madrid <- airbnb[airbnb$Room.Type == "Entire home/apt" & airbnb$Neighbourhood != '' & airbnb$City == "Madrid", | ||
c("Neighbourhood", "Accommodates", "Bathrooms", "Bedrooms", "Beds", "Price", "Square.Feet", "Guests.Included", "Extra.People", "Review.Scores.Rating", "Latitude", "Longitude")] | ||
head(df_madrid) | ||
df_madrid$Square.Meters <- df_madrid$Square.Feet * 0.092903 | ||
df_madrid[, c("Square.Feet", "Square.Meters")] | ||
head(df_madrid) | ||
entire_home_NA <- sum(is.na(df_madrid$Square.Meters)) | ||
porcent <- (entire_home_NA / nrow(df_madrid)) * 100 | ||
cat(porcent) | ||
entire_home_real <-entire_home_NA | ||
home_cero <-sum(df_madrid$Square.Meters ==0 & !is.na(df_madrid$Square.Meters)) | ||
porcent_real <- (home_cero / entire_home_real) *100 | ||
cat(porcent_real) | ||
df_madrid$Square.Meters[df_madrid$Square.Meters == '' | df_madrid$Square.Meters == 0] <- NA | ||
cat(df_madrid$Square.Meters) | ||
library(ggplot2) | ||
ggplot(df_madrid, aes(x = Square.Meters)) + | ||
geom_histogram(binwidth = 10, fill = 'red', color = "white") + | ||
labs(title = "Histograma de Metros Cuadrados", x = "Metros Cuadrados", y = "") | ||
df_madrid$Square.Meters[df_madrid$Square.Meters < 20] <- NA | ||
cat(df_madrid$Square.Meters) | ||
library(dplyr) | ||
df_madrid_Neighbourhood<- unique(df_madrid$Neighbourhood) | ||
Neighbourhood_0 <- character(0) | ||
for (Neighbourhood in df_madrid_Neighbourhood) { | ||
df_madrid_barrio <- df_madrid[df_madrid$Neighbourhood == Neighbourhood, ] | ||
df_na <- all(is.na(df_madrid_barrio$Square.Meters)) | ||
if (df_na) { | ||
Neighbourhood_0 <- c(Neighbourhood_0, Neighbourhood) | ||
} | ||
} | ||
df_madrid <- df_madrid[!df_madrid$Neighbourhood %in% Neighbourhood_0, ] | ||
summary(df_madrid) | ||
tky<-TukeyHSD(aov( formula=Square.Meters~Neighbourhood, data=df_madrid )) | ||
tky.result<-data.frame(tky$Neighbourhood) | ||
cn <-sort(unique(df_madrid$Neighbourhood)) | ||
resm <- matrix(NA, length(cn),length(cn)) | ||
rownames(resm) <- cn | ||
colnames(resm) <- cn | ||
resm[lower.tri(resm) ] <- round(tky.result$p.adj,4) | ||
resm[upper.tri(resm) ] <- t(resm)[upper.tri(resm)] | ||
diag(resm) <- 1 | ||
library(ggplot2) | ||
library(reshape2) | ||
dfResm <- melt(resm) | ||
ggplot(dfResm, aes(x=Var1, y=Var2, fill=value))+ | ||
geom_tile(colour = "black")+ | ||
scale_fill_gradient(low = "white",high = "steelblue")+ | ||
ylab("Neighbourhood")+xlab("Neighbourhood")+theme_bw()+ | ||
theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none") | ||
library(factoextra) | ||
distancia <- as.dist(1 - resm) | ||
clust_madrid <- hclust(distancia, method = "complete") | ||
dendrogram <- as.dendrogram(clust_madrid) | ||
fviz_dend(clust_madrid, cex = 0.8, lwd = 0.8, k = 38, | ||
rect = TRUE, | ||
k_colors = "jco", | ||
rect_border = "jco", | ||
rect_fill = TRUE, | ||
ggtheme = theme_gray()) | ||
cluster <- cutree(clust_madrid, k = 38) | ||
df_madrid_cluster <- data.frame(Neighbourhood = unique(df_madrid$Neighbourhood), neighb_id = cluster) | ||
df_madrid <- merge(df_madrid, df_madrid_cluster , by = "Neighbourhood", all.x = TRUE) | ||
head(df_madrid) | ||
set.seed(42) | ||
idx<-sample(1:nrow(df_madrid),nrow(df_madrid)*0.7) | ||
train.df<-df_madrid[idx,] | ||
test.df<-df_madrid[-idx,] | ||
train.df <- train.df |> select(!c(Neighbourhood,Square.Feet)) | ||
test.df <- test.df |> select(!c(Neighbourhood,Square.Feet)) | ||
cat("Número de valores:", nrow(train.df), "\n") | ||
cat("Número de valores de datos de prueba:", nrow(test.df), "\n") | ||
summary(train.df) | ||
cor(train.df[,c("Accommodates","Square.Meters","Bathrooms","Bedrooms","Beds","Price","Guests.Included","Extra.People")], use ="pairwise.complete.obs") | ||
modelo1 <- lm(Square.Meters ~ Accommodates + Bathrooms + Bedrooms + Beds + Price + Guests.Included + Extra.People + Review.Scores.Rating + Latitude + Longitude , data = df_train) | ||
summary(modelo1) | ||
modelo2 <- lm(Square.Meters ~ Accommodates + Bedrooms + Price, data = df_train) | ||
summary(modelo2) | ||
residuo1 <- residuals(modelo1, newdata = test.df) | ||
residuo2 <- residuals(modelo2, newdata = test.df) | ||
library(ggplot2) | ||
ggplot(data.frame(residuo = residuo1), aes(x = residuo)) + | ||
geom_histogram(binwidth = 1, fill = 'blue', color = "black", alpha = 0.5) + | ||
labs(title = "Histograma de residuos de modelo1", x = "Residuos", y = "") | ||
ggplot(data.frame(residuo = residuo2), aes(x = residuo)) + | ||
geom_histogram(binwidth = 1, fill = 'darkgreen', color = "black", alpha = 0.5) + | ||
labs(title = "Histograma de residuos de modelo2", x = "Residuos", y = "") | ||
df_anuncio <- data.frame( | ||
Accommodates=6, Bathrooms=1, Bedrooms=3, Beds= 3, | ||
Price=80, Guests.Included=3, Review.Scores.Rating = 80, | ||
Extra.People=1, Latitude=mean(df_madrid$Latitude,na.rm = TRUE), Longitude=-mean(df_madrid$Longitude,na.rm = TRUE), neighb_id = "1") | ||
metros_estimados_mod1 <- predict(modelo1, df_anuncio) | ||
paste("Metros cuadrados apartamento con modelo1:", metros_estimados) | ||
metros_habitacion_adicional <- (modelo1$coefficients["Bedrooms"]) | ||
paste("Metros por habitación adicionalcon modelo1:",metros_habitacion_adicional) | ||
metros_estimados_mod2 <- predict(modelo2, df_anuncio) | ||
paste("Metros cuadrados apartamento con modelo2:", metros_estimados) | ||
metros_habitacion_adicional <- (modelo2$coefficients["Bedrooms"]) | ||
paste("Metros por habitación adicionalcon modelo2:",metros_habitacion_adicional) | ||
df_madrid_nuevo <- df_madrid | ||
df_madrid_nuevo[is.na(df_madrid_nuevo$Square.Meters),"Square.Meters"] <- predict(modelo1, (df_madrid[is.na(df_madrid$Square.Meters),])) | ||
head(df_madrid_nuevo) | ||
savehistory("~/Documents/Documentos - MacBook Pro de Isabel (2)/Keepcoding/Cursos/0. Curso Big Data/12. data-mining/practica/Practica.Rhistory") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,270 @@ | ||
--- | ||
format: html | ||
editor: visual | ||
markdown: | ||
wrap: 72 | ||
--- | ||
|
||
Vasmos a cargar el dataset de AirBnB descargado de [aquí](https://public.opendatasoft.com/explore/dataset/airbnb-listings/export/?disjunctive.host_verifications&disjunctive.amenities&disjunctive.features&q=Madrid&dataChart=eyJxdWVyaWVzIjpbeyJjaGFydHMiOlt7InR5cGUiOiJjb2x1bW4iLCJmdW5jIjoiQ09VTlQiLCJ5QXhpcyI6Imhvc3RfbGlzdGluZ3NfY291bnQiLCJzY2llbnRpZmljRGlzcGxheSI6dHJ1ZSwiY29sb3IiOiJyYW5nZS1jdXN0b20ifV0sInhBeGlzIjoiY2l0eSIsIm1heHBvaW50cyI6IiIsInRpbWVzY2FsZSI6IiIsInNvcnQiOiIiLCJzZXJpZXNCcmVha2Rvd24iOiJyb29tX3R5cGUiLCJjb25maWciOnsiZGF0YXNldCI6ImFpcmJuYi1saXN0aW5ncyIsIm9wdGlvbnMiOnsiZGlzanVuY3RpdmUuaG9zdF92ZXJpZmljYXRpb25zIjp0cnVlLCJkaXNqdW5jdGl2ZS5hbWVuaXRpZXMiOnRydWUsImRpc2p1bmN0aXZlLmZlYXR1cmVzIjp0cnVlfX19XSwidGltZXNjYWxlIjoiIiwiZGlzcGxheUxlZ2VuZCI6dHJ1ZSwiYWxpZ25Nb250aCI6dHJ1ZX0%3D&location=16,41.38377,2.15774&basemap=jawg.streets) | ||
|
||
 | ||
|
||
```{r} | ||
airbnb<-read.csv('airbnb-listings.csv',sep = ';') | ||
options(repr.plot.height=4,repr.plot.width=6,repr.plot.res = 300) | ||
library(dplyr) | ||
``` | ||
|
||
1. Vamos a quedarnos con las columnas de mayor interés: 'City','Room.Type','Neighbourhood','Accommodates','Bathrooms','Bedrooms','Beds','Price','Square.Feet','Guests.Included','Extra.People','Review.Scores.Rating','Latitude', 'Longitude' Nos quedarmos solo con las entradas de Madrid para Room.Type=="Entire home/apt" y cuyo barrio (Neighbourhood) no está vacio '' Podemos eliminar las siguientes columnas que ya no son necesarias: "Room.Type",'City' Llama a nuevo dataframe df_madrid. | ||
|
||
```{r} | ||
head(airbnb) | ||
``` | ||
|
||
```{r} | ||
df_madrid <- airbnb[airbnb$Room.Type == "Entire home/apt" & airbnb$Neighbourhood != '' & airbnb$City == "Madrid", | ||
c("Neighbourhood", "Accommodates", "Bathrooms", "Bedrooms", "Beds", "Price", "Square.Feet", "Guests.Included", "Extra.People", "Review.Scores.Rating", "Latitude", "Longitude")] | ||
head(df_madrid) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
2. Crea una nueva columna llamada Square.Meters a partir de Square.Feet. Recuerda que un pie cuadrado son 0.092903 metros cuadrados. | ||
|
||
```{r} | ||
df_madrid$Square.Meters <- df_madrid$Square.Feet * 0.092903 | ||
df_madrid[, c("Square.Feet", "Square.Meters")] | ||
head(df_madrid) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
3. ¿Que porcentaje de los apartamentos no muestran los metros cuadrados? Es decir, ¿cuantos tienen NA en Square.Meters? | ||
|
||
```{r} | ||
entire_home_NA <- sum(is.na(df_madrid$Square.Meters)) | ||
porcent <- (entire_home_NA / nrow(df_madrid)) * 100 | ||
cat(porcent) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
4. De todos los apartamentos que tienen un valor de metros cuadrados diferente de NA ¿Que porcentaje de los apartamentos tienen 0 metros cuadrados? | ||
|
||
```{r} | ||
entire_home_real <-entire_home_NA | ||
home_cero <-sum(df_madrid$Square.Meters ==0 & !is.na(df_madrid$Square.Meters)) | ||
porcent_real <- (home_cero / entire_home_real) *100 | ||
cat(porcent_real) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
5. Reemplazar todos los 0m\^2 por NA | ||
|
||
```{r} | ||
df_madrid$Square.Meters[df_madrid$Square.Meters == '' | df_madrid$Square.Meters == 0] <- NA | ||
cat(df_madrid$Square.Meters) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
Hay muchos NAs, vamos a intentar crear un modelo que nos prediga cuantos son los metros cuadrados en función del resto de variables para tratar de rellenar esos NA. Pero **antes de crear el modelo** vamos a hacer: \* pintar el histograma de los metros cuadrados y ver si tenemos que filtrar algún elemento más. \* crear una variable sintética nueva basada en la similitud entre barrios que usaremos en nuestro modelo. | ||
|
||
6. Pinta el histograma de los metros cuadrados y ver si tenemos que filtrar algún elemento más | ||
|
||
```{r} | ||
library(ggplot2) | ||
ggplot(df_madrid, aes(x = Square.Meters)) + | ||
geom_histogram(binwidth = 10, fill = 'red', color = "white") + | ||
labs(title = "Histograma de Metros Cuadrados", x = "Metros Cuadrados", y = "") | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
7. Asigna el valor NA a la columna Square.Meters de los apartamentos que tengan menos de 20 m\^2 | ||
|
||
```{r} | ||
df_madrid$Square.Meters[df_madrid$Square.Meters < 20] <- NA | ||
cat(df_madrid$Square.Meters) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
8. Existen varios Barrios que todas sus entradas de Square.Meters son NA, vamos a eliminar del dataset todos los pisos que pertenecen a estos barrios. | ||
|
||
```{r} | ||
library(dplyr) | ||
df_madrid_Neighbourhood<- unique(df_madrid$Neighbourhood) | ||
Neighbourhood_0 <- character(0) | ||
for (Neighbourhood in df_madrid_Neighbourhood) { | ||
df_madrid_barrio <- df_madrid[df_madrid$Neighbourhood == Neighbourhood, ] | ||
df_na <- all(is.na(df_madrid_barrio$Square.Meters)) | ||
if (df_na) { | ||
Neighbourhood_0 <- c(Neighbourhood_0, Neighbourhood) | ||
} | ||
} | ||
df_madrid <- df_madrid[!df_madrid$Neighbourhood %in% Neighbourhood_0, ] | ||
summary(df_madrid) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
El barrio parece ser un indicador importante para los metros cuadrados de un apartamento. | ||
|
||
Vamos a agrupar los barrios por metros cuadrados. Podemos usar una matriz de similaridad de Tukey tal y como hicimos en el curso de estadística: | ||
|
||
```{r} | ||
tky<-TukeyHSD(aov( formula=Square.Meters~Neighbourhood, data=df_madrid )) | ||
tky.result<-data.frame(tky$Neighbourhood) | ||
cn <-sort(unique(df_madrid$Neighbourhood)) | ||
resm <- matrix(NA, length(cn),length(cn)) | ||
rownames(resm) <- cn | ||
colnames(resm) <- cn | ||
resm[lower.tri(resm) ] <- round(tky.result$p.adj,4) | ||
resm[upper.tri(resm) ] <- t(resm)[upper.tri(resm)] | ||
diag(resm) <- 1 | ||
library(ggplot2) | ||
library(reshape2) | ||
dfResm <- melt(resm) | ||
ggplot(dfResm, aes(x=Var1, y=Var2, fill=value))+ | ||
geom_tile(colour = "black")+ | ||
scale_fill_gradient(low = "white",high = "steelblue")+ | ||
ylab("Neighbourhood")+xlab("Neighbourhood")+theme_bw()+ | ||
theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position="none") | ||
``` | ||
|
||
9. Usando como variable de distancia: 1-resm Dibuja un dendrograma de los diferentes barrios. | ||
|
||
```{r} | ||
library(factoextra) | ||
distancia <- as.dist(1 - resm) | ||
clust_madrid <- hclust(distancia, method = "complete") | ||
dendrogram <- as.dendrogram(clust_madrid) | ||
fviz_dend(clust_madrid, cex = 0.8, lwd = 0.8, k = 38, | ||
rect = TRUE, | ||
k_colors = "jco", | ||
rect_border = "jco", | ||
rect_fill = TRUE, | ||
ggtheme = theme_gray()) | ||
``` | ||
|
||
|
||
------------------------------------------------------------------------ | ||
|
||
10. ¿Que punto de corte sería el aconsejable?, ¿cuantos clusters aparecen? | ||
|
||
El punto de corte sería 0.1. Hay 38 , uno por cada barrio. | ||
|
||
------------------------------------------------------------------------ | ||
|
||
11. Vamos a crear una nueva columna en el dataframe df_madrid con un nuevo identificador marcado por los clusters obtenidos. Esta columna la llamaremos neighb_id | ||
|
||
```{r} | ||
cluster <- cutree(clust_madrid, k = 38) | ||
df_madrid_cluster <- data.frame(Neighbourhood = unique(df_madrid$Neighbourhood), neighb_id = cluster) | ||
df_madrid <- merge(df_madrid, df_madrid_cluster , by = "Neighbourhood", all.x = TRUE) | ||
head(df_madrid) | ||
``` | ||
|
||
|
||
------------------------------------------------------------------------ | ||
|
||
12. Vamos a crear dos grupos, uno test y otro train. | ||
|
||
```{r} | ||
set.seed(42) | ||
idx<-sample(1:nrow(df_madrid),nrow(df_madrid)*0.7) | ||
train.df<-df_madrid[idx,] | ||
test.df<-df_madrid[-idx,] | ||
train.df <- train.df |> select(!c(Neighbourhood,Square.Feet)) | ||
test.df <- test.df |> select(!c(Neighbourhood,Square.Feet)) | ||
cat("Número de valores:", nrow(train.df), "\n") | ||
cat("Número de valores de datos de prueba:", nrow(test.df), "\n") | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
13. Tratamos de predecir los metros cuadrados en función del resto de columnas del dataframe. | ||
|
||
```{r} | ||
summary(train.df) | ||
cor(train.df[,c("Accommodates","Square.Meters","Bathrooms","Bedrooms","Beds","Price","Guests.Included","Extra.People")], use ="pairwise.complete.obs") | ||
``` | ||
|
||
```{r} | ||
modelo1 <- lm(Square.Meters ~ Accommodates + Bathrooms + Bedrooms + Beds + Price + Guests.Included + Extra.People + Review.Scores.Rating + Latitude + Longitude , data = df_train) | ||
summary(modelo1) | ||
modelo2 <- lm(Square.Meters ~ Accommodates + Bedrooms + Price, data = df_train) | ||
summary(modelo2) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
14. Mirad el histograma de los residuos sobre el conjunto de test para evaluar la calidad de vuestro modelo | ||
|
||
```{r} | ||
residuo1 <- residuals(modelo1, newdata = test.df) | ||
residuo2 <- residuals(modelo2, newdata = test.df) | ||
library(ggplot2) | ||
ggplot(data.frame(residuo = residuo1), aes(x = residuo)) + | ||
geom_histogram(binwidth = 1, fill = 'blue', color = "black", alpha = 0.5) + | ||
labs(title = "Histograma de residuos de modelo1", x = "Residuos", y = "") | ||
ggplot(data.frame(residuo = residuo2), aes(x = residuo)) + | ||
geom_histogram(binwidth = 1, fill = 'darkgreen', color = "black", alpha = 0.5) + | ||
labs(title = "Histograma de residuos de modelo2", x = "Residuos", y = "") | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
15. Si tuvieramos un anuncio de un apartamento para 6 personas (Accommodates), con 1 baño, con un precio de 80€/noche y 3 habitaciones en el barrio de Sol, con 3 camas y un review de 80. ¿Cuantos metros cuadrados tendría? Si tu modelo necesita algúna variable adicional puedes inventartela dentro del rango de valores del dataset. ¿Como varía sus metros cuadrados con cada habitación adicional? | ||
|
||
```{r} | ||
df_anuncio <- data.frame( | ||
Accommodates=6, Bathrooms=1, Bedrooms=3, Beds= 3, | ||
Price=80, Guests.Included=3, Review.Scores.Rating = 80, | ||
Extra.People=1, Latitude=mean(df_madrid$Latitude,na.rm = TRUE), Longitude=-mean(df_madrid$Longitude,na.rm = TRUE), neighb_id = "1") | ||
metros_estimados_mod1 <- predict(modelo1, df_anuncio) | ||
paste("Metros cuadrados apartamento con modelo1:", metros_estimados) | ||
metros_habitacion_adicional <- (modelo1$coefficients["Bedrooms"]) | ||
paste("Metros por habitación adicionalcon modelo1:",metros_habitacion_adicional) | ||
metros_estimados_mod2 <- predict(modelo2, df_anuncio) | ||
paste("Metros cuadrados apartamento con modelo2:", metros_estimados) | ||
metros_habitacion_adicional <- (modelo2$coefficients["Bedrooms"]) | ||
paste("Metros por habitación adicionalcon modelo2:",metros_habitacion_adicional) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
16. Rellenar los Square.Meters con valor NA con el estimado con el modelo anterior. | ||
|
||
```{r} | ||
df_madrid_nuevo <- df_madrid | ||
df_madrid_nuevo[is.na(df_madrid_nuevo$Square.Meters),"Square.Meters"] <- predict(modelo1, (df_madrid[is.na(df_madrid$Square.Meters),])) | ||
head(df_madrid_nuevo) | ||
``` | ||
|
||
------------------------------------------------------------------------ | ||
|
||
17. Usar PCA para encontrar el apartamento más cercano a uno dado. Este algoritmo nos ayudaría a dado un apartamento que el algoritmo nos devolvería los 5 apartamentos más similares. | ||
|
||
Crearemos una función tal que le pasemos un apartamento con los siguientes datos: \* Accommodates \* Bathrooms \* Bedrooms \* Beds \* Price \* Guests.Included \* Extra.People \* Review.Scores.Rating \* Latitude \* Longitude \* Square.Meters | ||
|
||
y nos devuelva los 5 más similares de: | ||
------------------------------------------------------------------------ |