-- Stack the twelve monthly 2015 rent tables into a single row set.
-- UNION ALL appends the tables as they are; no rows are de-duplicated here.
WITH rent_2015 AS (
              SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201501`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201502`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201503`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201504`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201505`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201506`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201507`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201508`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201509`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201510`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201511`
    UNION ALL SELECT * FROM `bigquery-public-data.properati_properties_co.properties_rent_201512`
)

-- Keep only the thirteen columns used downstream and give them report-friendly names.
SELECT
    created_on                 AS Creation_Date,
    property_type              AS Type,
    place_name                 AS City,
    state_name                 AS State,
    country_name               AS Country,
    lat                        AS Latitude,
    lon                        AS Longitude,
    price_aprox_local_currency AS Price_COP,
    price_aprox_usd            AS Price_USD,
    surface_covered_in_m2      AS Surface,
    price_usd_per_m2           AS Price_m2_USD,
    floor                      AS Floor,
    rooms                      AS Rooms
FROM rent_2015

# Importing relevant Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the rental listings CSV into the working DataFrame
df = pd.read_csv('Colombia_properties_rent_2015.CSV')
df

# Report the number of missing values in each column, then the table dimensions
print(df.isna().sum())
df.shape

Creation_Date       0
Type                0
City                0
State               0
Country             0
Latitude          110
Longitude         110
Price_COP          21
Price_USD          21
Surface           274
Price_m2_USD     1588
Floor            2385
Rooms            2795
dtype: int64

(5099, 13)

# Discard every listing whose Latitude or Longitude is missing
df = df.dropna(subset=['Latitude', 'Longitude'])
print(df.isna().sum())

Creation_Date       0
Type                0
City                0
State               0
Country             0
Latitude            0
Longitude           0
Price_COP          21
Price_USD          21
Surface           263
Price_m2_USD     1533
Floor            2304
Rooms            2744
dtype: int64

# Fill the gaps in the price and surface columns with each column's own mean
mean_filled = ['Price_COP', 'Price_USD', 'Surface', 'Price_m2_USD']
df[mean_filled] = df[mean_filled].fillna(df[mean_filled].mean())
print(df.isna().sum())

Creation_Date       0
Type                0
City                0
State               0
Country             0
Latitude            0
Longitude           0
Price_COP           0
Price_USD           0
Surface             0
Price_m2_USD        0
Floor            2304
Rooms            2744
dtype: int64

import pandas as pd
import numpy as np

# Assuming df is your DataFrame

# Function to fill missing rooms based on surface range
def fill_rooms(row, surface_ranges):
    """Return the room count for one listing, estimating it when it is missing.

    Parameters
    ----------
    row : mapping or pandas.Series
        A single listing; the keys 'Rooms', 'Type' and 'Surface' are read.
    surface_ranges : dict
        Maps a property type to a list of (min_surface, max_surface, rooms)
        tuples.

    Returns
    -------
    The existing 'Rooms' value when it is not null. Otherwise the room count
    of the first range (in list order) for the listing's type whose bounds,
    inclusive on both ends, contain the listing's 'Surface'. If no range
    matches, the original (null) 'Rooms' value is returned unchanged.
    """
    current = row['Rooms']
    if not pd.isnull(current):
        return current

    # Ranges may overlap; the earliest one in the list takes precedence.
    for low, high, room_count in surface_ranges.get(row['Type'], ()):
        if low <= row['Surface'] <= high:
            return room_count
    return current

# For every property type, record the observed surface span (min, max) of each known room count
surface_ranges = {
    kind: [
        (listings['Surface'].min(), listings['Surface'].max(), room_count)
        for room_count, listings in same_type.groupby('Rooms')
        if not pd.isnull(room_count)
    ]
    for kind, same_type in df.groupby('Type')
}

# Fill the missing Rooms values row by row using those spans
df['Rooms'] = df.apply(fill_rooms, axis=1, args=(surface_ranges,))

# Count the missing values that remain
print(df.isnull().sum())

Creation_Date       0
Type                0
City                0
State               0
Country             0
Latitude            0
Longitude           0
Price_COP           0
Price_USD           0
Surface             0
Price_m2_USD        0
Floor            2304
Rooms              96
dtype: int64

# Remove the Floor column, then discard any row that still has a missing value
df = df.drop(columns=['Floor']).dropna()
print(df.isna().sum())
df.shape

Creation_Date    0
Type             0
City             0
State            0
Country          0
Latitude         0
Longitude        0
Price_COP        0
Price_USD        0
Surface          0
Price_m2_USD     0
Rooms            0
dtype: int64

(4893, 12)

# Cast the Rooms column to an integer dtype
df = df.astype({'Rooms': int})
df

# Flag every row that exactly repeats an earlier row
duplicates = df.duplicated()

# Report how many repeated rows were found
print(f"Number of duplicate rows: {duplicates.sum()}")

# Keep only the first occurrence of each row
df = df.drop_duplicates()

df.shape

Number of duplicate rows: 3509

(1384, 12)

# Check Data Types: print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1384 entries, 0 to 4432
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Creation_Date  1384 non-null   object 
 1   Type           1384 non-null   object 
 2   City           1384 non-null   object 
 3   State          1384 non-null   object 
 4   Country        1384 non-null   object 
 5   Latitude       1384 non-null   float64
 6   Longitude      1384 non-null   float64
 7   Price_COP      1384 non-null   float64
 8   Price_USD      1384 non-null   float64
 9   Surface        1384 non-null   float64
 10  Price_m2_USD   1384 non-null   float64
 11  Rooms          1384 non-null   int32  
dtypes: float64(6), int32(1), object(5)
memory usage: 135.2+ KB

# Round Surface and Price_m2_USD to two decimal places
df = df.round({'Surface': 2, 'Price_m2_USD': 2})
df

# Save the cleaned data set as CSV without writing the index as a column
df.to_csv('Clean_Colombia_properties_rent_2015.csv', index=False)

# Basic information
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 1384 entries, 0 to 4432
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Creation_Date  1384 non-null   object 
 1   Type           1384 non-null   object 
 2   City           1384 non-null   object 
 3   State          1384 non-null   object 
 4   Country        1384 non-null   object 
 5   Latitude       1384 non-null   float64
 6   Longitude      1384 non-null   float64
 7   Price_COP      1384 non-null   float64
 8   Price_USD      1384 non-null   float64
 9   Surface        1384 non-null   float64
 10  Price_m2_USD   1384 non-null   float64
 11  Rooms          1384 non-null   int32  
dtypes: float64(6), int32(1), object(5)
memory usage: 135.2+ KB
None
          Latitude    Longitude     Price_COP     Price_USD      Surface  \
count  1384.000000  1384.000000  1.384000e+03   1384.000000  1384.000000   
mean      7.304399   -74.165171  1.138219e+07   3906.051588   518.659379   
std       3.259636     6.376757  2.069535e+07   7102.473683   913.167668   
min      -0.001614   -77.022000  0.000000e+00      0.000000     1.000000   
25%       4.675000   -74.822000  2.399951e+06    825.920000    77.000000   
50%       4.787000   -74.782000  4.683899e+06   1605.460000   186.000000   
75%      10.998000   -74.054981  1.199935e+07   4129.610000   561.250000   
max      11.245000     0.000907  2.413553e+08  82592.290000  8000.000000   

       Price_m2_USD        Rooms  
count   1384.000000  1384.000000  
mean      11.015166     1.999277  
std        6.985004     1.758976  
min        0.090000     1.000000  
25%        6.880000     1.000000  
50%       10.960000     1.000000  
75%       12.120000     3.000000  
max       66.250000    17.000000

# Numeric columns to chart; the coordinate columns are left out
coordinate_columns = ['Latitude', 'Longitude']
numerical_columns = df.select_dtypes(include='number').columns.difference(coordinate_columns)

# Draw one 30-bin histogram per selected column
df[numerical_columns].hist(figsize=(15, 10), bins=30)
plt.show()
# # Define numerical columns excluding 'Latitude' and 'Longitude'
# numerical_columns = df.select_dtypes(include=[np.number]).columns.difference(['Latitude', 'Longitude'])

# # Plot histograms for the selected numerical columns using Plotly
# for column in numerical_columns:
#     fig = px.histogram(df, x=column, nbins=30, title=f'Histogram of {column}')
#     fig.show()

# Categorical columns to chart ('Creation_Date' and 'Country' are left out)
categorical_columns = ['Type', 'City', 'State']

# The 20 most frequent cities, most common first
top_20_cities = df['City'].value_counts().nlargest(20).index

# Only the rows that belong to one of those 20 cities
df_top_cities = df[df['City'].isin(top_20_cities)]

# States ordered from most to least frequent
state_order = df['State'].value_counts().index

# One count plot per categorical column, stacked vertically
fig, axes = plt.subplots(3, 1, figsize=(15, 20))

# Per-column overrides as (data, bar order, title); unlisted columns use the defaults
plot_settings = {
    'City': (df_top_cities, top_20_cities, 'Distribution of Top 20 Cities'),
    'State': (df, state_order, 'Distribution of State (Descending)'),
}

for ax, col in zip(axes, categorical_columns):
    data, order, title = plot_settings.get(col, (df, None, f'Distribution of {col}'))
    sns.countplot(data=data, x=col, ax=ax, order=order)
    ax.set_title(title)
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    # Category names are long, so turn the tick labels upright
    ax.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()

# Draw seaborn's pairplot for df and display the resulting figure
sns.pairplot(df)
plt.show()

# Define numerical columns excluding 'Latitude' and 'Longitude'
numerical_columns = df.select_dtypes(include=[np.number]).columns.difference(['Latitude', 'Longitude'])

# Plot vertical boxplots (outliers included) for the selected numerical columns.
# squeeze=False makes subplots() return a 2-D array even for a single row, so
# indexing the flattened array also works when only one numeric column exists.
fig, axes = plt.subplots(
    len(numerical_columns), 1,
    figsize=(15, 5 * len(numerical_columns)),
    squeeze=False,
)
axes = axes.ravel()

for ax, col in zip(axes, numerical_columns):
    # y=col puts the values on the vertical axis, which is what makes the box
    # vertical and keeps the 'Value' label on the axis that carries the values.
    sns.boxplot(data=df, y=col, ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Value')

plt.tight_layout()
plt.show()

# Numeric columns to chart; the coordinate columns are left out
numerical_columns = df.select_dtypes(include='number').columns.difference(['Latitude', 'Longitude'])

# Horizontal boxplots with the outlier markers hidden, one row per column
n_plots = len(numerical_columns)
fig, axes = plt.subplots(nrows=n_plots, ncols=1, figsize=(15, 5 * n_plots))

for ax, col in zip(axes, numerical_columns):
    # showfliers=False suppresses the individual outlier points
    sns.boxplot(data=df, x=col, ax=ax, showfliers=False)
    ax.set(title=f'Boxplot of {col} (without outliers)', xlabel=col, ylabel='Value')

plt.tight_layout()
plt.show()

	Creation_Date	Type	City	State	Country	Latitude	Longitude	Price_COP	Price_USD	Surface	Price_m2_USD	Floor	Rooms
0	2015-08-31	house	Paipa	Boyacá	Colombia	5.784000	-73.116000	3821455.54	1307.71	130.0	10.059308	2.0	NaN
1	2015-10-27	house	Cartagena	Bolívar	Colombia	10.424000	-75.549000	8547990.36	2925.14	300.0	9.750467	2.0	NaN
2	2015-07-04	house	Armenia	Quindío	Colombia	4.461000	-75.667000	1508465.45	516.20	41.0	NaN	NaN	NaN
3	2015-08-05	house	Retiro	Antioquia	Colombia	NaN	NaN	25141168.76	8603.36	700.0	NaN	1.0	NaN
4	2015-06-01	apartment	Antioquia	Antioquia	Colombia	6.057394	-75.502792	653678.10	223.69	NaN	NaN	NaN	1.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
5094	2015-05-07	house	Ríomar	Atlántico	Colombia	11.008000	-74.822000	4499743.79	1548.60	45.0	34.413333	1.0	NaN
5095	2015-05-07	house	Ríomar	Atlántico	Colombia	11.012000	-74.825000	5999658.38	2064.80	NaN	4.040705	NaN	NaN
5096	2015-05-08	house	Ríomar	Atlántico	Colombia	11.015000	-74.823000	3999762.57	1376.53	180.0	NaN	NaN	3.0
5097	2015-05-08	house	Puerto Colombia	Atlántico	Colombia	11.023000	-74.885000	1299916.29	447.37	85.0	5.263176	NaN	3.0
5098	2015-05-08	house	San Antonio del Tequendama	Cundinamarca	Colombia	4.616000	-74.352000	4499743.79	1548.60	285.0	0.120984	NaN	4.0

	Creation_Date	Type	City	State	Country	Latitude	Longitude	Price_COP	Price_USD	Surface	Price_m2_USD	Rooms
0	2015-08-31	house	Paipa	Boyacá	Colombia	5.784000	-73.116000	3821455.54	1307.71	130.000000	10.059308	1
1	2015-10-27	house	Cartagena	Bolívar	Colombia	10.424000	-75.549000	8547990.36	2925.14	300.000000	9.750467	1
2	2015-07-04	house	Armenia	Quindío	Colombia	4.461000	-75.667000	1508465.45	516.20	41.000000	10.960724	1
4	2015-06-01	apartment	Antioquia	Antioquia	Colombia	6.057394	-75.502792	653678.10	223.69	1371.975032	10.960724	1
5	2015-06-01	apartment	Antioquia	Antioquia	Colombia	6.057394	-75.502792	653678.10	223.69	1371.975032	10.960724	1
...	...	...	...	...	...	...	...	...	...	...	...	...
5094	2015-05-07	house	Ríomar	Atlántico	Colombia	11.008000	-74.822000	4499743.79	1548.60	45.000000	34.413333	1
5095	2015-05-07	house	Ríomar	Atlántico	Colombia	11.012000	-74.825000	5999658.38	2064.80	1371.975032	4.040705	1
5096	2015-05-08	house	Ríomar	Atlántico	Colombia	11.015000	-74.823000	3999762.57	1376.53	180.000000	10.960724	3
5097	2015-05-08	house	Puerto Colombia	Atlántico	Colombia	11.023000	-74.885000	1299916.29	447.37	85.000000	5.263176	3
5098	2015-05-08	house	San Antonio del Tequendama	Cundinamarca	Colombia	4.616000	-74.352000	4499743.79	1548.60	285.000000	0.120984	4

	Creation_Date	Type	City	State	Country	Latitude	Longitude	Price_COP	Price_USD	Surface	Price_m2_USD	Rooms
0	2015-08-31	house	Paipa	Boyacá	Colombia	5.784000	-73.116000	3821455.54	1307.71	130.00	10.06	1
1	2015-10-27	house	Cartagena	Bolívar	Colombia	10.424000	-75.549000	8547990.36	2925.14	300.00	9.75	1
2	2015-07-04	house	Armenia	Quindío	Colombia	4.461000	-75.667000	1508465.45	516.20	41.00	10.96	1
4	2015-06-01	apartment	Antioquia	Antioquia	Colombia	6.057394	-75.502792	653678.10	223.69	1371.98	10.96	1
8	2015-07-01	apartment	Antioquia	Antioquia	Colombia	6.057394	-75.502792	3419353.94	1170.11	1371.98	10.96	2
...	...	...	...	...	...	...	...	...	...	...	...	...
4428	2015-01-29	apartment	Bucaramanga	Santander	Colombia	8.788330	-76.035560	819112.60	281.90	40.00	10.96	1
4429	2015-01-29	apartment	Bucaramanga	Santander	Colombia	8.788330	-76.035560	819112.60	281.90	37.00	10.96	1
4430	2015-01-29	apartment	Bucaramanga	Santander	Colombia	8.788330	-76.035560	1222566.96	420.75	54.00	10.96	1
4431	2015-01-29	apartment	Bucaramanga	Santander	Colombia	8.788330	-76.035560	1650458.13	568.01	27.00	10.96	1
4432	2015-01-29	house	Bucaramanga	Santander	Colombia	8.788330	-76.035560	3056402.88	1051.87	180.00	10.96	3

Colombia Property Rent 2015 Descriptive Data Analysis

Dataset Source:

The twelve monthly 2015 tables were retrieved and combined into a single result with the following query:

Data Wrangling

Exploratory Data Analysis

This is Dashboard 1/2 of the representation of the data in Tableau

This is Dashboard 2/2 of the representation of the data in Tableau