#Importing essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the video game sales CSV into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer="vgsales new.csv")
df.head(5)

# 1. Count the missing entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

# 2. Drop every row that has at least one missing value (modifies df in place),
#    then re-count the missing entries to confirm that none remain
df.dropna(axis=0, how="any", inplace=True)
remaining_missing = df.isna().sum()
print(remaining_missing)

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

# 3. Count fully duplicated rows; with keep="first" the first occurrence of a
#    repeated row is not flagged, only its later copies are
duplicate_mask = df.duplicated(keep="first")
print(duplicate_mask.sum())

0

# 4. Show the data type pandas assigned to each column
column_types = df.dtypes
print(column_types)

Rank              int64
Name             object
Platform         object
Year            float64
Genre            object
Publisher        object
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object

# 5. Check the shape of the DataFrame as a (rows, columns) tuple
df.shape

(16291, 11)

# 6. Concise summary of the DataFrame: index range, per-column non-null
#    counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16291 entries, 0 to 16326
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16291 non-null  int64  
 1   Name          16291 non-null  object 
 2   Platform      16291 non-null  object 
 3   Year          16291 non-null  float64
 4   Genre         16291 non-null  object 
 5   Publisher     16291 non-null  object 
 6   NA_Sales      16291 non-null  float64
 7   EU_Sales      16291 non-null  float64
 8   JP_Sales      16291 non-null  float64
 9   Other_Sales   16291 non-null  float64
 10  Global_Sales  16291 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.5+ MB

# 7. Convert 'Year' from float to integer. The cast cannot hit a NaN because
#    rows with missing values were dropped in step 2.
year_as_int = df['Year'].astype(int)
df['Year'] = year_as_int

# 8. Concise summary of the DataFrame after the cast, to confirm that 'Year'
#    is no longer stored as float64
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16291 entries, 0 to 16326
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16291 non-null  int64  
 1   Name          16291 non-null  object 
 2   Platform      16291 non-null  object 
 3   Year          16291 non-null  int32  
 4   Genre         16291 non-null  object 
 5   Publisher     16291 non-null  object 
 6   NA_Sales      16291 non-null  float64
 7   EU_Sales      16291 non-null  float64
 8   JP_Sales      16291 non-null  float64
 9   Other_Sales   16291 non-null  float64
 10  Global_Sales  16291 non-null  float64
dtypes: float64(5), int32(1), int64(1), object(4)
memory usage: 1.4+ MB

# Optional: export the cleaned DataFrame to a CSV file so it can also be visualised in other tools such as Tableau or Power BI.
#df.to_csv('vgsales_clean.csv', index=False)

#print("DataFrame exported successfully.")

# 9. Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
print(summary_stats)

               Rank          Year      NA_Sales      EU_Sales      JP_Sales  \
count  16291.000000  16291.000000  16291.000000  16291.000000  16291.000000   
mean    8290.190228   2006.405561      0.265647      0.147731      0.078833   
std     4792.654450      5.832412      0.822432      0.509303      0.311879   
min        1.000000   1980.000000      0.000000      0.000000      0.000000   
25%     4132.500000   2003.000000      0.000000      0.000000      0.000000   
50%     8292.000000   2007.000000      0.080000      0.020000      0.000000   
75%    12439.500000   2010.000000      0.240000      0.110000      0.040000   
max    16600.000000   2020.000000     41.490000     29.020000     10.220000   

        Other_Sales  Global_Sales  
count  16291.000000  16291.000000  
mean       0.048426      0.540910  
std        0.190083      1.567345  
min        0.000000      0.010000  
25%        0.000000      0.060000  
50%        0.010000      0.170000  
75%        0.040000      0.480000  
max       10.570000     82.740000

# 10. Display data distribution
# Histograms: one 30-bin panel per numeric column, drawn on a 15x10 inch figure
df.hist(figsize=(15, 10), bins=30)
plt.show()

# Boxplots: one panel per numeric column; the 3x3 grid has room for the
# seven numeric columns (Rank, Year and the five sales columns)
df.plot.box(subplots=True, layout=(3, 3), figsize=(15, 10))
plt.show()

# 11. Correlation matrix
# Keep only the numeric columns; the text columns (Name, Platform, Genre,
# Publisher) cannot take part in a correlation
numeric_df = df.select_dtypes(include="number")

# Pairwise Pearson correlation between the numeric columns
corr = numeric_df.corr(method="pearson")

# Annotated heatmap: each cell shows its coefficient, and the diverging
# 'coolwarm' palette separates negative from positive correlations
sns.heatmap(data=corr, annot=True, cmap="coolwarm")
plt.show()

# 12. Pairplot: scatterplots for every pair of numeric columns, with each
#     column's own distribution on the diagonal
sns.pairplot(data=df)
plt.show()

	Rank	Name	Platform	Year	Genre	Publisher	NA_Sales	EU_Sales	Other_Sales	Global_Sales
0	259	Asteroids	2600	1980.0	Shooter	Atari	4.00	0.26	0.05	4.31
1	545	Missile Command	2600	1980.0	Shooter	Atari	2.56	0.17	0.03	2.76
2	1768	Kaboom!	2600	1980.0	Misc	Activision	1.07	0.07	0.01	1.15
3	1971	Defender	2600	1980.0	Misc	Atari	0.99	0.05	0.01	1.05
4	2671	Boxing	2600	1980.0	Fighting	Activision	0.72	0.04	0.01	0.77

Video Game Sales Dataset Descriptive Analysis

About Data Set:

Step 1: Data Wrangling

Step 2: Exploratory Data Analysis

Description

Overall Insights:

My Tableau Visualization

Description of each histogram

Key Observations:

My Tableau Visualization

Description of each histogram

Key Observations:

Color Scale

The color scale ranges from -1 (dark blue) to 1 (dark red):

Key Observations:

Rank

Year

NA_Sales

EU_Sales

JP_Sales

Other_Sales

Global_Sales

General Insights

1. Diagonal Plots (Histograms)

2. Scatterplots (Off-Diagonal Plots)

Key Insights

Overall