In [9]:
# Plotly setup for VS Code: pick the default renderer, then initialise
# Plotly's notebook mode (presumably so later figures render inline).
# NOTE(review): the renderer is set to 'notebook' even though this comment
# targets VS Code — confirm this is the renderer the intended front end needs.
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.offline as py
py.init_notebook_mode(connected=True)
In [10]:
#import all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [11]:
# Import the Plotly modules used by the rest of the notebook (one line each;
# the figure_factory import used to be listed twice).
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
from plotly.figure_factory import create_table

# `py` is already `plotly.offline`, so call init_notebook_mode on it directly,
# the same way the setup cell does, instead of going through `py.offline`.
py.init_notebook_mode(connected=True)
In [12]:
# Re-initialise Plotly's notebook mode. `py` is `plotly.offline`, so the
# function is called on it directly, matching the first setup cell.
py.init_notebook_mode(connected=True)
In [38]:
# Load the Gapminder sample dataset that ships with Plotly Express.
gapminder = px.data.gapminder()

# Step 1: count the null entries in every column and report them.
missing_per_column = gapminder.isnull().sum()
print("Missing values per column:", missing_per_column, sep="\n")
Missing values per column: country 0 continent 0 year 0 lifeExp 0 pop 0 gdpPercap 0 iso_alpha 0 iso_num 0 dtype: int64
In [39]:
# Step 2: flag rows that repeat an earlier row across all columns
# (subset=None and keep="first" are the pandas defaults, spelled out here).
duplicate_rows = gapminder.duplicated(subset=None, keep="first")
print(f"Number of duplicate rows: {duplicate_rows.sum()}")
Number of duplicate rows: 0
In [40]:
# Step 3: list the dtype pandas inferred for each column.
column_dtypes = gapminder.dtypes
print("\nData types of each column:", column_dtypes, sep="\n")
Data types of each column: country object continent object year int64 lifeExp float64 pop int64 gdpPercap float64 iso_alpha object iso_num int64 dtype: object
In [41]:
# Step 4: categorical consistency — look at the distinct continent labels
# and at how many different countries the dataset contains.
continent_labels = gapminder['continent'].unique()
country_count = gapminder['country'].nunique()

print("\nUnique continents:")
print(continent_labels)
print("\nNumber of unique countries:")
print(country_count)
Unique continents: ['Asia' 'Europe' 'Africa' 'Americas' 'Oceania'] Number of unique countries: 142
In [42]:
# Step 5: descriptive statistics for the numeric measures; the min/max rows
# are what to eyeball for outliers.
numerical_cols = ['lifeExp', 'pop', 'gdpPercap']
numeric_summary = gapminder[numerical_cols].describe()
print("\nSummary statistics for numerical columns:", numeric_summary, sep="\n")
Summary statistics for numerical columns: lifeExp pop gdpPercap count 1704.000000 1.704000e+03 1704.000000 mean 59.474439 2.960121e+07 7215.327081 std 12.917107 1.061579e+08 9857.454543 min 23.599000 6.001100e+04 241.165876 25% 48.198000 2.793664e+06 1202.060309 50% 60.712500 7.023596e+06 3531.846989 75% 70.845500 1.958522e+07 9325.462346 max 82.603000 1.318683e+09 113523.132900
In [43]:
# Preview the first rows of the dataset to see its columns and typical values.
gapminder.head()
Out[43]:
country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num | |
---|---|---|---|---|---|---|---|---|
0 | Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.445314 | AFG | 4 |
1 | Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.853030 | AFG | 4 |
2 | Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.100710 | AFG | 4 |
3 | Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.197138 | AFG | 4 |
4 | Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.981106 | AFG | 4 |
In [44]:
# Check the shape of the data as a (rows, columns) tuple.
gapminder.shape
Out[44]:
(1704, 8)
In [45]:
# List the column labels available in the dataset.
gapminder.columns
Out[45]:
Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap', 'iso_alpha', 'iso_num'], dtype='object')
In [46]:
# Print the frame summary: index range, per-column non-null counts, dtypes
# and memory usage.
gapminder.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1704 entries, 0 to 1703 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 1704 non-null object 1 continent 1704 non-null object 2 year 1704 non-null int64 3 lifeExp 1704 non-null float64 4 pop 1704 non-null int64 5 gdpPercap 1704 non-null float64 6 iso_alpha 1704 non-null object 7 iso_num 1704 non-null int64 dtypes: float64(2), int64(3), object(3) memory usage: 106.6+ KB
In [47]:
# Descriptive statistics for the numeric columns. Note that this also covers
# `year` and `iso_num`, whose summary statistics are not meaningful measures.
gapminder.describe()
Out[47]:
year | lifeExp | pop | gdpPercap | iso_num | |
---|---|---|---|---|---|
count | 1704.00000 | 1704.000000 | 1.704000e+03 | 1704.000000 | 1704.000000 |
mean | 1979.50000 | 59.474439 | 2.960121e+07 | 7215.327081 | 425.880282 |
std | 17.26533 | 12.917107 | 1.061579e+08 | 9857.454543 | 248.305709 |
min | 1952.00000 | 23.599000 | 6.001100e+04 | 241.165876 | 4.000000 |
25% | 1965.75000 | 48.198000 | 2.793664e+06 | 1202.060309 | 208.000000 |
50% | 1979.50000 | 60.712500 | 7.023596e+06 | 3531.846989 | 410.000000 |
75% | 1993.25000 | 70.845500 | 1.958522e+07 | 9325.462346 | 638.000000 |
max | 2007.00000 | 82.603000 | 1.318683e+09 | 113523.132900 | 894.000000 |
In [48]:
gapminder['country'].value_counts() # count how many rows are recorded for each country
Out[48]:
country Afghanistan 12 Albania 12 Algeria 12 Angola 12 Argentina 12 .. Vietnam 12 West Bank and Gaza 12 Yemen, Rep. 12 Zambia 12 Zimbabwe 12 Name: count, Length: 142, dtype: int64
In [49]:
gapminder['continent'].value_counts() # count how many rows are recorded for each continent
Out[49]:
continent Africa 624 Asia 396 Europe 360 Americas 300 Oceania 24 Name: count, dtype: int64
In [50]:
import plotly.figure_factory as ff

# Build a Plotly table figure from a short preview of the Gapminder rows,
# then render it.
preview_rows = gapminder.head()
table_fig = ff.create_table(preview_rows)
table_fig.show()
Bar chart¶
In [51]:
# Select Canada's rows from the already-loaded `gapminder` frame rather than
# re-reading the dataset with a second px.data.gapminder() call; this keeps
# the cell consistent with the other filters built from `gapminder`.
canada_data = gapminder.query("country == 'Canada'")
canada_data
Out[51]:
country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num | |
---|---|---|---|---|---|---|---|---|
240 | Canada | Americas | 1952 | 68.750 | 14785584 | 11367.16112 | CAN | 124 |
241 | Canada | Americas | 1957 | 69.960 | 17010154 | 12489.95006 | CAN | 124 |
242 | Canada | Americas | 1962 | 71.300 | 18985849 | 13462.48555 | CAN | 124 |
243 | Canada | Americas | 1967 | 72.130 | 20819767 | 16076.58803 | CAN | 124 |
244 | Canada | Americas | 1972 | 72.880 | 22284500 | 18970.57086 | CAN | 124 |
245 | Canada | Americas | 1977 | 74.210 | 23796400 | 22090.88306 | CAN | 124 |
246 | Canada | Americas | 1982 | 75.760 | 25201900 | 22898.79214 | CAN | 124 |
247 | Canada | Americas | 1987 | 76.860 | 26549700 | 26626.51503 | CAN | 124 |
248 | Canada | Americas | 1992 | 77.950 | 28523502 | 26342.88426 | CAN | 124 |
249 | Canada | Americas | 1997 | 78.610 | 30305843 | 28954.92589 | CAN | 124 |
250 | Canada | Americas | 2002 | 79.770 | 31902268 | 33328.96507 | CAN | 124 |
251 | Canada | Americas | 2007 | 80.653 | 33390141 | 36319.23501 | CAN | 124 |
In [52]:
# Bar chart of Canada's life expectancy (lifeExp) for each survey year.
px.bar(
    data_frame=canada_data,
    x='year',
    y='lifeExp',
    height=400,
)
In [53]:
# Bar chart of Canada's population per year, with each bar coloured by life
# expectancy; the `pop` axis is relabelled to read "population".
axis_labels = {'pop': 'population'}
px.bar(
    data_frame=canada_data,
    x='year',
    y='pop',
    color='lifeExp',
    height=400,
    labels=axis_labels,
)
Life expectancy vs GDP per capita
In [54]:
# Keep only the rows belonging to the 2007 survey year.
is_2007 = gapminder['year'] == 2007
gapminder2007 = gapminder.loc[is_2007]
gapminder2007
Out[54]:
country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num | |
---|---|---|---|---|---|---|---|---|
11 | Afghanistan | Asia | 2007 | 43.828 | 31889923 | 974.580338 | AFG | 4 |
23 | Albania | Europe | 2007 | 76.423 | 3600523 | 5937.029526 | ALB | 8 |
35 | Algeria | Africa | 2007 | 72.301 | 33333216 | 6223.367465 | DZA | 12 |
47 | Angola | Africa | 2007 | 42.731 | 12420476 | 4797.231267 | AGO | 24 |
59 | Argentina | Americas | 2007 | 75.320 | 40301927 | 12779.379640 | ARG | 32 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
1655 | Vietnam | Asia | 2007 | 74.249 | 85262356 | 2441.576404 | VNM | 704 |
1667 | West Bank and Gaza | Asia | 2007 | 73.422 | 4018332 | 3025.349798 | PSE | 275 |
1679 | Yemen, Rep. | Asia | 2007 | 62.698 | 22211743 | 2280.769906 | YEM | 887 |
1691 | Zambia | Africa | 2007 | 42.384 | 11746035 | 1271.211593 | ZMB | 894 |
1703 | Zimbabwe | Africa | 2007 | 43.487 | 12311143 | 469.709298 | ZWE | 716 |
142 rows × 8 columns
In [55]:
# Scatter plot of GDP per capita (x) against life expectancy (y) for 2007.
px.scatter(
    data_frame=gapminder2007,
    x='gdpPercap',
    y='lifeExp',
)
In [56]:
# Same 2007 scatter, with the points coloured by continent.
px.scatter(
    data_frame=gapminder2007,
    x='gdpPercap',
    y='lifeExp',
    color='continent',
)
In [57]:
# Bubble plot: marker size follows population, capped at size_max.
px.scatter(
    data_frame=gapminder2007,
    x='gdpPercap',
    y='lifeExp',
    color='continent',
    size='pop',
    size_max=60,
)
In [58]:
# Bubble plot with the country name used as the hover title of each marker.
px.scatter(
    data_frame=gapminder2007,
    x='gdpPercap',
    y='lifeExp',
    color='continent',
    size='pop',
    size_max=60,
    hover_name='country',
)
In [59]:
# Facet plot over all years: one panel per continent, linear x axis.
px.scatter(
    data_frame=gapminder,
    x='gdpPercap',
    y='lifeExp',
    color='continent',
    size='pop',
    size_max=60,
    hover_name='country',
    facet_col='continent',
)
In [60]:
# Facet plot over all years: one panel per continent, this time with a
# logarithmic x axis for GDP per capita.
px.scatter(
    data_frame=gapminder,
    x='gdpPercap',
    y='lifeExp',
    color='continent',
    size='pop',
    size_max=60,
    hover_name='country',
    facet_col='continent',
    log_x=True,
)
In [61]:
# Animated bubble plot: one frame per year, with each country tracked across
# frames and a fixed y range so the axis does not change between frames.
life_exp_range = [23, 90]
px.scatter(
    data_frame=gapminder,
    x='gdpPercap',
    y='lifeExp',
    color='continent',
    size='pop',
    size_max=40,
    hover_name='country',
    log_x=True,
    animation_frame='year',
    animation_group='country',
    range_y=life_exp_range,
)
In [62]:
# Animated world map: countries located by their iso_alpha code and coloured
# by life expectancy, one animation frame per year.
plasma_scale = px.colors.sequential.Plasma
px.choropleth(
    data_frame=gapminder,
    locations='iso_alpha',
    color='lifeExp',
    hover_name='country',
    animation_frame='year',
    color_continuous_scale=plasma_scale,
    projection='natural earth',
)