In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
from sklearn import datasets, linear_model
import statsmodels.formula.api
In [9]:
#data from https://github.com/jennybc/gapminder/blob/master/data-raw/08_gap-every-five-years.tsv
data = pd.read_csv("gap.tsv", sep='\t')
data.head()
Out[9]:
country continent year lifeExp pop gdpPercap
0 Afghanistan Asia 1952 28.801 8425333 779.445314
1 Afghanistan Asia 1957 30.332 9240934 820.853030
2 Afghanistan Asia 1962 31.997 10267083 853.100710
3 Afghanistan Asia 1967 34.020 11537966 836.197138
4 Afghanistan Asia 1972 36.088 13079460 739.981106
In [10]:
#violin plot of life expectancy vs time
sb.violinplot(data['year'],data['lifeExp'])
plt.ylabel('Life Expectancy')
plt.show()
In [11]:
#Over time life expextancy has increased on average besides some outliers.
In [12]:
#For most years the life expectancy has a large range across countries. The earlier years are skewed towards lower
#life expectancy, whereas the later ones are skewed towards higher life expectancy. Only the later plots are somewhat
#unimodal with values congregating around 70 years. 1967-1972 are fairly symetric with most values either around 70 or
#45.
In [13]:
#I would reject the null hypothesis of no relationship as I can see the average consistently increasing in tandem with
#the year.
In [14]:
#I would predict lower residuals until around 1987 when the increasing averages seem to flatten out.
#With this information I would predict the residuals plot to be close to the linear regression on average.
In [15]:
#Since the linear regression model generally cannot fit the data perfectly I suspect a violin plot of the residuals
#would be fairly flat along a y value greater than 0 as all variables will be around the same distance from the line.
In [16]:
#linear regression of life expectancy vs time
x = data['year'].values.reshape(-1,1)
y = data['lifeExp'].values.reshape(-1,1)
    
reg = linear_model.LinearRegression()
reg.fit(x,y)
pred = reg.predict(x)

plt.scatter(data['year'],data['lifeExp'])
plt.plot(x,pred)
plt.show()
In [17]:
ols = statsmodels.formula.api.ols(formula="lifeExp ~ year", data=data).fit()
print(ols.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                lifeExp   R-squared:                       0.190
Model:                            OLS   Adj. R-squared:                  0.189
Method:                 Least Squares   F-statistic:                     398.6
Date:                Fri, 20 Sep 2019   Prob (F-statistic):           7.55e-80
Time:                        18:54:32   Log-Likelihood:                -6597.9
No. Observations:                1704   AIC:                         1.320e+04
Df Residuals:                    1702   BIC:                         1.321e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -585.6522     32.314    -18.124      0.000    -649.031    -522.273
year           0.3259      0.016     19.965      0.000       0.294       0.358
==============================================================================
Omnibus:                      386.124   Durbin-Watson:                   0.197
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               90.750
Skew:                          -0.268   Prob(JB):                     1.97e-20
Kurtosis:                       2.004   Cond. No.                     2.27e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.27e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [18]:
#Our ols produces a line modeled by the equation y=x*0.3259-585.6522, indicating life expectancy increases by ~32.6%
#annually on average.
In [19]:
#With a p-value of 7.55e-80 which is much less than a signifigance value of .05 so we can reject the null hypothesis.
In [20]:
#plot the residuals of our linear regression
l = []
for index,row in data.iterrows():
    l.append(abs((row['year']*0.3259-585.6522)-row['lifeExp']))
    
sb.violinplot(data['year'],l)
plt.ylabel('Residuals')
plt.show()
In [21]:
#My statment that the reiduals would increase post 1987 turned out true as there seem to be more outliers in this
#period that are very far away but the bulk of the rediduals are clumped around a mode of 10 years off of the line.
In [22]:
#violion plot of residuals vs continent
sb.violinplot(data['continent'],l)
plt.ylabel('Residuals')
plt.show()
In [23]:
#There's certianly a range of average residuals by continent, suggesting that regression models would be more accurate
#if made for a specific continent.
In [24]:
#make a scatter plot of life expectancy vs year grouped by continent
df2 = data.groupby(['continent','year'],as_index=True).mean()[['lifeExp']]

y,c = [],[]
laf,lam,las,le,lo = [],[],[],[],[]
for index,row in df2.iterrows():
    y.append(index[1])
    c.append(index[0])  

df2['year'] = y
df2['continent'] = c
In [25]:
year = []
laf,lam,las,le,lo = [],[],[],[],[]
for index,row in df2.iterrows():
    if index[1] not in year:
        year.append(index[1])
    if index[0] == 'Africa':
        laf.append(row['lifeExp'])
    if index[0] == 'Americas':
        lam.append(row['lifeExp'])
    if index[0] == 'Asia':
        las.append(row['lifeExp'])
    if index[0] == 'Europe':
        le.append(row['lifeExp'])
    if index[0] == 'Oceania':
        lo.append(row['lifeExp'])
In [26]:
x = df2['year'].values.reshape(-1,1)
y = df2['lifeExp'].values.reshape(-1,1)

reg = linear_model.LinearRegression()
reg.fit(x,y)
pred = reg.predict(x)
In [27]:
plt.scatter(year,laf,label='Africa')
plt.scatter(year,lam,label='Americas')
plt.scatter(year,las,label='Asia')
plt.scatter(year,le,label='Europe')
plt.scatter(year,lo,label='Oceania')

plt.plot(x,pred)
plt.ylabel('Average life expectancy')
plt.legend()
plt.show()
In [28]:
#There should be an interaction term for continent and year because the continent has a huge effect on average life
#expectancy. As observed in the plot there is a huge gap between the averages for Africa and Oceania/Europe, indicating
#that the linear regression will have high residuals.
In [37]:
#make a model that includes interaction between year and continent
ols2 = statsmodels.formula.api.ols(formula="lifeExp ~ year * continent", data=df2).fit()
print(ols2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                lifeExp   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.991
Method:                 Least Squares   F-statistic:                     732.4
Date:                Fri, 20 Sep 2019   Prob (F-statistic):           7.30e-50
Time:                        18:56:10   Log-Likelihood:                -80.688
No. Observations:                  60   AIC:                             181.4
Df Residuals:                      50   BIC:                             202.3
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================================
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept                   -524.2578     33.677    -15.567      0.000    -591.899    -456.616
continent[T.Americas]       -138.8484     47.626     -2.915      0.005    -234.508     -43.189
continent[T.Asia]           -312.6330     47.626     -6.564      0.000    -408.292    -216.974
continent[T.Europe]          156.8469     47.626      3.293      0.002      61.187     252.506
continent[T.Oceania]         182.3499     47.626      3.829      0.000      86.691     278.009
year                           0.2895      0.017     17.019      0.000       0.255       0.324
year:continent[T.Americas]     0.0781      0.024      3.247      0.002       0.030       0.126
year:continent[T.Asia]         0.1636      0.024      6.800      0.000       0.115       0.212
year:continent[T.Europe]      -0.0676      0.024     -2.810      0.007      -0.116      -0.019
year:continent[T.Oceania]     -0.0793      0.024     -3.294      0.002      -0.128      -0.031
==============================================================================
Omnibus:                        0.319   Durbin-Watson:                   0.416
Prob(Omnibus):                  0.853   Jarque-Bera (JB):                0.199
Skew:                          -0.139   Prob(JB):                        0.905
Kurtosis:                       2.947   Cond. No.                     1.32e+06
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.32e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [30]:
#As you can see from the table above all variables are significantly different from 0.
In [31]:
ols2.params
#The data bellow shows the slope/coeficient for each country as a function of the year.
Out[31]:
Intercept                    -524.257846
continent[T.Americas]        -138.848447
continent[T.Asia]            -312.633049
continent[T.Europe]           156.846852
continent[T.Oceania]          182.349883
year                            0.289529
year:continent[T.Americas]      0.078122
year:continent[T.Asia]          0.163593
year:continent[T.Europe]       -0.067597
year:continent[T.Oceania]      -0.079257
dtype: float64
In [32]:
ols.fvalue
Out[32]:
398.6047457117622
In [33]:
ols2.fvalue
Out[33]:
732.3996046857956
In [34]:
#ols2 has a higher fvalue and therefore is a better model
In [38]:
#residuals vs year for interaction model
l = []
for index,row in data.iterrows():
    l.append(abs((row['year']*0.289529-524.257846)-row['lifeExp']))
In [39]:
sb.violinplot(data['year'],l)
plt.show()