import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
from sklearn import datasets, linear_model
import statsmodels.formula.api
#data from https://github.com/jennybc/gapminder/blob/master/data-raw/08_gap-every-five-years.tsv
data = pd.read_csv("gap.tsv", sep='\t')
data.head()
#violin plot of life expectancy vs time
sb.violinplot(data['year'],data['lifeExp'])
plt.ylabel('Life Expectancy')
plt.show()
#Over time life expextancy has increased on average besides some outliers.
#For most years the life expectancy has a large range across countries. The earlier years are skewed towards lower
#life expectancy, whereas the later ones are skewed towards higher life expectancy. Only the later plots are somewhat
#unimodal with values congregating around 70 years. 1967-1972 are fairly symetric with most values either around 70 or
#45.
#I would reject the null hypothesis of no relationship as I can see the average consistently increasing in tandem with
#the year.
#I would predict lower residuals until around 1987 when the increasing averages seem to flatten out.
#With this information I would predict the residuals plot to be close to the linear regression on average.
#Since the linear regression model generally cannot fit the data perfectly I suspect a violin plot of the residuals
#would be fairly flat along a y value greater than 0 as all variables will be around the same distance from the line.
#linear regression of life expectancy vs time
x = data['year'].values.reshape(-1,1)
y = data['lifeExp'].values.reshape(-1,1)
reg = linear_model.LinearRegression()
reg.fit(x,y)
pred = reg.predict(x)
plt.scatter(data['year'],data['lifeExp'])
plt.plot(x,pred)
plt.show()
ols = statsmodels.formula.api.ols(formula="lifeExp ~ year", data=data).fit()
print(ols.summary())
#Our ols produces a line modeled by the equation y=x*0.3259-585.6522, indicating life expectancy increases by ~32.6%
#annually on average.
#With a p-value of 7.55e-80 which is much less than a signifigance value of .05 so we can reject the null hypothesis.
#plot the residuals of our linear regression
l = []
for index,row in data.iterrows():
l.append(abs((row['year']*0.3259-585.6522)-row['lifeExp']))
sb.violinplot(data['year'],l)
plt.ylabel('Residuals')
plt.show()
#My statment that the reiduals would increase post 1987 turned out true as there seem to be more outliers in this
#period that are very far away but the bulk of the rediduals are clumped around a mode of 10 years off of the line.
#violion plot of residuals vs continent
sb.violinplot(data['continent'],l)
plt.ylabel('Residuals')
plt.show()
#There's certianly a range of average residuals by continent, suggesting that regression models would be more accurate
#if made for a specific continent.
#make a scatter plot of life expectancy vs year grouped by continent
df2 = data.groupby(['continent','year'],as_index=True).mean()[['lifeExp']]
y,c = [],[]
laf,lam,las,le,lo = [],[],[],[],[]
for index,row in df2.iterrows():
y.append(index[1])
c.append(index[0])
df2['year'] = y
df2['continent'] = c
year = []
laf,lam,las,le,lo = [],[],[],[],[]
for index,row in df2.iterrows():
if index[1] not in year:
year.append(index[1])
if index[0] == 'Africa':
laf.append(row['lifeExp'])
if index[0] == 'Americas':
lam.append(row['lifeExp'])
if index[0] == 'Asia':
las.append(row['lifeExp'])
if index[0] == 'Europe':
le.append(row['lifeExp'])
if index[0] == 'Oceania':
lo.append(row['lifeExp'])
x = df2['year'].values.reshape(-1,1)
y = df2['lifeExp'].values.reshape(-1,1)
reg = linear_model.LinearRegression()
reg.fit(x,y)
pred = reg.predict(x)
plt.scatter(year,laf,label='Africa')
plt.scatter(year,lam,label='Americas')
plt.scatter(year,las,label='Asia')
plt.scatter(year,le,label='Europe')
plt.scatter(year,lo,label='Oceania')
plt.plot(x,pred)
plt.ylabel('Average life expectancy')
plt.legend()
plt.show()
#There should be an interaction term for continent and year because the continent has a huge effect on average life
#expectancy. As observed in the plot there is a huge gap between the averages for Africa and Oceania/Europe, indicating
#that the linear regression will have high residuals.
#make a model that includes interaction between year and continent
ols2 = statsmodels.formula.api.ols(formula="lifeExp ~ year * continent", data=df2).fit()
print(ols2.summary())
#As you can see from the table above all variables are significantly different from 0.
ols2.params
#The data bellow shows the slope/coeficient for each country as a function of the year.
ols.fvalue
ols2.fvalue
#ols2 has a higher fvalue and therefore is a better model
#residuals vs year for interaction model
l = []
for index,row in data.iterrows():
l.append(abs((row['year']*0.289529-524.257846)-row['lifeExp']))
sb.violinplot(data['year'],l)
plt.show()