import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
from sklearn import datasets, linear_model
import statsmodels.formula.api

#data from https://github.com/jennybc/gapminder/blob/master/data-raw/08_gap-every-five-years.tsv
data = pd.read_csv("gap.tsv", sep='\t')
data.head()

#violin plot of life expectancy vs time
sb.violinplot(data['year'],data['lifeExp'])
plt.ylabel('Life Expectancy')
plt.show()

#Over time life expextancy has increased on average besides some outliers.

#For most years the life expectancy has a large range across countries. The earlier years are skewed towards lower
#life expectancy, whereas the later ones are skewed towards higher life expectancy. Only the later plots are somewhat
#unimodal with values congregating around 70 years. 1967-1972 are fairly symetric with most values either around 70 or
#45.

#I would reject the null hypothesis of no relationship as I can see the average consistently increasing in tandem with
#the year.

#I would predict lower residuals until around 1987 when the increasing averages seem to flatten out.
#With this information I would predict the residuals plot to be close to the linear regression on average.

#Since the linear regression model generally cannot fit the data perfectly I suspect a violin plot of the residuals
#would be fairly flat along a y value greater than 0 as all variables will be around the same distance from the line.

#linear regression of life expectancy vs time
x = data['year'].values.reshape(-1,1)
y = data['lifeExp'].values.reshape(-1,1)
    
reg = linear_model.LinearRegression()
reg.fit(x,y)
pred = reg.predict(x)

plt.scatter(data['year'],data['lifeExp'])
plt.plot(x,pred)
plt.show()

ols = statsmodels.formula.api.ols(formula="lifeExp ~ year", data=data).fit()
print(ols.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                lifeExp   R-squared:                       0.190
Model:                            OLS   Adj. R-squared:                  0.189
Method:                 Least Squares   F-statistic:                     398.6
Date:                Fri, 20 Sep 2019   Prob (F-statistic):           7.55e-80
Time:                        18:54:32   Log-Likelihood:                -6597.9
No. Observations:                1704   AIC:                         1.320e+04
Df Residuals:                    1702   BIC:                         1.321e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -585.6522     32.314    -18.124      0.000    -649.031    -522.273
year           0.3259      0.016     19.965      0.000       0.294       0.358
==============================================================================
Omnibus:                      386.124   Durbin-Watson:                   0.197
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               90.750
Skew:                          -0.268   Prob(JB):                     1.97e-20
Kurtosis:                       2.004   Cond. No.                     2.27e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.27e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

#Our ols produces a line modeled by the equation y=x*0.3259-585.6522, indicating life expectancy increases by ~32.6%
#annually on average.

#With a p-value of 7.55e-80 which is much less than a signifigance value of .05 so we can reject the null hypothesis.

#plot the residuals of our linear regression
l = []
for index,row in data.iterrows():
    l.append(abs((row['year']*0.3259-585.6522)-row['lifeExp']))
    
sb.violinplot(data['year'],l)
plt.ylabel('Residuals')
plt.show()

#My statment that the reiduals would increase post 1987 turned out true as there seem to be more outliers in this
#period that are very far away but the bulk of the rediduals are clumped around a mode of 10 years off of the line.

#violion plot of residuals vs continent
sb.violinplot(data['continent'],l)
plt.ylabel('Residuals')
plt.show()

#There's certianly a range of average residuals by continent, suggesting that regression models would be more accurate
#if made for a specific continent.

#make a scatter plot of life expectancy vs year grouped by continent
df2 = data.groupby(['continent','year'],as_index=True).mean()[['lifeExp']]

y,c = [],[]
laf,lam,las,le,lo = [],[],[],[],[]
for index,row in df2.iterrows():
    y.append(index[1])
    c.append(index[0])  

df2['year'] = y
df2['continent'] = c

year = []
laf,lam,las,le,lo = [],[],[],[],[]
for index,row in df2.iterrows():
    if index[1] not in year:
        year.append(index[1])
    if index[0] == 'Africa':
        laf.append(row['lifeExp'])
    if index[0] == 'Americas':
        lam.append(row['lifeExp'])
    if index[0] == 'Asia':
        las.append(row['lifeExp'])
    if index[0] == 'Europe':
        le.append(row['lifeExp'])
    if index[0] == 'Oceania':
        lo.append(row['lifeExp'])

x = df2['year'].values.reshape(-1,1)
y = df2['lifeExp'].values.reshape(-1,1)

reg = linear_model.LinearRegression()
reg.fit(x,y)
pred = reg.predict(x)

plt.scatter(year,laf,label='Africa')
plt.scatter(year,lam,label='Americas')
plt.scatter(year,las,label='Asia')
plt.scatter(year,le,label='Europe')
plt.scatter(year,lo,label='Oceania')

plt.plot(x,pred)
plt.ylabel('Average life expectancy')
plt.legend()
plt.show()

#There should be an interaction term for continent and year because the continent has a huge effect on average life
#expectancy. As observed in the plot there is a huge gap between the averages for Africa and Oceania/Europe, indicating
#that the linear regression will have high residuals.

#make a model that includes interaction between year and continent
ols2 = statsmodels.formula.api.ols(formula="lifeExp ~ year * continent", data=df2).fit()
print(ols2.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                lifeExp   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.991
Method:                 Least Squares   F-statistic:                     732.4
Date:                Fri, 20 Sep 2019   Prob (F-statistic):           7.30e-50
Time:                        18:56:10   Log-Likelihood:                -80.688
No. Observations:                  60   AIC:                             181.4
Df Residuals:                      50   BIC:                             202.3
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================================
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept                   -524.2578     33.677    -15.567      0.000    -591.899    -456.616
continent[T.Americas]       -138.8484     47.626     -2.915      0.005    -234.508     -43.189
continent[T.Asia]           -312.6330     47.626     -6.564      0.000    -408.292    -216.974
continent[T.Europe]          156.8469     47.626      3.293      0.002      61.187     252.506
continent[T.Oceania]         182.3499     47.626      3.829      0.000      86.691     278.009
year                           0.2895      0.017     17.019      0.000       0.255       0.324
year:continent[T.Americas]     0.0781      0.024      3.247      0.002       0.030       0.126
year:continent[T.Asia]         0.1636      0.024      6.800      0.000       0.115       0.212
year:continent[T.Europe]      -0.0676      0.024     -2.810      0.007      -0.116      -0.019
year:continent[T.Oceania]     -0.0793      0.024     -3.294      0.002      -0.128      -0.031
==============================================================================
Omnibus:                        0.319   Durbin-Watson:                   0.416
Prob(Omnibus):                  0.853   Jarque-Bera (JB):                0.199
Skew:                          -0.139   Prob(JB):                        0.905
Kurtosis:                       2.947   Cond. No.                     1.32e+06
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.32e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

#As you can see from the table above all variables are significantly different from 0.

ols2.params
#The data bellow shows the slope/coeficient for each country as a function of the year.

Intercept                    -524.257846
continent[T.Americas]        -138.848447
continent[T.Asia]            -312.633049
continent[T.Europe]           156.846852
continent[T.Oceania]          182.349883
year                            0.289529
year:continent[T.Americas]      0.078122
year:continent[T.Asia]          0.163593
year:continent[T.Europe]       -0.067597
year:continent[T.Oceania]      -0.079257
dtype: float64

ols.fvalue

398.6047457117622

ols2.fvalue

732.3996046857956

#ols2 has a higher fvalue and therefore is a better model

#residuals vs year for interaction model
l = []
for index,row in data.iterrows():
    l.append(abs((row['year']*0.289529-524.257846)-row['lifeExp']))

sb.violinplot(data['year'],l)
plt.show()

	country	continent	year	lifeExp	pop	gdpPercap
0	Afghanistan	Asia	1952	28.801	8425333	779.445314
1	Afghanistan	Asia	1957	30.332	9240934	820.853030
2	Afghanistan	Asia	1962	31.997	10267083	853.100710
3	Afghanistan	Asia	1967	34.020	11537966	836.197138
4	Afghanistan	Asia	1972	36.088	13079460	739.981106