Solar flare data scraping

In [59]:
import csv, json, requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import re
In [60]:
r = requests.get("https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares")
In [61]:
root = BeautifulSoup(r.content,"html")
In [62]:
#root.prettify()
In [63]:
table = root.find("table")
In [64]:
#make a list of lists, repersenting every entry in a row
entries = []
rows = table.findAll("tr")

for tr in rows:
    td = tr.findAll("td")
    row_text = [tr.text for tr in td]
    entries.append(row_text)
    
del entries[0] #First row is blank so drop it
In [65]:
df = pd.DataFrame(entries,columns=["Number","X-classification","Date","Region","Start","Max","End","Movie"])
In [66]:
df.head()
Out[66]:
Number X-classification Date Region Start Max End Movie
0 1 X28.0 2003/11/04 0486 19:29 19:53 20:06 MovieView archive
1 2 X20.0 2001/04/02 9393 21:32 21:51 22:03 MovieView archive
2 3 X17.2 2003/10/28 0486 09:51 11:10 11:24 MovieView archive
3 4 X17.0 2005/09/07 0808 17:17 17:40 18:03 MovieView archive
4 5 X14.4 2001/04/15 9415 13:19 13:50 13:55 MovieView archive
In [67]:
#drop movies column as it's useless
df = df.drop(labels="Movie",axis=1)
In [68]:
#combine date and start/end/max time of every row into datetime object
for index,row in df.iterrows():
    dts = pd.to_datetime(row['Date'] + " " + row['Start'])
    dtm = pd.to_datetime(row['Date'] + " " + row['Max'])
    dte = pd.to_datetime(row['Date'] + " " + row['End'])
    df.at[index,'Start'] = dts
    df.at[index,'Max'] = dtm
    df.at[index,'End'] = dte
In [69]:
#date column no longer necessary
df_d = df #for part 2 question 2
df = df.drop(labels='Date',axis=1)
In [70]:
#replace - with NaN even though there are none
df = df.replace(to_replace='-',value='NaN')
df.head()
Out[70]:
Number X-classification Region Start Max End
0 1 X28.0 0486 2003-11-04 19:29:00 2003-11-04 19:53:00 2003-11-04 20:06:00
1 2 X20.0 9393 2001-04-02 21:32:00 2001-04-02 21:51:00 2001-04-02 22:03:00
2 3 X17.2 0486 2003-10-28 09:51:00 2003-10-28 11:10:00 2003-10-28 11:24:00
3 4 X17.0 0808 2005-09-07 17:17:00 2005-09-07 17:40:00 2005-09-07 18:03:00
4 5 X14.4 9415 2001-04-15 13:19:00 2001-04-15 13:50:00 2001-04-15 13:55:00
In [71]:
r = requests.get("https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html")
In [72]:
#get nasa data
root = BeautifulSoup(r.content,"html")
In [73]:
#root.prettify()
In [74]:
#get table
table = root.find("pre")
In [75]:
#convert beautifulsoup object to string and drop html links with regex
table_str = str(table)
t2 = re.sub('<a\shref="\S+">|<a\shref="\S+"\starget="\S+">|<\/a>',"",table_str)
#split into rows and only take necessary rows
t2 = t2.splitlines()
t2 = t2[12:523]
In [76]:
#split each row by whitespace
l = []
for x in t2:
    row = x.split()
    l.append(row)
    
#drop uneccesary columns(they contain comments)
df2 = pd.DataFrame(l,columns=['start_date','start_time', 'end_date', 'end_time', 'start_frequency', 'end_frequency', 'flare_location', 'flare_region','flare_importance','CME_date','CME_time','CPA','width','speed','plot','16','17','18','19','20','21','22','23','24'])
df2 = df2.drop(['16','17','18','19','20','21','22','23','24'],1)
In [77]:
df2.head()
Out[77]:
start_date start_time end_date end_time start_frequency end_frequency flare_location flare_region flare_importance CME_date CME_time CPA width speed plot
0 1997/04/01 14:00 04/01 14:15 8000 4000 S25E16 8026 M1.3 04/01 15:18 74 79 312 PHTX
1 1997/04/07 14:30 04/07 17:30 11000 1000 S28E19 8027 C6.8 04/07 14:27 Halo 360 878 PHTX
2 1997/05/12 05:15 05/14 16:00 12000 80 N21W08 8038 C1.3 05/12 05:30 Halo 360 464 PHTX
3 1997/05/21 20:20 05/21 22:00 5000 500 N05W12 8040 M1.3 05/21 21:00 263 165 296 PHTX
4 1997/09/23 21:53 09/23 22:16 6000 2000 S29E25 8088 C1.4 09/23 22:02 133 155 712 PHTX
In [78]:
#replace missing data with NaN
df2 = df2.replace(to_replace='---',value='NaN')
df2 = df2.replace(to_replace='----',value='NaN')
df2 = df2.replace(to_replace='-----',value='NaN')
df2 = df2.replace(to_replace='------',value='NaN')
df2 = df2.replace(to_replace='--:--',value='NaN')
df2 = df2.replace(to_replace='--/--',value='NaN')
In [79]:
df2.head()
Out[79]:
start_date start_time end_date end_time start_frequency end_frequency flare_location flare_region flare_importance CME_date CME_time CPA width speed plot
0 1997/04/01 14:00 04/01 14:15 8000 4000 S25E16 8026 M1.3 04/01 15:18 74 79 312 PHTX
1 1997/04/07 14:30 04/07 17:30 11000 1000 S28E19 8027 C6.8 04/07 14:27 Halo 360 878 PHTX
2 1997/05/12 05:15 05/14 16:00 12000 80 N21W08 8038 C1.3 05/12 05:30 Halo 360 464 PHTX
3 1997/05/21 20:20 05/21 22:00 5000 500 N05W12 8040 M1.3 05/21 21:00 263 165 296 PHTX
4 1997/09/23 21:53 09/23 22:16 6000 2000 S29E25 8088 C1.4 09/23 22:02 133 155 712 PHTX
In [80]:
#create extra columns for if it's a halo or lower bound and clean data from original rows
#check contents of the row and append the result to a new list
l = []
j = []
for index,row in df2.iterrows():
    if row['CPA'] == 'Halo':
        l.append('True')
        row['CPA'] = 'NA'
    else:
        l.append('False')
    if re.match('&gt;',row['width']):
        row['width'] = re.sub('&gt;','',row['width'])
        j.append('True')
    else:
        j.append('False')
 
df2['is_halo'] = l
df2['is_lower'] = j
In [81]:
df2.head()
Out[81]:
start_date start_time end_date end_time start_frequency end_frequency flare_location flare_region flare_importance CME_date CME_time CPA width speed plot is_halo is_lower
0 1997/04/01 14:00 04/01 14:15 8000 4000 S25E16 8026 M1.3 04/01 15:18 74 79 312 PHTX False False
1 1997/04/07 14:30 04/07 17:30 11000 1000 S28E19 8027 C6.8 04/07 14:27 NA 360 878 PHTX True False
2 1997/05/12 05:15 05/14 16:00 12000 80 N21W08 8038 C1.3 05/12 05:30 NA 360 464 PHTX True False
3 1997/05/21 20:20 05/21 22:00 5000 500 N05W12 8040 M1.3 05/21 21:00 263 165 296 PHTX False False
4 1997/09/23 21:53 09/23 22:16 6000 2000 S29E25 8088 C1.4 09/23 22:02 133 155 712 PHTX False False
In [82]:
#change times of 24:00 in accordance with piazza post
df2 = df2.replace('24:00','00:00')
In [83]:
#make all dates/times into datetime columns by combining the strings and passing them into to_datetime
start_dt = []
end_dt = []
cme_dt = []
for index,row in df2.iterrows():
    sdt = pd.to_datetime((row['start_date'] + " " + row['start_time']),format='%Y/%m/%d %H:%M')
    edt = pd.to_datetime((row['start_date'][0:5] + row['end_date'] + " " + row['end_time']),format='%Y/%m/%d %H:%M')
    if row['CME_date'] != 'NaN' and row['CME_time'] != 'NaN':
        cdt = pd.to_datetime((row['start_date'][0:5] + row['CME_date'] + " " + row['CME_time']),format='%Y/%m/%d %H:%M')
        cme_dt.append(cdt)
    else:
        cme_dt.append('NaN')
    start_dt.append(sdt)
    end_dt.append(edt)
In [84]:
df2['start_datetime'] = start_dt
df2['end_datetime'] = end_dt
df2['CME_datetime'] = cme_dt
df2_d = df2
df2 = df2.drop(['start_date','start_time','end_date','end_time','CME_date','CME_time'],1)#remove unecessary columns
In [85]:
#put all flares with classification X into a list with their date to cross reference then sort
max = []
for index,row in df2.iterrows():
    x = re.match('X\d{1,2}\.\d?',row['flare_importance'])
    if x:
        max.append([float(x.string[1:]),row['start_datetime']])
In [86]:
sorted(max,reverse=True)[0:49]
#After comparing the two tables there are certainly some entries not present here that are on the top 50 website.
#The level at which I can replicate the top 50 data depends on the data i have myself and while the nasa data has
#more entries they are missing some of the bigger flares. Note that my list includes all 90 X class events listed on
#the nasa site but after look at the other site every few rows there is a missing event.
Out[86]:
[[28.0, Timestamp('2003-11-04 20:00:00')],
 [20.0, Timestamp('2001-04-02 22:05:00')],
 [17.0, Timestamp('2003-10-28 11:10:00')],
 [14.0, Timestamp('2001-04-15 14:05:00')],
 [10.0, Timestamp('2003-10-29 20:55:00')],
 [9.4, Timestamp('1997-11-06 12:20:00')],
 [9.0, Timestamp('2006-12-05 10:50:00')],
 [8.3, Timestamp('2003-11-02 17:30:00')],
 [7.1, Timestamp('2005-01-20 07:15:00')],
 [6.9, Timestamp('2011-08-09 08:20:00')],
 [6.5, Timestamp('2006-12-06 19:00:00')],
 [6.2, Timestamp('2005-09-09 19:45:00')],
 [5.7, Timestamp('2000-07-14 10:30:00')],
 [5.6, Timestamp('2001-04-06 19:35:00')],
 [5.4, Timestamp('2012-03-07 01:00:00')],
 [5.3, Timestamp('2001-08-25 16:50:00')],
 [4.9, Timestamp('2014-02-25 00:56:00')],
 [4.8, Timestamp('2002-07-23 00:50:00')],
 [4.0, Timestamp('2000-11-26 17:00:00')],
 [3.9, Timestamp('2003-11-03 10:00:00')],
 [3.8, Timestamp('2005-01-17 10:00:00')],
 [3.6, Timestamp('2003-05-28 01:00:00')],
 [3.4, Timestamp('2006-12-13 02:45:00')],
 [3.4, Timestamp('2001-12-28 20:35:00')],
 [3.3, Timestamp('2002-07-20 21:30:00')],
 [3.2, Timestamp('2013-05-14 01:16:00')],
 [3.1, Timestamp('2002-08-24 01:45:00')],
 [2.8, Timestamp('2013-05-13 16:15:00')],
 [2.7, Timestamp('2015-05-05 22:24:00')],
 [2.7, Timestamp('2003-11-03 01:15:00')],
 [2.7, Timestamp('1998-05-06 08:25:00')],
 [2.6, Timestamp('2005-01-15 23:00:00')],
 [2.6, Timestamp('2001-09-24 10:45:00')],
 [2.6, Timestamp('1997-11-27 13:30:00')],
 [2.5, Timestamp('2004-11-10 02:25:00')],
 [2.3, Timestamp('2001-04-10 05:24:00')],
 [2.3, Timestamp('2000-11-24 15:25:00')],
 [2.3, Timestamp('2000-06-06 15:20:00')],
 [2.2, Timestamp('2011-02-15 02:10:00')],
 [2.1, Timestamp('2013-10-25 15:08:00')],
 [2.1, Timestamp('2011-09-06 22:30:00')],
 [2.1, Timestamp('2005-09-10 21:45:00')],
 [2.1, Timestamp('1997-11-04 06:00:00')],
 [2.0, Timestamp('2005-01-17 09:25:00')],
 [2.0, Timestamp('2004-11-07 16:25:00')],
 [2.0, Timestamp('2001-04-12 10:20:00')],
 [2.0, Timestamp('2000-11-24 05:10:00')],
 [1.9, Timestamp('2000-11-25 19:00:00')],
 [1.8, Timestamp('2002-07-18 07:55:00')]]
In [87]:
#make column name match
df_d = df_d.rename(index=str, columns={'Date': 'start_date','X-classification':'flare_importance'})

#fix formatting eg: make X28. = X28.0
fi = []
for index,row in df2_d.iterrows():
    x = re.match('X\d\d\.$',row['flare_importance'])
    if x:
        y = row['flare_importance'] = x.string+'0'
        fi.append(y)
    else:
        fi.append(row['flare_importance'])

df2_d['flare_importance'] = fi

#Merge dataframes and tidy
df_d = df_d.rename(index=str, columns={'Date': 'start_date','X-classification':'flare_importance'})
df3 = pd.merge(df_d,df2_d,on=['start_date','flare_importance'],how='left')
df4 = pd.merge(df2_d,df_d,on=['start_date','flare_importance'],how='outer')
df3 = df3.drop(['start_date'],axis=1)
df4 = df4.drop(['start_date','Region','Start','Max','End'],1)
df4 = df4.rename(index=str,columns={'Number':'Rank'})
In [88]:
df3.head() #Top 50 flares merged with nasa data
Out[88]:
Number flare_importance Region Start Max End start_time end_date end_time start_frequency ... CME_time CPA width speed plot is_halo is_lower start_datetime end_datetime CME_datetime
0 1 X28.0 0486 2003-11-04 19:29:00 2003-11-04 19:53:00 2003-11-04 20:06:00 20:00 11/04 00:00 10000 ... 19:54 NA 360 2657 PHTX True False 2003-11-04 20:00:00 2003-11-04 00:00:00 2003-11-04 19:54:00
1 2 X20.0 9393 2001-04-02 21:32:00 2001-04-02 21:51:00 2001-04-02 22:03:00 22:05 04/03 02:30 14000 ... 22:06 261 244 2505 PHTX False False 2001-04-02 22:05:00 2001-04-03 02:30:00 2001-04-02 22:06:00
2 3 X17.2 0486 2003-10-28 09:51:00 2003-10-28 11:10:00 2003-10-28 11:24:00 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT
3 4 X17.0 0808 2005-09-07 17:17:00 2005-09-07 17:40:00 2005-09-07 18:03:00 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT
4 5 X14.4 9415 2001-04-15 13:19:00 2001-04-15 13:50:00 2001-04-15 13:55:00 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT

5 rows × 24 columns

In [89]:
df4.head() #Nasa data with rank column
Out[89]:
start_time end_date end_time start_frequency end_frequency flare_location flare_region flare_importance CME_date CME_time CPA width speed plot is_halo is_lower start_datetime end_datetime CME_datetime Rank
0 14:00 04/01 14:15 8000 4000 S25E16 8026 M1.3 04/01 15:18 74 79 312 PHTX False False 1997-04-01 14:00:00 1997-04-01 14:15:00 1997-04-01 15:18:00 NaN
1 14:30 04/07 17:30 11000 1000 S28E19 8027 C6.8 04/07 14:27 NA 360 878 PHTX True False 1997-04-07 14:30:00 1997-04-07 17:30:00 1997-04-07 14:27:00 NaN
2 05:15 05/14 16:00 12000 80 N21W08 8038 C1.3 05/12 05:30 NA 360 464 PHTX True False 1997-05-12 05:15:00 1997-05-14 16:00:00 1997-05-12 05:30:00 NaN
3 20:20 05/21 22:00 5000 500 N05W12 8040 M1.3 05/21 21:00 263 165 296 PHTX False False 1997-05-21 20:20:00 1997-05-21 22:00:00 1997-05-21 21:00:00 NaN
4 21:53 09/23 22:16 6000 2000 S29E25 8088 C1.4 09/23 22:02 133 155 712 PHTX False False 1997-09-23 21:53:00 1997-09-23 22:16:00 1997-09-23 22:02:00 NaN
In [90]:
#Question 3
#Sum total of each combination of halo/not halo and top 50/unranked
t50_halo_c = 0
unranked_halo_c = 0
unranked_notHalo_c = 0
t50_notHalo_c = 0
for index,row in df4.iterrows():
    if type(row['Rank']) == str and row['is_halo'] == 'True':
        t50_halo_c += 1
    elif type(row['Rank']) != str and row['is_halo'] == 'True':
        unranked_halo_c += 1
    elif type(row['Rank']) == str and row['is_halo'] != 'True':
        t50_notHalo_c += 1
    elif type(row['Rank']) != str and row['is_halo'] != 'True':
        unranked_notHalo_c += 1
In [91]:
#make a dataframe
data = {
        'Halo':[t50_halo_c,unranked_halo_c],
        'Not Halo':[t50_notHalo_c,unranked_notHalo_c]
       }
In [92]:
h = pd.DataFrame(data,['Top 50',"Unranked"])
h
Out[92]:
Halo Not Halo
Top 50 24 26
Unranked 255 225
In [93]:
import matplotlib.pyplot as plt
In [95]:
#plot my dataframe
h.plot.hist()
plt.show()