Solar flare data scraping¶

import csv, json, requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import re

r = requests.get("https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares")

root = BeautifulSoup(r.content,"html")

#root.prettify()

table = root.find("table")

#make a list of lists, repersenting every entry in a row
entries = []
rows = table.findAll("tr")

for tr in rows:
    td = tr.findAll("td")
    row_text = [tr.text for tr in td]
    entries.append(row_text)
    
del entries[0] #First row is blank so drop it

df = pd.DataFrame(entries,columns=["Number","X-classification","Date","Region","Start","Max","End","Movie"])

df.head()

#drop movies column as it's useless
df = df.drop(labels="Movie",axis=1)

#combine date and start/end/max time of every row into datetime object
for index,row in df.iterrows():
    dts = pd.to_datetime(row['Date'] + " " + row['Start'])
    dtm = pd.to_datetime(row['Date'] + " " + row['Max'])
    dte = pd.to_datetime(row['Date'] + " " + row['End'])
    df.at[index,'Start'] = dts
    df.at[index,'Max'] = dtm
    df.at[index,'End'] = dte

#date column no longer necessary
df_d = df #for part 2 question 2
df = df.drop(labels='Date',axis=1)

#replace - with NaN even though there are none
df = df.replace(to_replace='-',value='NaN')
df.head()

r = requests.get("https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html")

#get nasa data
root = BeautifulSoup(r.content,"html")

#root.prettify()

#get table
table = root.find("pre")

#convert beautifulsoup object to string and drop html links with regex
table_str = str(table)
t2 = re.sub('<a\shref="\S+">|<a\shref="\S+"\starget="\S+">|<\/a>',"",table_str)
#split into rows and only take necessary rows
t2 = t2.splitlines()
t2 = t2[12:523]

#split each row by whitespace
l = []
for x in t2:
    row = x.split()
    l.append(row)
    
#drop uneccesary columns(they contain comments)
df2 = pd.DataFrame(l,columns=['start_date','start_time', 'end_date', 'end_time', 'start_frequency', 'end_frequency', 'flare_location', 'flare_region','flare_importance','CME_date','CME_time','CPA','width','speed','plot','16','17','18','19','20','21','22','23','24'])
df2 = df2.drop(['16','17','18','19','20','21','22','23','24'],1)

df2.head()

#replace missing data with NaN
df2 = df2.replace(to_replace='---',value='NaN')
df2 = df2.replace(to_replace='----',value='NaN')
df2 = df2.replace(to_replace='-----',value='NaN')
df2 = df2.replace(to_replace='------',value='NaN')
df2 = df2.replace(to_replace='--:--',value='NaN')
df2 = df2.replace(to_replace='--/--',value='NaN')

df2.head()

#create extra columns for if it's a halo or lower bound and clean data from original rows
#check contents of the row and append the result to a new list
l = []
j = []
for index,row in df2.iterrows():
    if row['CPA'] == 'Halo':
        l.append('True')
        row['CPA'] = 'NA'
    else:
        l.append('False')
    if re.match('&gt;',row['width']):
        row['width'] = re.sub('&gt;','',row['width'])
        j.append('True')
    else:
        j.append('False')
 
df2['is_halo'] = l
df2['is_lower'] = j

df2.head()

#change times of 24:00 in accordance with piazza post
df2 = df2.replace('24:00','00:00')

#make all dates/times into datetime columns by combining the strings and passing them into to_datetime
start_dt = []
end_dt = []
cme_dt = []
for index,row in df2.iterrows():
    sdt = pd.to_datetime((row['start_date'] + " " + row['start_time']),format='%Y/%m/%d %H:%M')
    edt = pd.to_datetime((row['start_date'][0:5] + row['end_date'] + " " + row['end_time']),format='%Y/%m/%d %H:%M')
    if row['CME_date'] != 'NaN' and row['CME_time'] != 'NaN':
        cdt = pd.to_datetime((row['start_date'][0:5] + row['CME_date'] + " " + row['CME_time']),format='%Y/%m/%d %H:%M')
        cme_dt.append(cdt)
    else:
        cme_dt.append('NaN')
    start_dt.append(sdt)
    end_dt.append(edt)

df2['start_datetime'] = start_dt
df2['end_datetime'] = end_dt
df2['CME_datetime'] = cme_dt
df2_d = df2
df2 = df2.drop(['start_date','start_time','end_date','end_time','CME_date','CME_time'],1)#remove unecessary columns

#put all flares with classification X into a list with their date to cross reference then sort
max = []
for index,row in df2.iterrows():
    x = re.match('X\d{1,2}\.\d?',row['flare_importance'])
    if x:
        max.append([float(x.string[1:]),row['start_datetime']])

sorted(max,reverse=True)[0:49]
#After comparing the two tables there are certainly some entries not present here that are on the top 50 website.
#The level at which I can replicate the top 50 data depends on the data i have myself and while the nasa data has
#more entries they are missing some of the bigger flares. Note that my list includes all 90 X class events listed on
#the nasa site but after look at the other site every few rows there is a missing event.

[[28.0, Timestamp('2003-11-04 20:00:00')],
 [20.0, Timestamp('2001-04-02 22:05:00')],
 [17.0, Timestamp('2003-10-28 11:10:00')],
 [14.0, Timestamp('2001-04-15 14:05:00')],
 [10.0, Timestamp('2003-10-29 20:55:00')],
 [9.4, Timestamp('1997-11-06 12:20:00')],
 [9.0, Timestamp('2006-12-05 10:50:00')],
 [8.3, Timestamp('2003-11-02 17:30:00')],
 [7.1, Timestamp('2005-01-20 07:15:00')],
 [6.9, Timestamp('2011-08-09 08:20:00')],
 [6.5, Timestamp('2006-12-06 19:00:00')],
 [6.2, Timestamp('2005-09-09 19:45:00')],
 [5.7, Timestamp('2000-07-14 10:30:00')],
 [5.6, Timestamp('2001-04-06 19:35:00')],
 [5.4, Timestamp('2012-03-07 01:00:00')],
 [5.3, Timestamp('2001-08-25 16:50:00')],
 [4.9, Timestamp('2014-02-25 00:56:00')],
 [4.8, Timestamp('2002-07-23 00:50:00')],
 [4.0, Timestamp('2000-11-26 17:00:00')],
 [3.9, Timestamp('2003-11-03 10:00:00')],
 [3.8, Timestamp('2005-01-17 10:00:00')],
 [3.6, Timestamp('2003-05-28 01:00:00')],
 [3.4, Timestamp('2006-12-13 02:45:00')],
 [3.4, Timestamp('2001-12-28 20:35:00')],
 [3.3, Timestamp('2002-07-20 21:30:00')],
 [3.2, Timestamp('2013-05-14 01:16:00')],
 [3.1, Timestamp('2002-08-24 01:45:00')],
 [2.8, Timestamp('2013-05-13 16:15:00')],
 [2.7, Timestamp('2015-05-05 22:24:00')],
 [2.7, Timestamp('2003-11-03 01:15:00')],
 [2.7, Timestamp('1998-05-06 08:25:00')],
 [2.6, Timestamp('2005-01-15 23:00:00')],
 [2.6, Timestamp('2001-09-24 10:45:00')],
 [2.6, Timestamp('1997-11-27 13:30:00')],
 [2.5, Timestamp('2004-11-10 02:25:00')],
 [2.3, Timestamp('2001-04-10 05:24:00')],
 [2.3, Timestamp('2000-11-24 15:25:00')],
 [2.3, Timestamp('2000-06-06 15:20:00')],
 [2.2, Timestamp('2011-02-15 02:10:00')],
 [2.1, Timestamp('2013-10-25 15:08:00')],
 [2.1, Timestamp('2011-09-06 22:30:00')],
 [2.1, Timestamp('2005-09-10 21:45:00')],
 [2.1, Timestamp('1997-11-04 06:00:00')],
 [2.0, Timestamp('2005-01-17 09:25:00')],
 [2.0, Timestamp('2004-11-07 16:25:00')],
 [2.0, Timestamp('2001-04-12 10:20:00')],
 [2.0, Timestamp('2000-11-24 05:10:00')],
 [1.9, Timestamp('2000-11-25 19:00:00')],
 [1.8, Timestamp('2002-07-18 07:55:00')]]

#make column name match
df_d = df_d.rename(index=str, columns={'Date': 'start_date','X-classification':'flare_importance'})

#fix formatting eg: make X28. = X28.0
fi = []
for index,row in df2_d.iterrows():
    x = re.match('X\d\d\.$',row['flare_importance'])
    if x:
        y = row['flare_importance'] = x.string+'0'
        fi.append(y)
    else:
        fi.append(row['flare_importance'])

df2_d['flare_importance'] = fi

#Merge dataframes and tidy
df_d = df_d.rename(index=str, columns={'Date': 'start_date','X-classification':'flare_importance'})
df3 = pd.merge(df_d,df2_d,on=['start_date','flare_importance'],how='left')
df4 = pd.merge(df2_d,df_d,on=['start_date','flare_importance'],how='outer')
df3 = df3.drop(['start_date'],axis=1)
df4 = df4.drop(['start_date','Region','Start','Max','End'],1)
df4 = df4.rename(index=str,columns={'Number':'Rank'})

df3.head() #Top 50 flares merged with nasa data

df4.head() #Nasa data with rank column

#Question 3
#Sum total of each combination of halo/not halo and top 50/unranked
t50_halo_c = 0
unranked_halo_c = 0
unranked_notHalo_c = 0
t50_notHalo_c = 0
for index,row in df4.iterrows():
    if type(row['Rank']) == str and row['is_halo'] == 'True':
        t50_halo_c += 1
    elif type(row['Rank']) != str and row['is_halo'] == 'True':
        unranked_halo_c += 1
    elif type(row['Rank']) == str and row['is_halo'] != 'True':
        t50_notHalo_c += 1
    elif type(row['Rank']) != str and row['is_halo'] != 'True':
        unranked_notHalo_c += 1

#make a dataframe
data = {
        'Halo':[t50_halo_c,unranked_halo_c],
        'Not Halo':[t50_notHalo_c,unranked_notHalo_c]
       }

h = pd.DataFrame(data,['Top 50',"Unranked"])
h

import matplotlib.pyplot as plt

#plot my dataframe
h.plot.hist()
plt.show()

	Number	X-classification	Date	Region	Start	Max	End	Movie
0	1	X28.0	2003/11/04	0486	19:29	19:53	20:06	MovieView archive
1	2	X20.0	2001/04/02	9393	21:32	21:51	22:03	MovieView archive
2	3	X17.2	2003/10/28	0486	09:51	11:10	11:24	MovieView archive
3	4	X17.0	2005/09/07	0808	17:17	17:40	18:03	MovieView archive
4	5	X14.4	2001/04/15	9415	13:19	13:50	13:55	MovieView archive

	Number	X-classification	Region	Start	Max	End
0	1	X28.0	0486	2003-11-04 19:29:00	2003-11-04 19:53:00	2003-11-04 20:06:00
1	2	X20.0	9393	2001-04-02 21:32:00	2001-04-02 21:51:00	2001-04-02 22:03:00
2	3	X17.2	0486	2003-10-28 09:51:00	2003-10-28 11:10:00	2003-10-28 11:24:00
3	4	X17.0	0808	2005-09-07 17:17:00	2005-09-07 17:40:00	2005-09-07 18:03:00
4	5	X14.4	9415	2001-04-15 13:19:00	2001-04-15 13:50:00	2001-04-15 13:55:00

	start_date	start_time	end_date	end_time	start_frequency	end_frequency	flare_location	flare_region	flare_importance	CME_date	CME_time	CPA	width	speed	plot
0	1997/04/01	14:00	04/01	14:15	8000	4000	S25E16	8026	M1.3	04/01	15:18	74	79	312	PHTX
1	1997/04/07	14:30	04/07	17:30	11000	1000	S28E19	8027	C6.8	04/07	14:27	Halo	360	878	PHTX
2	1997/05/12	05:15	05/14	16:00	12000	80	N21W08	8038	C1.3	05/12	05:30	Halo	360	464	PHTX
3	1997/05/21	20:20	05/21	22:00	5000	500	N05W12	8040	M1.3	05/21	21:00	263	165	296	PHTX
4	1997/09/23	21:53	09/23	22:16	6000	2000	S29E25	8088	C1.4	09/23	22:02	133	155	712	PHTX

	start_date	start_time	end_date	end_time	start_frequency	end_frequency	flare_location	flare_region	flare_importance	CME_date	CME_time	CPA	width	speed	plot
0	1997/04/01	14:00	04/01	14:15	8000	4000	S25E16	8026	M1.3	04/01	15:18	74	79	312	PHTX
1	1997/04/07	14:30	04/07	17:30	11000	1000	S28E19	8027	C6.8	04/07	14:27	Halo	360	878	PHTX
2	1997/05/12	05:15	05/14	16:00	12000	80	N21W08	8038	C1.3	05/12	05:30	Halo	360	464	PHTX
3	1997/05/21	20:20	05/21	22:00	5000	500	N05W12	8040	M1.3	05/21	21:00	263	165	296	PHTX
4	1997/09/23	21:53	09/23	22:16	6000	2000	S29E25	8088	C1.4	09/23	22:02	133	155	712	PHTX

	start_date	start_time	end_date	end_time	start_frequency	end_frequency	flare_location	flare_region	flare_importance	CME_date	CME_time	CPA	width	speed	plot	is_halo	is_lower
0	1997/04/01	14:00	04/01	14:15	8000	4000	S25E16	8026	M1.3	04/01	15:18	74	79	312	PHTX	False	False
1	1997/04/07	14:30	04/07	17:30	11000	1000	S28E19	8027	C6.8	04/07	14:27	NA	360	878	PHTX	True	False
2	1997/05/12	05:15	05/14	16:00	12000	80	N21W08	8038	C1.3	05/12	05:30	NA	360	464	PHTX	True	False
3	1997/05/21	20:20	05/21	22:00	5000	500	N05W12	8040	M1.3	05/21	21:00	263	165	296	PHTX	False	False
4	1997/09/23	21:53	09/23	22:16	6000	2000	S29E25	8088	C1.4	09/23	22:02	133	155	712	PHTX	False	False

	Halo	Not Halo
Top 50	24	26
Unranked	255	225