import csv, json, requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import re
# Fetch the SpaceWeatherLive "top 50 solar flares" page and parse its one table.
r = requests.get("https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares")
# Name the parser explicitly instead of the vague "html" feature string.
root = BeautifulSoup(r.content, "html.parser")
table = root.find("table")

# Collect the text of every <td> in every row into a list of lists.
entries = []
rows = table.findAll("tr")
for tr in rows:
    cells = tr.findAll("td")  # renamed: the original comprehension shadowed `tr`
    entries.append([cell.text for cell in cells])

# The header row uses <th> cells, so it parsed as an empty list — drop it.
del entries[0]
df = pd.DataFrame(entries, columns=["Number", "X-classification", "Date", "Region",
                                    "Start", "Max", "End", "Movie"])
df.head()
# Drop the movie-link column; it carries no analytical value.
df = df.drop(labels="Movie", axis=1)

# Combine the Date with each of the Start/Max/End times into datetimes.
# Vectorized: one to_datetime call per column instead of three per row
# (the original iterrows loop did the same conversion element-wise).
for col in ("Start", "Max", "End"):
    df[col] = pd.to_datetime(df["Date"] + " " + df[col])

df_d = df  # keep a frame that still has the Date column (for part 2 question 2)
# Date column no longer necessary once folded into the three datetimes.
df = df.drop(labels="Date", axis=1)

# Replace '-' placeholders with the string 'NaN' for consistency with the
# NASA table below, even though this table currently has none.
df = df.replace(to_replace="-", value="NaN")
df.head()
# Fetch the NASA CDAW type-II radio burst (CME) listing.
r = requests.get("https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html")
root = BeautifulSoup(r.content, "html.parser")
# The data table lives in a single <pre> block of fixed-width text.
table = root.find("pre")

# Convert to a string and strip the embedded <a href> link tags with a regex.
# Raw string: the original non-raw pattern contained the invalid escape `\/`.
table_str = str(table)
t2 = re.sub(r'<a\shref="\S+">|<a\shref="\S+"\starget="\S+">|</a>', "", table_str)

# Keep only the data rows of the listing, then split each line on whitespace.
data_lines = t2.splitlines()[12:523]
records = [line.split() for line in data_lines]

# The trailing columns hold free-text comments; name them by index, drop below.
df2 = pd.DataFrame(records, columns=['start_date', 'start_time', 'end_date', 'end_time',
                                     'start_frequency', 'end_frequency', 'flare_location',
                                     'flare_region', 'flare_importance', 'CME_date',
                                     'CME_time', 'CPA', 'width', 'speed', 'plot',
                                     '16', '17', '18', '19', '20', '21', '22', '23', '24'])
# axis as keyword: the positional form was removed in pandas 2.0.
df2 = df2.drop(['16', '17', '18', '19', '20', '21', '22', '23', '24'], axis=1)
df2.head()
# The NASA listing marks missing fields with runs of dashes and dashed
# date/time stamps; normalise every such marker to the string 'NaN'
# in a single replace call.
missing_markers = ['---', '----', '-----', '------', '--:--', '--/--']
df2 = df2.replace(to_replace=missing_markers, value='NaN')
df2.head()
# Add two flag columns (kept as the strings 'True'/'False', which later
# blocks compare against):
#   is_halo  - 'True' when the CPA field holds the word 'Halo'
#   is_lower - 'True' when the width was a lower bound (prefixed with '>')
# and clean the original CPA/width values while classifying.
halo_flags = []
lower_flags = []
for index, row in df2.iterrows():
    if row['CPA'] == 'Halo':
        halo_flags.append('True')
        # BUG FIX: assigning into the `row` returned by iterrows() mutates a
        # copy, not the DataFrame; write through .at so 'NA' actually lands.
        df2.at[index, 'CPA'] = 'NA'
    else:
        halo_flags.append('False')
    if re.match('>', row['width']):
        # Same fix: strip the '>' lower-bound marker in the DataFrame itself.
        df2.at[index, 'width'] = re.sub('>', '', row['width'])
        lower_flags.append('True')
    else:
        lower_flags.append('False')
df2['is_halo'] = halo_flags
df2['is_lower'] = lower_flags
df2.head()
# A handful of rows use '24:00'; map them to '00:00' (per the piazza post)
# so strptime accepts them.  NOTE(review): this keeps the same calendar day
# rather than rolling to the next one — confirm that is the intended reading.
df2 = df2.replace('24:00', '00:00')

# Build combined datetime columns.  The end/CME fields carry only month/day,
# so borrow the 'YYYY/' prefix from start_date.
start_dt = []
end_dt = []
cme_dt = []
for index, row in df2.iterrows():
    year_prefix = row['start_date'][0:5]  # 'YYYY/'
    start_dt.append(pd.to_datetime(row['start_date'] + " " + row['start_time'],
                                   format='%Y/%m/%d %H:%M'))
    end_dt.append(pd.to_datetime(year_prefix + row['end_date'] + " " + row['end_time'],
                                 format='%Y/%m/%d %H:%M'))
    if row['CME_date'] != 'NaN' and row['CME_time'] != 'NaN':
        cme_dt.append(pd.to_datetime(year_prefix + row['CME_date'] + " " + row['CME_time'],
                                     format='%Y/%m/%d %H:%M'))
    else:
        cme_dt.append('NaN')  # keep the string sentinel used elsewhere in the script
df2['start_datetime'] = start_dt
df2['end_datetime'] = end_dt
df2['CME_datetime'] = cme_dt

df2_d = df2  # keep the raw date/time string columns for the merge step below
# Remove the now-redundant string columns (axis as keyword: the positional
# form was removed in pandas 2.0).
df2 = df2.drop(['start_date', 'start_time', 'end_date', 'end_time',
                'CME_date', 'CME_time'], axis=1)
# Collect every X-class flare (importance like 'X9.3') with its start time,
# then look at the 50 strongest to cross-reference against the top-50 site.
x_flares = []  # renamed from `max`, which shadowed the builtin
for index, row in df2.iterrows():
    m = re.match(r'X\d{1,2}\.\d?', row['flare_importance'])
    if m:
        # m.string is the whole matched field, e.g. 'X9.3'; strip the 'X'.
        x_flares.append([float(m.string[1:]), row['start_datetime']])
top_50_x = sorted(x_flares, reverse=True)[0:49]
top_50_x
#After comparing the two tables there are certainly some entries not present here that are on the top 50 website.
#The level at which I can replicate the top 50 data depends on the data i have myself and while the nasa data has
#more entries they are missing some of the bigger flares. Note that my list includes all 90 X class events listed on
#the nasa site but after look at the other site every few rows there is a missing event.
# Align the top-50 column names with the NASA frame so the merge keys match.
df_d = df_d.rename(index=str, columns={'Date': 'start_date',
                                       'X-classification': 'flare_importance'})

# Normalise truncated classifications such as 'X28.' to 'X28.0' so they match
# the top-50 table's formatting.  \d{1,2} also catches single-digit cases
# like 'X9.', which the original two-digit-only pattern missed.
fi = []
for index, row in df2_d.iterrows():
    value = row['flare_importance']
    if re.match(r'X\d{1,2}\.$', value):
        fi.append(value + '0')
    else:
        fi.append(value)
df2_d['flare_importance'] = fi
# Merge the two tables and tidy.  (The duplicate rename of df_d that used to
# sit here was redundant — the columns were already renamed above.)
df3 = pd.merge(df_d, df2_d, on=['start_date', 'flare_importance'], how='left')
df4 = pd.merge(df2_d, df_d, on=['start_date', 'flare_importance'], how='outer')
# axis as keyword: the positional form was removed in pandas 2.0.
df3 = df3.drop(['start_date'], axis=1)
df4 = df4.drop(['start_date', 'Region', 'Start', 'Max', 'End'], axis=1)
df4 = df4.rename(index=str, columns={'Number': 'Rank'})
df3.head()  # Top 50 flares merged with nasa data
df4.head()  # Nasa data with rank column
# Question 3
# Count each combination of (ranked in top 50?) x (halo CME?).  After the
# outer merge, ranked rows carry a Rank string while unranked rows carry NaN
# (a float), so a string check distinguishes them.
t50_halo_c = 0
unranked_halo_c = 0
unranked_notHalo_c = 0
t50_notHalo_c = 0
for index, row in df4.iterrows():
    ranked = isinstance(row['Rank'], str)  # isinstance over type(...) == str
    halo = row['is_halo'] == 'True'
    if ranked and halo:
        t50_halo_c += 1
    elif halo:
        unranked_halo_c += 1
    elif ranked:
        t50_notHalo_c += 1
    else:
        unranked_notHalo_c += 1

# Summarise the four counts as a 2x2 frame.
data = {
    'Halo': [t50_halo_c, unranked_halo_c],
    'Not Halo': [t50_notHalo_c, unranked_notHalo_c]
}
h = pd.DataFrame(data, ['Top 50', "Unranked"])
h
import matplotlib.pyplot as plt

# Visualise the halo / rank breakdown from the summary frame.
# NOTE(review): .hist() treats the four counts as raw samples; a bar chart
# (h.plot.bar()) would likely present this 2x2 summary more faithfully —
# confirm which plot was intended before changing.
h.plot.hist()
plt.show()