conda install -c conda-forge matplotlib=2.0.2 conda install -c conda-forge scikit-learn=0.18.1 conda install -c conda-forge pandas=0.20.1 conda install -c conda-forge scipy=0.19.0 conda install -c conda-forge numpy=1.12.1
import pandas as pd import numpy as np from sklearn import linear_model as lm import matplotlib.pyplot as plt from scipy import stats from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error
#Read energy data in from excel spreadsheet energy_data = pd.read_excel("Table_2.1_Energy_Consumption_by_Sector-2.xlsx", sheetname="Monthly Data", header=10, skiprows=) energy_data.head(5)
|Month||Primary Energy Consumed by the Residential Sector||Total Energy Consumed by the Residential Sector||Primary Energy Consumed by the Commercial Sector||Total Energy Consumed by the Commercial Sector||Primary Energy Consumed by the Industrial Sector||Total Energy Consumed by the Industrial Sector||Primary Energy Consumed by the Transportation Sector||Total Energy Consumed by the Transportation Sector||Primary Energy Consumed by the Electric Power Sector||Energy Consumption Balancing Item||Primary Energy Consumption Total|
#Change the column names, so they are shorter energy_data.columns = ["Month", "P.E. Residential", "T.E. Residential", "P.E. Commerical", "T.E. Commercial", "P.E. Industrial", "T.E. Industrial", "P.E. Transportation", "T.E. Transportation", "P.E. Electric", "Energy Consumption Balancing Item", "P.E. Total Consumption"] energy_data.head(5)
|Month||P.E. Residential||T.E. Residential||P.E. Commerical||T.E. Commercial||P.E. Industrial||T.E. Industrial||P.E. Transportation||T.E. Transportation||P.E. Electric||Energy Consumption Balancing Item||P.E. Total Consumption|
#loads the climate data climateTable = np.genfromtxt('tempData.txt', skip_header=2) #dataframe representing the climate data climate_data = pd.DataFrame(climateTable, columns=['Year', 'Month', 'Monthly Anomaly', 'Monthly Uncertainty', 'Annual Anomaly', 'Annual Uncertainty', 'Five Year Anomaly', 'Five Year Uncertainty', 'Ten Year Anomaly', 'Ten Year Uncertainty', 'Twenty Year Anomaly', 'Twenty Year Uncertainty']) climate_data.head(5)
|Year||Month||Monthly Anomaly||Monthly Uncertainty||Annual Anomaly||Annual Uncertainty||Five Year Anomaly||Five Year Uncertainty||Ten Year Anomaly||Ten Year Uncertainty||Twenty Year Anomaly||Twenty Year Uncertainty|
#creates a new column in the format of year-month-day climate_data['Day'] = np.ones(len(climate_data)) date = climate_data[['Year', 'Month', 'Day']].copy() date = pd.to_datetime(date) climate_data['Date'] = date #drops the old date columns climate_data = climate_data.drop('Year', axis=1) climate_data = climate_data.drop('Month', axis=1) climate_data = climate_data.drop('Day', axis=1) #rearranges the columns climate_data = climate_data[['Date', 'Monthly Anomaly', 'Monthly Uncertainty', 'Annual Anomaly', 'Annual Uncertainty', 'Five Year Anomaly', 'Five Year Uncertainty', 'Ten Year Anomaly', 'Ten Year Uncertainty', 'Twenty Year Anomaly', 'Twenty Year Uncertainty']] climate_data.head(5)
|Date||Monthly Anomaly||Monthly Uncertainty||Annual Anomaly||Annual Uncertainty||Five Year Anomaly||Five Year Uncertainty||Ten Year Anomaly||Ten Year Uncertainty||Twenty Year Anomaly||Twenty Year Uncertainty|
#merges the two dataframes on the date climate_energy_data = pd.merge(energy_data, climate_data, left_on = 'Month', right_on = 'Date') #Reindex the dataset, so indices are Datetime objects climate_energy_data = climate_energy_data.set_index(["Month"]) climate_energy_data = climate_energy_data.drop('Date', axis=1) climate_energy_data.index.name = None climate_energy_data.head(5)
|P.E. Residential||T.E. Residential||P.E. Commerical||T.E. Commercial||P.E. Industrial||T.E. Industrial||P.E. Transportation||T.E. Transportation||P.E. Electric||Energy Consumption Balancing Item||...||Monthly Anomaly||Monthly Uncertainty||Annual Anomaly||Annual Uncertainty||Five Year Anomaly||Five Year Uncertainty||Ten Year Anomaly||Ten Year Uncertainty||Twenty Year Anomaly||Twenty Year Uncertainty|
5 rows × 21 columns
getAvgsthat will make our job a little easier. This method takes in a dataframe, and a sector S, and calculates the average energy usage per year, over sector S. The returned result is a list of averages, over the relevant years 1973 - 2013.
#Gets the monthly average in Energy Usage for a sector over time. #If no sector is specified, it computes monthly averages for total energy consumption (across all sectors) def getAvgs(df, sector = None): if( sector == None): sector = "P.E. Total Consumption" elif( sector == "Electric"): sector = "P.E. Electric" else: sector = "T.E. " + sector avgs = np.zeros( 2013 - 1973 + 1) for i in range(1973, 2014): temp = df[str(i)] sum = np.sum(temp[sector].values) avgs[i-1973] = sum / float(len(temp)) return avgs
graphto make it easier to graph our data, and avoid reusing code. Our method takes in a list of X coordinates, Y coordinates, X-axis and Y-axis labels, and the type of plot to make (options are \"Line\" or \"Scatter\" plots. It then produces the specified type of plot, with a line of best fit.
#Graphs X and Y datapoints, and sets the title, xLabel, and yLabel #t is the type of graph. Currently, Line Graphs and Scatter Plots have been implemented. def graph(X, Y, title, xLabel, yLabel, t="Line"): if( t == "Line"): plt.plot(X, Y, label = "Data Points") elif( t == "Scatter"): plt.scatter(X, Y, label="Data Points") plt.plot(X, np.poly1d(np.polyfit(X, Y, 1))(np.unique(X)), label="Best Fit Line") slope, intercept = np.polyfit(X, Y, 1) plt.legend(loc=0) plt.title(title) plt.xlabel(xLabel) plt.ylabel(yLabel) plt.show() print("Slope of the line of best fit is: %f" %slope)
Using our methods above, we can easily compute and graph the yearly average in total monthly energy usage.
#Plot average monthly energy usage per year over time avgs = getAvgs(climate_energy_data) years = np.arange(1973, 2014) graph(years, avgs, "Average Monthly Energy Usage Across Sectors Over Time", "Year", "Energy Usage (Trillion BTU's)")
Slope of the line of best fit is: 62.529150
#Compute monthly averages for each individual sector sectors = ["Residential", "Commercial", "Industrial", "Transportation", "Electric"] for s in sectors: avgs = getAvgs(climate_energy_data, s) graph(years, avgs, "Average Monthly Energy Usage in the " + s + " Sector Over Time", "Year", "Energy Usage (Trillion BTU's)")
Slope of the line of best fit is: 16.002637
Slope of the line of best fit is: 21.360539
Slope of the line of best fit is: 2.751618
Slope of the line of best fit is: 22.419588
Slope of the line of best fit is: 46.465518
getAnomaliesthat functions similar to our
getAvgsfunction. Next, we get the specific annual temperature anomalies, for each year in our defined range. Lastly, we return a list of our anomalies and all the years for which we have aggregated data.
climate_data = climate_data.set_index(['Date']) climate_data.index.name = None
def getAnomalies(start, end): end = end + 1 anomalies = np.zeros(end - start) years = np.arange(start, end) for y in range(start, end): d = str(y) + '-06' anomalies[y - start] = climate_data[d]['Annual Anomaly'] return anomalies, years
Great job! Now, we will make use of our
getAnomalies method. We pass in 1973 and 2012, as start and end parameters respectively, to tell our method to get all anomalies during this time period. We then plot the returned lists by using our previously defined
graph function. Both the code to produce this and the resulting plot can be seen below.
anomalies, years = getAnomalies(1973, 2012) graph(years, anomalies, "Change in Temperature Over Time From 1973 - 2012", "Years", "Temperature Deviation from 1950-1970 Average (Degrees Celsius)")
Slope of the line of best fit is: 0.026938
As you can see from the plot above, there is a similar positive trend between temperature and time. As time increases, so does temperature. However, because of the variance of the plot, one might reasonably question the strength of this relationship. With temperature anomaly constantly jumping up and down, can we definitively say that time is causing temperature to increase?
The answer to this question becomes more apparent below, when we expand our effective domain. If we extend our time period farther back, to 1820, we can more clearly see a stronger relationship between time and temperature.
anomalies, years = getAnomalies(1820, 2012) graph(years, anomalies, "Change in Temperature Over Time From 19820 - 2012", "Years", "Temperature Deviation from 1951-1980 Average (Degrees Celsius)")
Slope of the line of best fit is: 0.006442
Now, we are going to delve deeper into the hypothesis testing and machine learning aspects of data science. Through data visualization and analysis, we were able to show that temperature and energy usage both increase with time. From this, we would now like to hypothesize that temperature and energy usage are directly correlated (i.e. an increase in one leads to an increase in the other). Being able to prove such a hypothesis would give key insight into global warming and how to combat it.
We are going to start by taking our hypothesis and calculating the pearson correlation coefficient for our temperature and energy data. If you are unfamiliar with what this coefficient is, or what it means, then check out this Wikipedia page. Luckily for us, there is a well defined stats library in scipy that can help us calculate this in one line:
stats.pearsonr(climate_energy_data['P.E. Total Consumption'], climate_energy_data['Monthly Anomaly'])
The result of the code above is a tuple of two values. The first value is the pearson's correlation coefficient. A correlation coefficient of +1 or -1 is indicative of a perfect linear relationship. With a correlation coefficient of roughly .3099, we can conclude that there is a weak positive correlation between our two datasets. As a result, we cannot definitively claim that temperature and energy are directly related.
The second value is our two-tailed p-value. This indicates the probability that an uncorrelatd system produced similar results. As we can see, our two-tailed p-value is miniscule and basically zero. So, we can safely conclude that an uncorrelated system would not produce similar results.
In essence, we have just shown that there is a weak correlation between our energy and temperature data. In other words, there is some correlation between our two datasets, but not enough for us to definitively say there is a linear relationship. This makes sense, too. To visualize and make more sense of this weak correlation, we will take a moment to graph Annual Temperature Anomaly vs. Energy Consumption. The code below uses our
getAnomalies to get the correct data, then produces a plot.
avgs = getAvgs(climate_energy_data) avgs = avgs[1:41] anomalies, years = getAnomalies(1973, 2012) graph(avgs, anomalies, "Anomalies and Energy Consumption", "Energy Consumption", "Yearly Temp Anomalies")
Slope of the line of best fit is: 0.000393
Now, we are going to transition away from hypothesis testing, and focus on machine learning. The whole concept of machine learning is being able to generalize, or learn, a predictive model based on some sort of training data. With any luck, this model will be able to make accurate predictions on a separate dataset, used for testing.
To develop a predictive model, we are going to use scikit-learn, a powerful Python library that handles much of the behind-the-scenes work for us. Scikit-learn makes ML fairly easy, as it reduces numerous lines of complex code down to one function call. To start off, we are going to define two functions.
First, we are going to define a function
trainTestSplit that takes our dataset and splits it into training and test data. This is necessary, because we want to train our predictive model on our training data and evaluate its accuracy using test data. Imagine you are taking an exam, and the test questions and answers are leaked to you. Obviously, exam scores will be high simply because students had access to the questions and answers, and as a result, the exam would not be an accurate evaluation of a student's ability to learn. The same idea can be applied to machine learning. If we peek at the answers (our test data) too early, we are cheating. As a result, it is important to maintain this train/test split. In order to implement the actual splitting of our data, we rely on scikit-learn to reduce numerous lines of code to one method call.
Our second method,
linear_model, trains a basic linear regression model using scikit-learn. In linear regression, our goal is to learn a vector of weights, w, for which we can compute a prediction. Say we have an example, X, represented by a vector. By computing the dot product of w and X, we can make a prediction for example X. In essence, this simple concept is linear regression.
The code for both of these two methods is below. As you can see, scikit-learn makes implementing these methods extremely easy. If you are interested in more machine learning models, feel free to checkout the scikit-learn's online documentation. The library is extremely powerful for machine learning purposes, and can be a good place for a beginner to start.
def trainTestSplit(X, Y): X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) return X_train, X_test, Y_train, Y_test
def linear_model(X, Y): #creates the linear regression model reg = lm.LinearRegression() reg.fit(X, Y) return reg
Now, we can train our linear regression model. We are going to try to predict energy consumption and temperature anomaly based on date. We start off by dropping all rows containing empty values from our climate_energy_data dataframe. Then, we aggregate all our data by creating a 2d-array of examples, with each row formatted as [year, month, day] and a 1d-array of corresponding labels. We then call our
trainTestSplit() method on our two arrays, to split our training from our test data. Lastly, we train a linear regresion model by feeding our training data into our
total_data = climate_energy_data.dropna() X = np.c_[total_data.index.year, total_data.index.month, total_data.index.day] Y = total_data['P.E. Total Consumption'].values X_train, X_test, Y_train, Y_test = trainTestSplit(X, Y) energy_reg = linear_model(X_train, Y_train)
We have just succesfully trained our linear regression model, and can now make predictions using it. We will start off by making predictions on our test set. Scikit-learn makes this extremely easy, we simply call scikit-learn's predict method, passing in the feature vectors from our test set. Scikit-learn then returns a 1d-array of predicted values, we call this array
Yhat = energy_reg.predict(X_test)
Now that we have made our predictions, we need a way to evaluate the performance of our learned model. The way we go about evaluating our accuracy, is to define our loss function. If you want to read up on loss functions, visit this Wikipedia Page, but essentially they are a way of penalizing wrong predictions.
For our model, we are going to use squared loss. The formula to compute our loss function is: SUM((Y-Yhat)2), where Y is the correct value and Yhat is our predicted value. Using this formula, we are going to define two functions. The first method,
getLoss(), calculates and returns our loss, given true values and predicted values. The second method,
plotLoss(), makes a scatter plot of our loss, and draws a horizontal line, indicating our average loss for our test set.
def getLoss(Ytrue, Yhat): return (Yhat - Ytrue)**2
def plotLoss(loss, avg_loss): N = len(loss) X = np.arange(N) + 1 plt.scatter(X, loss) plt.plot(X, np.ones(N) * avg_loss, label="Average Loss", c="red", linewidth=2.0) plt.title("Squared Loss") plt.ylabel("Loss value") plt.legend(loc=0) plt.show()
Next, we can use the two methods we just defined to calculate our loss. We use scikit-learn's mean_squared_error method to calculate our average squared loss. Then, we plot loss and average loss and print out the values for total and average loss, to more easily visualize these values. We will see that the loss for our predictions is extremely high, but do not worry. Our model is performing fine, rather the energy dataset uses extremely large values, so this is expected.
loss = getLoss(Y_test, Yhat) totalLoss = np.sum(loss) avg_loss = mean_squared_error(Y_test, Yhat) plotLoss(loss, avg_loss) print("Total Loss: %s" % totalLoss) print("Average Loss: %s" % avg_loss)
Total Loss: 19099035.1904 Average Loss: 258095.07014
Now, we repeat the process for our temperature data. We want to develop a linear regression model to help us predict temperature anomaly, given a date.
Remember to use the same process:
Y = total_data['Monthly Anomaly'].values X_test, X_train, Y_test, Y_train = trainTestSplit(X, Y) temp_reg = linear_model(X_train, Y_train)
Yhat = temp_reg.predict(X_test)
loss = getLoss(Y_test, Yhat) totalLoss = np.sum(loss) avg_loss = mean_squared_error(Y_test, Yhat) print("Total Loss: %s" % totalLoss) print("Average Loss: %s" % avg_loss) plotLoss(loss, avg_loss)
Total Loss: 268.964826292 Average Loss: 0.911745173873
Now, let's try to predict the future. For example, what will temperature and energy consumption look like in January 2050? You may not know the answer to this off the top of your head, but we can use our learned model to give us an idea of what to expect. Unlike before, we will not know the actual values of these predictions, as we are predicting the future, but we know our learned models perform decently well.
Let's try making predictions for a few dates: August 1st, 2020, November 8th, 2024, May 10th, 2030, May 18th, 2032, and February 24th 2035. We start off by defining an nd-arrary of the feature vectors we would like to predict. This ndarray contains the dates previously listed, with each row being of the format [Year, Month, Day].
X = np.array([[2020, 8, 1], [2024, 11, 8], [2032, 5, 10], [2035, 2, 24]])
Then, we make predictions on using this nd-array
Yhat_energy = energy_reg.predict(X) Yhat_temp = temp_reg.predict(X) print("Energy Predictions: %s" % str(Yhat_energy)) print("Temperature Predictions: %s" % str(Yhat_temp))
Energy Predictions: [ 9396.5706769 9623.36301856 10384.30589298 10688.86935783] Temperature Predictions: [ 1.52348165 1.70713331 2.00791078 2.11654447]
As we can see, these numbers continue increasing with time. On February 24th, 2035, we can expect energy usage to total 10,688.87 Trillion BTU's and energy anomaly to be 2.11 degrees higher than the 1951 - 1980 average. Given the state of our planet, and our desperate need to reduce our carbon footprint, wouldn't it make sense to reduce our energy usage? Not necessarily. The answer to this question, and the reason why is further explained in the next section, Insight and Policy