Install StatsModel¶
In [288]:
Copied!
#to activate the environment: conda activate rcaes_env_new
#to install use:
#mamba install -c conda-forge statsmodels
# or use
#conda install -c conda-forge statsmodels
#to activate the environment: conda activate rcaes_env_new
#to install use:
#mamba install -c conda-forge statsmodels
# or use
#conda install -c conda-forge statsmodels
Setting up¶
In [289]:
Copied!
#import the dependencies of the package
import numpy as np
import pandas as pd
import patsy as pa
import scipy as sci
#optional dependencies
import matplotlib.pyplot as plt
#import the dependencies of the package
import numpy as np
import pandas as pd
import patsy as pa
import scipy as sci
#optional dependencies
import matplotlib.pyplot as plt
In [290]:
Copied!
#import modules
import statsmodels.api as sm
from patsy import dmatrices
import statsmodels.formula.api as smf
#import modules
import statsmodels.api as sm
from patsy import dmatrices
import statsmodels.formula.api as smf
Load Data¶
In [291]:
Copied!
# Read the sample CSV into a pandas DataFrame.
# Cells containing the literal text 'ND' are parsed as missing values (NaN).
# NOTE(review): this is an absolute, user-specific path -- presumably it has to
# be adjusted before the notebook can run on another machine.
DATA_CSV = '/home/vgi3/rcaes/StatsModelTutorial/LIS_2021_DB.csv'
df = pd.read_csv(DATA_CSV, na_values='ND')
In [292]:
Copied!
# Show the freshly loaded frame as the cell output (rich DataFrame display).
df
Out[292]:
ID | RECORD_ID | DATE_STATION_LINK | DATE | YEAR | MONTH | TIME_24H | STATION_ID | DEPTH_M | CLASS | ... | FIELD_NOTES | WEATHER | PREV_24_H_RAIN | PREV_48_H_RAIN | TIDE_TIME_NR | TIDE_RANGE_FT_NR | TIDE_TYPE_NR | TIDE_TIME_KP | TIDE_RANGE_FT_KP | TIDE_TYPE_KP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 23807 | 9-413_1/5/2021_6:43_2.3 | 1/5/2021_9-413 | 1/5/2021 | 2021 | January | 6:43 | 9-413 | 2.3 | Bottom | ... | pH probe on YSI EXO1 sonde malfunctioned | Clear, cold, dry | 0.00 | 0.12 | 9:19 | 0.16 | Low | 9:37 | 0.17 | Low |
1 | 23808 | 9-413_1/5/2021_6:44_1 | 1/5/2021_9-413 | 1/5/2021 | 2021 | January | 6:44 | 9-413 | 1.0 | Middle | ... | pH probe on YSI EXO1 sonde malfunctioned | Clear, cold, dry | 0.00 | 0.12 | 9:19 | 0.16 | Low | 9:37 | 0.17 | Low |
2 | 23809 | 9-413_1/5/2021_6:45_0.5 | 1/5/2021_9-413 | 1/5/2021 | 2021 | January | 6:45 | 9-413 | 0.5 | Surface | ... | pH probe on YSI EXO1 sonde malfunctioned | Clear, cold, dry | 0.00 | 0.12 | 9:19 | 0.16 | Low | 9:37 | 0.17 | Low |
3 | 23810 | 9-412_1/5/2021_6:51_4.2 | 1/5/2021_9-412 | 1/5/2021 | 2021 | January | 6:51 | 9-412 | 4.2 | Bottom | ... | pH probe on YSI EXO1 sonde malfunctioned | Clear, cold, dry | 0.00 | 0.12 | 9:19 | 0.16 | Low | 9:37 | 0.17 | Low |
4 | 23811 | 9-412_1/5/2021_6:52_2.1 | 1/5/2021_9-412 | 1/5/2021 | 2021 | January | 6:52 | 9-412 | 2.1 | Middle | ... | pH probe on YSI EXO1 sonde malfunctioned | Clear, cold, dry | 0.00 | 0.12 | 9:19 | 0.16 | Low | 9:37 | 0.17 | Low |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1303 | 25110 | Dup E-12_11/30/2021_7:48_2.2 | 11/30/2021_Dup E-12 | 11/30/2021 | 2021 | November | 7:48 | Dup E-12 | 2.2 | Middle | ... | NaN | Cloudy, calm waters | 0.01 | 0.01 | 6:47 | 7.91 | High | 7:03 | 7.83 | High |
1304 | 25111 | Dup E-12_11/30/2021_7:48_0.5 | 11/30/2021_Dup E-12 | 11/30/2021 | 2021 | November | 7:48 | Dup E-12 | 0.5 | Surface | ... | NaN | Cloudy, calm waters | 0.01 | 0.01 | 6:47 | 7.91 | High | 7:03 | 7.83 | High |
1305 | 25112 | Dup DI2_11/30/2021_8:06_9.1 | 11/30/2021_Dup DI2 | 11/30/2021 | 2021 | November | 8:06 | Dup DI2 | 9.1 | Bottom | ... | NaN | Cloudy, calm waters | 0.01 | 0.01 | 6:47 | 7.91 | High | 7:03 | 7.83 | High |
1306 | 25113 | Dup DI2_11/30/2021_8:07_4.8 | 11/30/2021_Dup DI2 | 11/30/2021 | 2021 | November | 8:07 | Dup DI2 | 4.8 | Middle | ... | NaN | Cloudy, calm waters | 0.01 | 0.01 | 6:47 | 7.91 | High | 7:03 | 7.83 | High |
1307 | 25114 | Dup DI2_11/30/2021_8:07_0.5 | 11/30/2021_Dup DI2 | 11/30/2021 | 2021 | November | 8:07 | Dup DI2 | 0.5 | Surface | ... | NaN | Cloudy, calm waters | 0.01 | 0.01 | 6:47 | 7.91 | High | 7:03 | 7.83 | High |
1308 rows × 46 columns
In [293]:
Copied!
# Keep only the three measurement columns used in the analysis:
# the response (dissolved oxygen) and the two candidate predictors.
analysis_columns = ['DISSOLVED_OXYGEN_MG_L', 'TEMPERATURE_C', 'PH']
df = df[analysis_columns]
In [294]:
Copied!
# Drop every row that has a missing value in any remaining column, then
# renumber the index 0..n-1 so it matches the new number of rows
# (drop=True discards the old index instead of keeping it as a column).
df = df.dropna().reset_index(drop=True)
In [295]:
Copied!
# Print row count, non-null counts and dtypes to confirm the cleaning step.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1239 entries, 0 to 1238 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DISSOLVED_OXYGEN_MG_L 1239 non-null float64 1 TEMPERATURE_C 1239 non-null float64 2 PH 1239 non-null float64 dtypes: float64(3) memory usage: 29.2 KB
In [296]:
Copied!
# pd.to_numeric returns a new Series and does not modify df, so the result
# has to be assigned back for the conversion to be kept.
df['PH'] = pd.to_numeric(df['PH'])
# Display the column so the cell still shows the converted values and dtype.
df['PH']
Out[296]:
0 8.36 1 8.38 2 8.39 3 8.45 4 8.45 ... 1234 8.25 1235 8.25 1236 8.19 1237 8.19 1238 8.20 Name: PH, Length: 1239, dtype: float64
Picking X and Y variables¶
In [205]:
Copied!
#explore the dataset with the pandas DataFrame.corr() method to look for potential relationships
#explore the dataset with the pandas DataFrame.corr() method to look for potential relationships
In [206]:
Copied!
# Pairwise correlation matrix of the columns currently in df.
# Takeaway: there looks to be a pretty good correlation between DO and TEMP,
# and between DO and PH.
df.corr()
Out[206]:
DISSOLVED_OXYGEN_MG_L | TEMPERATURE_C | PH | |
---|---|---|---|
DISSOLVED_OXYGEN_MG_L | 1.000000 | -0.836311 | 0.936684 |
TEMPERATURE_C | -0.836311 | 1.000000 | -0.782899 |
PH | 0.936684 | -0.782899 | 1.000000 |
In [207]:
Copied!
# Visualize the potential DO vs. temperature relationship with a scatter plot.
# The points are drawn with ax.scatter (not plt.scatter) so that the data and
# the labels below are explicitly attached to the same Axes object.
fig, ax = plt.subplots()
ax0 = ax.scatter(df['TEMPERATURE_C'], df['DISSOLVED_OXYGEN_MG_L'])
ax.set_xlabel('TEMPERATURE_C')
ax.set_ylabel('Dissolved_Oxygen_mg/L')
ax.set_title('DO vs TEMP')
Out[207]:
Text(0.5, 1.0, 'DO vs TEMP')
In [208]:
Copied!
# Scatter plot of DO against PH.
# Uses ax.scatter (not plt.scatter) so the points are drawn on the same Axes
# that receives the labels and title.
fig, ax = plt.subplots()
ax1 = ax.scatter(x=df['PH'], y=df['DISSOLVED_OXYGEN_MG_L'])
ax.set_xlabel('PH')
ax.set_ylabel('Dissolved_Oxygen_mg/L')
ax.set_title('DO vs PH')
Out[208]:
Text(0.5, 1.0, 'DO vs PH')
Formatting X and Y¶
In [298]:
Copied!
#statsmodels.api or statsmodels.formula.api?
#statsmodels.api or statsmodels.formula.api?
In [299]:
Copied!
# Build the design matrices (endog, exog) with patsy's dmatrices:
#   Y -- endogenous (dependent) variable, left of the '~'
#   X -- exogenous (independent) variables, right of the '~'
# return_type='dataframe' asks patsy for pandas DataFrames.
do_formula = 'DISSOLVED_OXYGEN_MG_L~ PH + TEMPERATURE_C'
Y, X = dmatrices(do_formula, data=df, return_type='dataframe')
In [300]:
Copied!
#see below for an example of entering variables directly into model formula
#see below for an example of entering variables directly into model formula
Pick the statistical test or model¶
In [301]:
Copied!
#statsmodel documentation https://www.statsmodels.org/stable/user-guide.html
#statsmodel documentation https://www.statsmodels.org/stable/user-guide.html
In [302]:
Copied!
# unique formulas can be made, but this tutorial will use a model that already comes in the module
# unique formulas can be made, but this tutorial will use a model that already comes in the module
In [305]:
Copied!
# List every name exposed by statsmodels.api (models, tools, submodules).
dir(sm)
Out[305]:
['BayesGaussMI', 'BinomialBayesMixedGLM', 'ConditionalLogit', 'ConditionalMNLogit', 'ConditionalPoisson', 'Factor', 'GEE', 'GLM', 'GLMGam', 'GLS', 'GLSAR', 'GeneralizedPoisson', 'HurdleCountModel', 'Logit', 'MANOVA', 'MI', 'MICE', 'MICEData', 'MNLogit', 'MixedLM', 'NegativeBinomial', 'NegativeBinomialP', 'NominalGEE', 'OLS', 'OrdinalGEE', 'PCA', 'PHReg', 'Poisson', 'PoissonBayesMixedGLM', 'ProbPlot', 'Probit', 'QuantReg', 'RLM', 'RecursiveLS', 'SurvfuncRight', 'TruncatedLFNegativeBinomialP', 'TruncatedLFPoisson', 'WLS', 'ZeroInflatedGeneralizedPoisson', 'ZeroInflatedNegativeBinomialP', 'ZeroInflatedPoisson', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '__version__', '__version_info__', 'add_constant', 'categorical', 'cov_struct', 'datasets', 'distributions', 'duration', 'emplike', 'families', 'formula', 'gam', 'genmod', 'graphics', 'iolib', 'load', 'load_pickle', 'multivariate', 'nonparametric', 'qqline', 'qqplot', 'qqplot_2samples', 'regression', 'robust', 'show_versions', 'stats', 'test', 'tools', 'tsa', 'webdoc']
In [306]:
Copied!
# List every name exposed by statsmodels.formula.api (formula-based models).
dir(smf)
Out[306]:
['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'conditional_logit', 'conditional_mnlogit', 'conditional_poisson', 'gee', 'glm', 'glmgam', 'gls', 'glsar', 'logit', 'mixedlm', 'mnlogit', 'negativebinomial', 'nominal_gee', 'ols', 'ordinal_gee', 'phreg', 'poisson', 'probit', 'quantreg', 'rlm', 'wls']
Create the model¶
In [217]:
Copied!
#following example will use ordinary least squares multiple linear regression
#model will predict dissolved oxygen based on PH and Temperature
# y = a + B1X1 + B2X2 + ...
#following example will use ordinary least squares multiple linear regression
#model will predict dissolved oxygen based on PH and Temperature
# y = a + B1X1 + B2X2 + ...
In [307]:
Copied!
#using statsmodels.api
#inputs to the model are design matrices
# Make sure X has a constant (intercept) column BEFORE the model is built;
# a constant added after sm.OLS(Y, X) would never reach the model.
# patsy's dmatrices normally supplies an 'Intercept' column already, in which
# case add_constant (default has_constant='skip') leaves X unchanged.
X = sm.add_constant(X)
mod = sm.OLS(Y, X)
In [309]:
Copied!
# Sanity check: the first rows of X should include the intercept column.
X.head()
Out[309]:
Intercept | PH | TEMPERATURE_C | |
---|---|---|---|
0 | 1.0 | 8.36 | 3.596 |
1 | 1.0 | 8.38 | 3.345 |
2 | 1.0 | 8.39 | 2.748 |
3 | 1.0 | 8.45 | 2.721 |
4 | 1.0 | 8.45 | 2.687 |
In [310]:
Copied!
# Same model through statsmodels.formula.api: the model takes a patsy formula
# plus the DataFrame directly, so no design matrices are built by hand.
# Left of the tilde is "Y", right of the tilde are the "X's".
ols_formula = 'DISSOLVED_OXYGEN_MG_L~PH+TEMPERATURE_C'
est = smf.ols(formula=ols_formula, data=df)
Fit the Model and Get Model Summary¶
In [311]:
Copied!
# statsmodels.api route: fit the OLS model built from the design matrices
# and show the summary table as the cell output.
res = mod.fit()
res.summary()
Out[311]:
Dep. Variable: | DISSOLVED_OXYGEN_MG_L | R-squared: | 0.905 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.905 |
Method: | Least Squares | F-statistic: | 5872. |
Date: | Mon, 04 Dec 2023 | Prob (F-statistic): | 0.00 |
Time: | 11:38:35 | Log-Likelihood: | -1601.8 |
No. Observations: | 1239 | AIC: | 3210. |
Df Residuals: | 1236 | BIC: | 3225. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -43.9343 | 1.098 | -40.028 | 0.000 | -46.088 | -41.781 |
PH | 6.8033 | 0.132 | 51.629 | 0.000 | 6.545 | 7.062 |
TEMPERATURE_C | -0.1079 | 0.006 | -18.858 | 0.000 | -0.119 | -0.097 |
Omnibus: | 47.685 | Durbin-Watson: | 0.711 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 52.409 |
Skew: | -0.502 | Prob(JB): | 4.16e-12 |
Kurtosis: | 3.092 | Cond. No. | 907. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [312]:
Copied!
# statsmodels.formula.api route: fitting gives the same results as above.
# The results are stored under their own name instead of overwriting `est`:
# `est` stays the (unfitted) model, so this cell can be re-run safely and the
# name never switches from "model" to "results".
est_res = est.fit()
est_res.summary()
Out[312]:
Dep. Variable: | DISSOLVED_OXYGEN_MG_L | R-squared: | 0.905 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.905 |
Method: | Least Squares | F-statistic: | 5872. |
Date: | Mon, 04 Dec 2023 | Prob (F-statistic): | 0.00 |
Time: | 11:38:37 | Log-Likelihood: | -1601.8 |
No. Observations: | 1239 | AIC: | 3210. |
Df Residuals: | 1236 | BIC: | 3225. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -43.9343 | 1.098 | -40.028 | 0.000 | -46.088 | -41.781 |
PH | 6.8033 | 0.132 | 51.629 | 0.000 | 6.545 | 7.062 |
TEMPERATURE_C | -0.1079 | 0.006 | -18.858 | 0.000 | -0.119 | -0.097 |
Omnibus: | 47.685 | Durbin-Watson: | 0.711 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 52.409 |
Skew: | -0.502 | Prob(JB): | 4.16e-12 |
Kurtosis: | 3.092 | Cond. No. | 907. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
More ways to get information from a model¶
In [313]:
Copied!
# Printing the summary renders it as plain text, which looks a little
# different from the rich table shown when it is the cell's last expression.
summary_text = str(res.summary())
print(summary_text)
OLS Regression Results ================================================================================= Dep. Variable: DISSOLVED_OXYGEN_MG_L R-squared: 0.905 Model: OLS Adj. R-squared: 0.905 Method: Least Squares F-statistic: 5872. Date: Mon, 04 Dec 2023 Prob (F-statistic): 0.00 Time: 11:38:41 Log-Likelihood: -1601.8 No. Observations: 1239 AIC: 3210. Df Residuals: 1236 BIC: 3225. Df Model: 2 Covariance Type: nonrobust ================================================================================= coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------- Intercept -43.9343 1.098 -40.028 0.000 -46.088 -41.781 PH 6.8033 0.132 51.629 0.000 6.545 7.062 TEMPERATURE_C -0.1079 0.006 -18.858 0.000 -0.119 -0.097 ============================================================================== Omnibus: 47.685 Durbin-Watson: 0.711 Prob(Omnibus): 0.000 Jarque-Bera (JB): 52.409 Skew: -0.502 Prob(JB): 4.16e-12 Kurtosis: 3.092 Cond. No. 907. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [315]:
Copied!
# Pull individual pieces out of the fitted results and print each with a label:
# coefficient estimates, their standard errors, and the in-sample predictions.
report_items = (
    ("Parameters: ", res.params),
    ("Standard errors: ", res.bse),
    ("Predicted values: ", res.predict()),
)
for label, value in report_items:
    print(label, value)
Parameters: Intercept -43.934307 PH 6.803343 TEMPERATURE_C -0.107912 dtype: float64 Standard errors: Intercept 1.097581 PH 0.131773 TEMPERATURE_C 0.005722 dtype: float64 Predicted values: [12.55359218 12.71674491 12.8492017 ... 10.72322241 10.73627974 10.80517646]
In [316]:
Copied!
# Extract the intercept (the constant term "a" of y = a + B1X1 + B2X2).
alpha = res.params.loc['Intercept']
print(alpha)
-43.93430733048154
In [317]:
Copied!
# Extract the coefficient estimated for PH.
b1 = res.params.loc['PH']
print(b1)
6.803343350479119
In [318]:
Copied!
# Extract the coefficient estimated for TEMPERATURE_C.
b2 = res.params.loc['TEMPERATURE_C']
print(b2)
-0.1079118195999333
Test out model and make predictions¶
In [319]:
Copied!
# With no arguments, predict() returns the predictions for the data the
# model was fitted on.
pred = res.predict()
In [320]:
Copied!
# Display the array of predicted values.
pred
Out[320]:
array([12.55359218, 12.71674491, 12.8492017 , ..., 10.72322241, 10.73627974, 10.80517646])
In [323]:
Copied!
# Store the predictions next to the observed values for easy comparison.
df = df.assign(pred_DO=pred)
In [324]:
Copied!
# Compare observed and predicted DO on the first few rows.
df.head()
Out[324]:
DISSOLVED_OXYGEN_MG_L | TEMPERATURE_C | PH | pred_DO | |
---|---|---|---|---|
0 | 12.61 | 3.596 | 8.36 | 12.553592 |
1 | 12.53 | 3.345 | 8.38 | 12.716745 |
2 | 12.64 | 2.748 | 8.39 | 12.849202 |
3 | 13.01 | 2.721 | 8.45 | 13.260316 |
4 | 13.19 | 2.687 | 8.45 | 13.263985 |
Visualize Results¶
To plot regression between Y variable and one X variable at a time (DO vs TEMP, and DO vs PH)¶
1. Plot DO against Temperature¶
In [332]:
Copied!
# Work on an independent copy of the exogenous matrix so X itself stays intact.
X2 = X.copy(deep=True)
In [333]:
Copied!
# Hold PH constant at its sample mean so that only temperature varies.
PH_mean = df['PH'].mean()
# Overwrite the PH column of the copy with that single mean value.
X2 = X2.assign(PH=PH_mean)
X2.head()
Out[333]:
Intercept | PH | TEMPERATURE_C | |
---|---|---|---|
0 | 1.0 | 7.703269 | 3.596 |
1 | 1.0 | 7.703269 | 3.345 |
2 | 1.0 | 7.703269 | 2.748 |
3 | 1.0 | 7.703269 | 2.721 |
4 | 1.0 | 7.703269 | 2.687 |
In [334]:
Copied!
# Predict DO from X2 (PH fixed at its mean) and store the result as a new
# column of the original dataframe.
pred_at_mean_ph = res.predict(X2)
df['pred_DO_w/mean_pH'] = pred_at_mean_ph
In [335]:
Copied!
# Inspect the first rows including the new 'pred_DO_w/mean_pH' column.
# NOTE(review): the recorded output of this cell already lists
# 'pred_DO_w/mean_temp', which is only created further down -- presumably the
# cell was last executed out of order; re-run top to bottom to refresh it.
df.head()
Out[335]:
DISSOLVED_OXYGEN_MG_L | TEMPERATURE_C | PH | pred_DO | pred_DO_w/mean_pH | pred_DO_w/mean_temp | |
---|---|---|---|---|---|---|
0 | 12.61 | 3.596 | 8.36 | 12.553592 | 8.085624 | 11.006152 |
1 | 12.53 | 3.345 | 8.38 | 12.716745 | 8.112710 | 11.142219 |
2 | 12.64 | 2.748 | 8.39 | 12.849202 | 8.177133 | 11.210252 |
3 | 13.01 | 2.721 | 8.45 | 13.260316 | 8.180047 | 11.618453 |
4 | 13.19 | 2.687 | 8.45 | 13.263985 | 8.183716 | 11.618453 |
In [336]:
Copied!
# Observed DO vs. temperature, with the fitted regression line evaluated at
# the mean PH overlaid.
fig, ax = plt.subplots()
ax.scatter(df['TEMPERATURE_C'], df['DISSOLVED_OXYGEN_MG_L'], label='Actual_DO')
# A line plot connects points in row order, so sort by the x variable first
# to draw the line from left to right instead of zig-zagging over itself.
line_data = df.sort_values('TEMPERATURE_C')
ax.plot(line_data['TEMPERATURE_C'], line_data['pred_DO_w/mean_pH'],
        color='BLACK', label='Regression_Line')
ax.set_xlabel('TEMPERATURE_C')
ax.set_ylabel('Dissolved Oxygen mg/L')
ax.legend()
# Take the mean from PH_mean instead of hard-coding it, so the title cannot
# go stale when the data changes (renders as 7.7 for the current data).
ax.set_title(f'Predicted DO regression at a mean PH of {PH_mean:.1f}')
Out[336]:
Text(0.5, 1.0, 'Predicted DO regression at a mean PH of 7.7')
2. Plot DO against PH¶
In [337]:
Copied!
# Same procedure as for PH, this time holding temperature at its sample mean
# so that only PH varies.
TEMP_mean = df['TEMPERATURE_C'].mean()
X3 = X.copy().assign(TEMPERATURE_C=TEMP_mean)
pred_at_mean_temp = res.predict(X3)
df['pred_DO_w/mean_temp'] = pred_at_mean_temp
df.head()
Out[337]:
DISSOLVED_OXYGEN_MG_L | TEMPERATURE_C | PH | pred_DO | pred_DO_w/mean_pH | pred_DO_w/mean_temp | |
---|---|---|---|---|---|---|
0 | 12.61 | 3.596 | 8.36 | 12.553592 | 8.085624 | 11.006152 |
1 | 12.53 | 3.345 | 8.38 | 12.716745 | 8.112710 | 11.142219 |
2 | 12.64 | 2.748 | 8.39 | 12.849202 | 8.177133 | 11.210252 |
3 | 13.01 | 2.721 | 8.45 | 13.260316 | 8.180047 | 11.618453 |
4 | 13.19 | 2.687 | 8.45 | 13.263985 | 8.183716 | 11.618453 |
In [338]:
Copied!
# Observed DO vs. PH, with the fitted regression line evaluated at the mean
# temperature overlaid.
fig, ax = plt.subplots()
ax.scatter(df['PH'], df['DISSOLVED_OXYGEN_MG_L'], label='Actual_DO')
# A line plot connects points in row order, so sort by the x variable first
# to draw the line from left to right instead of zig-zagging over itself.
line_data = df.sort_values('PH')
ax.plot(line_data['PH'], line_data['pred_DO_w/mean_temp'],
        color='BLACK', label='Regression_Line')
ax.set_xlabel('PH')
# Label typo fixed ('DIssolved' -> 'Dissolved').
ax.set_ylabel('Dissolved Oxygen mg/L')
ax.legend()
# Take the mean from TEMP_mean instead of hard-coding it, so the title cannot
# go stale when the data changes.
ax.set_title(f'Predicted DO regression at a mean temp of {TEMP_mean:.1f} celsius')
Out[338]:
Text(0.5, 1.0, 'Predicted DO regression at a mean temp of 17.9 celsius')
To plot Predicted against Actual Data (Predicted DO vs Actual DO)¶
In [339]:
Copied!
# Predicted vs. actual DO. The black line plots Y against itself (the 1:1
# reference); the closer the scattered predictions lie to it, the better
# the fit.
fig, ax = plt.subplots()
ax.plot(Y, Y, color='BLACK', label='Actual DO plotted on Actual DO')
# Legend typo fixed ('Acutal' -> 'Actual').
ax.scatter(Y, pred, label='Predicted DO plotted on Actual DO')
ax.set_xlabel('Actual DO')
ax.set_ylabel('Predicted DO')
ax.set_title('Predicted DO vs Actual DO')
ax.legend()
Out[339]:
<matplotlib.legend.Legend at 0x7f18cc7ce070>