For this challenge, we intend to use Machine Learning to predict the location of 3 infill wells and forecast their production for the next 2 years. In order to achieve this, we performed exploratory data analysys of the given datasets. This step included data preparation, feature imputation and feature ranking. Once we narowed down the predictor features for our model, we used a 70/30 train-test split. We decided to use Decision tress because they are one of the most powerful, cutting edge methodology in machine learning and they do not need standarization or normalization.
Import Required Packages
import pandas as pd
import math
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import colors
from scipy.optimize import curve_fit
import geostatspy.GSLIB as GSLIB # GSLIB utilies, visualization and wrapper
import geostatspy.geostats as geostats # GSLIB methods convert to Python
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score # specific measures to check our models
from sklearn.model_selection import train_test_split # train and test split
from sklearn.impute import KNNImputer # k-nearest neighbour imputation method
from scipy.interpolate import griddata
from sklearn import tree # tree program from scikit learn (package for machine learning)
from sklearn.tree import _tree # for accessing tree information
from sklearn import metrics # measures to check our models
from sklearn.model_selection import cross_val_score # cross validation methods
from sklearn.tree import export_graphviz # graphical visualization of trees
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import (PolynomialFeatures, MaxAbsScaler,
MinMaxScaler, StandardScaler, RobustScaler)
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_percentage_error
from scipy import linalg
from scipy import stats
# sns.set(rc={'text.usetex' : True})
Let's define a couple of functions to streamline plotting correlation matrices and visualization of a machine learning regression model responce over the 2 predictor features. Functions written by Dr. Michael Pyrcz
def plot_corr(dataframe,size=10): # plots a graphical correlation matrix
corr = dataframe.corr()
fig, ax = plt.subplots(figsize=(size, size))
im = ax.matshow(corr,vmin = -1.0, vmax = 1.0)
plt.xticks(range(len(corr.columns)), corr.columns);
plt.yticks(range(len(corr.columns)), corr.columns);
plt.colorbar(im, orientation = 'vertical')
plt.title('Correlation Matrix')
def visualize_model(model,xfeature,yfeature,response,title,):# plots the data points and the decision tree prediction
n_classes = 10
cmap = plt.cm.RdYlBu
plot_step = 0.02
x_min, x_max = min(xfeature) - 1, max(xfeature) + 1
y_min, y_max = min(yfeature) - 1, max(yfeature) + 1
resp_min = round(min(response)); resp_max = round(max(response));
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
np.arange(y_min, y_max, plot_step))
z_min = round(min(response)); z_max = round(max(response))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=cmap,vmin=z_min, vmax=z_max)
im = plt.scatter(xfeature,yfeature,s=None, c=response, marker=None, cmap=cmap, norm=None, vmin=z_min, vmax=z_max, alpha=0.8, linewidths=0.3, edgecolors="black")
plt.title(title)
plt.xlabel(xfeature.name)
plt.ylabel(yfeature.name)
cbar = plt.colorbar(im, orientation = 'vertical')
cbar.set_label(response.name, rotation=270, labelpad=20)
return(plt)
Here's the command to load our comma delimited data file in to a Pandas' DataFrame object.
well_logs_df = pd.read_csv('Well_logs_base_copy.csv', index_col='Well_ID')
well_logs_df.head()
X(ft) | Y(ft) | Completion zone | Completion | Elevation Kelly Bushing (ft) | MD (ft) | Poro(v/v) | Permeability (mD) | RHOB(g/cm3) | DTS(us/ft) | DT(us/ft) | PEF(B/E) | RD(OHMM) | RS(OHMM) | ROP | DENC(g/cm3) | NPHI(v/v) | Zone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Well_ID | ||||||||||||||||||
Well_0 | 137106.82 | 83818.47 | 0 | Upper | 193.32 | 4597.471739 | NaN | NaN | 2.700700 | 75.056761 | 51.232438 | 6.129957 | NaN | NaN | 10.710905 | 0.074251 | -0.001900 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | Upper | 193.32 | 4604.031739 | 0.087413 | NaN | 2.819380 | 85.369586 | 66.905283 | 8.498440 | NaN | NaN | 14.288629 | 0.088289 | 0.121538 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | Upper | 193.32 | 4610.591739 | NaN | NaN | 2.798909 | 124.121043 | 73.454382 | 9.761416 | NaN | NaN | 18.420916 | 0.051770 | 0.248546 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | Upper | 193.32 | 4617.151739 | 0.059590 | NaN | 2.575396 | 114.571242 | 71.776938 | 5.796872 | NaN | NaN | 12.595953 | 0.070758 | 0.142681 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | Upper | 193.32 | 4623.711739 | NaN | NaN | 2.895991 | 145.921799 | 62.656075 | 9.731257 | NaN | NaN | 22.043198 | 0.115700 | 0.353600 | Upper |
Summary statistics of the dataset
well_logs_df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 1963.0 | 124890.354575 | 9775.363008 | 109560.260000 | 115987.290000 | 124094.800000 | 134042.860000 | 142095.820000 |
Y(ft) | 1963.0 | 87551.832639 | 3210.182762 | 81747.120000 | 84476.470000 | 88549.740000 | 89987.000000 | 93691.550000 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
Elevation Kelly Bushing (ft) | 1963.0 | 181.281921 | 9.054317 | 160.220000 | 176.490000 | 181.350000 | 186.210000 | 203.850000 |
MD (ft) | 1963.0 | 4673.224657 | 165.718326 | 4228.005683 | 4546.547857 | 4666.802978 | 4796.413228 | 5096.702794 |
Poro(v/v) | 455.0 | 0.131140 | 0.062887 | 0.019566 | 0.072468 | 0.129662 | 0.185373 | 0.249241 |
Permeability (mD) | 221.0 | 10.497034 | 39.248645 | 0.000001 | 0.019500 | 0.094813 | 1.948104 | 352.697773 |
RHOB(g/cm3) | 1918.0 | 2.472862 | 0.207148 | 1.991035 | 2.326909 | 2.469486 | 2.578890 | 3.044163 |
DTS(us/ft) | 1004.0 | 128.273096 | 35.222315 | -471.675434 | 116.813090 | 131.851152 | 142.295954 | 186.253964 |
DT(us/ft) | 1920.0 | 80.322722 | 17.827921 | 1.138524 | 69.493418 | 78.929504 | 87.863830 | 179.133001 |
PEF(B/E) | 1689.0 | 5.351575 | 2.475290 | 0.019155 | 4.930555 | 5.782597 | 6.477832 | 11.490936 |
RD(OHMM) | 1724.0 | 181.844254 | 184.443838 | 10.197411 | 70.345749 | 130.884864 | 213.021581 | 2508.770579 |
RS(OHMM) | 1724.0 | 514.513168 | 3246.082233 | 2.270108 | 71.606283 | 137.398654 | 259.946834 | 62290.800000 |
ROP | 1726.0 | 20.167304 | 28.413135 | -490.300267 | 16.815940 | 20.011714 | 27.341557 | 46.895400 |
DENC(g/cm3) | 1684.0 | 0.047522 | 0.027440 | -0.050168 | 0.030830 | 0.050912 | 0.063165 | 0.161357 |
NPHI(v/v) | 1963.0 | 0.200203 | 0.107541 | -0.001900 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
Copy df and TRUNCATE Negative values
wells_df = well_logs_df.copy()
# wells_df.head()
# prevent negative values with truncation
num = wells_df._get_numeric_data() # get the numerical values
num[num < 0] = 0 # truncate negative values to 0.0
wells_df.describe().transpose() # calculate summary statistics for the data
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 1963.0 | 124890.354575 | 9775.363008 | 109560.260000 | 115987.290000 | 124094.800000 | 134042.860000 | 142095.820000 |
Y(ft) | 1963.0 | 87551.832639 | 3210.182762 | 81747.120000 | 84476.470000 | 88549.740000 | 89987.000000 | 93691.550000 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
Elevation Kelly Bushing (ft) | 1963.0 | 181.281921 | 9.054317 | 160.220000 | 176.490000 | 181.350000 | 186.210000 | 203.850000 |
MD (ft) | 1963.0 | 4673.224657 | 165.718326 | 4228.005683 | 4546.547857 | 4666.802978 | 4796.413228 | 5096.702794 |
Poro(v/v) | 455.0 | 0.131140 | 0.062887 | 0.019566 | 0.072468 | 0.129662 | 0.185373 | 0.249241 |
Permeability (mD) | 221.0 | 10.497034 | 39.248645 | 0.000001 | 0.019500 | 0.094813 | 1.948104 | 352.697773 |
RHOB(g/cm3) | 1918.0 | 2.472862 | 0.207148 | 1.991035 | 2.326909 | 2.469486 | 2.578890 | 3.044163 |
DTS(us/ft) | 1004.0 | 129.502866 | 20.175138 | 0.000000 | 116.813090 | 131.851152 | 142.295954 | 186.253964 |
DT(us/ft) | 1920.0 | 80.322722 | 17.827921 | 1.138524 | 69.493418 | 78.929504 | 87.863830 | 179.133001 |
PEF(B/E) | 1689.0 | 5.351575 | 2.475290 | 0.019155 | 4.930555 | 5.782597 | 6.477832 | 11.490936 |
RD(OHMM) | 1724.0 | 181.844254 | 184.443838 | 10.197411 | 70.345749 | 130.884864 | 213.021581 | 2508.770579 |
RS(OHMM) | 1724.0 | 514.513168 | 3246.082233 | 2.270108 | 71.606283 | 137.398654 | 259.946834 | 62290.800000 |
ROP | 1726.0 | 21.578345 | 7.787905 | 0.000000 | 16.815940 | 20.011714 | 27.341557 | 46.895400 |
DENC(g/cm3) | 1684.0 | 0.047996 | 0.026419 | 0.000000 | 0.030830 | 0.050912 | 0.063165 | 0.161357 |
NPHI(v/v) | 1963.0 | 0.200208 | 0.107532 | 0.000000 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
fmin = [100000,80000,0.0,160.0,4200.0,0.0,0.0,1.9,0,1.0,0.0,10,2.0,0.0,0.0,0.0]
fmax = [150000,100000,1.0,210.0,5500.0,0.3,500,3.1,200.0,200.0,12.0,2600.0,65000,50.0,0.25,1.05]
(wells_df.isnull().sum()/len(wells_df)).plot(kind = 'bar')
plt.subplots_adjust(left=0.0, bottom=0.0, right=3.2, top=1.2, wspace=0.2, hspace=0.2) # plot formatting
plt.xlabel('Feature'); plt.ylabel('Percentage of Missing Records'); plt.title('Data Completeness')
plt.ylim([0,1.0])
plt.show()
# DROP UNUSEFUL COLUMNS
df_imputed = wells_df.copy(deep=True) # make a deep copy of the DataFrame
df_imputed.drop(['Elevation Kelly Bushing (ft)','Completion', 'DTS(us/ft)', 'DT(us/ft)', 'PEF(B/E)', 'RD(OHMM)', 'RS(OHMM)', 'ROP', 'DENC(g/cm3)', 'Zone'], axis=1, inplace=True)
df_imputed.head()
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | RHOB(g/cm3) | NPHI(v/v) | |
---|---|---|---|---|---|---|---|---|
Well_ID | ||||||||
Well_0 | 137106.82 | 83818.47 | 0 | 4597.471739 | NaN | NaN | 2.700700 | 0.000000 |
Well_0 | 137106.82 | 83818.47 | 0 | 4604.031739 | 0.087413 | NaN | 2.819380 | 0.121538 |
Well_0 | 137106.82 | 83818.47 | 0 | 4610.591739 | NaN | NaN | 2.798909 | 0.248546 |
Well_0 | 137106.82 | 83818.47 | 0 | 4617.151739 | 0.059590 | NaN | 2.575396 | 0.142681 |
Well_0 | 137106.82 | 83818.47 | 0 | 4623.711739 | NaN | NaN | 2.895991 | 0.353600 |
df_imputed.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 1963.0 | 124890.354575 | 9775.363008 | 109560.260000 | 115987.290000 | 124094.800000 | 134042.860000 | 142095.820000 |
Y(ft) | 1963.0 | 87551.832639 | 3210.182762 | 81747.120000 | 84476.470000 | 88549.740000 | 89987.000000 | 93691.550000 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
MD (ft) | 1963.0 | 4673.224657 | 165.718326 | 4228.005683 | 4546.547857 | 4666.802978 | 4796.413228 | 5096.702794 |
Poro(v/v) | 455.0 | 0.131140 | 0.062887 | 0.019566 | 0.072468 | 0.129662 | 0.185373 | 0.249241 |
Permeability (mD) | 221.0 | 10.497034 | 39.248645 | 0.000001 | 0.019500 | 0.094813 | 1.948104 | 352.697773 |
RHOB(g/cm3) | 1918.0 | 2.472862 | 0.207148 | 1.991035 | 2.326909 | 2.469486 | 2.578890 | 3.044163 |
NPHI(v/v) | 1963.0 | 0.200208 | 0.107532 | 0.000000 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
df_knn = df_imputed.copy(deep=True) # make a deep copy of the DataFrame
knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
df_knn.iloc[:,:] = knn_imputer.fit_transform(df_knn)
df_imputed['RHOB(g/cm3)'] = df_knn['RHOB(g/cm3)']
df_imputed.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 1963.0 | 124890.354575 | 9775.363008 | 109560.260000 | 115987.290000 | 124094.800000 | 134042.860000 | 142095.820000 |
Y(ft) | 1963.0 | 87551.832639 | 3210.182762 | 81747.120000 | 84476.470000 | 88549.740000 | 89987.000000 | 93691.550000 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
MD (ft) | 1963.0 | 4673.224657 | 165.718326 | 4228.005683 | 4546.547857 | 4666.802978 | 4796.413228 | 5096.702794 |
Poro(v/v) | 455.0 | 0.131140 | 0.062887 | 0.019566 | 0.072468 | 0.129662 | 0.185373 | 0.249241 |
Permeability (mD) | 221.0 | 10.497034 | 39.248645 | 0.000001 | 0.019500 | 0.094813 | 1.948104 | 352.697773 |
RHOB(g/cm3) | 1963.0 | 2.474359 | 0.205946 | 1.991035 | 2.329274 | 2.472083 | 2.580801 | 3.044163 |
NPHI(v/v) | 1963.0 | 0.200208 | 0.107532 | 0.000000 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
porosity_df = (wells_df[['Poro(v/v)', 'NPHI(v/v)', 'RHOB(g/cm3)', 'Completion zone']])
porosity_df.head()
Poro(v/v) | NPHI(v/v) | RHOB(g/cm3) | Completion zone | |
---|---|---|---|---|
Well_ID | ||||
Well_0 | NaN | 0.000000 | 2.700700 | 0 |
Well_0 | 0.087413 | 0.121538 | 2.819380 | 0 |
Well_0 | NaN | 0.248546 | 2.798909 | 0 |
Well_0 | 0.059590 | 0.142681 | 2.575396 | 0 |
Well_0 | NaN | 0.353600 | 2.895991 | 0 |
porosity_df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Poro(v/v) | 455.0 | 0.131140 | 0.062887 | 0.019566 | 0.072468 | 0.129662 | 0.185373 | 0.249241 |
NPHI(v/v) | 1963.0 | 0.200208 | 0.107532 | 0.000000 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
RHOB(g/cm3) | 1918.0 | 2.472862 | 0.207148 | 1.991035 | 2.326909 | 2.469486 | 2.578890 | 3.044163 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
porosity2_df = porosity_df.dropna()
porosity2_df.head()
Poro(v/v) | NPHI(v/v) | RHOB(g/cm3) | Completion zone | |
---|---|---|---|---|
Well_ID | ||||
Well_0 | 0.087413 | 0.121538 | 2.819380 | 0 |
Well_0 | 0.059590 | 0.142681 | 2.575396 | 0 |
Well_0 | 0.082269 | 0.226349 | 2.831982 | 0 |
Well_0 | 0.062989 | 0.188662 | 2.727031 | 0 |
Well_0 | 0.067931 | 0.317625 | 2.974074 | 0 |
porosity2_df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Poro(v/v) | 455.0 | 0.131140 | 0.062887 | 0.019566 | 0.072468 | 0.129662 | 0.185373 | 0.249241 |
NPHI(v/v) | 455.0 | 0.203304 | 0.110743 | 0.000000 | 0.121647 | 0.188193 | 0.245365 | 0.786988 |
RHOB(g/cm3) | 455.0 | 2.475084 | 0.213178 | 2.021383 | 2.336362 | 2.468460 | 2.589938 | 3.041181 |
Completion zone | 455.0 | 0.542857 | 0.498708 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
por_upper_df = porosity2_df[porosity2_df['Completion zone']==0]
por_upper_df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Poro(v/v) | 208.0 | 0.120982 | 0.062531 | 0.026224 | 0.066350 | 0.100764 | 0.181212 | 0.249241 |
NPHI(v/v) | 208.0 | 0.212739 | 0.113857 | 0.000000 | 0.120654 | 0.198926 | 0.280975 | 0.786988 |
RHOB(g/cm3) | 208.0 | 2.476278 | 0.251717 | 2.021383 | 2.274563 | 2.472102 | 2.609761 | 3.041181 |
Completion zone | 208.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
por_lower_df = porosity2_df[porosity2_df['Completion zone']==1]
por_lower_df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Poro(v/v) | 247.0 | 0.139693 | 0.062030 | 0.019566 | 0.079900 | 0.148503 | 0.192791 | 0.244583 |
NPHI(v/v) | 247.0 | 0.195359 | 0.107642 | 0.042470 | 0.122611 | 0.182560 | 0.217577 | 0.740866 |
RHOB(g/cm3) | 247.0 | 2.474079 | 0.174791 | 2.036921 | 2.360671 | 2.467156 | 2.580999 | 2.955648 |
Completion zone | 247.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Fitting a line
x1 = por_upper_df['NPHI(v/v)']
y1 = por_upper_df['Poro(v/v)']
c1 = np.polyfit(x1,y1,1) # Calculate the polynomial fit
ycalc1 = np.polyval(c1,x1) # Using the polynomial, calculate values for array x
x2 = por_lower_df['NPHI(v/v)']
y2 = por_lower_df['Poro(v/v)']
c2 = np.polyfit(x2,y2,1) # Calculate the polynomial fit
ycalc2 = np.polyval(c2,x2) # Using the polynomial, calculate values for array x
# create figure and axis objects with subplots()
fig,ax = plt.subplots()
# make a plot
ax.scatter(x1, y1, color='r', label='upper')
ax.scatter(x2, y2, color='b', label='lower')
#ax.plot(x1,y1,'o',label='values')
ax.plot(x1,ycalc1,'--^',label='upper -polynomial fit degree 1')
ax.plot(x2,ycalc2,'--^',label='lower -polynomial fit degree 1')
ax.legend(loc='best');
# set axis limits
# ax.set_xlim(0,10000)
# #ax.set_ylim(0,3000)
# set x-axis label
ax.set_xlabel('NPHI(v/v)',fontsize=14)
# set y-axis label
ax.set_ylabel("'Poro(v/v)'",color="black",fontsize=14)
ax.set_title('Porosity Cross-plot')
plt.grid()
plt.legend(loc=8)
# twin object for two different y-axis on the sample plot
#ax2=ax.twinx()
# make a plot with different y-axis using second axis object
#ax2.plot(df['Time (s)'], df['Ev(Lc)'], 'g', label='Ev')
#ax2.set_ylabel("Volumetric Strain [-]",color="green",fontsize=14)
#plt.legend(loc=9)
plt.show()
# save the plot as a file
# fig.savefig('He_porosity.png',dpi=300, facecolor='w', edgecolor='w',
# orientation='portrait', format=None,
# transparent=False, bbox_inches=None, pad_inches=0.1,
# metadata=None)
#MERGE WITH CUMULATIVE PRODUCTION
production_history_df = pd.read_csv('Well_Production.csv', index_col='Well_ID')
# production_history_df.head()
production_df = wells_df.merge(production_history_df, how='left', left_index=True, right_index=True)
production_df.head()
Summary statistics of one variable at a time. The describe command provides count, mean, minimum, maximum, and quartiles all in a compact data table. We use transpose() command to flip the table so that features are on the rows and the statistics are on the columns.
# Select 2 predictor features here:
predictor_features = ['NPHI(v/v)','RHOB(g/cm3)'] # for the first demonstration run we will use porosity and brittleness
#
response_feature = 'Poro(v/v)'
pindex = np.argwhere(por_upper_df.columns.isin(predictor_features)).ravel()
print('Selected predictor features: \n0: ' + predictor_features[0] + ', index = ' + str(pindex[0]) + '.')
print('1: ' + predictor_features[1] + ', index = ' + str(pindex[1]) + '.')
rindex = np.argwhere(por_upper_df.columns.isin([response_feature])).ravel()
print('\nSelected response feature: \n' + response_feature + ', index = ' + str(rindex[0]) + '.')
#my_data_cut = my_data.iloc[0:500,:]
X = por_upper_df[predictor_features] # extract the 2 selected response features, 500 samples to a X array
y = por_upper_df[[response_feature]]
Selected predictor features: 0: NPHI(v/v), index = 1. 1: RHOB(g/cm3), index = 2. Selected response feature: Poro(v/v), index = 0.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=73073)
n_train = len(X_train)
n_test = len(X_test)
print('Number of training ' + str(n_train) + ', number of test ' + str(n_test))
Number of training 166, number of test 42
# Step 1. Instantiate the Model
multilinear_reg = linear_model.LinearRegression()
# Step 2: Fit the Data on Training Data
multilinear_reg.fit(X_train.values.reshape(n_train,2), y_train[response_feature]) # fit model
# Print the model parameters
print(predictor_features[0] + ' Coef: ' + str(round(multilinear_reg.coef_[0],3)) + ', ' + predictor_features[1] + ' Coef: ', str(round(multilinear_reg.coef_[1],3)) + ', Intercept: ', str(round(multilinear_reg.intercept_,3)))
# Plot model fit
plt.subplot(121)
plt = visualize_model(multilinear_reg,X_test[predictor_features[0]],X_test[predictor_features[1]],y_test[response_feature],'Testing Data and Multilinear Regression Model')
# Step 3: - Make predictions using the testing dataset
multilinear_y_pred = multilinear_reg.predict(X_test.values.reshape(n_test,2))
# Report the goodness of fit
print('Variance explained: %.2f' % r2_score(y_test, multilinear_y_pred))
# Calculate the error at withheld testing samples
multilinear_y_res = multilinear_y_pred - y_test[response_feature].values
plt.subplot(122)
plt.hist(multilinear_y_res, alpha = 0.2, color = 'red', edgecolor = 'black', bins=np.linspace(-1*fmax[rindex[0]],fmax[rindex[0]],40))
plt.title('Linear Regression Model Prediction Error - ' + response_feature + ' from ' + predictor_features[0] + ' and ' + predictor_features[1]);
plt.xlabel(response_feature + ' Estimation Error '); plt.ylabel('Frequency')
plt.xlim(-1*fmax[rindex[0]],fmax[rindex[0]])
plt.subplots_adjust(left=0.0, bottom=0.0, right=3.2, top=1.6, wspace=0.1, hspace=0.2)
plt.show()
NPHI(v/v) Coef: 0.091, RHOB(g/cm3) Coef: -0.191, Intercept: 0.573 Variance explained: 0.70
#Plot the model
fig,ax = plt.subplots()
# make a plot
plt.plot(multilinear_y_pred, y_test, 'o', label="Linear reg. model - UPPER")
plt.arrow(0,0,0.3,0.3)
plt.xlim(0, 0.3)
plt.ylim(0, 0.3)
plt.grid()
plt.xlabel("Predicted"); plt.ylabel("Real"); plt.legend();
plt.subplots_adjust(left=0.0, bottom=0.0, right=2.3, top=1, wspace=0.2, hspace=0.3)
ax.set_aspect('equal', adjustable='box')
plt.show()
# Select 2 predictor features here:
predictor_features = ['NPHI(v/v)','RHOB(g/cm3)'] # for the first demonstration run we will use porosity and brittleness
#
response_feature = 'Poro(v/v)'
pindex = np.argwhere(por_upper_df.columns.isin(predictor_features)).ravel()
print('Selected predictor features: \n0: ' + predictor_features[0] + ', index = ' + str(pindex[0]) + '.')
print('1: ' + predictor_features[1] + ', index = ' + str(pindex[1]) + '.')
rindex = np.argwhere(por_upper_df.columns.isin([response_feature])).ravel()
print('\nSelected response feature: \n' + response_feature + ', index = ' + str(rindex[0]) + '.')
#my_data_cut = my_data.iloc[0:500,:]
X = por_lower_df[predictor_features] # extract the 2 selected response features, 500 samples to a X array
y = por_lower_df[[response_feature]]
Selected predictor features: 0: NPHI(v/v), index = 1. 1: RHOB(g/cm3), index = 2. Selected response feature: Poro(v/v), index = 0.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=73073)
n_train = len(X_train)
n_test = len(X_test)
print('Number of training ' + str(n_train) + ', number of test ' + str(n_test))
Number of training 197, number of test 50
# Step 1. Instantiate the Model
multilinear_reg = linear_model.LinearRegression()
# Step 2: Fit the Data on Training Data
multilinear_reg.fit(X_train.values.reshape(n_train,2), y_train[response_feature]) # fit model
# Print the model parameters
print(predictor_features[0] + ' Coef: ' + str(round(multilinear_reg.coef_[0],3)) + ', ' + predictor_features[1] + ' Coef: ', str(round(multilinear_reg.coef_[1],3)) + ', Intercept: ', str(round(multilinear_reg.intercept_,3)))
# Plot model fit
plt.subplot(121)
plt = visualize_model(multilinear_reg,X_test[predictor_features[0]],X_test[predictor_features[1]],y_test[response_feature],'Testing Data and Multilinear Regression Model')
# Step 3: - Make predictions using the testing dataset
multilinear_y_pred = multilinear_reg.predict(X_test.values.reshape(n_test,2))
# Report the goodness of fit
print('Variance explained: %.2f' % r2_score(y_test, multilinear_y_pred))
# Calculate the error at withheld testing samples
multilinear_y_res = multilinear_y_pred - y_test[response_feature].values
plt.subplot(122)
plt.hist(multilinear_y_res, alpha = 0.2, color = 'red', edgecolor = 'black', bins=np.linspace(-1*fmax[rindex[0]],fmax[rindex[0]],40))
plt.title('Linear Regression Model Prediction Error - ' + response_feature + ' from ' + predictor_features[0] + ' and ' + predictor_features[1]);
plt.xlabel(response_feature + ' Estimation Error '); plt.ylabel('Frequency')
plt.xlim(-1*fmax[rindex[0]],fmax[rindex[0]])
plt.subplots_adjust(left=0.0, bottom=0.0, right=3.2, top=1.6, wspace=0.1, hspace=0.2)
plt.show()
NPHI(v/v) Coef: 0.067, RHOB(g/cm3) Coef: -0.278, Intercept: 0.816 Variance explained: 0.75
#Plot the model
fig,ax = plt.subplots()
#plt.scatter(train.ENGINESIZE, train.CO2EMISSIONS, color='blue', label="Training")
plt.plot(multilinear_y_pred, y_test, 'o', label="Linear reg. model - LOWER")
plt.arrow(0,0,0.3,0.3)
plt.xlim(0, 0.3)
plt.ylim(0, 0.3)
plt.grid()
plt.xlabel("Predicted"); plt.ylabel("Real"); plt.legend();
plt.subplots_adjust(left=0.0, bottom=0.0, right=2.3, top=1, wspace=0.2, hspace=0.3)
ax.set_aspect('equal', adjustable='box')
plt.show()
df_por = df_imputed.copy(deep=True) # make a deep copy of the DataFrame
len(df_por)
1963
df_por = df_imputed.copy(deep=True) # make a deep copy of the DataFrame
for i in range(len(df_por)):
if (math.isnan(df_por['Poro(v/v)'][i]) and df_por['Completion zone'][i]==0) :
#print(i,df_por['Completion zone'][i], math.isnan(df_por['Poro(v/v)'][i]))
df_por['Poro(v/v)'][i]= (0.091*df_por['NPHI(v/v)'][i])-(0.191*df_por['RHOB(g/cm3)'][i])+0.573
elif (math.isnan(df_por['Poro(v/v)'][i]) and df_por['Completion zone'][i]==1):
#print(i,df_por['Completion zone'][i], math.isnan(df_por['Poro(v/v)'][i]))
df_por['Poro(v/v)'][i] = (0.067*df_por['NPHI(v/v)'][i])-(0.278*df_por['RHOB(g/cm3)'][i])+0.816
df_por.head(n=5)
C:\Users\monic\AppData\Local\Temp/ipykernel_40980/4074718225.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_por['Poro(v/v)'][i]= (0.091*df_por['NPHI(v/v)'][i])-(0.191*df_por['RHOB(g/cm3)'][i])+0.573 C:\Users\monic\AppData\Local\Temp/ipykernel_40980/4074718225.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_por['Poro(v/v)'][i] = (0.067*df_por['NPHI(v/v)'][i])-(0.278*df_por['RHOB(g/cm3)'][i])+0.816
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | RHOB(g/cm3) | NPHI(v/v) | |
---|---|---|---|---|---|---|---|---|
Well_ID | ||||||||
Well_0 | 137106.82 | 83818.47 | 0 | 4597.471739 | 0.057166 | NaN | 2.700700 | 0.000000 |
Well_0 | 137106.82 | 83818.47 | 0 | 4604.031739 | 0.087413 | NaN | 2.819380 | 0.121538 |
Well_0 | 137106.82 | 83818.47 | 0 | 4610.591739 | 0.061026 | NaN | 2.798909 | 0.248546 |
Well_0 | 137106.82 | 83818.47 | 0 | 4617.151739 | 0.059590 | NaN | 2.575396 | 0.142681 |
Well_0 | 137106.82 | 83818.47 | 0 | 4623.711739 | 0.052043 | NaN | 2.895991 | 0.353600 |
# prevent negative values with truncation
num = df_por._get_numeric_data() # get the numerical values
num[num < 0] = 0 # truncate negative values to 0.0
df_imputed['Poro(v/v)'] = df_por['Poro(v/v)']
df_imputed.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 1963.0 | 124890.354575 | 9775.363008 | 109560.260000 | 115987.290000 | 124094.800000 | 134042.860000 | 142095.820000 |
Y(ft) | 1963.0 | 87551.832639 | 3210.182762 | 81747.120000 | 84476.470000 | 88549.740000 | 89987.000000 | 93691.550000 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
MD (ft) | 1963.0 | 4673.224657 | 165.718326 | 4228.005683 | 4546.547857 | 4666.802978 | 4796.413228 | 5096.702794 |
Poro(v/v) | 1963.0 | 0.132445 | 0.056066 | 0.000000 | 0.093060 | 0.129367 | 0.173578 | 0.323677 |
Permeability (mD) | 221.0 | 10.497034 | 39.248645 | 0.000001 | 0.019500 | 0.094813 | 1.948104 | 352.697773 |
RHOB(g/cm3) | 1963.0 | 2.474359 | 0.205946 | 1.991035 | 2.329274 | 2.472083 | 2.580801 | 3.044163 |
NPHI(v/v) | 1963.0 | 0.200208 | 0.107532 | 0.000000 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
$\kappa \sim \phi^3/(1−\phi)^2$
# Pipeline designed by Dr. John Foster
df_perm = df_imputed.copy(deep=True) # make a deep copy of the DataFrame
# print(len(missing_perms))
missing_perms = df_perm['Permeability (mD)'].isnull()
phi = df_perm.loc[~missing_perms, 'Poro(v/v)'].to_numpy()
fig, ax = plt.subplots()
sns.regplot(phi ** 3 / (1 - phi) ** 2,
wells_df.loc[~missing_perms, 'Permeability (mD)'], ax=ax);
ax.set_xlabel(r'$\frac{\phi^3}{(1 - \phi)^2}$');
C:\Users\monic\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
fphi = lambda phi, m, kappa0: m * phi ** 3 / (1 - phi) ** 2 + kappa0
popt, _ = curve_fit(fphi, phi, df_perm.loc[~missing_perms, 'Permeability (mD)'])
# fphi = lambda phi, m, kappa0: m * phi ** 3 / (1 - phi) ** 2 + kappa0
# popt, _ = curve_fit(fphi, phi, wells_df.loc[~missing_perms, 'Permeability (mD)'])
KC permeability, mD'
.df_perm['KC permeability (mD)'] = fphi(df_perm['Poro(v/v)'],
popt[0], popt[1])
df_perm.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 1963.0 | 124890.354575 | 9775.363008 | 109560.260000 | 115987.290000 | 124094.800000 | 134042.860000 | 142095.820000 |
Y(ft) | 1963.0 | 87551.832639 | 3210.182762 | 81747.120000 | 84476.470000 | 88549.740000 | 89987.000000 | 93691.550000 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
MD (ft) | 1963.0 | 4673.224657 | 165.718326 | 4228.005683 | 4546.547857 | 4666.802978 | 4796.413228 | 5096.702794 |
Poro(v/v) | 1963.0 | 0.132445 | 0.056066 | 0.000000 | 0.093060 | 0.129367 | 0.173578 | 0.323677 |
Permeability (mD) | 221.0 | 10.497034 | 39.248645 | 0.000001 | 0.019500 | 0.094813 | 1.948104 | 352.697773 |
RHOB(g/cm3) | 1963.0 | 2.474359 | 0.205946 | 1.991035 | 2.329274 | 2.472083 | 2.580801 | 3.044163 |
NPHI(v/v) | 1963.0 | 0.200208 | 0.107532 | 0.000000 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
KC permeability (mD) | 1963.0 | 10.937444 | 22.481392 | -7.243402 | -4.001010 | 2.208734 | 18.096608 | 238.090327 |
scalers = [MaxAbsScaler(), MinMaxScaler(), StandardScaler(), RobustScaler()]
pipe = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('poly', PolynomialFeatures()),
('reg', LinearRegression())
])
params = {
'scaler': scalers,
'poly__degree': [2, 3, 4],
'pca__n_components': [1, 2, 3, 4, 5, 6, 7],
}
gcv = GridSearchCV(pipe, params, cv=KFold(random_state=5, shuffle=True))
gcv.fit(df_perm[~missing_perms].drop('Permeability (mD)', axis=1),
df_perm.loc[~missing_perms, 'Permeability (mD)'])
gcv.best_params_
{'pca__n_components': 1, 'poly__degree': 2, 'scaler': RobustScaler()}
df_perm.loc[missing_perms, 'Permeability (mD)'] = \
gcv.predict(df_perm[missing_perms].drop('Permeability (mD)', axis=1))
# prevent negative values with truncation
num = df_perm._get_numeric_data() # get the numerical values
num[num < 0] = 0 # truncate negative values to 0.0
df_perm.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 1963.0 | 124890.354575 | 9775.363008 | 109560.260000 | 115987.290000 | 124094.800000 | 134042.860000 | 142095.820000 |
Y(ft) | 1963.0 | 87551.832639 | 3210.182762 | 81747.120000 | 84476.470000 | 88549.740000 | 89987.000000 | 93691.550000 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
MD (ft) | 1963.0 | 4673.224657 | 165.718326 | 4228.005683 | 4546.547857 | 4666.802978 | 4796.413228 | 5096.702794 |
Poro(v/v) | 1963.0 | 0.132445 | 0.056066 | 0.000000 | 0.093060 | 0.129367 | 0.173578 | 0.323677 |
Permeability (mD) | 1963.0 | 12.417411 | 33.259635 | 0.000000 | 0.000000 | 0.063276 | 8.568973 | 673.495316 |
RHOB(g/cm3) | 1963.0 | 2.474359 | 0.205946 | 1.991035 | 2.329274 | 2.472083 | 2.580801 | 3.044163 |
NPHI(v/v) | 1963.0 | 0.200208 | 0.107532 | 0.000000 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
KC permeability (mD) | 1963.0 | 12.824067 | 21.218079 | 0.000000 | 0.000000 | 2.208734 | 18.096608 | 238.090327 |
fig, ax = plt.subplots()
ax.scatter(df_perm[~missing_perms]['Poro(v/v)'], df_perm[~missing_perms]['Permeability (mD)'],
color='r', facecolors='none', label='K')
ax.scatter(df_perm[missing_perms]['Poro(v/v)'],
df_perm[missing_perms]['Permeability (mD)'],
color='r', label='K (Imputed)')
ax.set_xlabel('Porosity(v/v)')
ax.set_ylabel('Permeability, mD')
ax.legend(bbox_to_anchor=(1.5, 1), loc='upper right', ncol=1);
#The 'KC permeability, mD' is redundant now, so we'll drop it from the dataframe.
df_perm.drop('KC permeability (mD)', axis=1, inplace=True)
df_imputed['Permeability (mD)'] = df_perm['Permeability (mD)']
df_imputed.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 1963.0 | 124890.354575 | 9775.363008 | 109560.260000 | 115987.290000 | 124094.800000 | 134042.860000 | 142095.820000 |
Y(ft) | 1963.0 | 87551.832639 | 3210.182762 | 81747.120000 | 84476.470000 | 88549.740000 | 89987.000000 | 93691.550000 |
Completion zone | 1963.0 | 0.590932 | 0.491787 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
MD (ft) | 1963.0 | 4673.224657 | 165.718326 | 4228.005683 | 4546.547857 | 4666.802978 | 4796.413228 | 5096.702794 |
Poro(v/v) | 1963.0 | 0.132445 | 0.056066 | 0.000000 | 0.093060 | 0.129367 | 0.173578 | 0.323677 |
Permeability (mD) | 1963.0 | 12.417411 | 33.259635 | 0.000000 | 0.000000 | 0.063276 | 8.568973 | 673.495316 |
RHOB(g/cm3) | 1963.0 | 2.474359 | 0.205946 | 1.991035 | 2.329274 | 2.472083 | 2.580801 | 3.044163 |
NPHI(v/v) | 1963.0 | 0.200208 | 0.107532 | 0.000000 | 0.122878 | 0.186431 | 0.244933 | 1.021738 |
df_imputed['Zone']=well_logs_df['Zone']
df_imputed.head(n=10)
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | RHOB(g/cm3) | NPHI(v/v) | Zone | |
---|---|---|---|---|---|---|---|---|---|
Well_ID | |||||||||
Well_0 | 137106.82 | 83818.47 | 0 | 4597.471739 | 0.057166 | 7.888303 | 2.700700 | 0.000000 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4604.031739 | 0.087413 | 4.408944 | 2.819380 | 0.121538 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4610.591739 | 0.061026 | 1.353763 | 2.798909 | 0.248546 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4617.151739 | 0.059590 | 0.000000 | 2.575396 | 0.142681 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4623.711739 | 0.052043 | 1.461211 | 2.895991 | 0.353600 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4630.271739 | 0.082269 | 1.532611 | 2.831982 | 0.226349 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4636.831739 | 0.055201 | 5.250441 | 2.774357 | 0.133007 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4643.391739 | 0.062989 | 1.049195 | 2.727031 | 0.188662 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4649.951739 | 0.030784 | 11.701884 | 2.926972 | 0.185004 | Upper |
Well_0 | 137106.82 | 83818.47 | 0 | 4656.511739 | 0.067931 | 4.197260 | 2.974074 | 0.317625 | Upper |
df_imputed['Zone']=df_imputed['Zone'].replace(np.nan,0)
df_imputed['Zone'].replace(['Upper','Lower'], [0,1], inplace=True)
df_imputed['mark']=df_imputed['Zone']-df_imputed['Completion zone']
df_mean = df_imputed[df_imputed['mark']==0].copy(deep=True) # make a deep copy of the DataFrame
df_mean.drop(['RHOB(g/cm3)','NPHI(v/v)' ,'Zone', 'mark'], axis=1, inplace=True)
well_summary_df = df_mean.groupby(df_mean.index).mean()
well_summary_df.head()
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | |
---|---|---|---|---|---|---|
Well_ID | ||||||
Well_0 | 137106.82 | 83818.47 | 0.0 | 4659.791739 | 0.074403 | 3.098397 |
Well_1 | 132460.98 | 85832.77 | 0.0 | 4868.900545 | 0.126777 | 13.832289 |
Well_10 | 134042.86 | 82411.27 | 0.0 | 4749.294657 | 0.125236 | 2.528347 |
Well_11 | 115725.49 | 86866.60 | 0.0 | 4628.309023 | 0.085998 | 0.635944 |
Well_12 | 133136.98 | 93691.55 | 1.0 | 4821.390995 | 0.130572 | 2.760318 |
NearestNDInterpolator
to extract the value of AI for all the wells in the reservoirfrom scipy.interpolate import NearestNDInterpolator
#Separete dataset in UPPER and Lower
df = pd.read_csv('Well_Head_and_Completion.csv', index_col='Well_ID')
df_Upper=df[df["Completion"] =="Upper"]
df_Lower=df[df["Completion"] =="Lower"]
df_AI = pd.read_csv('AI.csv')
df_AI.describe()
#Inpute the AI for each well location from the AI MAP 2020
x = df_AI['X(ft)']
y = df_AI['Y(ft)']
AIu= df_AI['AI_upper(2021-12-20)']
X= np.linspace(min(x), max(x))
Y= np.linspace(min(y), max(y))
X, Y= np.meshgrid(X, Y) # 2D grid for interpolation
interp_upper= NearestNDInterpolator(list(zip(x, y)), AIu)
Xu=df_Upper['X(ft)']
Yu=df_Upper['Y(ft)']
AIu = interp_upper(Xu, Yu)
AIl= df_AI['AI_lower(2021-12-20)']
interp = NearestNDInterpolator(list(zip(x, y)), AIl)
Xl=df_Lower['X(ft)']
Yl=df_Lower['Y(ft)']
AIl = interp(Xl, Yl)
Xln=[125500, 134000, 131000]
Yln=[88000, 88000, 87500]
AIln=interp(Xln, Yln)
df_Upper["AI"]=AIu
df_Lower["AI"]=AIl
wells_df1 = pd.concat([df_Lower, df_Upper])
wells_df1 =wells_df1.sort_values(by='Well_ID')
C:\Users\monic\AppData\Local\Temp/ipykernel_40980/1140994684.py:34: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_Upper["AI"]=AIu C:\Users\monic\AppData\Local\Temp/ipykernel_40980/1140994684.py:35: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_Lower["AI"]=AIl
AIln
array([6627.10689892, 6591.07568549, 6249.97858068])
wells_df1.head()
X(ft) | Y(ft) | Completion | Elevation Kelly Bushing (ft) | WOPR (bpd) | Np (bls) | AI | |
---|---|---|---|---|---|---|---|
Well_ID | |||||||
Well_0 | 137106.82 | 83818.47 | Upper | 193.32 | 8.809168 | 14473.503510 | 6527.288321 |
Well_1 | 132460.98 | 85832.77 | Upper | 187.15 | 38.675641 | 30591.819370 | 6523.526063 |
Well_10 | 134042.86 | 82411.27 | Upper | 184.33 | 46.363101 | 48077.967090 | 6473.198759 |
Well_11 | 115725.49 | 86866.60 | Upper | 192.03 | 0.000000 | 0.000000 | 6430.629923 |
Well_12 | 133136.98 | 93691.55 | Lower | 170.35 | 0.664632 | 1759.350823 | 6362.714848 |
wells_df1.drop(['X(ft)', 'Y(ft)', 'Completion', 'Elevation Kelly Bushing (ft)'], axis=1, inplace=True)
wells_df1.head()
WOPR (bpd) | Np (bls) | AI | |
---|---|---|---|
Well_ID | |||
Well_0 | 8.809168 | 14473.503510 | 6527.288321 |
Well_1 | 38.675641 | 30591.819370 | 6523.526063 |
Well_10 | 46.363101 | 48077.967090 | 6473.198759 |
Well_11 | 0.000000 | 0.000000 | 6430.629923 |
Well_12 | 0.664632 | 1759.350823 | 6362.714848 |
wells_df2 = wells_df1.copy(deep=True) # make a deep copy of the DataFrame
# wells_df2.head()
wells_df2 = well_summary_df.merge(wells_df2, how='left',
left_index=True,
right_index=True)
wells_df2.head()
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | WOPR (bpd) | Np (bls) | AI | |
---|---|---|---|---|---|---|---|---|---|
Well_ID | |||||||||
Well_0 | 137106.82 | 83818.47 | 0.0 | 4659.791739 | 0.074403 | 3.098397 | 8.809168 | 14473.503510 | 6527.288321 |
Well_1 | 132460.98 | 85832.77 | 0.0 | 4868.900545 | 0.126777 | 13.832289 | 38.675641 | 30591.819370 | 6523.526063 |
Well_10 | 134042.86 | 82411.27 | 0.0 | 4749.294657 | 0.125236 | 2.528347 | 46.363101 | 48077.967090 | 6473.198759 |
Well_11 | 115725.49 | 86866.60 | 0.0 | 4628.309023 | 0.085998 | 0.635944 | 0.000000 | 0.000000 | 6430.629923 |
Well_12 | 133136.98 | 93691.55 | 1.0 | 4821.390995 | 0.130572 | 2.760318 | 0.664632 | 1759.350823 | 6362.714848 |
wells_df2.to_csv('Wells_feature_ranking.csv')
wells_df3 = pd.read_csv('Wells_feature_ranking.csv', usecols=['Well_ID','X(ft)', 'Y(ft)', 'Completion zone', ' MD (ft)', 'Poro(v/v)', 'Permeability (mD)', 'AI', 'Np (bls)'], index_col='Well_ID')
wells_df3 = wells_df3.reindex(columns=['X(ft)', 'Y(ft)', 'Completion zone', ' MD (ft)', 'Poro(v/v)', 'Permeability (mD)', 'AI', 'Np (bls)'])
wells_df3.head()
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | AI | Np (bls) | |
---|---|---|---|---|---|---|---|---|
Well_ID | ||||||||
Well_0 | 137106.82 | 83818.47 | 0.0 | 4659.791739 | 0.074403 | 3.098397 | 6527.288321 | 14473.503510 |
Well_1 | 132460.98 | 85832.77 | 0.0 | 4868.900545 | 0.126777 | 13.832289 | 6523.526063 | 30591.819370 |
Well_10 | 134042.86 | 82411.27 | 0.0 | 4749.294657 | 0.125236 | 2.528347 | 6473.198759 | 48077.967090 |
Well_11 | 115725.49 | 86866.60 | 0.0 | 4628.309023 | 0.085998 | 0.635944 | 6430.629923 | 0.000000 |
Well_12 | 133136.98 | 93691.55 | 1.0 | 4821.390995 | 0.130572 | 2.760318 | 6362.714848 | 1759.350823 |
def partial_corr(C):
# Returns the sample linear partial correlation coefficients between pairs of variables in C, controlling
# for the remaining variables in C.
# Parameters
# C : array-like, shape (n, p)
# Array with the different variables. Each column of C is taken as a variable
# Returns
# P : array-like, shape (p, p)
# P[i, j] contains the partial correlation of C[:, i] and C[:, j] controlling
# for the remaining variables in C.
C = np.asarray(C)
p = C.shape[1]
P_corr = np.zeros((p, p), dtype=np.float)
for i in range(p):
P_corr[i, i] = 1
for j in range(i+1, p):
idx = np.ones(p, dtype=np.bool)
idx[i] = False
idx[j] = False
beta_i = linalg.lstsq(C[:, idx], C[:, j])[0]
beta_j = linalg.lstsq(C[:, idx], C[:, i])[0]
res_j = C[:, j] - C[:, idx].dot( beta_i)
res_i = C[:, i] - C[:, idx].dot(beta_j)
corr = stats.pearsonr(res_i, res_j)[0]
P_corr[i, j] = corr
P_corr[j, i] = corr
return P_corr
def semipartial_corr(C): # Michael Pyrcz modified the function above by Fabian Pedregosa-Izquierdo, f@bianp.net for semipartial correlation
C = np.asarray(C)
p = C.shape[1]
P_corr = np.zeros((p, p), dtype=np.float)
for i in range(p):
P_corr[i, i] = 1
for j in range(i+1, p):
idx = np.ones(p, dtype=np.bool)
idx[i] = False
idx[j] = False
beta_i = linalg.lstsq(C[:, idx], C[:, j])[0]
res_j = C[:, j] - C[:, idx].dot( beta_i)
res_i = C[:, i] # just use the value, not a residual
corr = stats.pearsonr(res_i, res_j)[0]
P_corr[i, j] = corr
P_corr[j, i] = corr
return P_corr
Partial Correlation
and Semipartial correlation
coefficients to map which features are more important for our predictionpartial_correlation = partial_corr(wells_df3.iloc[:,:8]) # calculate the partial correlation coefficients
partial_correlation = partial_correlation[:,7][:7] # extract a single row and remove production with itself
semipartial_correlation = semipartial_corr(wells_df3.iloc[:,:8]) # calculate the semi-partial correlation coefficients
semipartial_correlation = semipartial_correlation[:,7][:7] # extract a single row and remove production with itself
C:\Users\monic\AppData\Local\Temp/ipykernel_40980/2152434617.py:15: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations P_corr = np.zeros((p, p), dtype=np.float) C:\Users\monic\AppData\Local\Temp/ipykernel_40980/2152434617.py:19: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations idx = np.ones(p, dtype=np.bool) C:\Users\monic\AppData\Local\Temp/ipykernel_40980/2152434617.py:34: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations P_corr = np.zeros((p, p), dtype=np.float) C:\Users\monic\AppData\Local\Temp/ipykernel_40980/2152434617.py:38: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations idx = np.ones(p, dtype=np.bool)
features = wells_df3.columns.values[:][:7]
plt.figure(figsize=(10,5))
plt.subplot(121)
plt.plot(features,partial_correlation,color='black')
plt.plot([0.0,0.0,0.0,0.0,0.0,0.0],'r--',color='red',linewidth = 1.0)
plt.xlabel('Predictor Features')
plt.ylabel('Partial Correlation Coefficient')
t = plt.title('Partial Correlation Coefficient')
plt.ylim(-0.5,0.5)
plt.grid(True)
plt.subplot(122)
plt.plot(features,semipartial_correlation,color='black')
plt.plot([0.0,0.0,0.0,0.0,0.0,0.0],'r--',color='red',linewidth = 1.0)
plt.xlabel('Predictor Features')
plt.ylabel('Semipartial Correlation Coefficient')
t = plt.title('Semipartial Correlation Coefficient')
plt.ylim(-0.5,0.5)
plt.grid(True)
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.6, top=1.2, wspace=0.3, hspace=0.2)
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>
df_AI = pd.read_csv('AI.csv')
df_AI['Difference lower'] = df_AI['AI_lower(2021-12-20)'] - df_AI['AI_lower(2012-01-01)']
df_AI['Difference upper'] = df_AI['AI_upper(2021-12-20)'] - df_AI['AI_upper(2012-01-01)']
df_AI
location_AI = df_AI[['X(ft)', 'Y(ft)']]
df_head_completion = pd.read_csv('Well_Production.csv')
location_well = df_head_completion[['WOPR (bpd)','X(ft)','Y(ft)']]
plt.figure(figsize=(20, 10))
for i, col in enumerate(['AI_lower(2012-01-01)', 'AI_upper(2012-01-01)','Difference lower',
'AI_lower(2021-12-20)', 'AI_upper(2021-12-20)','Difference upper']):
ax = plt.subplot(2, 3, i+1)
ai = df_AI[col]
# Plot heatmap
plt.title(col, fontsize=15)
y = location_AI['X(ft)']
x = location_AI['Y(ft)']
xi = np.linspace(x.min(), x.max(), 1000)
yi = np.linspace(y.min(), y.max(), 1000)
zi = griddata((x.to_numpy(), y.to_numpy()), ai, (xi[None, :],yi[:, None]), method='cubic')
cs = plt.contourf(xi, yi, zi, 25)
for i in cs.collections:
i.set_label('nolegend')
cbar = plt.colorbar()
# Plot the location of the wells
upper = df_head_completion['Completion'].isin(['Upper'])
location_upper = location_well.loc[upper]
location_lower = location_well.loc[~upper]
ax = plt.gca()
if 'upper' in col:
ax.scatter(location_upper['Y(ft)'], location_upper['X(ft)'], location_upper['WOPR (bpd)'],color='b', marker='o', alpha=0.5)
else:
ax.scatter(location_lower['Y(ft)'], location_lower['X(ft)'], location_lower['WOPR (bpd)'], color='r', marker='o', alpha=0.5)
plt.clim(0,7e6)
Y1=np.linspace(x.min(), x.max(), 20000)
X1=-0.29*Y1+154670
ax.plot(Y1,X1,color = 'r')
plt.tight_layout()
plt.show()
# FUNCTIONS WRITTEN BY DR. MICHAEL PYRCZ
def plot_corr(dataframe,size=10): # plots a graphical correlation matrix
corr = dataframe.corr()
fig, ax = plt.subplots(figsize=(size, size))
im = ax.matshow(corr,vmin = -1.0, vmax = 1.0,cmap = plt.cm.bwr)
plt.xticks(range(len(corr.columns)), corr.columns);
plt.yticks(range(len(corr.columns)), corr.columns);
plt.colorbar(im, orientation = 'vertical')
plt.title('Correlation Matrix')
# def visualize_tree(tree,xfeature,xmin,xmax,yfeature,ymin,ymax,response,zmin,zmax,title,):# plots the data points and the decision tree prediction
# n_classes = 10
# xstep = (xmax - xmin)/float(100); ystep = (ymax-ymin)/float(100)
# resp_min = round(min(response)); resp_max = round(max(response));
# xx, yy = np.meshgrid(np.arange(xmin, xmax, xstep),
# np.arange(ymin, ymax, ystep))
# Z = tree.predict(np.c_[xx.ravel(), yy.ravel()])
# Z = Z.reshape(xx.shape)
# cs = plt.contourf(xx, yy, Z, cmap=plt.cm.inferno,vmin=zmin, vmax=zmax, levels = 100)
# im = plt.scatter(xfeature,yfeature,s=None, c=response, marker=None, cmap=plt.cm.inferno, norm=None, vmin=zmin, vmax=zmax, alpha=0.8, linewidths=0.3, edgecolors="white")
# plt.title(title)
# plt.xlabel(xfeature.name)
# plt.ylabel(yfeature.name)
# plt.xlim([xmin,xmax]); plt.ylim([ymin,ymax])
# cbar = plt.colorbar(im, orientation = 'vertical')
# cbar.set_label(response.name, rotation=270, labelpad=20)
def visualize_model(model,xfeature,x_min,x_max,yfeature,y_min,y_max,response,z_min,z_max,title,):# plots the data points and the decision tree prediction
cmap = plt.cm.inferno
xplot_step = (x_max - x_min)/300.0; yplot_step = (y_max - y_min)/300.0 # resolution of the model visualization
xx, yy = np.meshgrid(np.arange(x_min, x_max, xplot_step), # set up the mesh
np.arange(y_min, y_max, yplot_step))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) # predict with our trained model over the mesh
Z = Z.reshape(xx.shape)
plt.scatter(xfeature,yfeature,s=None, c=response, marker=None, cmap=cmap, norm=None, vmin=z_min, vmax=z_max, alpha=0.8, linewidths=0.3, edgecolors="white")
im = plt.imshow(Z,interpolation = None,aspect = "auto",extent = [x_min,x_max,y_min,y_max], vmin = z_min, vmax = z_max,cmap = cmap)
plt.title(title) # add the labels
plt.xlabel(xfeature.name); plt.ylabel(yfeature.name)
plt.xlim([x_min,x_max]); plt.ylim([y_min,y_max])
cbar = plt.colorbar(im, orientation = 'vertical') # add the color bar
cbar.set_label(response.name, rotation=270, labelpad=20)
return Z
def check_model(model,xfeature,yfeature,response,title): # plots the estimated vs. the actual
predict_train = model.predict(np.c_[xfeature,yfeature])
plt.scatter(response,predict_train,s=None, c='red',marker=None, cmap=None, norm=None, vmin=None, vmax=None, alpha=0.8, linewidths=0.3, edgecolors="black")
plt.title(title); plt.xlabel('Actual Production (bls)'); plt.ylabel('Estimated Production (bls)')
plt.xlim(0,3100); plt.ylim(0,3100)
plt.arrow(0,0,3100,3100,width=0.02,color='black',head_length=0.0,head_width=0.0)
MSE = metrics.mean_squared_error(response,predict_train)
Var_Explained = metrics.explained_variance_score(response,predict_train)
#cor = math.sqrt(metrics.r2_score(response,predict_train))
cor = (metrics.r2_score(response,predict_train))**0.5
print('Mean Squared Error on Testing = ', round(MSE,2),', Correlation Coefficient = ', round(cor,2))
def tree_to_code(tree, feature_names): # code from StackOverFlow by paulkernfeld
tree_ = tree.tree_ # convert tree object to portable code to use anywhere
feature_name = [
feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
for i in tree_.feature
]
print("def tree({}):".format(", ".join(feature_names)))
def recurse(node, depth):
indent = " " * depth
if tree_.feature[node] != _tree.TREE_UNDEFINED:
name = feature_name[node]
threshold = tree_.threshold[node]
print("{}if {} <= {}:".format(indent, name, threshold))
recurse(tree_.children_left[node], depth + 1)
print("{}elif {} > {}".format(indent, name, threshold))
recurse(tree_.children_right[node], depth + 1)
else:
print("{}return {}".format(indent, tree_.value[node]))
recurse(0, 1)
def get_lineage(tree, feature_names): # code from StackOverFlow by Zelanzny7
left = tree.tree_.children_left # track the decision path for any set of inputs
right = tree.tree_.children_right
threshold = tree.tree_.threshold
features = [feature_names[i] for i in tree.tree_.feature]
# get ids of child nodes
idx = np.argwhere(left == -1)[:,0]
def recurse(left, right, child, lineage=None):
if lineage is None:
lineage = [child]
if child in left:
parent = np.where(left == child)[0].item()
split = 'l'
else:
parent = np.where(right == child)[0].item()
split = 'r'
lineage.append((parent, split, threshold[parent], features[parent]))
if parent == 0:
lineage.reverse()
return lineage
else:
return recurse(left, right, parent, lineage)
for child in idx:
for node in recurse(left, right, child):
print(node)
wells_df4 = pd.read_csv('Wells_feature_ranking.csv', usecols=['Well_ID','X(ft)', 'Y(ft)', 'Completion zone', ' MD (ft)', 'Poro(v/v)', 'Permeability (mD)', 'AI', 'WOPR (bpd)'], index_col='Well_ID')
wells_df4 = wells_df4.reindex(columns=['X(ft)', 'Y(ft)', 'Completion zone', ' MD (ft)', 'Poro(v/v)', 'Permeability (mD)', 'AI', 'WOPR (bpd)'])
wells_df4.head()
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | AI | WOPR (bpd) | |
---|---|---|---|---|---|---|---|---|
Well_ID | ||||||||
Well_0 | 137106.82 | 83818.47 | 0.0 | 4659.791739 | 0.074403 | 3.098397 | 6527.288321 | 8.809168 |
Well_1 | 132460.98 | 85832.77 | 0.0 | 4868.900545 | 0.126777 | 13.832289 | 6523.526063 | 38.675641 |
Well_10 | 134042.86 | 82411.27 | 0.0 | 4749.294657 | 0.125236 | 2.528347 | 6473.198759 | 46.363101 |
Well_11 | 115725.49 | 86866.60 | 0.0 | 4628.309023 | 0.085998 | 0.635944 | 6430.629923 | 0.000000 |
Well_12 | 133136.98 | 93691.55 | 1.0 | 4821.390995 | 0.130572 | 2.760318 | 6362.714848 | 0.664632 |
wells_df4.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 50.0 | 124960.021054 | 9922.015780 | 109560.260000 | 115954.590000 | 124027.505000 | 133954.012500 | 142095.820000 |
Y(ft) | 50.0 | 87617.655277 | 3297.936805 | 81792.973846 | 84687.942500 | 87780.815000 | 90481.025000 | 93691.550000 |
Completion zone | 50.0 | 0.597692 | 0.493235 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
MD (ft) | 50.0 | 4684.640791 | 152.593498 | 4293.605683 | 4578.519247 | 4680.457700 | 4793.418005 | 4978.818183 |
Poro(v/v) | 50.0 | 0.134331 | 0.035873 | 0.074403 | 0.108302 | 0.133547 | 0.154945 | 0.221956 |
Permeability (mD) | 50.0 | 14.957861 | 20.271745 | 0.000000 | 2.615153 | 7.079680 | 17.683856 | 89.126393 |
AI | 50.0 | 6552.812214 | 79.439447 | 6362.714848 | 6495.933803 | 6545.065779 | 6622.551235 | 6677.045989 |
WOPR (bpd) | 50.0 | 514.362284 | 686.271526 | 0.000000 | 8.873609 | 194.174395 | 1007.989810 | 3060.205978 |
# Select 2 predictor features here:
predictor_features = ['Poro(v/v)','AI'] # for the first demonstration run we will use porosity and brittleness
response_feature = 'WOPR (bpd)'
# msk = np.random.rand(len(df)) < 0.8 #Select at random 80% of the data for training
pindex = np.argwhere(wells_df4.columns.isin(predictor_features)).ravel()
print('Selected predictor features: \n0: ' + predictor_features[0] + ', index = ' + str(pindex[0]) + '.')
print('1: ' + predictor_features[1] + ', index = ' + str(pindex[1]) + '.')
rindex = np.argwhere(wells_df4.columns.isin([response_feature])).ravel()
print('\nSelected response feature: \n' + response_feature + ', index = ' + str(rindex[0]) + '.')
#my_data_cut = my_data.iloc[0:500,:]
X = wells_df4[predictor_features] # extract the 2 selected response features, 500 samples to a X array
y = wells_df4[[response_feature]]
Selected predictor features: 0: Poro(v/v), index = 4. 1: AI, index = 6. Selected response feature: WOPR (bpd), index = 7.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=73073)
n_train = len(X_train)
n_test = len(X_test)
print('Number of training ' + str(n_train) + ', number of test ' + str(n_test))
X_all = wells_df4[predictor_features]; y_all = wells_df4[[response_feature]] # training and testing together
Number of training 35, number of test 15
WOPRmin = 0.0; WOPRmax = 3100
pormin = 0.0; pormax = 0.3
AImin = 6000 ; AImax = 7000
max_leaf_nodes = 10
my_tree = tree.DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes)
#my_tree = tree.DecisionTreeRegressor(max_depth = 3, min_samples_leaf=5)
my_tree = my_tree.fit(X_train, y_train)
plt.subplot(131)
visualize_model(my_tree,X_train["Poro(v/v)"],pormin,pormax,X_train["AI"],AImin,AImax,y_train["WOPR (bpd)"],WOPRmin,WOPRmax,'Training Data and Decision Tree Model')
plt.subplot(132)
visualize_model(my_tree,X_test["Poro(v/v)"],pormin,pormax,X_test["AI"],AImin,AImax,y_test["WOPR (bpd)"],WOPRmin,WOPRmax,'Testing Data and Decision Tree Model')
plt.subplot(133)
check_model(my_tree,X_test["Poro(v/v)"],X_test["AI"],y_test["WOPR (bpd)"],'Decision Tree Prediction Accuracy')
plt.subplots_adjust(left=0.0, bottom=0.0, right=3.0, top=1.2, wspace=0.2, hspace=0.2)
plt.show()
C:\Users\monic\AppData\Local\Temp/ipykernel_40980/734566587.py:53: RuntimeWarning: invalid value encountered in double_scalars cor = (metrics.r2_score(response,predict_train))**0.5
Mean Squared Error on Testing = 946094.53 , Correlation Coefficient = nan
trees = []
error = []
var_exp = []
nodes = []
inodes = 2
while inodes < 375:
my_tree = tree.DecisionTreeRegressor(min_samples_leaf=1,max_leaf_nodes=inodes)
my_tree = my_tree.fit(X_train, y_train)
trees.append(my_tree)
#visualize_tree(my_tree,X_train["Por"],X_train["Brittle"],y_train["Prod"],'Training Data and Decision Tree Model, Nodes = ' + str(inodes) )
predict_train = my_tree.predict(np.c_[X_test["Poro(v/v)"],X_test["AI"]])
error.append(metrics.mean_squared_error(y_test["WOPR (bpd)"],predict_train))
var_exp.append(metrics.explained_variance_score(y_test["WOPR (bpd)"],predict_train))
all_nodes = my_tree.tree_.node_count
decision_nodes = len([x for x in my_tree.tree_.feature if x != _tree.TREE_UNDEFINED])
terminal_nodes = all_nodes - decision_nodes
nodes.append(terminal_nodes)
inodes+=1
plt.figure(figsize=(8,6))
plt.scatter(nodes,error,s=None, c='red', marker=None, cmap=None, norm=None, vmin=None, vmax=None, alpha=0.8, linewidths=0.3, edgecolors="black")
# plt.arrow(27,0,0,1500000,width=0.5,color='red',head_length=0.0,head_width=0.0)
plt.title('Decision Tree Cross Validation Testing Error vs. Complexity'); plt.xlabel('Number of Terminal Nodes'); plt.ylabel('Mean Square Error')
#plt.xlim(0,375); plt.ylim(0,1500000)
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.5, top=0.6, wspace=0.2, hspace=0.2)
plt.show()
my_pruned_tree = tree.DecisionTreeRegressor(min_samples_leaf=1,max_leaf_nodes=10)
my_pruned_tree = my_pruned_tree.fit(X_all, y_all)
plt.subplot(121)
visualize_model(my_pruned_tree,X_all["Poro(v/v)"],pormin,pormax,X_all["AI"],AImin,AImax,y_all["WOPR (bpd)"],WOPRmin,WOPRmax,'All Data and Decision Tree Model')
plt.subplot(122)
check_model(my_pruned_tree,X_all["Poro(v/v)"],X_all["AI"],y_all["WOPR (bpd)"],'Decision Tree All Data Prediction Accuracy')
plt.subplots_adjust(left=0.0, bottom=0.0, right=2.0, top=1.2, wspace=0.2, hspace=0.2)
plt.show()
Mean Squared Error on Testing = 37211.51 , Correlation Coefficient = 0.96
88000,125500
88000,134000
87500,131000
wells_df5 = wells_df4.copy()
wells_df5.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
X(ft) | 50.0 | 124960.021054 | 9922.015780 | 109560.260000 | 115954.590000 | 124027.505000 | 133954.012500 | 142095.820000 |
Y(ft) | 50.0 | 87617.655277 | 3297.936805 | 81792.973846 | 84687.942500 | 87780.815000 | 90481.025000 | 93691.550000 |
Completion zone | 50.0 | 0.597692 | 0.493235 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
MD (ft) | 50.0 | 4684.640791 | 152.593498 | 4293.605683 | 4578.519247 | 4680.457700 | 4793.418005 | 4978.818183 |
Poro(v/v) | 50.0 | 0.134331 | 0.035873 | 0.074403 | 0.108302 | 0.133547 | 0.154945 | 0.221956 |
Permeability (mD) | 50.0 | 14.957861 | 20.271745 | 0.000000 | 2.615153 | 7.079680 | 17.683856 | 89.126393 |
AI | 50.0 | 6552.812214 | 79.439447 | 6362.714848 | 6495.933803 | 6545.065779 | 6622.551235 | 6677.045989 |
WOPR (bpd) | 50.0 | 514.362284 | 686.271526 | 0.000000 | 8.873609 | 194.174395 | 1007.989810 | 3060.205978 |
new_wells_df = pd.read_csv('New_wells.csv', index_col='Well_ID')
new_wells_df.head()
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | AI | WOPR (bpd) | |
---|---|---|---|---|---|---|---|---|
Well_ID | ||||||||
Well_50 | 125500.0 | 88000.0 | NaN | NaN | NaN | NaN | 6627.106899 | NaN |
Well_51 | 134000.0 | 88000.0 | NaN | NaN | NaN | NaN | 6591.075685 | NaN |
Well_52 | 131000.0 | 87500.0 | NaN | NaN | NaN | NaN | 6609.567000 | NaN |
NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
all_wells_df = pd.concat([wells_df5, new_wells_df])
all_wells_df.head(53)
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | AI | WOPR (bpd) | |
---|---|---|---|---|---|---|---|---|
Well_ID | ||||||||
Well_0 | 137106.820000 | 83818.470000 | 0.000000 | 4659.791739 | 0.074403 | 3.098397 | 6527.288321 | 8.809168 |
Well_1 | 132460.980000 | 85832.770000 | 0.000000 | 4868.900545 | 0.126777 | 13.832289 | 6523.526063 | 38.675641 |
Well_10 | 134042.860000 | 82411.270000 | 0.000000 | 4749.294657 | 0.125236 | 2.528347 | 6473.198759 | 46.363101 |
Well_11 | 115725.490000 | 86866.600000 | 0.000000 | 4628.309023 | 0.085998 | 0.635944 | 6430.629923 | 0.000000 |
Well_12 | 133136.980000 | 93691.550000 | 1.000000 | 4821.390995 | 0.130572 | 2.760318 | 6362.714848 | 0.664632 |
Well_13 | 119311.270000 | 89234.020000 | 0.000000 | 4456.551055 | 0.107536 | 0.164326 | 6535.346304 | 22.342583 |
Well_14 | 134837.260000 | 83865.820000 | 1.000000 | 4850.482978 | 0.144893 | 6.240244 | 6602.638484 | 381.194628 |
Well_15 | 109560.260000 | 86833.620000 | 1.000000 | 4478.001183 | 0.186269 | 77.964122 | 6592.567556 | 3060.205978 |
Well_16 | 112898.580000 | 91601.460000 | 1.000000 | 4628.528038 | 0.152577 | 14.217419 | 6597.023095 | 1221.216449 |
Well_17 | 112766.460000 | 88549.740000 | 1.000000 | 4676.039989 | 0.191878 | 47.259957 | 6655.082291 | 1367.727037 |
Well_18 | 117798.780000 | 90645.700000 | 0.000000 | 4476.766377 | 0.155699 | 18.402178 | 6535.257654 | 291.086371 |
Well_19 | 110189.220000 | 86209.710000 | 0.000000 | 4293.605683 | 0.084328 | 0.555241 | 6468.760916 | 7.082097 |
Well_2 | 133634.000000 | 82144.520000 | 0.000000 | 4818.221905 | 0.106651 | 9.425665 | 6454.475167 | 5.276535 |
Well_20 | 133687.470000 | 83554.440000 | 0.000000 | 4756.330722 | 0.096216 | 1.961186 | 6493.089944 | 39.545986 |
Well_21 | 121120.490000 | 87804.640000 | 0.000000 | 4465.073771 | 0.135529 | 5.875124 | 6504.434349 | 1.579448 |
Well_22 | 115987.290000 | 91854.910000 | 0.000000 | 4520.571112 | 0.117984 | 2.151969 | 6501.698982 | 13.159516 |
Well_23 | 139326.010000 | 84776.420000 | 0.000000 | 4553.628966 | 0.075356 | 2.784146 | 6469.654446 | 6.300324 |
Well_24 | 142095.820000 | 86042.020000 | 1.000000 | 4622.041710 | 0.122814 | 2.372382 | 6514.933678 | 90.374283 |
Well_25 | 128136.210000 | 89359.950000 | 1.000000 | 4978.818183 | 0.165565 | 19.636225 | 6633.693628 | 1328.452893 |
Well_26 | 118170.110000 | 89488.690000 | 1.000000 | 4595.273994 | 0.165111 | 9.080861 | 6661.780730 | 439.336342 |
Well_27 | 129829.390000 | 87756.990000 | 0.000000 | 4873.662794 | 0.074865 | 2.598431 | 6530.400862 | 9.066930 |
Well_28 | 120522.780000 | 89987.000000 | 0.000000 | 4521.890000 | 0.124638 | 7.008888 | 6528.779268 | 131.235255 |
Well_29 | 138541.420000 | 82281.790000 | 1.000000 | 4931.744256 | 0.146598 | 5.790967 | 6588.430241 | 859.540420 |
Well_3 | 140055.042692 | 81792.973846 | 0.884615 | 4926.308646 | 0.132372 | 3.707587 | 6586.112456 | 1606.279426 |
Well_30 | 131123.520000 | 84356.270000 | 0.000000 | 4785.126016 | 0.086982 | 0.877057 | 6436.910693 | 1.125125 |
Well_31 | 136285.140000 | 86479.950000 | 1.000000 | 4726.807521 | 0.167507 | 50.769417 | 6611.436361 | 460.674843 |
Well_32 | 128675.170000 | 89882.130000 | 1.000000 | 4960.307323 | 0.152685 | 29.536470 | 6630.297019 | 1618.477384 |
Well_33 | 123960.210000 | 85279.280000 | 1.000000 | 4566.676094 | 0.140795 | 2.665318 | 6424.713817 | 0.225082 |
Well_34 | 134053.710000 | 84658.450000 | 1.000000 | 4870.701505 | 0.128982 | 3.574510 | 6613.128042 | 1475.869991 |
Well_35 | 122230.810000 | 87539.370000 | 0.000000 | 4513.631544 | 0.126099 | 17.987098 | 6494.525521 | 2.201824 |
Well_36 | 111749.780000 | 91050.010000 | 1.000000 | 4590.757188 | 0.135747 | 9.185677 | 6591.681278 | 376.967843 |
Well_37 | 134902.250000 | 82632.620000 | 1.000000 | 4873.768495 | 0.141375 | 7.902247 | 6548.179020 | 377.621596 |
Well_38 | 116803.310000 | 91177.750000 | 1.000000 | 4633.727511 | 0.146293 | 16.774130 | 6647.429659 | 932.002925 |
Well_39 | 111890.880000 | 91708.680000 | 0.000000 | 4470.558722 | 0.110601 | 13.424790 | 6469.378107 | 13.290233 |
Well_4 | 109951.430000 | 89343.450000 | 1.000000 | 4579.069384 | 0.140607 | 7.591664 | 6611.813169 | 257.113535 |
Well_40 | 126601.300000 | 90679.210000 | 0.000000 | 4768.470967 | 0.102575 | 0.000000 | 6541.952538 | 49.518124 |
Well_41 | 114333.010000 | 91616.660000 | 1.000000 | 4666.308895 | 0.134721 | 9.812580 | 6623.273759 | 1056.826177 |
Well_42 | 136532.270000 | 83571.510000 | 1.000000 | 4810.877539 | 0.164032 | 12.738064 | 6620.383663 | 1146.619302 |
Well_43 | 115684.270000 | 89895.780000 | 1.000000 | 4661.095011 | 0.187194 | 27.178719 | 6677.045989 | 2175.493437 |
Well_44 | 118629.900000 | 88703.870000 | 1.000000 | 4578.335868 | 0.221956 | 89.126393 | 6636.265853 | 444.719213 |
Well_45 | 129176.300000 | 93171.140000 | 0.000000 | 4707.038233 | 0.075731 | 0.746445 | 6390.293204 | 0.142503 |
Well_46 | 139046.810000 | 84476.470000 | 1.000000 | 4708.287050 | 0.204351 | 57.889852 | 6649.857918 | 1249.728244 |
Well_47 | 137407.150000 | 84099.130000 | 1.000000 | 4767.704218 | 0.182678 | 39.694131 | 6642.461267 | 1215.319622 |
Well_48 | 124094.800000 | 90648.760000 | 1.000000 | 4768.066572 | 0.192294 | 51.663030 | 6653.786210 | 464.882440 |
Well_49 | 123188.650000 | 88556.290000 | 1.000000 | 4723.025829 | 0.157595 | 19.898726 | 6627.454609 | 351.281916 |
Well_5 | 123950.950000 | 92227.310000 | 1.000000 | 4796.182001 | 0.117896 | 7.150473 | 6582.968019 | 13.723622 |
Well_6 | 111035.320000 | 88692.320000 | 1.000000 | 4589.207677 | 0.145377 | 6.015432 | 6647.156891 | 1033.318772 |
Well_7 | 112439.690000 | 85560.850000 | 1.000000 | 4549.285528 | 0.102716 | 4.006895 | 6486.432395 | 5.925703 |
Well_8 | 127369.740000 | 91739.990000 | 0.000000 | 4730.919132 | 0.088633 | 1.631700 | 6500.158650 | 13.095474 |
Well_9 | 115943.690000 | 86730.440000 | 1.000000 | 4684.875411 | 0.131267 | 0.000000 | 6510.109098 | 6.434229 |
Well_50 | 125500.000000 | 88000.000000 | NaN | NaN | NaN | NaN | 6627.106899 | NaN |
Well_51 | 134000.000000 | 88000.000000 | NaN | NaN | NaN | NaN | 6591.075685 | NaN |
Well_52 | 131000.000000 | 87500.000000 | NaN | NaN | NaN | NaN | 6609.567000 | NaN |
df_knn = all_wells_df.copy(deep=True) # make a deep copy of the DataFrame
knn_imputer = KNNImputer(n_neighbors=4, weights="distance")
df_knn.iloc[:,:] = knn_imputer.fit_transform(df_knn)
all_wells_df['Poro(v/v)'] = df_knn['Poro(v/v)']
all_wells_df['Completion zone'] = df_knn['Completion zone']
all_wells_df[' MD (ft)'] = df_knn[' MD (ft)']
all_wells_df['Permeability (mD) '] = df_knn['Permeability (mD)']
all_wells_df['WOPR (bpd)'] = df_knn['WOPR (bpd)']
# all_wells_df.describe().T
all_wells_df.head(53)
X(ft) | Y(ft) | Completion zone | MD (ft) | Poro(v/v) | Permeability (mD) | AI | WOPR (bpd) | Permeability (mD)\t | |
---|---|---|---|---|---|---|---|---|---|
Well_ID | |||||||||
Well_0 | 137106.820000 | 83818.470000 | 0.000000 | 4659.791739 | 0.074403 | 3.098397 | 6527.288321 | 8.809168 | 3.098397 |
Well_1 | 132460.980000 | 85832.770000 | 0.000000 | 4868.900545 | 0.126777 | 13.832289 | 6523.526063 | 38.675641 | 13.832289 |
Well_10 | 134042.860000 | 82411.270000 | 0.000000 | 4749.294657 | 0.125236 | 2.528347 | 6473.198759 | 46.363101 | 2.528347 |
Well_11 | 115725.490000 | 86866.600000 | 0.000000 | 4628.309023 | 0.085998 | 0.635944 | 6430.629923 | 0.000000 | 0.635944 |
Well_12 | 133136.980000 | 93691.550000 | 1.000000 | 4821.390995 | 0.130572 | 2.760318 | 6362.714848 | 0.664632 | 2.760318 |
Well_13 | 119311.270000 | 89234.020000 | 0.000000 | 4456.551055 | 0.107536 | 0.164326 | 6535.346304 | 22.342583 | 0.164326 |
Well_14 | 134837.260000 | 83865.820000 | 1.000000 | 4850.482978 | 0.144893 | 6.240244 | 6602.638484 | 381.194628 | 6.240244 |
Well_15 | 109560.260000 | 86833.620000 | 1.000000 | 4478.001183 | 0.186269 | 77.964122 | 6592.567556 | 3060.205978 | 77.964122 |
Well_16 | 112898.580000 | 91601.460000 | 1.000000 | 4628.528038 | 0.152577 | 14.217419 | 6597.023095 | 1221.216449 | 14.217419 |
Well_17 | 112766.460000 | 88549.740000 | 1.000000 | 4676.039989 | 0.191878 | 47.259957 | 6655.082291 | 1367.727037 | 47.259957 |
Well_18 | 117798.780000 | 90645.700000 | 0.000000 | 4476.766377 | 0.155699 | 18.402178 | 6535.257654 | 291.086371 | 18.402178 |
Well_19 | 110189.220000 | 86209.710000 | 0.000000 | 4293.605683 | 0.084328 | 0.555241 | 6468.760916 | 7.082097 | 0.555241 |
Well_2 | 133634.000000 | 82144.520000 | 0.000000 | 4818.221905 | 0.106651 | 9.425665 | 6454.475167 | 5.276535 | 9.425665 |
Well_20 | 133687.470000 | 83554.440000 | 0.000000 | 4756.330722 | 0.096216 | 1.961186 | 6493.089944 | 39.545986 | 1.961186 |
Well_21 | 121120.490000 | 87804.640000 | 0.000000 | 4465.073771 | 0.135529 | 5.875124 | 6504.434349 | 1.579448 | 5.875124 |
Well_22 | 115987.290000 | 91854.910000 | 0.000000 | 4520.571112 | 0.117984 | 2.151969 | 6501.698982 | 13.159516 | 2.151969 |
Well_23 | 139326.010000 | 84776.420000 | 0.000000 | 4553.628966 | 0.075356 | 2.784146 | 6469.654446 | 6.300324 | 2.784146 |
Well_24 | 142095.820000 | 86042.020000 | 1.000000 | 4622.041710 | 0.122814 | 2.372382 | 6514.933678 | 90.374283 | 2.372382 |
Well_25 | 128136.210000 | 89359.950000 | 1.000000 | 4978.818183 | 0.165565 | 19.636225 | 6633.693628 | 1328.452893 | 19.636225 |
Well_26 | 118170.110000 | 89488.690000 | 1.000000 | 4595.273994 | 0.165111 | 9.080861 | 6661.780730 | 439.336342 | 9.080861 |
Well_27 | 129829.390000 | 87756.990000 | 0.000000 | 4873.662794 | 0.074865 | 2.598431 | 6530.400862 | 9.066930 | 2.598431 |
Well_28 | 120522.780000 | 89987.000000 | 0.000000 | 4521.890000 | 0.124638 | 7.008888 | 6528.779268 | 131.235255 | 7.008888 |
Well_29 | 138541.420000 | 82281.790000 | 1.000000 | 4931.744256 | 0.146598 | 5.790967 | 6588.430241 | 859.540420 | 5.790967 |
Well_3 | 140055.042692 | 81792.973846 | 0.884615 | 4926.308646 | 0.132372 | 3.707587 | 6586.112456 | 1606.279426 | 3.707587 |
Well_30 | 131123.520000 | 84356.270000 | 0.000000 | 4785.126016 | 0.086982 | 0.877057 | 6436.910693 | 1.125125 | 0.877057 |
Well_31 | 136285.140000 | 86479.950000 | 1.000000 | 4726.807521 | 0.167507 | 50.769417 | 6611.436361 | 460.674843 | 50.769417 |
Well_32 | 128675.170000 | 89882.130000 | 1.000000 | 4960.307323 | 0.152685 | 29.536470 | 6630.297019 | 1618.477384 | 29.536470 |
Well_33 | 123960.210000 | 85279.280000 | 1.000000 | 4566.676094 | 0.140795 | 2.665318 | 6424.713817 | 0.225082 | 2.665318 |
Well_34 | 134053.710000 | 84658.450000 | 1.000000 | 4870.701505 | 0.128982 | 3.574510 | 6613.128042 | 1475.869991 | 3.574510 |
Well_35 | 122230.810000 | 87539.370000 | 0.000000 | 4513.631544 | 0.126099 | 17.987098 | 6494.525521 | 2.201824 | 17.987098 |
Well_36 | 111749.780000 | 91050.010000 | 1.000000 | 4590.757188 | 0.135747 | 9.185677 | 6591.681278 | 376.967843 | 9.185677 |
Well_37 | 134902.250000 | 82632.620000 | 1.000000 | 4873.768495 | 0.141375 | 7.902247 | 6548.179020 | 377.621596 | 7.902247 |
Well_38 | 116803.310000 | 91177.750000 | 1.000000 | 4633.727511 | 0.146293 | 16.774130 | 6647.429659 | 932.002925 | 16.774130 |
Well_39 | 111890.880000 | 91708.680000 | 0.000000 | 4470.558722 | 0.110601 | 13.424790 | 6469.378107 | 13.290233 | 13.424790 |
Well_4 | 109951.430000 | 89343.450000 | 1.000000 | 4579.069384 | 0.140607 | 7.591664 | 6611.813169 | 257.113535 | 7.591664 |
Well_40 | 126601.300000 | 90679.210000 | 0.000000 | 4768.470967 | 0.102575 | 0.000000 | 6541.952538 | 49.518124 | 0.000000 |
Well_41 | 114333.010000 | 91616.660000 | 1.000000 | 4666.308895 | 0.134721 | 9.812580 | 6623.273759 | 1056.826177 | 9.812580 |
Well_42 | 136532.270000 | 83571.510000 | 1.000000 | 4810.877539 | 0.164032 | 12.738064 | 6620.383663 | 1146.619302 | 12.738064 |
Well_43 | 115684.270000 | 89895.780000 | 1.000000 | 4661.095011 | 0.187194 | 27.178719 | 6677.045989 | 2175.493437 | 27.178719 |
Well_44 | 118629.900000 | 88703.870000 | 1.000000 | 4578.335868 | 0.221956 | 89.126393 | 6636.265853 | 444.719213 | 89.126393 |
Well_45 | 129176.300000 | 93171.140000 | 0.000000 | 4707.038233 | 0.075731 | 0.746445 | 6390.293204 | 0.142503 | 0.746445 |
Well_46 | 139046.810000 | 84476.470000 | 1.000000 | 4708.287050 | 0.204351 | 57.889852 | 6649.857918 | 1249.728244 | 57.889852 |
Well_47 | 137407.150000 | 84099.130000 | 1.000000 | 4767.704218 | 0.182678 | 39.694131 | 6642.461267 | 1215.319622 | 39.694131 |
Well_48 | 124094.800000 | 90648.760000 | 1.000000 | 4768.066572 | 0.192294 | 51.663030 | 6653.786210 | 464.882440 | 51.663030 |
Well_49 | 123188.650000 | 88556.290000 | 1.000000 | 4723.025829 | 0.157595 | 19.898726 | 6627.454609 | 351.281916 | 19.898726 |
Well_5 | 123950.950000 | 92227.310000 | 1.000000 | 4796.182001 | 0.117896 | 7.150473 | 6582.968019 | 13.723622 | 7.150473 |
Well_6 | 111035.320000 | 88692.320000 | 1.000000 | 4589.207677 | 0.145377 | 6.015432 | 6647.156891 | 1033.318772 | 6.015432 |
Well_7 | 112439.690000 | 85560.850000 | 1.000000 | 4549.285528 | 0.102716 | 4.006895 | 6486.432395 | 5.925703 | 4.006895 |
Well_8 | 127369.740000 | 91739.990000 | 0.000000 | 4730.919132 | 0.088633 | 1.631700 | 6500.158650 | 13.095474 | 1.631700 |
Well_9 | 115943.690000 | 86730.440000 | 1.000000 | 4684.875411 | 0.131267 | 0.000000 | 6510.109098 | 6.434229 | 0.000000 |
Well_50 | 125500.000000 | 88000.000000 | 0.759755 | 4804.438564 | 0.154304 | NaN | 6627.106899 | 534.512439 | 22.431840 |
Well_51 | 134000.000000 | 88000.000000 | 0.518819 | 4829.734623 | 0.129182 | NaN | 6591.075685 | 489.577414 | 19.852680 |
Well_52 | 131000.000000 | 87500.000000 | 0.158024 | 4871.445386 | 0.101494 | NaN | 6609.567000 | 269.086371 | 9.231506 |
from sklearn import tree # import decision tree from scikit-learn
# my_data = pd.read_csv(r"https://raw.githubusercontent.com/GeostatsGuy/GeoDataSets/master/unconv_MV.csv") # load subsurface data table
my_tree = tree.DecisionTreeRegressor(min_samples_leaf=1, max_depth = 10) # instantiate tree with hyperparameters
# my_tree = my_tree.fit(all_wells_df[['Poro(v/v)','AI']],all_wells_df['WOPR (bpd)']) # train tree with training data
my_tree = my_tree.fit(wells_df5[['Poro(v/v)','AI']],wells_df5['WOPR (bpd)']) # train tree with training data
est_prod1 = my_tree.predict([[0.130085,6627.106899]])[0] # make a prediction (no tuning shown)
est_prod2 = my_tree.predict([[0.147142,6591.075685]])[0] # make a prediction (no tuning shown)
est_prod3 = my_tree.predict([[0.116,6690]])[0] # make a prediction (no tuning shown)
print('Well 50 - Estimated production for Porosity and AI is ' + str(round(est_prod1,1)) + ' (bpd).')
print('Well 51 - Estimated production for Porosity and AI is ' + str(round(est_prod2,1)) + ' (bpd).')
print('Well 52 - Estimated production for Porosity and AI is ' + str(round(est_prod3,1)) + ' (bpd).')
Well 50 - Estimated production for Porosity and AI is 1475.9 (bpd). Well 51 - Estimated production for Porosity and AI is 859.5 (bpd). Well 52 - Estimated production for Porosity and AI is 1475.9 (bpd).
Total_prod_well_50 = 1475.9*365*2
Total_prod_well_51 = 859.5*365*2
Total_prod_well_52 = 1475.9*365*2
print(Total_prod_well_50, '(bls)')
print(Total_prod_well_51, '(bls)')
print(Total_prod_well_52, '(bls)')
1077407.0 (bls) 627435.0 (bls) 1077407.0 (bls)