import geopandas as gpd
import numpy as np
from tempfile import NamedTemporaryFile
from pyspatialml import Raster
import pyspatialml.datasets.meuse as ms
import matplotlib as mpl
import matplotlib.pyplot as plt
Incorporating Spatial Autocorrelation into Spatial Predictions
Similarly to example 1, we are using the meuse dataset again to perform a multi-target prediction of soil properties using a regression model. However, in this case we will attempt to account for spatial autocorrelation in the model directly by generating new features that are based on the distance-weighted means of surrounding spatial locations.
Preparing the Raster Predictors
Import the raster predictors from the pyspatialml.datasets.meuse
module:
= ms.predictors
predictor_files = ms.meuse
training_pts_file = Raster(predictor_files)
stack 'ffreq') stack.drop(
Raster Object Containing 11 Layers
attribute values
0 names [chnl_dist, dem, dist, landimg2, landimg3, lan...
1 files [/Users/stevenpawley/GitHub/Pyspatialml/pyspat...
2 rows 104
3 cols 78
4 res (40.0, 40.0)
5 nodatavals [-99999.0, -99999.0, -1.0, -1.0, -1.0, -1.0, -...
In order to generate new features from surrounding spatial locations, we need their x,y coordinates, which will will add to the stack of the raster predictors using the pyspatialml.preprocessing.xy_coordinates
function:
from pyspatialml.preprocessing import xy_coordinates
= xy_coordinates(stack.iloc[0], NamedTemporaryFile(suffix=".tif").name)
xy_layers = stack.append(xy_layers, in_place=False) stack
Quickly plot the raster predictors:
'seaborn-v0_8')
mpl.style.use(= stack.plot(figsize=(9, 7))
axs = axs.flatten()[10]
ax = ax.images
im 0].colorbar.set_ticks([1,2,3])
im[= axs.flatten()[8]
ax ='x', labelrotation=65)
ax.tick_params(axis
plt.tight_layout() plt.show()
Extract the Training Data
Spatially query the raster predictors at the training point locations:
= gpd.read_file(training_pts_file)
training_pts = stack.extract_vector(gdf=training_pts)
training_df
= training_df.index.get_level_values("geometry_idx")
training_df.index = training_df.merge(
training_df "lead", "cadmium", "copper", "zinc", "om")],
training_pts.loc[:, (=True,
left_index=True
right_index
) = training_df.dropna() training_df
Split the response/target variables from the predictors:
= training_df.loc[:, stack.names].values
X = training_df.loc[:, ['lead', 'cadmium', 'copper', 'zinc', 'om']].values y
Develop a Spatially-Lagged Machine Learning Model
As well as using the ExtraTreeRegressor model which was also used in example 1, here we will use the custom pyspatialml.estimators.SpatialLagRegressor
metalearner class to wrap the extratrees regressor into a model that adds a new feature based on the distance-weighted mean of spatially-proximal observations:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from pyspatialml.transformers import KNNTransformer
from sklearn.model_selection import cross_validate, KFold
from sklearn.model_selection import GridSearchCV
# define regressor
= ExtraTreesRegressor(n_estimators=500, n_jobs=-1, random_state=1234)
et
= list(stack.names).index("soil")
soil_index = [list(stack.names).index(i) for i in ["x_coordinates", "y_coordinates"]]
xy_indexes
= ColumnTransformer([
preproc 'ohe', OneHotEncoder(categories='auto', handle_unknown='ignore'), [soil_index]),
('lags', KNNTransformer(weights='distance', measure="mean"), xy_indexes)
(='passthrough')
], remainder
= Pipeline([
wflow 'preproc', preproc),
('regressor', et)
(
])
= {"preproc__lags__n_neighbors": [3, 5, 7, 9]}
search_grid = KFold(n_splits=3, shuffle=True, random_state=1234)
inner = GridSearchCV(wflow, param_grid=search_grid, cv=inner, scoring="r2") model
Fit the model and cross-validate:
= model.fit(X, y)
model model.best_params_
{'preproc__lags__n_neighbors': 9}
= KFold(n_splits=10, shuffle=True, random_state=1234)
outer
= cross_validate(model, X, y, scoring='neg_mean_squared_error', cv=outer, n_jobs=1)
scores = np.sqrt(-scores['test_score']).mean()
rmse
print("Our RMSE score is {}".format(rmse))
Our RMSE score is 102.27495341202624
Comparing the RMSE score the the score obtained in example 1, where the spatial structure of the training data was accounted for indirectly by added a variety of raster distance measures, we can see that the RMSE score is slightly improved.
Multi-Target Predictions
= stack.predict(model)
preds
preds.rename(for old, new in zip(preds.names, ['lead', 'cadmium', 'copper', 'zinc', 'om'])},
{old: new =True
in_place
)= 'rainbow'
preds.lead.cmap = 'rainbow'
preds.cadmium.cmap = 'rainbow'
preds.copper.cmap = 'rainbow'
preds.zinc.cmap = 'rainbow' preds.om.cmap
=(200, 200), title_fontsize=14, figsize=(10, 8))
preds.plot(out_shape plt.show()