Data Loading and Transformation
# written in python 3.6.3
# Load the reuqired libraries
import numpy as np
import pandas as pd
import pygeostat as gs
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from math import nan
gs.get_executable (source = 'GSLIB')
%matplotlib inline
#load dat. file data and transfer original data into csv files
#log transform data
vari_data = gs.DataFile ('./dat_NWT.dat', x='Easting', y='Northing', readfl = True)
vari_data.write_file('./dat_NWT_test.csv')
df = pd.read_csv('./dat_NWT_test.csv')
df_norm = pd.DataFrame()
df_norm['Easting'] = df['Easting']
df_norm['Northing'] = df['Northing']
for c in df.columns:
if c == 'Easting' or c == 'Northing':
continue
df_norm[c] = np.log10(df[c])
df_norm.to_csv('dat_NWT_norm.csv')
vari_data = gs.DataFile ('./dat_NWT_norm.csv', x='Easting', y='Northing', readfl = True)
# written in python 3.6.3
# Load the reuqired libraries
import numpy as np
import pandas as pd
import pygeostat as gs
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from math import nan
gs.get_executable (source = 'GSLIB')
%matplotlib inline
#load dat. file data and transfer original data into csv files
#log transform data
vari_data = gs.DataFile ('./dat_NWT.dat', x='Easting', y='Northing', readfl = True)
vari_data.write_file('./dat_NWT_test.csv')
df = pd.read_csv('./dat_NWT_test.csv')
df_norm = pd.DataFrame()
df_norm['Easting'] = df['Easting']
df_norm['Northing'] = df['Northing']
for c in df.columns:
if c == 'Easting' or c == 'Northing':
continue
df_norm[c] = np.log10(df[c])
df_norm.to_csv('dat_NWT_norm.csv')
vari_data = gs.DataFile ('./dat_NWT_norm.csv', x='Easting', y='Northing', readfl = True)
Descriptive Statistics and Histograms
# written in python 3.6.3
import os

# Descriptive statistics for every variable in the data file.
vari_data.describe()

# Histogram of Mn (values are log10-transformed upstream).
params = {'legend.fontsize': 'large', 'axes.labelsize': 'large',
          'axes.titlesize': 'large', 'xtick.labelsize': 'large',
          'ytick.labelsize': 'large'}
pylab.rcParams.update(params)
gs.histogram_plot(vari_data, var='Mn', figsize=(7, 6), xlabel='Mn',
                  color='green', bins=20, axis_xy=True, grid=True)
# BUG FIX: `outdir` was referenced here but never defined in this chunk,
# which raises a NameError. Fall back to the current directory while
# respecting any `outdir` defined earlier in the session, and join the
# path portably instead of concatenating `outdir + './Hist_Mn.png'`.
outdir = globals().get('outdir', '.')
plt.savefig(os.path.join(outdir, 'Hist_Mn.png'), bbox_inches='tight', dpi=150)
# written in python 3.6.3
import os

# Descriptive statistics for every variable in the data file.
vari_data.describe()

# Histogram of Mn (values are log10-transformed upstream).
params = {'legend.fontsize': 'large', 'axes.labelsize': 'large',
          'axes.titlesize': 'large', 'xtick.labelsize': 'large',
          'ytick.labelsize': 'large'}
pylab.rcParams.update(params)
gs.histogram_plot(vari_data, var='Mn', figsize=(7, 6), xlabel='Mn',
                  color='green', bins=20, axis_xy=True, grid=True)
# BUG FIX: `outdir` was referenced here but never defined in this chunk,
# which raises a NameError. Fall back to the current directory while
# respecting any `outdir` defined earlier in the session, and join the
# path portably instead of concatenating `outdir + './Hist_Mn.png'`.
outdir = globals().get('outdir', '.')
plt.savefig(os.path.join(outdir, 'Hist_Mn.png'), bbox_inches='tight', dpi=150)
Correlation Calculation, Dendrogram and Heatmap
The Pearson correlation calculation is done in Python; the dendrogram and correlation-heatmap visualizations are produced with the pygeostat and matplotlib packages. The code used to produce the results, together with the parameter settings, is listed below:
import os

# Pearson correlation of all variables, visualized as a clustered
# heatmap with a weighted-linkage dendrogram.
data_file = gs.DataFile("./dat_NWT_hm.csv", x='Easting', y='Northing', readfl=True)
data = data_file[data_file.variables]
data_cor = data.corr()
gs.correlation_matrix_plot(data_cor, figsize=(50, 50), cmap='bwr',
                           hierarchy='weighted', dendrogram=True,
                           cbar=True, cbar_label='bar label')
# BUG FIX: `outdir` is undefined in this chunk (NameError). Fall back to
# the current directory and join the path portably instead of the
# fragile concatenation `outdir + './Hm.png'`.
outdir = globals().get('outdir', '.')
plt.savefig(os.path.join(outdir, 'Hm.png'), bbox_inches='tight', dpi=150)
Discriminant Analysis and Visualization
# written in R 4.0.4
# Canonical discriminant analysis of the log-transformed geochemistry.

dat1 <- read.csv("dat_NWT.csv")
dat2 <- dat1[, 1:3]             # location / group columns
dat3 <- dat1[, c(4:45, 47:55)]  # variable in column 46 is bad
head(dat1)
head(dat2)
head(dat3)

# log10(x + 1) keeps zero concentrations finite.
dat4 <- log10(dat3 + 1)
head(dat4)
dat5 <- cbind(dat2, dat4)
dat5$Group <- as.factor(dat5$Group)
head(dat5)
str(dat5)

# install.packages("vegan")
library(vegan)
# install.packages("candisc")
library(candisc)

# Multivariate linear model of all transformed variables on Group,
# followed by canonical discriminant analysis of the Group term.
x <- lm(as.matrix(dat4) ~ Group, data = dat5)
out4 <- candisc(x, term = "Group")
scores <- out4$scores
head(scores)

# Scatter of the first two canonical axes, one colour per group.
color <- rainbow(10)
plot(scores$Can2 ~ scores$Can1, col = rainbow(10)[dat5$Group], pch = 19, cex = 0.5)
plot(out4, which = 1:2, conf = 0.95, col = color, pch = 19, asp = 1,
     var.col = "black",
     ellipse = FALSE, ellipse.prob = 0.68, fill.alpha = 0.1,
     prefix = "Can", suffix = TRUE,
     titles.1d = c("Canonical scores", "Structure"))
library(tidyverse)

# NOTE(review): gg.candisc.plot() is not exported by candisc or the
# tidyverse; it presumably comes from a helper script not shown here —
# confirm its source before rerunning.
gg.candisc.plot(out4)

out4$eigenvalues
out4
summary(out4)

# ggplot version of the canonical score plot: per-group ellipses plus
# structure (loading) arrows, enlarged for legibility.
labels <- names(out4$scores)[1]
can.plot <- qplot(data = out4$scores,
                  x = Can1,
                  y = Can2,
                  colour = get(labels)) +
  stat_ellipse() +  # add ellipses
  coord_equal()

# One arrow per variable, anchored at the origin.
arrow.coord <- data.frame(x = 0, y = 0,
                          xend = out4$structure[, 1],
                          yend = out4$structure[, 2],
                          variable = row.names(out4$structure))
# Enlarge arrows so they are visible at the score scale.
enlarge <- 9
arrow.coord$xend <- enlarge * arrow.coord$xend
arrow.coord$yend <- enlarge * arrow.coord$yend

# Plot arrows
can.plot <- can.plot +
  geom_segment(data = arrow.coord,
               colour = I("black"),
               aes(x = x, y = y, xend = xend, yend = yend),
               arrow = arrow(length = unit(0.5, "cm")))
# Add arrow labels and assign colours to the ten groups.
can.plot <- can.plot +
  geom_text(data = arrow.coord, colour = I("black"),
            aes(x = xend, y = yend, label = variable,
                hjust = 0.5, vjust = 0.5))
can.plot + theme_minimal() +
  scale_color_manual(values = c('red', 'forestgreen', 'blue', 'pink',
                                'springgreen1', 'green4', 'orange',
                                'purple', 'yellow', 'chocolate4'))
K-means Cluster and CART
# written in R 4.0.4

# ---- K-means clustering ----
clst <- read.csv("log transform.csv")
row.names(clst) <- clst$LOCATION
LocationLabel <- clst[, 1:2]
clst_var <- clst[3:ncol(clst)]
head(clst)
head(LocationLabel)

install.packages('ecodist')
library(ecodist)
install.packages("factoextra")
library(factoextra)
install.packages("tidyverse")
library(tidyverse)

# (Dead code removed: the original computed dist()/its square/its square
# root on the full data frame — including the character LOCATION column —
# and never used any of the three results.)

clst_var2 <- data.frame(clst_var)
# Drop Ta and Ge before computing distances.
clst_var2 <- select(clst_var2, -c(Ta, Ge))
# Euclidean distance matrix from ecodist.
dm <- distance(clst_var2, method = "euclidean")

library(stats)
# NOTE(review): kmeans() is run on the distance matrix rather than the
# raw variables — confirm this is intended (PAM or hclust are the usual
# choices for distance input).
kmeans_10 <- kmeans(dm, centers = 10, iter.max = 50, nstart = 25)
kmeans_10$cluster
# kmeans_10$size; kmeans_10$centers; kmeans_10$cluster
write.csv(data.frame(kmeans_10$cluster), "kmeans_group_ns_10.csv")

# ---- CART and random forest ----
dat1 <- read.csv("NWT_grouped_15.csv")
install.packages('mvpart')
install.packages('randomForest')
install.packages("devtools")
# mvpart was archived from CRAN; install from the GitHub mirror.
devtools::install_github("cran/mvpart")
library(randomForest)
library(mvpart)

# Same predictor set for both models; factored out so the two formulas
# cannot drift apart.
# NOTE(review): "Yy" looks like a typo for "Y" — confirm against the
# column names of NWT_grouped_15.csv.
elem_formula <- Group ~ Ag + Al + As + Au + B + Ba + Be + Bi + Ca + Cd +
  Ce + Co + Cr + Cs + Cu + Fe + Ga + Ge + Hf + Hg + In + K + La + Li +
  Mg + Mn + Mo + Na + Nb + Ni + P + Pb + Pd + Pt + Rb + Re + S + Sb +
  Sc + Se + Sn + Sr + Ta + Te + Th + Ti + Tl + U + V + W + Yy + Zn + Zr

out1 <- mvpart(elem_formula, dat1, xv = "p", all.leaves = TRUE)
summary(out1)
out2 <- randomForest(elem_formula, dat1, ntree = 100, importance = TRUE)
varImpPlot(out2)
# written in R 4.0.4

## K-means clustering on the log-transformed data ----
clst <- read.csv("log transform.csv")
row.names(clst) <- clst$LOCATION
LocationLabel <- clst[, 1:2]
clst_var <- clst[3:ncol(clst)]
head(clst)
head(LocationLabel)

install.packages('ecodist')
library(ecodist)

# Euclidean distances over the full table, plus squared and square-root
# variants.
euclid <- dist(clst, method = "euclidean")
euclid_sq <- euclid^2
euclid_sqrt <- sqrt(euclid)

install.packages("factoextra")
library(factoextra)
clst_var2 <- data.frame(clst_var)
install.packages("tidyverse")
library(tidyverse)
# Exclude Ta and Ge, then build the distance matrix used for clustering.
clst_var2 <- select(clst_var2, -c(Ta, Ge))
dm <- distance(clst_var2, method = "euclidean")

library(stats)
# Ten clusters, up to 50 iterations, 25 random starts.
kmeans_10 <- kmeans(dm, centers = 10, iter.max = 50, nstart = 25)
kmeans_10$cluster
# kmeans10$size; kmeans10$centers; kmeans10$cluster
write.csv(data.frame(kmeans_10$cluster), "kmeans_group_ns_10.csv")

## CART and random forest on the grouped data ----
dat1 <- read.csv("NWT_grouped_15.csv")
install.packages('mvpart')
install.packages('randomForest')
install.packages("devtools")
devtools::install_github("cran/mvpart")
library(randomForest)
library(mvpart)
out1 <- mvpart(Group~Ag+Al+As+Au+B+Ba+Be+Bi+Ca+Cd+Ce+Co+Cr+Cs+Cu+Fe+Ga+Ge+Hf+Hg+In+K+La+Li+Mg+Mn+Mo+Na+Nb+Ni+P+Pb+Pd+Pt+Rb+Re+S+Sb+Sc+Se+Sn+Sr+Ta+Te+Th+Ti+Tl+U+V+W+Yy+Zn+Zr,
               dat1, xv = "p", all.leaves = T)
summary(out1)
out2 <- randomForest(Group~Ag+Al+As+Au+B+Ba+Be+Bi+Ca+Cd+Ce+Co+Cr+Cs+Cu+Fe+Ga+Ge+Hf+Hg+In+K+La+Li+Mg+Mn+Mo+Na+Nb+Ni+P+Pb+Pd+Pt+Rb+Re+S+Sb+Sc+Se+Sn+Sr+Ta+Te+Th+Ti+Tl+U+V+W+Yy+Zn+Zr,
               dat1, ntree = 100, importance = T)
varImpPlot(out2)