#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
SPACE GROUP a-CNN
filneame: space_group_a_CNN.py version: 1.0
dependencies:
autoXRD version 1.0
autoXRD_vis version 0.2
Code to perform classification of XRD patterns for various spcae-group using
physics-informed data augmentation and all convolutional neural network (a-CNN).
Code to plot class activation maps from a-CNN and global average pooling layer
@authors: Felipe Oviedo and Danny Zekun Ren
MIT Photovoltaics Laboratory / Singapore and MIT Alliance for Research and Tehcnology
All code is under Apache 2.0 license, please cite any use of the code as explained
in the README.rst file, in the GitHub repository.
"""
#################################################################
#Libraries and dependencies
#################################################################
# Load a series of functions for preprocessing and data augmentation
from autoXRD import *
# Load CAM visualizations for the a-CNN
from autoXRD_vis import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import KFold
# The neural network uses Keras with a TensorFlow backend
import keras as K
from keras.models import Model
from keras.models import Sequential
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import EarlyStopping
import tensorflow as tf
# Clear any Keras and TF session state from a previous run
K.backend.clear_session()
tf.reset_default_graph()
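# Note: tf.reset_default_graph() is TensorFlow 1.x API; under TF 2.x the
# equivalent call is tf.compat.v1.reset_default_graph().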
# Training Parameters
BATCH_SIZE=128
# Network Parameters
n_input = 1200 # Total angles in XRD pattern
n_classes = 7 # Number of space-group classes
filter_size = 2
kernel_size = 10
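# Note: filter_size and kernel_size are kept here for reference, but the
# Conv1D layers defined below hard-code their own kernel sizes and strides.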
################################################################
# Load data and preprocess
################################################################
# Load simulated and anonymized dataset
import os
dirname = os.path.dirname(__file__)
theor = pd.read_csv(os.path.join(dirname, 'Datasets/theor.csv'), index_col=0)
theor = theor.iloc[1:]
theor_arr = theor.values
# Normalize data for training
ntheor = normdata(theor_arr)
# Load labels for simulated data
label_theo = pd.read_csv(os.path.join(dirname, 'Datasets/label_theo.csv'), header=None, index_col=0)
label_theo = label_theo[1].tolist()
# Load experimental data as dataframe
exp_arr_new = pd.read_csv(os.path.join(dirname, 'Datasets/exp.csv'), index_col=0)
exp_arr_new = exp_arr_new.values
# Load experimental class labels
label_exp = pd.read_csv(os.path.join(dirname, 'Datasets/label_exp.csv'), index_col=0).values
label_exp = label_exp.reshape([len(label_exp),])
# Load class encoding
space_group_enc = pd.read_csv(os.path.join(dirname, 'Datasets/encoding.csv'), index_col=0)
space_group_enc = list(space_group_enc['0'])
# Normalize experimental data
nexp = normdata(exp_arr_new)
# Define spectral range for data augmentation
exp_min = 0
exp_max = 1200
theor_min = 125
# Window size for experimental data extraction
window = 20
theor_max = theor_min + exp_max - exp_min  # = 1325, so the simulated window spans the same 1200 points as the experimental one
# Preprocess experimental data
post_exp = normdatasingle(exp_data_processing(nexp, exp_min, exp_max, window))
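# Optional sanity check (an added sketch, assuming patterns are stored
# column-wise as the transposes below imply): each column should be one
# processed 1200-point experimental pattern.
print('post_exp shape:', post_exp.shape)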
################################################################
# Perform data augmentation
################################################################
# Specify how many augmented patterns to generate
th_num = 2000
# Augment data; this may take a while
augd, pard, crop_augd = augdata(ntheor, th_num, label_theo, theor_min, theor_max)
# Encode theoretical labels as ordinal classes
label_t = np.zeros([len(pard),])
for i in range(len(pard)):
    label_t[i] = space_group_enc.index(pard[i])
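# Illustrative example (hypothetical values): if space_group_enc were
# ['P-1', 'P2_1/c', ...], a pattern labelled 'P2_1/c' would map to ordinal
# class 1; the actual encoding is read from Datasets/encoding.csv above.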
# Number of experimental data points to use
exp_num = 88
# Prepare experimental arrays for training and testing
X_exp = np.transpose(post_exp[:,0:exp_num])
y_exp = label_exp[0:exp_num]
# Prepare simulated arrays for training and testing
X_th = np.transpose(crop_augd)
y_th = label_t
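# Optional: inspect the class balance of the augmented simulated set
unique_classes, class_counts = np.unique(y_th, return_counts=True)
print(dict(zip(unique_classes.astype(int), class_counts)))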
################################################################
# Perform training and cross-validation
################################################################
fold = 5 # Number of k-folds
k_fold = KFold(n_splits=fold, shuffle=True, random_state=3)
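# Fixing random_state makes the fold assignment reproducible across runs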
# Create arrays to populate metrics
accuracy_exp = np.empty((fold,1))
accuracy_exp_b = np.empty((fold,1))
accuracy_exp_r1 = np.empty((fold,1))
accuracy_exp_p1 = np.empty((fold,1))
accuracy_exp_r2 = np.empty((fold,1))
accuracy_exp_p2 = np.empty((fold,1))
f1 = np.empty((fold,1))
f1_m = np.empty((fold,1))
# Create auxiliary arrays
accuracy = []
logs = []
ground_truth = []
predictions_ord = []
trains = []
tests = []
trains_combine = []
trains_y = []
# Run cross-validation, defining a fresh a-CNN in each fold
for k, (train, test) in enumerate(k_fold.split(X_exp, y_exp)):
    # Save splits for later use
    trains.append(train)
    tests.append(test)
    # Augment the experimental training fold
    # (the test fold has already been held out by the k-fold split)
    temp_x = X_exp[train]
    temp_y = y_exp[train]
    exp_train_x, exp_train_y = exp_augdata(temp_x.T, 5000, temp_y)
    # Combine the simulated and experimental datasets for training
    train_combine = np.concatenate((X_th, exp_train_x.T))
    trains_combine.append(train_combine)
    # Clear weights and network state
    K.backend.clear_session()
    # Network parameters (same values as defined at the top of the script)
    BATCH_SIZE = 128
    n_input = 1200   # Total angles in XRD pattern
    n_classes = 7    # Number of space-group classes
    filter_size = 2
    kernel_size = 10
    enc = OneHotEncoder(sparse=False)
    train_dim = train_combine.reshape(train_combine.shape[0], 1200, 1)
    train_y = np.concatenate((y_th, exp_train_y))
    trains_y.append(train_y)
    train_y_hot = enc.fit_transform(train_y.reshape(-1, 1))
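    # e.g. an ordinal label 3 of the 7 classes becomes the one-hot row [0, 0, 0, 1, 0, 0, 0]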
    # Define network structure
    model = Sequential()
    model.add(K.layers.Conv1D(32, 8, strides=8, padding='same', input_shape=(1200, 1), activation='relu'))
    model.add(K.layers.Conv1D(32, 5, strides=5, padding='same', activation='relu'))
    model.add(K.layers.Conv1D(32, 3, strides=3, padding='same', activation='relu'))
    model.add(K.layers.GlobalAveragePooling1D())
    model.add(K.layers.Dense(n_classes, activation='softmax'))
    # Define optimizer
    optimizer = K.optimizers.Adam()
    # Compile model; categorical cross-entropy matches the 7-way softmax output
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['categorical_accuracy'])
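    # Optional: print layer output shapes to verify the architecture
    # model.summary()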
    # Optional early stopping (uncomment here and in the alternative fit call below)
    # early_stop = EarlyStopping(monitor='val_loss', patience=50, verbose=1, restore_best_weights=True)
    # Optionally reduce the learning rate during optimization
    # reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5,
    #                               patience=50, min_lr=0.00001)
    # Define test data
    test_x = X_exp[test]
    test_x = test_x.reshape(test_x.shape[0], 1200, 1)
    test_y = enc.fit_transform(y_exp.reshape(-1, 1))[test]
    # Fit model
    hist = model.fit(train_dim, train_y_hot, batch_size=BATCH_SIZE, epochs=100,
                     verbose=1, validation_data=(test_x, test_y))
    # hist = model.fit(train_dim, train_y_hot, batch_size=BATCH_SIZE, epochs=100,
    #                  verbose=1, validation_data=(test_x, test_y), callbacks=[early_stop])
    # Compute model predictions
    prediction = model.predict(test_x)
    # Go from one-hot probabilities to ordinal class labels
    prediction_ord = [np.argmax(element) for element in prediction]
    predictions_ord.append(prediction_ord)
    # Compute accuracy, recall, precision and F1 with macro and micro averaging
    accuracy_exp[k] = metrics.accuracy_score(y_exp[test], prediction_ord)
    accuracy_exp_r1[k] = metrics.recall_score(y_exp[test], prediction_ord, average='macro')
    accuracy_exp_r2[k] = metrics.recall_score(y_exp[test], prediction_ord, average='micro')
    accuracy_exp_p1[k] = metrics.precision_score(y_exp[test], prediction_ord, average='macro')
    accuracy_exp_p2[k] = metrics.precision_score(y_exp[test], prediction_ord, average='micro')
    f1[k] = metrics.f1_score(y_exp[test], prediction_ord, average='micro')
    f1_m[k] = metrics.f1_score(y_exp[test], prediction_ord, average='macro')
    # Build ground truth: each list element is an array with the test indices
    # (into X_exp) in the first column and the true labels in the second
    ground_truth.append(np.concatenate([test.reshape(len(test), 1), y_exp[test].reshape(len(test), 1)], axis=1))
    # Compute loss and accuracy for each validation fold
    accuracy.append(model.evaluate(test_x, test_y, verbose=0))
    # Save logs
    log = pd.DataFrame(hist.history)
    logs.append(log)
    # Save models in the current folder with name suffixes 0 to 4
    model.save(os.path.join(dirname, 'keras_model') + str(k) + '.h5')
accuracy = np.array(accuracy)
# Report the final cross-validation accuracy
print('Mean cross-validation accuracy:', np.mean(accuracy[:, 1]))
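# Added reporting sketch (not part of the original pipeline): summarize the
# remaining per-fold metrics collected above as means over the k folds.
print('Mean test accuracy (sklearn):', np.mean(accuracy_exp))
print('Mean macro recall / precision:', np.mean(accuracy_exp_r1), '/', np.mean(accuracy_exp_p1))
print('Mean macro F1:', np.mean(f1_m), '| Mean micro F1:', np.mean(f1))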
################################################################
# Plotting Class Activation Maps
################################################################
# Separate correctly and incorrectly classified cases
corrects, incorrects = find_incorrects(ground_truth, predictions_ord)
# Get one dataframe of all corrects and one of all incorrects
corrects_df = pd.concat([r for r in corrects], ignore_index=False, axis=0)
incorrects_df = pd.concat([r for r in incorrects], ignore_index=False, axis=0)
# Get the CAMs for the training examples; for each class we average the CAMs of all training examples
# "trains" holds the indices of the elements in X_exp used for training
# Choose which cross-validation model to inspect; here the fifth one, keras_model4.h5
cam_outputs = get_cam('keras_model4.h5', trains[4], X_exp)
cam_df = pd.DataFrame(cam_outputs)
cam_df = cam_df.iloc[1:]
cam_df['Label'] = y_exp[trains[4]]
# CAMs over all the augmented training data
rng = range(0, 7000)
cam_outputs2 = get_cam('keras_model4.h5', rng, train_combine)
cam_df2 = pd.DataFrame(cam_outputs2)
cam_df2 = cam_df2.iloc[1:]
cam_df2['Label'] = train_y
# Now focus on the CAMs of the incorrectly classified cases
incorrects_filtered = incorrects_df[incorrects_df.Model == 'keras_model4.h5']
# Get CAMs of the incorrectly classified cases
cam_inc = get_cam('keras_model4.h5', [int(element) for element in incorrects_filtered.index], X_exp)
cam_inc = pd.DataFrame(cam_inc)
cam_inc = cam_inc.iloc[1:]
# From here on, use the CAMs computed over the augmented training data
cam_df = cam_df2
# Get the average class activation map for a given class, in this case class 6
cam_filtered = cam_df[cam_df.Label == 6]
means_6 = cam_filtered.mean()
means_6 = means_6.iloc[:-1]  # drop the 'Label' column from the averaged values
# Plot the average class map for class 6
plot_cam(means_6, 'Average CAM for Class 6, trained model4.h5')
# Plot correctly classified CAMs and patterns
corrects_filtered = corrects_df[corrects_df.Model == 'keras_model4.h5']
cam_cor = get_cam('keras_model4.h5', [int(element) for element in corrects_filtered.index], X_exp)
cam_cor = pd.DataFrame(cam_cor)
cam_cor = cam_cor.iloc[1:]
# Plot a single correctly classified pattern
plot_cam(cam_cor.iloc[-1, :], 'CAM for single pattern: true class 6, predicted is 6', X_exp[int(corrects_filtered.index[-1])])
# Note: results may vary across runs due to randomness in NN training;
# verify that the cases you plot are genuinely correct or incorrect
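# Depending on the matplotlib backend, figures may not display automatically;
# if so, call plt.show() here.
# plt.show()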
################################################################