Skip to content
Advertisement

TensorFlow MirroredStrategy() not working for multi-gpu training

I am trying to use TensorFlow's MirroredStrategy() to train a 3D U-Net on two Nvidia Titan RTX graphics cards. The code is verified to work on a single GPU. My OS is Red Hat Enterprise Linux 8 (RHEL8). The error is raised by model.fit().

I have installed the appropriate Nvidia drivers and NCCL, and verified that I can distribute the training data across both GPUs using an example from tensorflow.org.

Code:

def get_model(optimizer, loss_metric, metrics, lr=1e-3):
    """Build and compile the 3D encoder-decoder segmentation network.

    The network has four down-sampling stages (two 3x3x3 convolutions, 2x2x2
    max pooling, dropout), a 512-filter bottleneck, and four up-sampling
    stages that concatenate a transposed convolution with the matching
    encoder output. A 1x1x1 sigmoid convolution produces the single-channel
    output.

    Args:
        optimizer: Callable invoked as ``optimizer(lr=lr)`` to create the
            optimizer instance (e.g. an optimizer class).
        loss_metric: Loss passed unchanged to ``model.compile``.
        metrics: Metrics passed unchanged to ``model.compile``.
        lr: Learning rate forwarded to ``optimizer``.

    Returns:
        The compiled ``Model``.

    The input shape is read from the module-level ``sample_width``,
    ``sample_height`` and ``sample_depth`` values, with one trailing channel.
    """

    def double_conv(tensor, filters):
        # Two stacked 3x3x3 ReLU convolutions with the same filter count.
        for _ in range(2):
            tensor = Conv3D(filters, (3, 3, 3), activation='relu', padding='same')(tensor)
        return tensor

    def downsample(tensor, rate):
        # Halve each spatial dimension, then apply dropout at the given rate.
        pooled = MaxPooling3D(pool_size=(2, 2, 2))(tensor)
        return Dropout(rate)(pooled)

    def upsample_merge(tensor, skip, filters):
        # Upsample by 2 and join with the encoder output along axis 4
        # (the last axis of the 5-D tensor — presumably channels-last).
        upsampled = Conv3DTranspose(filters, (2, 2, 2), strides=(2, 2, 2), padding='same')(tensor)
        return concatenate([upsampled, skip], axis=4)

    inputs = Input((sample_width, sample_height, sample_depth, 1))

    # Contracting path: remember each stage's pre-pooling output for the
    # skip connections used on the way back up.
    skips = []
    features = inputs
    for filters, rate in ((32, 0.5), (64, 0.5), (128, 0.3), (256, 0.3)):
        features = double_conv(features, filters)
        skips.append(features)
        features = downsample(features, rate)

    # Bottleneck.
    features = double_conv(features, 512)

    # Expanding path: consume the skip connections deepest-first.
    for filters in (256, 128, 64, 32):
        features = upsample_merge(features, skips.pop(), filters)
        features = double_conv(features, filters)

    outputs = Conv3D(1, (1, 1, 1), activation='sigmoid')(features)

    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(optimizer=optimizer(lr=lr), loss=loss_metric, metrics=metrics)
    return model

# Build and compile the model inside the MirroredStrategy scope.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = get_model(
        optimizer=Adam,
        loss_metric=dice_coef_loss,
        metrics=[dice_coef],
        lr=1e-3,
    )

# Name of the metric the checkpoint callback monitors.
observe_var = 'dice_coef'
# NOTE(review): not referenced in the visible code — presumably intended as
# the checkpoint `mode`; confirm before relying on it.
strategy = 'max'

# NOTE(review): period=1000 with epochs=100 below looks like it would never
# trigger a periodic checkpoint — confirm this is intended.
model_checkpoint = ModelCheckpoint(
    'unet_seg_cs9300_3d_{epoch:04}.model',
    monitor=observe_var,
    save_best_only=False,
    period=1000,
)
model.fit(
    train_x,
    train_y,
    batch_size=1,
    epochs=100,
    verbose=1,
    shuffle=True,
    validation_split=0.2,
    callbacks=[model_checkpoint],
)
model.save('unet_seg_final_3d_test.model')

Error:

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-3-15c1c64c47ab> in <module>
    423 model_checkpoint = ModelCheckpoint('unet_seg_cs9300_3d_{epoch:04}.model', monitor=observe_var, save_best_only=False, period = 1000)
    424 
--> 425 model.fit(train_x, train_y, batch_size= 1, epochs= 100, verbose=1, shuffle=True, validation_split=0.2, callbacks=[model_checkpoint])
    426 
    427 model.save('unet_seg_final_3d_test.model')

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
   1211         else:
   1212             fit_inputs = x + y + sample_weights
-> 1213         self._make_train_function()
   1214         fit_function = self.train_function
   1215 

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
    314                     training_updates = self.optimizer.get_updates(
    315                         params=self._collected_trainable_weights,
--> 316                         loss=self.total_loss)
    317                 updates = self.updates + training_updates
    318 

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
     73         if _SYMBOLIC_SCOPE.value:
     74             with get_graph().as_default():
---> 75                 return func(*args, **kwargs)
     76         else:
     77             return func(*args, **kwargs)

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
    548 
    549             # Apply constraints.
--> 550             if getattr(p, 'constraint', None) is not None:
    551                 new_p = p.constraint(new_p)
    552 

~/anaconda3/envs/gputest/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py in constraint(self)
    566       Can be `None` if no constraint was passed.
    567     """
--> 568     raise NotImplementedError
    569 
    570   def assign(self, value, use_locking=False, name=None, read_value=True):

NotImplementedError: 

Advertisement

Answer

This answer is based on a comment on OP’s question.

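When conducting multi-GPU training with tf.distribute.MirroredStrategy, one should use the tf.keras API and not the standalone keras package with its TensorFlow backend. The traceback shows that the model was built with the standalone package (the frames are in site-packages/keras/...): its optimizer reads the `constraint` attribute of each variable, which raises NotImplementedError in tensorflow_core/python/ops/variables.py. Importing the layers, Model, optimizer and callbacks from tensorflow.keras instead (for example, `from tensorflow.keras.optimizers import Adam`) avoids this.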

In general, it is best not to mix tf.keras and keras.

User contributions licensed under: CC BY-SA
2 people found this helpful
Advertisement