【keras】一台设备上同时使用多张显卡训练同一个网络模型_如何两张显卡跑一个网络

作者：代码探索者 | 2024-02-01 12:24:07

踩

如何两张显卡跑一个网络

Reference：

【简述-zzw】Keras同时用多张显卡训练网络

【知乎】如何让keras训练深度网络时使用两张显卡？

以 tensorflow 为后端，有两种方法可以在多张GPU上运行一个模型：数据并行和设备并行，参考keras中文文档。

数据并行：

数据并行将目标模型在多个设备上各复制一份，并使用每个设备上的复制品处理整个数据集的不同部分数据。Keras在keras.utils.multi_gpu_model中提供有内置函数，该函数可以产生任意模型的数据并行版本，最高支持在8片GPU上并行。请参考utils中的multi_gpu_model文档。下面是一个例子：


from keras.utils import multi_gpu_model

# Build a data-parallel version of `model` that is replicated on 8 GPUs.
# The machine running this must actually expose 8 GPUs.
parallel_model = multi_gpu_model(model, gpus=8)
parallel_model.compile(optimizer='rmsprop',
                       loss='categorical_crossentropy')

# Training is spread over the 8 replicas: every batch of 256 samples
# is split so that each GPU handles 256 / 8 = 32 samples.
parallel_model.fit(x, y, batch_size=256, epochs=20)

设备并行：

设备并行是在不同设备上运行同一个模型的不同部分，当模型含有多个并行结构，例如含有两个分支时，这种方式很适合。这种并行方法可以通过使用TensorFlow device scopes实现，下面是一个例子：


import keras
import tensorflow as tf

# Model where a shared LSTM is used to encode two different sequences in parallel
input_a = keras.Input(shape=(140, 256))
input_b = keras.Input(shape=(140, 256))

# One LSTM layer instance, so both sequences are encoded with the same weights
shared_lstm = keras.layers.LSTM(64)

# Process the first sequence on one GPU
# (`tf.device` is the TensorFlow device-placement context manager)
with tf.device('/gpu:0'):
    encoded_a = shared_lstm(input_a)
# Process the next sequence on another GPU
with tf.device('/gpu:1'):
    encoded_b = shared_lstm(input_b)

# Concatenate results on CPU
with tf.device('/cpu:0'):
    merged_vector = keras.layers.concatenate([encoded_a, encoded_b],
                                             axis=-1)

以keras框架使用两张GPU训练 inception_v4 模型为例：


# -*- coding: utf-8 -*-
import numpy as np
 
from keras.models import Sequential
from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, merge, Reshape, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from sklearn.metrics import log_loss
# from load_cifar10 import load_cifar10_data
 
from keras.preprocessing.image import ImageDataGenerator
 
from keras import optimizers
import keras
import tensorflow as tf
 
from keras.utils import multi_gpu_model
 
 
 
def conv2d_bn(x, nb_filter, nb_row, nb_col,
              border_mode='same', subsample=(1, 1), bias=False):
    """
    Apply a 2D convolution followed by batch normalization and a ReLU.

    (Slightly modified from https://github.com/fchollet/keras/blob/master/keras/applications/inception_v3.py)

    Parameters:
      x - input tensor
      nb_filter, nb_row, nb_col - positional arguments forwarded to Convolution2D
      border_mode - padding mode forwarded to Convolution2D
      subsample - strides forwarded to Convolution2D
      bias - forwarded to Convolution2D's `bias` argument
    Returns:
      the tensor obtained after convolution, batch normalization and ReLU
    """
    # Batch normalization runs over the channel axis: axis 1 for the
    # "th" dimension ordering, the last axis otherwise.
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    convolved = Convolution2D(nb_filter, nb_row, nb_col,
                              subsample=subsample,
                              border_mode=border_mode,
                              bias=bias)(x)
    normalized = BatchNormalization(axis=channel_axis)(convolved)
    return Activation('relu')(normalized)
 
def block_inception_a(input):
    """
    Build one Inception-A block.

    Four branches are computed from `input` and concatenated along the
    channel axis. Branches are created in a fixed order (0, 1, 2, 3).
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: single 1x1 convolution.
    tower_0 = conv2d_bn(input, 96, 1, 1)

    # Branch 1: 1x1 convolution followed by a 3x3 convolution.
    tower_1 = conv2d_bn(conv2d_bn(input, 64, 1, 1), 96, 3, 3)

    # Branch 2: 1x1 convolution followed by two 3x3 convolutions.
    tower_2 = conv2d_bn(input, 64, 1, 1)
    for _ in range(2):
        tower_2 = conv2d_bn(tower_2, 96, 3, 3)

    # Branch 3: 3x3 average pooling (stride 1, 'same' padding), then 1x1 convolution.
    pooled = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(input)
    tower_3 = conv2d_bn(pooled, 96, 1, 1)

    return merge([tower_0, tower_1, tower_2, tower_3],
                 mode='concat', concat_axis=channel_axis)
 
 
def block_reduction_a(input):
    """
    Build the Reduction-A block.

    Three stride-2 branches (convolution, stacked convolutions, max pooling)
    are computed from `input` and concatenated along the channel axis.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: one 3x3 convolution with stride 2 and 'valid' padding.
    conv_path = conv2d_bn(input, 384, 3, 3,
                          subsample=(2, 2), border_mode='valid')

    # Branch 1: 1x1 -> 3x3 -> 3x3 with stride 2 and 'valid' padding.
    stacked_path = conv2d_bn(input, 192, 1, 1)
    stacked_path = conv2d_bn(stacked_path, 224, 3, 3)
    stacked_path = conv2d_bn(stacked_path, 256, 3, 3,
                             subsample=(2, 2), border_mode='valid')

    # Branch 2: 3x3 max pooling with stride 2 and 'valid' padding.
    pool_path = MaxPooling2D((3, 3), strides=(2, 2), border_mode='valid')(input)

    return merge([conv_path, stacked_path, pool_path],
                 mode='concat', concat_axis=channel_axis)
 
 
def block_inception_b(input):
    """
    Build one Inception-B block.

    Four branches are computed from `input` and concatenated along the
    channel axis. Branches 1 and 2 are chains of 1x7 / 7x1 convolutions.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: single 1x1 convolution.
    tower_0 = conv2d_bn(input, 384, 1, 1)

    # Branch 1: chain of (filters, rows, cols) convolutions, applied in order.
    tower_1 = input
    for filters, rows, cols in ((192, 1, 1), (224, 1, 7), (256, 7, 1)):
        tower_1 = conv2d_bn(tower_1, filters, rows, cols)

    # Branch 2: a longer chain alternating 7x1 and 1x7 kernels.
    tower_2 = input
    for filters, rows, cols in ((192, 1, 1), (192, 7, 1), (224, 1, 7),
                                (224, 7, 1), (256, 1, 7)):
        tower_2 = conv2d_bn(tower_2, filters, rows, cols)

    # Branch 3: 3x3 average pooling (stride 1, 'same' padding), then 1x1 convolution.
    pooled = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(input)
    tower_3 = conv2d_bn(pooled, 128, 1, 1)

    return merge([tower_0, tower_1, tower_2, tower_3],
                 mode='concat', concat_axis=channel_axis)
 
 
def block_reduction_b(input):
    """
    Build the Reduction-B block.

    Three branches ending in a stride-2 operation are computed from `input`
    and concatenated along the channel axis.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: 1x1 convolution, then 3x3 convolution with stride 2.
    short_path = conv2d_bn(
        conv2d_bn(input, 192, 1, 1),
        192, 3, 3, subsample=(2, 2), border_mode='valid')

    # Branch 1: 1x1 -> 1x7 -> 7x1, then 3x3 convolution with stride 2.
    long_path = input
    for filters, rows, cols in ((256, 1, 1), (256, 1, 7), (320, 7, 1)):
        long_path = conv2d_bn(long_path, filters, rows, cols)
    long_path = conv2d_bn(long_path, 320, 3, 3,
                          subsample=(2, 2), border_mode='valid')

    # Branch 2: 3x3 max pooling with stride 2.
    pool_path = MaxPooling2D((3, 3), strides=(2, 2), border_mode='valid')(input)

    return merge([short_path, long_path, pool_path],
                 mode='concat', concat_axis=channel_axis)
 
 
def block_inception_c(input):
    """
    Build one Inception-C block.

    Four branches are computed from `input` and concatenated along the
    channel axis. Branches 1 and 2 each end in a 1x3 / 3x1 pair whose
    outputs are concatenated before the final merge.
    """
    channel_axis = 1 if K.image_dim_ordering() == "th" else -1

    # Branch 0: single 1x1 convolution.
    tower_0 = conv2d_bn(input, 256, 1, 1)

    # Branch 1: 1x1 convolution feeding a 1x3 / 3x1 pair.
    stem_1 = conv2d_bn(input, 384, 1, 1)
    pair_1 = [conv2d_bn(stem_1, 256, 1, 3),
              conv2d_bn(stem_1, 256, 3, 1)]
    tower_1 = merge(pair_1, mode='concat', concat_axis=channel_axis)

    # Branch 2: 1x1 -> 3x1 -> 1x3, then a 1x3 / 3x1 pair.
    stem_2 = input
    for filters, rows, cols in ((384, 1, 1), (448, 3, 1), (512, 1, 3)):
        stem_2 = conv2d_bn(stem_2, filters, rows, cols)
    pair_2 = [conv2d_bn(stem_2, 256, 1, 3),
              conv2d_bn(stem_2, 256, 3, 1)]
    tower_2 = merge(pair_2, mode='concat', concat_axis=channel_axis)

    # Branch 3: 3x3 average pooling (stride 1, 'same' padding), then 1x1 convolution.
    pooled = AveragePooling2D((3, 3), strides=(1, 1), border_mode='same')(input)
    tower_3 = conv2d_bn(pooled, 256, 1, 1)

    return merge([tower_0, tower_1, tower_2, tower_3],
                 mode='concat', concat_axis=channel_axis)
 
 
def inception_v4_base(input):
    """
    Build the convolutional trunk of Inception-v4.

    The trunk is a stem of convolutions and pooling, followed by
    4 Inception-A blocks, a Reduction-A block, 7 Inception-B blocks,
    a Reduction-B block and 3 Inception-C blocks.

    Parameters:
      input - image tensor; the shape comments below assume
              299 x 299 x 3 (tf) or 3 x 299 x 299 (th)
    Returns:
      the output tensor of the last Inception-C block
    """
    if K.image_dim_ordering() == "th":
        channel_axis = 1
    else:
        channel_axis = -1

    # Input Shape is 299 x 299 x 3 (tf) or 3 x 299 x 299 (th)
    net = conv2d_bn(input, 32, 3, 3, subsample=(2,2), border_mode='valid')
    net = conv2d_bn(net, 32, 3, 3, border_mode='valid')
    net = conv2d_bn(net, 64, 3, 3)

    # Stem split 1: max pooling in parallel with a strided convolution.
    branch_0 = MaxPooling2D((3,3), strides=(2,2), border_mode='valid')(net)

    branch_1 = conv2d_bn(net, 96, 3, 3, subsample=(2,2), border_mode='valid')

    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)

    # Stem split 2: a short and a long convolutional branch.
    branch_0 = conv2d_bn(net, 64, 1, 1)
    branch_0 = conv2d_bn(branch_0, 96, 3, 3, border_mode='valid')

    branch_1 = conv2d_bn(net, 64, 1, 1)
    branch_1 = conv2d_bn(branch_1, 64, 1, 7)
    branch_1 = conv2d_bn(branch_1, 64, 7, 1)
    branch_1 = conv2d_bn(branch_1, 96, 3, 3, border_mode='valid')

    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)

    # Stem split 3: strided convolution in parallel with max pooling.
    branch_0 = conv2d_bn(net, 192, 3, 3, subsample=(2,2), border_mode='valid')
    branch_1 = MaxPooling2D((3,3), strides=(2,2), border_mode='valid')(net)

    net = merge([branch_0, branch_1], mode='concat', concat_axis=channel_axis)

    # `range` (not the Python 2-only `xrange`) keeps the loops below
    # working on both Python 2 and Python 3.

    # 35 x 35 x 384
    # 4 x Inception-A blocks
    for _ in range(4):
        net = block_inception_a(net)

    # 35 x 35 x 384
    # Reduction-A block
    net = block_reduction_a(net)

    # 17 x 17 x 1024
    # 7 x Inception-B blocks
    for _ in range(7):
        net = block_inception_b(net)

    # 17 x 17 x 1024
    # Reduction-B block
    net = block_reduction_b(net)

    # 8 x 8 x 1536
    # 3 x Inception-C blocks
    for _ in range(3):
        net = block_inception_c(net)

    return net
 
 
def inception_v4_model(img_rows, img_cols, color_type=1, num_classes=None, dropout_keep_prob=0.2):
    '''
    Inception V4 Model for Keras, prepared for transfer learning.

    The function first builds the network with a 1001-way softmax head,
    loads pre-trained weights into it by layer name, and then builds a
    second head with `num_classes` outputs on top of the same trunk.
    The model with the new head is returned.

    Model Schema is based on
    https://github.com/kentsommer/keras-inceptionV4
    ImageNet Pretrained Weights
    Theano: https://github.com/kentsommer/keras-inceptionV4/releases/download/2.0/inception-v4_weights_th_dim_ordering_th_kernels.h5
    TensorFlow: https://github.com/kentsommer/keras-inceptionV4/releases/download/2.0/inception-v4_weights_tf_dim_ordering_tf_kernels.h5

    Parameters:
      img_rows, img_cols - resolution of inputs. NOTE: not read by the body;
                           the input layer is hard-coded to 299 x 299.
      color_type - 1 for grayscale, 3 for color. NOTE: not read by the body;
                   the input layer is hard-coded to 3 channels.
      num_classes - number of class labels for our classification task
                    (size of the replacement softmax layer)
      dropout_keep_prob - value passed directly to Dropout().
                          NOTE(review): Keras' Dropout argument is the
                          fraction of units to drop, so despite its name this
                          appears to act as a drop rate - confirm.
    Returns:
      a Model named 'inception_v4' ending in the `num_classes`-way softmax

    The weight files are looked up under the relative directory
    'imagenet_models/'.
    '''

    # Input Shape is 299 x 299 x 3 (tf) or 3 x 299 x 299 (th)
    if K.image_dim_ordering() == 'th':
        inputs = Input((3, 299, 299))
    else:
        inputs = Input((299, 299, 3))

    # Make inception base
    net = inception_v4_base(inputs)


    # Final pooling and prediction
    # (this first head only exists so that the pre-trained weights can be loaded)

    # 8 x 8 x 1536
    net_old = AveragePooling2D((8,8), border_mode='valid')(net)

    # 1 x 1 x 1536
    net_old = Dropout(dropout_keep_prob)(net_old)
    net_old = Flatten()(net_old)

    # 1536
    predictions = Dense(output_dim=1001, activation='softmax')(net_old)

    model = Model(inputs, predictions, name='inception_v4')

    if K.image_dim_ordering() == 'th':
      # Use pre-trained weights for Theano backend
      weights_path = 'imagenet_models/inception-v4_weights_th_dim_ordering_th_kernels.h5'
    else:
      # Use pre-trained weights for Tensorflow backend
      weights_path = 'imagenet_models/inception-v4_weights_tf_dim_ordering_tf_kernels.h5'

    # weights_path = './InceptionV4_model_fold_01.h5'
    # Weights are matched to layers by name (by_name=True).
    # NOTE(review): layer names appear to be auto-generated here, so the
    # match presumably depends on layer creation order - confirm before
    # reordering any layer construction in this file.
    model.load_weights(weights_path, by_name=True)

    # Truncate and replace softmax layer for transfer learning
    # Cannot use model.layers.pop() since model is not of Sequential() type
    # The method below works since pre-trained weights are stored in layers but not in the model
    net_ft = AveragePooling2D((8,8), border_mode='valid')(net)
    net_ft = Dropout(dropout_keep_prob)(net_ft)
    net_ft = Flatten()(net_ft)
    predictions_ft = Dense(output_dim=num_classes, activation='softmax')(net_ft)

    # Rebinding `model` discards the 1001-way model; the trunk tensors
    # (`inputs`, `net`) are reused by the new one.
    model = Model(inputs, predictions_ft, name='inception_v4')

    return model
 
if __name__ == '__main__':
    # Training entry point: fine-tune Inception-v4 on a two-class image
    # dataset read from directories, using 2 GPUs through
    # `multi_gpu_model` (data parallelism).

    # import os
    # os.environ['CUDA_VISIBLE_DEVICES']='0'

    # dimensions of our images.
    # ADNI GM
    # X: 121*145
    # Y: 121*121
    # Z: 145*121

    # OASIS GM MRI
    # 176*208
    ### data_fold_01_train_val_test_entropy_keep_SliceNum_33
    img_width, img_height = 299, 299
    fold_name = "fold_01"  ## data_fold_01_entropy_keep_SliceNum_33
    ## single_subject_data_fold_01_train_val_test_entropy_keep_SliceNum_81
    train_data_dir = 'single_subject_data_' + fold_name + '_train_val_test_entropy_keep_SliceNum_81/train'
    validation_data_dir = 'single_subject_data_' + fold_name + '_train_val_test_entropy_keep_SliceNum_81/validation'
    filepath="model_single_subject_InceptionV4_" + fold_name + "_train_val_test_entropy_keep_SliceNum_81_best.h5"

    # train num (AD+NC) = 36207 + 41796 = 78003
    # validation num (AD+NC) = 9477 + 11178 = 20655
    # test num (AD+NC) = 2673 + 2916 =
    # train_samples_AD =  len(os.listdir(path))
    nb_train_samples = 78003
    nb_validation_samples = 20655
    epochs = 120
    batch_size = 64 #10 #40
    channel = 3
    num_classes = 2

    # Whole number of batches per epoch. `//` keeps these values integers on
    # Python 3 as well; plain `/` yields floats there (78003 / 64 = 1218.796875).
    steps_per_epoch = nb_train_samples // batch_size
    validation_steps = nb_validation_samples // batch_size

    print("=== paramaters info ===")
    print("epochs = {}.".format(epochs))
    print("batch_size = {}.".format(batch_size))
    print("nb_train_samples = {}.".format(nb_train_samples))
    print("nb_validation_samples = {}.".format(nb_validation_samples))

    #if K.image_data_format() == 'channels_first':
    #	input_shape = (3, img_width, img_height)
    #else:
    #	input_shape = (img_width, img_height, 3)

    # this is the augmentation configuration we will use for training
    train_datagen = ImageDataGenerator(
        rescale=1. / 255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

    # this is the augmentation configuration we will use for testing:
    # only rescaling
    test_datagen = ImageDataGenerator(rescale=1. / 255)



    ### class_mode: one of "categorical", "binary", "sparse" or None.
    ### Default is "categorical". It determines the form of the returned label
    ### array: "categorical" returns 2D one-hot encoded labels,
    ### "binary" returns 1D binary labels,
    ### "sparse" returns 1D integer labels.
    ### With None no labels are returned and the generator only yields batches
    ### of data, which is useful with model.predict_generator() and
    ### model.evaluate_generator().

    train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode='binary')

    validation_generator = test_datagen.flow_from_directory(
        validation_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode='binary')

    # Load our model
    model = inception_v4_model(img_height, img_width, channel, num_classes, dropout_keep_prob=0.2)
    # Data-parallel wrapper around `model` spread over 2 GPUs.
    parallel_model = multi_gpu_model(model, gpus=2)
    # Learning rate is changed to 0.001
    sgd = optimizers.SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    # The generators yield 1D integer labels (class_mode='binary'), hence
    # the sparse variant of the categorical cross-entropy.
    parallel_model.compile(optimizer=sgd, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # NOTE: this checkpoint is attached to the multi-GPU wrapper. Saving the
    # wrapper has been reported to fail with
    # "TypeError: can't pickle NotImplementedType objects"; the Keras
    # `multi_gpu_model` documentation recommends saving through the template
    # model (`model`) instead. If that error appears, drop the callback and
    # call `model.save(...)` on the template model after training.
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath = filepath,
        monitor='val_acc',
        verbose=1,
        save_best_only=True,
        # save_weights_only=False,
        mode='max',
        period=1
    )
    callbacks_list = [checkpoint]


    ### verbose: logging mode. 0 = no output on stdout, 1 = progress bar,
    ### 2 = one line per epoch.
    parallel_model.fit_generator(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        verbose = 2,
        validation_data=validation_generator,
        validation_steps=validation_steps,
        callbacks = callbacks_list)


    # model.save('InceptionV4_model_fold_01.h5')


    # Make predictions
    #predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)

    # Cross-entropy loss score
    #score = log_loss(Y_valid, predictions_valid)
 
### CUDA_VISIBLE_DEVICES=0 python inception_v4_train_val_test_entropy_keep_SliceNum_81_fold_01_single_subject.py > acc_inception_v4_train_val_test_entropy_keep_SliceNum_81_fold_01_single_subject.txt
### python inception_v4_train_val_test_entropy_keep_SliceNum_81_fold_01_single_subject.py > acc_single_subject_inception_v4_train_val_test_entropy_keep_SliceNum_81_fold_01.txt

注意：

上述代码如果像下面这样在 fit_generator 中传入 callbacks（即 ModelCheckpoint）：


    # parallel_model.fit_generator(
    #     train_generator,
    #     steps_per_epoch=nb_train_samples/batch_size,
    #     epochs=epochs,
    #     verbose = 2,
    #     validation_data=validation_generator,
    #     validation_steps=nb_validation_samples/batch_size,
    #     callbacks = callbacks_list)

会报错：

TypeError: can't pickle NotImplementedType objects

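这通常是因为 ModelCheckpoint 回调会尝试保存 multi_gpu_model 返回的并行模型（parallel_model）。去掉 callbacks 即可避免该错误；如需保存模型，可在训练结束后对原始模板模型 model（而不是 parallel_model）调用 model.save()。去掉 callbacks 后的调用如下所示：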


    # Same training call without callbacks. Step counts use `//` so they
    # stay integers on Python 3 (plain `/` would produce floats there).
    parallel_model.fit_generator(
        train_generator,
        steps_per_epoch=nb_train_samples // batch_size,
        epochs=epochs,
        verbose = 2,
        validation_data=validation_generator,
        validation_steps=nb_validation_samples // batch_size)

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/article/detail/53768?site