Deep Learning：pytorchとkerasの速度比較

2019-03-27：実験に使ったkaggleのkernelへのリンクを追加

最近流行っているpytorchとkeras(tensorflow backend)だとpytorchの方が計算が倍早いという話を聞いたので試してみました。結果、シンプルなモデルで比較した結果pytorhの方がkerasより3倍早いことが分かりました。

実験環境
実験
- 前準備
- pytorch
- Keras
まとめ

実験環境

実験はCIFAR10を使ったpytorchのチュートリアルページに有るモデルをkerasでも実装し、pytorchとkerasそれぞれで実行時間を比較しました。環境はkaggleのkernelを用いて行いました。実験時のkernelの環境の詳細は以下のとおりです。

torch       :1.0.1.post2
keras       :2.2.4
tensorflow  :1.13.1
GPU         :Tesla P100

実験

前準備

速度比較をする前にライブラリのインポートとデータの準備をしましょう。 CIFAR10のデータは有志がアップロードしているこれを使っています。

from keras.utils import np_utils
import matplotlib.pyplot as plt
import numpy as np
import os
import time
from tqdm import tqdm

# pytorch
from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

# keras
import tensorflow
import keras
from keras.layers import Conv2D, MaxPooling2D,\
                         Lambda, Input, Dense,\
                         Flatten, BatchNormalization
from keras.models import Model
from keras import optimizers

# for eval
from sklearn.metrics import classification_report


def load_data():
    """Data loading function"""
    def _load_batch_file(batch_filename):
        filepath = os.path.join('../input/', batch_filename)
        unpickled = _unpickle(filepath)
        return unpickled

    def _unpickle(file):
        import pickle
        with open(file, 'rb') as fo:
            dict = pickle.load(fo, encoding='latin')
        return dict

    train_batch_1 = _load_batch_file('data_batch_1')
    train_batch_2 = _load_batch_file('data_batch_2')
    train_batch_3 = _load_batch_file('data_batch_3')
    train_batch_4 = _load_batch_file('data_batch_4')
    train_batch_5 = _load_batch_file('data_batch_5')
    test_batch = _load_batch_file('test_batch')

    num_classes = 10
    train_x = np.concatenate([train_batch_1['data'], train_batch_2['data'], train_batch_3['data'], train_batch_4['data'], train_batch_5['data']])
    train_x = train_x.astype('float32') # this is necessary for the division below

    train_x /= 255
    train_y = np.concatenate([np_utils.to_categorical(labels, num_classes) for labels in [train_batch_1['labels'], train_batch_2['labels'], train_batch_3['labels'], train_batch_4['labels'], train_batch_5['labels']]])

    test_x = test_batch['data'].astype('float32')/ 255
    test_y = np_utils.to_categorical(test_batch['labels'], num_classes)
    
    img_rows, img_cols = 32, 32
    channels = 3
    
    train_x = train_x.reshape(len(train_x), channels, img_rows, img_cols)
    test_x = test_x.reshape(len(test_x), channels, img_rows, img_cols)
    
    print("Sample image")
    plt.imshow(np.transpose(train_x[1], (1, 2, 0)))
    plt.show()
    
    return train_x, train_y, test_x, test_y
  

def preproc_pytorch(train_x, train_y, test_x, test_y):
    """
    Preprocessing func for pytorch
    """
    transform = transforms.Compose([torch.from_numpy])
    train_x = [transform(img) for img in train_x]
    test_x = [transform(img) for img in test_x]
    train = [(x, np.argmax(y)) for x, y in zip(train_x, train_y)]
    test = [(x, np.argmax(y)) for x, y in zip(test_x, test_y)]
    return train, test
  

# print env.
print(f"torch\t\t:{torch.__version__}")
print(f"keras\t\t:{keras.__version__}")
print(f"tensorflow\t:{tensorflow.__version__}")
! nvidia-smi

load_data()関数でCIFAR10の読み込みを行い、0-1の範囲に正規化を行っています。 preproc_pytorch()関数ではデータ構造をpytorch用に変換しています。 # print env.以下では先述したkernelの環境をprintしています。

pytorch

それではまずpytorchから実行時間を計測してみましょう。モデルはチュートリアルで使用されているものをそのまま用います。モデルを定義して、初期化を行います。

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# init. model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
net = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

続いて学習用の関数の定義とデータを作成します。学習に使っているtrain_pytorch()関数はチュートリアルの学習部分をラップしたものになります。

def train_pytorch(trainloader, net, criterion, optimizer, verbose=False):
    # loop over the dataset multiple times
    for epoch in tqdm(range(EPOCHS), disable=verbose):
          
        for i, data in tqdm(enumerate(trainloader, 0), disable=verbose):
            # get the inputs
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device).long()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        return net
    
trainloader = DataLoader(train,
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         num_workers=0)

testloader = DataLoader(test,
                        batch_size=BATCH_SIZE,
                        shuffle=True,
                        num_workers=0)

これでモデルとデータの準備は出来ました、pytorchの速度を測って見ましょう（tqdmのversionによっては実行時にAttributionErrorエラーが出ますが気にしないで大丈夫な様です）。

# 実行時間を計測
%timeit train_pytorch(trainloader, net, criterion, optimizer, True)

"""
output:
23.3 s ± 515 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
"""

結果を見ると7回の平均が23.3秒の様です。

それではkerasの速度を測る前にpytorchのモデルをメモリから消しておきましょう。

# pytorchで使用した変数をメモリから消す
import gc

torch.cuda.empty_cache()
del net, criterion, optimizer
gc.collect()

Keras

チュートリアルのモデルをkerasで実装、初期化します。

def init_keras_modes():
    inputs = Input(shape=(3, 32, 32))
    x = Conv2D(6, (5, 5), activation='relu',
               padding='valid',
               data_format='channels_first')(inputs)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)
    x = Conv2D(16, (5, 5), activation='relu',
               padding='valid',
               data_format='channels_first')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)
    flattened = Flatten(name='flatten')(x)
    x = Dense(120, activation='relu')(flattened)
    x = Dense(84, activation='relu')(x)
    predictions = Dense(10, activation='softmax', name='predictions')(x)

    sgd = optimizers.SGD(lr=LEARNING_RATE,
                         momentum=MOMENTUM)
    model = Model(inputs=inputs, outputs=predictions)

    model.compile(optimizer=sgd, loss='categorical_crossentropy')
    return model

model = init_keras_modes()

さて、実行時間を計測してみましょう

%timeit model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0)

"""
output:
1min 16s ± 806 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
"""

結果は7回の平均が1分16秒(= 76秒）となりました。

まとめ

pytorchとkerasの実行時間を比べると、pytorchが23.3秒、kerasが76秒となりました。
どうやら今回の設定だとpytorchはkerasと比べて3倍以上早いようです。
どうしてpytorchの方が高速なのかについてまでは今回調べていませんが、kerasからpytorchに変更することで2倍以上の高速化を見込んでよいと思われます。勿論、今回の結果が全ての場合に当てはまるとは限りませんが基本的にdeepのモデルは今後pytorchで書くほうが良さそうです。

尚、実験に使ったkaggleのカーネルへのリンクはこちらになります。是非手元で実行してみてください。

猫になりたい

コンサルのデータ分析屋、計量経済とか機械学習をやっています。ｐｙてょｎは3.7を使ってマスコレルウィンストングリーン。

Deep Learning：pytorchとkerasの速度比較

実験環境

実験

前準備

pytorch

Keras

まとめ