Post
Ch6.1 Image Classification을 위한 Neural Network(AlexNet) | Gihun Son

Ch6.1 Image Classification을 위한 Neural Network(AlexNet)

AlexNet의 구조

image

image

AlexNet은 Convolutional layer에서 Activation function으로 ReLU함수를 사용한다. GPU-1은 color와 상관없는 정보를 추출하기 위한 커널이 학습되고, GPU-2는 주로 color와 관련된 정보를 추출하기 위한 커널이 학습된다.

실습

이전의 LeNet과 Training, Test process자체는 같다. 다마 모델의 구성이 다르기 때문에 그것에 집중에서 실습을 해보자.

필요 라이브러리 호출

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import torch
import torchvision
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torch.autograd import Variable
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
import os
import cv2
import random
import time
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.notebook import tqdm
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

image

AlexNet은 softmax함수를 통해 1000x1 vector의 output이 나오지만, 실습에서는 ‘Cat’, ‘Dog’ 두가지 label만을 사용

Data Pre-processing

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
class ImageTransform():
    def __init__(self, resize, mean, std):
        self.data_transform = {
            'train': transforms.Compose([
                transforms.RandomResizedCrop(resize, scale=(0.5, 1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ]),
            'val': transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(resize),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ])
        }

    def __call__(self, img, phase):
        return self.data_transform[phase](img)
1
2
from google.colab import drive
drive.mount('/content/drive')
1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
cat_directory='/content/drive/MyDrive/Pytorch_study/data/dogs-vs-cats/Cat/'
dog_directory='/content/drive/MyDrive/Pytorch_study/data/dogs-vs-cats/Dog/'

cat_images_directory=sorted([os.path.join(cat_directory, f) for f in os.listdir(cat_directory)])
dog_images_directory=sorted([os.path.join(dog_directory, f) for f in os.listdir(dog_directory)])

images_filepaths=[*cat_images_directory,*dog_images_directory]
correct_images_filepaths=[i for i in images_filepaths if cv2.imread(i) is not None]

random.seed(42)
random.shuffle(correct_images_filepaths)
train_images_filepath=correct_images_filepaths[:400]
val_images_filepath=correct_images_filepaths[400:-10]
test_images_filepath=correct_images_filepaths[-10:]
print(len(train_images_filepath),len(val_images_filepath),len(test_images_filepath))
1
400 92 10

실습을 진행하기 전 AlexNet은 parameter가 6000만개이다. 따라서 충분한 데이터가 없으면 성능이 좋지 않은데, 실습에서 많은 데이터를 쓸 수 없기 때문에 overfitting으로 인한 성능저하가 당연히 발생할 수 밖에 없다.

  • 해당 코드는 Custom_Dataset 클래스를 통해 image를 불러와 image preprocessing과 labeling을 진행한다. 이를 DataLoader에 전달하여 데이터를 메모리로 불러오는 것
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
class Custom_Dataset():
    def __init__(self,file_list,transform=None,phase='train'):
        self.file_list=file_list #image가 존재하는 경로
        self.transform=transform #Data preprocessing
        self.phase=phase #'train' 또는 'val'

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        img_path=self.file_list[idx]
        img=Image.open(img_path) #PIL을 이용하여 image불러옴
        img_transformed=self.transform(img,self.phase) #phase값에 따라 preprocessing과정이 달라짐(ImageTransform()에서 정의되어 있다.)

        label=img_path.split('/')[-1].split('.')[0] #'/content/drive/MyDrive/Pytorch_study/data/dogs-vs-cats/Cat/cat.1.jpg'의 경로에서 label추출
        if label=='dog':
            label=1
        elif label=='cat':
            label=0

        return img_transformed,label #전처리가 적용된 image와 label

1
2
3
4
5
#preprocessing에 필요한 mean, std등의 변수 값 정의
size=256 #model input의 크기
mean=(0.485,0.456,0.406)
std=(0.229,0.224,0.225)
batch_size=32
1
2
3
4
5
6
7
#각 dataset별로 Custom_Dataset의 객체 생성, __getitem__()함수를 통해 값을 반환하는 것이다.
train_dataset=Custom_Dataset(train_images_filepath,ImageTransform(resize=size,mean=mean,std=std),phase='train')
val_dataset=Custom_Dataset(val_images_filepath,ImageTransform(resize=size,mean=mean,std=std),phase='val')
test_dataset=Custom_Dataset(test_images_filepath,ImageTransform(resize=size,mean=mean,std=std),phase='val')

print('preprocessing된 image의 크기:', train_dataset.__getitem__(0)[0].size())
print('label:',train_dataset.__getitem__(0)[1])
1
2
preprocessing된 image의 크기: torch.Size([3, 256, 256])
label: 0

DataLoader

  • 데이터를 메모리로 불러옴
1
2
3
4
5
6
7
8
9
train_dataloader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
val_dataloader=DataLoader(val_dataset,batch_size=batch_size,shuffle=False)
test_dataloader=DataLoader(test_dataset,batch_size=batch_size,shuffle=False)
dataloader_dict={'train':train_dataloader,'val':val_dataloader}

batch_iterator=iter(train_dataloader) #iter는 반복 가능한 객체에서 이터레이터를 반환
inputs,label=next(batch_iterator) #iterator에서 값을 차례로 반환
print(inputs.size())
print(label)
1
2
3
torch.Size([32, 3, 256, 256])
tensor([1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 1, 1])

Model Architecture

image

  • ReLU(inplace=True): inplace는 결과값을 새로운 변수에 저장하는 것이 아닌, 기존 데이터를 대체한다는 의미

  • self.avgpool = nn.AdaptiveAvgPool2d((6, 6)): AvgPool2d는 nn.AvgPool2d(2,stride=3) 또는 nn.AvgPool2d((2,1),stride=(3,2))로 사용하여 kernel size와 stride를 지정해준다.((2,1)은 2x1 kernel, stride=(3,2)는 H방향으로 stride=3, W방향으로 stride=2를 의미한다. 반면 nn.AdaptiveAvgPool2d는 pooling이 끝날 때의 출력을 정의한다. 즉, nn.AdaptiveAvgPool2d((6, 6))의 결과는 6x6이 된다.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
class AlexNet(nn.Module):
    def __init__(self) -> None: #->None은 함수가 반환하는 데이터 타입을 나타내는 주석과 같다. None은 반환하지 않음을 의미
        super(AlexNet, self).__init__() #작성하지 않아도 nn.Module을 상속받는다(가독성을 높이기 위한 코드)
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256*6*6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 2),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor: #Tensor를 반환함
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

Model Object 생성

1
2
model=AlexNet()
model.to(device)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=512, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=512, out_features=2, bias=True)
  )
)

Optimizer & Loss Function

1
2
optimizer=optim.SGD(model.parameters(),lr=0.001,momentum=0.9)
criterion=nn.CrossEntropyLoss().to(device)

Model Architecture 확인

1
2
from torchsummary import summary
summary(model,input_size=(3,256,256))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1           [-1, 64, 63, 63]          23,296
              ReLU-2           [-1, 64, 63, 63]               0
         MaxPool2d-3           [-1, 64, 31, 31]               0
            Conv2d-4          [-1, 192, 31, 31]         307,392
              ReLU-5          [-1, 192, 31, 31]               0
         MaxPool2d-6          [-1, 192, 15, 15]               0
            Conv2d-7          [-1, 384, 15, 15]         663,936
              ReLU-8          [-1, 384, 15, 15]               0
            Conv2d-9          [-1, 256, 15, 15]         884,992
             ReLU-10          [-1, 256, 15, 15]               0
           Conv2d-11          [-1, 256, 15, 15]         590,080
             ReLU-12          [-1, 256, 15, 15]               0
        MaxPool2d-13            [-1, 256, 7, 7]               0
AdaptiveAvgPool2d-14            [-1, 256, 6, 6]               0
          Dropout-15                 [-1, 9216]               0
           Linear-16                 [-1, 4096]      37,752,832
             ReLU-17                 [-1, 4096]               0
          Dropout-18                 [-1, 4096]               0
           Linear-19                  [-1, 512]       2,097,664
             ReLU-20                  [-1, 512]               0
           Linear-21                    [-1, 2]           1,026
================================================================
Total params: 42,321,218
Trainable params: 42,321,218
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.75
Forward/backward pass size (MB): 10.90
Params size (MB): 161.44
Estimated Total Size (MB): 173.10
----------------------------------------------------------------

Training

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def train_model(model,dataloader_dict,criterion,optimizer,num_epoch):

    since=time.time()
    best_acc=0.0

    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch+1,num_epoch))
        print('-'*20)

        for phase in ['train','val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss=0.0
            epoch_corrects=0

            for inputs,labels in tqdm(dataloader_dict[phase]):
                inputs=inputs.to(device)
                labels=labels.to(device)
                optimizer.zero_grad() #back propagation 전에 항상 gradient를 0으로 초기화해주어야 한다.

                with torch.set_grad_enabled(phase=='train'): #autograd 활성화
                    outputs=model(inputs)
                    _, preds=torch.max(outputs,1) # preds=torch.max(outputs,1)로 쓰면 preds[0]은 최대값을, preds[1]은 최대값의 인덱스 즉 label을 나타낸다.
                    loss=criterion(outputs,labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    epoch_loss += loss.item()*inputs.size(0) #inputs.size(0)은 input의 행, 즉 batch size를 의미한다. loss는 batch전체의 평균 loss이므로 batch size만큼 곱해준다.
                    epoch_corrects+=torch.sum(preds==labels.data) #예측과 정답이 얼마나 정확한지 측정

            epoch_loss=epoch_loss/len(dataloader_dict[phase].dataset) #epoch의 평균 loss를 구한다.
            epoch_acc=epoch_corrects.double()/len(dataloader_dict[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase,epoch_loss,epoch_acc))
    time_elapsed=time.time()-since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed//60,time_elapsed%60))
    return model
1
2
num_epoch=10
model=train_model(model,dataloader_dict,criterion,optimizer,num_epoch)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
Epoch 1/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6934 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6930 Acc: 0.5109
Epoch 2/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6935 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6930 Acc: 0.5109
Epoch 3/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6934 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6930 Acc: 0.5109
Epoch 4/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6933 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6930 Acc: 0.5109
Epoch 5/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6935 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6930 Acc: 0.5109
Epoch 6/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6935 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6929 Acc: 0.5109
Epoch 7/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6929 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6929 Acc: 0.5109
Epoch 8/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6931 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6929 Acc: 0.5109
Epoch 9/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6929 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6929 Acc: 0.5109
Epoch 10/10
--------------------



  0%|          | 0/13 [00:00<?, ?it/s]


train Loss: 0.6930 Acc: 0.5025



  0%|          | 0/3 [00:00<?, ?it/s]


val Loss: 0.6929 Acc: 0.5109
Training complete in 0m 37s

Evaluation

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import pandas as pd
id_list = []
pred_list = []
_id = 0
with torch.no_grad():
    for test_path in tqdm(test_images_filepath): #test dataset사용
        img = Image.open(test_path)
        _id = test_path.split('/')[-1].split('.')[1]# data의 index가져오기 (Ex dog.113.jpg라는 이미지 이름에서 113)
        transform = ImageTransform(size, mean, std)
        img = transform(img, phase='val')# test dataset pre processing
        img = img.unsqueeze(0)
        img = img.to(device)

        model.eval()
        outputs = model(img)
        preds = F.softmax(outputs, dim=1)[:, 1].tolist()

        id_list.append(_id)
        pred_list.append(preds[0])

res = pd.DataFrame({
    'id': id_list,
    'label': pred_list
}) # dataframe에 이미지의 id(번호)와 레이블 저장
res.to_csv('/content/drive/MyDrive/Pytorch_study/alexnet.csv', index=False) #이미지의 id와 레이블을 alexnet.csv 파일에 저장
1
  0%|          | 0/10 [00:00<?, ?it/s]

Dataframe 확인

1
res.head(10)
idlabel
01450.507523
12110.508150
21620.507744
32000.508484
42100.508456
52240.507943
62130.507922
71090.508929
8150.508291
91670.507889

Evaluation 결과 시각화

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
class_ = classes = {0:'cat', 1:'dog'}
def display_image_grid(images_filepaths, predicted_labels=(), cols=5):
    rows = len(images_filepaths) // cols
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(12, 6))
    for i, image_filepath in enumerate(images_filepaths):
        image = cv2.imread(image_filepath)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        a = random.choice(res['id'].values)
        label = res.loc[res['id'] == a, 'label'].values[0]
        if label > 0.5:
            label = 1
        else:
            label = 0
        ax.ravel()[i].imshow(image)
        ax.ravel()[i].set_title(class_[label])
        ax.ravel()[i].set_axis_off()
    plt.tight_layout()
    plt.show()
  • parameter가 많지만, 실습을 위해 데이터셋 양이 적기 때문에 성능이 좋지 않다.
1
display_image_grid(test_images_filepath)

Ch6_1_2_38_0

This post is licensed under CC BY 4.0 by the author.