1. RNN
◼ import
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
◼ Load the data with sklearn.datasets.fetch_20newsgroups
newsgroups_data = fetch_20newsgroups(subset='all')
texts, labels = newsgroups_data.data, newsgroups_data.target
labels: array([10, 3, 17, ..., 3, 1, 7])
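To get a feel for the dataset (a small inspection sketch, not part of the original code), the number of documents and a few of the 20 category names can be printed:
print(len(texts))                        # 18846 documents in the 'all' subset
print(newsgroups_data.target_names[:3])  # ['alt.atheism', 'comp.graphics', ...]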
◼ Split the data into 80% train / 20% test
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=2024)
# Check the size of each split
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
◼ Vectorize the text with CountVectorizer
# Turn each document into a bag-of-words count vector
# token_pattern=r'\b\w+\b'
#   r  : raw string, so backslashes are not interpreted as escape sequences
#   \b : word boundary (start or end of a word)
#   \w : a word character (letter, digit, or underscore)
# max_features must match the model's input_size below (10000)
vectorizer = CountVectorizer(max_features=10000, token_pattern=r'\b\w+\b')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
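A quick check of what the vectorizer produced (an illustrative snippet, not in the original post) shows the vocabulary size and the shape of the resulting matrices:
print(len(vectorizer.vocabulary_))   # 10000, capped by max_features
print(X_train.shape, X_test.shape)   # (15076, 10000) (3770, 10000)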
◼ Convert to PyTorch tensors
# Convert the NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
◼ Define the Dataset class
# Dataset wrapping the feature and label tensors
class NewsGroupDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
◼ Create the datasets and inspect a sample
train_dataset = NewsGroupDataset(X_train_tensor, y_train_tensor)
test_dataset = NewsGroupDataset(X_test_tensor, y_test_tensor)
len(train_dataset)
train_dataset[0]
15076
(tensor([0., 0., 0., ..., 0., 0., 0.]), tensor(11))
◼ Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
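As a sanity check (not in the original code), one batch can be pulled from the loader to confirm the shapes the model will receive; note there is no sequence dimension yet, which is why the training loop below calls unsqueeze(1):
X_sample, y_sample = next(iter(train_loader))
print(X_sample.shape)  # torch.Size([64, 10000]) -> (batch, features)
print(y_sample.shape)  # torch.Size([64])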
◼ Build the RNN model
# RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        # x: (batch, seq_len, input_size); initial hidden state: (num_layers, batch, hidden_size)
        h = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h)
        # Classify from the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out
input_size = 10000
hidden_size = 128
output_size = len(label_encoder.classes_)
num_layers = 1
model = RNNModel(input_size, hidden_size, output_size, num_layers)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
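A dummy forward pass (an optional sanity check, assuming the variables defined above) confirms the output shape; the input needs a sequence dimension of length 1:
dummy = torch.randn(2, 1, input_size).to(device)  # (batch=2, seq_len=1, features)
with torch.no_grad():
    print(model(dummy).shape)  # torch.Size([2, 20]) -> one logit per newsgroup class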
◼ Training
# Training loop
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        # Add a sequence dimension: (batch, input_size) -> (batch, seq_len=1, input_size)
        X_batch = X_batch.unsqueeze(1)
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = loss_fun(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')
Epoch: 1/10, Loss: 0.5859
Epoch: 2/10, Loss: 0.2082
Epoch: 3/10, Loss: 0.0576
Epoch: 4/10, Loss: 0.0193
Epoch: 5/10, Loss: 0.0186
Epoch: 6/10, Loss: 0.0040
Epoch: 7/10, Loss: 0.0042
Epoch: 8/10, Loss: 0.0135
Epoch: 9/10, Loss: 0.0040
Epoch: 10/10, Loss: 0.0012
◼ Evaluate the model
model.eval()
y_true, y_pred = [], []  # collect true and predicted labels (avoid overwriting y_test)
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.unsqueeze(1).to(device)
        outputs = model(X_batch)
        _, pred = torch.max(outputs, 1)
        y_true.extend(y_batch.numpy())
        y_pred.extend(pred.cpu().numpy())
accuracy = accuracy_score(y_true, y_pred)
print(f'accuracy: {accuracy:.4f}')
accuracy: 0.8984
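Accuracy alone hides per-class behaviour; if a finer-grained view is wanted, sklearn's classification_report can be run on the y_true / y_pred lists collected above (an optional addition, not in the original post):
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, target_names=newsgroups_data.target_names))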
2. LSTM(Long Short-Term Memory)
* The vanilla RNN loses information over long sequences and suffers from the vanishing gradient problem.
* LSTM is a kind of recurrent neural network (RNN) designed to learn long sequences effectively.
- Reference: https://wikidocs.net/22888
1. LSTM structure
[Figure: LSTM cell structure diagram]
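As a small, standalone illustration of that structure (not part of the classification code below), nn.LSTM carries two states per step, a hidden state h and a cell state c, whereas the vanilla RNN above only carries h:
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)
x = torch.randn(4, 5, 8)      # (batch, seq_len, features)
out, (h_n, c_n) = lstm(x)     # initial states default to zeros
print(out.shape)   # torch.Size([4, 5, 16]) - hidden state at every time step
print(h_n.shape)   # torch.Size([1, 4, 16]) - final hidden state
print(c_n.shape)   # torch.Size([1, 4, 16]) - final cell state (LSTM only)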
◼ LSTM text classification (full code)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
newsgroups_data = fetch_20newsgroups(subset='all')
texts, labels = newsgroups_data.data, newsgroups_data.target
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=2024)
vectorizer = CountVectorizer(max_features=10000, token_pattern=r'\b\w+\b')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
class NewsGroupDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
train_dataset = NewsGroupDataset(X_train_tensor, y_train_tensor)
test_dataset = NewsGroupDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        # Initial hidden and cell states: (num_layers, batch, hidden_size)
        h = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h, c))
        out = self.fc(out[:, -1, :])
        return out
input_size = 10000
hidden_size = 128
output_size = len(label_encoder.classes_)
num_layers = 1
model = LSTMModel(input_size, hidden_size, output_size, num_layers)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.unsqueeze(1)  # (batch, seq_len=1, input_size)
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = loss_fun(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.unsqueeze(1).to(device)
        outputs = model(X_batch)
        _, pred = torch.max(outputs, 1)
        y_true.extend(y_batch.numpy())
        y_pred.extend(pred.cpu().numpy())
accuracy = accuracy_score(y_true, y_pred)
print(f'accuracy: {accuracy:.4f}')
Epoch: 1/10, Loss: 0.5154
Epoch: 2/10, Loss: 0.1622
Epoch: 3/10, Loss: 0.0408
Epoch: 4/10, Loss: 0.0124
Epoch: 5/10, Loss: 0.0042
Epoch: 6/10, Loss: 0.0155
Epoch: 7/10, Loss: 0.0037
Epoch: 8/10, Loss: 0.0028
Epoch: 9/10, Loss: 0.0050
Epoch: 10/10, Loss: 0.0019
accuracy: 0.9040
3. GRU (Gated Recurrent Unit)
- A type of RNN similar to the LSTM but with a simpler structure
- Unlike the LSTM, it has no cell state; it processes information with an update gate and a reset gate (see the sketch after this list)
* Reference: https://wikidocs.net/22889
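A minimal sketch (illustrative only) of the difference in code: nn.GRU returns just a hidden state, with no separate cell state like nn.LSTM:
import torch
import torch.nn as nn

gru = nn.GRU(input_size=8, hidden_size=16, batch_first=True)
x = torch.randn(4, 5, 8)   # (batch, seq_len, features)
out, h_n = gru(x)          # only a hidden state is returned
print(out.shape)  # torch.Size([4, 5, 16])
print(h_n.shape)  # torch.Size([1, 4, 16]) - no cell state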
◼ GRU text classification (full code)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
newsgroups_data = fetch_20newsgroups(subset='all')
texts, labels = newsgroups_data.data, newsgroups_data.target
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=2024)
vectorizer = CountVectorizer(max_features=10000, token_pattern=r'\b\w+\b')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
class NewsGroupDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
train_dataset = NewsGroupDataset(X_train_tensor, y_train_tensor)
test_dataset = NewsGroupDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        # Initial hidden state: (num_layers, batch, hidden_size)
        h = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h)
        out = self.fc(out[:, -1, :])
        return out
input_size = 10000
hidden_size = 128
output_size = len(label_encoder.classes_)
num_layers = 1
model = GRUModel(input_size, hidden_size, output_size, num_layers)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.unsqueeze(1)  # (batch, seq_len=1, input_size)
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = loss_fun(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.unsqueeze(1).to(device)
        outputs = model(X_batch)
        _, pred = torch.max(outputs, 1)
        y_true.extend(y_batch.numpy())
        y_pred.extend(pred.cpu().numpy())
accuracy = accuracy_score(y_true, y_pred)
print(f'accuracy: {accuracy:.4f}')
Epoch: 1/10, Loss: 0.4873
Epoch: 2/10, Loss: 0.2774
Epoch: 3/10, Loss: 0.1635
Epoch: 4/10, Loss: 0.0073
Epoch: 5/10, Loss: 0.0141
Epoch: 6/10, Loss: 0.0086
Epoch: 7/10, Loss: 0.0077
Epoch: 8/10, Loss: 0.0026
Epoch: 9/10, Loss: 0.0062
Epoch: 10/10, Loss: 0.0022
accuracy: 0.9027
4. LSTM vs GRU
- LSTM and GRU were both devised to address the vanishing gradient problem of the vanilla RNN
- They use gate mechanisms to keep important information and discard what is unnecessary
- Because they handle long sequences effectively, they are used in many NLP tasks
[Comparison figure: LSTM cell vs GRU cell]
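One concrete way to compare the two (a hedged sketch reusing the same dimensions as the models above) is to count their parameters; the GRU uses three sets of gate weights versus the LSTM's four, so it ends up roughly 25% smaller:
import torch.nn as nn

lstm = nn.LSTM(input_size=10000, hidden_size=128, batch_first=True)
gru = nn.GRU(input_size=10000, hidden_size=128, batch_first=True)

num_params = lambda m: sum(p.numel() for p in m.parameters())
print(f'LSTM parameters: {num_params(lstm):,}')  # 4 weight/bias sets (i, f, g, o)
print(f'GRU parameters:  {num_params(gru):,}')   # 3 weight/bias sets (r, z, n) -> about 3/4 of the LSTM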
@ Assignment 1: Sentiment classification of Naver Shopping reviews
Use an LSTM or GRU model, implemented with PyTorch.
: https://wikidocs.net/94600
@ Assignment 2: Paper review
Sequence to Sequence Learning with Neural Networks: https://arxiv.org/abs/1409.3215
Neural Machine Translation by Jointly Learning to Align and Translate: https://arxiv.org/abs/1409.0473
Effective Approaches to Attention-based Neural Machine Translation: https://arxiv.org/abs/1508.04025