# **CIS 7000: Trustworthy Machine Learning HW 1**

Instructions: Implement the following functions:

- adversarial_example: Compute an adversarial example for the given input
- randomized_smoothing: The randomized smoothing algorithm

You should submit your code together with a PDF containing the following:

- The plot produced at the very end
- A one-paragraph description of the plot, interpreting the comparison across the different lines as well as the slopes of the lines

In [None]:
import torch
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Seed PyTorch's global random number generator so repeated runs draw the
# same random numbers (weight initialisation, shuffling, sampled noise).
torch.manual_seed(0)

In [None]:
class Net(nn.Module):
    """Fully connected classifier: 784 inputs -> 512 hidden units -> 10 outputs."""

    def __init__(self):
        super().__init__()
        # Hidden layer followed by the output layer.
        self.fc1 = nn.Linear(784, 512)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        """Flatten ``x`` into rows of 784 values and return the raw ``fc2`` outputs.

        No softmax is applied; the result has one row per flattened input and
        10 columns.
        """
        flat = x.view(-1, 784)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

def load_data():
    """Build the MNIST training and test data loaders.

    Images are converted to tensors and then normalized with mean 0.5 and
    standard deviation 0.5. The datasets are stored under ``./data`` and
    downloaded if necessary.

    Returns:
        A ``(trainloader, testloader)`` pair. The training loader shuffles and
        yields batches of 64; the test loader keeps the dataset order and
        yields batches of 128.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])
    # Arguments shared by the train and test splits.
    mnist_kwargs = dict(root='./data', download=True, transform=preprocess)

    train_split = datasets.MNIST(train=True, **mnist_kwargs)
    train_batches = torch.utils.data.DataLoader(train_split, batch_size=64, shuffle=True)
    test_split = datasets.MNIST(train=False, **mnist_kwargs)
    test_batches = torch.utils.data.DataLoader(test_split, batch_size=128, shuffle=False)
    return train_batches, test_batches

def train_model(trainloader):
    """Train a fresh ``Net`` for 5 epochs and return it.

    Uses cross-entropy loss and Adam with a learning rate of 0.001. After each
    epoch one line is printed with the epoch index and the mean of the
    per-batch losses seen in that epoch.

    Args:
        trainloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        The trained ``Net`` instance.
    """
    net = Net()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    for epoch in range(5):
        running_loss = 0.0
        n_batches = 0
        for inputs, labels in trainloader:
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_batches += 1
        # ``criterion`` already averages within a batch, so ``running_loss`` is
        # a sum of batch means; dividing by the batch count gives the epoch
        # average. ``max`` keeps an empty loader from dividing by zero.
        print("epoch: %d, loss: %.3f" % (epoch, running_loss / max(n_batches, 1)))
    return net

def evaluate_model(testimages, testlabels, net):
    """Return the fraction of ``testimages`` that ``net`` classifies correctly.

    Args:
        testimages: Batch passed to ``net`` in a single call.
        testlabels: Tensor of target class indices, one per row of the output.
        net: Callable returning one row of class scores per input.

    Returns:
        Number of correct predictions divided by ``testlabels.size(0)``.
    """
    with torch.no_grad():
        scores = net(testimages)
        # Index of the highest score in each row is the predicted class.
        predicted = torch.max(scores.data, 1)[1]
        n_correct = (predicted == testlabels).sum().item()
        n_total = testlabels.size(0)
    return n_correct / n_total

In [None]:
# TODO: Implement the Fast Gradient Sign Method (FGSM) for computing adversarial examples
# Note: Do NOT clamp the output in your implementation (due to the normalization)
def adversarial_example(x, net, eps):
    """Compute an FGSM adversarial example for ``x`` (not implemented yet).

    The body is still the ``...`` placeholder, so the function currently
    returns ``None``.

    Args:
        x: Input to perturb. The callers in this file pass one image at a
            time, obtained by iterating over a batch of test images.
        net: Model under attack.
        eps: Perturbation size.
            NOTE(review): presumably the FGSM step size in normalized pixel
            units -- confirm against the assignment text.

    Returns:
        Callers pass the result straight to ``net(...)`` and to
        ``randomized_smoothing``, then index the prediction with ``[0]``, so
        an implementation must return a tensor that ``net`` accepts.
        NOTE(review): no label is passed in, so the loss target presumably has
        to be derived inside the function -- confirm the intended choice.
    """
    ...

def evaluate_adversarial(testimages, testlabels, net, eps):
    """Return the accuracy of ``net`` on adversarially perturbed inputs.

    Each image is perturbed on its own with ``adversarial_example`` and then
    classified by ``net``; the first row of the prediction is compared with the
    image's label.

    Args:
        testimages: Iterable of images, processed one at a time.
        testlabels: Labels paired element-wise with ``testimages``.
        net: Model that is both attacked and evaluated.
        eps: Perturbation size forwarded to ``adversarial_example``.

    Returns:
        Fraction of images still classified correctly after perturbation.
    """
    n_right = 0
    n_seen = 0
    for sample, target in zip(testimages, testlabels):
        perturbed = adversarial_example(sample, net, eps)
        scores = net(perturbed)
        top_class = torch.max(scores.data, 1)[1]
        if (top_class[0] == target).item():
            n_right += 1
        n_seen += 1
    return n_right / n_seen

# TODO: Implement the randomized smoothing algorithm
def randomized_smoothing(x, net, eps, n_samples):
    """Run randomized smoothing of ``net`` at ``x`` (not implemented yet).

    The body is still the ``...`` placeholder, so the function currently
    returns ``None``.

    Args:
        x: Input to classify. The callers in this file pass a single image,
            either clean or produced by ``adversarial_example``.
        net: Base model being smoothed.
        eps: NOTE(review): presumably the scale of the sampled noise --
            confirm against the assignment text.
        n_samples: NOTE(review): presumably the number of noisy copies of
            ``x`` to evaluate -- confirm.

    Returns:
        Callers apply ``torch.max(result.data, 1)`` and read element ``[0]``
        of the indices, so an implementation must return a 2-D tensor with at
        least one row and the class scores along dimension 1.
    """
    ...

def evaluate_smoothing(testimages, testlabels, net, eps, n_samples):
    """Return clean and adversarial accuracy of the smoothed classifier.

    For every image the smoothed prediction is computed twice: once on the
    image itself and once on the output of ``adversarial_example``, which is
    called with the base ``net``.

    Args:
        testimages: Iterable of images, processed one at a time.
        testlabels: Labels paired element-wise with ``testimages``.
        net: Base model handed to both helper functions.
        eps: Forwarded to ``adversarial_example`` and ``randomized_smoothing``.
        n_samples: Forwarded to ``randomized_smoothing``.

    Returns:
        ``(clean_accuracy, adversarial_accuracy)`` as fractions of the images
        processed.
    """
    clean_hits = 0
    adv_hits = 0
    n_seen = 0
    for sample, target in zip(testimages, testlabels):
        # Smoothed prediction on the unmodified image.
        clean_scores = randomized_smoothing(sample, net, eps, n_samples)
        clean_pred = torch.max(clean_scores.data, 1)[1]
        clean_hits += (clean_pred[0] == target).item()

        # Smoothed prediction on the perturbed image.
        perturbed = adversarial_example(sample, net, eps)
        adv_scores = randomized_smoothing(perturbed, net, eps, n_samples)
        adv_pred = torch.max(adv_scores.data, 1)[1]
        adv_hits += (adv_pred[0] == target).item()

        n_seen += 1
    return clean_hits / n_seen, adv_hits / n_seen

In [None]:
# Build the MNIST training and test loaders (downloads into ./data if needed).
trainloader, testloader = load_data()

In [None]:
# Train the classifier on the training loader; one loss line is printed per epoch.
net = train_model(trainloader)

In [None]:
# Single-setting check: evaluate every approach at one perturbation size.
eps = 0.2
n_samples = 50
# Only the first test batch is used for all evaluations below.
testimages, testlabels = next(iter(testloader))
# Base model on clean inputs.
acc = evaluate_model(testimages, testlabels, net)
print("accuracy: %.3f" % acc)
# Base model on adversarially perturbed inputs.
acc_adv = evaluate_adversarial(testimages, testlabels, net, eps)
print("adversarial accuracy: %.3f" % acc_adv)
# Smoothed classifier on clean and on perturbed inputs.
# NOTE(review): the two lines printed here reuse the labels printed above even
# though they report the smoothed classifier.
acc_smooth, acc_smooth_adv = evaluate_smoothing(testimages, testlabels, net, eps, n_samples)
print("accuracy:             %.3f" % acc_smooth)
print("adversarial accuracy: %.3f" % acc_smooth_adv)

In [None]:
# Sweep the perturbation size and record every accuracy for the final plot.
eps_list = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]

# Clean accuracy of the base model does not take eps, so it is evaluated once
# and repeated to give a flat reference line with one point per eps value.
acc = evaluate_model(testimages, testlabels, net)
acc_list = [acc] * len(eps_list)

acc_adv_list = []
acc_smooth_list = []
acc_smooth_adv_list = []
# Note: the loop rebinds the notebook-level name ``eps``.
for eps in eps_list:
    acc_adv = evaluate_adversarial(testimages, testlabels, net, eps)
    acc_smooth, acc_smooth_adv = evaluate_smoothing(testimages, testlabels, net, eps, n_samples)
    acc_adv_list.append(acc_adv)
    acc_smooth_list.append(acc_smooth)
    acc_smooth_adv_list.append(acc_smooth_adv)

In [None]:
# Draw one accuracy-versus-epsilon curve per approach. The curves are plotted
# in the same order as the legend entries below.
curves = [
    (acc_list, 'b'),
    (acc_adv_list, 'r'),
    (acc_smooth_list, 'g'),
    (acc_smooth_adv_list, 'y'),
]
for series, colour in curves:
    plt.plot(eps_list, series, marker='o', linestyle='-', color=colour)

plt.legend(['Accuracy', 'Adversarial Accuracy', 'Smoothed Accuracy', 'Adversarial Smoothed Accuracy'])
plt.title('Accuracy of Different Approaches')
plt.xlabel('Epsilon')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()