2020-06-14

java(spring)でOAuth(1.0)認証してみる

java spring

はじめに

はてなのOAuth認証を使う機会があったので，備忘録として残しときます．

build.gradle

ライブラリはhttpリクエストにokhttp3，oauth認証にsignpost，双方の橋渡しにokhttp-signpostを使いました．

// Gradle build script for the Spring Boot OAuth 1.0 sample application.
plugins {
	id 'org.springframework.boot' version '2.3.0.RELEASE'
	// Lets the dependencies below omit versions for Spring-managed artifacts.
	id 'io.spring.dependency-management' version '1.0.9.RELEASE'
	id 'java'
}

group = 'com.example'
version = '0.0.1-SNAPSHOT'
sourceCompatibility = '11'

repositories {
	mavenCentral()
}

dependencies {
	// NOTE(review): Main.java as shown does not reference the OAuth2 client starter;
	// presumably it can be dropped for an OAuth 1.0-only sample — confirm.
	implementation 'org.springframework.boot:spring-boot-starter-oauth2-client'
	implementation 'org.springframework.boot:spring-boot-starter-thymeleaf'
	implementation 'org.springframework.boot:spring-boot-starter-web'
	// HTTP client used for the signed API request.
	implementation 'com.squareup.okhttp3:okhttp:4.7.1'
	implementation 'com.squareup.okio:okio:2.6.0'
	// Bridge that lets signpost sign okhttp3 requests (OkHttpOAuthConsumer).
	implementation 'se.akerfeldt:okhttp-signpost:1.1.0'
	// OAuth 1.0 signing plus the commons-http consumer/provider used for the token exchange.
	implementation 'oauth.signpost:signpost-core:1.2.1.2'
	implementation 'oauth.signpost:signpost-commonshttp4:1.2.1.2'
	developmentOnly 'org.springframework.boot:spring-boot-devtools'
	testImplementation('org.springframework.boot:spring-boot-starter-test') {
		// Exclude the JUnit 4 (vintage) engine; tests run on the JUnit Platform below.
		exclude group: 'org.junit.vintage', module: 'junit-vintage-engine'
	}
	testImplementation 'org.springframework.security:spring-security-test'
}

test {
	useJUnitPlatform()
}

Main.java

処理の流れとしては，indexで認証URLへリダイレクト，認証後resultへリダイレクト，getInfo()でははてなのAPIでブログ上位10件のカテゴリとその数を取ってきて表示という感じです．

import java.util.HashMap;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import oauth.signpost.OAuthConsumer;
import oauth.signpost.OAuthProvider;
import oauth.signpost.commonshttp.CommonsHttpOAuthConsumer;
import oauth.signpost.commonshttp.CommonsHttpOAuthProvider;
import oauth.signpost.exception.OAuthCommunicationException;
import oauth.signpost.exception.OAuthExpectationFailedException;
import oauth.signpost.exception.OAuthMessageSignerException;
import oauth.signpost.exception.OAuthNotAuthorizedException;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import se.akerfeldt.okhttp.signpost.OkHttpOAuthConsumer;

/**
 * Spring MVC controller that walks a user through the Hatena OAuth 1.0 flow and then
 * shows how often each category term appears in the blog's Atom entry feed.
 *
 * <p>Flow: {@code /} obtains a request token and redirects to the authorization URL;
 * Hatena redirects back to {@code /result}, where the verifier is exchanged for an access
 * token and the Atom feed is fetched and tallied.
 *
 * <p>NOTE(review): the OAuth consumer/provider and the tokens are kept in instance fields.
 * If this controller is a shared singleton (the Spring default), concurrent users would
 * overwrite each other's state — fine for a single-user sample, confirm before reuse.
 */
@Controller
public class Main {

	private static final String CONTEXT = "http://localhost:8081";

	private static final String CONSUMER_KEY = "Your API key";
	private static final String CONSUMER_SECRET = "Your secret API key";
	private static final String CALLBACK_URL = "/result";
	private static final String REQUEST_TOKEN_URL = "https://www.hatena.com/oauth/initiate";
	private static final String AUTHORIZE_URL = "https://www.hatena.com/oauth/authorize";
	private static final String ACCESS_TOKEN_URL = "https://www.hatena.com/oauth/token";
	private static final String ATOM_ENTRY_URL = "https://blog.hatena.ne.jp/%s/%s/atom/entry";
	/** URL-encoded, comma-separated list of requested scopes. */
	private static final String SCOPE = "read_public%2Cread_private%2Cwrite_public%2Cwrite_private";
	/** XML parser feature that rejects any DOCTYPE declaration (blocks XXE). */
	private static final String DISALLOW_DOCTYPE = "http://apache.org/xml/features/disallow-doctype-decl";

	private String accountId = "Your accountID";
	private String domain = "Your blog domain";
	/** Category term -> number of occurrences found by the most recent successful {@link #getInfo()}. */
	private Map<String, Integer> terms = new HashMap<String, Integer>();

	private OAuthConsumer consumer;
	private OAuthProvider provider;

	private String accessToken = "";
	private String accessTokenSecret = "";

	/**
	 * Starts the OAuth flow: fetches a request token and redirects to the authorization page.
	 *
	 * @param model unused; kept for the handler signature
	 * @return a {@code redirect:} view name pointing at the authorization URL
	 * @throws IllegalStateException if the request token cannot be obtained
	 */
	@RequestMapping("/")
	public String index(Model model) {
		consumer = new CommonsHttpOAuthConsumer(CONSUMER_KEY, CONSUMER_SECRET);
		provider = new CommonsHttpOAuthProvider(REQUEST_TOKEN_URL + "?scope=" + SCOPE, ACCESS_TOKEN_URL, AUTHORIZE_URL);

		try {
			return "redirect:" + provider.retrieveRequestToken(consumer, CONTEXT + CALLBACK_URL);
		} catch (OAuthMessageSignerException | OAuthNotAuthorizedException | OAuthExpectationFailedException | OAuthCommunicationException e) {
			// Redirecting to an empty URL would only hide the failure, so surface it instead.
			throw new IllegalStateException("Failed to obtain an OAuth request token", e);
		}
	}

	/**
	 * OAuth callback: exchanges the verifier for an access token, loads the category counts
	 * and renders them.
	 *
	 * @param verifier the {@code oauth_verifier} query parameter sent by the provider
	 * @param model receives the {@code terms} attribute
	 * @return the {@code result} view, or a redirect to {@code /} when no flow is in progress
	 * @throws IllegalStateException if the access token cannot be obtained
	 */
	@RequestMapping("/result")
	public String login(@RequestParam("oauth_verifier") String verifier, Model model) {
		// Reached without going through "/" first: there is no request token to exchange.
		if (consumer == null || provider == null) {
			return "redirect:/";
		}

		try {
			provider.retrieveAccessToken(consumer, verifier);
		} catch (OAuthMessageSignerException | OAuthNotAuthorizedException | OAuthExpectationFailedException | OAuthCommunicationException e) {
			throw new IllegalStateException("Failed to obtain an OAuth access token", e);
		}

		// The token is a credential, so it is deliberately not written to stdout.
		accessToken = consumer.getToken();
		accessTokenSecret = consumer.getTokenSecret();

		getInfo();

		model.addAttribute("terms", terms);

		return "result";
	}

	/**
	 * Fetches the Atom entry feed with a signed request and replaces {@code terms} with the
	 * number of occurrences of each {@code <category term="...">} value.
	 *
	 * <p>On an unsuccessful response or any exception the previous {@code terms} are kept and
	 * the problem is reported on stdout/stderr, as before.
	 */
	public void getInfo() {
		OAuthConsumer cons = new OkHttpOAuthConsumer(CONSUMER_KEY, CONSUMER_SECRET);
		cons.setTokenWithSecret(accessToken, accessTokenSecret);

		try {
			Request signedRequest = (Request) cons.sign(
					new Request.Builder()
							.url(String.format(ATOM_ENTRY_URL, accountId, domain))
							.build()).unwrap();

			OkHttpClient okHttpClient = new OkHttpClient.Builder().build();

			// try-with-resources: the response (and its body stream) must be closed.
			try (Response response = okHttpClient.newCall(signedRequest).execute()) {
				System.out.println("responseCode: " + response.code());

				if (!response.isSuccessful()) {
					System.out.println("Response error");
					// An error body is not the Atom feed; do not try to parse it.
					return;
				}
				if (response.body() == null) {
					return;
				}

				DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
				factory.setFeature(DISALLOW_DOCTYPE, true);
				DocumentBuilder builder = factory.newDocumentBuilder();
				Document document = builder.parse(response.body().byteStream());
				NodeList categories = document.getDocumentElement().getElementsByTagName("category");

				// Count into a fresh map so repeated calls do not add to earlier results.
				Map<String, Integer> counts = new HashMap<String, Integer>();
				for (int i = 0; i < categories.getLength(); i++) {
					Element category = (Element) categories.item(i);
					counts.merge(category.getAttribute("term"), 1, Integer::sum);
				}
				terms = counts;
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

2020-05-10

自然言語処理100本ノック2020を解く（8章-後半）

python pytorch ニューラルネットワーク機械学習自然言語処理

73．確率的勾配降下法による学習

確率的勾配降下法による学習を行なっていきます．
まずはニューラルネットワークのモデルを定義します．

コード

class MyNet(nn.Module):
    """Single fully connected layer mapping a feature vector to class scores.

    Args:
        input_size: Number of input features.
        output_size: Number of output scores; defaults to 4, the value that
            was previously hard-coded.
    """

    def __init__(self, input_size, output_size=4):
        super().__init__()
        self.fc1 = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Return the raw (pre-softmax) scores produced by the linear layer."""
        return self.fc1(x)

本問題では，とりあえず入力→出力のレイヤのみとします．
forward()は，推論処理をする部分です．順方向伝搬とも呼びます．

コード

# Network whose input width is `dim`.
mynet = MyNet(dim)
# SGD over the network's parameters with learning rate 0.01 and momentum 0.9.
optimizer = optim.SGD(mynet.parameters(), lr = 0.01, momentum = 0.9)
num_epochs = 100
# Checkpoint destination; Model.fit reads this module-level name when saving.
save_path = '../data/params.tar'

入力は，単語ベクトルの次元と同じにします．次に，pytorchからSGDを持ってきてインスタンスを作ります．
確率的勾配法や学習係数，モメンタムについてここで説明するとN番煎じな感じになるので，割愛します．
プロフェッショナルシリーズの深層学習が数式が載っていてわかりやすいです．

コード

import time
import matplotlib.pyplot as plt
%matplotlib inline

class Model():
    """Small training/evaluation harness around a PyTorch module.

    Typical use: ``Model(net)``, then ``set_params(...)``, then ``fit(x, y)``;
    ``evaluate`` and ``visualize`` report on the result.
    """

    def __init__(self, model):
        self.model = model

    def set_params(self, num_epochs, num_batches, optimizer, criterion):
        """Store the training configuration and reset the per-epoch history.

        Args:
            num_epochs: Number of passes over the training data.
            num_batches: Number of examples per mini-batch (used as the step
                when slicing the shuffled indices).
            optimizer: Optimizer used to update the wrapped module.
            criterion: Loss function called as ``criterion(output, label)``.
        """
        self.num_epochs = num_epochs
        self.num_batches = num_batches
        self.optimizer = optimizer
        self.criterion = criterion

        # Per-epoch loss and accuracy history.
        self.losses = np.zeros(num_epochs)
        self.accs = np.zeros(num_epochs)

    def fit(self, x_train, y_train, save_path=None):
        """Train the wrapped module with shuffled mini-batches.

        Args:
            x_train: Training inputs, indexable by an integer index array and
                exposing ``shape[0]``.
            y_train: Class labels, indexable the same way as ``x_train``.
            save_path: Where to write a checkpoint after every epoch. When
                omitted, a module-level ``save_path`` is used if one exists;
                otherwise no checkpoint is written.
        """
        if save_path is None:
            # Backward compatibility: the path used to be read from a global.
            save_path = globals().get('save_path')

        num_data = x_train.shape[0]
        for epoch in range(self.num_epochs):
            running_loss = 0.0
            num_correct = 0
            num_steps = 0

            # Epoch start time, reported in the log line below.
            t = time.time()

            # Shuffle the example indices so each epoch sees different batches.
            index = np.random.permutation(num_data)

            for start in range(0, num_data, self.num_batches):
                # Slicing past the end simply yields a shorter final batch.
                batch = index[start : start + self.num_batches]
                in_ = torch.as_tensor(x_train[batch]).detach().float()
                label = torch.as_tensor(y_train[batch]).detach().long()

                # Clear gradients left over from the previous step.
                self.optimizer.zero_grad()

                out_ = self.model(in_)
                loss = self.criterion(out_, label)

                # Predicted class = index of the largest score in each row.
                _, idx = torch.max(out_, 1)
                num_correct += (idx == label).sum().item()

                loss.backward()
                # Use the optimizer handed to set_params, not a global one.
                self.optimizer.step()
                running_loss += loss.item()
                num_steps += 1

            # Accuracy over examples and mean loss over the batches actually
            # run, so a short final batch cannot skew either figure.
            accuracy = num_correct / num_data
            loss = running_loss / num_steps
            print('[ epoch:{0}, time:{3} ]  acc: {1:.4f}  loss:{2:.4f}'.format(epoch + 1, accuracy, loss, time.time() - t))
            self.accs[epoch] = accuracy
            self.losses[epoch] = loss

            # Checkpoint the current state.
            if save_path is not None:
                torch.save({
                  'epoch': epoch,
                  'model_state_dict': self.model.state_dict(),
                  'optimizer_state_dict': self.optimizer.state_dict(),
                  'loss': loss,
                  'accuracy': accuracy,
                  }, save_path)

    def predict(self, x):
        """Return the wrapped module's output for ``x``."""
        out_ = self.model(x)
        return out_

    def evaluate(self, x, y):
        """Return the fraction of rows of ``x`` whose top-scoring class equals ``y``.

        Args:
            x: Input tensor accepted by the wrapped module.
            y: Labels, one per row of ``x`` (tensor or array-like).
        """
        # No gradients are needed to measure accuracy.
        with torch.no_grad():
            p = self.predict(x)
        _, idx = torch.max(p, 1)
        if torch.is_tensor(y):
            labels = y.detach()
        else:
            labels = torch.as_tensor(np.asarray(y))
        cnt = (idx == labels.reshape(-1)).sum().item()
        return cnt / idx.shape[0]

    def visualize(self):
        """Plot per-epoch accuracy (top panel) and loss (bottom panel)."""
        x = np.arange(len(self.accs))
        fig = plt.figure()
        ax = fig.add_subplot(211)
        ax.plot(x, self.accs, color = 'red', linewidth = 2, label = 'accuracy')
        ax2 = fig.add_subplot(212)
        ax2.plot(x, self.losses, color = 'green', linewidth = 2, label = 'loss')
        ax.legend()
        ax2.legend()
        plt.plot()

# Wrap the network and train with a mini-batch size of 1 (second argument).
# NOTE(review): `criterion` is not defined in this snippet; presumably it comes
# from an earlier notebook cell — confirm.
model = Model(mynet)
model.set_params(num_epochs, 1, optimizer, criterion)

model.fit(x_feature, y_train)

実行結果

[ epoch:1, time:2.592939853668213 ]  acc: 0.8776  loss:0.3732
[ epoch:2, time:2.466198205947876 ]  acc: 0.9112  loss:0.2633
[ epoch:3, time:2.4603400230407715 ]  acc: 0.9181  loss:0.2404
...
[ epoch:98, time:2.58657169342041 ]  acc: 0.9427  loss:0.1618
[ epoch:99, time:2.535313129425049 ]  acc: 0.9415  loss:0.1610
[ epoch:100, time:2.559777021408081 ]  acc: 0.9419  loss:0.1612

細かい説明などは問題に合わせてしていきます．
エポックは100で学習しています．
ロスが減っていくのが確認できます．

74．正解率の計測

学習データと評価データに対する予測をし，その正解率を求めます．
Modelクラスのevaluateメソッドを使います．

コード

    # Modelクラス内
    def evaluate(self, x, y):
        p = self.predict(x)
        _, idx = torch.max(p, 1)
        cnt = 0
        for i in range(idx.shape[0]):
            if idx[i] == y[i]:
                cnt += 1
        accuracy = cnt / idx.shape[0]
        
        return accuracy

# Accuracy on the training features.
print(model.evaluate(x_feature.float(), y_train))
# Accuracy on the validation data, converted with vec2sum first.
print(model.evaluate(torch.tensor(vec2sum(x_valid, dim)).float(), y_valid))

実行結果

0.9435002296738632
0.9237132352941176

75．損失と正解率のプロット

この問題では，Modelクラス内のvisualizeメソッドを使います．

    def visualize(self):
        """Draw accuracy (upper panel) and loss (lower panel) against epoch index."""
        epochs = np.arange(len(self.accs))
        fig = plt.figure()
        panels = (
            (211, self.accs, 'red', 'accuracy'),
            (212, self.losses, 'green', 'loss'),
        )
        axes = []
        for position, series, colour, name in panels:
            axis = fig.add_subplot(position)
            axis.plot(epochs, series, color = colour, linewidth = 2, label = name)
            axes.append(axis)
        # Legends are added only after both panels have been drawn.
        for axis in axes:
            axis.legend()
        plt.plot()

# Plot the accuracy/loss history recorded during fit.
model.visualize()

実行結果

f:id:homuhomu0131:20200511211840p:plain — 正解率と損失の推移

76．チェックポイント

73のコードの＃状況の保存の部分でチェックポイントを記録しています．

77．ミニバッチ化

いくつかの事例毎にパラメータを更新するようにします．
73のコードでは，num_batchesでバッチサイズを指定しています．
毎エポックでインデックスをランダムにしたものを作成し，まとめて入力とします．

コード

# Train again with a mini-batch size of 4.
# NOTE(review): this wraps the same `mynet` and `optimizer` objects as before;
# presumably they were re-created in the notebook before this run — confirm.
model2 = Model(mynet)
model2.set_params(num_epochs, 4, optimizer, criterion)
model2.fit(x_feature, y_train)

実行結果

[ epoch:1, time:0.7158238887786865 ]  acc: 0.8294  loss:0.5254
[ epoch:2, time:0.6863267421722412 ]  acc: 0.8870  loss:0.3467
[ epoch:3, time:0.6839478015899658 ]  acc: 0.9010  loss:0.3032
...
[ epoch:98, time:0.7376530170440674 ]  acc: 0.9402  loss:0.1720
[ epoch:99, time:0.723405122756958 ]  acc: 0.9386  loss:0.1723
[ epoch:100, time:0.7342729568481445 ]  acc: 0.9388  loss:0.1723

バッチサイズを4にしたところ，1エポックにかかる時間が大体1/4になりました．

78．GPU上での学習

割愛します．

79．多層ニューラルネットワーク

ニューラルネットワークをカスタマイズします．

コード

class MyNet2(nn.Module):
    """Three-layer perceptron: input -> h_size -> h_size // 2 -> output, with ReLU between layers.

    Args:
        input_size: Number of input features.
        h_size: Width of the first hidden layer; the second is half of it
            (rounded down).
        output_size: Number of output scores.
    """

    def __init__(self, input_size, h_size, output_size):
        super().__init__()
        # Layer sizes must be ints; `/` yields a float in Python 3, so use `//`.
        half_size = h_size // 2
        self.fc1 = torch.nn.Linear(input_size, h_size)
        self.fc2 = torch.nn.Linear(h_size, half_size)
        self.fc3 = torch.nn.Linear(half_size, output_size)

    def forward(self, x):
        """Return the raw (pre-softmax) scores for ``x``."""
        x = func.relu(self.fc1(x))
        x = func.relu(self.fc2(x))
        return self.fc3(x)

# Multi-layer network: `dim` inputs, first hidden width 100, 4 outputs.
mynet = MyNet2(dim, 100, 4)
optimizer = optim.SGD(mynet.parameters(), lr = 0.01, momentum = 0.9)
criterion = torch.nn.CrossEntropyLoss()
# Checkpoint destination read by Model.fit.
save_path = '../data/params.tar'

# Train with a mini-batch size of 2 (second argument to set_params).
model = Model(mynet)
model.set_params(num_epochs, 2, optimizer, criterion)
model.fit(x_feature, y_train)

実行結果

[ epoch:1, time:2.569727897644043 ]  acc: 0.8461  loss:0.4161
[ epoch:2, time:2.516397714614868 ]  acc: 0.9080  loss:0.2681
[ epoch:3, time:2.531559944152832 ]  acc: 0.9192  loss:0.2329
...
[ epoch:98, time:3.5405173301696777 ]  acc: 0.9985  loss:0.0020
[ epoch:99, time:3.155651092529297 ]  acc: 0.9987  loss:0.0020
[ epoch:100, time:3.258673906326294 ]  acc: 0.9985  loss:0.0020

中間層を追加しました．
精度も見てみます．

コード

# Accuracy on the training features, then on the validation data, then the history plot.
print(model.evaluate(x_feature.float(), y_train))
print(model.evaluate(torch.tensor(vec2sum(x_valid, dim)).float(), y_valid))
model.visualize()

0.9989664676159853
0.9365808823529411

f:id:homuhomu0131:20200511212316p:plain — 正解率と損失の推移

過適合してますね．．．

とりあえず，以上とします．

何かございましたらコメントください．．．

全コード：
github.com

前回：
pongyun.hatenablog.com

2020-05-09

自然言語処理100本ノック2020を解く（8章-前半）

機械学習ニューラルネットワーク自然言語処理 python pytorch

あなたは何？

都市圏でITエンジニア（の研修）をやっているものです．
ありがたいことに，研修中にたくさんインプットができているので，少しずつ
アウトプットしていけたらと考えています．

本題

自然言語処理100本ノックの2020年版の解説があまり出回っていない気がしたので，需要
があるかと思い書いています．クオリティは担保しません．
今回は，8章の前半の解き方を載せます．合っているかはわかりません．
間違っていたらご指摘いただくと非常に非常に助かります．
　

70．単語ベクトルの和による特徴量

この章では，ニューラルネットによる学習をします．
入力が300次元の特徴ベクトル，出力が4次元のカテゴリ，というイメージです．

本問題では，入力の特徴ベクトルを作ります．特徴ベクトルの数式は以下．

$x_i=\displaystyle \frac{1}{T_i} \sum_{t=1}^{T_i}emb(w_{i,t})$

$T_i$ は記事見出しに含まれる単語数で， $emb(w_{i,t})$ はそれぞれの単語のベクトルです．
つまり，記事見出し中の単語ベクトルの平均を計算すれば良いです．
次に，正解ラベルについてですが，これはカテゴリ毎に何か番号を振れば良いです．

コード

import re

import numpy
import numpy as np
from gensim.models import KeyedVectors
# Load the word vectors from the binary word2vec-format file.
news_path = '../data/GoogleNews-vectors-negative300.bin'
words = KeyedVectors.load_word2vec_format(news_path, binary = True)

def vec2sum(x_data, dim, embeddings=None):
    """Convert each text into the mean of its known word vectors.

    Args:
        x_data: Sized iterable of strings (one text per example).
        dim: Dimensionality of the word vectors.
        embeddings: Mapping from word to vector that raises ``KeyError`` for
            unknown words. Defaults to the module-level ``words``.

    Returns:
        Array of shape ``(len(x_data), dim)``. A row stays all-zero when none
        of the text's words has a vector.
    """
    if embeddings is None:
        embeddings = words

    # Runs of ASCII letters are treated as words.
    pattern = re.compile(r'[a-zA-Z]+')
    vector_sum = np.zeros((len(x_data), dim))
    for i, s in enumerate(x_data):
        vector = np.zeros(dim)
        cnt = 0
        for token in pattern.findall(s):
            try:
                vector += embeddings[token]
            except KeyError:
                # Unknown word: skip it without counting it in the mean.
                continue
            cnt += 1
        if cnt:
            vector_sum[i] = vector / cnt

    return vector_sum

# Take the vector dimensionality from one vocabulary entry.
dim = words['US'].shape[0]
print(len(x_train))
# Replace the raw texts with their averaged word vectors.
# NOTE(review): later snippets read `x_feature`, not `x_train`; presumably the
# result was also bound to that name in the notebook — confirm.
x_train = vec2sum(x_train, dim)
print(x_train.shape)

実行結果

8708
(8708, 300)

ベクトル辞書に含まれていない単語があるので，その場合は単語数をカウントせずにループを続けます．
とりあえず，記事数8708の300次元の特徴ベクトルができました．

71．単層ニューラルネットワークによる予測

次に，上記の特徴ベクトルとランダムな重み行列 $W$ を掛け合わせ，それをソフトマックス関数の入力として計算します．
ソフトマックス関数は以下．

$y_k = \displaystyle \frac{exp(x_k)}{\sum_{i=1} ^ {n}exp(x_i)}$

これをそのまま実装しても良いですが，今回はpytorchを用いて楽をします．

コード

import torch
import torch.nn as nn
import torch.nn.functional as func
import torch.optim as optim
from torch.autograd import Variable

# Random weight matrix mapping `dim` features to 4 scores.
w = torch.randn(dim, 4, requires_grad = True)
x_feature = torch.tensor(x_feature, requires_grad = True)

# Single example. `dim = 1` is given explicitly: the result of mm is 2-D, and
# omitting `dim` relies on a deprecated implicit choice.
y = func.softmax(torch.mm(x_feature[:1].float(), w), dim = 1)
print(y)

# Several examples: softmax is applied independently to each row.
Y = func.softmax(torch.mm(x_feature[:4].float(), w), dim = 1)
print(Y)

実行結果

tensor([[0.0795, 0.1212, 0.1688, 0.6305]], grad_fn=<SoftmaxBackward>)
tensor([[0.0645, 0.4789, 0.1243, 0.3324],
        [0.3788, 0.2161, 0.1569, 0.2482],
        [0.2830, 0.0544, 0.2407, 0.4219],
        [0.1088, 0.0486, 0.3351, 0.5075]], grad_fn=<SoftmaxBackward>)

72．損失と勾配の計算

クロスエントロピー誤差と重み $W$ に対する勾配を計算します.
クロスエントロピー誤差の算出は以下．tを正解ラベルとします．

$loss(x, t) = -\log \displaystyle \frac{exp(x_t)}{\sum_{i=1} ^ {n}exp(x_i)}$

つまり，softmaxの出力のうち正解ラベルtに対応する確率の負の対数です．集合に対してはその平均を返します．
ここもpytorchで楽をします．pytorchでは，CrossEntropyLossがsoftmaxの計算も内部で行うため，softmaxを通す前の値をそのまま渡します．
y_trainは正解ラベルです．

コード

from math import exp, log

criterion = nn.CrossEntropyLoss()
y_train = torch.tensor(y_train.astype(float), requires_grad = True)

# Single example: cross-entropy loss and its gradient with respect to w.
y_ = criterion(torch.mm(x_feature[:1].float(), w), y_train[:1].long())
print(y_)
y_.backward()
print(w.grad)

# backward() adds to w.grad, so clear it; otherwise the gradient printed below
# would be the sum of the single-example and four-example gradients.
w.grad.zero_()

# Several examples: the loss is the mean over the four examples.
Y_ = criterion(torch.mm(x_feature[:4].float(), w), y_train[:4].long())
print(Y_)
Y_.backward()
print(w.grad)

# Cross-check: mean negative log of the softmax probability of the true label,
# using the softmax output `Y` computed for the same four examples.
ans = []
for yy, i in zip(Y, y_train[:4].long()):
    ans.append(-log(yy[i]))
print(sum(ans) / len(ans))

実行結果

tensor(2.0852, grad_fn=<NllLossBackward>)
tensor([[-0.0011, -0.0080,  0.0147, -0.0056],
        [ 0.0072,  0.0533, -0.0976,  0.0370],
        [-0.0008, -0.0062,  0.0113, -0.0043],
        ...,
        [-0.0055, -0.0412,  0.0753, -0.0286],
        [-0.0021, -0.0154,  0.0282, -0.0107],
        [-0.0010, -0.0071,  0.0129, -0.0049]])
tensor(1.2895, grad_fn=<NllLossBackward>)
tensor([[-9.5167e-03, -1.6568e-03,  4.1909e-03,  6.9826e-03],
        [-5.0837e-03,  7.4656e-02, -1.3673e-01,  6.7157e-02],
        [ 7.1513e-03, -1.1068e-02,  1.4147e-02, -1.0230e-02],
        ...,
        [-9.3671e-03, -4.7332e-02,  9.5406e-02, -3.8707e-02],
        [ 9.5714e-05, -1.8119e-02,  3.1322e-02, -1.3299e-02],
        [ 7.7742e-03, -1.2784e-02,  3.0687e-02, -2.5677e-02]])
1.2895281521241457

出ましたね．検算を一応してみましたが，合っていました．
勾配に関しては，backward()で微分をしてくれるので，とても楽ができます．

今回は一旦ここまでとします．後半はひとまとまりにできるので．．．
では．．．！

次回：
pongyun.hatenablog.com

出典：
自然言語処理100本ノック
nlp100.github.io