refirio.org

Memo

メモ > 技術 > プログラミング言語: Python > scikit-learn（機械学習）での最適なアルゴリズムやパラメータを探す
■scikit-learn（機械学習）での最適なアルゴリズムやパラメータを探す
$ sudo pip3 install pytest
$ sudo pip3 install cython


各アルゴリズムの正解率を比較

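ただし2021年2月現在、sklearn.utils.testing から all_estimators をインポートするコードを実行すると以下のエラーになる（新しいバージョンの scikit-learn では sklearn.utils.testing モジュールが削除されているため）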

$ python3 algorithm.py 
Traceback (most recent call last):
  File "algorithm.py", line 5, in <module>
    from sklearn.utils.testing import all_estimators
ModuleNotFoundError: No module named 'sklearn.utils.testing'


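以下をインストールしても変化なし（sklearn.utils は scikit-learn 本体に含まれるモジュールであり、pip で個別にインストールするパッケージではないため。all_estimators は「from sklearn.utils import all_estimators」でインポートできる）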

$ sudo pip3 install sklearn.utils


以下は以前のメモ

# Compare the accuracy of every scikit-learn classifier on the iris data set.
#
# Reads "iris.csv" from the current directory, holds out 20% of the rows for
# testing, fits each classifier with its default parameters and prints the
# accuracy it reaches on the held-out rows.
import warnings

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

try:
    # Recent scikit-learn releases expose all_estimators from sklearn.utils;
    # importing it from sklearn.utils.testing raises ModuleNotFoundError there.
    from sklearn.utils import all_estimators
except ImportError:
    # Fallback for old scikit-learn releases that only provide the old path.
    from sklearn.utils.testing import all_estimators

# Silence library warnings once, up front, so the report stays readable.
warnings.filterwarnings("ignore")

# Classifiers that raise an error when used as below, so they are skipped.
# NOTE(review): presumably these need constructor arguments (for example a
# base estimator) -- still to be investigated.
EXCLUDED_CLASSIFIERS = {
    "CheckingClassifier",
    "ClassifierChain",
    "MultiOutputClassifier",
    "OneVsOneClassifier",
    "OneVsRestClassifier",
    "OutputCodeClassifier",
    "VotingClassifier",
}

# Load the iris data.
iris_data = pd.read_csv("iris.csv", encoding="utf-8")

# Split the iris data into the label column and the input feature columns.
y = iris_data.loc[:, "Name"]
x = iris_data.loc[:, ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]

# Split into training data (80%) and test data (20%), shuffling the rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, train_size=0.8, shuffle=True
)

# Fetch every classifier as (name, class) pairs.
allAlgorithms = all_estimators(type_filter="classifier")

for (name, algorithm) in allAlgorithms:
    if name in EXCLUDED_CLASSIFIERS:
        continue

    # Instantiate the classifier with its default parameters.
    clf = algorithm()

    # Train on the training split, then score predictions on the test split.
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(name, "の正解率 = " , accuracy_score(y_test, y_pred))


クロスバリデーション

# Run 5-fold cross validation for every scikit-learn classifier on the iris
# data set.
#
# Reads "iris.csv" from the current directory and, for each classifier that
# has a score method, prints the per-fold scores returned by cross_val_score.
import warnings

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

try:
    # Recent scikit-learn releases expose all_estimators from sklearn.utils;
    # importing it from sklearn.utils.testing raises ModuleNotFoundError there.
    from sklearn.utils import all_estimators
except ImportError:
    # Fallback for old scikit-learn releases that only provide the old path.
    from sklearn.utils.testing import all_estimators

# Classifiers that raise an error when used as below, so they are skipped.
# NOTE(review): presumably these need constructor arguments (for example a
# base estimator) -- still to be investigated.
EXCLUDED_CLASSIFIERS = {
    "CheckingClassifier",
    "ClassifierChain",
    "MultiOutputClassifier",
    "OneVsOneClassifier",
    "OneVsRestClassifier",
    "OutputCodeClassifier",
    "VotingClassifier",
}

# Load the iris data.
iris_data = pd.read_csv("iris.csv", encoding="utf-8")

# Split the iris data into the label column and the input feature columns.
y = iris_data.loc[:, "Name"]
x = iris_data.loc[:, ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]

# Fetch every classifier as (name, class) pairs, with warnings silenced.
warnings.filterwarnings("ignore")
allAlgorithms = all_estimators(type_filter="classifier")

# Splitter for K-fold cross validation: 5 folds, rows shuffled.
kfold_cv = KFold(n_splits=5, shuffle=True)

for (name, algorithm) in allAlgorithms:
    if name in EXCLUDED_CLASSIFIERS:
        continue

    # Instantiate the classifier with its default parameters.
    clf = algorithm()

    # Only estimators that provide a score method are evaluated.
    if not hasattr(clf, "score"):
        continue

    # Run the cross validation and report the score of each fold.
    scores = cross_val_score(clf, x, y, cv=kfold_cv)
    print(name,"の正解率=")
    print(scores)


最適なパラメータを探す

# Search for the best SVC hyper-parameters on the iris data set.
#
# Reads "iris.csv" from the current directory, holds out 20% of the rows for
# testing, runs a grid search with 5-fold cross validation on the remaining
# rows, then prints the best estimator and its accuracy on the held-out rows.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Load the iris data.
iris_data = pd.read_csv("iris.csv", encoding="utf-8")

# Split the iris data into the label column and the input feature columns.
feature_columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]
y = iris_data.loc[:, "Name"]
x = iris_data.loc[:, feature_columns]

# Split into training data (80%) and test data (20%), shuffling the rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    train_size=0.8,
    shuffle=True,
)

# Parameter grids for the search: one grid per kernel. The rbf and sigmoid
# kernels are additionally searched over gamma.
parameters = [
    {
        "C": [1, 10, 100, 1000],
        "kernel": ["linear"],
    },
    {
        "C": [1, 10, 100, 1000],
        "kernel": ["rbf"],
        "gamma": [0.001, 0.0001],
    },
    {
        "C": [1, 10, 100, 1000],
        "kernel": ["sigmoid"],
        "gamma": [0.001, 0.0001],
    },
]

# Run the grid search, using shuffled 5-fold cross validation on the
# training split.
kfold_cv = KFold(n_splits=5, shuffle=True)
clf = GridSearchCV(SVC(), parameters, cv=kfold_cv)
clf.fit(x_train, y_train)
print("最適なパラメータ = ", clf.best_estimator_)

# Evaluate the best parameters on the held-out test split.
y_pred = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("評価時の正解率 = " , test_accuracy)
Memo

Advertisement