본문으로 바로가기
728x90

1. 데이터 로딩

import numpy as np
import pandas as pd

# Load the raw dataset. "***.csv" is a placeholder path -- substitute the real file.
df = pd.read_csv("***.csv")
print(df.shape)
print(df.head())

# Drop rows with missing values, then drop the unwanted columns.
# NOTE: `cols` is a placeholder for the list of column labels to drop; it must
# be defined before this line runs.
df_processed = df.dropna().drop(columns=cols)

# Split by COLUMN, not by row: a bare `.iloc[1:]` / `.iloc[:1]` slices rows and
# leaves X and y with different numbers of samples.
# Assumes the first column is the target and the remaining columns are the
# features -- TODO confirm against the actual dataset layout.
X = df_processed.iloc[:, 1:]
y = df_processed.iloc[:, 0]

 

2. 데이터 전처리

import statsmodels.api as sm
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split

# Add a constant (intercept) column; has_constant='add' adds it even when a
# constant column is already present.
# NOTE(review): X was sliced from df_processed before this call, so the new
# column does not reach X -- confirm whether that is intended.
df_processed = sm.add_constant(df_processed, has_constant='add')

# Cast the target column to a categorical dtype ('col_name' is a placeholder).
y = df_processed['col_name'].astype('category')

# One-hot encode the categorical FEATURE columns (numeric columns pass through
# unchanged). The target is kept 1-D because LogisticRegression.fit rejects a
# 2-D one-hot encoded y.
X = pd.get_dummies(X)

# 70/30 split, stratified on y so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, stratify=y, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Pick ONE scaler -- the two are alternatives, not operands of a division.
# MinMaxScaler rescales each feature to [0, 1]; use preprocessing.Normalizer()
# instead to rescale each sample (row) to unit norm.
scaler = preprocessing.MinMaxScaler()
# Fit on the training split only so test-set statistics do not leak in.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

 

3. 모델 생성

# KNN classification: sweep the neighbour count and record the train/test
# accuracy for each setting.
from sklearn.neighbors import KNeighborsClassifier

training_accuracy = []
test_accuracy = []
neighbors_settings = range(1, 25)

for n in neighbors_settings:
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(X_train, y_train)
    training_accuracy.append(clf.score(X_train, y_train))
    test_accuracy.append(clf.score(X_test, y_test))

# Logistic regression (fit expects 1-D class labels for y).
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
result = model.fit(X_train, y_train)  # fit() returns the estimator itself
# scikit-learn estimators have no statsmodels-style summary(); inspect the
# fitted parameters directly instead.
print(result.coef_, result.intercept_)

# Hard class labels and class probabilities are separate outputs: keep the
# labels in `pred` (what label-based metrics expect) and the probabilities in
# `pred_proba`.
pred = model.predict(X_test)
pred_proba = model.predict_proba(X_test)
score = model.score(X_test, y_test)

 

4. 분석

import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

# Accuracy as a function of the neighbour count, for the train and test splits.
plt.plot(neighbors_settings, training_accuracy, label="Training Accuracy")
plt.plot(neighbors_settings, test_accuracy, label="Test Accuracy")
plt.xlabel("n_neighbors")
plt.ylabel("Accuracy")
plt.legend()
plt.show()  # render the figure when run as a plain script

# Keep and print the score rather than discarding the return value.
# NOTE: the default average='binary' assumes a two-class target; pass e.g.
# average='macro' for a multiclass target.
f1 = f1_score(y_test, pred)
print(f1)

 

5. 저장 및 확인

# Collect the per-k accuracies in a DataFrame first: a plain list has no
# to_csv(). Use pd.concat(...) to combine several result frames before saving.
results = pd.DataFrame(
    {
        "n_neighbors": list(neighbors_settings),
        "training_accuracy": training_accuracy,
        "test_accuracy": test_accuracy,
    }
)

# NOTE: "***.csv" is a placeholder; use an output path that differs from the
# input dataset so the source file is not overwritten.
results.to_csv("***.csv", index=False)

# Read the file back to verify what was written.
check = pd.read_csv("***.csv")
print(check.head())
728x90