카테고리 없음

46. TIL) 팀 프로젝트 - 실시간 광물 품질 데이터셋 : 농도 예측 1

kaseomi 2025. 2. 26. 20:49
728x90
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.pipeline import Pipeline
# Load the mining-process dataset from a local CSV. The path is an absolute
# Windows path; the raw-string prefix keeps the backslashes literal.
# NOTE(review): if this file is the Kaggle flotation-plant export, its numeric
# fields may use ',' as the decimal mark -- confirm whether decimal=',' is needed.
df = pd.read_csv(r'C:\Users\Owner\Desktop\심화프로젝트\MiningProcess.csv')
# Quick inspection. Apart from info(), which prints its own summary, these bare
# expressions only show output in a notebook/REPL; in a plain script their
# results are computed and discarded.
df.shape  # (rows, columns)
df.info()  # column dtypes and non-null counts
df.head()  # first rows
df.isnull().sum()  # missing-value count per column
def _count_iqr_outliers(values):
    """Return how many entries of *values* lie outside the 1.5 * IQR fences."""
    first_q = values.quantile(0.25)  # first quartile
    third_q = values.quantile(0.75)  # third quartile
    spread = third_q - first_q  # interquartile range
    too_low = values < first_q - 1.5 * spread
    too_high = values > third_q + 1.5 * spread
    return (too_low | too_high).sum()


numeric_cols = df.select_dtypes(include=['number']).columns

# Outlier tally for every numeric column, keyed by column name.
outlier_counts = {name: _count_iqr_outliers(df[name]) for name in numeric_cols}

outlier_df = pd.DataFrame(list(outlier_counts.items()), columns=['Column', 'Outlier Count'])

print(outlier_df)
# Box plots: one separate 6x4 figure per numeric column.
for column_name in numeric_cols:
    box_fig, box_ax = plt.subplots(figsize=(6, 4))
    # Draw on the axes created above instead of relying on the implicit
    # "current axes" lookup.
    df.boxplot(column=[column_name], ax=box_ax)
    box_ax.set_title("outliers")
    box_ax.set_ylabel("Values")
    plt.show()
# Correlation analysis restricted to the numeric columns.
numeric_df = df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()  # pairwise correlation coefficients

# Render the annotated heatmap on an explicit 12x8 axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

히트맵