2018年10月31日 星期三

[Pandas] 資料探索與前處理實作(2)

pandas dataframe資料探索與前處理範例二


找出資料關聯性
散佈圖
# Scatter plot: visually explore the relationship between two variables.
import pandas as pd

hours_phone_used = [0, 0, 0, 1, 1.3, 1.5, 2, 2.2, 2.6, 3.2, 4.1, 4.4, 4.4, 5]
work_performance = [87, 89, 91, 90, 82, 80, 78, 81, 76, 85, 80, 75, 73, 72]

# Pair the two lists as columns of one DataFrame.
df = pd.DataFrame(
    {"hours_phone_used": hours_phone_used,
     "work_performance": work_performance}
)

# A scatter plot shows whether performance falls as phone use rises.
df.plot(kind="scatter", x="hours_phone_used", y="work_performance")

相關係數(Correlation Coefficient)
計算兩個變數線性相關性強弱, 範圍在-1~1之間
相關係數公式 : 
Rxy = Sxy / (Sx * Sy)
即 x 與 y 的共變異數 Sxy，除以 x 的標準差 Sx 與 y 的標準差 Sy 的乘積
Numpy與pandas有支援直接做計算
# Compute the Pearson correlation coefficient by hand, then with pandas.
import numpy as np
import pandas as pd

hours_phone_used = [0, 0, 0, 1, 1.3, 1.5, 2, 2.2, 2.6, 3.2, 4.1, 4.4, 4.4, 5]
work_performance = [87, 89, 91, 90, 82, 80, 78, 81, 76, 85, 80, 75, 73, 72]

x = np.array(hours_phone_used)
y = np.array(work_performance)
n = len(x)
x_mean, y_mean = x.mean(), y.mean()

# Population covariance: mean of the products of paired deviations.
covar = np.sum((x - x_mean) * (y - y_mean)) / n
print("共變異數:", covar)

# Normalize by both standard deviations (ddof=0, matching the /n above)
# to get a value in [-1, 1].
corr = covar / (x.std() * y.std())
print("相關係數:", corr)
# pandas built-in: DataFrame.corr() returns the full correlation matrix.
df = pd.DataFrame({"hours_phone_used": hours_phone_used,
                   "work_performance": work_performance})
print(df.corr())





特徵縮放標準化
# Feature scaling: standardization (z-score) and min-max normalization.
import pandas as pd
from sklearn import preprocessing

f_tracking= [110, 1018, 1130, 417, 626,
             957, 90, 951, 946, 797,
             981, 125, 456, 731, 1640,
             486, 1309, 472, 1133, 1773,
             906, 532, 742, 621, 855]
happiness = [0.3, 0.8, 0.5, 0.4, 0.6,
             0.4, 0.7, 0.5, 0.4, 0.3,
             0.3, 0.6, 0.2, 0.8, 1,
             0.6, 0.2, 0.7, 0.5, 0.7,
             0.1, 0.4, 0.3, 0.6, 0.3]

df = pd.DataFrame({"f_tracking" : f_tracking,
                   "happiness" : happiness})
print(df.head())

# StandardScaler rescales every column to zero mean and unit variance.
scaler = preprocessing.StandardScaler()
df_std = pd.DataFrame(scaler.fit_transform(df),
                      columns=["f_tracking_s", "happiness_s"])
print(df_std.head())

df_std.plot(kind="scatter", x="f_tracking_s", y="happiness_s")

# preprocessing.scale() is the one-shot functional equivalent of the above.
df_scaled = pd.DataFrame(preprocessing.scale(df),
                         columns=["f_tracking_s", "happiness_s"])
print(df_scaled.head())
df_scaled.plot(kind="scatter", x="f_tracking_s", y="happiness_s")

# MinMaxScaler squeezes every column into the requested [0, 1] range.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
df_minmax = pd.DataFrame(scaler.fit_transform(df),
                         columns=["f_tracking_m", "happiness_m"])
print(df_minmax.head())

df_minmax.plot(kind="scatter", x="f_tracking_m", y="happiness_m")

資料清理轉換
# Data cleaning and transformation walkthrough on a sample CSV dataset.
# numpy and sklearn are used further down, so import them up front
# (the original snippet used `np` and `preprocessing` without importing them).
import numpy as np
import pandas as pd
from sklearn import preprocessing

df = pd.read_csv("test.csv")
# Dataset shape (rows, columns). NOTE: shape is a property, not a method —
# the original `df.shape()` raises TypeError.
df.shape
# Descriptive statistics of the numeric columns.
df.describe()
# Browse the first five rows to understand each column and row.
df.head()
# Check the ID column is all unique values (unique count == row count),
# then promote it to the index; inplace=True replaces the current DataFrame.
np.unique(df['ID'].values).size
df.set_index(["ID"], inplace=True)
# Inspect non-null counts per column to spot missing (NaN) values.
df.info()
# --- Cleaning and imputing missing values ---
df1 = df.dropna()
print(df1)
# Drop a row if ANY of its values is NaN.
df2 = df.dropna(how="any")
print(df2)
# Drop a row only if ALL of its values are NaN.
df3 = df.dropna(how="all")
print(df3)
# Drop rows that have NaN in column B or C.
df4 = df.dropna(subset=["B", "C"])
print(df4)
# Fill every NaN with the constant 1.
df1 = df.fillna(value=1)
print(df1)
# Fill NaN in column B with the column mean.
df["B"] = df["B"].fillna(df["B"].mean())
print(df)
# Fill NaN in column C with the column median.
df["C"] = df["C"].fillna(df["C"].median())
print(df)
# Boolean mask marking missing values.
df1 = pd.isnull(df)
# After imputing, confirm how many NaN remain in Age.
sum(df['Age'].isnull())
# --- Cleaning duplicated rows ---
print(df.duplicated())
print(df.duplicated("B"))
df1 = df.drop_duplicates("B")
print(df1)
# Keep the last occurrence instead of the first.
df2 = df.drop_duplicates("B", keep="last")
print(df2)
# Keep no row that has a duplicate at all.
df3 = df.drop_duplicates("B", keep=False)
print(df3)
# --- Encoding nominal/ordinal categories as numeric codes ---
# Clothing sizes carry an order, so map them to an explicit ranking.
size_mapping = {"XXL": 5,
                "XL": 4,
                "L": 3,
                "M": 2,
                "S": 1,
                "XS": 0}

df["Size"] = df["Size"].map(size_mapping)
print(df)
# LabelEncoder assigns integer codes to the Gender categories.
label_encoder = preprocessing.LabelEncoder()
df["Gender"] = label_encoder.fit_transform(df["Gender"])
print(df)
# Count per sex (fixed: .size applies to the GROUPS; the original called
# `.size()` on the grouping Series, which raises TypeError).
df['Sex'].groupby(df['Sex']).size()
# Mean age per sex.
df.groupby("Sex")["Age"].mean()
# Count per title, then survival rate per title.
df['title'].groupby(df['title']).size()
df[['title','survived']].groupby(df['title']).mean()
# --- Other tricks ---
# new_column is 1 where aaa == 0, otherwise 0.
df['new_column'] = np.where(df['aaa']==0, 1, 0)
# Replace values in a column.
df['new_column'] = df['new_column'].replace("AA","BB")


Ref:

沒有留言:

張貼留言