Saturday, February 3, 2018

[Pandas] Data Exploration and Preprocessing in Practice (1)

A pandas DataFrame presents data in a table-like form that is fast and convenient to work with, and it ships with a rich library of functions for data processing and analysis. Because data is imported from many sources, each with its own system format, and missing values are common, we must explore and preprocess the data first. The exploration results can also be visualized by integrating with the matplotlib library.
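That matplotlib integration can be as small as a histogram; a minimal sketch, assuming the train DataFrame used in the examples below is already loaded:

import matplotlib.pyplot as plt

train.price.hist(bins=50)  # quick look at the price distribution, handy for spotting outliers
plt.xlabel('price')
plt.show()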


After receiving the raw data, first run EDA (exploratory data analysis) to inspect it;
after browsing the data, go on to define the data-cleaning workflow.

Below are some common commands for data exploration and preprocessing.
>>> See the complete example <<<

View the columns
emp_pd.columns
List basic statistics
Useful for spotting extreme values; you can also run it on a single column.
emp_pd.describe()
emp_pd.describe(include=['object'])
emp_pd.BirthDate.describe()
count                    5491
unique                   4193
top       1981-07-29 00:00:00
freq                        6
first     1949-04-24 00:00:00
last      1998-05-28 00:00:00
Name: BirthDate, dtype: object
Check whether every value has the same type
set(df.EmployeeId.apply(type))
Replace strings
import numpy as np
bi_col.replace('', np.nan, inplace=True)  # replace empty strings with NaN
df[df.name.str.contains('\*')]['name']    # find the rows whose name contains an asterisk
df.name = df.name.str.replace(r'\*', '')  # strip the asterisks (the pattern is a regex, so escape *)
df.name = df.name.str.strip()             # trim surrounding whitespace
Select specific columns
"""
1) When indexing by row position, prefer iloc;
2) when indexing by row label, use loc;
3) avoid ix altogether (it is deprecated).
"""
emp_pd['EmployeeId'][0]
emp_pd.loc[0:3, ["column_1", "column_2"]]  # index by label
emp_pd.iloc[0:3, 1:3]                      # index by position
Tip: set_index on a suitable column first, then slice. For example, once a timestamp column becomes the index, loc can filter rows by date, as in the line and sketch below.
df_clean.loc["2011-06-20 08:29:00", :]
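A minimal sketch of that tip, assuming df_clean has a datetime column named 'timestamp' (a hypothetical name):

df_clean['timestamp'] = pd.to_datetime(df_clean['timestamp'])
df_clean = df_clean.set_index('timestamp')
df_clean.loc["2011-06-20", :]  # a DatetimeIndex also allows partial-date slicing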

Find missing data: in the output below, 5496 is the total row count, while StayJobDate has only 217 non-null values.
emp_pd.count()
Serial                 5496
CompanyId              5496
CompanyName            5496
EmployeePlant          5496
EmployeeId             5496
EmployeeChineseName    5496
EmployeeEnglishName    5495
JobStatus              5496
JobPositionCode        5496
DeptPlant              5496
DeptCode               5496
ExtensionNumber        5496
Email                  5496
RegisterDate           5496
LeaveDate                 0
StayJobDate             217
How many distinct prices are there?
train.price.nunique()
Find null values
train.isnull().sum()
# train_id                  0
# name                      0
# item_condition_id         0
# category_name          6327
# brand_name           632682
# price                     0
# shipping                  0
# item_description          4
# dtype: int64

# specific columns
emp_pd[emp_pd.BirthDate.isnull()][['EmployeeId','EmployeeChineseName','BirthDate']]
Remove null values
emp_pd = emp_pd[emp_pd.BirthDate.notnull()]
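The same filter can be written more directly with dropna:

emp_pd = emp_pd.dropna(subset=['BirthDate'])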
Find duplicate values
dupes_by_emp = emp_pd[emp_pd.duplicated('EmployeeId')]
dupes_by_emp.count()

for name, rows in emp_pd.groupby('EmployeeEnglishName'):
    if len(rows) > 5:
        print('name: %s, number of rows: %d' % (name, len(rows)))

pd.concat([g for _, g in emp_pd.groupby('EmployeeEnglishName') if len(g) > 7])['EmployeeEnglishName']
85      Andy Chen
1382    Andy Chen
1484    Andy Chen
1772    Andy Chen
2197    Andy Chen
3731    Andy Chen
4227    Andy Chen
4862    Andy Chen
Name: EmployeeEnglishName, dtype: object
Use a boolean mask to find bad values
with_birth_dates = emp_pd[emp_pd.BirthDate.notnull()]
bad_dates = pd.isnull(pd.to_datetime( with_birth_dates.BirthDate, errors='coerce' ))
with_birth_dates[bad_dates][['EmployeeId', 'BirthDate']]
Sorting
emp_pd.sort_values(['DeptCode', 'EmployeeEnglishName'], ascending=[False, False])[
    ['EmployeeChineseName', 'EmployeeId', 'EmployeeEnglishName']].head(10)
Modify a value
emp_pd.loc[emp_pd.EmployeeId == '10301009', 'EmployeeEnglishName'] = 'CCCChales Huang'
Remove rows
emp_pd = emp_pd[~((emp_pd.EmployeeId == '10301009'))]
Drop duplicate rows
emp_pd = emp_pd.drop_duplicates(['EmployeeId'])
Date and time handling
pd.to_datetime(emp_pd.BirthDate, errors='raise')  # raise on bad dates; use errors='ignore' or 'coerce' to skip

# locate the rows whose BirthDate cannot be parsed
for i, row in emp_pd.iterrows():
    try:
        pd.to_datetime(row.BirthDate, errors='raise')
    except ValueError:
        print('%s (%s, %d)' % (row.EmployeeId, row.BirthDate, i))
Custom functions
Transform each row by a rule of your own:
from datetime import datetime

def udf(row):
    return (datetime.now() - row['RegisterDate']).days / 365

emp_pd['ServiceYears'] = emp_pd.apply(udf, axis=1)

A complete data-cleaning workflow then looks like this:
def clean_data(df):
    """The full clean data function, which returns both the cleaned Nobel data (df) and a DataFrame 
    containing those winners with a born_in field."""
    df = df.replace('', np.nan)
    df_born_in = df[df.born_in.notnull()] 
    df = df[df.born_in.isnull()]
    df = df.drop('born_in', axis=1) 
    df.drop(df[df.year == 1809].index, inplace=True) 
    df = df[~(df.name == 'Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
           (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name == 'Sidney Altman') & (df.year == 1990))]
    df = df.reindex(np.random.permutation(df.index)) 
    df = df.drop_duplicates(['name', 'year'])         
    df = df.sort_index()
    df.loc[df.name == 'Alexis Carrel', 'category'] =\
        'Physiology or Medicine'
    df.loc[df.name == 'Ragnar Granit', 'gender'] = 'male'
    df = df[df.gender.notnull()] # remove institutional prizes
    df.loc[df.name == 'Hiroshi Amano', 'date_of_birth'] =\
        '11 September 1960'
    df.date_of_birth = pd.to_datetime(df.date_of_birth) 
    df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce') 
    df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year 
    return df, df_born_in
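A quick usage sketch, assuming the raw Nobel data is already loaded into df:

df_clean, df_born_in = clean_data(df)
print(len(df_clean), len(df_born_in))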

Basic statistics commands
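A minimal reference of the usual summary-statistic calls (all standard pandas methods, shown here on the employee data):

emp_pd.shape                    # (rows, columns)
emp_pd.info()                   # dtypes and non-null counts per column
emp_pd.mean()                   # means of the numeric columns
emp_pd.median()                 # medians
emp_pd.std()                    # standard deviations
emp_pd.DeptCode.value_counts()  # frequency of each category value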
Pivot tables
products = pd.DataFrame({
        "分類": ["居家", "居家", "娛樂", "娛樂", "科技", "科技"],
        "商店": ["家樂福", "頂好", "家樂福", "全聯", "頂好","家樂福"],
        "價格":[11.42, 23.50, 19.99, 15.95, 55.75, 111.55],
        "測試分數": [4, 3, 5, 7, 5, 8]})
print(products)
products.to_html("Ch9_5_2_01.html")
# call the pivot_table() method (the default aggregation is the mean)
pivot_products = products.pivot_table(index='分類', columns='商店', values='價格')
print(pivot_products)
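You can pick another aggregation and add row/column totals; a sketch reusing the same products table:

pivot_sum = products.pivot_table(index='分類', columns='商店',
                                 values='價格', aggfunc='sum',
                                 fill_value=0, margins=True)
print(pivot_sum)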

Create a test data table
import pandas as pd
import numpy as np

# set seed for reproducibility
np.random.seed(111)

# Function to generate test data
def CreateDataSet(Number=1):
    Output = []
    for i in range(Number):
        # Create a weekly (Mondays) date range
        rng = pd.date_range(start='1/1/2009', end='12/31/2012', freq='W-MON')
        
        # Create random data
        data = np.random.randint(low=25, high=1000, size=len(rng))
        
        # Status pool
        status = [1,2,3]
        
        # Make a random list of statuses
        random_status = [status[np.random.randint(low=0, high=len(status))] for i in range(len(rng))]
        
        # State pool
        states = ['GA','FL','fl','NY','NJ','TX']
        
        # Make a random list of states 
        random_states = [states[np.random.randint(low=0, high=len(states))] for i in range(len(rng))]
    
        Output.extend(zip(random_states, random_status, data, rng))
        # extend vs. append: extend adds each zipped tuple; append would add the whole list as one element
        
    return Output
dataset = CreateDataSet(4)
dataset
len(dataset)  # check the number of rows
df = pd.DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate'])
df.head(10)



Group By


# summarize data by group
g_state = df.groupby(['State'], axis=0)

g_state.size()  # number of rows per group
#State
#FL    145
#GA    147
#NJ    140
#NY    124
#TX    134
#fl    146
#dtype: int64
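Note that 'FL' and 'fl' count as different groups. A minimal sketch to normalize the case before grouping:

df['State'] = df.State.str.upper()  # fold 'fl' into 'FL'
df.groupby('State').size()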

g_state.groups

g_state.get_group("NJ")  # retrieve one specific group

g_state.sum()    # sums every numeric column within each group
g_state.count()  # non-null counts per group

g_state = df.groupby(['StatusDate', 'State'], axis=0, sort=True).sum()  # important: group by multiple columns
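A multi-column groupby returns a DataFrame with a MultiIndex; to view one state per column, unstack that level (a minimal sketch):

g_state.unstack('State').head()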

Encoder

label encoder: maps each category to an integer (1, 2, 3, ...); the implied ordering can mislead a model.
one-hot encoder: expands a categorical column into one dummy column per category; with many categories it grows too fast and becomes too sparse, so keep the category count small.
count encoder: replaces each category with its number of occurrences, so a category's own frequency influences the result.

All three are sketched below.
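A minimal sketch of the three encoders in plain pandas (the column name brand_name is borrowed from the train data above; the values are made up):

toy = pd.DataFrame({'brand_name': ['Nike', 'Apple', 'Nike', 'Sony', 'Apple', 'Nike']})

# label encoding: factorize assigns one integer per category (0, 1, 2, ...)
toy['brand_label'] = pd.factorize(toy.brand_name)[0]

# one-hot encoding: one 0/1 dummy column per category
one_hot = pd.get_dummies(toy.brand_name, prefix='brand')

# count encoding: map each category to how many times it appears
toy['brand_count'] = toy.brand_name.map(toy.brand_name.value_counts())

print(toy.join(one_hot))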

Putting it all together

# rank the top 10 customers by purchase count and sales amount
# group by name, take two columns, aggregate one with sum and the other with count,
# sort, keep the first 10 rows, then reset the index
top_10 = (df.groupby('name')[['ext price', 'quantity']]
            .agg({'ext price': 'sum', 'quantity': 'count'})
            .sort_values(by='ext price', ascending=False)[:10]
            .reset_index())
print(top_10)
top_10.rename(columns={'name': 'Name', 'ext price': 'Sales', 'quantity': 'Purchases'},
              inplace=True)  # rename the columns for display
top_10

Read files from a local directory

import glob

def load_train_dir_to_df(train_dir):
    print("--> load_train_dir")
    dirs = glob.glob(train_dir + "/*")  # one sub-directory per class
    print(dirs)
    files = []
    class_id = []
    for d in dirs:
        jpg_set = glob.glob(d + "/*.jpg")  # every jpg inside this class directory
        files.extend(jpg_set)
        class_id.extend([d.split("/")[-1]] * len(jpg_set))  # class id taken from the directory name
    df = pd.DataFrame({"file_name": files, "class_id": class_id})
    print("<-- load_train_dir")
    return df
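A usage sketch of the reconstructed function above, assuming a ./train directory that holds one sub-folder of jpg images per class (a hypothetical layout):

train_df = load_train_dir_to_df("./train")
print(len(train_df))
train_df.head()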


Here is a short tutorial that explains these steps well:

>>> Data Preprocessing (Missing data, One-hot encoding, Feature Scaling)


