在蟒蛇的眼裡,熊貓是很會分析資料的!
pip3 install pandas
import pandas as pd
import numpy as np
from os.path import join
header: 指定哪一行當作column名稱(=0表示指定第一行)
DataFrame.head(n): 顯示前n筆資料
# Load the heart-disease table. header=0 takes the first line of the file as
# the column names; index_col=None leaves the default integer row index.
heart_csv = join("data", "heart_disease_dataset.csv")
data = pd.read_csv(heart_csv, sep=',', index_col=None, header=0)
# Preview the first three rows (rendered only in an interactive session).
data.head(3)
print(type(data), data.shape)
# Summary statistics of the columns.
data.describe()
# .iloc[]: select by integer position.
# .loc[]: select by label (row / column name).
# One column selected by name.
age = data.loc[:, 'age']
# One row selected by position.
patient0 = data.iloc[0, :]
# A list of column names selects several columns at once.
chestpain_and_sex = data.loc[:, ['cp', 'sex']]
print(type(patient0), type(age), type(chestpain_and_sex))
# Boolean mask: keep only the rows whose `sex` value is 0.
female = data[data.sex == 0]
female.shape
# .index and .columns: the row labels and the column labels of a DataFrame.
# reset_index(), set_index(): move the index back into a regular column /
# promote a column to be the index.
# fillna(VAL), dropna(): fill in or discard missing values in one step.
# .apply(FUN, axis, ...): pass every row (axis=1) / column (axis=0) to FUN and
# collect the results; more concise than a for-loop and, as timed further
# down in this file, intended to be faster as well.

# Relabel the rows as "patient<N>" and rename the last column.
data.index = ['patient{}'.format(i) for i in data.index]
data.columns = data.columns[:-1].tolist() + ['predict_target']
data.head(5)

# reset_index() returns a new frame; `data` itself is left untouched.
reset_data = data.reset_index()
reset_data.head(2)

# Use the `age` column as the row index of a new frame.
age_index_data = data.set_index(['age'])
age_index_data.head(2)

# Map the numeric `sex` code to a readable label in two ways: row-wise over
# the whole frame, and element-wise over the single column.
gender = {1: "male", 0: "female"}
gender_data = data.apply(lambda x: gender[x.sex], axis=1)
gender_data1 = data['sex'].apply(lambda x: gender[x])
# Both approaches must agree on every row.
(gender_data == gender_data1).all()
# %%timeit  (Jupyter cell magic; only meaningful as the first line of a
# notebook cell, so it is kept here as a comment)
# Variant 1: row-wise apply.
gender_data = data.apply(lambda x: gender[x.sex], axis=1)

# %%timeit  (Jupyter cell magic, see above)
# Variant 2: explicit Python loop, kept as a loop on purpose so it can be
# timed against the apply() variant.
gender_data = []
# .iterrows() feeds the loop one row at a time as (index, row Series).
for index, series in data.iterrows():
    gender_data.append(gender[series.sex])
gender_data = pd.Series(gender_data, index=data.index)
# groupby
# Mean of every column, computed separately for each target class.
data.groupby('predict_target').mean()

# For every chest-pain type, count the rows in each target class.
for group_name, group in data.groupby('cp'):
    chest_pain_group = group.groupby('predict_target').count()
    # Column 0 holds the non-null count of the first remaining column.
    # NOTE(review): the positional lookups assume both target classes occur in
    # every cp group and that the smaller label means "no disease" -- a group
    # with a single class would raise IndexError here; confirm against the data.
    no_d = chest_pain_group.iloc[0, 0]
    is_d = chest_pain_group.iloc[1, 0]
    print("Group name {}: No disease: {}, Heart disease: {}".format(group_name, no_d, is_d))

# Compute mean, std and count of every remaining column for each group.
# String names select the pandas group-wise implementations; passing the
# NumPy callables currently resolves to the same methods but emits a
# FutureWarning on recent pandas.
data.groupby('cp').agg(['mean', 'std', 'count'])
# concat: glue other DataFrames onto the original one, either as new rows or
# as new columns. The frames should line up on the other axis; labels that do
# not match are filled with NaN under the default outer join.
# append: add new rows below the original DataFrame. DataFrame.append was
# removed in pandas 2.0 -- use pd.concat instead.
# join: attach the columns of another DataFrame to the right of the original.
# merge: SQL-like combination of tables, matching rows on shared columns.

# Three frames with identical columns and disjoint row labels.
df1 = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    },
    index=[0, 1, 2, 3],
)
df2 = pd.DataFrame(
    {
        'A': ['A4', 'A5', 'A6', 'A7'],
        'B': ['B4', 'B5', 'B6', 'B7'],
        'C': ['C4', 'C5', 'C6', 'C7'],
        'D': ['D4', 'D5', 'D6', 'D7'],
    },
    index=[4, 5, 6, 7],
)
df3 = pd.DataFrame(
    {
        'A': ['A8', 'A9', 'A10', 'A11'],
        'B': ['B8', 'B9', 'B10', 'B11'],
        'C': ['C8', 'C9', 'C10', 'C11'],
        'D': ['D8', 'D9', 'D10', 'D11'],
    },
    index=[8, 9, 10, 11],
)
# Stack the three frames vertically.
result = pd.concat([df1, df2, df3])

# Two frames sharing a `key` column.
left = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
# Match rows on `key`; `result` is rebound to the merged table.
result = pd.merge(left, right, on='key')
# Load the second data set from the data directory.
# NOTE(review): read_csv treats the first line as the header by default --
# confirm that wpbc.data really starts with a header row.
wpbc_path = join('data', 'wpbc.data')
df = pd.read_csv(wpbc_path, sep=',')
import matplotlib.pyplot as plt
import seaborn as sns
# Evenly spaced sample points from 0 up to (but excluding) 5, step 0.2.
t = np.arange(0., 5., 0.2)
# Four curves in a single call, each given as an (x, y, format) triple:
# red dashes, blue squares, green triangles and yellow dots.
plt.plot(
    t, t, 'r--',
    t, t**2, 'bs',
    t, t**3, 'g^',
    t, t**(0.5), 'y.',
)
plt.show()
# Release the figure once it has been shown.
plt.close()
Full documentation: https://seaborn.pydata.org/index.html
基本上用來視覺化的資料可以有四種:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so choose whichever one is installed.
dark_style = 'seaborn-dark' if 'seaborn-dark' in plt.style.available else 'seaborn-v0_8-dark'
plt.style.use(dark_style)

tips = sns.load_dataset("tips")
# NOTE(review): the original note tags this table "wide-form"; seaborn's docs
# call a one-row-per-observation table like this long-form -- confirm.
tips.head(3)

# Average total bill per (sex, day). Select the column before aggregating:
# since pandas 2.0, mean() over the whole frame raises on non-numeric columns.
# observed=False spells out the long-standing default for categorical keys.
groups = tips.groupby(["sex", "day"], observed=False)["total_bill"].mean()

sns.set_context("talk")  # alternatives: "poster", "paper"
fig, ax = plt.subplots(2, 2, figsize=(15, 12))
# Top-left: total bill by sex.
sns.barplot(x='sex', y='total_bill', data=tips, ax=ax[0][0])
# Top-right: tip vs. total bill, coloured by party size, marker by smoker.
sns.scatterplot(x='tip', y='total_bill', data=tips, ax=ax[0][1],
                hue='size', palette='Set2', style='smoker')
# Bottom-left: tip distribution by time of day.
sns.boxplot(x='time', y='tip', data=tips, ax=ax[1][0])
# Bottom-right: unstack() turns the (sex, day) series into a sex-by-day grid.
sns.heatmap(groups.unstack(), ax=ax[1][1], annot=True, fmt=".1f",
            cmap='coolwarm')
ax[1][1].set_title("Average total bill")
plt.tight_layout()  # keep the subplots from overlapping while staying compact
plt.show()
plt.close()
ax.set_title
set_xlabel, set_ylabel
set_xticks, set_yticks
plt.text()
# List every style sheet this Matplotlib installation provides.
available_styles = plt.style.available
print(available_styles)
# Switch the active style sheet for subsequent figures.
plt.style.use('fivethirtyeight')
# List the example data sets that seaborn can load by name.
dataset_names = sns.get_dataset_names()
print(dataset_names)