Pandas碎碎念¶
In [27]:
Copied!
import pandas as pd
import pandas as pd
In [60]:
Copied!
# Column-oriented sample data: every key becomes a column, every list its values.
data = dict(
    Name=['Abies', 'x', 'y', 'z', 'p', 'q'],
    Age=[17, 18, 17, 19, 20, 21],
    Address=['CN', 'CN', 'CN', 'US', 'UK', 'RA'],
)
# Build the frame used throughout the notebook; the column order is spelled out
# explicitly and matches the insertion order of `data`.
df = pd.DataFrame(data, columns=['Name', 'Age', 'Address'])
print(df)
    Name  Age Address
0  Abies   17      CN
1      x   18      CN
2      y   17      CN
3      z   19      US
4      p   20      UK
5      q   21      RA
DataFrame访问数据¶
In [29]:
Copied!
# Show the first five rows (n=5 is also the default).
df.head(n=5)
Out[29]:
| | Name | Age | Address |
|---|---|---|---|
| 0 | Abies | 17 | CN |
| 1 | x | 18 | CN |
| 2 | y | 17 | CN |
| 3 | z | 19 | US |
| 4 | p | 20 | UK |
In [30]:
Copied!
# Show the last five rows (n=5 is also the default).
df.tail(n=5)
Out[30]:
| | Name | Age | Address |
|---|---|---|---|
| 1 | x | 18 | CN |
| 2 | y | 17 | CN |
| 3 | z | 19 | US |
| 4 | p | 20 | UK |
| 5 | q | 21 | RA |
In [31]:
Copied!
# Print a structural summary: index range, columns, non-null counts, dtypes
# and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Name     6 non-null      object
 1   Age      6 non-null      int64
 2   Address  6 non-null      object
dtypes: int64(1), object(2)
memory usage: 276.0+ bytes
In [32]:
Copied!
# Descriptive statistics (count, mean, std, min, quartiles, max); the recorded
# output covers only the numeric Age column.
df.describe()
Out[32]:
| | Age |
|---|---|
| count | 6.000000 |
| mean | 18.666667 |
| std | 1.632993 |
| min | 17.000000 |
| 25% | 17.250000 |
| 50% | 18.500000 |
| 75% | 19.750000 |
| max | 21.000000 |
In [33]:
Copied!
# Column labels of the frame, returned as an Index.
df.columns
Out[33]:
Index(['Name', 'Age', 'Address'], dtype='object')
In [34]:
Copied!
# Row labels of the frame; here a RangeIndex, as the recorded output shows.
df.index
Out[34]:
RangeIndex(start=0, stop=6, step=1)
In [35]:
Copied!
# Select several columns at once. Indexing with a *list* of labels always
# returns a DataFrame; indexing with a single label (df['Name']) returns a Series.
selected_columns = ['Name', 'Address']
df[selected_columns]
Out[35]:
| | Name | Address |
|---|---|---|
| 0 | Abies | CN |
| 1 | x | CN |
| 2 | y | CN |
| 3 | z | US |
| 4 | p | UK |
| 5 | q | RA |
In [36]:
Copied!
# iloc = "integer location": pick a row by its position, not its label.
# A single field can be read directly from the result, e.g. df.iloc[0]['Name'].
first_row = df.iloc[0]
first_row
Out[36]:
Name       Abies
Age           17
Address       CN
Name: 0, dtype: object
In [43]:
Copied!
# Build a boolean mask (one True/False per row) and use it to filter the frame.
# First filter: rows whose Age is strictly greater than 17.
mask = df['Age'].gt(17)
print(df.loc[mask])
print('---')
# Second filter: rows whose Address is one of the listed values.
mask = df['Address'].isin(['CN', 'US'])
print(df.loc[mask])
Name Age Address
1 x 18 CN
3 z 19 US
4 p 20 UK
5 q 21 RA
---
Name Age Address
0 Abies 17 CN
1 x 18 CN
2 y 17 CN
3 z 19 US
DataFrame修改数据¶
In [47]:
Copied!
# Work on a copy. The cells further down (drop / value_counts / groupby) were
# recorded against the original 'CN' values, so `df` itself has to stay
# untouched for a top-to-bottom run of the notebook to reproduce them.
df_modified = df.copy()

# .loc[rows, column] = value: the first argument selects the rows, the second
# names the column whose cells are overwritten.
df_modified.loc[df_modified['Name'] == 'Abies', 'Address'] = 'zju'
print(df_modified.loc[df_modified['Name'] == 'Abies'])
print('---')
# Typical use case: bulk-replace every occurrence of one value in a column.
df_modified.loc[df_modified['Address'] == 'CN', 'Address'] = 'China'
print(df_modified)
Name Age Address
0 Abies 17 zju
---
Name Age Address
0 Abies 17 zju
1 x 18 China
2 y 17 China
3 z 19 US
4 p 20 UK
5 q 21 RA
In [61]:
Copied!
# Row count before the deletion (shape[0] is the number of rows).
print('num of rows: ', df.shape[0])
# Remove the row whose index label is 2. errors='ignore' keeps the cell safe to
# re-run: once label 2 is gone, a plain drop(2) would raise KeyError.
df = df.drop(index=2, errors='ignore')
print(df)
# Row count after the deletion.
print('num of rows: ', df.shape[0])
num of rows: 6
Name Age Address
0 Abies 17 CN
1 x 18 CN
3 z 19 US
4 p 20 UK
5 q 21 RA
num of rows: 5
In [62]:
Copied!
# Option 1 - drop: dropna() returns a NEW frame without the rows that contain
# NA/null values; it does not modify `df`, so the result is bound to a name
# instead of being thrown away.
df_no_missing = df.dropna()
# Option 2 - fill: replace missing values in the Age column with 0 (other
# columns are left alone). Reassigning instead of inplace=True keeps the cell
# re-runnable and the data flow explicit.
df = df.fillna({'Age': 0})
In [63]:
Copied!
# Rename the Address column to Country, modifying `df` in place, then show the
# resulting column labels.
column_renames = {'Address': 'Country'}
df.rename(columns=column_renames, inplace=True)
print(df.columns)
Index(['Name', 'Age', 'Country'], dtype='object')
DataFrame分析数据¶
In [64]:
Copied!
# Frequency table: how many rows carry each distinct Country value.
country_column = df['Country']
country_column.value_counts()
Out[64]:
Country
CN    2
US    1
UK    1
RA    1
Name: count, dtype: int64
In [65]:
Copied!
# Group the rows by Country, then compute the mean Age inside each group.
age_by_country = df.groupby('Country')['Age']
age_by_country.mean()
Out[65]:
Country
CN    17.5
RA    21.0
UK    20.0
US    19.0
Name: Age, dtype: float64
In [67]:
Copied!
# Sort the rows by Age, largest first; returns a new frame and leaves `df` as is.
df.sort_values(by='Age', ascending=False)
Out[67]:
| | Name | Age | Country |
|---|---|---|---|
| 5 | q | 21 | RA |
| 4 | p | 20 | UK |
| 3 | z | 19 | US |
| 1 | x | 18 | CN |
| 0 | Abies | 17 | CN |
In [ ]:
Copied!
# Export the frame to CSV; index=False leaves the row labels out of the file.
df.to_csv(path_or_buf='data.csv', index=False)