Pandas使用规范

194 阅读17分钟

创建 Pandas Series

import pandas as pd

# Build the grocery list from a label -> value mapping. The values mix
# integers and strings, so pandas stores them with dtype object.
groceries = pd.Series({'eggs': 30, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})

# We display the Groceries Pandas Series
groceries
eggs       30
apples      6
milk      Yes
bread      No
dtype: object
groceries.shape
(4,)
groceries.ndim
1
groceries.size
4
# We print some information about Groceries
print('Groceries has shape:', groceries.shape)
print('Groceries has dimension:', groceries.ndim)
print('Groceries has a total of', groceries.size, 'elements')
Groceries has shape: (4,)
Groceries has dimension: 1
Groceries has a total of 4 elements
# We check whether bananas is a food item (an index) in Groceries
x = 'bananas' in groceries

# We check whether bread is a food item (an index) in Groceries
y = 'bread' in groceries

# We print the results
print('Is bananas an index label in Groceries:', x)
print('Is bread an index label in Groceries:', y)
Is bananas an index label in Groceries: False
Is bread an index label in Groceries: True
# We access elements in Groceries using index labels:

# We use a single index label
print('How many eggs do we need to buy:', groceries['eggs'])
print()

# we can access multiple index labels
print('Do we need milk and bread:\n', groceries[['milk', 'bread']])
print()

# we use loc to access multiple index labels
print('How many eggs and apples do we need to buy:\n', groceries.loc[['eggs', 'apples']])
print()

# We access elements in Groceries using numerical (positional) indices.
# Every positional lookup goes through .iloc: plain [] with integers on a
# label-indexed Series is deprecated as a positional fallback (pandas 2.1+)
# and is interpreted as a label lookup in newer releases.

# we use multiple numerical indices
print('How many eggs and apples do we need to buy:\n', groceries.iloc[[0, 1]])
print()

# We use a negative numerical index (counts from the end)
print('Do we need bread:\n', groceries.iloc[[-1]])
print()

# We use a single numerical index
print('How many eggs do we need to buy:', groceries.iloc[0])
print()

# we use iloc to access multiple numerical indices
print('Do we need milk and bread:\n', groceries.iloc[[2, 3]])
How many eggs do we need to buy: 30

Do we need milk and bread:
 milk     Yes
bread     No
dtype: object

How many eggs and apples do we need to buy:
 eggs      30
apples     6
dtype: object

How many eggs and apples do we need to buy:
 eggs      30
apples     6
dtype: object

Do we need bread:
 bread    No
dtype: object

How many eggs do we need to buy: 30

Do we need milk and bread:
 milk     Yes
bread     No
dtype: object
print(groceries[['milk','bread']])
milk     Yes
bread     No
dtype: object
groceries.loc[['eggs','apples']]
eggs      30
apples     6
dtype: object
# Positional selection goes through .iloc; integer keys in plain [] are deprecated on a label index.
groceries.iloc[[0, 1]]
eggs      30
apples     6
dtype: object
# Last element by position; .iloc avoids the deprecated integer-key fallback of plain [].
groceries.iloc[[-1]]
bread    No
dtype: object
groceries['eggs']
30
# First element by position, returned as a one-element Series.
groceries.iloc[[0]]
eggs    30
dtype: object
# Elements at positions 2 and 3, selected explicitly by position with .iloc.
groceries.iloc[[2, 3]]
milk     Yes
bread     No
dtype: object
print('Original Grocery List:\n',groceries)
groceries['eggs']  = 5
print()
print('After change eggs:\n',groceries)
Original Grocery List:
 eggs       30
apples      6
milk      Yes
bread      No
dtype: object

After change eggs:
 eggs        5
apples      6
milk      Yes
bread      No
dtype: object

我们还可以使用 .drop() 方法删除 Pandas Series 中的条目。Series.drop(label) 方法会从给定 Series 中删除给定的 label。请注意,Series.drop(label) 方法不会原地(in place)从 Series 中删除元素,即不会更改原始 Series,而是返回一个删除了该条目的新 Series。我们来看看代码编写方式:

print(groceries.drop('apples'))
print('\n',groceries)
eggs       5
milk     Yes
bread     No
dtype: object

 eggs        5
apples      6
milk      Yes
bread      No
dtype: object
print('Original List:\n',groceries)

groceries.drop('apples', inplace = True)

print()
print('After removing:\n',groceries)
Original List:
 eggs        5
apples      6
milk      Yes
bread      No
dtype: object

After removing:
 eggs       5
milk     Yes
bread     No
dtype: object

对 Pandas Series 执行算术运算

# We create a Pandas Series that stores a grocery list of just fruits
fruits= pd.Series(data = [10, 6, 3,], index = ['apples', 'oranges', 'bananas'])

# We display the fruits Pandas Series
fruits
apples     10
oranges     6
bananas     3
dtype: int64
# We print fruits for reference
print('Original grocery list of fruits:\n ', fruits)

# We perform basic element-wise operations using arithmetic symbols
print()
print('fruits + 2:\n', fruits + 2) # We add 2 to each item in fruits
print()
print('fruits - 2:\n', fruits - 2) # We subtract 2 to each item in fruits
print()
print('fruits * 2:\n', fruits * 2) # We multiply each item in fruits by 2 
print()
print('fruits / 2:\n', fruits / 2) # We divide each item in fruits by 2
print()
Original grocery list of fruits:
  apples     10
oranges     6
bananas     3
dtype: int64

fruits + 2:
 apples     12
oranges     8
bananas     5
dtype: int64

fruits - 2:
 apples     8
oranges    4
bananas    1
dtype: int64

fruits * 2:
 apples     20
oranges    12
bananas     6
dtype: int64

fruits / 2:
 apples     5.0
oranges    3.0
bananas    1.5
dtype: float64
# We import NumPy as np to be able to use the mathematical functions
import numpy as np

# We print fruits for reference
print('Original grocery list of fruits:\n', fruits)

# We apply different mathematical functions to all elements of fruits
print()
print('EXP(X) = \n', np.exp(fruits))
print() 
print('SQRT(X) =\n', np.sqrt(fruits))
print()
print('POW(X,2) =\n',np.power(fruits,2)) # We raise all elements of fruits to the power of 2
Original grocery list of fruits:
 apples     10
oranges     6
bananas     3
dtype: int64

EXP(X) = 
 apples     22026.465795
oranges      403.428793
bananas       20.085537
dtype: float64

SQRT(X) =
 apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64

POW(X,2) =
 apples     100
oranges     36
bananas      9
dtype: int64
# We print fruits for reference
print('Original grocery list of fruits:\n ', fruits)
print()

# We add 2 only to the bananas
print('Amount of bananas + 2 = ', fruits['bananas'] + 2)
print()

# We subtract 2 from apples
print('Amount of apples - 2 = ', fruits.iloc[0] - 2)
print()

# We multiply apples and oranges by 2
print('We double the amount of apples and oranges:\n', fruits[['apples', 'oranges']] * 2)
print()

# We divide apples and oranges by 2
print('We half the amount of apples and oranges:\n', fruits.loc[['apples', 'oranges']] / 2)
Original grocery list of fruits:
  apples     10
oranges     6
bananas     3
dtype: int64

Amount of bananas + 2 =  5

Amount of apples - 2 =  8

We double the amount of apples and oranges:
 apples     20
oranges    12
dtype: int64

We half the amount of apples and oranges:
 apples     5.0
oranges    3.0
dtype: float64
# We multiply our grocery list by 2
groceries * 2
eggs         10
milk     YesYes
bread      NoNo
dtype: object

例子

import pandas as pd

# DO NOT CHANGE THE VARIABLE NAMES

# A few planets, in the same order as the distances below
planets = ['Earth','Saturn', 'Venus', 'Mars', 'Jupiter']

# Distance of each selected planet from the Sun, in units of 10^6 km
distance_from_sun = [149.6, 1433.5, 108.2, 227.9, 778.6]


# Series of distances from the Sun, labelled by planet name
# (`distance_from_sun` is the data, `planets` is the index).
dist_planets = pd.Series(distance_from_sun, index=planets)


# Time (minutes) sunlight needs to reach each planet: distance divided by
# the speed of light, c = 18, since light travels 18 x 10^6 km/minute.
time_light = dist_planets.div(18)


# Boolean indexing: keep only the planets that sunlight reaches in less
# than 40 minutes.
reached_quickly = time_light.lt(40)
close_planets = time_light.loc[reached_quickly]

创建 Pandas DataFrame

首先,我们将使用 Pandas Series 字典手动创建一个 DataFrame。第一步是创建 Pandas Series 字典。字典创建完毕后,我们可以将该字典传递给 pd.DataFrame()函数。

字典已经创建完毕,我们可以通过将其传递给 pd.DataFrame() 函数,创建 DataFrame。我们将创建一个可以表示多位用户的购物车的 DataFrame,在此例中只有两位用户,即 Alice 和 Bob。

import pandas as pd

items = {
    'Bob':pd.Series(data=[220,25,55], index=['bike','pants','watch']),
    'Alice':pd.Series(data=[40,100,600,45],index=['book','glasses','bike','pants'])
}

print(type(items))
<class 'dict'>
shopping_carts = pd.DataFrame(items)

shopping_carts
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Bob Alice
bike 220.0 600.0
book NaN 40.0
glasses NaN 100.0
pants 25.0 45.0
watch 55.0 NaN

有几个事项需要注意。我们发现 DataFrame 以表格形式显示,和 Excel 电子表格很像,行和列的标签以粗体形式显示。此外注意,DataFrame 的行标签根据构建字典所用的两个 Pandas Series 的索引标签创建而成。DataFrame 的列标签来自字典的键。另一个注意事项是,行标签是两个 Series 索引的并集,并按照字母顺序排序,而不是按它们在 Series 中出现的顺序;列则保持字典中键的顺序(Bob 在前,Alice 在后)。稍后我们将发现,当我们从数据文件中向 DataFrame 加载数据时,不会发生这种重新排序。最后要注意的是,我们发现该 DataFrame 中出现了一些 NaN 值。NaN 是指非数字,Pandas 通过这种方式表示该行和列索引没有值。例如,如果我们查看 Alice 列,我们发现手表索引的值是 NaN。你可以通过查看一开始创建的字典,了解为何是这种情况。可以清晰地看出,Alice 手表标签没有条目。因此,在创建 DataFrame 时,如果特定行索引的特定列没有值,Pandas 将用 NaN 值填充。如果要将此数据馈送到机器学习算法中,我们首先需要删掉这些 NaN 值。在后面的课程中,我们将学习如何处理 NaN 值以及如何清理数据。暂时先将这些值留在我们的 DataFrame 中。

在上述示例中,我们使用具有定义清晰的索引的 Pandas Series 字典创建了 Pandas DataFrame。如果我们不向 Pandas Series 提供索引标签,Pandas 在创建 DataFrame 时将使用数字行索引。我们来看一个示例:

# We create a dictionary of Pandas Series without explicit index labels,
# so each Series gets the default numerical index 0, 1, 2, ...
data = {'Bob' : pd.Series([245, 25, 55]),
        'Alice' : pd.Series([40, 110, 500, 45])}

# We create a DataFrame; Bob has one entry fewer than Alice, so Bob's value
# in row 3 is NaN
df = pd.DataFrame(data)

# We display the DataFrame
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Bob Alice
0 245.0 40
1 25.0 110
2 55.0 500
3 NaN 45
# We print some information about shopping_carts
print('shopping_carts has shape:', shopping_carts.shape)
print('shopping_carts has dimension:', shopping_carts.ndim)
print('shopping_carts has a total of:', shopping_carts.size, 'elements')
print()
print('The data in shopping_carts is:\n', shopping_carts.values)
print()
print('The row index in shopping_carts is:', shopping_carts.index)
print()
print('The column index in shopping_carts is:', shopping_carts.columns)
shopping_carts has shape: (5, 2)
shopping_carts has dimension: 2
shopping_carts has a total of: 10 elements

The data in shopping_carts is:
 [[220. 600.]
 [ nan  40.]
 [ nan 100.]
 [ 25.  45.]
 [ 55.  nan]]

The row index in shopping_carts is: Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

The column index in shopping_carts is: Index(['Bob', 'Alice'], dtype='object')

在创建 shopping_carts DataFrame 时,我们将整个字典传递给了 pd.DataFrame() 函数。但是,有时候你可能只对一部分数据感兴趣。在 Pandas 中,我们可以通过关键字 columns 和 index 选择要将哪些数据放入 DataFrame 中。我们来看一些示例:

bob_shopping_cart = pd.DataFrame(items, columns = ['Bob'])

bob_shopping_cart
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Bob
bike 220
pants 25
watch 55
# We Create a DataFrame that only has selected items for both Alice and Bob
sel_shopping_cart = pd.DataFrame(items, index = ['pants', 'book'])

sel_shopping_cart
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Bob Alice
pants 25.0 45
book NaN 40
# We Create a DataFrame that only has selected items for Alice
alice_sel_shopping_cart = pd.DataFrame(items, index = ['glasses', 'bike'], columns = ['Alice'])

# We display alice_sel_shopping_cart
alice_sel_shopping_cart
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Alice
glasses 100
bike 600

你还可以使用列表(数组)字典手动地创建 DataFrame。流程和之前一样,首先创建一个字典,然后将该字典传递给pd.DataFrame()函数。但是在这种情况下,字典中的所有列表(数组)长度必须一样。我们来看一个示例:

data = {
    'Integers' : [1,2,3],
    'Floats' : [4.5,8.2,9.6]
}

df = pd.DataFrame(data)

df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Integers Floats
0 1 4.5
1 2 8.2
2 3 9.6
data = {
    'Integers' : [1,2,3],
    'Floats' : [4.5,8.2,9.6]
}

df = pd.DataFrame(data, index = ['label 1','label 2','label 3'])

df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Integers Floats
label 1 1 4.5
label 2 2 8.2
label 3 3 9.6

手动创建 Pandas DataFrame 的最后一种方式是使用 Python 字典列表。

item2 = [
    {'bike':20,'pants':90,'watches':34},
    {'watches':10,'glasses':19,'bikes':15,'pants':20}
]

store_item = pd.DataFrame(item2)

store_item
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants watches glasses bikes
0 20.0 90 34 NaN NaN
1 NaN 20 10 19.0 15.0
item2 = [
    {'bike':20,'pants':90,'watches':34},
    {'watches':10,'glasses':19,'bikes':15,'pants':20}
]

store_items = pd.DataFrame(item2, index = ['store 1','store 2'])

store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants watches glasses bikes
store 1 20.0 90 34 NaN NaN
store 2 NaN 20 10 19.0 15.0

访问 Pandas DataFrame 中的元素

# We print the store_items DataFrame
print(store_items)

# We access rows, columns and elements using labels
print()
print('How many bikes are in each store:\n', store_items[['bikes']])
print()
print('How many bikes and pants are in each store:\n', store_items[['bikes', 'pants']])
print()
print('What items are in Store 1:\n', store_items.loc[['store 1']])
print()
print('How many bikes are in Store 2:', store_items['bikes']['store 2'])
         bike  pants  watches  glasses  bikes
store 1  20.0     90       34      NaN    NaN
store 2   NaN     20       10     19.0   15.0

How many bikes are in each store:
          bikes
store 1    NaN
store 2   15.0

How many bikes and pants are in each store:
          bikes  pants
store 1    NaN     90
store 2   15.0     20

What items are in Store 1:
          bike  pants  watches  glasses  bikes
store 1  20.0     90       34      NaN    NaN

How many bikes are in Store 2: 15.0
# We add a new column named shirts to our store_items DataFrame indicating the number of shirts in stock at each store. We
# will put 15 shirts in store 1 and 2 shirts in store 2
store_items['shirts'] = [15,2]

# We display the modified DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants watches glasses bikes shirts
store 1 20.0 90 34 NaN NaN 15
store 2 NaN 20 10 19.0 15.0 2
# We make a new column called suits by adding the number of shirts and pants
store_items['suits'] = store_items['pants'] + store_items['shirts']

# We display the modified DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants watches glasses bikes shirts suits
store 1 20.0 90 34 NaN NaN 15 105
store 2 NaN 20 10 19.0 15.0 2 22
# We create a list holding one Python dictionary with the number of items at the new store
new_items = [{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4}]

# We create a new DataFrame from new_items and provide an index labeled store 3
new_store = pd.DataFrame(new_items, index = ['store 3'])

# We display the items at the new store
new_store
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches glasses
store 3 20 30 35 4
 # We add the new store's row to store_items.
# DataFrame.append was deprecated in pandas 1.4 (this is the FutureWarning
# recorded in the original run's output) and removed in pandas 2.0;
# pd.concat is the supported replacement and yields the same result here:
# columns are unioned and cells missing on either side become NaN.
store_items = pd.concat([store_items, new_store])
store_items
/var/folders/n3/gp474k_159z57lgv69cph80c0000gn/T/ipykernel_35873/1931142937.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  store_items = store_items.append(new_store)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants watches glasses bikes shirts suits
store 1 20.0 90 34 NaN NaN 15.0 105.0
store 2 NaN 20 10 19.0 15.0 2.0 22.0
store 3 NaN 30 35 4.0 20.0 NaN NaN
# We add a new column using data from particular rows in the watches column.
# The slice [1:] skips the first row (store 1); assignment aligns on the row
# index, so store 1 gets NaN in the new column.
store_items['new watches'] = store_items['watches'][1:]

# We display the modified DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants watches glasses bikes shirts suits new watches
store 1 20.0 90 34 NaN NaN 15.0 105.0 NaN
store 2 NaN 20 10 19.0 15.0 2.0 22.0 10.0
store 3 NaN 30 35 4.0 20.0 NaN NaN 35.0

我们还可以将新列插入 DataFrame 的任何位置。dataframe.insert(loc, label, data) 方法使我们能够将新列(具有给定列标签和给定数据)插入 dataframe 的 loc 位置。我们将名称为 shoes 的新列插入 bikes 列前面。因为 bikes 的数字索引值为 4,我们将此值作为 loc。我们来看看代码编写方式:

# We insert a new column with label shoes right before the column with numerical index 4
store_items.insert(4, 'shoes', [8,5,0])

# we display the modified DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants watches glasses shoes bikes shirts suits new watches
store 1 20.0 90 34 NaN 8 NaN 15.0 105.0 NaN
store 2 NaN 20 10 19.0 5 15.0 2.0 22.0 10.0
store 3 NaN 30 35 4.0 0 20.0 NaN NaN 35.0

就像我们可以添加行和列一样,我们也可以删除它们。要删除 DataFrame 中的行和列,我们将使用 .pop() 和 .drop() 方法。.pop() 方法仅允许我们删除列,而 .drop() 方法可以同时用于删除行和列,只需使用关键字 axis 即可。我们来看一些示例:

# We remove the new watches column
store_items.pop('new watches')

# we display the modified DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants watches glasses shoes bikes shirts suits
store 1 20.0 90 34 NaN 8 NaN 15.0 105.0
store 2 NaN 20 10 19.0 5 15.0 2.0 22.0
store 3 NaN 30 35 4.0 0 20.0 NaN NaN
# We remove the watches and shoes columns
store_items = store_items.drop(['watches', 'shoes'], axis = 1)

# we display the modified DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants glasses bikes shirts suits
store 1 20.0 90 NaN NaN 15.0 105.0
store 2 NaN 20 19.0 15.0 2.0 22.0
store 3 NaN 30 4.0 20.0 NaN NaN
# We remove the store 2 and store 1 rows
store_items = store_items.drop(['store 2', 'store 1'], axis = 0)

# we display the modified DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants glasses bikes shirts suits
store 3 NaN 30 4.0 20.0 NaN NaN

有时候,我们可能需要更改行和列标签。我们使用.rename()方法将 bikes 列标签改为 hats

store_items = store_items.rename(columns={'bikes':'hats'})
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants glasses hats shirts suits
store 3 NaN 30 4.0 20.0 NaN NaN
store_items = store_items.rename(index = {'store 3':'last store'})
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike pants glasses hats shirts suits
last store NaN 30 4.0 20.0 NaN NaN

你还可以将索引改为 DataFrame 中的某个列。

# We change the row index to be the data in the pants column
store_items = store_items.set_index('pants')

# we display the modified DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bike glasses hats shirts suits
pants
30 NaN 4.0 20.0 NaN NaN

处理 NaN

# We create a list of Python dictionaries
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
{'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10}]

# We create a DataFrame  and provide the row index
store_items = pd.DataFrame(items2, index = ['store 1', 'store 2', 'store 3'])

# We display the DataFrame
store_items
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 20 30 35 15.0 8 45.0 NaN
store 2 15 5 10 2.0 5 7.0 50.0
store 3 20 30 35 NaN 10 NaN 4.0
# We count the number of NaN values in store_items
x =  store_items.isnull().sum().sum()
# We print x
print('Number of NaN values in our DataFrame:', x)
Number of NaN values in our DataFrame: 3

在上述示例中,.isnull() 方法返回一个大小和 store_items 一样的布尔型 DataFrame,并用 True 表示具有 NaN 值的元素,用 False 表示非 NaN 值的元素。我们来看一个示例:

store_items.isnull()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 False False False False False False True
store 2 False False False False False False False
store 3 False False False True False True False

在 Pandas 中,逻辑值 True 的数字值是 1,逻辑值 False 的数字值是 0。因此,我们可以通过数逻辑值True的数量数出NaN值的数量。为了数逻辑值 True 的总数,我们使用 .sum() 方法两次。要使用该方法两次,是因为第一个 sum() 返回一个 Pandas Series,其中存储了列上的逻辑值 True 的总数,如下所示:

store_items.isnull().sum()
bikes      0
pants      0
watches    0
shirts     1
shoes      0
suits      1
glasses    1
dtype: int64

第二个 sum() 将上述 Pandas Series 中的 1 相加。

除了数 NaN 值的数量之外,我们还可以采用相反的方式,我们可以数非 NaN 值的数量。为此,我们可以使用 .count() 方法,如下所示:

# We print the number of non-NaN values in our DataFrame
print()
print('Number of non-NaN values in the columns of our DataFrame:\n', store_items.count())
print(store_items.count().sum())
Number of non-NaN values in the columns of our DataFrame:
 bikes      3
pants      3
watches    3
shirts     2
shoes      3
suits      2
glasses    2
dtype: int64
18

现在我们已经知道如何判断数据集中是否有任何 NaN 值,下一步是决定如何处理这些 NaN 值。通常,我们有两种选择,可以删除或替换 NaN 值。在下面的示例中,我们将介绍这两种方式。

首先,我们将学习如何从 DataFrame 中删除包含任何 NaN 值的行或列。如果 axis = 0,.dropna(axis) 方法将删除包含 NaN 值的任何行;如果 axis = 1,.dropna(axis) 方法将删除包含 NaN 值的任何列。我们来看一些示例:

# We drop any rows with NaN values
store_items.dropna(axis = 0)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 2 15 5 10 2.0 5 7.0 50.0
# We drop any columns with NaN values
store_items.dropna(axis = 1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shoes
store 1 20 30 35 8
store 2 15 5 10 5
store 3 20 30 35 10

注意,.dropna() 方法不会原地(in place)删除具有 NaN 值的行或列。也就是说,原始 DataFrame 不会改变。你始终可以在 dropna() 方法中将关键字 inplace 设为 True,在原地删除目标行或列。

现在,我们不再删除 NaN 值,而是将它们替换为合适的值。例如,我们可以选择将所有 NaN 值替换为 0。为此,我们可以使用 .fillna() 方法,如下所示。

# We replace all NaN values with 0
store_items.fillna(0)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 20 30 35 15.0 8 45.0 0.0
store 2 15 5 10 2.0 5 7.0 50.0
store 3 20 30 35 0.0 10 0.0 4.0
# We replace NaN values with the previous value in the column (forward fill).
# DataFrame.ffill is the dedicated method; fillna(method = 'ffill') is
# deprecated since pandas 2.1 and gives the same result.
store_items.ffill(axis = 0)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 20 30 35 15.0 8 45.0 NaN
store 2 15 5 10 2.0 5 7.0 50.0
store 3 20 30 35 2.0 10 7.0 4.0

注意 store 3 中的两个 NaN 值被替换成了它们所在列中的上个值。但是注意,store 1 中的 NaN 值没有被替换掉,因为该 NaN 值是该列的第一个值,它前面没有可用于填充的值。但是,如果使用上个行值进行前向填充,则不会发生这种情况。我们来看看具体情形:

# We replace NaN values with the previous value in the row (forward fill
# along axis 1). DataFrame.ffill replaces the deprecated
# fillna(method = 'ffill') spelling.
store_items.ffill(axis = 1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 20.0 30.0 35.0 15.0 8.0 45.0 45.0
store 2 15.0 5.0 10.0 2.0 5.0 7.0 50.0
store 3 20.0 30.0 35.0 35.0 10.0 10.0 4.0

同样,你可以选择用 DataFrame 中之后的值替换 NaN 值,称之为后向填充。.fillna(method = 'backfill', axis) 将通过后向填充 (backfill) 方法沿着给定 axis 使用下个已知值替换 NaN 值。和前向填充一样,我们可以选择使用行值或列值。我们来看一些示例:

# We replace NaN values with the next value in the column (backward fill).
# DataFrame.bfill replaces the deprecated fillna(method = 'backfill') spelling.
store_items.bfill(axis = 0)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 20 30 35 15.0 8 45.0 50.0
store 2 15 5 10 2.0 5 7.0 50.0
store 3 20 30 35 NaN 10 NaN 4.0
# We replace NaN values with the next value in the row (backward fill along
# axis 1). DataFrame.bfill replaces the deprecated
# fillna(method = 'backfill') spelling.
store_items.bfill(axis = 1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 20.0 30.0 35.0 15.0 8.0 45.0 NaN
store 2 15.0 5.0 10.0 2.0 5.0 7.0 50.0
store 3 20.0 30.0 35.0 10.0 10.0 4.0 4.0

注意,.fillna() 方法不会原地(in place)替换(填充)NaN 值。也就是说,原始 DataFrame 不会改变。你始终可以在 fillna() 函数中将关键字 inplace 设为 True,在原地替换 NaN 值。

我们还可以选择使用不同的插值方法替换 NaN 值。例如,.interpolate(method = 'linear', axis) 方法将通过 linear 插值使用沿着给定 axis 的值替换 NaN 值。我们来看一些示例:

# We replace NaN values by using linear interpolation using column values
store_items.interpolate(method = 'linear', axis = 0)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 20 30 35 15.0 8 45.0 NaN
store 2 15 5 10 2.0 5 7.0 50.0
store 3 20 30 35 2.0 10 7.0 4.0

注意,store 3 中的两个 NaN 值被替换成了线性插值。但是注意,store 1 中的 NaN 值没有被替换掉,因为该 NaN 值是该列中的第一个值,它前面没有数据,因此插值函数无法计算值。现在,我们使用行值插入值:

# We replace NaN values by using linear interpolation using row values.
# Interpolating along each row means axis = 1. The original cell passed
# axis = "rows", which is an alias for axis = 0, so the table recorded after
# this cell in the original run just repeats the column-wise result.
# With axis = 1, store 3's shirts becomes 22.5 (between watches 35 and
# shoes 10), store 3's suits becomes 7.0 (between shoes 10 and glasses 4),
# and store 1's trailing glasses NaN is filled with 45.0.
store_items.interpolate(method = 'linear', axis = 1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bikes pants watches shirts shoes suits glasses
store 1 20 30 35 15.0 8 45.0 NaN
store 2 15 5 10 2.0 5 7.0 50.0
store 3 20 30 35 2.0 10 7.0 4.0
import pandas as pd
import numpy as np

# DO NOT CHANGE THE VARIABLE NAMES

# Set the precision of our dataframes to one decimal place.
pd.options.display.precision=1

# Create a Pandas DataFrame that contains the ratings some users have given to a series of books. 
# The ratings given are in the range from 1 to 5, with 5 being the best score. 
# The names of the books, the corresponding authors, and the ratings of each user are given below:

books = pd.Series(data = ['Great Expectations', 'Of Mice and Men', 'Romeo and Juliet', 'The Time Machine', 'Alice in Wonderland' ])
authors = pd.Series(data = ['Charles Dickens', 'John Steinbeck', 'William Shakespeare', ' H. G. Wells', 'Lewis Carroll' ])

# User ratings are in the order of the book titles mentioned above
# If a user has not rated all books, Pandas will automatically consider the missing values as NaN.
# If a user has mentioned `np.nan` value, then also it means that the user has not yet rated that book.
user_1 = pd.Series(data = [3.2, np.nan ,2.5])
user_2 = pd.Series(data = [5., 1.3, 4.0, 3.8])
user_3 = pd.Series(data = [2.0, 2.3, np.nan, 4])
user_4 = pd.Series(data = [4, 3.5, 4, 5, 4.2])


# Use the data above to create a Pandas DataFrame that has the following column
# labels: 'Author', 'Book Title', 'User 1', 'User 2', 'User 3', 'User 4'. 
# Let Pandas automatically assign numerical row indices to the DataFrame. 

# TO DO: Create a dictionary with the data given above
dat = {
    'Book Title':books,
    'Author':authors,
    'User 1':user_1,
    'User 2':user_2,
    'User 3':user_3,
    'User 4':user_4,
}

# TO DO: Create a Pandas DataFrame using the dictionary created above.
# The Series have different lengths; they are aligned on the default
# numerical index, so shorter rating Series leave NaN in the missing rows.
book_ratings = pd.DataFrame(dat)

# TO DO:
# If you created the dictionary correctly you should have a Pandas DataFrame
# that has column labels: 
# 'Author', 'Book Title', 'User 1', 'User 2', 'User 3', 'User 4' 
# and row indices 0 through 4.

# Now replace all the NaN values in your DataFrame with the average rating in
# each column. Replace the NaN values in place. 
# HINT: Use the `pandas.DataFrame.fillna(value, inplace = True)` function for substituting the NaN values. 
# numeric_only = True restricts the mean to the rating columns. Without it
# pandas tries to average the 'Book Title' and 'Author' string columns too,
# which only produced a FutureWarning in older releases but raises
# TypeError from pandas 2.0 onwards.
book_ratings.fillna(book_ratings.mean(numeric_only = True), inplace = True)

# Titles of the books that received a rating of 5 from at least one user:
# compare every cell with 5, keep the rows where any column matches, and
# read the 'Book Title' column of those rows.
best_rated = book_ratings[(book_ratings == 5).any(axis = 1)]['Book Title'].values
print(best_rated)
['Great Expectations' 'The Time Machine']


/var/folders/n3/gp474k_159z57lgv69cph80c0000gn/T/ipykernel_35873/1665517927.py:41: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  book_ratings.fillna(book_ratings.mean(), inplace = True)