Handling Missing Values
查看数据的缺失情况
# get the number of missing data points per column
missing_values_count = nfl_data.isnull().sum()
# look at the # of missing points in the first ten columns
missing_values_count[0:10]
# how many total missing values do we have?
# NOTE: np.product was deprecated and removed in NumPy 2.0 — np.prod is the
# supported spelling and behaves identically here.
total_cells = np.prod(nfl_data.shape)
total_missing = missing_values_count.sum()
# percent of data that is missing
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)
关键问题:缺失的数据是因为没有记录还是表示它不存在
处理缺失值的几种方法(删除或填充):
# drop every row containing at least one missing value
# (axis defaults to 0, i.e. rows; this returns a new DataFrame — the demo
# does not assign it back)
nfl_data.dropna(axis=0)
# replace all NA's with 0
subset_nfl_data.fillna(0)
# replace all NA's with the value that comes directly after it in the same
# column (back-fill), then replace all the remaining NA's with 0.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated
# since pandas 2.1.
subset_nfl_data.bfill(axis=0).fillna(0)
Scaling and Normalization 缩放和正态化
# min-max scale column 0 into the [0, 1] range
# (minmax_scaling comes from mlxtend — imported elsewhere in the notes)
scaled_data = minmax_scaling(original_data, columns=[0])
# Box-Cox transform toward a normal distribution.
# NOTE(review): scipy's stats.boxcox returns a tuple
# (transformed_values, fitted_lambda), so the transformed data is
# normalized_data[0] — confirm downstream usage.
normalized_data = stats.boxcox(original_data)
Parsing Dates 解析时间
# parse the date strings (mm/dd/yy) into proper datetime values
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format="%m/%d/%y")
# pull the day-of-month out of the parsed datetime column
day_of_month_landslides = landslides['date_parsed'].dt.day
day_of_month_landslides.head()
# remove na's
day_of_month_landslides = day_of_month_landslides.dropna()
# plot the day of the month.
# sns.distplot was deprecated and removed in seaborn 0.14; histplot is the
# replacement (kde is off by default, matching the old kde=False).
sns.histplot(day_of_month_landslides, bins=31)
Character Encodings 字符编码
# start with a string containing a non-ASCII character
before = "This is the euro symbol: €"
type(before)  # str
# encode it to UTF-8 bytes, replacing characters that raise errors
after = before.encode("utf-8", errors="replace")
# check the type
type(after)  # bytes
# bytes objects print ASCII-compatible characters literally (UTF-8 is a
# superset of ASCII); everything else shows as escaped hex
print(after)  # b'This is the euro symbol: \xe2\x82\xac'
# convert it back from UTF-8
print(after.decode("utf-8"))  # This is the euro symbol: €
# decoding UTF-8 bytes as ASCII fails because the euro symbol is outside the
# ASCII range — catch the error so the demo script keeps running instead of
# crashing (the original line raised an uncaught UnicodeDecodeError)
try:
    print(after.decode("ascii"))
except UnicodeDecodeError as err:
    print(f"ascii decode failed: {err}")
# start with a string
before = "This is the euro symbol: €"
# encode it to ASCII, replacing characters that raise errors — this is lossy:
# the euro symbol is irreversibly replaced by '?'
after = before.encode("ascii", errors="replace")
# convert it back to show the lossy replacement
print(after.decode("ascii"))  # This is the euro symbol: ?
# detect the file's character encoding.
# chardet is imported here because in the original notes it was only imported
# further down, after its first use.
import chardet

# sniff the first 10 kB — enough for chardet to guess the encoding
with open("../input/kickstarter-projects/ks-projects-201801.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))
# check what the character encoding might be
print(result)  # {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
# read the file with the detected encoding
# NOTE(review): the detection above sampled the 201801 file but this reads
# the 201612 file — confirm the two share an encoding
kickstarter_2016 = pd.read_csv("../input/kickstarter-projects/ks-projects-201612.csv", encoding='Windows-1252')
# look at the first few lines
kickstarter_2016.head()
# to_csv writes UTF-8 by default
kickstarter_2016.to_csv("ks-projects-201801-utf8.csv")
Inconsistent Data Entry
# take a quick look at the data
professors.head()
# all unique country labels, sorted alphabetically for inspection
countries = professors['Country'].unique()
countries.sort()
countries
# clean up the obvious inconsistencies in one pass:
# lower-case every value, then trim leading/trailing whitespace
professors['Country'] = professors['Country'].str.lower().str.strip()
# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet
# the ten candidates most similar to "south korea",
# scored with the token-sort ratio
matches = process.extract("south korea", countries,
                          limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
# inspect the (candidate, score) pairs
matches
def replace_matches_in_column(df, column, string_to_match, min_ratio=47):
    """Replace near-duplicate strings in ``df[column]`` with one canonical value.

    Finds the values closest to ``string_to_match`` via fuzzy matching
    (token-sort ratio) and rewrites, in place, every row whose value scores at
    least ``min_ratio`` so they all share the canonical spelling.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column to clean.
        string_to_match: Canonical string close matches are replaced with.
        min_ratio: Minimum similarity score (0-100, default 47) a candidate
            needs before its rows are rewritten.
    """
    # match against the unique strings, not every individual row
    strings = df[column].unique()

    # top 10 closest matches to our input string, as (candidate, score) pairs
    matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                         limit=10,
                                         scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # keep only candidates whose score clears min_ratio.
    # (The original comprehension shadowed the `matches` list with its own
    # loop variable, and its comment claimed a hard-coded threshold of 90.)
    close_matches = [candidate for candidate, score in matches
                     if score >= min_ratio]

    # boolean mask of the rows whose value is one of the close matches
    rows_with_matches = df[column].isin(close_matches)

    # replace those rows with the canonical string (mutates df in place)
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")
# canonicalise every close variant of "south korea" in the Country column
replace_matches_in_column(professors, 'Country', string_to_match="south korea")