Data Cleaning 数据清洗

378 views · 2 min read

Handling Missing Values

查看数据的缺失情况

# get the number of missing data points per column
missing_values_count = nfl_data.isnull().sum()

# look at the number of missing points in the first ten columns
missing_values_count[0:10]

# how many total missing values do we have?
# np.product was deprecated and removed in NumPy 2.0 -- np.prod is the
# supported spelling and computes the same thing (rows * cols here)
total_cells = np.prod(nfl_data.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)

关键问题:缺失的数据是因为没有记录还是表示它不存在
填充值的几种方法:

# drop every row that contains at least one missing value
# (axis=0, rows, is the default)
nfl_data.dropna(axis=0)

# replace all NA's with 0
subset_nfl_data.fillna(0)

# replace each NA with the next valid value below it in the same column
# (back-fill), then replace any still-remaining NA's with 0.
# fillna(method='bfill') is deprecated since pandas 2.1 and removed in 3.0;
# DataFrame.bfill() is the equivalent supported call.
subset_nfl_data.bfill(axis=0).fillna(0)

Scaling and Normalization 缩放和正态化

# min-max scale column 0 into the [0, 1] range
# (minmax_scaling is presumably mlxtend's helper -- import not shown here, TODO confirm)
scaled_data = minmax_scaling(original_data, columns=[0])

# Box-Cox transform the data toward a normal distribution.
# NOTE(review): scipy.stats.boxcox returns a (transformed_array, fitted_lambda)
# tuple when no lmbda is given, so normalized_data here is a tuple, not an array.
normalized_data = stats.boxcox(original_data)

Parsing Dates 解析时间

# parse the string dates ("month/day/two-digit-year") into real datetimes
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format="%m/%d/%y")

# extract the day of the month from the parsed dates
day_of_month_landslides = landslides['date_parsed'].dt.day
day_of_month_landslides.head()

# remove na's
day_of_month_landslides = day_of_month_landslides.dropna()

# plot the day of the month.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot is the drop-in replacement for this usage.
sns.histplot(day_of_month_landslides, kde=False, bins=31)

image.png

Character Encodings 字符编码

# start with a unicode string that contains a non-ASCII character
before = "This is the euro symbol: €"
type(before)  # str

# encode it to UTF-8 bytes
after = before.encode("utf-8", errors="replace")

# check the type
type(after)  # bytes

# printing bytes shows ASCII-range bytes as characters (UTF-8 is a superset
# of ASCII) and everything else as escaped hex
print(after)  # b'This is the euro symbol: \xe2\x82\xac'

# convert it back from utf-8
print(after.decode("utf-8"))  # This is the euro symbol: €

# decoding the UTF-8 bytes as ASCII fails, because the euro sign is outside
# the ASCII range. The original code raised here and the rest of the demo
# never ran; catch the error so the script completes.
try:
    print(after.decode("ascii"))
except UnicodeDecodeError as err:
    print(f"ascii decode failed: {err}")

# start with a string again
before = "This is the euro symbol: €"
# encode it to ASCII, replacing characters that raise errors
after = before.encode("ascii", errors="replace")

# the euro symbol was replaced by '?' -- information is lost irreversibly
print(after.decode("ascii"))  # This is the euro symbol: ?

# sniff the file's character encoding from the first 10 KB of raw bytes
with open("../input/kickstarter-projects/ks-projects-201801.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# check what the character encoding might be
print(result)  # {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

# read the file using the detected encoding.
# NOTE(review): the encoding was detected on the 201801 file but is applied
# to the 201612 file here -- confirm that is intended.
kickstarter_2016 = pd.read_csv("../input/kickstarter-projects/ks-projects-201612.csv", encoding='Windows-1252')

# look at the first few lines
kickstarter_2016.head()

# to_csv writes UTF-8 by default, re-saving the data in a standard encoding
kickstarter_2016.to_csv("ks-projects-201801-utf8.csv")

Inconsistent Data Entry

# take a quick look at the data
professors.head()

# every distinct value in the 'Country' column, sorted alphabetically
countries = professors['Country'].unique()
countries.sort()
countries

# normalise the entries: lowercase everything, then trim surrounding whitespace
professors['Country'] = professors['Country'].str.lower().str.strip()

# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

# ten closest matches to "south korea", scored with token_sort_ratio
matches = process.extract("south korea", countries,
                          limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

# take a look at them
matches

image.png

def replace_matches_in_column(df, column, string_to_match, min_ratio=47):
    """Replace fuzzy matches of ``string_to_match`` in ``df[column]`` in place.

    Finds the 10 unique values of ``df[column]`` closest to ``string_to_match``
    (fuzzywuzzy token_sort_ratio), keeps those scoring at least ``min_ratio``,
    and overwrites every matching row with ``string_to_match``.

    Args:
        df: DataFrame to modify in place.
        column: name of the column to clean.
        string_to_match: canonical value to write into matching rows.
        min_ratio: minimum similarity score (0-100) for a value to be replaced.
    """
    # get a list of unique strings
    unique_strings = df[column].unique()

    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, unique_strings,
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # keep only matches whose similarity score reaches min_ratio.
    # (original comment said "ratio > 90", but the code has always used
    # min_ratio; original also shadowed `matches` in this comprehension)
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input string
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")

# use the function we just wrote to replace close matches to "south korea" with "south korea"
replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")