pandasで欠損値を見つける・カウントする・置き換える

# Imports are included here so this first snippet runs on its own.
import math

import numpy as np
import pandas as pd

# Values that pd.isna reports as missing.
print(pd.isna(pd.NA))     # True
print(pd.isna(np.nan))    # True
print(pd.isna(math.nan))  # True
print(pd.isna(None))      # True

# Ordinary values that only look "empty" are not reported as missing.
print(pd.isna(0))         # False
print(pd.isna(''))        # False (empty string)
print(pd.isna(' '))       # False (single half-width space)

表にするとこうなります。

isnaで引っかかる	pd.NA np.nan math.nan None
isnaで引っかからない	0 空文字 半角スペース 全角スペース

欠損値をprintしたり、型を見てみると、以下のようになります。

import pandas as pd
import numpy as np
import math

# For each missing-value marker, show its printed form, its type, and the
# result of pd.isna (one item per output line).

# pd.NA
print(pd.NA, type(pd.NA), pd.isna(pd.NA), sep='\n')
# <NA>
# <class 'pandas._libs.missing.NAType'>
# True

# np.nan
print(np.nan, type(np.nan), pd.isna(np.nan), sep='\n')
# nan
# <class 'float'>
# True

# math.nan
print(math.nan, type(math.nan), pd.isna(math.nan), sep='\n')
# nan
# <class 'float'>
# True

# None
print(None, type(None), pd.isna(None), sep='\n')
# None
# <class 'NoneType'>
# True

# Ordinary values are not reported as missing: zero, the empty string,
# a half-width space and a full-width space, in that order.
for ordinary in (0, '', ' ', '　'):
    print(pd.isna(ordinary))
# False
# False
# False
# False

isnaとisnullは同じものですが、確認しましょう。

# Run the same four missing-value markers through pd.isnull.
for marker in (pd.NA, np.nan, math.nan, None):
    print(pd.isnull(marker))

# True
# True
# True
# True
# pd.isnull gives the same results as pd.isna.

欠損値をカウントする

DataFrameの欠損値をカウントしてみます。全体、横方向、縦方向のそれぞれでカウントできます。

# Sample DataFrame: 'age' has one missing value and 'country' has two.
df = pd.DataFrame({
    'name': ['John', 'Nana', 'Ken'],
    'age': [10, np.nan, 3],
    'country': ['us', np.nan, np.nan],
})

print(df)
#    name   age country
# 0  John  10.0      us
# 1  Nana   NaN     NaN
# 2   Ken   3.0     NaN

# Boolean mask with the same shape as df; True marks a missing cell.
na_mask = df.isna()
print(na_mask)
#     name    age  country
# 0  False  False    False
# 1  False   True     True
# 2  False  False     True

# Number of missing values in each column.
print(na_mask.sum())
# name       0
# age        1
# country    2
# dtype: int64

# Number of missing values in each row.
print(na_mask.sum(axis=1))
# 0    0
# 1    2
# 2    1
# dtype: int64

# Number of missing values in the whole DataFrame.
print(na_mask.sum().sum())
# 3

Seriesの欠損値をカウントしてみます。

# Sample Series: the None entry is shown as NaN in the float64 output below.
s = pd.Series(
    [5, None, 2],
    index=['orange', 'apple', 'lemon'],
    name='num',
)

print(s)
# orange    5.0
# apple     NaN
# lemon     2.0
# Name: num, dtype: float64

# Boolean mask: True marks a missing entry.
s_missing = s.isna()
print(s_missing)
# orange    False
# apple      True
# lemon     False
# Name: num, dtype: bool

# Count the missing values in the Series.
print(s_missing.sum())
# 1

欠損値を見つけて、置換する

欠損値を別の値に置換してみましょう。

下の例では、DataFrameのデータを用いて、age列の欠損値はゼロに、country列の欠損値は'unknown'に置換し、欠損値を無くしています。

# Sample DataFrame with missing values in both 'age' and 'country'.
df = pd.DataFrame({
    'name': ['John', 'Nana', 'Ken'],
    'age': [np.nan, np.nan, 3],
    'country': ['us', np.nan, np.nan],
})

print(df)
#    name  age country
# 0  John  NaN      us
# 1  Nana  NaN     NaN
# 2   Ken  3.0     NaN

print(df['age'])
# 0    NaN
# 1    NaN
# 2    3.0
# Name: age, dtype: float64

# Boolean mask: True where 'age' is missing.
age_missing = df['age'].isna()
print(age_missing)
# 0     True
# 1     True
# 2    False
# Name: age, dtype: bool

# Show only the rows whose 'age' is missing.
print(df.loc[age_missing])
#    name  age country
# 0  John  NaN      us
# 1  Nana  NaN     NaN

# Select the rows whose 'age' is missing and overwrite that cell with 0.
df.loc[age_missing, 'age'] = 0
print(df)
#    name  age country
# 0  John  0.0      us
# 1  Nana  0.0     NaN
# 2   Ken  3.0     NaN

# Replace the missing 'country' cells with 'unknown' in the same way.
country_missing = df['country'].isna()
df.loc[country_missing, 'country'] = 'unknown'
print(df)
#    name  age  country
# 0  John  0.0       us
# 1  Nana  0.0  unknown
# 2   Ken  3.0  unknown

# Count the missing values that remain.
print(df.isna().sum().sum())
# 0

Seriesで欠損値を置換してみます。

# Sample Series: the None entry is shown as NaN in the float64 output below.
s = pd.Series(
    [5, None, 2],
    index=['orange', 'apple', 'lemon'],
    name='num',
)

print(s)
# orange    5.0
# apple     NaN
# lemon     2.0
# Name: num, dtype: float64

# Overwrite the missing entries with zero, selecting them with a boolean mask.
s_missing = s.isna()
s.loc[s_missing] = 0
print(s)
# orange    5.0
# apple     0.0
# lemon     2.0
# Name: num, dtype: float64

# Count the missing values after the replacement.
print(s.isna().sum())
# 0