import pandas as pd
import numpy as np


np.random.seed(12345)
np.set_printoptions(precision=4, suppress=True)

import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))


# Remember the row-display limit in effect before it is changed
# (pandas ships with a default of 60).
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
# Use the fully qualified option name: the short pattern "max_rows" can match
# more than one registered option, which makes set_option raise OptionError.
pd.set_option("display.max_rows", 20)


obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64


obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


obj2 = obj.reindex(['a', 'c', 'd', 'e'])
obj2

a   -5.3
c    3.6
d    4.5
e    NaN
dtype: float64


obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 5])
obj3

0      blue
2    purple
5    yellow
dtype: object


obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    purple
5    yellow
dtype: object


obj3.reindex(range(-1, 6), method='ffill')

-1       NaN
0       blue
1       blue
2     purple
3     purple
4     purple
5     yellow
dtype: object


obj3.reindex(range(-1, 6), method='bfill')

-1      blue
0       blue
1     purple
2     purple
3     yellow
4     yellow
5     yellow
dtype: object


obj3.reindex(range(-1, 6), method='nearest')

-1      blue
0       blue
1     purple
2     purple
3     purple
4     yellow
5     yellow
dtype: object


obj3.reindex(range(-1, 6), fill_value='No Color')

-1    No Color
0         blue
1     No Color
2       purple
3     No Color
4     No Color
5       yellow
dtype: object


obj3

0      blue
2    purple
5    yellow
dtype: object


# Build a 3x3 table holding 0..8 in row-major order, with rows labelled
# a/c/d and one column per state; the bare name displays it in a notebook.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame


frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2


states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)


obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64


new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64


obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64


obj.drop('c', inplace=True)

obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64


# Build a 4x4 table holding 0..15 in row-major order, with one row per state
# and columns named one..four; the bare name displays it in a notebook.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns='one two three four'.split(),
)
data


data.drop(['Colorado', 'Ohio'])


data.drop('two', axis=1)


data.drop(['two', 'four'], axis='columns')


data.drop('two', axis=1, inplace=True)


data


obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64


obj['b']

1.0


# Positional lookup of the second element. Use .iloc explicitly: an integer
# key passed to [] on a string-labelled Series relies on a deprecated
# positional fallback.
obj.iloc[1]

1.0


obj[2:4]

c    2.0
d    3.0
dtype: float64


obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64


# Positional selection of the 2nd and 4th elements. Use .iloc explicitly: a
# list of integers passed to [] on a string-labelled Series relies on a
# deprecated positional fallback.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64


obj < 2

a     True
b     True
c    False
d    False
dtype: bool


obj[obj < 2]

a    0.0
b    1.0
dtype: float64


obj['b':'c']

b    1.0
c    2.0
dtype: float64


obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64


obj.reindex(['b', 'd', 'c', 'a'])

b    5.0
d    3.0
c    5.0
a    0.0
dtype: float64


obj['b':'d']

b    5.0
c    5.0
d    3.0
dtype: float64


data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data


data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32


data[['three', 'one']]


data[:2]


mask1 = data['three'] > 5
mask1

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool


data[mask1]


data[~mask1] = 0
data


mask2 = data < 6
mask2


data[mask2] = 0
data


data.loc['Colorado']

one      0
two      0
three    6
four     7
Name: Colorado, dtype: int32


data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32


data.loc['Colorado', ['two', 'three']]

two      0
three    6
Name: Colorado, dtype: int32


data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32


data.iloc[[1, 2], [3, 0, 1]]


data.loc[:'Colorado', 'two']

Ohio        0
Colorado    0
Name: two, dtype: int32


data.iloc[:, :3][data.three > 5]


s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64


s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64


s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64


df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), 
                   columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df1


df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df2


df1 + df2


df1.add(df2, fill_value=0)


arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])


arr[0]

array([0., 1., 2., 3.])


arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])


arr[:,1]

array([1., 5., 9.])


# Adding the 1-D column slice arr[:, 1] directly to the 2-D array arr has
# incompatible shapes, so NumPy raises ValueError. Catch only that error
# (not a bare except) and report that broadcasting is not possible.
try:
    arr + arr[:, 1]
except ValueError:
    print("브로드캐스팅 불가능!")

브로드캐스팅 불가능!


arr_1 = arr[:,1][:, np.newaxis]
arr_1

array([[1.],
       [5.],
       [9.]])


arr + arr_1

array([[ 1.,  2.,  3.,  4.],
       [ 9., 10., 11., 12.],
       [17., 18., 19., 20.]])


frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame


series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


frame - series


series2 = pd.Series(range(3), index=['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64


frame + series2


series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64


frame.sub(series3, axis=0)


frame.sub(series3, axis='index')


frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame


np.abs(frame)


np.abs(frame['b'])

Utah      0.204708
Ohio      0.555730
Texas     0.092908
Oregon    1.246435
Name: b, dtype: float64


def format(x):  # noqa: A001 - name kept because later cells call ``format``
    """Return ``x`` rendered as a string with two decimal places.

    Note that this name shadows the builtin ``format`` for the rest of the
    script; it is retained because subsequent statements reference it.
    """
    return '%.2f' % x


frame['e'].map(format)

Utah      -0.52
Ohio       1.39
Texas      0.77
Oregon    -1.30
Name: e, dtype: object


# Apply ``format`` to every element of ``frame``.
# NOTE(review): DataFrame.applymap is reported as deprecated in recent pandas
# releases (2.1+) in favor of DataFrame.map — confirm against the installed
# version before switching.
frame.applymap(format)


def f1(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` may be any object exposing ``max()`` and ``min()``, such as the
    Series handed over by ``DataFrame.apply``.
    """
    return x.max() - x.min()


frame.apply(f1)

b    1.802165
d    1.684034
e    2.689627
dtype: float64


frame.apply(f1, axis='columns')

Utah      0.998382
Ohio      2.521511
Texas     0.676115
Oregon    2.542656
dtype: float64


frame.apply(f1, axis=1)

Utah      0.998382
Ohio      2.521511
Texas     0.676115
Oregon    2.542656
dtype: float64


def f2(x):
    """Summarize ``x`` as a two-entry Series labelled 'min' and 'max'."""
    extremes = [x.min(), x.max()]
    return pd.Series(extremes, index=['min', 'max'])


frame.apply(f2)


obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64


obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64


obj.sort_index(ascending=False)

d    0
c    3
b    2
a    1
dtype: int64


frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame


frame.sort_index()


frame.sort_index(axis=1)


frame.sort_index(axis='columns')


frame.sort_index(axis=1, ascending=False)


obj = pd.Series([4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64


obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64


obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64


obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64


frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame


frame.sort_values(by='b')


frame.sort_values(by=['a', 'b'])


frame.sort_values(by=2, axis=1, ascending=False)

| 메서드 | 설명 |
| --- | --- |
| `add()` | 덧셈(`+`) 계산 메서드 |
| `sub()` | 뺄셈(`-`) 계산 메서드 |
| `mul()` | 곱셈(`*`) 계산 메서드 |
| `div()` | 나눗셈(`/`) 계산 메서드 |
| `floordiv()` | 몫(`//`) 계산 메서드 |
| `pow()` | 거듭제곱(`**`) 계산 메서드 |

	b	d	e
Utah	-0.204708	0.478943	-0.519439
Ohio	-0.555730	1.965781	1.393406
Texas	0.092908	0.281746	0.769023
Oregon	1.246435	1.007189	-1.296221

	b	d	e
min	-0.555730	0.281746	-1.296221
max	1.246435	1.965781	1.393406

판다스 2편

주요 내용

기본 설정

5.2 핵심 기능 (p. 198)

5.2.1 리인덱싱 (p. 198)

시리즈 리인덱싱

결측치 채우기 1: `method` 키워드 인자

결측치 채우기 2: `fill_value` 키워드 인자

데이터프레임 리인덱싱

5.2.2 `drop()` 메서드 (p. 201)

5.2.3 인덱싱, 슬라이싱, 필터링(부울 인덱싱) (p. 203)

시리즈의 인덱싱, 슬라이싱, 필터링(부울 인덱싱)

데이터프레임의 인덱싱, 슬라이싱, 필터링(부울 인덱싱)

행 단위 인덱싱/슬라이싱 (p. 206)

5.2.5 산술 연산 (p. 210)

연산 과정에서 결측치 채우기

데이터프레임과 시리즈 사이의 연산

5.2.6 함수 적용 (p. 217)

유니버설 함수

`map()`과 `applymap()` 메서드

`apply()` 메서드

5.2.7 정렬 (p. 220)

`sort_index()` 메서드

`sort_values()` 메서드

	one	two	three	four
Ohio	True	True	True	True
Colorado	True	True	False	False
Utah	False	False	False	False
New York	False	False	False	False

	b	d	e
Utah	0.204708	0.478943	0.519439
Ohio	0.555730	1.965781	1.393406
Texas	0.092908	0.281746	0.769023
Oregon	1.246435	1.007189	1.296221

	b	d	e
Utah	-0.20	0.48	-0.52
Ohio	-0.56	1.97	1.39
Texas	0.09	0.28	0.77
Oregon	1.25	1.01	-1.30

판다스 2편

주요 내용

기본 설정

5.2 핵심 기능 (p. 198)

5.2.1 리인덱싱 (p. 198)

시리즈 리인덱싱

결측치 채우기 1: method 키워드 인자

결측치 채우기 2: fill_value 키워드 인자

데이터프레임 리인덱싱

5.2.2 drop() 메서드 (p. 201)

5.2.3 인덱싱, 슬라이싱, 필터링(부울 인덱싱) (p. 203)

시리즈의 인덱싱, 슬라이싱, 필터링(부울 인덱싱)

데이터프레임의 인덱싱, 슬라이싱, 필터링(부울 인덱싱)

행 단위 인덱싱/슬라이싱 (p. 206)

5.2.5 산술 연산 (p. 210)

연산 과정에서 결측치 채우기

데이터프레임과 시리즈 사이의 연산

5.2.6 함수 적용 (p. 217)

유니버설 함수

map()과 applymap() 메서드

apply() 메서드

5.2.7 정렬 (p. 220)

sort_index() 메서드

sort_values() 메서드

결측치 채우기 1: `method` 키워드 인자

결측치 채우기 2: `fill_value` 키워드 인자

5.2.2 `drop()` 메서드 (p. 201)

`map()`과 `applymap()` 메서드

`apply()` 메서드

`sort_index()` 메서드

`sort_values()` 메서드