설치

python -m pip install --upgrade pip
python -m pip install pandas
python -m pip install openpyxl

python -m pip install xlsxwriter
# xlsxwriter는 파일을 쓸 때 인코딩 또는 문자 깨짐으로 인한 오류를 처리하기 위해 설치

기능

엑셀파일 불러오기

import pandas as pd

# Load the workbook into a DataFrame. The path is a template: substitute the
# real directory and file name. header=0 is passed so that row 0 of the sheet
# supplies the column labels.
dataset = pd.read_excel("{파일경로}/{파일명}.xlsx", header=0)

엑셀파일 저장하기

단일 Sheet 저장

import pandas as pd

# Step 1: assemble the table, one inner list per row, labelled by my_header.
my_header = ["단어", "단어 길이"]
my_data = [
    ["바나나", "3"],
    ["사과", "2"],
    ["대머리독수리", "6"],
]
df = pd.DataFrame(data=my_data, columns=my_header)

# Step 2: write the frame to my_output.xlsx under the given sheet name.
df.to_excel(excel_writer="my_output.xlsx", sheet_name="단어 길이")

다중 Sheet 저장

import pandas as pd

# Shared column labels and the two row sets, one per worksheet.
my_header = ["단어", "단어 길이"]
my_data1 = [["바나나", "3"], ["사과", "2"], ["대머리독수리", "6"]]
my_data2 = [["수영", "2"], ["사이클", "3"], ["마라톤", "3"]]

# Create the workbook (mode="w") and store my_data1 on its own sheet.
# Plain string literals are used: none of them contain placeholders, so an
# f-prefix would be misleading.
df1 = pd.DataFrame(my_data1, columns=my_header)
with pd.ExcelWriter(
    "./word_counting.xlsx", mode="w", engine="openpyxl"
) as excel_writer:
    df1.to_excel(excel_writer, sheet_name="과일")

# Reopen the same workbook in append mode (mode="a") and add my_data2 as a
# second sheet.
df2 = pd.DataFrame(my_data2, columns=my_header)
with pd.ExcelWriter(
    "./word_counting.xlsx", mode="a", engine="openpyxl"
) as excel_writer:
    df2.to_excel(excel_writer, sheet_name="운동")

데이터 처리

Index 순환

import pandas as pd

# Load the workbook; the path is a template to be replaced with a real one.
dataset = pd.read_excel("{파일경로}/{파일명}.xlsx", header=0)

# Visit the rows by integer position. Indexing dataset.iloc explicitly avoids
# depending on the indexer object being iterable, which pandas does not
# document as part of its API.
for pos in range(len(dataset)):
    print(dataset.iloc[pos])

Row 순환 (Header 정보 유지)

import pandas as pd

# Load the workbook; the path is a template to be replaced with a real one.
dataset = pd.read_excel("{파일경로}/{파일명}.xlsx", header=0)

# iterrows() yields (index label, row) pairs, so each row can be addressed by
# column label.
for idx, row in dataset.iterrows():
    # NOTE(review): `col이름` is not defined in this snippet; presumably it is
    # a placeholder for the column label to read. Confirm and substitute.
    print(idx, row[col이름])

마지막 Column에 데이터 추가

### Load dataset
dataset = pd.read_excel("./my_data.xlsx", header=0)
# info() prints its summary itself and returns None, so it is called directly
# instead of being wrapped in print().
dataset.info()


### Build the values for the new column: one entry per row (0 .. n_rows - 1)
added = list(range(dataset.shape[0]))


### Append the values as the last COLUMN, labelled "added"
# Position dataset.shape[1] is one past the current last column.
dataset.insert(dataset.shape[1], "added", added, allow_duplicates=True)


### Inspect the result
dataset.info()
print(dataset.head())

특정 포맷으로 데이터 생성 및 추가

import pandas as pd

# Define the table layout: three columns, no rows yet.
data_form = {
    'A': [],
    'B': [],
    'C': [],
}
# Create an empty DataFrame with that layout.
data = pd.DataFrame(data_form)

# New record to add; no value is supplied for column 'B'.
new_data = {
    'A': 1,
    'C': 2,
}

# Append the record as the last row. len(data) is the next unused integer
# label, so .loc enlarges the frame; the column missing from new_data ('B')
# ends up as NaN.
data.loc[len(data)] = new_data

특정 조건의 데이터 추출

- 특정 조건을 만족하는 row만 추출

import pandas as pd

filepath = "my/file/path.xlsx"
df = pd.read_excel(io=filepath)

# Keep only the rows whose 'cond' value differs from 0.
df = df[df['cond'].ne(0)]

print(df)

- 특정 col의 값이 nan인 row를 제외

import pandas as pd

filepath = "my/file/path.xlsx"
df = pd.read_excel(filepath)

# Drop the rows whose 'cond' value is NaN. The column is passed as a list
# because older pandas releases do not accept a bare label for `subset`.
df = df.dropna(subset=['cond'])

print(df)

함수 적용

- df.map : Series 객체(pandas 2.1 이상에서는 DataFrame도 지원)에서 개별 요소(element-wise) 변환을 수행

import pandas as pd

# `...` is a placeholder: pass your own data here.
sample = pd.DataFrame(...)

# Example 1: apply some_func to every element of sample and keep the result.
# some_func is a placeholder for any callable taking one value; it must be
# passed as the callable itself, not wrapped in braces (that would be a set).
# NOTE: DataFrame.map needs pandas >= 2.1; older releases call it applymap.
sample = sample.map(some_func)

# Example 2: overwrite column 'first' with each of its values incremented by 1.
sample['first'] = sample['first'].map(lambda x: x + 1)

- df.apply : DataFrame과 Series 모두에서 사용 가능

import pandas as pd

# `...` is a placeholder: pass your own data here.
sample = pd.DataFrame(...)

# Apply some_func to each value of column 'item1' and store the results in a
# new column 'new'. some_func is a placeholder for any one-argument callable;
# it is passed as the callable itself, not wrapped in braces.
sample['new'] = sample['item1'].apply(some_func)

- dataframe의 여러 column 값을 기반으로 출력 도출

import pandas as pd

def sumfunction(rows):
    """Return ``rows['first'] + rows['second']``.

    ``rows`` is anything that supports lookup by the keys 'first' and
    'second'.
    """
    first_value = rows['first']
    second_value = rows['second']
    return first_value + second_value

# `...` is a placeholder: pass your own data here.
sample = pd.DataFrame(...)

# axis=1 makes apply() hand each row (rather than each column) to the
# function, so sumfunction can combine several columns of the same row.
sample['new'] = sample.apply(sumfunction, axis=1)

기타

Nan 확인

import pandas as pd

# Report whether some_value is missing according to pd.isna.
# some_value is not defined in this snippet; it stands for the value to test.
print("nan value" if pd.isna(some_value) else "not Nan")

[Python] 엑셀 파일 다루기 with Pandas

설치

기능

엑셀파일 불러오기

엑셀파일 저장하기

데이터 처리

기타

관련 게시글