Matplotlib-stack overflow annual developer survey

tags

Playdata_Python

summary

Matplotlib-연습

type

Post

thumbnail

updatedAt

Jan 31, 2023 07:05 AM

import pandas as pd
import seaborn as sns # 서브 패키지
import numpy as np
import matplotlib as mpl # 데이터 시각화
import matplotlib.pyplot as plt # 서브 패키지

# Load the data.
# The CSV files are expected under ./data relative to the notebook; the survey
# answers are read into a DataFrame indexed by ResponseId.
survey_raw_df = pd.read_csv('data/survey_results_public.csv', index_col='ResponseId')

# From the schema file keep only the "question" column, indexed by qname, so
# that schema_raw[<column name>] returns the question text for that column.
schema_raw = pd.read_csv('data/survey_results_schema.csv', index_col='qname').loc[:, "question"]

# Look up the question text behind individual columns.
schema_raw.loc["CompTotal"]

schema_raw.loc["CompFreq"]

# Summary of the raw frame (column names, non-null counts, dtypes).
survey_raw_df.info()

# Raise the display limits so wide/long frames are not truncated:
# - display.max_columns : maximum number of columns shown
# - display.max_rows    : maximum number of rows shown
pd.options.display.max_columns = 78
pd.options.display.max_rows = 78

# Work with a subset of the columns only:
# - demographics (country, age, gender, education level)
# - programming skills, experience and preferences
# - employment-related information
selected_columns = [
    # demographics
    'Country', 'Age', 'Gender', 'EdLevel',
    # programming experience
    'YearsCode', 'YearsCodePro',
    'LanguageHaveWorkedWith', 'LanguageWantToWorkWith',
    'LearnCodeCoursesCert', 'ProfessionalTech', 'SOAccount',
    # employment
    'Employment', 'DevType', 'WorkExp',
]

# Select those columns and copy() the result so later edits to survey_df
# are made on an object separate from survey_raw_df.
survey_df = survey_raw_df.loc[:, selected_columns].copy()
survey_df

73268 rows × 14 columns

# Print the info() summary of the newly created survey_df.
survey_df.info()

# Print the survey question text stored for the YearsCode column.
print(schema_raw["YearsCode"])

Including any education, how many years have you been coding in total?

# Use value_counts() to see which distinct values YearsCode holds and how often each occurs.
survey_df["YearsCode"].value_counts()

10                    5217
5                     5193
6                     4651
4                     4480
7                     4237
8                     4227
3                     4122
2                     3351
12                    2995
15                    2962
20                    2659
9                     2581
11                    1819
14                    1811
13                    1624
25                    1582
1                     1566
Less than 1 year      1413
16                    1406
30                    1330
22                    1176
18                    1150
17                    1080
40                     777
24                     752
35                     742
23                     644
19                     563
26                     489
27                     485
21                     483
28                     425
32                     371
38                     288
37                     277
34                     241
36                     236
33                     229
42                     216
29                     208
More than 50 years     172
31                     170
45                     167
39                     159
41                     143
43                     115
44                     104
50                      68
46                      57
47                      42
48                      41
49                      35
Name: YearsCode, dtype: int64

# column(Series)이 NaN 값을 얼마나 갖는지 확인하기 위한 함수

def count_nan(series):
    """Return how many entries of *series* are missing.

    Computed as the total number of rows minus the number of non-null
    entries reported by ``series.count()``.
    """
    total_rows = series.shape[0]
    non_null_rows = series.count()
    return total_rows - non_null_rows

def count_nan1(series):
    """Return how many entries of *series* are missing.

    Builds the boolean missing-value mask and sums it (True counts as 1).
    """
    missing_mask = series.isna()
    return missing_mask.sum()

# Number of missing answers in YearsCode.
count_nan(survey_df["YearsCode"])

# YearsCode and YearsCodePro mix numeric answers with two text answers.
# Map 'Less than 1 year' -> 0 and 'More than 50 years' -> 51, then cast the
# column to float and store it back on survey_df.
#
# replace() is deliberately NOT called with inplace=True: doing that on a
# Series taken from the DataFrame is chained assignment (SettingWithCopy
# hazard, and no effect on the frame under pandas Copy-on-Write). The result
# is built as a new Series and assigned back explicitly instead.
years_text_to_number = {'Less than 1 year': 0, 'More than 50 years': 51}

a = survey_df["YearsCode"].replace(years_text_to_number)
survey_df["YearsCode"] = a.astype(float)

b = survey_df["YearsCodePro"].replace(years_text_to_number)
survey_df["YearsCodePro"] = b.astype(float)

# Alternative approach: pd.to_numeric(..., errors='coerce') converts to a
# numeric dtype and turns anything unparsable (such as the two text answers)
# into NaN. Here the columns are already float, so the values are unchanged.
survey_df['YearsCode'] = pd.to_numeric(survey_df['YearsCode'], errors='coerce')
survey_df['YearsCodePro'] = pd.to_numeric(survey_df['YearsCodePro'], errors='coerce')

# Count of each gender answer.
# Answers that combine two or more options are excluded below.
survey_df["Gender"].value_counts()

# Series.str.contains() tests each value for a substring even when the
# Series contains NaN; na=False makes NaN rows evaluate to False instead of
# NaN, and regex=False treats ';' as a literal separator.
# '~' inverts the boolean mask element-wise, so where() keeps single-choice
# answers and writes NaN where the answer contains ';'.
#
# Only the Gender column is blanked. Calling DataFrame.where() on the whole
# frame would set EVERY column of those rows to NaN and silently drop the
# respondents' country, education, DevType and language answers from all
# later analyses.
survey_df["Gender"] = survey_df["Gender"].where(
    ~survey_df["Gender"].str.contains(";", na=False, regex=False),
    np.nan,
)

# After the cleanup only single-choice gender answers remain.
survey_df["Gender"].value_counts()

# A font with Hangul glyphs must be selected so Korean titles/labels render.
# Default fonts are configured through rcParams:
# - font.family        : default font family
# - axes.unicode_minus : when False, the minus sign is drawn as an ASCII hyphen
mpl.rcParams.update({
    "font.family": 'Nanum Pen',
    "axes.unicode_minus": False,
})

# Distribution of the respondents' country of residence.

# Question text behind the Country column.
schema_raw.loc['Country']

# Distinct values of Country.
survey_df.Country.unique()

# Number of distinct countries; nunique() leaves NaN out of the count by default.
survey_df.Country.nunique()

# The 15 countries with the most respondents (value_counts() sorts descending).
survey_df.Country.value_counts().iloc[:15]

top_countries = survey_df.Country.value_counts().head(15)
top_countries

# Bar chart of the respondent count for each country in top_countries.
# plt.xticks(rotation=90): rotation is the tick-label angle in degrees.
# The pyplot calls act on the current figure, so their order is kept as is.


plt.figure(figsize=(12,6))
plt.title("응답자의 지역")
plt.xticks(rotation=90)
sns.barplot(x=top_countries.index , y=top_countries)
plt.ylabel('응답자의 수')
    
plt.show()

- 설문 조사가 영어로 진행되고, 또 이들 국가의 영어 사용 인구가 가장 많기 때문에 응답자의 비율이 미국과 인도에서 불균형적으로 높은 것으로 보임

- 즉 이 설문 조사는 비영어권 국가의 글로벌 프로그래밍 커뮤니티를 대표하지 않을 수 있음을 알 수 있다.

# Pie chart of the gender answers.
gender = survey_df["Gender"].value_counts()

# Apply the "bright" seaborn style BEFORE any figure/axes exists: an axes
# takes its colour cycle from rcParams when it is created, so switching the
# style after plt.title() (which creates the axes) would not affect the pie.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names, so use whichever name this installation has
# and keep the current style if neither is available.
for style_name in ("seaborn-v0_8-bright", "seaborn-bright"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

labels = gender.index
sizes = gender
plt.title("Which of the following describe you, if any? Please check all that apply.")
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180)  # autopct prints the share on each wedge
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

- 남자가 더 많은걸 확인 할 수 있었음

# Horizontal count plot of EdLevel (y="EdLevel"), titled with the survey
# question text for that column. The pyplot calls act on the current figure,
# so their order is kept as is.
plt.figure(figsize=(12,6))
plt.title(schema_raw["EdLevel"])
plt.xticks(rotation=90)
sns.countplot(data=survey_df, y="EdLevel")
plt.xlabel("count")

plt.show()

# Express the education-level distribution in percent instead of raw counts.

edlevel = survey_df.EdLevel.value_counts()

# Each count divided by the sum of all counts, times 100 (shares add up to 100).
total = edlevel.div(edlevel.sum()).mul(100)
total

# Same figures obtained directly from value_counts(normalize=True).
EdLevel_pct = survey_df["EdLevel"].value_counts(normalize=True).mul(100)
EdLevel_pct

# Recompute the counts and percentages used by the bar chart.

edlevel = survey_df.EdLevel.value_counts()

total = edlevel.div(edlevel.sum()).mul(100)
total

# Horizontal bar chart: percentage (total) on x, education level on y,
# titled with the survey question text for EdLevel.
plt.figure(figsize=(12,6))
plt.title(schema_raw["EdLevel"])
sns.barplot(x=total, y=edlevel.index)

    
plt.show()

# Employment status: share of respondents (in percent) for the 15 most
# common Employment answers, titled with the survey question text.
employmnet = (survey_df.Employment.value_counts(normalize=True, ascending=False) * 100).head(15)
sns.barplot(x=employmnet, y=employmnet.index)
plt.title(schema_raw.Employment)
plt.xlabel('Percentage')  # was misspelled 'Percentange'; now matches the other charts
plt.show()  # render this chart on its own, like the other plotting cells

DevType column을 제일 작은 단위의 응답 항목으로 나누어 DataFrame 형태로 만들기

Series를 인자로 받아오기

value의 구분자는 ';'

Series는 items() 메서드를 사용하여 루프할 수 있음

반환 값으로 각 제일 작은 단위의 응답 항목을 column으로 갖는 DataFrame을 줌

응답자가 제일 작은 단위의 기술 스택을 선택한 경우 해당 열의 값을 True로, 아닌 경우 False로 한다.

def split_multicolumn(col_series):
    """Expand a ';'-separated multi-select column into boolean columns.

    Parameters
    ----------
    col_series : pandas.Series
        One answer per row; each non-null answer is a string of options
        joined by ';'.

    Returns
    -------
    pandas.DataFrame
        Same index as ``col_series``, one column per distinct option in
        order of first appearance. A cell is True when the row's answer
        contains that option, otherwise False (including null answers).
    """
    # Working frame; it also carries the raw answers in a "DevType" column.
    frame = pd.DataFrame(col_series.values, index=col_series.index, columns=["DevType"])
    # Option labels in the order they were first seen; used to select the output columns.
    seen_options = []
    answered = col_series.dropna()
    for row_label, raw_answer in answered.items():
        for choice in raw_answer.split(';'):
            # First time this option shows up: add an all-False column for it.
            if choice not in frame.columns:
                seen_options.append(choice)
                frame[choice] = False
            # Mark the option as selected for this respondent.
            frame.at[row_label, choice] = True
    return frame[seen_options]
# Reference for DataFrame.at (label-based access to a single cell):
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html
# Expand the DevType answers into one boolean column per option.
split_multicolumn(survey_df['DevType'])

##################################################################################

def split_multicolumn(Series):
    """Expand a ';'-separated multi-select column into boolean columns.

    Parameters
    ----------
    Series : pandas.Series
        One answer per row; each non-null answer is a string of options
        joined by ';'. Whitespace around each option is stripped.

    Returns
    -------
    pandas.DataFrame
        Same index as ``Series``, one bool column per distinct option (in
        order of first appearance). A cell is True when the row's answer
        contains that option and False otherwise; rows with a null answer
        are all False. When every answer is null the frame has no columns.
    """
    answered = Series.dropna()
    if answered.empty:
        # No answers at all: nothing to split (and .str is not usable on a
        # non-string Series), so return an index-only frame.
        return pd.DataFrame(index=Series.index)

    # One row per (respondent, option) pair; the index label repeats per option.
    picks = answered.str.split(';').explode().str.strip()

    # Columns come from EVERY option, not only the first one of each answer,
    # so all columns start out False and keep a bool dtype.
    options = picks.unique()
    df = pd.DataFrame(False, index=Series.index, columns=options)

    # Flag all respondents of one option per step instead of one cell at a time.
    for option in options:
        chosen_by = picks.index[(picks == option).to_numpy()]
        df.loc[chosen_by, option] = True
    return df
# Expand the DevType answers with the redefined split_multicolumn above.
split_multicolumn(survey_df['DevType'])

# Most popular programming languages in 2022.
b = split_multicolumn(survey_df.LanguageHaveWorkedWith)

# Number of respondents per language, most used first.
Language_type_totals = b.sum().sort_values(ascending=False)

# Convert to each language's share (in percent) of all language selections;
# both names refer to the same percentage Series.
Language_type_totals = Language_type_totals.div(Language_type_totals.sum()).mul(100)
Language_type = Language_type_totals

plt.figure(figsize=(12, 12))
plt.title("Languages used in the past year")
sns.barplot(x=Language_type, y=Language_type.index)
plt.xlabel('Percentage')

plt.show()

# Languages used by respondents who work in data science.

# DevType is a ';'-separated multi-select answer, so an equality test would
# keep only respondents whose ONLY role is data scientist. Match the role as
# a literal substring instead; na=False drops rows without a DevType answer.
devtype_dt = survey_df[
    survey_df.DevType.str.contains(
        'Data scientist or machine learning specialist', na=False, regex=False
    )
]
devtype_dt

devtype_dt_df = split_multicolumn(devtype_dt['LanguageHaveWorkedWith'])

# Number of these respondents per language, most used first.
devtype_dt_totals = devtype_dt_df.sum().sort_values(ascending=False)

# Each language's share (in percent) of all language selections in this group.
devtype_dt_totals_percentages = devtype_dt_totals / devtype_dt_totals.sum() * 100

plt.figure(figsize=(12,12))
plt.title("Languages used by Data scientist or machine learning specialist")
sns.barplot(x=devtype_dt_totals_percentages, y=devtype_dt_totals_percentages.index)
plt.xlabel('Percentage')

plt.show()

# Languages respondents most want to work with next year.
survey_df["LanguageWantToWorkWith"]


languagewant_dt_df = split_multicolumn(survey_df.LanguageWantToWorkWith)

# Number of respondents per wanted language, most wanted first.
languagewant_dt_totals = languagewant_dt_df.sum().sort_values(ascending=False)

# Each language's share (in percent) of all selections.
languagewant_dt_totals_percentages = languagewant_dt_totals.div(languagewant_dt_totals.sum()).mul(100)

plt.figure(figsize=(12, 12))
plt.title("LanguageWantToWorkWith")
sns.barplot(x=languagewant_dt_totals_percentages, y=languagewant_dt_totals_percentages.index)
plt.xlabel('Percentage')

plt.show()

# Most used databases. The column is read from survey_raw_df because
# DatabaseHaveWorkedWith is not among the columns selected into survey_df.

database_dt_df = split_multicolumn(survey_raw_df.DatabaseHaveWorkedWith)

# Number of respondents per database, most used first.
database_dt_totals = database_dt_df.sum().sort_values(ascending=False)

# Each database's share (in percent) of all selections.
database_dt_totals_percentages = database_dt_totals.div(database_dt_totals.sum()).mul(100)

plt.figure(figsize=(12, 12))
plt.title("Database")
sns.barplot(x=database_dt_totals_percentages, y=database_dt_totals_percentages.index)
plt.xlabel('Percentage')

plt.show()