-
Notifications
You must be signed in to change notification settings - Fork 0
/
misc.py
120 lines (84 loc) · 5.03 KB
/
misc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas as pd
def preprocess(data):
    """
    Clean a raw survey dataframe and collapse categories into coarser demographic groups.

    The input dataframe is not modified; a new dataframe is returned. Rows keep
    their original index labels (the index is not reset).

    Steps
    -----
    * Drop the 'Country' column and every row whose 'Age' is '14 - 17'.
    * Race: 'latino' is merged into 'hispanic'; anything that is not
      'white', 'black', 'hispanic' or 'other' (including missing values)
      becomes 'other'.
    * Education: 'university'/'postgraduate' become 'College'; 'high_school',
      'other', 'vocational_technical_college' and 'middle_school' become
      'No college'. Other values are left unchanged.
    * Party: Strong/Weak/Lean Republican become 'Republican', Strong/Weak/Lean
      Democrat become 'Democrat'. Other values are left unchanged.
    * 'US Census Division': missing values become 'Unknown'.
    * House vote: 'Will vote other/not sure' and "Won't Vote" both become
      "Won't Vote Major Party".
    * 'US Congressional District': rewritten as '<Area>-<district number>'.
      Rows without a district get the Area name alone.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw survey responses. Must contain the columns 'Country', 'Age',
        'Race', 'Education', 'What is your political party affiliation?',
        'US Census Division',
        'Who will you vote for in the House of Representatives in 2018?',
        'Area', 'Postal Code' and 'US Congressional District'.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy of ``data``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    party_col = 'What is your political party affiliation?'
    vote_col = 'Who will you vote for in the House of Representatives in 2018?'
    district_col = 'US Congressional District'

    # Remove respondents in the 14-17 age range, then drop the unused column.
    # Boolean filtering followed by drop() yields a new frame, so the caller's
    # dataframe is never written to by the assignments below.
    index = data[data['Age'] != '14 - 17'].drop(['Country'], axis=1)

    # Combine the hispanic and latino categories
    index['Race'] = index['Race'].str.replace('latino', 'hispanic', regex=False)

    # Combine other races (and missing values) into the "other" category
    races = ['white', 'black', 'hispanic', 'other']
    index.loc[~index['Race'].isin(races), 'Race'] = 'other'

    # Separate education levels into "College" and "No college"
    bachelors = ['university', 'postgraduate']
    no_bachelors = ['high_school', 'other', 'vocational_technical_college', 'middle_school']
    has_degree = index['Education'].isin(bachelors)
    no_degree = index['Education'].isin(no_bachelors)
    index.loc[has_degree, 'Education'] = 'College'
    index.loc[no_degree, 'Education'] = 'No college'

    # Combine political affiliations to compensate for respondents who claim
    # to be independent but aren't
    republican = ['Strong Republican', 'Weak Republican', 'Lean Republican/Independent']
    democrat = ['Strong Democrat', 'Weak Democrat', 'Lean Democrat/Independent']
    is_republican = index[party_col].isin(republican)
    is_democrat = index[party_col].isin(democrat)
    index.loc[is_republican, party_col] = 'Republican'
    index.loc[is_democrat, party_col] = 'Democrat'

    # Impute "Unknown" for missing US Census Division values
    index['US Census Division'] = index['US Census Division'].fillna('Unknown')

    # Combine respondents who won't vote for a major party in 2018 with those
    # who won't vote at all
    no_major_party = ['Will vote other/not sure', "Won't Vote"]
    index.loc[index[vote_col].isin(no_major_party), vote_col] = "Won't Vote Major Party"

    # Impute the congressional district for respondents living in states with
    # only one district.
    # NOTE(review): the condition tests for a missing 'Postal Code', not a
    # missing district -- kept as in the original; confirm this is intended.
    at_large = ['Alaska', 'Delaware', 'Montana', 'North Dakota', 'South Dakota',
                'Vermont', 'Wyoming', 'District of Columbia']
    index.loc[index['Area'].isin(at_large) & index['Postal Code'].isnull(), district_col] = 1

    # Concatenate the Area and district number to get the full district name.
    # The "has a district" mask is taken BEFORE the string conversion, because
    # converting to str can turn missing values into the literal text 'nan'.
    has_district = index[district_col].notna()
    # Districts are typically stored as floats, so strip the '.0' suffix
    # ('10.0' -> '10').
    district_text = index[district_col].astype(str).str.replace('.0', '', regex=False)
    full_name = index['Area'] + '-' + district_text
    # Rows without a district keep just the Area name (no '-nan' suffix).
    index[district_col] = full_name.where(has_district, index['Area'])

    return index
def filter_df(data, party=None, gender=None, age=None, race=None, education=None):
    """
    Filter a survey dataframe based on demographics, ignoring columns that have been dropped.

    Every filter is optional. A filter is applied only when its argument is
    not None and the corresponding column exists in ``data``; otherwise it is
    skipped silently. Rows whose value in a filtered column is missing are
    excluded from the result.

    The arguments must match the labels actually present in ``data`` (matching
    is case-sensitive). For a frame produced by ``preprocess`` these are, for
    example, 'Republican'/'Democrat' for party, lowercase 'white'/'black'/
    'hispanic'/'other' for race and 'College'/'No college' for education.

    Parameters
    ----------
    data : pandas.DataFrame
        Survey responses.
    party : str, optional
        Literal substring looked up in 'What is your political party
        affiliation?'.
    gender : str, optional
        Prefix of the 'Gender' value.
    age : str, optional
        Prefix of the 'Age' value.
    race : str, optional
        Prefix of the 'Race' value.
    education : str, optional
        Pattern searched for in 'Education'. It is passed to
        ``Series.str.contains`` with its default settings, so it is treated
        as a regular expression.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that satisfy every applied filter. When no
        filter applies, ``data`` itself is returned.
    """
    party_col = 'What is your political party affiliation?'
    results = data

    if party is not None and party_col in results.columns:
        # Substring match: 'Independent' also matches raw labels such as
        # 'Lean Republican/Independent'.
        results = results[results[party_col].str.contains(party, regex=False, na=False)]

    # Prefix match rather than substring, so that 'Male' does not also select
    # 'Female'. na=False keeps the mask strictly boolean when values are missing.
    prefix_filters = (('Gender', gender), ('Age', age), ('Race', race))
    for column, prefix in prefix_filters:
        if prefix is not None and column in results.columns:
            results = results[results[column].str.startswith(prefix, na=False)]

    if education is not None and 'Education' in results.columns:
        results = results[results['Education'].str.contains(education, na=False)]

    return results