forked from zmjones/deathpenalty
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_data.py
173 lines (160 loc) · 6.61 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import requests, re
from bs4 import BeautifulSoup
import pandas as pd
import csv
def read_table(url):
soup = BeautifulSoup(requests.get(url).content, "html.parser")
return soup.find(lambda tag: tag.name == 'table')
def parse_table(url):
table = read_table(url)
return [[td.get_text(strip=True)
if td.find("a") is None
else url + td.find("a")["href"]
for td in tr.find_all("td") if td.get_text(strip=True) is not u""]
for tr in table.find_all("tr")]
def parse_statement(url):
soup = BeautifulSoup(requests.get(url).content, "html.parser")
statements = [statement.text.strip() for statement in soup.findAll('p')]
try:
start = statements.index(u'Last Statement:')
except:
start = 0
statements = [statements[i] for i in range(0, len(statements)) if i > start]
return u''.join(statements)
def parse_details(url):
global counter
soup = BeautifulSoup(requests.get(url).content, "html.parser")
tdcj_num = ""
try:
for statement in soup.find(text='TDCJ Number').parent.find_next_siblings():
tdcj_num = statement.text.encode('utf-8').strip()
except:
tdcj_num = ""
birth_date = ""
try:
for statement in soup.find(text='Date of Birth').parent.find_next_siblings():
birth_date = statement.text.encode('utf-8').strip()
except:
birth_date = ""
date_received = ""
try:
for statement in soup.find(text='Date Received').parent.find_next_siblings():
date_received = statement.text.encode('utf-8').strip()
except:
date_received = ""
received_age = ""
try:
for statement in soup.find(text='Age (when Received)').parent.find_next_siblings():
received_age = statement.text.encode('utf-8').strip()
except:
received_age = ""
edu_level = ""
try:
for statement in soup.find(text='Education Level (Highest Grade Completed)').parent.find_next_siblings():
edu_level = statement.text.encode('utf-8').strip()
except:
edu_level = ""
statement5 = ""
try:
for statement in soup.find(text='Date of Offense').parent.find_next_siblings():
statement5 = statement.text.encode('utf-8').strip()
except:
statement5 = ""
offense_age = ""
try:
for statement in soup.find(text='Age (at the time of Offense)').parent.find_next_siblings():
offense_age = statement.text.encode('utf-8').strip()
except:
offense_age = ""
gender = ""
try:
for statement in soup.find(text='Gender').parent.find_next_siblings():
gender = statement.text.encode('utf-8').strip()
except:
gender = ""
hair_color = ""
try:
for statement in soup.find(text='Hair Color').parent.find_next_siblings():
hair_color = statement.text.encode('utf-8').strip()
except:
hair_color = ""
height = ""
try:
for statement in soup.find(text='Height').parent.find_next_siblings():
height = statement.text.encode('utf-8').strip()
except:
height = ""
weight = ""
try:
for statement in soup.find(text='Weight').parent.find_next_siblings():
weight = statement.text.encode('utf-8').strip()
except:
weight = ""
eye_color = ""
try:
for statement in soup.find(text='Eye Color').parent.find_next_siblings():
eye_color = statement.text.encode('utf-8').strip()
except:
eye_color = ""
native_county = ""
try:
for statement in soup.find(text='Native County').parent.find_next_siblings():
native_county = statement.text.encode('utf-8').strip()
except:
native_county = ""
native_state = ""
try:
for statement in soup.find(text='Native State').parent.find_next_siblings():
native_state = statement.text.encode('utf-8').strip()
except:
native_state = ""
prior_occupation = ""
try:
for statement in soup.find(text='Prior Occupation').parent.find_next_siblings():
prior_occupation = statement.nextSibling.encode('utf-8').strip()
except:
prior_occupation = ""
prison_record = ""
try:
for statement in soup.find(text='Prior Prison Record').parent.find_next_siblings():
prison_record = statement.nextSibling.encode('utf-8').strip()
except:
prison_record = ""
incident_summary = ""
try:
for statement in soup.find(text='Summary of Incident').parent.find_next_siblings():
incident_summary = statement.nextSibling.encode('utf-8').strip()
except:
incident_summary = ""
coDefendants = ""
try:
for statement in soup.find(text='Co-Defendants').parent.find_next_siblings():
coDefendants = statement.nextSibling.encode('utf-8').strip()
except:
coDefendants = ""
victim_info = ""
try:
for statement in soup.find(text='Race and Gender of Victim').parent.find_next_siblings():
victim_info = statement.nextSibling.encode('utf-8').strip()
except:
victim_info = ""
counter = counter + 1
print("Line Item: " + str(counter))
f.writerow([tdcj_num, birth_date, date_received, received_age, edu_level, statement5, offense_age, gender,
hair_color, height, weight, eye_color, native_county, native_state,
prior_occupation, prison_record, incident_summary, coDefendants, victim_info])
counter = 0
columns = ['Execution #', 'Offender Information Link', 'Last Statement Link', 'Last Name',
'First Name', 'TDCJ Number', 'Age', 'Execution Date', 'Race', 'County']
data = parse_table('http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html')
del data[0]
data = pd.DataFrame(data, columns = columns)
data.ix[:, (1,2)] = data.ix[:, (1,2)].applymap(lambda x: re.sub('dr_executed_offenders.html', '', x))
f = csv.writer(open("offender_details.csv", "w"))
f.writerow(["TDCJ Number","Date of Birth", "Date Received", "Age when Received", "Education Level", "Date of Offense",
"Age at Offense", "Gender", "Hair Color", "Height", "Weight", "Eye Color", "Native County",
"Native State", "Prior Occupation", "Prior Prison Record", "Summary of Incident", "Co-Defendents","Victim Information"])
data['Statement Text'] = data['Last Statement Link'].map(lambda x: parse_statement(x))
data['Offender Information Link'].map(lambda x: parse_details(x))
# data['offender_information'].map(lambda x: parse_details('https://www.tdcj.state.tx.us/death_row/dr_info/wardadam.html'))
data.to_csv('TDCJ_with_statements.csv', index=False, encoding='utf-8')