# Final_output.py
# coding: utf-8
# In[1]:
import pandas as pd
# Read Intermediate_Output.csv into a dataframe.
df = pd.read_csv('Intermediate_Output.csv')
# Preview the first five rows of the dataframe.
df.head()
# In[2]:
# Order the rows by the 'Cluster ID' column and renumber them from zero.
df = df.sort_values(by='Cluster ID').reset_index(drop=True)
# Largest cluster id present in the data.
df['Cluster ID'].max()
# In[3]:
# Preview the first five rows again, now in sorted order.
df.head()
# In[4]:
df
# In[5]:
# Empty dataframe that will receive the rows of the final output file.
df1 = pd.DataFrame(columns=['ln','dob','gn','fn'])
# In[9]:
# Build one output row per distinct cluster id, using the first row of each
# cluster (in the current order of df) as its representative.
#   * If the value in the second column of that row is missing, the row's
#     columns at positions 3..6 are copied unchanged, under their own labels.
#   * Otherwise the last four columns are used, with the first two characters
#     and the final character stripped from each value.
#     NOTE(review): presumably these columns hold a "b'...'" style repr whose
#     wrapper is being removed -- confirm against Intermediate_Output.csv.
# Rows are gathered in a list and turned into a frame once, because
# DataFrame.append no longer exists in pandas 2.x and re-filtering df for
# every id is quadratic. Ids absent from the data are simply not emitted
# instead of raising IndexError.
representatives = (
    df.drop_duplicates(subset='Cluster ID', keep='first')
      .sort_values('Cluster ID')
)
records = []
for _, row in representatives.iterrows():
    if pd.isna(row.iloc[1]):
        records.append(row.iloc[3:7].to_dict())
    else:
        ln, dob, gn, fn = (row.iloc[pos][2:-1] for pos in (-4, -3, -2, -1))
        records.append({'ln': ln, 'dob': dob, 'gn': gn, 'fn': fn})

# dtype=object keeps every value exactly as read (no int -> float upcasting
# when a column mixes numbers with missing values).
collapsed = pd.DataFrame(records, dtype=object)
# Keep df1's columns first; any differently-labelled source columns follow,
# which is where append used to place them.
extra_columns = [col for col in collapsed.columns if col not in df1.columns]
collapsed = collapsed.reindex(columns=[*df1.columns, *extra_columns])
if df1.empty:
    df1 = collapsed
else:
    df1 = pd.concat([df1, collapsed], ignore_index=True)
# In[11]:
# Renumber the rows 0..n-1, then write the result to Final_Output.csv
# without an index column.
df1.reset_index(drop=True, inplace=True)
df1.to_csv(path_or_buf='Final_Output.csv', index=False)