-
Notifications
You must be signed in to change notification settings - Fork 0
/
fd.py
222 lines (183 loc) · 8.4 KB
/
fd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import seaborn as sns
import streamlit as st
import pandas as pd
import subprocess
import jsonlines
from io import StringIO
from PIL import Image
@st.cache_data
def get_data(file: str, header: bool=True) -> pd.DataFrame:
"""Read the data file and return a dataframe.
Args:
file : str
File path of the data file.
header : bool, optional
True if the file has a header, False otherwise.
Returns:
df : pd.DataFrame
A pandas dataframe with the data read from the file.
"""
if header:
return pd.read_csv(file)
else:
df = pd.read_csv(file, header=None)
df.columns = [f"Column {i}" for i in range(df.shape[1])]
return df
@st.cache_data
def read_json(df: pd.DataFrame, df_cols: int, results: pd.DataFrame) -> pd.DataFrame:
"""Read the output file from Metanome and return a dataframe with the results.
Args:
df : pd.DataFrame
The dataframe with the data.
df_cols : int
The number of columns in the dataset.
results : pd.DataFrame
The dataframe with the results.
Returns:
results : pd.DataFrame
A pandas dataframe with the results.
"""
try:
# read the output file from Metanome
with jsonlines.open('results/data_fds') as lines:
for line in lines:
# get the determinants and dependant attributes
determinants = line['determinant']['columnIdentifiers']
for i in range(len(determinants)):
determinants[i] = determinants[i]['columnIdentifier']
dependant = line['dependant']['columnIdentifier']
# if the union of Determinants and Dependant contains all attributes, then the FD is filtered out
if len(determinants)+1 < df_cols:
score = df[determinants+[dependant]].drop_duplicates().shape[0]
row = {'Determinants': determinants, 'Dependant': dependant, 'Score': score}
results.loc[len(results)] = row
except Exception as exception:
print(exception)
if len(results) == 0:
return pd.DataFrame(columns=['Determinants', 'Dependant', 'Score'])
return results
@st.cache_data
def get_fds(results: pd.DataFrame, df_cols: int) -> pd.DataFrame:
"""This function takes the results of the FD algorithm and returns a dataframe with the FDs as columns and the attributes as rows.
Args:
results : pd.DataFrame
The results of the FD algorithm.
df_cols : int
The number of columns in the dataframe.
Returns:
rows : pd.DataFrame
A dataframe with Determinants, Dependant and Score as attributes and FDs as rows.
"""
# Convert the determinants to a string
results['Determinants'] = [', '.join(map(str, l)) for l in results['Determinants']]
# Create a new dataframe with the columns 'Determinants', 'Dependant' and 'Score'
#rows = pd.DataFrame(columns=['Determinants', 'Dependant', 'Score'])
rows = results.copy()
# Group the dataframe by the determinants
rows = rows.groupby('Determinants').agg({'Dependant': ', '.join, 'Score': 'max'}).reset_index()
# Loop through all rows of the dataframe
for row in rows.itertuples():
# Loop through all rows of the dataframe again
for row2 in rows.itertuples():
# If the determinants of row2 are a subset of the determinants of row
if row.Index != row2.Index and set(row2.Determinants.split(", ")).issubset(set(row.Determinants.split(", "))):
# Add the dependants of row2 to the dependants of row
rows.loc[row.Index, 'Dependant'] = rows.loc[row.Index, 'Dependant']+", "+row2.Dependant
# Remove all rows with more than df_cols columns
for row in rows.itertuples():
if len(row.Determinants.split(", "))+len(row.Dependant.split(", ")) >= df_cols:
rows = rows.drop(row.Index)
# Return the dataframe
return rows
icon = Image.open('icon.png')
st.set_page_config(
page_title="Functional Dependency App",
page_icon=icon,
layout="wide",
initial_sidebar_state="expanded",
)
# CSS to remove links
st.markdown("""
<style>
.css-15zrgzn {display: none}
.css-10pw50 {display: none}
.css-16a7pdm {display: none}
</style>
""", unsafe_allow_html=True)
# title of the app and the icon
col1, col2 = st.columns([2, 0.7])
col1.title("Functional Dependency App")
col2.image(icon, width=80)
# sidebar to select a dataset
with st.sidebar:
st.title("Select a Dataset")
# select a dataset
dataset = st.selectbox("Choose a Dataset from the Seaborn library",sns.get_dataset_names())
# load dataset
df = sns.load_dataset(dataset)
# upload a dataset
uploaded_file = st.file_uploader("Or upload a CSV file from your computer.")
header = True
if uploaded_file is not None and not uploaded_file.name.endswith(".csv"):
st.error("You need to select a CSV file.")
uploaded_file = None
if uploaded_file is not None:
try:
# select if header is present in the dataset
header = st.checkbox("Does the dataset have a header?", value=False)
df = get_data(uploaded_file, header)
dataset = uploaded_file.name
except(pd.errors.EmptyDataError):
st.error("The selected CSV file is empty.")
uploaded_file = None
# write the selected dataset to a csv file
if df is not None:
# remove all files from the results folder
subprocess.run(["rm", "-r", "results/*"])
# write the dataset to a csv file
df.to_csv("results/data.csv", index=False)
st.header("Selected Dataset")
st.dataframe(df, use_container_width=True)
# run Metanome through subprocess.run
# if the dataset doesn't have any FDs, then the output file will be empty
subprocess.run(["java", "-cp", "jars/*", "de.metanome.cli.App", "--algorithm", "de.metanome.algorithms.hyfd.HyFD",
"--file-key", "INPUT_GENERATOR", "--files", "results/data.csv", "--separator", ",",
"--header", "-o", "file:data"])
df_cols = len(df.columns)
df_rows = df.shape[0]
results = pd.DataFrame(columns=['Determinants', 'Dependant', 'Score'])
results = read_json(df, df_cols, results)
rows = get_fds(results, df_cols)
st.header("Functional Dependencies")
# write the results to the streamlit app
if rows.shape[0] > 0:
st.write("In the following dropdown menu you can see all the **Functional Dependencies** in the dataset. "
+"The **Functional Dependencies** are grouped by **Determinants** and sorted by their unique number of rows. ")
st.success("The dataset has **"+str(df_cols)+" attributes** and **"+str(df_rows)+" rows**. "
+"The HyFD Algorithm has found **"+str(len(results))+" Functional Dependencies**.")
st.write("An FD is filtered out if the union of Determinants and Dependants **contains ALL the attributes**.")
# sort the FDs by score
rows = rows.sort_values(by=['Score'], ascending=False)
selected = st.selectbox("Choose a FD to see the dataset normalized. (**Determinants -> Dependants: Score**)",rows['Determinants']+" -> "+rows['Dependant']+": "+rows['Score'].astype(str))
# get the Determinants and Dependants attributes from the selected FD
dependants = selected.split(" -> ")[1].split(", ")
dependants[-1] = dependants[-1].split(":")[0]
# get all the attributes of the selected FD based on the dataframe columns
attr1 = []
output1 = df.copy()
for attr in df.columns:
if str(attr) in selected:
attr1.append(attr)
# remove the Dependants attributes from the dataframe
if attr in dependants:
output1 = output1.drop(attr, axis=1)
output2 = df[attr1].copy()
output2 = output2.drop_duplicates()
col1, col2 = st.columns(2)
col1.write("Table without the FD's Dependants attributes:")
col1.dataframe(output1, use_container_width=True)
col2.write("Table for the selected FD:")
col2.dataframe(output2, use_container_width=True)
else:
st.warning("In the selected dataset, the **HyFD Algorithm** did **NOT** find any **FD** or they got filtered out.")
st.write("An FD is filtered out if the union of Determinants and Dependants **contains ALL the attributes**.")