-
Notifications
You must be signed in to change notification settings - Fork 0
/
Code.py
133 lines (93 loc) · 3.89 KB
/
Code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
# Load the transaction dataset.
df = pd.read_csv('/content/drive/MyDrive/financial_anomaly_data.csv')

# Columns fed to the anomaly detector. The name has to be bound before the
# imputation step below, which is its first use in the script.
features = ['Amount']  # Adjust based on your features

# Report how many values are missing in each column.
print(df.isnull().sum())

# Impute missing feature values (replace NaN with the column mean).
imputer = SimpleImputer(strategy='mean')
df[features] = imputer.fit_transform(df[features])

# Standardize each feature to zero mean and unit variance. A constant column
# has a standard deviation of 0; dividing by 1 instead leaves it at 0 rather
# than turning every value into NaN, which IsolationForest would reject.
feature_std = df[features].std().replace(0, 1)
df[features] = (df[features] - df[features].mean()) / feature_std

# Train the Isolation Forest; contamination=0.01 flags about 1% of rows.
model = IsolationForest(contamination=0.01)
model.fit(df[features])

# predict() returns -1 for outliers and 1 for inliers.
df['Fraudulent'] = model.predict(df[features])

# Show the rows the model flagged as outliers.
potential_fraud = df[df['Fraudulent'] == -1]
print(potential_fraud)
# Example feature engineering: parse the timestamps once and reuse the result.
timestamps = pd.to_datetime(df['Timestamp'])
df['TransactionHour'] = timestamps.dt.hour

# Filter data for June 2023. The comparison runs on the parsed datetimes, not
# on the raw column: comparing raw text against '2023-06-01' is only correct
# for ISO-formatted strings and raises TypeError when a timestamp is missing.
# pandas parses the string bounds as datetimes; missing timestamps (NaT)
# compare False and are left out.
in_june = (timestamps >= '2023-06-01') & (timestamps < '2023-07-01')
june_data = df[in_june]

# Group by TransactionType and total the 'Amount' column.
# NOTE(review): if 'Amount' was standardized in place earlier in the script,
# these totals are sums of z-scores rather than currency amounts - confirm.
monthly_spend = june_data.groupby('TransactionType')['Amount'].sum()
print(monthly_spend)
import warnings
# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the rows; random_state keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Fit an unfitted copy of the detector on the training rows. Refitting `model`
# itself would replace the estimator that produced df['Fraudulent'], so scores
# computed from `model` afterwards would no longer match those labels.
eval_model = clone(model)
eval_model.fit(train[features])

# Predict on the held-out rows.
predictions = eval_model.predict(test[features])

# NOTE(review): 'Fraudulent' holds this script's own earlier predictions, not
# ground-truth labels, so the report measures agreement between two fits of
# the same detector rather than real detection accuracy.
print(classification_report(test['Fraudulent'], predictions))
import matplotlib.pyplot as plt
import seaborn as sns
# Convert 'Timestamp' to datetime format so the x-axis is a real time axis.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Anomaly score for every row; negative values are the ones predict() marks -1.
anomaly_scores = model.decision_function(df[features])

# Translate the -1 / 1 predictions into readable class names before plotting,
# so seaborn builds the legend entries itself. Passing a dict as
# plt.legend(labels=...) does not work: matplotlib expects a sequence of
# strings and would iterate over the dict's keys.
scatter_classes = df['Fraudulent'].map({-1: 'Potential Fraud', 1: 'Normal'})
scatter_colors = {'Potential Fraud': 'red', 'Normal': 'green'}

# Scatter plot of anomaly score against time, coloured by predicted class.
plt.figure(figsize=(12, 6))
sns.scatterplot(x=df['Timestamp'], y=anomaly_scores, hue=scatter_classes, palette=scatter_colors)
plt.title('Anomaly Scores Over Time')
plt.xlabel('Timestamp')
plt.ylabel('Anomaly Score')
plt.legend(title='Fraudulent', loc='upper right')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Histogram of the anomaly scores (50 bins) with a KDE curve overlaid.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(anomaly_scores, bins=50, kde=True, color='skyblue', ax=hist_ax)
hist_ax.set_title('Distribution of Anomaly Scores')
hist_ax.set_xlabel('Anomaly Score')
hist_ax.set_ylabel('Frequency')
hist_fig.tight_layout()
plt.show()
# Box plot of anomaly scores per predicted class.
# The -1 / 1 predictions are mapped to class names so each box is labelled by
# its own data. Overriding the ticks by position is error-prone: seaborn sorts
# numeric categories, which puts -1 (potential fraud) at position 0, so the
# tick labels ['Normal', 'Potential Fraud'] were attached to the wrong boxes.
box_classes = df['Fraudulent'].map({-1: 'Potential Fraud', 1: 'Normal'})
plt.figure(figsize=(8, 6))
sns.boxplot(
    x=box_classes,
    y=anomaly_scores,
    order=['Potential Fraud', 'Normal'],  # same left-to-right layout as before
    palette={'Potential Fraud': 'red', 'Normal': 'green'},
)
plt.title('Box Plot of Anomaly Scores for Normal and Fraudulent Transactions')
plt.xlabel('Transaction Class')
plt.ylabel('Anomaly Score')
plt.tight_layout()
plt.show()
# Violin plot of anomaly scores per predicted class, with quartile lines.
# The -1 / 1 predictions are mapped to class names so each violin is labelled
# by its own data. Overriding the ticks by position is error-prone: seaborn
# sorts numeric categories, which puts -1 (potential fraud) at position 0, so
# the tick labels ['Normal', 'Potential Fraud'] were attached to the wrong
# violins.
violin_classes = df['Fraudulent'].map({-1: 'Potential Fraud', 1: 'Normal'})
plt.figure(figsize=(10, 6))
sns.violinplot(
    x=violin_classes,
    y=anomaly_scores,
    order=['Potential Fraud', 'Normal'],  # same left-to-right layout as before
    palette={'Potential Fraud': 'red', 'Normal': 'green'},
    inner="quartile",
)
plt.title('Violin Plot of Anomaly Scores for Normal and Fraudulent Transactions')
plt.xlabel('Transaction Class')
plt.ylabel('Anomaly Score')
plt.tight_layout()
plt.show()
# Assumes df already has the 'Fraudulent' column and that `model` is fitted.
features = ['Amount']  # Adjust based on your features

# Store the model's anomaly score for every row in the DataFrame.
df['AnomalyScore'] = model.decision_function(df[features])

# Columns for the pair plot; 'Fraudulent' is used only for colouring.
pair_plot_cols = ['Amount', 'AnomalyScore', 'Fraudulent']

# sns.pairplot builds its own figure, so calling plt.figure() first would only
# leave an extra empty figure for plt.show() to render. The size is set through
# height/aspect instead: two plotted variables at height=4, aspect=1.5 give a
# grid of about 12 x 8 inches.
sns.pairplot(
    df[pair_plot_cols],
    hue='Fraudulent',
    palette={-1: 'red', 1: 'green'},
    height=4,
    aspect=1.5,
)
# The pair grid is the current figure here, so the title lands on it.
plt.suptitle('Pair Plot of Relevant Variables', y=1.02)
plt.show()