-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_data.py
35 lines (30 loc) · 973 Bytes
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# -*- coding: utf-8 -*-
from tweets import Helpers
from sklearn.model_selection import train_test_split
from data import original_train_data as data
import pandas as pd
# Clean the raw training data before it is exported. The return value is
# discarded, so this presumably mutates `data` in place (and may be what
# supplies the `target_relabeled` attribute read below) -- TODO confirm
# against tweets.Helpers.
Helpers.correct_data(data)

# NOTE: an earlier revision of this script held out 20% of the rows with
# train_test_split and wrote them to ./data/validation.csv. That split is
# disabled, so every row is exported to the training file.

# Output column name -> attribute of `data` the column is read from.
# Only the label is renamed: `target_relabeled` is exported as `target`.
_COLUMN_SOURCES = {
    'id': 'id',
    'keyword': 'keyword',
    'location': 'location',
    'text': 'text',
    'target': 'target_relabeled',
}

train = pd.DataFrame({
    out_name: getattr(data, source_name)
    for out_name, source_name in _COLUMN_SOURCES.items()
})

# index=False keeps the DataFrame's row index out of the CSV.
train.to_csv('./data/train.csv', index=False)