-
Notifications
You must be signed in to change notification settings - Fork 0
/
processing.py
69 lines (56 loc) · 2.59 KB
/
processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Import necessary libraries and modules
import os
import re
import config # Assumes a 'config.py' file is present
import argparse
import pandas as pd
from tqdm import tqdm
from Source.utils import save_file
from nltk.tokenize import word_tokenize
# Define the main function for preprocessing text data
def main(args_):
    """Clean the text column of a CSV file, tokenize it, and save the tokens.

    Steps, in order: drop rows whose text is missing, lowercase, replace
    punctuation (except apostrophes) with a space, delete digits, delete runs
    of two or more 'x', collapse repeated spaces, then tokenize each text
    with NLTK's ``word_tokenize``.

    Args:
        args_: Object exposing the attributes ``input_path``, ``file_name``,
            ``col_name``, ``output_path`` and ``token_file`` (for example the
            namespace returned by ``argparse``).

    The result, a list holding one token list per remaining row, is handed to
    ``save_file`` together with the path ``output_path/token_file``.
    """
    # Compile every pattern once up front instead of passing the pattern text
    # to re.sub for each row. All patterns are raw strings: "\d" inside a
    # plain literal is an invalid escape sequence and triggers a warning on
    # recent Python versions.
    punctuation_re = re.compile(r"[^\w\d'\s]+")
    digits_re = re.compile(r"\d+")
    repeated_x_re = re.compile(r"[x]{2,}")
    spaces_re = re.compile(r" +")

    print("Reading the data file...")
    # Read the data from a CSV file
    data = pd.read_csv(os.path.join(args_.input_path, args_.file_name))
    # Remove rows with missing values in the specified column
    data = data.dropna(subset=[args_.col_name])
    # Extract the text data; coerce to str so that non-string cells (e.g. a
    # column parsed as numbers) do not fail on .lower() below
    input_text = data[args_.col_name].astype(str)

    # Convert text to lowercase
    print("Converting text to lower case...")
    input_text = [text.lower() for text in tqdm(input_text)]

    # Remove punctuations except apostrophe (replaced by a space so that the
    # words on either side stay separated)
    print("Removing punctuations in text...")
    input_text = [punctuation_re.sub(" ", text) for text in tqdm(input_text)]

    # Remove digits
    print("Removing digits in text...")
    input_text = [digits_re.sub("", text) for text in tqdm(input_text)]

    # Remove more than one consecutive instance of 'x'; the text is already
    # lowercase here, so runs of upper-case 'X' in the input are removed too
    print("Removing 'xxxx...' in text")
    input_text = [repeated_x_re.sub("", text) for text in tqdm(input_text)]

    # Replace multiple spaces with a single space
    print("Removing additional spaces in text...")
    input_text = [spaces_re.sub(" ", text) for text in tqdm(input_text)]

    # Tokenize the text using NLTK
    print("Tokenizing the text...")
    tokens = [word_tokenize(text) for text in tqdm(input_text)]

    # Save the tokenized data to a file
    print("Saving tokens...")
    save_file(os.path.join(args_.output_path, args_.token_file), tokens)
# Entry point of the script
if __name__ == "__main__":
    # Command-line options as (flag, default taken from config.py, help text).
    # Every option is a string and falls back to the configured value when it
    # is not given on the command line.
    cli_options = (
        ("--file_name", config.file_name, "Input file name"),
        ("--col_name", config.col_name, "Text column name"),
        ("--input_path", config.input_folder, "Input folder name"),
        ("--output_path", config.output_folder, "Output folder name"),
        ("--token_file", config.token_file, "File containing word tokens"),
    )

    parser = argparse.ArgumentParser()
    for flag, fallback, description in cli_options:
        parser.add_argument(flag, type=str, default=fallback, help=description)

    # Parse the command line and run the preprocessing pipeline
    args = parser.parse_args()
    main(args)