# Engine.py
# Training entry point: reads labelled reviews from an Excel file, vectorizes
# the text, trains a logistic-regression classifier, and saves the vectorizer
# and the trained model.
# Import necessary libraries
import os
import config
import argparse
import pandas as pd
from Source.utils import save_file
from Source.model import vectorize
from Source.processing import process_text
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# Define a function to train the model
def train_model(X_train, X_test, y_train, y_test):
    """
    Fit a logistic-regression classifier and report its accuracy.

    The classifier is fitted on the training split only. Accuracy on the
    training and testing splits is printed as a percentage rounded to two
    decimal places.

    :param X_train: Training feature data
    :param X_test: Testing feature data
    :param y_train: Training labels
    :param y_test: Testing labels
    :return: the fitted LogisticRegression model
    """
    classifier = LogisticRegression().fit(X_train, y_train)

    def accuracy_pct(features, labels):
        # Share of correct predictions, as a percentage with two decimals.
        predicted = classifier.predict(features)
        return round(accuracy_score(labels, predicted) * 100, 2)

    # Score both splits before printing anything
    train_pct = accuracy_pct(X_train, y_train)
    test_pct = accuracy_pct(X_test, y_test)

    print(f"Train Accuracy: {train_pct}%")
    print(f"Test Accuracy: {test_pct}%")
    return classifier
# Define the main function to execute the entire workflow
def main(args):
    """
    Run the training workflow end to end.

    Reads the input Excel file, pre-processes the text column, vectorizes
    and splits the data, trains the model, and saves both the vectorizer
    and the trained model under ``config.output_path``.

    :param args: parsed command-line arguments; ``file_name``,
        ``output_name`` and ``vectorizer`` are read from it
    :return: None
    """
    # Resolve every file location up front
    source_path = os.path.join(config.input_path, args.file_name)
    vectorizer_path = os.path.join(config.output_path,
                                   f"{args.output_name}.pkl")
    classifier_path = os.path.join(config.output_path,
                                   f"{args.output_name}_lr.pkl")

    # Load the raw data, keeping only the text and label columns
    frame = pd.read_excel(source_path)[[config.text_col, config.label_col]]

    # Pre-process every review, then pull out the labels
    cleaned_reviews = [
        process_text(raw_text, config.stem)
        for raw_text in frame[config.text_col]
    ]
    labels = frame[config.label_col]

    # Settings forwarded to the vectorize-and-split helper
    vectorize_options = {
        "vect": args.vectorizer,
        "min_df": config.min_df,
        "ng_low": config.ng_low,
        "ng_high": config.ng_high,
        "test_size": config.test_size,
        "rs": config.rs,
    }
    X_train, X_test, y_train, y_test, text_vectorizer = vectorize(
        cleaned_reviews, labels, **vectorize_options
    )

    # Save the vectorizer before training, then save the trained model
    save_file(vectorizer_path, text_vectorizer)
    classifier = train_model(X_train, X_test, y_train, y_test)
    save_file(classifier_path, classifier)
# Check if the script is being run as the main program
if __name__ == "__main__":
    # Command-line options as (flag, default value, help text) triples
    cli_options = (
        ("--file_name", "Canva_reviews.xlsx", "Input file name"),
        ("--vectorizer", "bow",
         "Vectorizer, one of - 'bow', 'bowb', 'ng','tf'"),
        ("--output_name", "model", "Output file name"),
    )
    cli = argparse.ArgumentParser()
    for flag, default_value, help_text in cli_options:
        cli.add_argument(flag, type=str, default=default_value,
                         help=help_text)
    args = cli.parse_args()
    # Run the whole workflow with the parsed arguments
    main(args)