import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
# Read the labelled log records from disk.
log_data = pd.read_csv('log_file.csv')

# The raw message text is the model input; the parsed label is the target.
features, labels = log_data['log_message'], log_data['parsed_info']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Build the bag-of-words vocabulary from the training messages only, then
# reuse that same vocabulary to encode the held-out messages.
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Fit the classifier (fit() returns the estimator itself) and label the
# held-out messages.
model = LogisticRegression().fit(X_train_vectors, y_train)
predictions = model.predict(X_test_vectors)

# Accuracy is the fraction of held-out rows whose predicted label equals the
# true label.
is_correct = predictions == y_test
accuracy = np.mean(is_correct)
print("Accuracy:", accuracy)
A sample CSV file could look like this:
log_message,parsed_info
[INFO] Application started,START_EVENT
[ERROR] Invalid input detected,INPUT_ERROR
[INFO] Processing completed,PROCESS_COMPLETE
[WARNING] Low disk space,DISK_WARNING
[DEBUG] Connection timed out,CONNECTION_TIMEOUT
[ERROR] Database connection failed,DB_CONNECTION_ERROR
No comments:
Post a Comment