-- Living Mobile --: Code snippet: logistic regression for parsing log files

Saturday, June 24, 2023

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

# Train and evaluate a bag-of-words logistic-regression classifier that maps
# each raw log message to its parsed-information label.

# Load log data into a pandas DataFrame.  The CSV must provide a
# 'log_message' column (raw text) and a 'parsed_info' column (target label).
log_data = pd.read_csv('log_file.csv')

# pandas reads empty CSV fields as NaN.  CountVectorizer rejects NaN documents
# and the classifier cannot be fitted on NaN labels, so drop incomplete rows
# up front instead of failing later in the pipeline.
log_data = log_data.dropna(subset=['log_message', 'parsed_info'])

# Define the features and labels for log parsing.  Cast messages to str so a
# column that pandas inferred as numeric is still valid vectorizer input.
features = log_data['log_message'].astype(str)
labels = log_data['parsed_info']

# Split the data into training and testing sets (80% / 20%); the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Convert log messages to token-count feature vectors.  The vocabulary is
# learned from the training messages only and then reused for the test
# messages, so no information from the test set leaks into training.
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Train a logistic regression model.  The default iteration cap (100) is
# often too low for the solver to converge on unscaled count features, so
# allow more iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectors, y_train)

# Predict the parsed information for test data
predictions = model.predict(X_test_vectors)

# Evaluate the model: accuracy is the fraction of test messages whose
# predicted label equals the true label.  Compare plain arrays so the
# element-wise comparison is positional rather than index-based.
accuracy = np.mean(predictions == y_test.to_numpy())
print("Accuracy:", accuracy)

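A sample CSV file (saved as log_file.csv) could look like this. Note that this sample only illustrates the format: a real training set needs many example messages per label, otherwise labels in the test split will never have been seen during training.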

log_message,parsed_info

[INFO] Application started,START_EVENT

[ERROR] Invalid input detected,INPUT_ERROR

[INFO] Processing completed,PROCESS_COMPLETE

[WARNING] Low disk space,DISK_WARNING

[DEBUG] Connection timed out,CONNECTION_TIMEOUT

[ERROR] Database connection failed,DB_CONNECTION_ERROR

-- Living Mobile --