Saturday, June 24, 2023

Code to extract the timestamp and error type from a log using ML

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression


# Load the log data; the CSV is expected to provide the columns
# 'log_message', 'timestamp' and 'error_code'.
log_data = pd.read_csv('log_file.csv')

# Model input: the raw log message text.
features = log_data['log_message']

# Label column for the timestamp model.
timestamps = log_data['timestamp']

# Label column for the error-code model.  Rows without an error leave the
# error_code field empty, which pandas reads as NaN; scikit-learn classifiers
# reject NaN labels, so map the missing values to an explicit class instead.
error_codes = log_data['error_code'].fillna('NO_ERROR')


# Hold out 20% of the messages (fixed seed) for scoring the timestamp model.
# NOTE(review): each distinct timestamp string is treated as its own class, so
# a held-out timestamp that never appears in training cannot be predicted --
# confirm this formulation is intended.
timestamp_split = train_test_split(
    features,
    timestamps,
    random_state=42,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = timestamp_split

# Bag-of-words encoding: the vocabulary is learned from the training messages
# only and then applied unchanged to the held-out messages.
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Fit the timestamp classifier (fit() returns the estimator itself).
timestamp_model = LogisticRegression().fit(X_train_vectors, y_train)

# Label the held-out messages.
timestamp_predictions = timestamp_model.predict(X_test_vectors)

# Accuracy = fraction of held-out rows whose predicted label matches exactly.
timestamp_hits = timestamp_predictions == y_test
timestamp_accuracy = np.mean(timestamp_hits)
print("Timestamp Accuracy:", timestamp_accuracy)


# Split the data into training and testing sets for error code parsing
# (80/20, fixed seed so the run is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    features, error_codes, test_size=0.2, random_state=42
)

# Give this model its own vectorizer rather than refitting the shared
# `vectorizer` in place: refitting replaces the vocabulary that
# `timestamp_model` was trained against, so the two models would silently
# depend on both splits staying identical.
error_code_vectorizer = CountVectorizer()
X_train_vectors = error_code_vectorizer.fit_transform(X_train)
X_test_vectors = error_code_vectorizer.transform(X_test)

# Train a logistic regression model for error code parsing
error_code_model = LogisticRegression()
error_code_model.fit(X_train_vectors, y_train)

# Predict the error codes for test data
error_code_predictions = error_code_model.predict(X_test_vectors)

# Evaluate the error code model: fraction of exact label matches.
error_code_accuracy = np.mean(error_code_predictions == y_test)
print("Error Code Accuracy:", error_code_accuracy)


A sample input file (log_file.csv) looks like this:


log_message,timestamp,error_code

[INFO] Application started,2023-06-01 10:23:45,

[ERROR] Invalid input detected,2023-06-02 14:57:21,ERR001

[INFO] Processing completed,2023-06-03 09:12:34,

[WARNING] Low disk space,2023-06-04 16:35:02,

[DEBUG] Connection timed out,2023-06-05 11:45:10,

[ERROR] Database connection failed,2023-06-06 08:21:56,ERR002





No comments:

Post a Comment