import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
# Load log data from a CSV file.
# The columns used below are 'timestamp', 'user_id' and 'action'.
log_data = pd.read_csv('log_file.csv')

# Preprocessing: encode categorical variables as integer labels.
# Each column gets its own encoder so both mappings remain recoverable via
# inverse_transform(); refitting a single shared encoder would throw away the
# user_id mapping as soon as 'action' is fitted.
user_id_encoder = LabelEncoder()
action_encoder = LabelEncoder()
log_data['user_id'] = user_id_encoder.fit_transform(log_data['user_id'])
log_data['action'] = action_encoder.fit_transform(log_data['action'])
# Legacy name: previously one encoder was reused and ended up fitted on
# 'action', so the alias points at the action encoder.
label_encoder = action_encoder

# User Session Identification: group log entries into per-user sessions.
# A new session starts at a user's first entry, or when more than
# `session_duration` has passed since that same user's previous entry.
session_duration = pd.Timedelta(minutes=30)
log_data['timestamp'] = pd.to_datetime(log_data['timestamp'])
# Gaps must be measured between consecutive entries of the SAME user; a plain
# diff() over the whole frame compares unrelated users and merges interleaved
# activity into one session.
ordered_log = log_data.sort_values(['user_id', 'timestamp'])
gap_since_previous = ordered_log.groupby('user_id')['timestamp'].diff()
starts_new_session = gap_since_previous.isna() | (gap_since_previous > session_duration)
# The assignment aligns on the index, so log_data keeps its original row
# order. Subtracting 1 keeps session ids 0-based.
log_data['session_id'] = starts_new_session.cumsum() - 1

# Behavioral Metrics Calculation: per-session owner, time span, action count
# and duration.
session_metrics = log_data.groupby('session_id').agg(
    user_id=('user_id', 'first'),
    start_time=('timestamp', 'min'),
    end_time=('timestamp', 'max'),
    action_count=('action', 'count'),
)
# Store the duration as a plain float so it can be plotted (and used as a
# numeric feature) without relying on Timedelta support downstream.
session_metrics['duration_seconds'] = (
    session_metrics['end_time'] - session_metrics['start_time']
).dt.total_seconds()

# Anomaly Detection: identify anomalous user sessions.
# Adjust contamination based on the expected anomaly rate; random_state is
# fixed so repeated runs flag the same sessions.
model = IsolationForest(contamination=0.05, random_state=42)
# fit_predict() returns -1 for outliers and 1 for inliers; convert to a real
# boolean so that is_anomaly == True actually means "anomalous".
session_metrics['is_anomaly'] = (
    model.fit_predict(session_metrics[['action_count']]) == -1
)

# Visualization: plot session duration against action count.
plt.scatter(session_metrics['action_count'], session_metrics['duration_seconds'])
plt.xlabel('Action Count')
plt.ylabel('Session Duration (seconds)')
plt.title('User Session Duration vs. Action Count')
plt.show()
A sample CSV file (log_file.csv) is shown below:
timestamp,user_id,action
2023-06-01 10:00:00,user1,login
2023-06-01 10:01:00,user1,browse
2023-06-01 10:02:00,user1,purchase
2023-06-01 10:03:00,user2,login
2023-06-01 10:04:00,user2,browse
2023-06-01 10:05:00,user2,add_to_cart
2023-06-01 10:06:00,user3,login
2023-06-01 10:07:00,user3,browse
2023-06-01 10:08:00,user3,browse
2023-06-01 10:09:00,user3,checkout
No comments:
Post a Comment