So far you've seen multiple ways of doing supervised learning. In those problems, you start with labeled training examples, and the goal is to learn from those examples how to label new data points in the future. However, labeled training data isn't always available: labeling large amounts of data ahead of time can be prohibitively expensive and time-consuming. In some settings, we may also not know ahead of time what we want to learn from our data. In these situations, we use unsupervised learning.
In unsupervised learning, the input is a set of unlabeled data, and the goal is to uncover some structure in that data. The most common form of unsupervised learning is clustering, a variant of which we will go over today.
Other examples include dimensionality reduction, in which we start with data points that have many dimensions and try to find a lower-dimensional representation that retains some aspect of the original data (such as the pairwise distances between points).
The goal of k-means is to find groups in your dataset. This is done by viewing your data as points in a high-dimensional space and then finding clusters of nearby points in that space. The input to k-means is a set of data points and a number of clusters, k. Given this input, the goal of k-means is to find the locations of the cluster centers and an assignment of points to clusters that minimizes the following objective function.
$$RSE = \sum_{i=1}^{k} \sum_{x \in C_i} d(c_i, x)^2$$
where C_i is the set of points assigned to cluster i, c_i is that cluster's center, and d(x, y) is a standard distance function (such as Euclidean distance).
Finding the optimal location of clusters is a hard problem. However, the following algorithm finds a decent approximation.
Algorithm:
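The standard approach (Lloyd's algorithm) alternates two steps until the assignment stops changing: assign each point to its nearest centroid, then move each centroid to the mean of the points assigned to it. Below is a minimal NumPy sketch of that loop; the function name lloyd_kmeans, the random initialization from the data points, and the fixed iteration cap are illustrative choices rather than part of the lab code.

import numpy as np

def lloyd_kmeans(X, k, n_iters=100, seed=0):
    """Illustrative Lloyd's algorithm: X has shape (n_samples, n_features)."""
    rng = np.random.default_rng(seed)
    # Initialize centroids by sampling k distinct data points.
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iters):
        # Assignment step: index of the nearest centroid for each point.
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Update step: move each centroid to the mean of its assigned points
        # (keep the old centroid if a cluster ends up empty).
        new_centroids = np.array([
            X[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])
        # Stop once the centroids no longer move.
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, labels

In what follows, the lab uses scikit-learn's KMeans, which implements the same idea with smarter (k-means++) initialization and multiple restarts.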
import numpy as np
import pandas as pd
import json
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (8,6)
import netml
import logging
logging.getLogger("scapy.runtime").setLevel(logging.ERROR)
from netml.pparser.parser import PCAP
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib as mpl
def plot(X, y):
    colors = ['red', 'orange', 'green', 'blue']
    colormap = mpl.colors.ListedColormap(colors)
    plt.scatter(*zip(*X), c=y, cmap=colormap)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()
First, perform k-means clustering on a random set of data points below.
# Generate 100 sample points drawn from a random number (3 to 9) of Gaussian blobs.
randint = np.random.randint(3, 10)
X, y_true = make_blobs(n_samples=100, centers=randint, cluster_std=0.4, random_state=0)
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.show()
# Fit k-means with k=4 and color each point by its assigned cluster.
k_means = KMeans(n_clusters=4)
k_means.fit(X)
clusters = k_means.predict(X)
plot(X, clusters)
One of the challenges of k-means clustering is choosing the value of k. One heuristic for this is cluster inertia: the sum of squared distances from each point to its assigned cluster center. The code below plots inertia for a range of values of k. From the "elbow" in the resulting plot, you can see what a reasonable value of k would be.
You can experiment with the points above and the plot below by changing how the dataset is generated.
from matplotlib.ticker import MaxNLocator
from matplotlib.pyplot import figure, show
guesses = []
guess_error_rates = []
guess = np.arange(1,10)
# make a k_means variable equal to KMeans with "guess" number of clusters (See above!)
# fit KMeans to the data "X"
for c in guess:
    k_means = KMeans(n_clusters=c)
    k_means.fit(X)
    guesses.append(c)
    guess_error_rates.append(k_means.inertia_)
ax = figure().gca()
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.plot(guesses, guess_error_rates)
plt.xlabel("Clusters")
plt.ylabel("Error")
plt.show()
As before, we will work with both the Web trace and Log4j scan pcaps and attach labels to the flows, as well as another dataset for which we have IP geolocation.
hpcap = PCAP('data/http.pcap', flow_ptks_thres=2, verbose=10)
lpcap = PCAP('data/log4j.pcap', flow_ptks_thres=2, verbose=10)
hpcap.pcap2flows()
lpcap.pcap2flows()
# extract features from each flow via STATS
lpcap.flow2features('STATS', fft=False, header=False)
ld = pd.DataFrame(lpcap.features)
# extract features from each flow via STATS
hpcap.flow2features('STATS', fft=False, header=False)
hd = pd.DataFrame(hpcap.features)
Output (abridged): '_pcap2flows()' on data/http.pcap finds 593 flows, of which 293 have at least 2 packets (291 subflows after timeout splitting); on data/log4j.pcap it finds 44707 flows, of which 4766 are kept (4795 subflows after splitting). '_flow2features()' then extracts the STATS features for each set of flows.
pd.set_option('mode.chained_assignment', None)
# Label Web (HTTP) flows as 0 and Log4j scan flows as 1, then combine the two sets.
hd['label'] = 0
ld['label'] = 1
data = pd.concat([ld, hd])
data
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5.163 | 1.162 | 117.959 | 101.500 | 75.859 | 66.000 | 66.000 | 72.000 | 66.000 | 271.000 | 6.000 | 609.000 | 1 |
| 1 | 5.163 | 0.968 | 119.316 | 123.200 | 110.443 | 66.000 | 66.000 | 74.000 | 66.000 | 344.000 | 5.000 | 616.000 | 1 |
| 2 | 7.274 | 0.550 | 40.690 | 74.000 | 0.000 | 74.000 | 74.000 | 74.000 | 74.000 | 74.000 | 4.000 | 296.000 | 1 |
| 3 | 17.825 | 0.561 | 32.539 | 58.000 | 0.000 | 58.000 | 58.000 | 58.000 | 58.000 | 58.000 | 10.000 | 580.000 | 1 |
| 4 | 0.000 | 25653.235 | 6002857.101 | 234.000 | 0.000 | 234.000 | 234.000 | 234.000 | 234.000 | 234.000 | 2.000 | 468.000 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 286 | 0.054 | 334.995 | 45298.759 | 135.222 | 184.344 | 66.000 | 66.000 | 66.000 | 66.000 | 719.000 | 18.000 | 2434.000 | 0 |
| 287 | 0.040 | 895.299 | 1085500.886 | 1212.444 | 544.655 | 1494.000 | 1494.000 | 1494.000 | 66.000 | 1494.000 | 36.000 | 43648.000 | 0 |
| 288 | 0.040 | 124.434 | 17644.771 | 141.800 | 93.034 | 66.000 | 74.000 | 212.000 | 66.000 | 291.000 | 5.000 | 709.000 | 0 |
| 289 | 0.146 | 54.815 | 33053.585 | 603.000 | 612.425 | 76.500 | 212.500 | 1389.000 | 75.000 | 1392.000 | 8.000 | 4824.000 | 0 |
| 290 | 0.160 | 43.743 | 15722.532 | 359.429 | 452.006 | 68.000 | 134.000 | 393.500 | 67.000 | 1392.000 | 7.000 | 2516.000 | 0 |

5086 rows × 13 columns
Let's plot the data in two dimensions using PCA. For this dataset, the top two principal components capture most of the variance, so not much information is lost in the reduction.
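A sketch of one way to do this with scikit-learn's PCA, assuming the data frame built above; the variable names X_full, y_full, and X_pca are ours:

from sklearn.decomposition import PCA

# Separate the STATS feature columns from the labels we attached above.
X_full = data.drop(columns=['label']).values
y_full = data['label'].values

# Project the features onto the top two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_full)

# Color each point by its true label (0 = Web trace, 1 = Log4j scan).
plot(X_pca, y_full)

The STATS features span very different scales, so you may also want to standardize them before PCA; the sketch above skips that step.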
Try forming clusters from the full set of data points using different clustering algorithms.
Let's first try k-means, with the number of clusters set to two.
See what kind of clusters this produces by plotting each cluster as a different color. What do you notice about the clusters?
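A sketch of those two steps, reusing the X_pca projection and the plot helper from the PCA sketch above (clustering on the raw features instead of the projection is an equally reasonable choice):

# Cluster the flows into two groups, then color each point by its assigned cluster.
k_means = KMeans(n_clusters=2)
k_means.fit(X_pca)
clusters = k_means.predict(X_pca)
plot(X_pca, clusters)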
K-means has a hard time producing a good clustering of this data: it prefers to place centroids near the bottom of the graph in order to minimize the total squared error. Why?
In this case, we've used an unsupervised algorithm to compute clusters, but since we know the labels of the original data points, we can actually measure how good these clusters are using a cluster impurity metric. Compute this metric below.
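One simple purity measure: for each cluster, count the points whose true label matches the cluster's majority label, and divide the total by the number of points (1.0 means every cluster is pure). A sketch, assuming the y_full labels and clusters assignments from the earlier sketches; cluster_purity is our own helper, not a library function:

def cluster_purity(true_labels, cluster_assignments):
    """Fraction of points that carry the majority true label of their cluster."""
    total_majority = 0
    for c in np.unique(cluster_assignments):
        labels_in_cluster = true_labels[cluster_assignments == c]
        # Count occurrences of the most common true label in this cluster.
        total_majority += np.bincount(labels_in_cluster).max()
    return total_majority / len(true_labels)

print(cluster_purity(y_full, clusters))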