Demo

[1]:
import pandas as pd
[2]:
# Load the demo CSV into a DataFrame.
# NOTE(review): this is an absolute path from the machine the notebook was run on;
# point it at your own copy of home.csv before re-running.
df = pd.read_csv('/srv/data/uhricdan/data/demo/home.csv')

Preprocessing functions

[3]:
from fet.common import convert_times

print(convert_times.__doc__)
Convert time strings and calculate duration.

    Args:
        df (pandas.DataFrame): DataFrame with time_first and time_last.
        inplace (bool, optional): Extract features within provided DataFrame
            or return new DataFrame. Defaults to False.

    Returns:
        pandas.DataFrame: DataFrame is returned only if inplace=False,
            otherwise returns None.

[4]:
convert_times(df, inplace=True)
[5]:
from fet.pstats import aggregate

print(aggregate.__doc__)
Time aggregation of basic + pstats fields.

    Args:
        df (pandas.DataFrame): DataFrame with basic + pstats fields.
        window (str, optional): Aggregation time window. Defaults to "5min".

[6]:
# Aggregate using the default time window (documented default: "5min").
agg = aggregate(df)
# Write every aggregated column back onto df under the same column names.
# NOTE(review): relies on agg sharing df's index so rows line up — confirm.
df[agg.columns] = agg
[7]:
# Keep only flows whose duration exceeds 20 (the duration column is calculated by
# convert_times; presumably seconds — confirm).
# .copy() makes the filtered result an independent DataFrame, so the later
# in-place swap_directions(..., inplace=True) call does not write into a
# boolean-mask selection of the original frame (SettingWithCopyWarning hazard).
df = df[df['duration'] > 20.].copy()
[8]:
from fet.pstats import swap_directions

print(swap_directions.__doc__)
Swap directional columns.

    Args:
        df (pandas.DataFrame): DataFrame with directional columns.
        swap (pandas.Series): Bool series of affected rows.
        inplace (bool, optional): Extract features within provided DataFrame
            or return new DataFrame. Defaults to False.

    Returns:
        pandas.DataFrame: DataFrame is returned only if inplace=False,
            otherwise returns None.

[9]:
# Swap the directional columns, in place, for every flow whose destination
# address starts with '192.168.' (presumably so the local host ends up on the
# source side — confirm).
# na=False maps a missing dst_ip to False, keeping the mask strictly boolean
# instead of propagating NaN into the row selector.
swap_directions(df, df['dst_ip'].str.startswith('192.168.', na=False), inplace=True)

Explorer

[10]:
from fet.explorer import Explorer

print(Explorer.__doc__)
Dataset explorer.

    Args:
        y (str, optional): Target/dependent variable. Defaults to None.

[11]:
e = Explorer(y='label')
[12]:
print(e.fit.__doc__)
Fit DataFrame to Explorer.

        Args:
            df (pandas.DataFrame): DataFrame to explore.
            remove_low_variance (bool, optional): Remove low variance features. Defaults to True.
            module (string, optional): Features extraction module. Defaults to 'pstats'.

[13]:
e.fit(df)
[14]:
e.plot_pca()
_images/demo_16_0.png
[15]:
from fet.pstats import extract_features

print(extract_features.__doc__)
Extracts per flow statistics.

    Args:
        df (pandas.DataFrame): Dataframe with basic and pstats values.
        inplace (bool, optional): Extract features within provided DataFrame
            or return new DataFrame. Defaults to False.

    Returns:
        pandas.DataFrame: DataFrame is returned only if inplace=False - otherwise
            returns None.

[16]:
from fet.pstats import feature_cols

print(feature_cols)
['bytes_rate', 'bytes_rev_rate', 'bytes_total_rate', 'packets_rate', 'packets_rev_rate', 'packets_total_rate', 'fin_count', 'syn_count', 'rst_count', 'psh_count', 'ack_count', 'urg_count', 'fin_ratio', 'syn_ratio', 'rst_ratio', 'psh_ratio', 'ack_ratio', 'urg_ratio', 'lengths_min', 'lengths_max', 'lengths_mean', 'lengths_std', 'fwd_lengths_min', 'fwd_lengths_max', 'fwd_lengths_mean', 'fwd_lengths_std', 'bwd_lengths_min', 'bwd_lengths_max', 'bwd_lengths_mean', 'bwd_lengths_std', 'pkt_iat_min', 'pkt_iat_max', 'pkt_iat_mean', 'pkt_iat_std', 'fwd_pkt_iat_min', 'fwd_pkt_iat_max', 'fwd_pkt_iat_mean', 'fwd_pkt_iat_std', 'bwd_pkt_iat_min', 'bwd_pkt_iat_max', 'bwd_pkt_iat_mean', 'bwd_pkt_iat_std', 'norm_pkt_iat_mean', 'norm_pkt_iat_std', 'norm_fwd_pkt_iat_mean', 'norm_fwd_pkt_iat_std', 'norm_bwd_pkt_iat_mean', 'norm_bwd_pkt_iat_std']
[17]:
e.df
[17]:
dst_ip src_ip bytes bytes_rev link_bit_field time_first time_last dst_mac src_mac packets ... bwd_pkt_iat_min bwd_pkt_iat_max bwd_pkt_iat_mean bwd_pkt_iat_std norm_pkt_iat_mean norm_pkt_iat_std norm_fwd_pkt_iat_mean norm_fwd_pkt_iat_std norm_bwd_pkt_iat_mean norm_bwd_pkt_iat_std
0 255.255.255.255 0.0.0.0 3700.0 0.0 1 2016-09-23 02:27:54.634735 2016-09-23 02:28:46.061507 ff:ff:ff:ff:ff:ff b4:ce:f6:a7:a3:c2 11.0 ... 0.000000 0.000000 0.000000 0.000000 0.100000 0.300000 0.100000 0.300000 0.000000 0.000000
12 209.107.220.167 192.168.1.208 4308.0 288.0 1 2016-09-23 02:28:50.919010 2016-09-23 02:29:39.587075 14:cc:20:51:33:ea 74:2f:68:81:69:42 30.0 ... 48.457147 48.457147 48.457147 0.000000 0.064516 0.245670 0.068966 0.253395 1.000000 0.000000
15 213.248.117.217 192.168.1.208 4308.0 288.0 1 2016-09-23 02:28:50.918896 2016-09-23 02:29:23.415141 14:cc:20:51:33:ea 74:2f:68:81:69:42 30.0 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
17 224.0.0.251 192.168.1.208 5467.0 0.0 1 2016-09-23 02:28:46.720692 2016-09-23 02:29:50.554750 01:00:5e:00:00:fb 74:2f:68:81:69:42 17.0 ... 0.000000 0.000000 0.000000 0.000000 0.187500 0.390312 0.187500 0.390312 0.000000 0.000000
27 239.255.255.250 192.168.1.208 6126.0 0.0 1 2016-09-23 02:28:46.175568 2016-09-23 02:29:32.970697 01:00:5e:7f:ff:fa 74:2f:68:81:69:42 6.0 ... 0.000000 0.000000 0.000000 0.000000 0.400000 0.489898 0.400000 0.489898 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8745 216.58.199.46 192.168.1.243 3349.0 1569.0 1 2016-10-04 06:20:23.360257 2016-10-04 06:23:25.431831 14:cc:20:51:33:ea ac:bc:32:d4:6f:2f 15.0 ... 0.000077 66.792117 13.002044 24.902418 0.103448 0.304543 0.214286 0.410326 0.214286 0.410326
8748 255.255.255.255 192.168.1.243 2040.0 0.0 1 2016-10-04 06:21:39.017970 2016-10-04 06:24:09.444672 ff:ff:ff:ff:ff:ff ac:bc:32:d4:6f:2f 6.0 ... 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000
8749 52.16.36.153 192.168.1.243 7812.0 5576.0 1 2016-10-04 06:20:17.291276 2016-10-04 06:24:17.203163 14:cc:20:51:33:ea ac:bc:32:d4:6f:2f 27.0 ... 0.000061 20.001821 13.999735 8.664132 0.310345 0.462635 0.500000 0.500000 0.700000 0.458258
8762 64.233.187.189 192.168.1.243 2027.0 2330.0 1 2016-10-04 06:23:14.031705 2016-10-04 06:25:24.690977 14:cc:20:51:33:ea ac:bc:32:d4:6f:2f 17.0 ... 0.016166 25.764317 3.995628 8.778483 0.068966 0.253395 0.142857 0.349927 0.142857 0.349927
8784 52.206.12.44 192.168.1.248 3283.0 8745.0 1 2016-10-04 06:55:08.624737 2016-10-04 06:56:08.692950 14:cc:20:51:33:ea b4:ce:f6:a7:a3:c2 17.0 ... 0.000067 9.619998 1.001864 2.620626 0.038462 0.192308 0.076923 0.266469 0.083333 0.276385

3096 rows × 75 columns

[18]:
e.feature_cols
[18]:
['bytes_rate',
 'bytes_rev_rate',
 'bytes_total_rate',
 'packets_rate',
 'packets_rev_rate',
 'packets_total_rate',
 'fin_count',
 'syn_count',
 'rst_count',
 'psh_count',
 'ack_count',
 'fin_ratio',
 'syn_ratio',
 'rst_ratio',
 'psh_ratio',
 'ack_ratio',
 'lengths_min',
 'lengths_max',
 'lengths_mean',
 'lengths_std',
 'fwd_lengths_min',
 'fwd_lengths_max',
 'fwd_lengths_mean',
 'fwd_lengths_std',
 'bwd_lengths_min',
 'bwd_lengths_max',
 'bwd_lengths_mean',
 'bwd_lengths_std',
 'pkt_iat_min',
 'pkt_iat_max',
 'pkt_iat_mean',
 'pkt_iat_std',
 'fwd_pkt_iat_min',
 'fwd_pkt_iat_max',
 'fwd_pkt_iat_mean',
 'fwd_pkt_iat_std',
 'bwd_pkt_iat_min',
 'bwd_pkt_iat_max',
 'bwd_pkt_iat_mean',
 'bwd_pkt_iat_std',
 'norm_pkt_iat_mean',
 'norm_pkt_iat_std',
 'norm_fwd_pkt_iat_mean',
 'norm_fwd_pkt_iat_std',
 'norm_bwd_pkt_iat_mean',
 'norm_bwd_pkt_iat_std']
[19]:
e.correlation_matrix()
_images/demo_21_0.png
[20]:
e.plot_pca()
_images/demo_22_0.png
[21]:
e.correlated_features()
bytes_rev_rate    0.996155
Name: bytes_total_rate, dtype: float64

bytes_rev_rate      0.998237
bytes_total_rate    0.999454
Name: packets_rev_rate, dtype: float64

bytes_rev_rate      0.988283
bytes_total_rate    0.995803
packets_rev_rate    0.994611
Name: packets_total_rate, dtype: float64

bwd_lengths_max    0.97001
Name: bwd_lengths_std, dtype: float64

pkt_iat_min    0.952162
Name: pkt_iat_mean, dtype: float64

pkt_iat_max    0.997035
Name: fwd_pkt_iat_max, dtype: float64

[22]:
# Drop features that correlated_features() reported as highly correlated with
# another feature. Each name is listed once; the original list repeated
# 'packets_total_rate'.
e.remove_features(
    [
        'packets_total_rate',
        'pkt_iat_min',
        'bytes_total_rate',
        'bwd_lengths_max'
    ]
)
[23]:
e.boxplot()
_images/demo_25_0.png
[24]:
e.plot_feature_scores()
_images/demo_26_0.png
[25]:
e.plot_feature_importances()
_images/demo_27_0.png
[26]:
from sklearn.ensemble import AdaBoostClassifier

# Fix the estimator's seed: the same classifier is handed to several Explorer
# calls below (plot_feature_importances, feature_importances, and the ecdfplot
# feature selection), and they should all rank features identically on re-run.
# NOTE(review): the rendered outputs were produced without a seed and may differ.
ab = AdaBoostClassifier(random_state=0)
[27]:
e.plot_feature_importances(ab)
_images/demo_29_0.png
[28]:
e.feature_importances(ab)
[28]:
[('lengths_std', 0.08),
 ('fwd_pkt_iat_min', 0.08),
 ('fwd_pkt_iat_max', 0.08),
 ('fin_ratio', 0.06),
 ('lengths_min', 0.06),
 ('fwd_lengths_min', 0.04),
 ('fwd_lengths_max', 0.04),
 ('bwd_lengths_min', 0.04),
 ('bwd_lengths_std', 0.04),
 ('pkt_iat_max', 0.04),
 ('pkt_iat_mean', 0.04),
 ('pkt_iat_std', 0.04),
 ('bytes_rate', 0.02),
 ('bytes_rev_rate', 0.02),
 ('packets_rev_rate', 0.02),
 ('syn_count', 0.02),
 ('ack_count', 0.02),
 ('rst_ratio', 0.02),
 ('psh_ratio', 0.02),
 ('ack_ratio', 0.02),
 ('lengths_max', 0.02),
 ('lengths_mean', 0.02),
 ('fwd_lengths_mean', 0.02),
 ('fwd_lengths_std', 0.02),
 ('fwd_pkt_iat_mean', 0.02),
 ('bwd_pkt_iat_min', 0.02),
 ('bwd_pkt_iat_max', 0.02),
 ('norm_pkt_iat_mean', 0.02),
 ('norm_pkt_iat_std', 0.02),
 ('norm_fwd_pkt_iat_std', 0.02),
 ('packets_rate', 0.0),
 ('fin_count', 0.0),
 ('rst_count', 0.0),
 ('psh_count', 0.0),
 ('syn_ratio', 0.0),
 ('bwd_lengths_mean', 0.0),
 ('fwd_pkt_iat_std', 0.0),
 ('bwd_pkt_iat_mean', 0.0),
 ('bwd_pkt_iat_std', 0.0),
 ('norm_fwd_pkt_iat_mean', 0.0),
 ('norm_bwd_pkt_iat_mean', 0.0),
 ('norm_bwd_pkt_iat_std', 0.0)]
[29]:
e.pairplot(e.kbest(5))
_images/demo_31_0.png
[30]:
# ECDF plots for the first six (feature, importance) pairs returned for the
# AdaBoost model; only the feature names are passed on.
e.ecdfplot([name for name, _score in e.feature_importances(ab)[:6]])
_images/demo_32_0.png