Commit a206e64b authored by Ad Schellevis's avatar Ad Schellevis

(netflow/flowd) data aggregator, initial version / concept

parent 0c358e0d
#!/usr/local/bin/python2.7
"""
Copyright (c) 2016 Ad Schellevis
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------------
Aggregate flowd data for reporting
"""
import time
import datetime
import os
import sys
from lib.parse import parse_flow
from lib.aggregate import BaseFlowAggregator, AggMetadata
import lib.aggregates
def main():
    """ Aggregate flowd data for reporting.

    Streams parsed flow records into every registered aggregator (one
    instance per resolution) and commits collected data whenever the
    receive timestamp advances, recording progress so a next run can
    resume where this one stopped.
    """
    # init metadata (progress maintenance)
    metadata = AggMetadata()

    # register aggregate classes to stream data to, one instance per resolution
    stream_agg_objects = list()
    resolutions = [60, 60 * 5]
    for agg_class in lib.aggregates.get_aggregators():
        for resolution in resolutions:
            stream_agg_objects.append(agg_class(resolution))

    # parse flow data and stream to registered consumers
    prev_recv = metadata.last_sync()
    for flow_record in parse_flow(prev_recv):
        if flow_record is None or prev_recv != flow_record['recv']:
            # commit data on receive timestamp change or last record
            # (parse_flow yields None as an end-of-stream marker)
            for stream_agg_object in stream_agg_objects:
                stream_agg_object.commit()
            metadata.update_sync_time(prev_recv)
        if flow_record is not None:
            # send to all registered aggregators
            for stream_agg_object in stream_agg_objects:
                stream_agg_object.add(flow_record)
            prev_recv = flow_record['recv']


if __name__ == '__main__':
    main()
......@@ -24,13 +24,56 @@
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------------
aggregate flow data (format in parse.py) into sqlite structured container, using a table per resolution and a
file per collection.
aggregate flow data (format in parse.py) into sqlite structured container per type/resolution.
Implementations are collected in lib\aggregates\
"""
import os
import datetime
import sqlite3
class AggMetadata(object):
    """ Store metadata needed to keep track of parse progress
        (last processed receive timestamp) in a small sqlite database.
    """
    def __init__(self):
        # fixed location of the metadata database
        self._filename = '/var/netflow/metadata.sqlite'
        # make sure the target directory exists
        target_path = os.path.dirname(self._filename)
        if not os.path.isdir(target_path):
            os.makedirs(target_path)
        # open sqlite database and cursor
        self._db_connection = sqlite3.connect(self._filename)
        self._db_cursor = self._db_connection.cursor()
        # cache of known table names
        self._tables = list()
        self._update_known_tables()

    def _update_known_tables(self):
        """ Refresh the cached list of tables known in this database.
        """
        # rebuild the cache from scratch, appending to the existing list
        # would accumulate duplicates on repeated calls
        self._tables = list()
        # only tables; indexes/views in sqlite_master are not relevant here
        self._db_cursor.execute("SELECT name FROM sqlite_master where type = 'table'")
        for record in self._db_cursor.fetchall():
            self._tables.append(record[0])

    def update_sync_time(self, timestamp):
        """ Update (last) sync timestamp; only moves forward in time.
        :param timestamp: receive timestamp (epoch)
        :return: None
        """
        if 'sync_timestamp' not in self._tables:
            # first use: create bookkeeping table with a single seed row
            self._db_cursor.execute('create table sync_timestamp(mtime timestamp)')
            self._db_cursor.execute('insert into sync_timestamp(mtime) values(0)')
            self._db_connection.commit()
            self._update_known_tables()
        # update last sync timestamp, only if the given timestamp is more recent
        self._db_cursor.execute('update sync_timestamp set mtime = :mtime where mtime < :mtime',
                                {'mtime': timestamp})
        self._db_connection.commit()

    def last_sync(self):
        """ Fetch the last known sync timestamp.
        :return: timestamp (0.0 when nothing was synced yet)
        """
        if 'sync_timestamp' not in self._tables:
            return 0.0
        else:
            self._db_cursor.execute('select max(mtime) from sync_timestamp')
            return self._db_cursor.fetchall()[0][0]
class BaseFlowAggregator(object):
# target location ('/var/netflow/<store>.sqlite')
......@@ -43,19 +86,15 @@ class BaseFlowAggregator(object):
"""
self.resolution = resolution
# target table name, data_<resolution in seconds>
self._target_table_name = 'data_%d' % self.resolution
self._db_connection = None
self._update_cur = None
self._known_targets = list()
# construct update and insert sql statements
tmp = 'update %s set octets = octets + :octets_consumed, packets = packets + :packets_consumed '
tmp = 'update timeserie set octets = octets + :octets_consumed, packets = packets + :packets_consumed '
tmp += 'where mtime = :mtime and %s '
self._update_stmt = tmp % (self._target_table_name,
' and '.join(map(lambda x: '%s = :%s' % (x, x), self.agg_fields)))
tmp = 'insert into %s (mtime, octets, packets, %s) values (:mtime, :octets_consumed, :packets_consumed, %s)'
self._insert_stmt = tmp % (self._target_table_name,
','.join(self.agg_fields),
','.join(map(lambda x: ':%s' % x, self.agg_fields)))
self._update_stmt = tmp % (' and '.join(map(lambda x: '%s = :%s' % (x, x), self.agg_fields)))
tmp = 'insert into timeserie (mtime, octets, packets, %s) values (:mtime, :octets_consumed, :packets_consumed, %s)'
self._insert_stmt = tmp % (','.join(self.agg_fields), ','.join(map(lambda x: ':%s' % x, self.agg_fields)))
# open database
self._open_db()
self._fetch_known_targets()
......@@ -77,7 +116,7 @@ class BaseFlowAggregator(object):
if self._db_connection is not None:
# construct new aggregate table
sql_text = list()
sql_text.append('create table %s ( ' % self._target_table_name)
sql_text.append('create table timeserie ( ')
sql_text.append(' mtime timestamp')
for agg_field in self.agg_fields:
sql_text.append(', %s varchar(255)' % agg_field)
......@@ -104,11 +143,11 @@ class BaseFlowAggregator(object):
"""
if self.target_filename is not None:
# make sure the target directory exists
target_path = os.path.dirname(self.target_filename)
target_path = os.path.dirname(self.target_filename % self.resolution)
if not os.path.isdir(target_path):
os.makedirs(target_path)
# open sqlite database
self._db_connection = sqlite3.connect(self.target_filename)
self._db_connection = sqlite3.connect(self.target_filename % self.resolution)
# open update/insert cursor
self._update_cur = self._db_connection.cursor()
......@@ -123,7 +162,7 @@ class BaseFlowAggregator(object):
:param flow: flow data (from parse.py)
"""
# make sure target exists
if self._target_table_name not in self._known_targets:
if 'timeserie' not in self._known_targets:
self._create_target_table()
# push record(s) depending on resolution
......@@ -139,7 +178,7 @@ class BaseFlowAggregator(object):
# upsert data
flow['octets_consumed'] = consume_perc * flow['octets']
flow['packets_consumed'] = consume_perc * flow['packets']
flow['mtime'] = datetime.datetime.utcfromtimestamp(start_time)
flow['mtime'] = datetime.datetime.fromtimestamp(start_time)
self._update_cur.execute(self._update_stmt, flow)
if self._update_cur.rowcount == 0:
self._update_cur.execute(self._insert_stmt, flow)
......
"""
Copyright (c) 2016 Ad Schellevis
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------------
data aggregator loader
"""
import sys
import os
import glob
from lib.aggregate import BaseFlowAggregator
def get_aggregators():
    """ Collect and return available aggregators in this package.
        A class qualifies when it subclasses BaseFlowAggregator and defines
        a non-empty target_filename (the base class itself does not).
    :return: list of class references
    """
    result = list()
    for filename in glob.glob('%s/*.py' % os.path.dirname(__file__)):
        filename_base = os.path.basename(filename)
        # skip __init__.py and other dunder modules
        if filename_base[0:2] != '__':
            module_name = 'lib.aggregates.%s' % '.'.join(filename_base.split('.')[:-1])
            __import__(module_name)
            for clsname in dir(sys.modules[module_name]):
                clshandle = getattr(sys.modules[module_name], clsname)
                # isinstance(..., type) matches any new-style class, including
                # classes using a custom metaclass, which type(..) == type misses
                if isinstance(clshandle, type) and issubclass(clshandle, BaseFlowAggregator):
                    # only concrete aggregators set a target_filename
                    if getattr(clshandle, 'target_filename', None) is not None:
                        result.append(clshandle)
    return result
"""
Copyright (c) 2016 Ad Schellevis
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------------
data aggregator type
"""
from lib.aggregate import BaseFlowAggregator
class FlowInterfaceTotals(BaseFlowAggregator):
    """ collect interface totals

        Aggregates flow data grouped per input/output interface pair;
        storage and upsert behavior come from BaseFlowAggregator.
    """
    # sqlite file per resolution, e.g. /var/netflow/interface_000060.sqlite
    target_filename = '/var/netflow/interface_%06d.sqlite'
    # flow record fields this aggregator groups on
    agg_fields = ['if_in', 'if_out']

    def __init__(self, resolution):
        """ :param resolution: aggregation resolution in seconds """
        super(FlowInterfaceTotals, self).__init__(resolution)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment